src/jalview/io/gff/GffHelperBase.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.MappingType;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.io.FeaturesFile;
  31 import jalview.util.MapList;
  32 import jalview.util.StringUtils;
  33
  34 import java.util.ArrayList;
  35 import java.util.Arrays;
  36 import java.util.HashMap;
  37 import java.util.List;
  38 import java.util.Map;
  39 import java.util.Map.Entry;
  40
  41 /**
  42  * Base class with common functionality for flavours of GFF handler (GFF2 or
  43  * GFF3)
  44  */
  45 public abstract class GffHelperBase implements GffHelperI
  46 {
  /*
   * name of the column 9 attribute whose value is also used as the
   * feature description (see buildSequenceFeature)
   */
  private static final String NOTE = "Note";

  /*
   * GFF columns 1-9 (zero-indexed):
   */
  protected static final int SEQID_COL = 0;

  protected static final int SOURCE_COL = 1;

  protected static final int TYPE_COL = 2;

  protected static final int START_COL = 3;

  protected static final int END_COL = 4;

  protected static final int SCORE_COL = 5;

  protected static final int STRAND_COL = 6;

  protected static final int PHASE_COL = 7;

  protected static final int ATTRIBUTES_COL = 8;

  /*
   * the alignment for which 'matcher' was last constructed; findSequence
   * rebuilds the matcher when called with a different alignment
   */
  private AlignmentI lastmatchedAl = null;

  /*
   * sequence id matcher used by findSequence for relaxed id matching;
   * null until the first relaxed lookup has built it
   */
  private SequenceIdMatcher matcher = null;
  73
  74   /**
  75    * Constructs and returns a mapping, or null if data appear invalid
  76    *
  77    * @param fromStart
  78    * @param fromEnd
  79    * @param toStart
  80    * @param toEnd
  81    * @param mappingType
  82    *          type of mapping (e.g. protein to nucleotide)
  83    * @return
  84    */
  85   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  86           int toStart, int toEnd, MappingType mappingType)
  87   {
  88     int[] from = new int[] { fromStart, fromEnd };
  89     int[] to = new int[] { toStart, toEnd };
  90
  91     /*
  92      * Jalview always models from dna to protein, so switch values if the
  93      * GFF mapping is from protein to dna
  94      */
  95     if (mappingType == MappingType.PeptideToNucleotide)
  96     {
  97       int[] temp = from;
  98       from = to;
  99       to = temp;
 100       mappingType = mappingType.getInverse();
 101     }
 102
 103     int fromRatio = mappingType.getFromRatio();
 104     int toRatio = mappingType.getToRatio();
 105
 106     /*
 107      * sanity check that mapped residue counts match
 108      * TODO understand why PASA generates such cases...
 109      */
 110     if (!trimMapping(from, to, fromRatio, toRatio))
 111     {
 112       System.err.println("Ignoring mapping from " + Arrays.toString(from)
 113               + " to " + Arrays.toString(to) + " as counts don't match!");
 114       return null;
 115     }
 116
 117     /*
 118      * If a codon has an intron gap, there will be contiguous 'toRanges';
 119      * this is handled for us by the MapList constructor.
 120      * (It is not clear that exonerate ever generates this case)
 121      */
 122
 123     return new MapList(from, to, fromRatio, toRatio);
 124   }
 125
 126   /**
 127    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 128    * tries to trim the end of the longer so they do. Returns true if the
 129    * mappings could be made equivalent, else false. Note the range array values
 130    * may be modified by this method.
 131    *
 132    * @param from
 133    * @param to
 134    * @param fromRatio
 135    * @param toRatio
 136    * @return
 137    */
 138   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 139           int toRatio)
 140   {
 141     int fromLength = Math.abs(from[1] - from[0]) + 1;
 142     int toLength = Math.abs(to[1] - to[0]) + 1;
 143     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 144     if (fromOverlap == 0)
 145     {
 146       return true;
 147     }
 148     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 149     {
 150       /*
 151        * restrict from range to make them match up
 152        * it's kind of arbitrary which end we truncate - here it is the end
 153        */
 154       System.err.print(
 155               "Truncating mapping from " + Arrays.toString(from) + " to ");
 156       if (from[1] > from[0])
 157       {
 158         from[1] -= fromOverlap / toRatio;
 159       }
 160       else
 161       {
 162         from[1] += fromOverlap / toRatio;
 163       }
 164       System.err.println(Arrays.toString(from));
 165       return true;
 166     }
 167     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 168     {
 169       fromOverlap = -fromOverlap; // > 0
 170       /*
 171        * restrict to range to make them match up
 172        */
 173       System.err.print(
 174               "Truncating mapping to " + Arrays.toString(to) + " to ");
 175       if (to[1] > to[0])
 176       {
 177         to[1] -= fromOverlap / fromRatio;
 178       }
 179       else
 180       {
 181         to[1] += fromOverlap / fromRatio;
 182       }
 183       System.err.println(Arrays.toString(to));
 184       return true;
 185     }
 186
 187     /*
 188      * Couldn't truncate to an exact match..
 189      */
 190     return false;
 191   }
 192
 193   /**
 194    * Returns a sequence matching the given id, as follows
 195    * <ul>
 196    * <li>strict matching is on exact sequence name</li>
 197    * <li>relaxed matching allows matching on a token within the sequence name,
 198    * or a dbxref</li>
 199    * <li>first tries to find a match in the alignment sequences</li>
 200    * <li>else tries to find a match in the new sequences already generated while
 201    * parsing the features file</li>
 202    * <li>else creates a new placeholder sequence, adds it to the new sequences
 203    * list, and returns it</li>
 204    * </ul>
 205    *
 206    * @param seqId
 207    * @param align
 208    * @param newseqs
 209    * @param relaxedIdMatching
 210    *
 211    * @return
 212    */
 213   protected SequenceI findSequence(String seqId, AlignmentI align,
 214           List<SequenceI> newseqs, boolean relaxedIdMatching)
 215   {
 216     if (seqId == null)
 217     {
 218       return null;
 219     }
 220     SequenceI match = null;
 221     if (relaxedIdMatching)
 222     {
 223       if (lastmatchedAl != align)
 224       {
 225         lastmatchedAl = align;
 226         matcher = new SequenceIdMatcher(align.getSequencesArray());
 227         if (newseqs != null)
 228         {
 229           matcher.addAll(newseqs);
 230         }
 231       }
 232       match = matcher.findIdMatch(seqId);
 233     }
 234     else
 235     {
 236       match = align.findName(seqId, true);
 237       if (match == null && newseqs != null)
 238       {
 239         for (SequenceI m : newseqs)
 240         {
 241           if (seqId.equals(m.getName()))
 242           {
 243             return m;
 244           }
 245         }
 246       }
 247
 248     }
 249     if (match == null && newseqs != null)
 250     {
 251       match = new SequenceDummy(seqId);
 252       if (relaxedIdMatching)
 253       {
 254         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 255       }
 256       // add dummy sequence to the newseqs list
 257       newseqs.add(match);
 258     }
 259     return match;
 260   }
 261
 262   /**
 263    * Parses the input line to a map of name / value(s) pairs. For example the
 264    * line <br>
 265    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
 266    * <br>
 267    * if parsed with delimiter=";" and separators {' ', '='} <br>
 268    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 269    * prediction}, source={Pfam}} <br>
 270    *
 271    * This method supports parsing of either GFF2 format (which uses space ' ' as
 272    * the name/value delimiter, and allows multiple occurrences of the same
 273    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
 274    * strictly does not allow repeat occurrences of the same name - but does
 275    * allow a comma-separated list of values).
 276    *
 277    * @param text
 278    * @param namesDelimiter
 279    *          the major delimiter between name-value pairs
 280    * @param nameValueSeparator
 281    *          one or more separators used between name and value
 282    * @param valuesDelimiter
 283    *          delimits a list of more than one value
 284    * @return the name-values map (which may be empty but never null)
 285    */
 286   public static Map<String, List<String>> parseNameValuePairs(String text,
 287           String namesDelimiter, char nameValueSeparator,
 288           String valuesDelimiter)
 289   {
 290     Map<String, List<String>> map = new HashMap<>();
 291     if (text == null || text.trim().length() == 0)
 292     {
 293       return map;
 294     }
 295
 296     for (String pair : text.trim().split(namesDelimiter))
 297     {
 298       pair = pair.trim();
 299       if (pair.length() == 0)
 300       {
 301         continue;
 302       }
 303
 304       int sepPos = pair.indexOf(nameValueSeparator);
 305       if (sepPos == -1)
 306       {
 307         // no name=value present
 308         continue;
 309       }
 310
 311       String key = pair.substring(0, sepPos).trim();
 312       String values = pair.substring(sepPos + 1).trim();
 313       if (values.length() > 0)
 314       {
 315         List<String> vals = map.get(key);
 316         if (vals == null)
 317         {
 318           vals = new ArrayList<>();
 319           map.put(key, vals);
 320         }
 321         for (String val : values.split(valuesDelimiter))
 322         {
 323           vals.add(val);
 324         }
 325       }
 326     }
 327     return map;
 328   }
 329
 330   /**
 331    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 332    * to call this method then adjust the SequenceFeature depending on the
 333    * particular usage of different tools that generate GFF.
 334    *
 335    * @param gff
 336    * @param attributes
 337    * @return
 338    */
 339   protected SequenceFeature buildSequenceFeature(String[] gff,
 340           Map<String, List<String>> attributes)
 341   {
 342     return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
 343   }
 344
 345   /**
 346    * @param gff
 347    * @param typeColumn
 348    * @param group
 349    * @param attributes
 350    * @return
 351    */
 352   protected SequenceFeature buildSequenceFeature(String[] gff,
 353           int typeColumn, String group, Map<String, List<String>> attributes)
 354   {
 355     try
 356     {
 357       int start = Integer.parseInt(gff[START_COL]);
 358       int end = Integer.parseInt(gff[END_COL]);
 359
 360       /*
 361        * default 'score' is 0 rather than Float.NaN - see JAL-2554
 362        */
 363       float score = 0f;
 364       try
 365       {
 366         score = Float.parseFloat(gff[SCORE_COL]);
 367       } catch (NumberFormatException nfe)
 368       {
 369         // e.g. '.' - leave as zero
 370       }
 371
 372       SequenceFeature sf = new SequenceFeature(gff[typeColumn],
 373               gff[SOURCE_COL], start, end, score, group);
 374
 375       sf.setStrand(gff[STRAND_COL]);
 376
 377       sf.setPhase(gff[PHASE_COL]);
 378
 379       if (attributes != null)
 380       {
 381         /*
 382          * Add attributes in column 9 to the sequence feature's
 383          * 'otherData' table; use Note as a best proxy for description;
 384          * decode any encoded comma, equals, semi-colon as per GFF3 spec
 385          */
 386         for (Entry<String, List<String>> attr : attributes.entrySet())
 387         {
 388           String key = attr.getKey();
 389           List<String> value = attr.getValue();
 390           if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX))
 391           {
 392             /*
 393              * e.g. jvmap_CSQ={ALLELE_NUM=1,CDS_position=249,Codons=caG/caT}
 394              */
 395             String trueKey = key
 396                     .substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length());
 397             if (trueKey.isEmpty() || value.isEmpty()
 398                     || !value.get(0).startsWith("{")
 399                     || !value.get(value.size() - 1).endsWith("}"))
 400             {
 401               System.err.println("Malformed GFF data '" + value.toString()
 402                       + "' for " + key);
 403               continue;
 404             }
 405             Map<String, String> values = new HashMap<>();
 406             for (String entry : value)
 407             {
 408               if (entry.startsWith("{"))
 409               {
 410                 entry = entry.substring(1);
 411               }
 412               if (entry.endsWith("}"))
 413               {
 414                 entry = entry.substring(0, entry.length() - 1);
 415               }
 416               String[] fields = entry.split(",");
 417             for (String field : fields)
 418             {
 419               String[] keyValue = field.split("=");
 420               if (keyValue.length == 2)
 421               {
 422                 String theKey = StringUtils.urlDecode(keyValue[0],
 423                         GFF_ENCODABLE);
 424                 String theValue = StringUtils.urlDecode(keyValue[1],
 425                         GFF_ENCODABLE);
 426                 values.put(theKey, theValue);
 427               }
 428             }
 429             }
 430             sf.setValue(trueKey, values);
 431           }
 432           else
 433           {
 434             String values = StringUtils
 435                     .listToDelimitedString(value, ",");
 436             values = StringUtils.urlDecode(values, GFF_ENCODABLE);
 437             sf.setValue(key, values);
 438             if (NOTE.equals(key))
 439             {
 440               sf.setDescription(values);
 441             }
 442           }
 443         }
 444       }
 445
 446       return sf;
 447     } catch (NumberFormatException nfe)
 448     {
 449       System.err.println("Invalid number in gff: " + nfe.getMessage());
 450       return null;
 451     }
 452   }
 453
  /**
   * Returns the character used to separate attributes names from values in GFF
   * column 9. This is space for GFF2, '=' for GFF3. Each concrete helper
   * supplies the separator for the GFF flavour it handles.
   * 
   * @return the name-value separator character
   */
  protected abstract char getNameValueSeparator();
 461
 462   /**
 463    * Returns any existing mapping held on the alignment between the given
 464    * dataset sequences, or a new one if none found. This is a convenience method
 465    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 466    * mapping, by extending the first mapping as the others are read.
 467    *
 468    * @param align
 469    * @param fromSeq
 470    * @param toSeq
 471    * @return
 472    */
 473   protected AlignedCodonFrame getMapping(AlignmentI align,
 474           SequenceI fromSeq, SequenceI toSeq)
 475   {
 476     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 477     if (acf == null)
 478     {
 479       acf = new AlignedCodonFrame();
 480     }
 481     return acf;
 482   }
 483
 484 }