src/jalview/io/gff/GffHelperBase.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import static jalview.io.FeaturesFile.MAP_ATTRIBUTE_PREFIX;
  24
  25 import jalview.analysis.SequenceIdMatcher;
  26 import jalview.datamodel.AlignedCodonFrame;
  27 import jalview.datamodel.AlignmentI;
  28 import jalview.datamodel.MappingType;
  29 import jalview.datamodel.SequenceDummy;
  30 import jalview.datamodel.SequenceFeature;
  31 import jalview.datamodel.SequenceI;
  32 import jalview.io.FeaturesFile;
  33 import jalview.util.MapList;
  34 import jalview.util.StringUtils;
  35
  36 import java.util.ArrayList;
  37 import java.util.Arrays;
  38 import java.util.HashMap;
  39 import java.util.List;
  40 import java.util.Map;
  41 import java.util.Map.Entry;
  42
  43 /**
  44  * Base class with common functionality for flavours of GFF handler (GFF2 or
  45  * GFF3)
  46  */
  47 public abstract class GffHelperBase implements GffHelperI
  48 {
  /*
   * delimiter used when joining multiple values of one attribute into a
   * single string, and when splitting the entries of a map-valued attribute
   */
  private static final String COMMA = ",";

  /*
   * name of the attribute whose (joined) value is also used as the
   * feature description in buildSequenceFeature
   */
  private static final String NOTE = "Note";

  /*
   * GFF columns 1-9 (zero-indexed):
   */
  protected static final int SEQID_COL = 0;

  protected static final int SOURCE_COL = 1;

  protected static final int TYPE_COL = 2;

  protected static final int START_COL = 3;

  protected static final int END_COL = 4;

  protected static final int SCORE_COL = 5;

  protected static final int STRAND_COL = 6;

  protected static final int PHASE_COL = 7;

  protected static final int ATTRIBUTES_COL = 8;

  /*
   * the alignment for which 'matcher' was last built by findSequence;
   * a different alignment instance causes the matcher to be rebuilt
   */
  private AlignmentI lastmatchedAl = null;

  /*
   * id matcher used by findSequence for relaxed id matching; created
   * lazily, so null until the first relaxed lookup
   */
  private SequenceIdMatcher matcher = null;
  77
  78   /**
  79    * Constructs and returns a mapping, or null if data appear invalid
  80    *
  81    * @param fromStart
  82    * @param fromEnd
  83    * @param toStart
  84    * @param toEnd
  85    * @param mappingType
  86    *          type of mapping (e.g. protein to nucleotide)
  87    * @return
  88    */
  89   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  90           int toStart, int toEnd, MappingType mappingType)
  91   {
  92     int[] from = new int[] { fromStart, fromEnd };
  93     int[] to = new int[] { toStart, toEnd };
  94
  95     /*
  96      * Jalview always models from dna to protein, so switch values if the
  97      * GFF mapping is from protein to dna
  98      */
  99     if (mappingType == MappingType.PeptideToNucleotide)
 100     {
 101       int[] temp = from;
 102       from = to;
 103       to = temp;
 104       mappingType = mappingType.getInverse();
 105     }
 106
 107     int fromRatio = mappingType.getFromRatio();
 108     int toRatio = mappingType.getToRatio();
 109
 110     /*
 111      * sanity check that mapped residue counts match
 112      * TODO understand why PASA generates such cases...
 113      */
 114     if (!trimMapping(from, to, fromRatio, toRatio))
 115     {
 116       System.err.println("Ignoring mapping from " + Arrays.toString(from)
 117               + " to " + Arrays.toString(to) + " as counts don't match!");
 118       return null;
 119     }
 120
 121     /*
 122      * If a codon has an intron gap, there will be contiguous 'toRanges';
 123      * this is handled for us by the MapList constructor.
 124      * (It is not clear that exonerate ever generates this case)
 125      */
 126
 127     return new MapList(from, to, fromRatio, toRatio);
 128   }
 129
 130   /**
 131    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 132    * tries to trim the end of the longer so they do. Returns true if the
 133    * mappings could be made equivalent, else false. Note the range array values
 134    * may be modified by this method.
 135    *
 136    * @param from
 137    * @param to
 138    * @param fromRatio
 139    * @param toRatio
 140    * @return
 141    */
 142   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 143           int toRatio)
 144   {
 145     int fromLength = Math.abs(from[1] - from[0]) + 1;
 146     int toLength = Math.abs(to[1] - to[0]) + 1;
 147     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 148     if (fromOverlap == 0)
 149     {
 150       return true;
 151     }
 152     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 153     {
 154       /*
 155        * restrict from range to make them match up
 156        * it's kind of arbitrary which end we truncate - here it is the end
 157        */
 158       System.err.print(
 159               "Truncating mapping from " + Arrays.toString(from) + " to ");
 160       if (from[1] > from[0])
 161       {
 162         from[1] -= fromOverlap / toRatio;
 163       }
 164       else
 165       {
 166         from[1] += fromOverlap / toRatio;
 167       }
 168       System.err.println(Arrays.toString(from));
 169       return true;
 170     }
 171     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 172     {
 173       fromOverlap = -fromOverlap; // > 0
 174       /*
 175        * restrict to range to make them match up
 176        */
 177       System.err.print(
 178               "Truncating mapping to " + Arrays.toString(to) + " to ");
 179       if (to[1] > to[0])
 180       {
 181         to[1] -= fromOverlap / fromRatio;
 182       }
 183       else
 184       {
 185         to[1] += fromOverlap / fromRatio;
 186       }
 187       System.err.println(Arrays.toString(to));
 188       return true;
 189     }
 190
 191     /*
 192      * Couldn't truncate to an exact match..
 193      */
 194     return false;
 195   }
 196
 197   /**
 198    * Returns a sequence matching the given id, as follows
 199    * <ul>
 200    * <li>strict matching is on exact sequence name</li>
 201    * <li>relaxed matching allows matching on a token within the sequence name,
 202    * or a dbxref</li>
 203    * <li>first tries to find a match in the alignment sequences</li>
 204    * <li>else tries to find a match in the new sequences already generated while
 205    * parsing the features file</li>
 206    * <li>else creates a new placeholder sequence, adds it to the new sequences
 207    * list, and returns it</li>
 208    * </ul>
 209    *
 210    * @param seqId
 211    * @param align
 212    * @param newseqs
 213    * @param relaxedIdMatching
 214    *
 215    * @return
 216    */
 217   protected SequenceI findSequence(String seqId, AlignmentI align,
 218           List<SequenceI> newseqs, boolean relaxedIdMatching)
 219   {
 220     if (seqId == null)
 221     {
 222       return null;
 223     }
 224     SequenceI match = null;
 225     if (relaxedIdMatching)
 226     {
 227       if (lastmatchedAl != align)
 228       {
 229         lastmatchedAl = align;
 230         matcher = new SequenceIdMatcher(align.getSequencesArray());
 231         if (newseqs != null)
 232         {
 233           matcher.addAll(newseqs);
 234         }
 235       }
 236       match = matcher.findIdMatch(seqId);
 237     }
 238     else
 239     {
 240       match = align.findName(seqId, true);
 241       if (match == null && newseqs != null)
 242       {
 243         for (SequenceI m : newseqs)
 244         {
 245           if (seqId.equals(m.getName()))
 246           {
 247             return m;
 248           }
 249         }
 250       }
 251
 252     }
 253     if (match == null && newseqs != null)
 254     {
 255       match = new SequenceDummy(seqId);
 256       if (relaxedIdMatching)
 257       {
 258         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 259       }
 260       // add dummy sequence to the newseqs list
 261       newseqs.add(match);
 262     }
 263     return match;
 264   }
 265
 266   /**
 267    * Parses the input line to a map of name / value(s) pairs. For example the line
 268    * <br>
 269    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
 270    * <br>
 271    * if parsed with delimiter=";" and separators {' ', '='} <br>
 272    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 273    * prediction}, source={Pfam}} <br>
 274    *
 275    * This method supports parsing of either GFF2 format (which uses space ' ' as
 276    * the name/value delimiter, and allows multiple occurrences of the same name),
 277    * or GFF3 format (which uses '=' as the name/value delimiter, and strictly does
 278    * not allow repeat occurrences of the same name - but does allow a
 279    * comma-separated list of values).
 280    * <p>
 281    * Returns a (possibly empty) map of lists of values by attribute name.
 282    *
 283    * @param text
 284    * @param namesDelimiter
 285    *                             the major delimiter between name-value pairs
 286    * @param nameValueSeparator
 287    *                             separator used between name and value
 288    * @param valuesDelimiter
 289    *                             delimits a list of more than one value
 290    * @return
 291    */
 292   public static Map<String, List<String>> parseNameValuePairs(String text,
 293           String namesDelimiter, char nameValueSeparator,
 294           String valuesDelimiter)
 295   {
 296     Map<String, List<String>> map = new HashMap<>();
 297     if (text == null || text.trim().length() == 0)
 298     {
 299       return map;
 300     }
 301
 302     for (String pair : text.trim().split(namesDelimiter))
 303     {
 304       pair = pair.trim();
 305       if (pair.length() == 0)
 306       {
 307         continue;
 308       }
 309
 310       int sepPos = pair.indexOf(nameValueSeparator);
 311       if (sepPos == -1)
 312       {
 313         // no name=value found
 314         continue;
 315       }
 316
 317       String key = pair.substring(0, sepPos).trim();
 318       String values = pair.substring(sepPos + 1).trim();
 319       if (values.length() > 0)
 320       {
 321         List<String> vals = map.get(key);
 322         if (vals == null)
 323         {
 324           vals = new ArrayList<>();
 325           map.put(key, vals);
 326         }
 327
 328         /*
 329          * special case: formatted as jvmap_AttName={a=b,c=d,...}
 330          * save the value within { } for parsing at a later stage
 331          */
 332         if (key.startsWith(MAP_ATTRIBUTE_PREFIX))
 333         {
 334
 335           if (key.length() > MAP_ATTRIBUTE_PREFIX.length()
 336                   && values.startsWith("{")
 337                   && values.endsWith("}"))
 338           {
 339             vals.add(values.substring(1, values.length() - 1));
 340           }
 341           else
 342           {
 343             System.err.println("Malformed GFF data '" + values.toString()
 344                     + "' for " + key);
 345           }
 346         }
 347         else
 348         {
 349           for (String val : values.split(valuesDelimiter))
 350           {
 351             vals.add(val);
 352           }
 353         }
 354       }
 355     }
 356     return map;
 357   }
 358
 359   /**
 360    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 361    * to call this method then adjust the SequenceFeature depending on the
 362    * particular usage of different tools that generate GFF.
 363    *
 364    * @param gff
 365    * @param attributes
 366    * @return
 367    */
 368   protected SequenceFeature buildSequenceFeature(String[] gff,
 369           Map<String, List<String>> attributes)
 370   {
 371     return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
 372   }
 373
 374   /**
 375    * @param gff
 376    * @param typeColumn
 377    * @param group
 378    * @param attributes
 379    * @return
 380    */
 381   protected SequenceFeature buildSequenceFeature(String[] gff,
 382           int typeColumn, String group, Map<String, List<String>> attributes)
 383   {
 384     try
 385     {
 386       int start = Integer.parseInt(gff[START_COL]);
 387       int end = Integer.parseInt(gff[END_COL]);
 388
 389       /*
 390        * default 'score' is 0 rather than Float.NaN - see JAL-2554
 391        */
 392       float score = 0f;
 393       try
 394       {
 395         score = Float.parseFloat(gff[SCORE_COL]);
 396       } catch (NumberFormatException nfe)
 397       {
 398         // e.g. '.' - leave as zero
 399       }
 400
 401       SequenceFeature sf = new SequenceFeature(gff[typeColumn],
 402               gff[SOURCE_COL], start, end, score, group);
 403
 404       sf.setStrand(gff[STRAND_COL]);
 405
 406       sf.setPhase(gff[PHASE_COL]);
 407
 408       if (attributes != null)
 409       {
 410         /*
 411          * Add attributes in column 9 to the sequence feature's
 412          * 'otherData' table; use Note as a best proxy for description;
 413          * decode any encoded comma, equals, semi-colon as per GFF3 spec
 414          */
 415         for (Entry<String, List<String>> attr : attributes.entrySet())
 416         {
 417           String key = attr.getKey();
 418           List<String> values = attr.getValue();
 419           if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX))
 420           {
 421             key = key.substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length());
 422             Map<String, String> valueMap = parseAttributeMap(values);
 423             sf.setValue(key, valueMap);
 424           }
 425           else
 426           {
 427             String csvValues = StringUtils.listToDelimitedString(values,
 428                     COMMA);
 429             csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
 430             sf.setValue(key, csvValues);
 431             if (NOTE.equals(key))
 432             {
 433               sf.setDescription(csvValues);
 434             }
 435           }
 436         }
 437       }
 438
 439       return sf;
 440     } catch (NumberFormatException nfe)
 441     {
 442       System.err.println("Invalid number in gff: " + nfe.getMessage());
 443       return null;
 444     }
 445   }
 446
 447   /**
 448    * Parses one or more list of comma-separated key=value pairs into a Map of
 449    * {key, value}
 450    *
 451    * @param values
 452    * @return
 453    */
 454   protected Map<String, String> parseAttributeMap(List<String> values)
 455   {
 456     Map<String, String> map = new HashMap<>();
 457     for (String entry : values)
 458     {
 459       String[] fields = entry.split(COMMA);
 460       for (String field : fields)
 461       {
 462         String[] keyValue = field.split("=");
 463         if (keyValue.length == 2)
 464         {
 465           String theKey = StringUtils.urlDecode(keyValue[0],
 466                   GFF_ENCODABLE);
 467           String theValue = StringUtils.urlDecode(keyValue[1],
 468                   GFF_ENCODABLE);
 469           map.put(theKey, theValue);
 470         }
 471       }
 472     }
 473     return map;
 474   }
 475
 476   /**
 477    * Returns any existing mapping held on the alignment between the given
 478    * dataset sequences, or a new one if none found. This is a convenience method
 479    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 480    * mapping, by extending the first mapping as the others are read.
 481    *
 482    * @param align
 483    * @param fromSeq
 484    * @param toSeq
 485    * @return
 486    */
 487   protected AlignedCodonFrame getMapping(AlignmentI align,
 488           SequenceI fromSeq, SequenceI toSeq)
 489   {
 490     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 491     if (acf == null)
 492     {
 493       acf = new AlignedCodonFrame();
 494     }
 495     return acf;
 496   }
 497
 498 }