src/jalview/io/gff/GffHelperBase.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.MappingType;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31 import jalview.util.StringUtils;
  32
  33 import java.util.ArrayList;
  34 import java.util.Arrays;
  35 import java.util.HashMap;
  36 import java.util.List;
  37 import java.util.Map;
  38 import java.util.Map.Entry;
  39
  40 /**
  41  * Base class with common functionality for flavours of GFF handler (GFF2 or
  42  * GFF3)
  43  */
  44 public abstract class GffHelperBase implements GffHelperI
  45 {
  46   protected static final String COMMA = ",";
  47
  48   protected static final String EQUALS = "=";
  49
  50   protected static final String NOTE = "Note";
  51
  52   /*
  53    * GFF columns 1-9 (zero-indexed):
  54    */
  55   protected static final int SEQID_COL = 0;
  56
  57   protected static final int SOURCE_COL = 1;
  58
  59   protected static final int TYPE_COL = 2;
  60
  61   protected static final int START_COL = 3;
  62
  63   protected static final int END_COL = 4;
  64
  65   protected static final int SCORE_COL = 5;
  66
  67   protected static final int STRAND_COL = 6;
  68
  69   protected static final int PHASE_COL = 7;
  70
  71   protected static final int ATTRIBUTES_COL = 8;
  72
  73   private AlignmentI lastmatchedAl = null;
  74
  75   private SequenceIdMatcher matcher = null;
  76
  77   /**
  78    * Constructs and returns a mapping, or null if data appear invalid
  79    *
  80    * @param fromStart
  81    * @param fromEnd
  82    * @param toStart
  83    * @param toEnd
  84    * @param mappingType
  85    *          type of mapping (e.g. protein to nucleotide)
  86    * @return
  87    */
  88   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  89           int toStart, int toEnd, MappingType mappingType)
  90   {
  91     int[] from = new int[] { fromStart, fromEnd };
  92     int[] to = new int[] { toStart, toEnd };
  93
  94     /*
  95      * Jalview always models from dna to protein, so switch values if the
  96      * GFF mapping is from protein to dna
  97      */
  98     if (mappingType == MappingType.PeptideToNucleotide)
  99     {
 100       int[] temp = from;
 101       from = to;
 102       to = temp;
 103       mappingType = mappingType.getInverse();
 104     }
 105
 106     int fromRatio = mappingType.getFromRatio();
 107     int toRatio = mappingType.getToRatio();
 108
 109     /*
 110      * sanity check that mapped residue counts match
 111      * TODO understand why PASA generates such cases...
 112      */
 113     if (!trimMapping(from, to, fromRatio, toRatio))
 114     {
 115       System.err.println("Ignoring mapping from " + Arrays.toString(from)
 116               + " to " + Arrays.toString(to) + " as counts don't match!");
 117       return null;
 118     }
 119
 120     /*
 121      * If a codon has an intron gap, there will be contiguous 'toRanges';
 122      * this is handled for us by the MapList constructor.
 123      * (It is not clear that exonerate ever generates this case)
 124      */
 125
 126     return new MapList(from, to, fromRatio, toRatio);
 127   }
 128
 129   /**
 130    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 131    * tries to trim the end of the longer so they do. Returns true if the
 132    * mappings could be made equivalent, else false. Note the range array values
 133    * may be modified by this method.
 134    *
 135    * @param from
 136    * @param to
 137    * @param fromRatio
 138    * @param toRatio
 139    * @return
 140    */
 141   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 142           int toRatio)
 143   {
 144     int fromLength = Math.abs(from[1] - from[0]) + 1;
 145     int toLength = Math.abs(to[1] - to[0]) + 1;
 146     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 147     if (fromOverlap == 0)
 148     {
 149       return true;
 150     }
 151     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 152     {
 153       /*
 154        * restrict from range to make them match up
 155        * it's kind of arbitrary which end we truncate - here it is the end
 156        */
 157       System.err.print(
 158               "Truncating mapping from " + Arrays.toString(from) + " to ");
 159       if (from[1] > from[0])
 160       {
 161         from[1] -= fromOverlap / toRatio;
 162       }
 163       else
 164       {
 165         from[1] += fromOverlap / toRatio;
 166       }
 167       System.err.println(Arrays.toString(from));
 168       return true;
 169     }
 170     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 171     {
 172       fromOverlap = -fromOverlap; // > 0
 173       /*
 174        * restrict to range to make them match up
 175        */
 176       System.err.print(
 177               "Truncating mapping to " + Arrays.toString(to) + " to ");
 178       if (to[1] > to[0])
 179       {
 180         to[1] -= fromOverlap / fromRatio;
 181       }
 182       else
 183       {
 184         to[1] += fromOverlap / fromRatio;
 185       }
 186       System.err.println(Arrays.toString(to));
 187       return true;
 188     }
 189
 190     /*
 191      * Couldn't truncate to an exact match..
 192      */
 193     return false;
 194   }
 195
 196   /**
 197    * Returns a sequence matching the given id, as follows
 198    * <ul>
 199    * <li>strict matching is on exact sequence name</li>
 200    * <li>relaxed matching allows matching on a token within the sequence name,
 201    * or a dbxref</li>
 202    * <li>first tries to find a match in the alignment sequences</li>
 203    * <li>else tries to find a match in the new sequences already generated while
 204    * parsing the features file</li>
 205    * <li>else creates a new placeholder sequence, adds it to the new sequences
 206    * list, and returns it</li>
 207    * </ul>
 208    *
 209    * @param seqId
 210    * @param align
 211    * @param newseqs
 212    * @param relaxedIdMatching
 213    *
 214    * @return
 215    */
 216   protected SequenceI findSequence(String seqId, AlignmentI align,
 217           List<SequenceI> newseqs, boolean relaxedIdMatching)
 218   {
 219     if (seqId == null)
 220     {
 221       return null;
 222     }
 223     SequenceI match = null;
 224     if (relaxedIdMatching)
 225     {
 226       if (lastmatchedAl != align)
 227       {
 228         lastmatchedAl = align;
 229         matcher = new SequenceIdMatcher(align.getSequencesArray());
 230         if (newseqs != null)
 231         {
 232           matcher.addAll(newseqs);
 233         }
 234       }
 235       match = matcher.findIdMatch(seqId);
 236     }
 237     else
 238     {
 239       match = align.findName(seqId, true);
 240       if (match == null && newseqs != null)
 241       {
 242         for (SequenceI m : newseqs)
 243         {
 244           if (seqId.equals(m.getName()))
 245           {
 246             return m;
 247           }
 248         }
 249       }
 250
 251     }
 252     if (match == null && newseqs != null)
 253     {
 254       match = new SequenceDummy(seqId);
 255       if (relaxedIdMatching)
 256       {
 257         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 258       }
 259       // add dummy sequence to the newseqs list
 260       newseqs.add(match);
 261     }
 262     return match;
 263   }
 264
 265   /**
 266    * Parses the input line to a map of name / value(s) pairs. For example the
 267    * line
 268    *
 269    * <pre>
 270    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
 271    * </pre>
 272    *
 273    * if parsed with delimiter=";" and separators {' ', '='} <br>
 274    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 275    * prediction}, source={Pfam}} <br>
 276    *
 277    * This method supports parsing of either GFF2 format (which uses space ' ' as
 278    * the name/value delimiter, and allows multiple occurrences of the same
 279    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
 280    * strictly does not allow repeat occurrences of the same name - but does
 281    * allow a comma-separated list of values).
 282    * <p>
 283    * Returns a (possibly empty) map of lists of values by attribute name.
 284    *
 285    * @param text
 286    * @param namesDelimiter
 287    *          the major delimiter between name-value pairs
 288    * @param nameValueSeparator
 289    *          separator used between name and value
 290    * @param valuesDelimiter
 291    *          delimits a list of more than one value
 292    * @return
 293    */
 294   public static Map<String, List<String>> parseNameValuePairs(String text,
 295           String namesDelimiter, char nameValueSeparator,
 296           String valuesDelimiter)
 297   {
 298     Map<String, List<String>> map = new HashMap<>();
 299     if (text == null || text.trim().length() == 0)
 300     {
 301       return map;
 302     }
 303
 304     /*
 305      * split by major delimiter (; for GFF3)
 306      */
 307     for (String nameValuePair : text.trim().split(namesDelimiter))
 308     {
 309       nameValuePair = nameValuePair.trim();
 310       if (nameValuePair.length() == 0)
 311       {
 312         continue;
 313       }
 314
 315       /*
 316        * find name/value separator (= for GFF3)
 317        */
 318       int sepPos = nameValuePair.indexOf(nameValueSeparator);
 319       if (sepPos == -1)
 320       {
 321         // no name=value found
 322         continue;
 323       }
 324
 325       String name = nameValuePair.substring(0, sepPos).trim();
 326       String values = nameValuePair.substring(sepPos + 1).trim();
 327       if (values.isEmpty())
 328       {
 329         continue;
 330       }
 331
 332       List<String> vals = map.get(name);
 333       if (vals == null)
 334       {
 335         vals = new ArrayList<>();
 336         map.put(name, vals);
 337       }
 338
 339       /*
 340        * if 'values' contains more name/value separators, parse as a map
 341        * (nested sub-attribute values)
 342        */
 343       if (values.indexOf(nameValueSeparator) != -1)
 344       {
 345         vals.add(values);
 346       }
 347       else
 348       {
 349         for (String val : values.split(valuesDelimiter))
 350         {
 351           vals.add(val);
 352         }
 353       }
 354     }
 355
 356     return map;
 357   }
 358
 359   /**
 360    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 361    * to call this method then adjust the SequenceFeature depending on the
 362    * particular usage of different tools that generate GFF.
 363    *
 364    * @param gff
 365    * @param attributes
 366    * @return
 367    */
 368   protected SequenceFeature buildSequenceFeature(String[] gff,
 369           Map<String, List<String>> attributes)
 370   {
 371     return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
 372   }
 373
 374   /**
 375    * @param gff
 376    * @param typeColumn
 377    * @param group
 378    * @param attributes
 379    * @return
 380    */
 381   protected SequenceFeature buildSequenceFeature(String[] gff,
 382           int typeColumn, String group, Map<String, List<String>> attributes)
 383   {
 384     try
 385     {
 386       int start = Integer.parseInt(gff[START_COL]);
 387       int end = Integer.parseInt(gff[END_COL]);
 388
 389       /*
 390        * default 'score' is 0 rather than Float.NaN - see JAL-2554
 391        */
 392       float score = 0f;
 393       try
 394       {
 395         score = Float.parseFloat(gff[SCORE_COL]);
 396       } catch (NumberFormatException nfe)
 397       {
 398         // e.g. '.' - leave as zero
 399       }
 400
 401       SequenceFeature sf = new SequenceFeature(gff[typeColumn],
 402               gff[SOURCE_COL], start, end, score, group);
 403
 404       sf.setStrand(gff[STRAND_COL]);
 405
 406       sf.setPhase(gff[PHASE_COL]);
 407
 408       if (attributes != null)
 409       {
 410         /*
 411          * Add attributes in column 9 to the sequence feature's
 412          * 'otherData' table; use Note as a best proxy for description;
 413          * decode any encoded comma, equals, semi-colon as per GFF3 spec
 414          */
 415         for (Entry<String, List<String>> attr : attributes.entrySet())
 416         {
 417           String key = attr.getKey();
 418           List<String> values = attr.getValue();
 419           if (values.size() == 1 && values.get(0).contains(EQUALS))
 420           {
 421             /*
 422              * 'value' is actually nested subattributes as x=a,y=b,z=c
 423              */
 424             Map<String, String> valueMap = parseAttributeMap(values.get(0));
 425             sf.setValue(key, valueMap);
 426           }
 427           else
 428           {
 429             String csvValues = StringUtils.listToDelimitedString(values,
 430                     COMMA);
 431             csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
 432             sf.setValue(key, csvValues);
 433             if (NOTE.equals(key))
 434             {
 435               sf.setDescription(csvValues);
 436             }
 437           }
 438         }
 439       }
 440
 441       return sf;
 442     } catch (NumberFormatException nfe)
 443     {
 444       System.err.println("Invalid number in gff: " + nfe.getMessage());
 445       return null;
 446     }
 447   }
 448
 449   /**
 450    * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
 451    * of {@code key,
 452    * value} <br>
 453    * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
 454    *
 455    * <pre>
 456    * a = "b,c"
 457    * d = "e"
 458    * f = "g,h"
 459    * </pre>
 460    *
 461    * @param s
 462    *
 463    * @return
 464    */
 465   protected static Map<String, String> parseAttributeMap(String s)
 466   {
 467     Map<String, String> map = new HashMap<>();
 468     String[] fields = s.split(EQUALS);
 469     int i = 0;
 470     while (i < fields.length - 1)
 471     {
 472       boolean lastPair = i == fields.length - 2;
 473       String before = fields[i];
 474       String after = fields[i + 1];
 475
 476       /*
 477        * if 'key' looks like a,b,c then the last token is the
 478        * key
 479        */
 480       String theKey = before.contains(COMMA)
 481               ? before.substring(before.lastIndexOf(COMMA) + 1)
 482               : before;
 483
 484       /*
 485        * if 'value' looks like a,b,c then all but the last token is the value,
 486        * unless this is the last field (no more = to follow), in which case
 487        * all of it makes up the value
 488        */
 489       String theValue = after.contains(COMMA) && !lastPair
 490               ? after.substring(0, after.lastIndexOf(COMMA))
 491               : after;
 492       map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
 493               StringUtils.urlDecode(theValue, GFF_ENCODABLE));
 494       i += 1;
 495     }
 496
 497     return map;
 498   }
 499
 500   /**
 501    * Returns any existing mapping held on the alignment between the given
 502    * dataset sequences, or a new one if none found. This is a convenience method
 503    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 504    * mapping, by extending the first mapping as the others are read.
 505    *
 506    * @param align
 507    * @param fromSeq
 508    * @param toSeq
 509    * @return
 510    */
 511   protected AlignedCodonFrame getMapping(AlignmentI align,
 512           SequenceI fromSeq, SequenceI toSeq)
 513   {
 514     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 515     if (acf == null)
 516     {
 517       acf = new AlignedCodonFrame();
 518     }
 519     return acf;
 520   }
 521
 522 }