src/jalview/io/gff/GffHelperBase.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.MappingType;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31 import jalview.util.StringUtils;
  32
  33 import java.util.ArrayList;
  34 import java.util.Arrays;
  35 import java.util.HashMap;
  36 import java.util.List;
  37 import java.util.Map;
  38 import java.util.Map.Entry;
  39
  40 /**
  41  * Base class with common functionality for flavours of GFF handler (GFF2 or
  42  * GFF3)
  43  */
  44 public abstract class GffHelperBase implements GffHelperI
  45 {
  /*
   * Prefix of the message logged when a nested attribute string cannot be
   * parsed by parseAttributeMap
   */
  private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";

  /*
   * Delimiter used when joining attribute values into one string, and
   * between entries of a nested sub-attribute list
   */
  protected static final String COMMA = ",";

  /*
   * Separator between key and value in a nested sub-attribute list
   */
  protected static final String EQUALS = "=";

  /*
   * Name of the attribute whose value is also used as the feature description
   */
  protected static final String NOTE = "Note";

  /*
   * GFF columns 1-9 (zero-indexed):
   */
  protected static final int SEQID_COL = 0;

  protected static final int SOURCE_COL = 1;

  protected static final int TYPE_COL = 2;

  protected static final int START_COL = 3;

  protected static final int END_COL = 4;

  protected static final int SCORE_COL = 5;

  protected static final int STRAND_COL = 6;

  protected static final int PHASE_COL = 7;

  protected static final int ATTRIBUTES_COL = 8;

  /*
   * The alignment for which 'matcher' was built; findSequence rebuilds the
   * matcher when it is called with a different alignment (relaxed id
   * matching only)
   */
  private AlignmentI lastmatchedAl = null;

  /*
   * Cached sequence id matcher used by findSequence for relaxed id matching
   */
  private SequenceIdMatcher matcher = null;
  78
  79   /**
  80    * Constructs and returns a mapping, or null if data appear invalid
  81    *
  82    * @param fromStart
  83    * @param fromEnd
  84    * @param toStart
  85    * @param toEnd
  86    * @param mappingType
  87    *          type of mapping (e.g. protein to nucleotide)
  88    * @return
  89    */
  90   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  91           int toStart, int toEnd, MappingType mappingType)
  92   {
  93     int[] from = new int[] { fromStart, fromEnd };
  94     int[] to = new int[] { toStart, toEnd };
  95
  96     /*
  97      * Jalview always models from dna to protein, so switch values if the
  98      * GFF mapping is from protein to dna
  99      */
 100     if (mappingType == MappingType.PeptideToNucleotide)
 101     {
 102       int[] temp = from;
 103       from = to;
 104       to = temp;
 105       mappingType = mappingType.getInverse();
 106     }
 107
 108     int fromRatio = mappingType.getFromRatio();
 109     int toRatio = mappingType.getToRatio();
 110
 111     /*
 112      * sanity check that mapped residue counts match
 113      * TODO understand why PASA generates such cases...
 114      */
 115     if (!trimMapping(from, to, fromRatio, toRatio))
 116     {
 117       jalview.bin.Console.errPrintln(
 118               "Ignoring mapping from " + Arrays.toString(from) + " to "
 119                       + Arrays.toString(to) + " as counts don't match!");
 120       return null;
 121     }
 122
 123     /*
 124      * If a codon has an intron gap, there will be contiguous 'toRanges';
 125      * this is handled for us by the MapList constructor.
 126      * (It is not clear that exonerate ever generates this case)
 127      */
 128
 129     return new MapList(from, to, fromRatio, toRatio);
 130   }
 131
 132   /**
 133    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 134    * tries to trim the end of the longer so they do. Returns true if the
 135    * mappings could be made equivalent, else false. Note the range array values
 136    * may be modified by this method.
 137    *
 138    * @param from
 139    * @param to
 140    * @param fromRatio
 141    * @param toRatio
 142    * @return
 143    */
 144   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 145           int toRatio)
 146   {
 147     int fromLength = Math.abs(from[1] - from[0]) + 1;
 148     int toLength = Math.abs(to[1] - to[0]) + 1;
 149     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 150     if (fromOverlap == 0)
 151     {
 152       return true;
 153     }
 154     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 155     {
 156       /*
 157        * restrict from range to make them match up
 158        * it's kind of arbitrary which end we truncate - here it is the end
 159        */
 160       System.err.print(
 161               "Truncating mapping from " + Arrays.toString(from) + " to ");
 162       if (from[1] > from[0])
 163       {
 164         from[1] -= fromOverlap / toRatio;
 165       }
 166       else
 167       {
 168         from[1] += fromOverlap / toRatio;
 169       }
 170       jalview.bin.Console.errPrintln(Arrays.toString(from));
 171       return true;
 172     }
 173     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 174     {
 175       fromOverlap = -fromOverlap; // > 0
 176       /*
 177        * restrict to range to make them match up
 178        */
 179       System.err.print(
 180               "Truncating mapping to " + Arrays.toString(to) + " to ");
 181       if (to[1] > to[0])
 182       {
 183         to[1] -= fromOverlap / fromRatio;
 184       }
 185       else
 186       {
 187         to[1] += fromOverlap / fromRatio;
 188       }
 189       jalview.bin.Console.errPrintln(Arrays.toString(to));
 190       return true;
 191     }
 192
 193     /*
 194      * Couldn't truncate to an exact match..
 195      */
 196     return false;
 197   }
 198
 199   /**
 200    * Returns a sequence matching the given id, as follows
 201    * <ul>
 202    * <li>strict matching is on exact sequence name</li>
 203    * <li>relaxed matching allows matching on a token within the sequence name,
 204    * or a dbxref</li>
 205    * <li>first tries to find a match in the alignment sequences</li>
 206    * <li>else tries to find a match in the new sequences already generated while
 207    * parsing the features file</li>
 208    * <li>else creates a new placeholder sequence, adds it to the new sequences
 209    * list, and returns it</li>
 210    * </ul>
 211    *
 212    * @param seqId
 213    * @param align
 214    * @param newseqs
 215    * @param relaxedIdMatching
 216    *
 217    * @return
 218    */
 219   protected SequenceI findSequence(String seqId, AlignmentI align,
 220           List<SequenceI> newseqs, boolean relaxedIdMatching)
 221   {
 222     if (seqId == null)
 223     {
 224       return null;
 225     }
 226     SequenceI match = null;
 227     if (relaxedIdMatching)
 228     {
 229       if (lastmatchedAl != align)
 230       {
 231         lastmatchedAl = align;
 232         matcher = new SequenceIdMatcher(align.getSequencesArray());
 233         if (newseqs != null)
 234         {
 235           matcher.addAll(newseqs);
 236         }
 237       }
 238       match = matcher.findIdMatch(seqId);
 239     }
 240     else
 241     {
 242       match = align.findName(seqId, true);
 243       if (match == null && newseqs != null)
 244       {
 245         for (SequenceI m : newseqs)
 246         {
 247           if (seqId.equals(m.getName()))
 248           {
 249             return m;
 250           }
 251         }
 252       }
 253
 254     }
 255     if (match == null && newseqs != null)
 256     {
 257       match = new SequenceDummy(seqId);
 258       if (relaxedIdMatching)
 259       {
 260         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 261       }
 262       // add dummy sequence to the newseqs list
 263       newseqs.add(match);
 264     }
 265     return match;
 266   }
 267
 268   /**
 269    * Parses the input line to a map of name / value(s) pairs. For example the
 270    * line
 271    *
 272    * <pre>
 273    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
 274    * </pre>
 275    *
 276    * if parsed with delimiter=";" and separators {' ', '='} <br>
 277    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 278    * prediction}, source={Pfam}} <br>
 279    *
 280    * This method supports parsing of either GFF2 format (which uses space ' ' as
 281    * the name/value delimiter, and allows multiple occurrences of the same
 282    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
 283    * strictly does not allow repeat occurrences of the same name - but does
 284    * allow a comma-separated list of values).
 285    * <p>
 286    * Returns a (possibly empty) map of lists of values by attribute name.
 287    *
 288    * @param text
 289    * @param namesDelimiter
 290    *          the major delimiter between name-value pairs
 291    * @param nameValueSeparator
 292    *          separator used between name and value
 293    * @param valuesDelimiter
 294    *          delimits a list of more than one value
 295    * @return
 296    */
 297   public static Map<String, List<String>> parseNameValuePairs(String text,
 298           String namesDelimiter, char nameValueSeparator,
 299           String valuesDelimiter)
 300   {
 301     Map<String, List<String>> map = new HashMap<>();
 302     if (text == null || text.trim().length() == 0)
 303     {
 304       return map;
 305     }
 306
 307     /*
 308      * split by major delimiter (; for GFF3)
 309      */
 310     for (String nameValuePair : text.trim().split(namesDelimiter))
 311     {
 312       nameValuePair = nameValuePair.trim();
 313       if (nameValuePair.length() == 0)
 314       {
 315         continue;
 316       }
 317
 318       /*
 319        * find name/value separator (= for GFF3)
 320        */
 321       int sepPos = nameValuePair.indexOf(nameValueSeparator);
 322       if (sepPos == -1)
 323       {
 324         // no name=value found
 325         continue;
 326       }
 327
 328       String name = nameValuePair.substring(0, sepPos).trim();
 329       String values = nameValuePair.substring(sepPos + 1).trim();
 330       if (values.isEmpty())
 331       {
 332         continue;
 333       }
 334
 335       List<String> vals = map.get(name);
 336       if (vals == null)
 337       {
 338         vals = new ArrayList<>();
 339         map.put(name, vals);
 340       }
 341
 342       /*
 343        * if 'values' contains more name/value separators, parse as a map
 344        * (nested sub-attribute values)
 345        */
 346       if (values.indexOf(nameValueSeparator) != -1)
 347       {
 348         vals.add(values);
 349       }
 350       else
 351       {
 352         for (String val : values.split(valuesDelimiter))
 353         {
 354           vals.add(val);
 355         }
 356       }
 357     }
 358
 359     return map;
 360   }
 361
 362   /**
 363    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 364    * to call this method then adjust the SequenceFeature depending on the
 365    * particular usage of different tools that generate GFF.
 366    *
 367    * @param gff
 368    * @param attributes
 369    * @return
 370    */
 371   protected SequenceFeature buildSequenceFeature(String[] gff,
 372           Map<String, List<String>> attributes)
 373   {
 374     return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
 375   }
 376
 377   /**
 378    * @param gff
 379    * @param typeColumn
 380    * @param group
 381    * @param attributes
 382    * @return
 383    */
 384   protected SequenceFeature buildSequenceFeature(String[] gff,
 385           int typeColumn, String group,
 386           Map<String, List<String>> attributes)
 387   {
 388     try
 389     {
 390       int start = Integer.parseInt(gff[START_COL]);
 391       int end = Integer.parseInt(gff[END_COL]);
 392
 393       /*
 394        * default 'score' is 0 rather than Float.NaN - see JAL-2554
 395        */
 396       float score = 0f;
 397       try
 398       {
 399         score = Float.parseFloat(gff[SCORE_COL]);
 400       } catch (NumberFormatException nfe)
 401       {
 402         // e.g. '.' - leave as zero
 403       }
 404
 405       SequenceFeature sf = new SequenceFeature(gff[typeColumn],
 406               gff[SOURCE_COL], start, end, score, group);
 407
 408       sf.setStrand(gff[STRAND_COL]);
 409
 410       sf.setPhase(gff[PHASE_COL]);
 411
 412       if (attributes != null)
 413       {
 414         /*
 415          * Add attributes in column 9 to the sequence feature's
 416          * 'otherData' table; use Note as a best proxy for description;
 417          * decode any encoded comma, equals, semi-colon as per GFF3 spec
 418          */
 419         for (Entry<String, List<String>> attr : attributes.entrySet())
 420         {
 421           String key = attr.getKey();
 422           List<String> values = attr.getValue();
 423           if (values.size() == 1 && values.get(0).contains(EQUALS))
 424           {
 425             /*
 426              * 'value' is actually nested subattributes as x=a,y=b,z=c
 427              */
 428             Map<String, String> valueMap = parseAttributeMap(values.get(0));
 429             sf.setValue(key, valueMap);
 430           }
 431           else
 432           {
 433             String csvValues = StringUtils.listToDelimitedString(values,
 434                     COMMA);
 435             csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
 436             sf.setValue(key, csvValues);
 437             if (NOTE.equals(key))
 438             {
 439               sf.setDescription(csvValues);
 440             }
 441           }
 442         }
 443       }
 444
 445       return sf;
 446     } catch (NumberFormatException nfe)
 447     {
 448       jalview.bin.Console
 449               .errPrintln("Invalid number in gff: " + nfe.getMessage());
 450       return null;
 451     }
 452   }
 453
 454   /**
 455    * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
 456    * of {@code key,
 457    * value} <br>
 458    * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
 459    *
 460    * <pre>
 461    * a = "b,c"
 462    * d = "e"
 463    * f = "g,h"
 464    * </pre>
 465    *
 466    * @param s
 467    *
 468    * @return
 469    */
 470   protected static Map<String, String> parseAttributeMap(String s)
 471   {
 472     Map<String, String> map = new HashMap<>();
 473     String[] fields = s.split(EQUALS);
 474
 475     /*
 476      * format validation
 477      */
 478     boolean valid = true;
 479     if (fields.length < 2)
 480     {
 481       /*
 482        * need at least A=B here
 483        */
 484       valid = false;
 485     }
 486     else if (fields[0].isEmpty() || fields[0].contains(COMMA))
 487     {
 488       /*
 489        * A,B=C is not a valid start, nor is =C
 490        */
 491       valid = false;
 492     }
 493     else
 494     {
 495       for (int i = 1; i < fields.length - 1; i++)
 496       {
 497         if (fields[i].isEmpty() || !fields[i].contains(COMMA))
 498         {
 499           /*
 500            * intermediate tokens must include value,name
 501            */
 502           valid = false;
 503         }
 504       }
 505     }
 506
 507     if (!valid)
 508     {
 509       jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
 510       return map;
 511     }
 512
 513     int i = 0;
 514     while (i < fields.length - 1)
 515     {
 516       boolean lastPair = i == fields.length - 2;
 517       String before = fields[i];
 518       String after = fields[i + 1];
 519
 520       /*
 521        * if 'key' looks like a,b,c then the last token is the
 522        * key
 523        */
 524       String theKey = before.contains(COMMA)
 525               ? before.substring(before.lastIndexOf(COMMA) + 1)
 526               : before;
 527
 528       theKey = theKey.trim();
 529       if (theKey.isEmpty())
 530       {
 531         jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
 532         map.clear();
 533         return map;
 534       }
 535
 536       /*
 537        * if 'value' looks like a,b,c then all but the last token is the value,
 538        * unless this is the last field (no more = to follow), in which case
 539        * all of it makes up the value
 540        */
 541       String theValue = after.contains(COMMA) && !lastPair
 542               ? after.substring(0, after.lastIndexOf(COMMA))
 543               : after;
 544       map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
 545               StringUtils.urlDecode(theValue, GFF_ENCODABLE));
 546       i += 1;
 547     }
 548
 549     return map;
 550   }
 551
 552   /**
 553    * Returns any existing mapping held on the alignment between the given
 554    * dataset sequences, or a new one if none found. This is a convenience method
 555    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 556    * mapping, by extending the first mapping as the others are read.
 557    *
 558    * @param align
 559    * @param fromSeq
 560    * @param toSeq
 561    * @return
 562    */
 563   protected AlignedCodonFrame getMapping(AlignmentI align,
 564           SequenceI fromSeq, SequenceI toSeq)
 565   {
 566     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 567     if (acf == null)
 568     {
 569       acf = new AlignedCodonFrame();
 570     }
 571     return acf;
 572   }
 573
 574 }