src/jalview/io/gff/GffHelperBase.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.analysis.SequenceIdMatcher;
  24 import jalview.datamodel.AlignedCodonFrame;
  25 import jalview.datamodel.AlignmentI;
  26 import jalview.datamodel.MappingType;
  27 import jalview.datamodel.SequenceDummy;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31 import jalview.util.StringUtils;
  32
  33 import java.util.ArrayList;
  34 import java.util.Arrays;
  35 import java.util.HashMap;
  36 import java.util.List;
  37 import java.util.Map;
  38 import java.util.Map.Entry;
  39
  40 /**
  41  * Base class with common functionality for flavours of GFF handler (GFF2 or
  42  * GFF3)
  43  */
  44 public abstract class GffHelperBase implements GffHelperI
  45 {
  /*
   * name of the column 9 attribute whose value is also used as the
   * feature description (see buildSequenceFeature)
   */
  private static final String NOTE = "Note";

  /*
   * Zero-based array indices of GFF columns 1-9:
   */
  protected static final int SEQID_COL = 0;

  protected static final int SOURCE_COL = 1;

  protected static final int TYPE_COL = 2;

  protected static final int START_COL = 3;

  protected static final int END_COL = 4;

  protected static final int SCORE_COL = 5;

  protected static final int STRAND_COL = 6;

  protected static final int PHASE_COL = 7;

  protected static final int ATTRIBUTES_COL = 8;

  /*
   * the alignment for which 'matcher' was last constructed; findSequence
   * rebuilds the matcher when it is called with a different alignment
   */
  private AlignmentI lastmatchedAl = null;

  /*
   * matcher used by findSequence for relaxed sequence id matching; null
   * until the first relaxed lookup
   */
  private SequenceIdMatcher matcher = null;
  72
  73   /**
  74    * Constructs and returns a mapping, or null if data appear invalid
  75    *
  76    * @param fromStart
  77    * @param fromEnd
  78    * @param toStart
  79    * @param toEnd
  80    * @param mappingType
  81    *          type of mapping (e.g. protein to nucleotide)
  82    * @return
  83    */
  84   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  85           int toStart, int toEnd, MappingType mappingType)
  86   {
  87     int[] from = new int[] { fromStart, fromEnd };
  88     int[] to = new int[] { toStart, toEnd };
  89
  90     /*
  91      * Jalview always models from dna to protein, so switch values if the
  92      * GFF mapping is from protein to dna
  93      */
  94     if (mappingType == MappingType.PeptideToNucleotide)
  95     {
  96       int[] temp = from;
  97       from = to;
  98       to = temp;
  99       mappingType = mappingType.getInverse();
 100     }
 101
 102     int fromRatio = mappingType.getFromRatio();
 103     int toRatio = mappingType.getToRatio();
 104
 105     /*
 106      * sanity check that mapped residue counts match
 107      * TODO understand why PASA generates such cases...
 108      */
 109     if (!trimMapping(from, to, fromRatio, toRatio))
 110     {
 111       System.err.println("Ignoring mapping from " + Arrays.toString(from)
 112               + " to " + Arrays.toString(to) + " as counts don't match!");
 113       return null;
 114     }
 115
 116     /*
 117      * If a codon has an intron gap, there will be contiguous 'toRanges';
 118      * this is handled for us by the MapList constructor.
 119      * (It is not clear that exonerate ever generates this case)
 120      */
 121
 122     return new MapList(from, to, fromRatio, toRatio);
 123   }
 124
 125   /**
 126    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 127    * tries to trim the end of the longer so they do. Returns true if the
 128    * mappings could be made equivalent, else false. Note the range array values
 129    * may be modified by this method.
 130    *
 131    * @param from
 132    * @param to
 133    * @param fromRatio
 134    * @param toRatio
 135    * @return
 136    */
 137   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 138           int toRatio)
 139   {
 140     int fromLength = Math.abs(from[1] - from[0]) + 1;
 141     int toLength = Math.abs(to[1] - to[0]) + 1;
 142     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 143     if (fromOverlap == 0)
 144     {
 145       return true;
 146     }
 147     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 148     {
 149       /*
 150        * restrict from range to make them match up
 151        * it's kind of arbitrary which end we truncate - here it is the end
 152        */
 153       System.err.print(
 154               "Truncating mapping from " + Arrays.toString(from) + " to ");
 155       if (from[1] > from[0])
 156       {
 157         from[1] -= fromOverlap / toRatio;
 158       }
 159       else
 160       {
 161         from[1] += fromOverlap / toRatio;
 162       }
 163       System.err.println(Arrays.toString(from));
 164       return true;
 165     }
 166     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 167     {
 168       fromOverlap = -fromOverlap; // > 0
 169       /*
 170        * restrict to range to make them match up
 171        */
 172       System.err.print(
 173               "Truncating mapping to " + Arrays.toString(to) + " to ");
 174       if (to[1] > to[0])
 175       {
 176         to[1] -= fromOverlap / fromRatio;
 177       }
 178       else
 179       {
 180         to[1] += fromOverlap / fromRatio;
 181       }
 182       System.err.println(Arrays.toString(to));
 183       return true;
 184     }
 185
 186     /*
 187      * Couldn't truncate to an exact match..
 188      */
 189     return false;
 190   }
 191
 192   /**
 193    * Returns a sequence matching the given id, as follows
 194    * <ul>
 195    * <li>strict matching is on exact sequence name</li>
 196    * <li>relaxed matching allows matching on a token within the sequence name,
 197    * or a dbxref</li>
 198    * <li>first tries to find a match in the alignment sequences</li>
 199    * <li>else tries to find a match in the new sequences already generated while
 200    * parsing the features file</li>
 201    * <li>else creates a new placeholder sequence, adds it to the new sequences
 202    * list, and returns it</li>
 203    * </ul>
 204    *
 205    * @param seqId
 206    * @param align
 207    * @param newseqs
 208    * @param relaxedIdMatching
 209    *
 210    * @return
 211    */
 212   protected SequenceI findSequence(String seqId, AlignmentI align,
 213           List<SequenceI> newseqs, boolean relaxedIdMatching)
 214   {
 215     if (seqId == null)
 216     {
 217       return null;
 218     }
 219     SequenceI match = null;
 220     if (relaxedIdMatching)
 221     {
 222       if (lastmatchedAl != align)
 223       {
 224         lastmatchedAl = align;
 225         matcher = new SequenceIdMatcher(align.getSequencesArray());
 226         if (newseqs != null)
 227         {
 228           matcher.addAll(newseqs);
 229         }
 230       }
 231       match = matcher.findIdMatch(seqId);
 232     }
 233     else
 234     {
 235       match = align.findName(seqId, true);
 236       if (match == null && newseqs != null)
 237       {
 238         for (SequenceI m : newseqs)
 239         {
 240           if (seqId.equals(m.getName()))
 241           {
 242             return m;
 243           }
 244         }
 245       }
 246
 247     }
 248     if (match == null && newseqs != null)
 249     {
 250       match = new SequenceDummy(seqId);
 251       if (relaxedIdMatching)
 252       {
 253         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 254       }
 255       // add dummy sequence to the newseqs list
 256       newseqs.add(match);
 257     }
 258     return match;
 259   }
 260
 261   /**
 262    * Parses the input line to a map of name / value(s) pairs. For example the
 263    * line <br>
 264    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
 265    * <br>
 266    * if parsed with delimiter=";" and separators {' ', '='} <br>
 267    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 268    * prediction}, source={Pfam}} <br>
 269    *
 270    * This method supports parsing of either GFF2 format (which uses space ' ' as
 271    * the name/value delimiter, and allows multiple occurrences of the same
 272    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
 273    * strictly does not allow repeat occurrences of the same name - but does
 274    * allow a comma-separated list of values).
 275    *
 276    * @param text
 277    * @param namesDelimiter
 278    *          the major delimiter between name-value pairs
 279    * @param nameValueSeparator
 280    *          one or more separators used between name and value
 281    * @param valuesDelimiter
 282    *          delimits a list of more than one value
 283    * @return the name-values map (which may be empty but never null)
 284    */
 285   public static Map<String, List<String>> parseNameValuePairs(String text,
 286           String namesDelimiter, char nameValueSeparator,
 287           String valuesDelimiter)
 288   {
 289     Map<String, List<String>> map = new HashMap<String, List<String>>();
 290     if (text == null || text.trim().length() == 0)
 291     {
 292       return map;
 293     }
 294
 295     for (String pair : text.trim().split(namesDelimiter))
 296     {
 297       pair = pair.trim();
 298       if (pair.length() == 0)
 299       {
 300         continue;
 301       }
 302
 303       int sepPos = pair.indexOf(nameValueSeparator);
 304       if (sepPos == -1)
 305       {
 306         // no name=value present
 307         continue;
 308       }
 309
 310       String key = pair.substring(0, sepPos).trim();
 311       String values = pair.substring(sepPos + 1).trim();
 312       if (values.length() > 0)
 313       {
 314         List<String> vals = map.get(key);
 315         if (vals == null)
 316         {
 317           vals = new ArrayList<String>();
 318           map.put(key, vals);
 319         }
 320         for (String val : values.split(valuesDelimiter))
 321         {
 322           vals.add(val);
 323         }
 324       }
 325     }
 326     return map;
 327   }
 328
 329   /**
 330    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 331    * to call this method then adjust the SequenceFeature depending on the
 332    * particular usage of different tools that generate GFF.
 333    *
 334    * @param gff
 335    * @param attributes
 336    * @return
 337    */
 338   protected SequenceFeature buildSequenceFeature(String[] gff,
 339           Map<String, List<String>> attributes)
 340   {
 341     try
 342     {
 343       int start = Integer.parseInt(gff[START_COL]);
 344       int end = Integer.parseInt(gff[END_COL]);
 345
 346       /*
 347        * default 'score' is 0 rather than Float.NaN as the latter currently
 348        * disables the 'graduated colour => colour by label' option
 349        */
 350       float score = 0f;
 351       try
 352       {
 353         score = Float.parseFloat(gff[SCORE_COL]);
 354       } catch (NumberFormatException nfe)
 355       {
 356         // e.g. '.' - leave as zero
 357       }
 358
 359       SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
 360               gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);
 361
 362       sf.setStrand(gff[STRAND_COL]);
 363
 364       sf.setPhase(gff[PHASE_COL]);
 365
 366       if (attributes != null)
 367       {
 368         /*
 369          * save 'raw' column 9 to allow roundtrip output as input
 370          */
 371         sf.setAttributes(gff[ATTRIBUTES_COL]);
 372
 373         /*
 374          * Add attributes in column 9 to the sequence feature's
 375          * 'otherData' table; use Note as a best proxy for description
 376          */
 377         for (Entry<String, List<String>> attr : attributes.entrySet())
 378         {
 379           String values = StringUtils.listToDelimitedString(attr.getValue(),
 380                   ",");
 381           sf.setValue(attr.getKey(), values);
 382           if (NOTE.equals(attr.getKey()))
 383           {
 384             sf.setDescription(values);
 385           }
 386         }
 387       }
 388
 389       return sf;
 390     } catch (NumberFormatException nfe)
 391     {
 392       System.err.println("Invalid number in gff: " + nfe.getMessage());
 393       return null;
 394     }
 395   }
 396
  /**
   * Returns the character used to separate attribute names from values in GFF
   * column 9. This is space for GFF2, '=' for GFF3.
   *
   * @return the name/value separator character for the GFF flavour handled by
   *         the implementing class
   */
  protected abstract char getNameValueSeparator();
 404
 405   /**
 406    * Returns any existing mapping held on the alignment between the given
 407    * dataset sequences, or a new one if none found. This is a convenience method
 408    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 409    * mapping, by extending the first mapping as the others are read.
 410    *
 411    * @param align
 412    * @param fromSeq
 413    * @param toSeq
 414    * @return
 415    */
 416   protected AlignedCodonFrame getMapping(AlignmentI align,
 417           SequenceI fromSeq, SequenceI toSeq)
 418   {
 419     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 420     if (acf == null)
 421     {
 422       acf = new AlignedCodonFrame();
 423     }
 424     return acf;
 425   }
 426
 427 }