src/jalview/io/gff/GffHelperBase.java

   1 package jalview.io.gff;
   2
   3 import jalview.analysis.SequenceIdMatcher;
   4 import jalview.datamodel.AlignedCodonFrame;
   5 import jalview.datamodel.AlignmentI;
   6 import jalview.datamodel.MappingType;
   7 import jalview.datamodel.SequenceDummy;
   8 import jalview.datamodel.SequenceFeature;
   9 import jalview.datamodel.SequenceI;
  10 import jalview.util.MapList;
  11 import jalview.util.StringUtils;
  12
  13 import java.util.ArrayList;
  14 import java.util.Arrays;
  15 import java.util.HashMap;
  16 import java.util.List;
  17 import java.util.Map;
  18 import java.util.Map.Entry;
  19
  20 /**
  21  * Base class with common functionality for flavours of GFF handler (GFF2 or
  22  * GFF3)
  23  */
  24 public abstract class GffHelperBase implements GffHelperI
  25 {
  26   private static final String NOTE = "Note";
  27
  /*
   * Zero-based array indices of the nine GFF columns (GFF columns 1-9):
   */
  31   protected static final int SEQID_COL = 0;
  32
  33   protected static final int SOURCE_COL = 1;
  34
  35   protected static final int TYPE_COL = 2;
  36
  37   protected static final int START_COL = 3;
  38
  39   protected static final int END_COL = 4;
  40
  41   protected static final int SCORE_COL = 5;
  42
  43   protected static final int STRAND_COL = 6;
  44
  45   protected static final int PHASE_COL = 7;
  46
  47   protected static final int ATTRIBUTES_COL = 8;
  48
  49   private AlignmentI lastmatchedAl = null;
  50
  51   private SequenceIdMatcher matcher = null;
  52
  53   /**
  54    * Constructs and returns a mapping, or null if data appear invalid
  55    *
  56    * @param fromStart
  57    * @param fromEnd
  58    * @param toStart
  59    * @param toEnd
  60    * @param mappingType
  61    *          type of mapping (e.g. protein to nucleotide)
  62    * @return
  63    */
  64   protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
  65           int toStart, int toEnd, MappingType mappingType)
  66   {
  67     int[] from = new int[] { fromStart, fromEnd };
  68     int[] to = new int[] { toStart, toEnd };
  69
  70     /*
  71      * Jalview always models from dna to protein, so switch values if the
  72      * GFF mapping is from protein to dna
  73      */
  74     if (mappingType == MappingType.PeptideToNucleotide)
  75     {
  76       int[] temp = from;
  77       from = to;
  78       to = temp;
  79       mappingType = mappingType.getInverse();
  80     }
  81
  82     int fromRatio = mappingType.getFromRatio();
  83     int toRatio = mappingType.getToRatio();
  84
  85     /*
  86      * sanity check that mapped residue counts match
  87      * TODO understand why PASA generates such cases...
  88      */
  89     if (!trimMapping(from, to, fromRatio, toRatio))
  90     {
  91       System.err.println("Ignoring mapping from " + Arrays.toString(from)
  92               + " to " + Arrays.toString(to) + " as counts don't match!");
  93       return null;
  94     }
  95
  96     /*
  97      * If a codon has an intron gap, there will be contiguous 'toRanges';
  98      * this is handled for us by the MapList constructor.
  99      * (It is not clear that exonerate ever generates this case)
 100      */
 101
 102     return new MapList(from, to, fromRatio, toRatio);
 103   }
 104
 105   /**
 106    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
 107    * tries to trim the end of the longer so they do. Returns true if the
 108    * mappings could be made equivalent, else false. Note the range array values
 109    * may be modified by this method.
 110    *
 111    * @param from
 112    * @param to
 113    * @param fromRatio
 114    * @param toRatio
 115    * @return
 116    */
 117   protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
 118           int toRatio)
 119   {
 120     int fromLength = Math.abs(from[1] - from[0]) + 1;
 121     int toLength = Math.abs(to[1] - to[0]) + 1;
 122     int fromOverlap = fromLength * toRatio - toLength * fromRatio;
 123     if (fromOverlap == 0)
 124     {
 125       return true;
 126     }
 127     if (fromOverlap > 0 && fromOverlap % toRatio == 0)
 128     {
 129       /*
 130        * restrict from range to make them match up
 131        * it's kind of arbitrary which end we truncate - here it is the end
 132        */
 133       System.err.print("Truncating mapping from " + Arrays.toString(from)
 134               + " to ");
 135       if (from[1] > from[0])
 136       {
 137         from[1] -= fromOverlap / toRatio;
 138       }
 139       else
 140       {
 141         from[1] += fromOverlap / toRatio;
 142       }
 143       System.err.println(Arrays.toString(from));
 144       return true;
 145     }
 146     else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
 147     {
 148       fromOverlap = -fromOverlap; // > 0
 149       /*
 150        * restrict to range to make them match up
 151        */
 152       System.err.print("Truncating mapping to " + Arrays.toString(to)
 153               + " to ");
 154       if (to[1] > to[0])
 155       {
 156         to[1] -= fromOverlap / fromRatio;
 157       }
 158       else
 159       {
 160         to[1] += fromOverlap / fromRatio;
 161       }
 162       System.err.println(Arrays.toString(to));
 163       return true;
 164     }
 165
 166     /*
 167      * Couldn't truncate to an exact match..
 168      */
 169     return false;
 170   }
 171
 172   /**
 173    * Returns a sequence matching the given id, as follows
 174    * <ul>
 175    * <li>strict matching is on exact sequence name</li>
 176    * <li>relaxed matching allows matching on a token within the sequence name,
 177    * or a dbxref</li>
 178    * <li>first tries to find a match in the alignment sequences</li>
 179    * <li>else tries to find a match in the new sequences already generated while
 180    * parsing the features file</li>
 181    * <li>else creates a new placeholder sequence, adds it to the new sequences
 182    * list, and returns it</li>
 183    * </ul>
 184    *
 185    * @param seqId
 186    * @param align
 187    * @param newseqs
 188    * @param relaxedIdMatching
 189    *
 190    * @return
 191    */
 192   protected SequenceI findSequence(String seqId, AlignmentI align,
 193           List<SequenceI> newseqs, boolean relaxedIdMatching)
 194   {
 195     if (seqId == null)
 196     {
 197       return null;
 198     }
 199     SequenceI match = null;
 200     if (relaxedIdMatching)
 201     {
 202       if (lastmatchedAl != align)
 203       {
 204         lastmatchedAl = align;
 205         matcher = new SequenceIdMatcher(align.getSequencesArray());
 206         if (newseqs != null)
 207         {
 208           matcher.addAll(newseqs);
 209         }
 210       }
 211       match = matcher.findIdMatch(seqId);
 212     }
 213     else
 214     {
 215       match = align.findName(seqId, true);
 216       if (match == null && newseqs != null)
 217       {
 218         for (SequenceI m : newseqs)
 219         {
 220           if (seqId.equals(m.getName()))
 221           {
 222             return m;
 223           }
 224         }
 225       }
 226
 227     }
 228     if (match == null && newseqs != null)
 229     {
 230       match = new SequenceDummy(seqId);
 231       if (relaxedIdMatching)
 232       {
 233         matcher.addAll(Arrays.asList(new SequenceI[] { match }));
 234       }
 235       // add dummy sequence to the newseqs list
 236       newseqs.add(match);
 237     }
 238     return match;
 239   }
 240
 241   /**
 242    * Parses the input line to a map of name / value(s) pairs. For example the
 243    * line <br>
 244    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal <br>
 245    * if parsed with delimiter=";" and separators {' ', '='} <br>
 246    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
 247    * prediction}, source={Pfam}} <br>
 248    *
 249    * This method supports parsing of either GFF2 format (which uses space ' ' as
 250    * the name/value delimiter, and allows multiple occurrences of the same
 251    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
 252    * strictly does not allow repeat occurrences of the same name - but does
 253    * allow a comma-separated list of values).
 254    *
 255    * @param text
 256    * @param namesDelimiter
 257    *          the major delimiter between name-value pairs
 258    * @param nameValueSeparator
 259    *          one or more separators used between name and value
 260    * @param valuesDelimiter
 261    *          delimits a list of more than one value
 262    * @return the name-values map (which may be empty but never null)
 263    */
 264   public static Map<String, List<String>> parseNameValuePairs(String text,
 265           String namesDelimiter, char nameValueSeparator,
 266           String valuesDelimiter)
 267   {
 268     Map<String, List<String>> map = new HashMap<String, List<String>>();
 269     if (text == null || text.trim().length() == 0)
 270     {
 271       return map;
 272     }
 273
 274     for (String pair : text.trim().split(namesDelimiter))
 275     {
 276       pair = pair.trim();
 277       if (pair.length() == 0)
 278       {
 279         continue;
 280       }
 281
 282       int sepPos = pair.indexOf(nameValueSeparator);
 283       if (sepPos == -1)
 284       {
 285         // no name=value present
 286         continue;
 287       }
 288
 289       String key = pair.substring(0, sepPos).trim();
 290       String values = pair.substring(sepPos + 1).trim();
 291       if (values.length() > 0)
 292       {
 293         List<String> vals = map.get(key);
 294         if (vals == null)
 295         {
 296           vals = new ArrayList<String>();
 297           map.put(key, vals);
 298         }
 299         for (String val : values.split(valuesDelimiter))
 300         {
 301           vals.add(val);
 302         }
 303       }
 304     }
 305     return map;
 306   }
 307
 308   /**
 309    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
 310    * to call this method then adjust the SequenceFeature depending on the
 311    * particular usage of different tools that generate GFF.
 312    *
 313    * @param gff
 314    * @param attributes
 315    * @return
 316    */
 317   protected SequenceFeature buildSequenceFeature(String[] gff,
 318           Map<String, List<String>> attributes)
 319   {
 320     try
 321     {
 322       int start = Integer.parseInt(gff[START_COL]);
 323       int end = Integer.parseInt(gff[END_COL]);
 324
 325       /*
 326        * default 'score' is 0 rather than Float.NaN as the latter currently
 327        * disables the 'graduated colour => colour by label' option
 328        */
 329       float score = 0f;
 330       try
 331       {
 332         score = Float.parseFloat(gff[SCORE_COL]);
 333       } catch (NumberFormatException nfe)
 334       {
 335         // e.g. '.' - leave as zero
 336       }
 337
 338       SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
 339               gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);
 340
 341       sf.setStrand(gff[STRAND_COL]);
 342
 343       sf.setPhase(gff[PHASE_COL]);
 344
 345       if (attributes != null)
 346       {
 347         /*
 348          * save 'raw' column 9 to allow roundtrip output as input
 349          */
 350         sf.setAttributes(gff[ATTRIBUTES_COL]);
 351
 352         /*
 353          * Add attributes in column 9 to the sequence feature's
 354          * 'otherData' table; use Note as a best proxy for description
 355          */
 356         for (Entry<String, List<String>> attr : attributes.entrySet())
 357         {
 358           String values = StringUtils.listToDelimitedString(
 359                   attr.getValue(), ",");
 360           sf.setValue(attr.getKey(), values);
 361           if (NOTE.equals(attr.getKey()))
 362           {
 363             sf.setDescription(values);
 364           }
 365         }
 366       }
 367
 368       return sf;
 369     } catch (NumberFormatException nfe)
 370     {
 371       System.err.println("Invalid number in gff: " + nfe.getMessage());
 372       return null;
 373     }
 374   }
 375
  /**
   * Returns the character used to separate an attribute name from its
   * value(s) in GFF column 9. This is space for GFF2, '=' for GFF3.
   * Implemented by each concrete GFF flavour helper.
   *
   * @return the name/value separator character for this GFF flavour
   */
  protected abstract char getNameValueSeparator();
 383
 384   /**
 385    * Returns any existing mapping held on the alignment between the given
 386    * dataset sequences, or a new one if none found. This is a convenience method
 387    * to facilitate processing multiple GFF lines that make up a single 'spliced'
 388    * mapping, by extending the first mapping as the others are read.
 389    *
 390    * @param align
 391    * @param fromSeq
 392    * @param toSeq
 393    * @return
 394    */
 395   protected AlignedCodonFrame getMapping(AlignmentI align,
 396           SequenceI fromSeq, SequenceI toSeq)
 397   {
 398     AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
 399     if (acf == null)
 400     {
 401       acf = new AlignedCodonFrame();
 402     }
 403     return acf;
 404   }
 405
 406 }