src/jalview/io/gff/Gff3Helper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.util.MapList;
   9 import jalview.util.StringUtils;
  10
  11 import java.io.IOException;
  12 import java.util.List;
  13 import java.util.Map;
  14
  15 /**
  16  * Base class with generic / common functionality for processing GFF3 data.
  17  * Override this as required for any specialisations resulting from
  18  * peculiarities of GFF3 generated by particular tools.
  19  */
  20 public class Gff3Helper extends GffHelperBase
  21 {
  22   protected static final String TARGET = "Target";
  23
  24   protected static final String ID = "ID";
  25
  26   private static final String NAME = "Name";
  27
  28   /**
  29    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  30    * separate multiple values for a name
  31    *
  32    * @param text
  33    * @return
  34    */
  35   public static Map<String, List<String>> parseNameValuePairs(String text)
  36   {
  37     return parseNameValuePairs(text, ";", '=', ",");
  38   }
  39
  40   /**
  41    * Process one GFF feature line (as modelled by SequenceFeature)
  42    *
  43    * @param seq
  44    *          the sequence with which this feature is associated
  45    * @param sf
  46    *          the sequence feature with ATTRIBUTES property containing any
  47    *          additional attributes
  48    * @param align
  49    *          the alignment we are adding GFF to
  50    * @param newseqs
  51    *          any new sequences referenced by the GFF
  52    * @param relaxedIdMatching
  53    *          if true, match word tokens in sequence names
  54    * @return true if the sequence feature should be added to the sequence, else
  55    *         false (i.e. it has been processed in another way e.g. to generate a
  56    *         mapping)
  57    * @throws IOException
  58    */
  59   @Override
  60   public SequenceFeature processGff(SequenceI seq, String[] gff,
  61           AlignmentI align, List<SequenceI> newseqs,
  62           boolean relaxedIdMatching) throws IOException
  63   {
  64     /*
  65      * (For now) we don't process mappings from reverse complement ; to do
  66      * this would require (a) creating a virtual sequence placeholder for
  67      * the reverse complement (b) resolving the sequence by its id from some
  68      * source (GFF ##FASTA or other) (c) creating the reverse complement
  69      * sequence (d) updating the mapping to be to the reverse complement
  70      */
  71     if ("-".equals(gff[STRAND_COL]))
  72     {
  73       System.err
  74               .println("Skipping mapping from reverse complement as not yet supported");
  75       return null;
  76     }
  77     SequenceFeature sf = null;
  78
  79     if (gff.length == 9)
  80     {
  81       String soTerm = gff[TYPE_COL];
  82       String atts = gff[ATTRIBUTES_COL];
  83       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  84
  85       if (SequenceOntology.getInstance().isProteinMatch(soTerm))
  86       {
  87         sf = processProteinMatch(attributes, seq, gff, align,
  88                 newseqs, relaxedIdMatching);
  89       }
  90       else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm))
  91       {
  92         sf = processNucleotideMatch(attributes, seq, gff, align,
  93                 newseqs, relaxedIdMatching);
  94       }
  95       else
  96       {
  97         sf = buildSequenceFeature(gff, attributes);
  98       }
  99     }
 100     else
 101     {
 102       /*
 103        * fall back on generating a sequence feature with no special processing
 104        */
 105       sf = buildSequenceFeature(gff, null);
 106     }
 107
 108     return sf;
 109   }
 110
 111   /**
 112    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 113    *
 114    * @param attributes
 115    *          parsed GFF column 9 key/value(s)
 116    * @param seq
 117    *          the sequence the GFF feature is on
 118    * @param gffColumns
 119    *          the GFF column data
 120    * @param align
 121    *          the alignment the sequence belongs to, where any new mappings
 122    *          should be added
 123    * @param newseqs
 124    *          a list of new 'virtual sequences' generated while parsing GFF
 125    * @param relaxedIdMatching
 126    *          if true allow fuzzy search for a matching target sequence
 127    * @return a sequence feature, if one should be added to the sequence, else
 128    *         null
 129    * @throws IOException
 130    */
 131   protected SequenceFeature processNucleotideMatch(
 132           Map<String, List<String>> attributes, SequenceI seq,
 133           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 134           boolean relaxedIdMatching)
 135           throws IOException
 136   {
 137     String strand = gffColumns[STRAND_COL];
 138     if ("-1".equals(strand))
 139     {
 140       System.err
 141               .println("Currently ignoring mappings from reverse complement");
 142       return null;
 143     }
 144
 145     List<String> targets = attributes.get(TARGET);
 146     if (targets == null)
 147     {
 148       System.err.println("'Target' missing in GFF");
 149       return null;
 150     }
 151
 152     /*
 153      * Typically we only expect one Target per GFF line, but this can handle
 154      * multiple matches, to the same or different sequences (e.g. dna variants)
 155      */
 156     for (String target : targets)
 157     {
 158       /*
 159        * Process "seqid start end [strand]"
 160        */
 161       String[] tokens = target.split(" ");
 162       if (tokens.length < 3)
 163       {
 164         System.err.println("Incomplete Target: " + target);
 165         continue;
 166       }
 167
 168       /*
 169        * Locate the mapped sequence in the alignment, or as a
 170        * (new or existing) virtual sequence in the newseqs list
 171        */
 172       String targetId = findTargetId(tokens[0], attributes);
 173       SequenceI mappedSequence1 = findSequence(targetId, align,
 174       newseqs, relaxedIdMatching);
 175       SequenceI mappedSequence = mappedSequence1;
 176       if (mappedSequence == null)
 177       {
 178         continue;
 179       }
 180
 181       /*
 182        * get any existing mapping for these sequences (or start one),
 183        * and add this mapped range
 184        */
 185       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 186
 187       try
 188       {
 189         int toStart = Integer.parseInt(tokens[1]);
 190         int toEnd = Integer.parseInt(tokens[2]);
 191         if (tokens.length > 3 && "-".equals(tokens[3]))
 192         {
 193           // mapping to reverse strand - swap start/end
 194           int temp = toStart;
 195           toStart = toEnd;
 196           toEnd = temp;
 197         }
 198
 199         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 200         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 201         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 202                 toStart, toEnd,
 203                 MappingType.NucleotideToNucleotide);
 204
 205         if (mapping != null)
 206         {
 207           acf.addMap(seq, mappedSequence, mapping);
 208           align.addCodonFrame(acf);
 209         }
 210       } catch (NumberFormatException nfe)
 211       {
 212         System.err.println("Invalid start or end in Target " + target);
 213       }
 214     }
 215
 216     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 217     return sf;
 218   }
 219
 220   /**
 221    * Returns the target sequence id extracted from the GFF name/value pairs.
 222    * Default (standard behaviour) is the first token for "Target". This may be
 223    * overridden where tools report this in a non-standard way.
 224    *
 225    * @param target
 226    *          first token of a "Target" value from GFF column 9, typically
 227    *          "seqid start end"
 228    * @param set
 229    *          a map with all parsed column 9 attributes
 230    * @return
 231    */
 232   @SuppressWarnings("unused")
 233   protected String findTargetId(String target, Map<String, List<String>> set)
 234   {
 235     return target;
 236   }
 237
 238   /**
 239    * Processes one GFF 'protein_match'; fields of interest are
 240    * <ul>
 241    * <li>feature group - the database reporting a match e.g. Pfam</li>
 242    * <li>Name - the matched entry's accession id in the database</li>
 243    * <li>ID - a sequence identifier for the matched region (which may be
 244    * appended as FASTA in the GFF file)</li>
 245    * </ul>
 246    *
 247    * @param set
 248    *          parsed GFF column 9 key/value(s)
 249    * @param seq
 250    *          the sequence the GFF feature is on
 251    * @param gffColumns
 252    *          the sequence feature holding GFF data
 253    * @param align
 254    *          the alignment the sequence belongs to, where any new mappings
 255    *          should be added
 256    * @param newseqs
 257    *          a list of new 'virtual sequences' generated while parsing GFF
 258    * @param relaxedIdMatching
 259    *          if true allow fuzzy search for a matching target sequence
 260    * @return the (real or virtual) sequence(s) mapped to by this match
 261    * @throws IOException
 262    */
 263   protected SequenceFeature processProteinMatch(
 264           Map<String, List<String>> set, SequenceI seq,
 265           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 266           boolean relaxedIdMatching)
 267   {
 268     // This is currently tailored to InterProScan GFF output:
 269     // ID holds the ID of the matched sequence, Target references the
 270     // query sequence; this looks wrong, as ID should just be the GFF internal
 271     // ID of the GFF feature, while Target would normally reference the matched
 272     // sequence.
 273     // TODO refactor as needed if other protein-protein GFF varies
 274
 275     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 276
 277     /*
 278      * locate the mapped sequence in the alignment, or as a
 279      * (new or existing) virtual sequence in the newseqs list
 280      */
 281     List<String> targets = set.get(TARGET);
 282     if (targets != null)
 283     {
 284       for (String target : targets)
 285       {
 286
 287         SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
 288         newseqs, relaxedIdMatching);
 289         SequenceI mappedSequence = mappedSequence1;
 290         if (mappedSequence == null)
 291         {
 292           continue;
 293         }
 294
 295         /*
 296          * give the mapped sequence a copy of the sequence feature, with
 297          * start/end range adjusted
 298          */
 299         SequenceFeature sf2 = new SequenceFeature(sf);
 300         sf2.setBegin(1);
 301         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 302         sf2.setEnd(sequenceFeatureLength);
 303         mappedSequence.addSequenceFeature(sf2);
 304
 305         /*
 306          * add a property to the mapped sequence so that it can eventually be
 307          * renamed with its qualified accession id; renaming has to wait until
 308          * all sequence reference resolution is complete
 309          */
 310         String accessionId = StringUtils.listToDelimitedString(
 311                 set.get(NAME), ",");
 312         if (accessionId.length() > 0)
 313         {
 314           String database = sf.getType(); // TODO InterProScan only??
 315           String qualifiedAccId = database + "|" + accessionId;
 316           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 317         }
 318
 319         /*
 320          * get any existing mapping for these sequences (or start one),
 321          * and add this mapped range
 322          */
 323         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 324         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 325         int[] to = new int[] { 1, sequenceFeatureLength };
 326         MapList mapping = new MapList(from, to, 1, 1);
 327
 328         alco.addMap(seq, mappedSequence, mapping);
 329         align.addCodonFrame(alco);
 330       }
 331     }
 332
 333     return sf;
 334   }
 335
 336   /**
 337    * Return '=' as the name-value separator used in column 9 attributes.
 338    */
 339   @Override
 340   protected char getNameValueSeparator()
 341   {
 342     return '=';
 343   }
 344
 345   /**
 346    * Modifies the default SequenceFeature in order to set the Target sequence id
 347    * as the description
 348    */
 349   @Override
 350   protected SequenceFeature buildSequenceFeature(String[] gff,
 351           Map<String, List<String>> attributes)
 352   {
 353     SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
 354     String target = (String) sf.getValue(TARGET);
 355     if (target != null)
 356     {
 357       sf.setDescription(target.split(" ")[0]);
 358     }
 359     return sf;
 360   }
 361 }