src/jalview/io/gff/Gff3Helper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.util.MapList;
   9 import jalview.util.StringUtils;
  10
  11 import java.io.IOException;
  12 import java.util.List;
  13 import java.util.Map;
  14
  15 /**
  16  * Base class with generic / common functionality for processing GFF3 data.
  17  * Override this as required for any specialisations resulting from
  18  * peculiarities of GFF3 generated by particular tools.
  19  */
  20 public class Gff3Helper extends GffHelperBase
  21 {
  22   protected static final String TARGET = "Target";
  23
  24   protected static final String ID = "ID";
  25
  26   private static final String NAME = "Name";
  27
  28   /**
  29    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  30    * separate multiple values for a name
  31    *
  32    * @param text
  33    * @return
  34    */
  35   public static Map<String, List<String>> parseNameValuePairs(String text)
  36   {
  37     return parseNameValuePairs(text, ";", '=', ",");
  38   }
  39
  40   /**
  41    * Process one GFF feature line (as modelled by SequenceFeature)
  42    *
  43    * @param seq
  44    *          the sequence with which this feature is associated
  45    * @param sf
  46    *          the sequence feature with ATTRIBUTES property containing any
  47    *          additional attributes
  48    * @param align
  49    *          the alignment we are adding GFF to
  50    * @param newseqs
  51    *          any new sequences referenced by the GFF
  52    * @param relaxedIdMatching
  53    *          if true, match word tokens in sequence names
  54    * @return true if the sequence feature should be added to the sequence, else
  55    *         false (i.e. it has been processed in another way e.g. to generate a
  56    *         mapping)
  57    * @throws IOException
  58    */
  59   @Override
  60   public SequenceFeature processGff(SequenceI seq, String[] gff,
  61           AlignmentI align, List<SequenceI> newseqs,
  62           boolean relaxedIdMatching) throws IOException
  63   {
  64     SequenceFeature sf = null;
  65
  66     if (gff.length == 9)
  67     {
  68       String soTerm = gff[TYPE_COL];
  69       String atts = gff[ATTRIBUTES_COL];
  70       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  71
  72       if (SequenceOntology.getInstance().isProteinMatch(soTerm))
  73       {
  74         sf = processProteinMatch(attributes, seq, gff, align,
  75                 newseqs, relaxedIdMatching);
  76       }
  77       else if (SequenceOntology.getInstance().isNucleotideMatch(soTerm))
  78       {
  79         sf = processNucleotideMatch(attributes, seq, gff, align,
  80                 newseqs, relaxedIdMatching);
  81       }
  82       else
  83       {
  84         sf = buildSequenceFeature(gff, attributes);
  85       }
  86     }
  87     else
  88     {
  89       /*
  90        * fall back on generating a sequence feature with no special processing
  91        */
  92       sf = buildSequenceFeature(gff, null);
  93     }
  94
  95     return sf;
  96   }
  97
  98   /**
  99    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 100    *
 101    * @param attributes
 102    *          parsed GFF column 9 key/value(s)
 103    * @param seq
 104    *          the sequence the GFF feature is on
 105    * @param gffColumns
 106    *          the GFF column data
 107    * @param align
 108    *          the alignment the sequence belongs to, where any new mappings
 109    *          should be added
 110    * @param newseqs
 111    *          a list of new 'virtual sequences' generated while parsing GFF
 112    * @param relaxedIdMatching
 113    *          if true allow fuzzy search for a matching target sequence
 114    * @return a sequence feature, if one should be added to the sequence, else
 115    *         null
 116    * @throws IOException
 117    */
 118   protected SequenceFeature processNucleotideMatch(
 119           Map<String, List<String>> attributes, SequenceI seq,
 120           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 121           boolean relaxedIdMatching)
 122           throws IOException
 123   {
 124     String strand = gffColumns[STRAND_COL];
 125
 126     /*
 127      * (For now) we don't process mappings from reverse complement ; to do
 128      * this would require (a) creating a virtual sequence placeholder for
 129      * the reverse complement (b) resolving the sequence by its id from some
 130      * source (GFF ##FASTA or other) (c) creating the reverse complement
 131      * sequence (d) updating the mapping to be to the reverse complement
 132      */
 133     if ("-".equals(strand))
 134     {
 135       System.err
 136               .println("Skipping mapping from reverse complement as not yet supported");
 137       return null;
 138     }
 139
 140     List<String> targets = attributes.get(TARGET);
 141     if (targets == null)
 142     {
 143       System.err.println("'Target' missing in GFF");
 144       return null;
 145     }
 146
 147     /*
 148      * Typically we only expect one Target per GFF line, but this can handle
 149      * multiple matches, to the same or different sequences (e.g. dna variants)
 150      */
 151     for (String target : targets)
 152     {
 153       /*
 154        * Process "seqid start end [strand]"
 155        */
 156       String[] tokens = target.split(" ");
 157       if (tokens.length < 3)
 158       {
 159         System.err.println("Incomplete Target: " + target);
 160         continue;
 161       }
 162
 163       /*
 164        * Locate the mapped sequence in the alignment, or as a
 165        * (new or existing) virtual sequence in the newseqs list
 166        */
 167       String targetId = findTargetId(tokens[0], attributes);
 168       SequenceI mappedSequence1 = findSequence(targetId, align,
 169       newseqs, relaxedIdMatching);
 170       SequenceI mappedSequence = mappedSequence1;
 171       if (mappedSequence == null)
 172       {
 173         continue;
 174       }
 175
 176       /*
 177        * get any existing mapping for these sequences (or start one),
 178        * and add this mapped range
 179        */
 180       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 181
 182       try
 183       {
 184         int toStart = Integer.parseInt(tokens[1]);
 185         int toEnd = Integer.parseInt(tokens[2]);
 186         if (tokens.length > 3 && "-".equals(tokens[3]))
 187         {
 188           // mapping to reverse strand - swap start/end
 189           int temp = toStart;
 190           toStart = toEnd;
 191           toEnd = temp;
 192         }
 193
 194         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 195         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 196         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 197                 toStart, toEnd,
 198                 MappingType.NucleotideToNucleotide);
 199
 200         if (mapping != null)
 201         {
 202           acf.addMap(seq, mappedSequence, mapping);
 203           align.addCodonFrame(acf);
 204         }
 205       } catch (NumberFormatException nfe)
 206       {
 207         System.err.println("Invalid start or end in Target " + target);
 208       }
 209     }
 210
 211     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 212     return sf;
 213   }
 214
 215   /**
 216    * Returns the target sequence id extracted from the GFF name/value pairs.
 217    * Default (standard behaviour) is the first token for "Target". This may be
 218    * overridden where tools report this in a non-standard way.
 219    *
 220    * @param target
 221    *          first token of a "Target" value from GFF column 9, typically
 222    *          "seqid start end"
 223    * @param set
 224    *          a map with all parsed column 9 attributes
 225    * @return
 226    */
 227   @SuppressWarnings("unused")
 228   protected String findTargetId(String target, Map<String, List<String>> set)
 229   {
 230     return target;
 231   }
 232
 233   /**
 234    * Processes one GFF 'protein_match'; fields of interest are
 235    * <ul>
 236    * <li>feature group - the database reporting a match e.g. Pfam</li>
 237    * <li>Name - the matched entry's accession id in the database</li>
 238    * <li>ID - a sequence identifier for the matched region (which may be
 239    * appended as FASTA in the GFF file)</li>
 240    * </ul>
 241    *
 242    * @param set
 243    *          parsed GFF column 9 key/value(s)
 244    * @param seq
 245    *          the sequence the GFF feature is on
 246    * @param gffColumns
 247    *          the sequence feature holding GFF data
 248    * @param align
 249    *          the alignment the sequence belongs to, where any new mappings
 250    *          should be added
 251    * @param newseqs
 252    *          a list of new 'virtual sequences' generated while parsing GFF
 253    * @param relaxedIdMatching
 254    *          if true allow fuzzy search for a matching target sequence
 255    * @return the (real or virtual) sequence(s) mapped to by this match
 256    * @throws IOException
 257    */
 258   protected SequenceFeature processProteinMatch(
 259           Map<String, List<String>> set, SequenceI seq,
 260           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 261           boolean relaxedIdMatching)
 262   {
 263     // This is currently tailored to InterProScan GFF output:
 264     // ID holds the ID of the matched sequence, Target references the
 265     // query sequence; this looks wrong, as ID should just be the GFF internal
 266     // ID of the GFF feature, while Target would normally reference the matched
 267     // sequence.
 268     // TODO refactor as needed if other protein-protein GFF varies
 269
 270     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 271
 272     /*
 273      * locate the mapped sequence in the alignment, or as a
 274      * (new or existing) virtual sequence in the newseqs list
 275      */
 276     List<String> targets = set.get(TARGET);
 277     if (targets != null)
 278     {
 279       for (String target : targets)
 280       {
 281
 282         SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
 283         newseqs, relaxedIdMatching);
 284         SequenceI mappedSequence = mappedSequence1;
 285         if (mappedSequence == null)
 286         {
 287           continue;
 288         }
 289
 290         /*
 291          * give the mapped sequence a copy of the sequence feature, with
 292          * start/end range adjusted
 293          */
 294         SequenceFeature sf2 = new SequenceFeature(sf);
 295         sf2.setBegin(1);
 296         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 297         sf2.setEnd(sequenceFeatureLength);
 298         mappedSequence.addSequenceFeature(sf2);
 299
 300         /*
 301          * add a property to the mapped sequence so that it can eventually be
 302          * renamed with its qualified accession id; renaming has to wait until
 303          * all sequence reference resolution is complete
 304          */
 305         String accessionId = StringUtils.listToDelimitedString(
 306                 set.get(NAME), ",");
 307         if (accessionId.length() > 0)
 308         {
 309           String database = sf.getType(); // TODO InterProScan only??
 310           String qualifiedAccId = database + "|" + accessionId;
 311           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 312         }
 313
 314         /*
 315          * get any existing mapping for these sequences (or start one),
 316          * and add this mapped range
 317          */
 318         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 319         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 320         int[] to = new int[] { 1, sequenceFeatureLength };
 321         MapList mapping = new MapList(from, to, 1, 1);
 322
 323         alco.addMap(seq, mappedSequence, mapping);
 324         align.addCodonFrame(alco);
 325       }
 326     }
 327
 328     return sf;
 329   }
 330
 331   /**
 332    * Return '=' as the name-value separator used in column 9 attributes.
 333    */
 334   @Override
 335   protected char getNameValueSeparator()
 336   {
 337     return '=';
 338   }
 339
 340   /**
 341    * Modifies the default SequenceFeature in order to set the Target sequence id
 342    * as the description
 343    */
 344   @Override
 345   protected SequenceFeature buildSequenceFeature(String[] gff,
 346           Map<String, List<String>> attributes)
 347   {
 348     SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
 349     String desc = getDescription(sf, attributes);
 350     if (desc != null)
 351     {
 352       sf.setDescription(desc);
 353     }
 354     return sf;
 355   }
 356
 357   /**
 358    * Apply heuristic rules to try to get the most useful feature description
 359    *
 360    * @param sf
 361    * @param attributes
 362    * @return
 363    */
 364   protected String getDescription(SequenceFeature sf,
 365           Map<String, List<String>> attributes)
 366   {
 367     String desc = null;
 368     String target = (String) sf.getValue(TARGET);
 369     if (target != null)
 370     {
 371       desc = target.split(" ")[0];
 372     }
 373
 374     if (SequenceOntology.getInstance().isSequenceVariant(sf.getType()))
 375     {
 376       /*
 377        * Ensembl returns dna variants as 'alleles'
 378        */
 379       desc = StringUtils.listToDelimitedString(
 380               attributes.get("alleles"), ",");
 381     }
 382     return desc;
 383   }
 384 }