src/jalview/io/gff/Gff3Helper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.util.MapList;
   9 import jalview.util.StringUtils;
  10
  11 import java.io.IOException;
  12 import java.util.List;
  13 import java.util.Map;
  14
  15 /**
  16  * Base class with generic / common functionality for processing GFF3 data.
  17  * Override this as required for any specialisations resulting from
  18  * peculiarities of GFF3 generated by particular tools.
  19  */
  20 public class Gff3Helper extends GffHelperBase
  21 {
  22   protected static final String TARGET = "Target";
  23
  24   protected static final String ID = "ID";
  25
  26   private static final String NAME = "Name";
  27
  28   /**
  29    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  30    * separate multiple values for a name
  31    *
  32    * @param text
  33    * @return
  34    */
  35   public static Map<String, List<String>> parseNameValuePairs(String text)
  36   {
  37     return parseNameValuePairs(text, ";", '=', ",");
  38   }
  39
  40   /**
  41    * Process one GFF feature line (as modelled by SequenceFeature)
  42    *
  43    * @param seq
  44    *          the sequence with which this feature is associated
  45    * @param sf
  46    *          the sequence feature with ATTRIBUTES property containing any
  47    *          additional attributes
  48    * @param align
  49    *          the alignment we are adding GFF to
  50    * @param newseqs
  51    *          any new sequences referenced by the GFF
  52    * @param relaxedIdMatching
  53    *          if true, match word tokens in sequence names
  54    * @return true if the sequence feature should be added to the sequence, else
  55    *         false (i.e. it has been processed in another way e.g. to generate a
  56    *         mapping)
  57    * @throws IOException
  58    */
  59   @Override
  60   public SequenceFeature processGff(SequenceI seq, String[] gff,
  61           AlignmentI align, List<SequenceI> newseqs,
  62           boolean relaxedIdMatching) throws IOException
  63   {
  64     SequenceFeature sf = null;
  65
  66     if (gff.length == 9)
  67     {
  68       String soTerm = gff[TYPE_COL];
  69       String atts = gff[ATTRIBUTES_COL];
  70       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  71
  72       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  73       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  74       {
  75         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  76                 relaxedIdMatching);
  77       }
  78       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
  79       {
  80         sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
  81                 relaxedIdMatching);
  82       }
  83       else
  84       {
  85         sf = buildSequenceFeature(gff, attributes);
  86       }
  87     }
  88     else
  89     {
  90       /*
  91        * fall back on generating a sequence feature with no special processing
  92        */
  93       sf = buildSequenceFeature(gff, null);
  94     }
  95
  96     return sf;
  97   }
  98
  99   /**
 100    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 101    *
 102    * @param attributes
 103    *          parsed GFF column 9 key/value(s)
 104    * @param seq
 105    *          the sequence the GFF feature is on
 106    * @param gffColumns
 107    *          the GFF column data
 108    * @param align
 109    *          the alignment the sequence belongs to, where any new mappings
 110    *          should be added
 111    * @param newseqs
 112    *          a list of new 'virtual sequences' generated while parsing GFF
 113    * @param relaxedIdMatching
 114    *          if true allow fuzzy search for a matching target sequence
 115    * @return a sequence feature, if one should be added to the sequence, else
 116    *         null
 117    * @throws IOException
 118    */
 119   protected SequenceFeature processNucleotideMatch(
 120           Map<String, List<String>> attributes, SequenceI seq,
 121           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 122           boolean relaxedIdMatching) throws IOException
 123   {
 124     String strand = gffColumns[STRAND_COL];
 125
 126     /*
 127      * (For now) we don't process mappings from reverse complement ; to do
 128      * this would require (a) creating a virtual sequence placeholder for
 129      * the reverse complement (b) resolving the sequence by its id from some
 130      * source (GFF ##FASTA or other) (c) creating the reverse complement
 131      * sequence (d) updating the mapping to be to the reverse complement
 132      */
 133     if ("-".equals(strand))
 134     {
 135       System.err
 136               .println("Skipping mapping from reverse complement as not yet supported");
 137       return null;
 138     }
 139
 140     List<String> targets = attributes.get(TARGET);
 141     if (targets == null)
 142     {
 143       System.err.println("'Target' missing in GFF");
 144       return null;
 145     }
 146
 147     /*
 148      * Typically we only expect one Target per GFF line, but this can handle
 149      * multiple matches, to the same or different sequences (e.g. dna variants)
 150      */
 151     for (String target : targets)
 152     {
 153       /*
 154        * Process "seqid start end [strand]"
 155        */
 156       String[] tokens = target.split(" ");
 157       if (tokens.length < 3)
 158       {
 159         System.err.println("Incomplete Target: " + target);
 160         continue;
 161       }
 162
 163       /*
 164        * Locate the mapped sequence in the alignment, or as a
 165        * (new or existing) virtual sequence in the newseqs list
 166        */
 167       String targetId = findTargetId(tokens[0], attributes);
 168       SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
 169               relaxedIdMatching);
 170       SequenceI mappedSequence = mappedSequence1;
 171       if (mappedSequence == null)
 172       {
 173         continue;
 174       }
 175
 176       /*
 177        * get any existing mapping for these sequences (or start one),
 178        * and add this mapped range
 179        */
 180       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 181
 182       try
 183       {
 184         int toStart = Integer.parseInt(tokens[1]);
 185         int toEnd = Integer.parseInt(tokens[2]);
 186         if (tokens.length > 3 && "-".equals(tokens[3]))
 187         {
 188           // mapping to reverse strand - swap start/end
 189           int temp = toStart;
 190           toStart = toEnd;
 191           toEnd = temp;
 192         }
 193
 194         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 195         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 196         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 197                 toStart, toEnd, MappingType.NucleotideToNucleotide);
 198
 199         if (mapping != null)
 200         {
 201           acf.addMap(seq, mappedSequence, mapping);
 202           align.addCodonFrame(acf);
 203         }
 204       } catch (NumberFormatException nfe)
 205       {
 206         System.err.println("Invalid start or end in Target " + target);
 207       }
 208     }
 209
 210     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 211     return sf;
 212   }
 213
 214   /**
 215    * Returns the target sequence id extracted from the GFF name/value pairs.
 216    * Default (standard behaviour) is the first token for "Target". This may be
 217    * overridden where tools report this in a non-standard way.
 218    *
 219    * @param target
 220    *          first token of a "Target" value from GFF column 9, typically
 221    *          "seqid start end"
 222    * @param set
 223    *          a map with all parsed column 9 attributes
 224    * @return
 225    */
 226   @SuppressWarnings("unused")
 227   protected String findTargetId(String target, Map<String, List<String>> set)
 228   {
 229     return target;
 230   }
 231
 232   /**
 233    * Processes one GFF 'protein_match'; fields of interest are
 234    * <ul>
 235    * <li>feature group - the database reporting a match e.g. Pfam</li>
 236    * <li>Name - the matched entry's accession id in the database</li>
 237    * <li>ID - a sequence identifier for the matched region (which may be
 238    * appended as FASTA in the GFF file)</li>
 239    * </ul>
 240    *
 241    * @param set
 242    *          parsed GFF column 9 key/value(s)
 243    * @param seq
 244    *          the sequence the GFF feature is on
 245    * @param gffColumns
 246    *          the sequence feature holding GFF data
 247    * @param align
 248    *          the alignment the sequence belongs to, where any new mappings
 249    *          should be added
 250    * @param newseqs
 251    *          a list of new 'virtual sequences' generated while parsing GFF
 252    * @param relaxedIdMatching
 253    *          if true allow fuzzy search for a matching target sequence
 254    * @return the (real or virtual) sequence(s) mapped to by this match
 255    * @throws IOException
 256    */
 257   protected SequenceFeature processProteinMatch(
 258           Map<String, List<String>> set, SequenceI seq,
 259           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 260           boolean relaxedIdMatching)
 261   {
 262     // This is currently tailored to InterProScan GFF output:
 263     // ID holds the ID of the matched sequence, Target references the
 264     // query sequence; this looks wrong, as ID should just be the GFF internal
 265     // ID of the GFF feature, while Target would normally reference the matched
 266     // sequence.
 267     // TODO refactor as needed if other protein-protein GFF varies
 268
 269     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 270
 271     /*
 272      * locate the mapped sequence in the alignment, or as a
 273      * (new or existing) virtual sequence in the newseqs list
 274      */
 275     List<String> targets = set.get(TARGET);
 276     if (targets != null)
 277     {
 278       for (String target : targets)
 279       {
 280
 281         SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
 282                 align, newseqs, relaxedIdMatching);
 283         SequenceI mappedSequence = mappedSequence1;
 284         if (mappedSequence == null)
 285         {
 286           continue;
 287         }
 288
 289         /*
 290          * give the mapped sequence a copy of the sequence feature, with
 291          * start/end range adjusted
 292          */
 293         SequenceFeature sf2 = new SequenceFeature(sf);
 294         sf2.setBegin(1);
 295         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 296         sf2.setEnd(sequenceFeatureLength);
 297         mappedSequence.addSequenceFeature(sf2);
 298
 299         /*
 300          * add a property to the mapped sequence so that it can eventually be
 301          * renamed with its qualified accession id; renaming has to wait until
 302          * all sequence reference resolution is complete
 303          */
 304         String accessionId = StringUtils.listToDelimitedString(
 305                 set.get(NAME), ",");
 306         if (accessionId.length() > 0)
 307         {
 308           String database = sf.getType(); // TODO InterProScan only??
 309           String qualifiedAccId = database + "|" + accessionId;
 310           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 311         }
 312
 313         /*
 314          * get any existing mapping for these sequences (or start one),
 315          * and add this mapped range
 316          */
 317         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 318         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 319         int[] to = new int[] { 1, sequenceFeatureLength };
 320         MapList mapping = new MapList(from, to, 1, 1);
 321
 322         alco.addMap(seq, mappedSequence, mapping);
 323         align.addCodonFrame(alco);
 324       }
 325     }
 326
 327     return sf;
 328   }
 329
 330   /**
 331    * Return '=' as the name-value separator used in column 9 attributes.
 332    */
 333   @Override
 334   protected char getNameValueSeparator()
 335   {
 336     return '=';
 337   }
 338
 339   /**
 340    * Modifies the default SequenceFeature in order to set the Target sequence id
 341    * as the description
 342    */
 343   @Override
 344   protected SequenceFeature buildSequenceFeature(String[] gff,
 345           Map<String, List<String>> attributes)
 346   {
 347     SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
 348     String desc = getDescription(sf, attributes);
 349     if (desc != null)
 350     {
 351       sf.setDescription(desc);
 352     }
 353     return sf;
 354   }
 355
 356   /**
 357    * Apply heuristic rules to try to get the most useful feature description
 358    *
 359    * @param sf
 360    * @param attributes
 361    * @return
 362    */
 363   protected String getDescription(SequenceFeature sf,
 364           Map<String, List<String>> attributes)
 365   {
 366     String desc = null;
 367     String target = (String) sf.getValue(TARGET);
 368     if (target != null)
 369     {
 370       desc = target.split(" ")[0];
 371     }
 372
 373     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 374     String type = sf.getType();
 375     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 376     {
 377       /*
 378        * Ensembl returns dna variants as 'alleles'
 379        */
 380       desc = StringUtils.listToDelimitedString(attributes.get("alleles"),
 381               ",");
 382     }
 383
 384     /*
 385      * extract 'Name' for a transcript (to show gene name)
 386      * or an exon (so 'colour by label' shows exon boundaries)
 387      */
 388     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 389             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 390             || so.isA(type, SequenceOntologyI.EXON))
 391     {
 392       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 393     }
 394
 395     /*
 396      * if the above fails, try ID
 397      */
 398     if (desc == null)
 399     {
 400       desc = (String) sf.getValue(ID);
 401     }
 402
 403     return desc;
 404   }
 405 }