src/jalview/io/gff/Gff3Helper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.ext.ensembl.EnsemblSeqProxy;
   9 import jalview.util.MapList;
  10 import jalview.util.StringUtils;
  11
  12 import java.io.IOException;
  13 import java.util.List;
  14 import java.util.Map;
  15
  16 /**
  17  * Base class with generic / common functionality for processing GFF3 data.
  18  * Override this as required for any specialisations resulting from
  19  * peculiarities of GFF3 generated by particular tools.
  20  */
  21 public class Gff3Helper extends GffHelperBase
  22 {
  23   protected static final String TARGET = "Target";
  24
  25   protected static final String ID = "ID";
  26
  27   private static final String NAME = "Name";
  28
  29   /**
  30    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  31    * separate multiple values for a name
  32    *
  33    * @param text
  34    * @return
  35    */
  36   public static Map<String, List<String>> parseNameValuePairs(String text)
  37   {
  38     return parseNameValuePairs(text, ";", '=', ",");
  39   }
  40
  41   /**
  42    * Process one GFF feature line (as modelled by SequenceFeature)
  43    *
  44    * @param seq
  45    *          the sequence with which this feature is associated
  46    * @param sf
  47    *          the sequence feature with ATTRIBUTES property containing any
  48    *          additional attributes
  49    * @param align
  50    *          the alignment we are adding GFF to
  51    * @param newseqs
  52    *          any new sequences referenced by the GFF
  53    * @param relaxedIdMatching
  54    *          if true, match word tokens in sequence names
  55    * @return true if the sequence feature should be added to the sequence, else
  56    *         false (i.e. it has been processed in another way e.g. to generate a
  57    *         mapping)
  58    * @throws IOException
  59    */
  60   @Override
  61   public SequenceFeature processGff(SequenceI seq, String[] gff,
  62           AlignmentI align, List<SequenceI> newseqs,
  63           boolean relaxedIdMatching) throws IOException
  64   {
  65     SequenceFeature sf = null;
  66
  67     if (gff.length == 9)
  68     {
  69       String soTerm = gff[TYPE_COL];
  70       String atts = gff[ATTRIBUTES_COL];
  71       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  72
  73       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  74       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  75       {
  76         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  77                 relaxedIdMatching);
  78       }
  79       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
  80       {
  81         sf = processNucleotideMatch(attributes, seq, gff, align,
  82                 newseqs, relaxedIdMatching);
  83       }
  84       else
  85       {
  86         sf = buildSequenceFeature(gff, attributes);
  87       }
  88     }
  89     else
  90     {
  91       /*
  92        * fall back on generating a sequence feature with no special processing
  93        */
  94       sf = buildSequenceFeature(gff, null);
  95     }
  96
  97     return sf;
  98   }
  99
 100   /**
 101    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 102    *
 103    * @param attributes
 104    *          parsed GFF column 9 key/value(s)
 105    * @param seq
 106    *          the sequence the GFF feature is on
 107    * @param gffColumns
 108    *          the GFF column data
 109    * @param align
 110    *          the alignment the sequence belongs to, where any new mappings
 111    *          should be added
 112    * @param newseqs
 113    *          a list of new 'virtual sequences' generated while parsing GFF
 114    * @param relaxedIdMatching
 115    *          if true allow fuzzy search for a matching target sequence
 116    * @return a sequence feature, if one should be added to the sequence, else
 117    *         null
 118    * @throws IOException
 119    */
 120   protected SequenceFeature processNucleotideMatch(
 121           Map<String, List<String>> attributes, SequenceI seq,
 122           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 123           boolean relaxedIdMatching)
 124           throws IOException
 125   {
 126     String strand = gffColumns[STRAND_COL];
 127
 128     /*
 129      * (For now) we don't process mappings from reverse complement ; to do
 130      * this would require (a) creating a virtual sequence placeholder for
 131      * the reverse complement (b) resolving the sequence by its id from some
 132      * source (GFF ##FASTA or other) (c) creating the reverse complement
 133      * sequence (d) updating the mapping to be to the reverse complement
 134      */
 135     if ("-".equals(strand))
 136     {
 137       System.err
 138               .println("Skipping mapping from reverse complement as not yet supported");
 139       return null;
 140     }
 141
 142     List<String> targets = attributes.get(TARGET);
 143     if (targets == null)
 144     {
 145       System.err.println("'Target' missing in GFF");
 146       return null;
 147     }
 148
 149     /*
 150      * Typically we only expect one Target per GFF line, but this can handle
 151      * multiple matches, to the same or different sequences (e.g. dna variants)
 152      */
 153     for (String target : targets)
 154     {
 155       /*
 156        * Process "seqid start end [strand]"
 157        */
 158       String[] tokens = target.split(" ");
 159       if (tokens.length < 3)
 160       {
 161         System.err.println("Incomplete Target: " + target);
 162         continue;
 163       }
 164
 165       /*
 166        * Locate the mapped sequence in the alignment, or as a
 167        * (new or existing) virtual sequence in the newseqs list
 168        */
 169       String targetId = findTargetId(tokens[0], attributes);
 170       SequenceI mappedSequence1 = findSequence(targetId, align,
 171       newseqs, relaxedIdMatching);
 172       SequenceI mappedSequence = mappedSequence1;
 173       if (mappedSequence == null)
 174       {
 175         continue;
 176       }
 177
 178       /*
 179        * get any existing mapping for these sequences (or start one),
 180        * and add this mapped range
 181        */
 182       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 183
 184       try
 185       {
 186         int toStart = Integer.parseInt(tokens[1]);
 187         int toEnd = Integer.parseInt(tokens[2]);
 188         if (tokens.length > 3 && "-".equals(tokens[3]))
 189         {
 190           // mapping to reverse strand - swap start/end
 191           int temp = toStart;
 192           toStart = toEnd;
 193           toEnd = temp;
 194         }
 195
 196         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 197         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 198         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 199                 toStart, toEnd,
 200                 MappingType.NucleotideToNucleotide);
 201
 202         if (mapping != null)
 203         {
 204           acf.addMap(seq, mappedSequence, mapping);
 205           align.addCodonFrame(acf);
 206         }
 207       } catch (NumberFormatException nfe)
 208       {
 209         System.err.println("Invalid start or end in Target " + target);
 210       }
 211     }
 212
 213     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 214     return sf;
 215   }
 216
 217   /**
 218    * Returns the target sequence id extracted from the GFF name/value pairs.
 219    * Default (standard behaviour) is the first token for "Target". This may be
 220    * overridden where tools report this in a non-standard way.
 221    *
 222    * @param target
 223    *          first token of a "Target" value from GFF column 9, typically
 224    *          "seqid start end"
 225    * @param set
 226    *          a map with all parsed column 9 attributes
 227    * @return
 228    */
 229   @SuppressWarnings("unused")
 230   protected String findTargetId(String target, Map<String, List<String>> set)
 231   {
 232     return target;
 233   }
 234
 235   /**
 236    * Processes one GFF 'protein_match'; fields of interest are
 237    * <ul>
 238    * <li>feature group - the database reporting a match e.g. Pfam</li>
 239    * <li>Name - the matched entry's accession id in the database</li>
 240    * <li>ID - a sequence identifier for the matched region (which may be
 241    * appended as FASTA in the GFF file)</li>
 242    * </ul>
 243    *
 244    * @param set
 245    *          parsed GFF column 9 key/value(s)
 246    * @param seq
 247    *          the sequence the GFF feature is on
 248    * @param gffColumns
 249    *          the sequence feature holding GFF data
 250    * @param align
 251    *          the alignment the sequence belongs to, where any new mappings
 252    *          should be added
 253    * @param newseqs
 254    *          a list of new 'virtual sequences' generated while parsing GFF
 255    * @param relaxedIdMatching
 256    *          if true allow fuzzy search for a matching target sequence
 257    * @return the (real or virtual) sequence(s) mapped to by this match
 258    * @throws IOException
 259    */
 260   protected SequenceFeature processProteinMatch(
 261           Map<String, List<String>> set, SequenceI seq,
 262           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 263           boolean relaxedIdMatching)
 264   {
 265     // This is currently tailored to InterProScan GFF output:
 266     // ID holds the ID of the matched sequence, Target references the
 267     // query sequence; this looks wrong, as ID should just be the GFF internal
 268     // ID of the GFF feature, while Target would normally reference the matched
 269     // sequence.
 270     // TODO refactor as needed if other protein-protein GFF varies
 271
 272     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 273
 274     /*
 275      * locate the mapped sequence in the alignment, or as a
 276      * (new or existing) virtual sequence in the newseqs list
 277      */
 278     List<String> targets = set.get(TARGET);
 279     if (targets != null)
 280     {
 281       for (String target : targets)
 282       {
 283
 284         SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
 285         newseqs, relaxedIdMatching);
 286         SequenceI mappedSequence = mappedSequence1;
 287         if (mappedSequence == null)
 288         {
 289           continue;
 290         }
 291
 292         /*
 293          * give the mapped sequence a copy of the sequence feature, with
 294          * start/end range adjusted
 295          */
 296         SequenceFeature sf2 = new SequenceFeature(sf);
 297         sf2.setBegin(1);
 298         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 299         sf2.setEnd(sequenceFeatureLength);
 300         mappedSequence.addSequenceFeature(sf2);
 301
 302         /*
 303          * add a property to the mapped sequence so that it can eventually be
 304          * renamed with its qualified accession id; renaming has to wait until
 305          * all sequence reference resolution is complete
 306          */
 307         String accessionId = StringUtils.listToDelimitedString(
 308                 set.get(NAME), ",");
 309         if (accessionId.length() > 0)
 310         {
 311           String database = sf.getType(); // TODO InterProScan only??
 312           String qualifiedAccId = database + "|" + accessionId;
 313           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 314         }
 315
 316         /*
 317          * get any existing mapping for these sequences (or start one),
 318          * and add this mapped range
 319          */
 320         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 321         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 322         int[] to = new int[] { 1, sequenceFeatureLength };
 323         MapList mapping = new MapList(from, to, 1, 1);
 324
 325         alco.addMap(seq, mappedSequence, mapping);
 326         align.addCodonFrame(alco);
 327       }
 328     }
 329
 330     return sf;
 331   }
 332
 333   /**
 334    * Return '=' as the name-value separator used in column 9 attributes.
 335    */
 336   @Override
 337   protected char getNameValueSeparator()
 338   {
 339     return '=';
 340   }
 341
 342   /**
 343    * Modifies the default SequenceFeature in order to set the Target sequence id
 344    * as the description
 345    */
 346   @Override
 347   protected SequenceFeature buildSequenceFeature(String[] gff,
 348           Map<String, List<String>> attributes)
 349   {
 350     SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
 351     String desc = getDescription(sf, attributes);
 352     if (desc != null)
 353     {
 354       sf.setDescription(desc);
 355     }
 356     return sf;
 357   }
 358
 359   /**
 360    * Apply heuristic rules to try to get the most useful feature description
 361    *
 362    * @param sf
 363    * @param attributes
 364    * @return
 365    */
 366   protected String getDescription(SequenceFeature sf,
 367           Map<String, List<String>> attributes)
 368   {
 369     String desc = null;
 370     String target = (String) sf.getValue(TARGET);
 371     if (target != null)
 372     {
 373       desc = target.split(" ")[0];
 374     }
 375
 376     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 377     String type = sf.getType();
 378     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 379     {
 380       /*
 381        * Ensembl returns dna variants as 'alleles'
 382        */
 383       desc = StringUtils.listToDelimitedString(
 384               attributes.get("alleles"), ",");
 385     }
 386
 387     /*
 388      * extract 'Name' for a transcript (to show gene name)
 389      * or an exon (so 'colour by label' shows exon boundaries)
 390      */
 391     if (EnsemblSeqProxy.isTranscript(type)
 392             || so.isA(type, SequenceOntology.EXON))
 393     {
 394       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 395     }
 396     return desc;
 397   }
 398 }