src/jalview/io/gff/Gff3Helper.java

   1 package jalview.io.gff;
   2
   3 import jalview.datamodel.AlignedCodonFrame;
   4 import jalview.datamodel.AlignmentI;
   5 import jalview.datamodel.MappingType;
   6 import jalview.datamodel.SequenceFeature;
   7 import jalview.datamodel.SequenceI;
   8 import jalview.util.MapList;
   9 import jalview.util.StringUtils;
  10
  11 import java.io.IOException;
  12 import java.util.List;
  13 import java.util.Map;
  14
  15 /**
  16  * Base class with generic / common functionality for processing GFF3 data.
  17  * Override this as required for any specialisations resulting from
  18  * peculiarities of GFF3 generated by particular tools.
  19  */
  20 public class Gff3Helper extends GffHelperBase
  21 {
  22   protected static final String TARGET = "Target";
  23
  24   protected static final String ID = "ID";
  25
  26   private static final String NAME = "Name";
  27
  28   /**
  29    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  30    * separate multiple values for a name
  31    *
  32    * @param text
  33    * @return
  34    */
  35   public static Map<String, List<String>> parseNameValuePairs(String text)
  36   {
  37     return parseNameValuePairs(text, ";", '=', ",");
  38   }
  39
  40   /**
  41    * Process one GFF feature line (as modelled by SequenceFeature)
  42    *
  43    * @param seq
  44    *          the sequence with which this feature is associated
  45    * @param sf
  46    *          the sequence feature with ATTRIBUTES property containing any
  47    *          additional attributes
  48    * @param align
  49    *          the alignment we are adding GFF to
  50    * @param newseqs
  51    *          any new sequences referenced by the GFF
  52    * @param relaxedIdMatching
  53    *          if true, match word tokens in sequence names
  54    * @return true if the sequence feature should be added to the sequence, else
  55    *         false (i.e. it has been processed in another way e.g. to generate a
  56    *         mapping)
  57    * @throws IOException
  58    */
  59   @Override
  60   public SequenceFeature processGff(SequenceI seq, String[] gff,
  61           AlignmentI align, List<SequenceI> newseqs,
  62           boolean relaxedIdMatching) throws IOException
  63   {
  64     SequenceFeature sf = null;
  65
  66     if (gff.length == 9)
  67     {
  68       String soTerm = gff[TYPE_COL];
  69       String atts = gff[ATTRIBUTES_COL];
  70       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  71
  72       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  73       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  74       {
  75         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  76                 relaxedIdMatching);
  77       }
  78       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
  79       {
  80         sf = processNucleotideMatch(attributes, seq, gff, align,
  81                 newseqs, relaxedIdMatching);
  82       }
  83       else
  84       {
  85         sf = buildSequenceFeature(gff, attributes);
  86       }
  87     }
  88     else
  89     {
  90       /*
  91        * fall back on generating a sequence feature with no special processing
  92        */
  93       sf = buildSequenceFeature(gff, null);
  94     }
  95
  96     return sf;
  97   }
  98
  99   /**
 100    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 101    *
 102    * @param attributes
 103    *          parsed GFF column 9 key/value(s)
 104    * @param seq
 105    *          the sequence the GFF feature is on
 106    * @param gffColumns
 107    *          the GFF column data
 108    * @param align
 109    *          the alignment the sequence belongs to, where any new mappings
 110    *          should be added
 111    * @param newseqs
 112    *          a list of new 'virtual sequences' generated while parsing GFF
 113    * @param relaxedIdMatching
 114    *          if true allow fuzzy search for a matching target sequence
 115    * @return a sequence feature, if one should be added to the sequence, else
 116    *         null
 117    * @throws IOException
 118    */
 119   protected SequenceFeature processNucleotideMatch(
 120           Map<String, List<String>> attributes, SequenceI seq,
 121           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 122           boolean relaxedIdMatching)
 123           throws IOException
 124   {
 125     String strand = gffColumns[STRAND_COL];
 126
 127     /*
 128      * (For now) we don't process mappings from reverse complement ; to do
 129      * this would require (a) creating a virtual sequence placeholder for
 130      * the reverse complement (b) resolving the sequence by its id from some
 131      * source (GFF ##FASTA or other) (c) creating the reverse complement
 132      * sequence (d) updating the mapping to be to the reverse complement
 133      */
 134     if ("-".equals(strand))
 135     {
 136       System.err
 137               .println("Skipping mapping from reverse complement as not yet supported");
 138       return null;
 139     }
 140
 141     List<String> targets = attributes.get(TARGET);
 142     if (targets == null)
 143     {
 144       System.err.println("'Target' missing in GFF");
 145       return null;
 146     }
 147
 148     /*
 149      * Typically we only expect one Target per GFF line, but this can handle
 150      * multiple matches, to the same or different sequences (e.g. dna variants)
 151      */
 152     for (String target : targets)
 153     {
 154       /*
 155        * Process "seqid start end [strand]"
 156        */
 157       String[] tokens = target.split(" ");
 158       if (tokens.length < 3)
 159       {
 160         System.err.println("Incomplete Target: " + target);
 161         continue;
 162       }
 163
 164       /*
 165        * Locate the mapped sequence in the alignment, or as a
 166        * (new or existing) virtual sequence in the newseqs list
 167        */
 168       String targetId = findTargetId(tokens[0], attributes);
 169       SequenceI mappedSequence1 = findSequence(targetId, align,
 170       newseqs, relaxedIdMatching);
 171       SequenceI mappedSequence = mappedSequence1;
 172       if (mappedSequence == null)
 173       {
 174         continue;
 175       }
 176
 177       /*
 178        * get any existing mapping for these sequences (or start one),
 179        * and add this mapped range
 180        */
 181       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 182
 183       try
 184       {
 185         int toStart = Integer.parseInt(tokens[1]);
 186         int toEnd = Integer.parseInt(tokens[2]);
 187         if (tokens.length > 3 && "-".equals(tokens[3]))
 188         {
 189           // mapping to reverse strand - swap start/end
 190           int temp = toStart;
 191           toStart = toEnd;
 192           toEnd = temp;
 193         }
 194
 195         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 196         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 197         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 198                 toStart, toEnd,
 199                 MappingType.NucleotideToNucleotide);
 200
 201         if (mapping != null)
 202         {
 203           acf.addMap(seq, mappedSequence, mapping);
 204           align.addCodonFrame(acf);
 205         }
 206       } catch (NumberFormatException nfe)
 207       {
 208         System.err.println("Invalid start or end in Target " + target);
 209       }
 210     }
 211
 212     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 213     return sf;
 214   }
 215
 216   /**
 217    * Returns the target sequence id extracted from the GFF name/value pairs.
 218    * Default (standard behaviour) is the first token for "Target". This may be
 219    * overridden where tools report this in a non-standard way.
 220    *
 221    * @param target
 222    *          first token of a "Target" value from GFF column 9, typically
 223    *          "seqid start end"
 224    * @param set
 225    *          a map with all parsed column 9 attributes
 226    * @return
 227    */
 228   @SuppressWarnings("unused")
 229   protected String findTargetId(String target, Map<String, List<String>> set)
 230   {
 231     return target;
 232   }
 233
 234   /**
 235    * Processes one GFF 'protein_match'; fields of interest are
 236    * <ul>
 237    * <li>feature group - the database reporting a match e.g. Pfam</li>
 238    * <li>Name - the matched entry's accession id in the database</li>
 239    * <li>ID - a sequence identifier for the matched region (which may be
 240    * appended as FASTA in the GFF file)</li>
 241    * </ul>
 242    *
 243    * @param set
 244    *          parsed GFF column 9 key/value(s)
 245    * @param seq
 246    *          the sequence the GFF feature is on
 247    * @param gffColumns
 248    *          the sequence feature holding GFF data
 249    * @param align
 250    *          the alignment the sequence belongs to, where any new mappings
 251    *          should be added
 252    * @param newseqs
 253    *          a list of new 'virtual sequences' generated while parsing GFF
 254    * @param relaxedIdMatching
 255    *          if true allow fuzzy search for a matching target sequence
 256    * @return the (real or virtual) sequence(s) mapped to by this match
 257    * @throws IOException
 258    */
 259   protected SequenceFeature processProteinMatch(
 260           Map<String, List<String>> set, SequenceI seq,
 261           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 262           boolean relaxedIdMatching)
 263   {
 264     // This is currently tailored to InterProScan GFF output:
 265     // ID holds the ID of the matched sequence, Target references the
 266     // query sequence; this looks wrong, as ID should just be the GFF internal
 267     // ID of the GFF feature, while Target would normally reference the matched
 268     // sequence.
 269     // TODO refactor as needed if other protein-protein GFF varies
 270
 271     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 272
 273     /*
 274      * locate the mapped sequence in the alignment, or as a
 275      * (new or existing) virtual sequence in the newseqs list
 276      */
 277     List<String> targets = set.get(TARGET);
 278     if (targets != null)
 279     {
 280       for (String target : targets)
 281       {
 282
 283         SequenceI mappedSequence1 = findSequence(findTargetId(target, set), align,
 284         newseqs, relaxedIdMatching);
 285         SequenceI mappedSequence = mappedSequence1;
 286         if (mappedSequence == null)
 287         {
 288           continue;
 289         }
 290
 291         /*
 292          * give the mapped sequence a copy of the sequence feature, with
 293          * start/end range adjusted
 294          */
 295         SequenceFeature sf2 = new SequenceFeature(sf);
 296         sf2.setBegin(1);
 297         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 298         sf2.setEnd(sequenceFeatureLength);
 299         mappedSequence.addSequenceFeature(sf2);
 300
 301         /*
 302          * add a property to the mapped sequence so that it can eventually be
 303          * renamed with its qualified accession id; renaming has to wait until
 304          * all sequence reference resolution is complete
 305          */
 306         String accessionId = StringUtils.listToDelimitedString(
 307                 set.get(NAME), ",");
 308         if (accessionId.length() > 0)
 309         {
 310           String database = sf.getType(); // TODO InterProScan only??
 311           String qualifiedAccId = database + "|" + accessionId;
 312           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 313         }
 314
 315         /*
 316          * get any existing mapping for these sequences (or start one),
 317          * and add this mapped range
 318          */
 319         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 320         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 321         int[] to = new int[] { 1, sequenceFeatureLength };
 322         MapList mapping = new MapList(from, to, 1, 1);
 323
 324         alco.addMap(seq, mappedSequence, mapping);
 325         align.addCodonFrame(alco);
 326       }
 327     }
 328
 329     return sf;
 330   }
 331
 332   /**
 333    * Return '=' as the name-value separator used in column 9 attributes.
 334    */
 335   @Override
 336   protected char getNameValueSeparator()
 337   {
 338     return '=';
 339   }
 340
 341   /**
 342    * Modifies the default SequenceFeature in order to set the Target sequence id
 343    * as the description
 344    */
 345   @Override
 346   protected SequenceFeature buildSequenceFeature(String[] gff,
 347           Map<String, List<String>> attributes)
 348   {
 349     SequenceFeature sf = super.buildSequenceFeature(gff, attributes);
 350     String desc = getDescription(sf, attributes);
 351     if (desc != null)
 352     {
 353       sf.setDescription(desc);
 354     }
 355     return sf;
 356   }
 357
 358   /**
 359    * Apply heuristic rules to try to get the most useful feature description
 360    *
 361    * @param sf
 362    * @param attributes
 363    * @return
 364    */
 365   protected String getDescription(SequenceFeature sf,
 366           Map<String, List<String>> attributes)
 367   {
 368     String desc = null;
 369     String target = (String) sf.getValue(TARGET);
 370     if (target != null)
 371     {
 372       desc = target.split(" ")[0];
 373     }
 374
 375     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 376     String type = sf.getType();
 377     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 378     {
 379       /*
 380        * Ensembl returns dna variants as 'alleles'
 381        */
 382       desc = StringUtils.listToDelimitedString(
 383               attributes.get("alleles"), ",");
 384     }
 385
 386     /*
 387      * extract 'Name' for a transcript (to show gene name)
 388      * or an exon (so 'colour by label' shows exon boundaries)
 389      */
 390     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 391             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 392             || so.isA(type, SequenceOntologyI.EXON))
 393     {
 394       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 395     }
 396
 397     /*
 398      * if the above fails, try ID
 399      */
 400     if (desc == null)
 401     {
 402       desc = (String) sf.getValue(ID);
 403     }
 404
 405     return desc;
 406   }
 407 }