src/jalview/io/gff/Gff3Helper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29 import jalview.util.StringUtils;
  30
  31 import java.io.IOException;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * Base class with generic / common functionality for processing GFF3 data.
  37  * Override this as required for any specialisations resulting from
  38  * peculiarities of GFF3 generated by particular tools.
  39  */
  40 public class Gff3Helper extends GffHelperBase
  41 {
  42   protected static final String TARGET = "Target";
  43
  44   protected static final String ID = "ID";
  45
  46   private static final String NAME = "Name";
  47
  48   /**
  49    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  50    * separate multiple values for a name
  51    *
  52    * @param text
  53    * @return
  54    */
  55   public static Map<String, List<String>> parseNameValuePairs(String text)
  56   {
  57     return parseNameValuePairs(text, ";", '=', ",");
  58   }
  59
  60   /**
  61    * Process one GFF feature line (as modelled by SequenceFeature)
  62    *
  63    * @param seq
  64    *          the sequence with which this feature is associated
  65    * @param sf
  66    *          the sequence feature with ATTRIBUTES property containing any
  67    *          additional attributes
  68    * @param align
  69    *          the alignment we are adding GFF to
  70    * @param newseqs
  71    *          any new sequences referenced by the GFF
  72    * @param relaxedIdMatching
  73    *          if true, match word tokens in sequence names
  74    * @return true if the sequence feature should be added to the sequence, else
  75    *         false (i.e. it has been processed in another way e.g. to generate a
  76    *         mapping)
  77    * @throws IOException
  78    */
  79   @Override
  80   public SequenceFeature processGff(SequenceI seq, String[] gff,
  81           AlignmentI align, List<SequenceI> newseqs,
  82           boolean relaxedIdMatching) throws IOException
  83   {
  84     SequenceFeature sf = null;
  85
  86     if (gff.length == 9)
  87     {
  88       String soTerm = gff[TYPE_COL];
  89       String atts = gff[ATTRIBUTES_COL];
  90       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  91
  92       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  93       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  94       {
  95         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  96                 relaxedIdMatching);
  97       }
  98       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
  99       {
 100         sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
 101                 relaxedIdMatching);
 102       }
 103       else
 104       {
 105         sf = buildSequenceFeature(gff, attributes);
 106       }
 107     }
 108     else
 109     {
 110       /*
 111        * fall back on generating a sequence feature with no special processing
 112        */
 113       sf = buildSequenceFeature(gff, null);
 114     }
 115
 116     return sf;
 117   }
 118
 119   /**
 120    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 121    *
 122    * @param attributes
 123    *          parsed GFF column 9 key/value(s)
 124    * @param seq
 125    *          the sequence the GFF feature is on
 126    * @param gffColumns
 127    *          the GFF column data
 128    * @param align
 129    *          the alignment the sequence belongs to, where any new mappings
 130    *          should be added
 131    * @param newseqs
 132    *          a list of new 'virtual sequences' generated while parsing GFF
 133    * @param relaxedIdMatching
 134    *          if true allow fuzzy search for a matching target sequence
 135    * @return a sequence feature, if one should be added to the sequence, else
 136    *         null
 137    * @throws IOException
 138    */
 139   protected SequenceFeature processNucleotideMatch(
 140           Map<String, List<String>> attributes, SequenceI seq,
 141           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 142           boolean relaxedIdMatching) throws IOException
 143   {
 144     String strand = gffColumns[STRAND_COL];
 145
 146     /*
 147      * (For now) we don't process mappings from reverse complement ; to do
 148      * this would require (a) creating a virtual sequence placeholder for
 149      * the reverse complement (b) resolving the sequence by its id from some
 150      * source (GFF ##FASTA or other) (c) creating the reverse complement
 151      * sequence (d) updating the mapping to be to the reverse complement
 152      */
 153     if ("-".equals(strand))
 154     {
 155       System.err.println(
 156               "Skipping mapping from reverse complement as not yet supported");
 157       return null;
 158     }
 159
 160     List<String> targets = attributes.get(TARGET);
 161     if (targets == null)
 162     {
 163       System.err.println("'Target' missing in GFF");
 164       return null;
 165     }
 166
 167     /*
 168      * Typically we only expect one Target per GFF line, but this can handle
 169      * multiple matches, to the same or different sequences (e.g. dna variants)
 170      */
 171     for (String target : targets)
 172     {
 173       /*
 174        * Process "seqid start end [strand]"
 175        */
 176       String[] tokens = target.split(" ");
 177       if (tokens.length < 3)
 178       {
 179         System.err.println("Incomplete Target: " + target);
 180         continue;
 181       }
 182
 183       /*
 184        * Locate the mapped sequence in the alignment, or as a
 185        * (new or existing) virtual sequence in the newseqs list
 186        */
 187       String targetId = findTargetId(tokens[0], attributes);
 188       SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
 189               relaxedIdMatching);
 190       SequenceI mappedSequence = mappedSequence1;
 191       if (mappedSequence == null)
 192       {
 193         continue;
 194       }
 195
 196       /*
 197        * get any existing mapping for these sequences (or start one),
 198        * and add this mapped range
 199        */
 200       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 201
 202       try
 203       {
 204         int toStart = Integer.parseInt(tokens[1]);
 205         int toEnd = Integer.parseInt(tokens[2]);
 206         if (tokens.length > 3 && "-".equals(tokens[3]))
 207         {
 208           // mapping to reverse strand - swap start/end
 209           int temp = toStart;
 210           toStart = toEnd;
 211           toEnd = temp;
 212         }
 213
 214         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 215         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 216         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 217                 toStart, toEnd, MappingType.NucleotideToNucleotide);
 218
 219         if (mapping != null)
 220         {
 221           acf.addMap(seq, mappedSequence, mapping);
 222           align.addCodonFrame(acf);
 223         }
 224       } catch (NumberFormatException nfe)
 225       {
 226         System.err.println("Invalid start or end in Target " + target);
 227       }
 228     }
 229
 230     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 231     return sf;
 232   }
 233
 234   /**
 235    * Returns the target sequence id extracted from the GFF name/value pairs.
 236    * Default (standard behaviour) is the first token for "Target". This may be
 237    * overridden where tools report this in a non-standard way.
 238    *
 239    * @param target
 240    *          first token of a "Target" value from GFF column 9, typically
 241    *          "seqid start end"
 242    * @param set
 243    *          a map with all parsed column 9 attributes
 244    * @return
 245    */
 246   @SuppressWarnings("unused")
 247   protected String findTargetId(String target,
 248           Map<String, List<String>> set)
 249   {
 250     return target;
 251   }
 252
 253   /**
 254    * Processes one GFF 'protein_match'; fields of interest are
 255    * <ul>
 256    * <li>feature group - the database reporting a match e.g. Pfam</li>
 257    * <li>Name - the matched entry's accession id in the database</li>
 258    * <li>ID - a sequence identifier for the matched region (which may be
 259    * appended as FASTA in the GFF file)</li>
 260    * </ul>
 261    *
 262    * @param set
 263    *          parsed GFF column 9 key/value(s)
 264    * @param seq
 265    *          the sequence the GFF feature is on
 266    * @param gffColumns
 267    *          the sequence feature holding GFF data
 268    * @param align
 269    *          the alignment the sequence belongs to, where any new mappings
 270    *          should be added
 271    * @param newseqs
 272    *          a list of new 'virtual sequences' generated while parsing GFF
 273    * @param relaxedIdMatching
 274    *          if true allow fuzzy search for a matching target sequence
 275    * @return the (real or virtual) sequence(s) mapped to by this match
 276    * @throws IOException
 277    */
 278   protected SequenceFeature processProteinMatch(
 279           Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
 280           AlignmentI align, List<SequenceI> newseqs,
 281           boolean relaxedIdMatching)
 282   {
 283     // This is currently tailored to InterProScan GFF output:
 284     // ID holds the ID of the matched sequence, Target references the
 285     // query sequence; this looks wrong, as ID should just be the GFF internal
 286     // ID of the GFF feature, while Target would normally reference the matched
 287     // sequence.
 288     // TODO refactor as needed if other protein-protein GFF varies
 289
 290     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 291
 292     /*
 293      * locate the mapped sequence in the alignment, or as a
 294      * (new or existing) virtual sequence in the newseqs list
 295      */
 296     List<String> targets = set.get(TARGET);
 297     if (targets != null)
 298     {
 299       for (String target : targets)
 300       {
 301
 302         SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
 303                 align, newseqs, relaxedIdMatching);
 304         SequenceI mappedSequence = mappedSequence1;
 305         if (mappedSequence == null)
 306         {
 307           continue;
 308         }
 309
 310         /*
 311          * give the mapped sequence a copy of the sequence feature, with
 312          * start/end range adjusted
 313          */
 314         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 315         SequenceFeature sf2 = new SequenceFeature(sf, 1,
 316                 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
 317         mappedSequence.addSequenceFeature(sf2);
 318
 319         /*
 320          * add a property to the mapped sequence so that it can eventually be
 321          * renamed with its qualified accession id; renaming has to wait until
 322          * all sequence reference resolution is complete
 323          */
 324         String accessionId = StringUtils
 325                 .listToDelimitedString(set.get(NAME), ",");
 326         if (accessionId.length() > 0)
 327         {
 328           String database = sf.getType(); // TODO InterProScan only??
 329           String qualifiedAccId = database + "|" + accessionId;
 330           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 331         }
 332
 333         /*
 334          * get any existing mapping for these sequences (or start one),
 335          * and add this mapped range
 336          */
 337         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 338         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 339         int[] to = new int[] { 1, sequenceFeatureLength };
 340         MapList mapping = new MapList(from, to, 1, 1);
 341
 342         alco.addMap(seq, mappedSequence, mapping);
 343         align.addCodonFrame(alco);
 344       }
 345     }
 346
 347     return sf;
 348   }
 349
 350   /**
 351    * Return '=' as the name-value separator used in column 9 attributes.
 352    */
 353   @Override
 354   protected char getNameValueSeparator()
 355   {
 356     return '=';
 357   }
 358
 359   /**
 360    * Modifies the default SequenceFeature in order to set the Target sequence id
 361    * as the description
 362    */
 363   @Override
 364   protected SequenceFeature buildSequenceFeature(String[] gff,
 365           int typeColumn, String group,
 366           Map<String, List<String>> attributes)
 367   {
 368     SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
 369             attributes);
 370     String desc = getDescription(sf, attributes);
 371     if (desc != null)
 372     {
 373       sf.setDescription(desc);
 374     }
 375     return sf;
 376   }
 377
 378   /**
 379    * Apply heuristic rules to try to get the most useful feature description
 380    *
 381    * @param sf
 382    * @param attributes
 383    * @return
 384    */
 385   protected String getDescription(SequenceFeature sf,
 386           Map<String, List<String>> attributes)
 387   {
 388     String desc = null;
 389     String target = (String) sf.getValue(TARGET);
 390     if (target != null)
 391     {
 392       desc = target.split(" ")[0];
 393     }
 394
 395     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 396     String type = sf.getType();
 397     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 398     {
 399       /*
 400        * Ensembl returns dna variants as 'alleles'
 401        */
 402       desc = StringUtils.listToDelimitedString(attributes.get("alleles"),
 403               ",");
 404     }
 405
 406     /*
 407      * extract 'Name' for a transcript (to show gene name)
 408      * or an exon (so 'colour by label' shows exon boundaries)
 409      */
 410     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 411             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 412             || so.isA(type, SequenceOntologyI.EXON))
 413     {
 414       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 415     }
 416
 417     /*
 418      * if the above fails, try ID
 419      */
 420     if (desc == null)
 421     {
 422       desc = (String) sf.getValue(ID);
 423     }
 424
 425     return desc;
 426   }
 427 }