src/jalview/io/gff/Gff3Helper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29 import jalview.util.StringUtils;
  30
  31 import java.io.IOException;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * Base class with generic / common functionality for processing GFF3 data.
  37  * Override this as required for any specialisations resulting from
  38  * peculiarities of GFF3 generated by particular tools.
  39  */
  40 public class Gff3Helper extends GffHelperBase
  41 {
  42   public static final String ALLELES = "alleles";
  43
  44   protected static final String TARGET = "Target";
  45
  46   protected static final String ID = "ID";
  47
  48   private static final String NAME = "Name";
  49
  50   /**
  51    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  52    * separate multiple values for a name
  53    *
  54    * @param text
  55    * @return
  56    */
  57   public static Map<String, List<String>> parseNameValuePairs(String text)
  58   {
  59     return parseNameValuePairs(text, ";", '=', ",");
  60   }
  61
  62   /**
  63    * Process one GFF feature line (as modelled by SequenceFeature)
  64    *
  65    * @param seq
  66    *          the sequence with which this feature is associated
  67    * @param sf
  68    *          the sequence feature with ATTRIBUTES property containing any
  69    *          additional attributes
  70    * @param align
  71    *          the alignment we are adding GFF to
  72    * @param newseqs
  73    *          any new sequences referenced by the GFF
  74    * @param relaxedIdMatching
  75    *          if true, match word tokens in sequence names
  76    * @return true if the sequence feature should be added to the sequence, else
  77    *         false (i.e. it has been processed in another way e.g. to generate a
  78    *         mapping)
  79    * @throws IOException
  80    */
  81   @Override
  82   public SequenceFeature processGff(SequenceI seq, String[] gff,
  83           AlignmentI align, List<SequenceI> newseqs,
  84           boolean relaxedIdMatching) throws IOException
  85   {
  86     SequenceFeature sf = null;
  87
  88     if (gff.length == 9)
  89     {
  90       String soTerm = gff[TYPE_COL];
  91       String atts = gff[ATTRIBUTES_COL];
  92       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  93
  94       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  95       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  96       {
  97         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  98                 relaxedIdMatching);
  99       }
 100       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
 101       {
 102         sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
 103                 relaxedIdMatching);
 104       }
 105       else
 106       {
 107         sf = buildSequenceFeature(gff, attributes);
 108       }
 109     }
 110     else
 111     {
 112       /*
 113        * fall back on generating a sequence feature with no special processing
 114        */
 115       sf = buildSequenceFeature(gff, null);
 116     }
 117
 118     return sf;
 119   }
 120
 121   /**
 122    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 123    *
 124    * @param attributes
 125    *          parsed GFF column 9 key/value(s)
 126    * @param seq
 127    *          the sequence the GFF feature is on
 128    * @param gffColumns
 129    *          the GFF column data
 130    * @param align
 131    *          the alignment the sequence belongs to, where any new mappings
 132    *          should be added
 133    * @param newseqs
 134    *          a list of new 'virtual sequences' generated while parsing GFF
 135    * @param relaxedIdMatching
 136    *          if true allow fuzzy search for a matching target sequence
 137    * @return a sequence feature, if one should be added to the sequence, else
 138    *         null
 139    * @throws IOException
 140    */
 141   protected SequenceFeature processNucleotideMatch(
 142           Map<String, List<String>> attributes, SequenceI seq,
 143           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 144           boolean relaxedIdMatching) throws IOException
 145   {
 146     String strand = gffColumns[STRAND_COL];
 147
 148     /*
 149      * (For now) we don't process mappings from reverse complement ; to do
 150      * this would require (a) creating a virtual sequence placeholder for
 151      * the reverse complement (b) resolving the sequence by its id from some
 152      * source (GFF ##FASTA or other) (c) creating the reverse complement
 153      * sequence (d) updating the mapping to be to the reverse complement
 154      */
 155     if ("-".equals(strand))
 156     {
 157       jalview.bin.Console.errPrintln(
 158               "Skipping mapping from reverse complement as not yet supported");
 159       return null;
 160     }
 161
 162     List<String> targets = attributes.get(TARGET);
 163     if (targets == null)
 164     {
 165       jalview.bin.Console.errPrintln("'Target' missing in GFF");
 166       return null;
 167     }
 168
 169     /*
 170      * Typically we only expect one Target per GFF line, but this can handle
 171      * multiple matches, to the same or different sequences (e.g. dna variants)
 172      */
 173     for (String target : targets)
 174     {
 175       /*
 176        * Process "seqid start end [strand]"
 177        */
 178       String[] tokens = target.split(" ");
 179       if (tokens.length < 3)
 180       {
 181         jalview.bin.Console.errPrintln("Incomplete Target: " + target);
 182         continue;
 183       }
 184
 185       /*
 186        * Locate the mapped sequence in the alignment, or as a
 187        * (new or existing) virtual sequence in the newseqs list
 188        */
 189       String targetId = findTargetId(tokens[0], attributes);
 190       SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
 191               relaxedIdMatching);
 192       SequenceI mappedSequence = mappedSequence1;
 193       if (mappedSequence == null)
 194       {
 195         continue;
 196       }
 197
 198       /*
 199        * get any existing mapping for these sequences (or start one),
 200        * and add this mapped range
 201        */
 202       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 203
 204       try
 205       {
 206         int toStart = Integer.parseInt(tokens[1]);
 207         int toEnd = Integer.parseInt(tokens[2]);
 208         if (tokens.length > 3 && "-".equals(tokens[3]))
 209         {
 210           // mapping to reverse strand - swap start/end
 211           int temp = toStart;
 212           toStart = toEnd;
 213           toEnd = temp;
 214         }
 215
 216         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 217         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 218         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 219                 toStart, toEnd, MappingType.NucleotideToNucleotide);
 220
 221         if (mapping != null)
 222         {
 223           acf.addMap(seq, mappedSequence, mapping);
 224           align.addCodonFrame(acf);
 225         }
 226       } catch (NumberFormatException nfe)
 227       {
 228         jalview.bin.Console
 229                 .errPrintln("Invalid start or end in Target " + target);
 230       }
 231     }
 232
 233     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 234     return sf;
 235   }
 236
 237   /**
 238    * Returns the target sequence id extracted from the GFF name/value pairs.
 239    * Default (standard behaviour) is the first token for "Target". This may be
 240    * overridden where tools report this in a non-standard way.
 241    *
 242    * @param target
 243    *          first token of a "Target" value from GFF column 9, typically
 244    *          "seqid start end"
 245    * @param set
 246    *          a map with all parsed column 9 attributes
 247    * @return
 248    */
 249   @SuppressWarnings("unused")
 250   protected String findTargetId(String target,
 251           Map<String, List<String>> set)
 252   {
 253     return target;
 254   }
 255
 256   /**
 257    * Processes one GFF 'protein_match'; fields of interest are
 258    * <ul>
 259    * <li>feature group - the database reporting a match e.g. Pfam</li>
 260    * <li>Name - the matched entry's accession id in the database</li>
 261    * <li>ID - a sequence identifier for the matched region (which may be
 262    * appended as FASTA in the GFF file)</li>
 263    * </ul>
 264    *
 265    * @param set
 266    *          parsed GFF column 9 key/value(s)
 267    * @param seq
 268    *          the sequence the GFF feature is on
 269    * @param gffColumns
 270    *          the sequence feature holding GFF data
 271    * @param align
 272    *          the alignment the sequence belongs to, where any new mappings
 273    *          should be added
 274    * @param newseqs
 275    *          a list of new 'virtual sequences' generated while parsing GFF
 276    * @param relaxedIdMatching
 277    *          if true allow fuzzy search for a matching target sequence
 278    * @return the (real or virtual) sequence(s) mapped to by this match
 279    * @throws IOException
 280    */
 281   protected SequenceFeature processProteinMatch(
 282           Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
 283           AlignmentI align, List<SequenceI> newseqs,
 284           boolean relaxedIdMatching)
 285   {
 286     // This is currently tailored to InterProScan GFF output:
 287     // ID holds the ID of the matched sequence, Target references the
 288     // query sequence; this looks wrong, as ID should just be the GFF internal
 289     // ID of the GFF feature, while Target would normally reference the matched
 290     // sequence.
 291     // TODO refactor as needed if other protein-protein GFF varies
 292
 293     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 294
 295     /*
 296      * locate the mapped sequence in the alignment, or as a
 297      * (new or existing) virtual sequence in the newseqs list
 298      */
 299     List<String> targets = set.get(TARGET);
 300     if (targets != null)
 301     {
 302       for (String target : targets)
 303       {
 304
 305         SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
 306                 align, newseqs, relaxedIdMatching);
 307         SequenceI mappedSequence = mappedSequence1;
 308         if (mappedSequence == null)
 309         {
 310           continue;
 311         }
 312
 313         /*
 314          * give the mapped sequence a copy of the sequence feature, with
 315          * start/end range adjusted
 316          */
 317         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 318         SequenceFeature sf2 = new SequenceFeature(sf, 1,
 319                 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
 320         mappedSequence.addSequenceFeature(sf2);
 321
 322         /*
 323          * add a property to the mapped sequence so that it can eventually be
 324          * renamed with its qualified accession id; renaming has to wait until
 325          * all sequence reference resolution is complete
 326          */
 327         String accessionId = StringUtils
 328                 .listToDelimitedString(set.get(NAME), ",");
 329         if (accessionId.length() > 0)
 330         {
 331           String database = sf.getType(); // TODO InterProScan only??
 332           String qualifiedAccId = database + "|" + accessionId;
 333           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 334         }
 335
 336         /*
 337          * get any existing mapping for these sequences (or start one),
 338          * and add this mapped range
 339          */
 340         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 341         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 342         int[] to = new int[] { 1, sequenceFeatureLength };
 343         MapList mapping = new MapList(from, to, 1, 1);
 344
 345         alco.addMap(seq, mappedSequence, mapping);
 346         align.addCodonFrame(alco);
 347       }
 348     }
 349
 350     return sf;
 351   }
 352
 353   /**
 354    * Modifies the default SequenceFeature in order to set the Target sequence id
 355    * as the description
 356    */
 357   @Override
 358   protected SequenceFeature buildSequenceFeature(String[] gff,
 359           int typeColumn, String group,
 360           Map<String, List<String>> attributes)
 361   {
 362     SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
 363             attributes);
 364     String desc = getDescription(sf, attributes);
 365     if (desc != null)
 366     {
 367       sf.setDescription(desc);
 368     }
 369     return sf;
 370   }
 371
 372   /**
 373    * Apply heuristic rules to try to get the most useful feature description
 374    *
 375    * @param sf
 376    * @param attributes
 377    * @return
 378    */
 379   protected String getDescription(SequenceFeature sf,
 380           Map<String, List<String>> attributes)
 381   {
 382     String desc = null;
 383     String target = (String) sf.getValue(TARGET);
 384     if (target != null)
 385     {
 386       desc = target.split(" ")[0];
 387     }
 388
 389     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 390     String type = sf.getType();
 391     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 392     {
 393       /*
 394        * Ensembl returns dna variants as 'alleles'
 395        */
 396       desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
 397               ",");
 398     }
 399
 400     /*
 401      * extract 'Name' for a transcript (to show gene name)
 402      * or an exon (so 'colour by label' shows exon boundaries)
 403      */
 404     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 405             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 406             || so.isA(type, SequenceOntologyI.EXON))
 407     {
 408       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 409     }
 410
 411     /*
 412      * if the above fails, try ID
 413      */
 414     if (desc == null)
 415     {
 416       desc = (String) sf.getValue(ID);
 417     }
 418
 419     /*
 420      * and decode comma, equals, semi-colon as required by GFF3 spec
 421      */
 422     desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);
 423
 424     return desc;
 425   }
 426 }