src/jalview/io/gff/Gff3Helper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29 import jalview.util.StringUtils;
  30
  31 import java.io.IOException;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * Base class with generic / common functionality for processing GFF3 data.
  37  * Override this as required for any specialisations resulting from
  38  * peculiarities of GFF3 generated by particular tools.
  39  */
  40 public class Gff3Helper extends GffHelperBase
  41 {
  42   public static final String ALLELES = "alleles";
  43
  44   protected static final String TARGET = "Target";
  45
  46   protected static final String ID = "ID";
  47
  48   private static final String NAME = "Name";
  49
  50   /**
  51    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  52    * separate multiple values for a name
  53    *
  54    * @param text
  55    * @return
  56    */
  57   public static Map<String, List<String>> parseNameValuePairs(String text)
  58   {
  59     return parseNameValuePairs(text, ";", '=', ",");
  60   }
  61
  62   /**
  63    * Process one GFF feature line (as modelled by SequenceFeature)
  64    *
  65    * @param seq
  66    *          the sequence with which this feature is associated
  67    * @param sf
  68    *          the sequence feature with ATTRIBUTES property containing any
  69    *          additional attributes
  70    * @param align
  71    *          the alignment we are adding GFF to
  72    * @param newseqs
  73    *          any new sequences referenced by the GFF
  74    * @param relaxedIdMatching
  75    *          if true, match word tokens in sequence names
  76    * @return true if the sequence feature should be added to the sequence, else
  77    *         false (i.e. it has been processed in another way e.g. to generate a
  78    *         mapping)
  79    * @throws IOException
  80    */
  81   @Override
  82   public SequenceFeature processGff(SequenceI seq, String[] gff,
  83           AlignmentI align, List<SequenceI> newseqs,
  84           boolean relaxedIdMatching) throws IOException
  85   {
  86     SequenceFeature sf = null;
  87
  88     if (gff.length == 9)
  89     {
  90       String soTerm = gff[TYPE_COL];
  91       String atts = gff[ATTRIBUTES_COL];
  92       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  93
  94       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  95       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  96       {
  97         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  98                 relaxedIdMatching);
  99       }
 100       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
 101       {
 102         sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
 103                 relaxedIdMatching);
 104       }
 105       else
 106       {
 107         sf = buildSequenceFeature(gff, attributes);
 108       }
 109     }
 110     else
 111     {
 112       /*
 113        * fall back on generating a sequence feature with no special processing
 114        */
 115       sf = buildSequenceFeature(gff, null);
 116     }
 117
 118     return sf;
 119   }
 120
 121   /**
 122    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 123    *
 124    * @param attributes
 125    *          parsed GFF column 9 key/value(s)
 126    * @param seq
 127    *          the sequence the GFF feature is on
 128    * @param gffColumns
 129    *          the GFF column data
 130    * @param align
 131    *          the alignment the sequence belongs to, where any new mappings
 132    *          should be added
 133    * @param newseqs
 134    *          a list of new 'virtual sequences' generated while parsing GFF
 135    * @param relaxedIdMatching
 136    *          if true allow fuzzy search for a matching target sequence
 137    * @return a sequence feature, if one should be added to the sequence, else
 138    *         null
 139    * @throws IOException
 140    */
 141   protected SequenceFeature processNucleotideMatch(
 142           Map<String, List<String>> attributes, SequenceI seq,
 143           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 144           boolean relaxedIdMatching) throws IOException
 145   {
 146     String strand = gffColumns[STRAND_COL];
 147
 148     /*
 149      * (For now) we don't process mappings from reverse complement ; to do
 150      * this would require (a) creating a virtual sequence placeholder for
 151      * the reverse complement (b) resolving the sequence by its id from some
 152      * source (GFF ##FASTA or other) (c) creating the reverse complement
 153      * sequence (d) updating the mapping to be to the reverse complement
 154      */
 155     if ("-".equals(strand))
 156     {
 157       System.err.println(
 158               "Skipping mapping from reverse complement as not yet supported");
 159       return null;
 160     }
 161
 162     List<String> targets = attributes.get(TARGET);
 163     if (targets == null)
 164     {
 165       System.err.println("'Target' missing in GFF");
 166       return null;
 167     }
 168
 169     /*
 170      * Typically we only expect one Target per GFF line, but this can handle
 171      * multiple matches, to the same or different sequences (e.g. dna variants)
 172      */
 173     for (String target : targets)
 174     {
 175       /*
 176        * Process "seqid start end [strand]"
 177        */
 178       String[] tokens = target.split(" ");
 179       if (tokens.length < 3)
 180       {
 181         System.err.println("Incomplete Target: " + target);
 182         continue;
 183       }
 184
 185       /*
 186        * Locate the mapped sequence in the alignment, or as a
 187        * (new or existing) virtual sequence in the newseqs list
 188        */
 189       String targetId = findTargetId(tokens[0], attributes);
 190       SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
 191               relaxedIdMatching);
 192       SequenceI mappedSequence = mappedSequence1;
 193       if (mappedSequence == null)
 194       {
 195         continue;
 196       }
 197
 198       /*
 199        * get any existing mapping for these sequences (or start one),
 200        * and add this mapped range
 201        */
 202       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 203
 204       try
 205       {
 206         int toStart = Integer.parseInt(tokens[1]);
 207         int toEnd = Integer.parseInt(tokens[2]);
 208         if (tokens.length > 3 && "-".equals(tokens[3]))
 209         {
 210           // mapping to reverse strand - swap start/end
 211           int temp = toStart;
 212           toStart = toEnd;
 213           toEnd = temp;
 214         }
 215
 216         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 217         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 218         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 219                 toStart, toEnd, MappingType.NucleotideToNucleotide);
 220
 221         if (mapping != null)
 222         {
 223           acf.addMap(seq, mappedSequence, mapping);
 224           align.addCodonFrame(acf);
 225         }
 226       } catch (NumberFormatException nfe)
 227       {
 228         System.err.println("Invalid start or end in Target " + target);
 229       }
 230     }
 231
 232     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 233     return sf;
 234   }
 235
 236   /**
 237    * Returns the target sequence id extracted from the GFF name/value pairs.
 238    * Default (standard behaviour) is the first token for "Target". This may be
 239    * overridden where tools report this in a non-standard way.
 240    *
 241    * @param target
 242    *          first token of a "Target" value from GFF column 9, typically
 243    *          "seqid start end"
 244    * @param set
 245    *          a map with all parsed column 9 attributes
 246    * @return
 247    */
 248   @SuppressWarnings("unused")
 249   protected String findTargetId(String target,
 250           Map<String, List<String>> set)
 251   {
 252     return target;
 253   }
 254
 255   /**
 256    * Processes one GFF 'protein_match'; fields of interest are
 257    * <ul>
 258    * <li>feature group - the database reporting a match e.g. Pfam</li>
 259    * <li>Name - the matched entry's accession id in the database</li>
 260    * <li>ID - a sequence identifier for the matched region (which may be
 261    * appended as FASTA in the GFF file)</li>
 262    * </ul>
 263    *
 264    * @param set
 265    *          parsed GFF column 9 key/value(s)
 266    * @param seq
 267    *          the sequence the GFF feature is on
 268    * @param gffColumns
 269    *          the sequence feature holding GFF data
 270    * @param align
 271    *          the alignment the sequence belongs to, where any new mappings
 272    *          should be added
 273    * @param newseqs
 274    *          a list of new 'virtual sequences' generated while parsing GFF
 275    * @param relaxedIdMatching
 276    *          if true allow fuzzy search for a matching target sequence
 277    * @return the (real or virtual) sequence(s) mapped to by this match
 278    * @throws IOException
 279    */
 280   protected SequenceFeature processProteinMatch(
 281           Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
 282           AlignmentI align, List<SequenceI> newseqs,
 283           boolean relaxedIdMatching)
 284   {
 285     // This is currently tailored to InterProScan GFF output:
 286     // ID holds the ID of the matched sequence, Target references the
 287     // query sequence; this looks wrong, as ID should just be the GFF internal
 288     // ID of the GFF feature, while Target would normally reference the matched
 289     // sequence.
 290     // TODO refactor as needed if other protein-protein GFF varies
 291
 292     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 293
 294     /*
 295      * locate the mapped sequence in the alignment, or as a
 296      * (new or existing) virtual sequence in the newseqs list
 297      */
 298     List<String> targets = set.get(TARGET);
 299     if (targets != null)
 300     {
 301       for (String target : targets)
 302       {
 303
 304         SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
 305                 align, newseqs, relaxedIdMatching);
 306         SequenceI mappedSequence = mappedSequence1;
 307         if (mappedSequence == null)
 308         {
 309           continue;
 310         }
 311
 312         /*
 313          * give the mapped sequence a copy of the sequence feature, with
 314          * start/end range adjusted
 315          */
 316         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 317         SequenceFeature sf2 = new SequenceFeature(sf, 1,
 318                 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
 319         mappedSequence.addSequenceFeature(sf2);
 320
 321         /*
 322          * add a property to the mapped sequence so that it can eventually be
 323          * renamed with its qualified accession id; renaming has to wait until
 324          * all sequence reference resolution is complete
 325          */
 326         String accessionId = StringUtils
 327                 .listToDelimitedString(set.get(NAME), ",");
 328         if (accessionId.length() > 0)
 329         {
 330           String database = sf.getType(); // TODO InterProScan only??
 331           String qualifiedAccId = database + "|" + accessionId;
 332           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 333         }
 334
 335         /*
 336          * get any existing mapping for these sequences (or start one),
 337          * and add this mapped range
 338          */
 339         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 340         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 341         int[] to = new int[] { 1, sequenceFeatureLength };
 342         MapList mapping = new MapList(from, to, 1, 1);
 343
 344         alco.addMap(seq, mappedSequence, mapping);
 345         align.addCodonFrame(alco);
 346       }
 347     }
 348
 349     return sf;
 350   }
 351
 352   /**
 353    * Modifies the default SequenceFeature in order to set the Target sequence id
 354    * as the description
 355    */
 356   @Override
 357   protected SequenceFeature buildSequenceFeature(String[] gff,
 358           int typeColumn, String group,
 359           Map<String, List<String>> attributes)
 360   {
 361     SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
 362             attributes);
 363     String desc = getDescription(sf, attributes);
 364     if (desc != null)
 365     {
 366       sf.setDescription(desc);
 367     }
 368     return sf;
 369   }
 370
 371   /**
 372    * Apply heuristic rules to try to get the most useful feature description
 373    *
 374    * @param sf
 375    * @param attributes
 376    * @return
 377    */
 378   protected String getDescription(SequenceFeature sf,
 379           Map<String, List<String>> attributes)
 380   {
 381     String desc = null;
 382     String target = (String) sf.getValue(TARGET);
 383     if (target != null)
 384     {
 385       desc = target.split(" ")[0];
 386     }
 387
 388     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 389     String type = sf.getType();
 390     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 391     {
 392       /*
 393        * Ensembl returns dna variants as 'alleles'
 394        */
 395       desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
 396               ",");
 397     }
 398
 399     /*
 400      * extract 'Name' for a transcript (to show gene name)
 401      * or an exon (so 'colour by label' shows exon boundaries)
 402      */
 403     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 404             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 405             || so.isA(type, SequenceOntologyI.EXON))
 406     {
 407       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 408     }
 409
 410     /*
 411      * if the above fails, try ID
 412      */
 413     if (desc == null)
 414     {
 415       desc = (String) sf.getValue(ID);
 416     }
 417
 418     /*
 419      * and decode comma, equals, semi-colon as required by GFF3 spec
 420      */
 421     desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);
 422
 423     return desc;
 424   }
 425 }