src/jalview/io/gff/Gff3Helper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29 import jalview.util.StringUtils;
  30
  31 import java.io.IOException;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * Base class with generic / common functionality for processing GFF3 data.
  37  * Override this as required for any specialisations resulting from
  38  * peculiarities of GFF3 generated by particular tools.
  39  */
  40 public class Gff3Helper extends GffHelperBase
  41 {
  42   protected static final String TARGET = "Target";
  43
  44   protected static final String ID = "ID";
  45
  46   private static final String NAME = "Name";
  47
  48   /**
  49    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
  50    * separate multiple values for a name
  51    *
  52    * @param text
  53    * @return
  54    */
  55   public static Map<String, List<String>> parseNameValuePairs(String text)
  56   {
  57     return parseNameValuePairs(text, ";", '=', ",");
  58   }
  59
  60   /**
  61    * Process one GFF feature line (as modelled by SequenceFeature)
  62    *
  63    * @param seq
  64    *          the sequence with which this feature is associated
  65    * @param sf
  66    *          the sequence feature with ATTRIBUTES property containing any
  67    *          additional attributes
  68    * @param align
  69    *          the alignment we are adding GFF to
  70    * @param newseqs
  71    *          any new sequences referenced by the GFF
  72    * @param relaxedIdMatching
  73    *          if true, match word tokens in sequence names
  74    * @return true if the sequence feature should be added to the sequence, else
  75    *         false (i.e. it has been processed in another way e.g. to generate a
  76    *         mapping)
  77    * @throws IOException
  78    */
  79   @Override
  80   public SequenceFeature processGff(SequenceI seq, String[] gff,
  81           AlignmentI align, List<SequenceI> newseqs,
  82           boolean relaxedIdMatching) throws IOException
  83   {
  84     SequenceFeature sf = null;
  85
  86     if (gff.length == 9)
  87     {
  88       String soTerm = gff[TYPE_COL];
  89       String atts = gff[ATTRIBUTES_COL];
  90       Map<String, List<String>> attributes = parseNameValuePairs(atts);
  91
  92       SequenceOntologyI so = SequenceOntologyFactory.getInstance();
  93       if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
  94       {
  95         sf = processProteinMatch(attributes, seq, gff, align, newseqs,
  96                 relaxedIdMatching);
  97       }
  98       else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
  99       {
 100         sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
 101                 relaxedIdMatching);
 102       }
 103       else
 104       {
 105         sf = buildSequenceFeature(gff, attributes);
 106       }
 107     }
 108     else
 109     {
 110       /*
 111        * fall back on generating a sequence feature with no special processing
 112        */
 113       sf = buildSequenceFeature(gff, null);
 114     }
 115
 116     return sf;
 117   }
 118
 119   /**
 120    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
 121    *
 122    * @param attributes
 123    *          parsed GFF column 9 key/value(s)
 124    * @param seq
 125    *          the sequence the GFF feature is on
 126    * @param gffColumns
 127    *          the GFF column data
 128    * @param align
 129    *          the alignment the sequence belongs to, where any new mappings
 130    *          should be added
 131    * @param newseqs
 132    *          a list of new 'virtual sequences' generated while parsing GFF
 133    * @param relaxedIdMatching
 134    *          if true allow fuzzy search for a matching target sequence
 135    * @return a sequence feature, if one should be added to the sequence, else
 136    *         null
 137    * @throws IOException
 138    */
 139   protected SequenceFeature processNucleotideMatch(
 140           Map<String, List<String>> attributes, SequenceI seq,
 141           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 142           boolean relaxedIdMatching) throws IOException
 143   {
 144     String strand = gffColumns[STRAND_COL];
 145
 146     /*
 147      * (For now) we don't process mappings from reverse complement ; to do
 148      * this would require (a) creating a virtual sequence placeholder for
 149      * the reverse complement (b) resolving the sequence by its id from some
 150      * source (GFF ##FASTA or other) (c) creating the reverse complement
 151      * sequence (d) updating the mapping to be to the reverse complement
 152      */
 153     if ("-".equals(strand))
 154     {
 155       System.err
 156               .println("Skipping mapping from reverse complement as not yet supported");
 157       return null;
 158     }
 159
 160     List<String> targets = attributes.get(TARGET);
 161     if (targets == null)
 162     {
 163       System.err.println("'Target' missing in GFF");
 164       return null;
 165     }
 166
 167     /*
 168      * Typically we only expect one Target per GFF line, but this can handle
 169      * multiple matches, to the same or different sequences (e.g. dna variants)
 170      */
 171     for (String target : targets)
 172     {
 173       /*
 174        * Process "seqid start end [strand]"
 175        */
 176       String[] tokens = target.split(" ");
 177       if (tokens.length < 3)
 178       {
 179         System.err.println("Incomplete Target: " + target);
 180         continue;
 181       }
 182
 183       /*
 184        * Locate the mapped sequence in the alignment, or as a
 185        * (new or existing) virtual sequence in the newseqs list
 186        */
 187       String targetId = findTargetId(tokens[0], attributes);
 188       SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
 189               relaxedIdMatching);
 190       SequenceI mappedSequence = mappedSequence1;
 191       if (mappedSequence == null)
 192       {
 193         continue;
 194       }
 195
 196       /*
 197        * get any existing mapping for these sequences (or start one),
 198        * and add this mapped range
 199        */
 200       AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
 201
 202       try
 203       {
 204         int toStart = Integer.parseInt(tokens[1]);
 205         int toEnd = Integer.parseInt(tokens[2]);
 206         if (tokens.length > 3 && "-".equals(tokens[3]))
 207         {
 208           // mapping to reverse strand - swap start/end
 209           int temp = toStart;
 210           toStart = toEnd;
 211           toEnd = temp;
 212         }
 213
 214         int fromStart = Integer.parseInt(gffColumns[START_COL]);
 215         int fromEnd = Integer.parseInt(gffColumns[END_COL]);
 216         MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
 217                 toStart, toEnd, MappingType.NucleotideToNucleotide);
 218
 219         if (mapping != null)
 220         {
 221           acf.addMap(seq, mappedSequence, mapping);
 222           align.addCodonFrame(acf);
 223         }
 224       } catch (NumberFormatException nfe)
 225       {
 226         System.err.println("Invalid start or end in Target " + target);
 227       }
 228     }
 229
 230     SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
 231     return sf;
 232   }
 233
 234   /**
 235    * Returns the target sequence id extracted from the GFF name/value pairs.
 236    * Default (standard behaviour) is the first token for "Target". This may be
 237    * overridden where tools report this in a non-standard way.
 238    *
 239    * @param target
 240    *          first token of a "Target" value from GFF column 9, typically
 241    *          "seqid start end"
 242    * @param set
 243    *          a map with all parsed column 9 attributes
 244    * @return
 245    */
 246   @SuppressWarnings("unused")
 247   protected String findTargetId(String target, Map<String, List<String>> set)
 248   {
 249     return target;
 250   }
 251
 252   /**
 253    * Processes one GFF 'protein_match'; fields of interest are
 254    * <ul>
 255    * <li>feature group - the database reporting a match e.g. Pfam</li>
 256    * <li>Name - the matched entry's accession id in the database</li>
 257    * <li>ID - a sequence identifier for the matched region (which may be
 258    * appended as FASTA in the GFF file)</li>
 259    * </ul>
 260    *
 261    * @param set
 262    *          parsed GFF column 9 key/value(s)
 263    * @param seq
 264    *          the sequence the GFF feature is on
 265    * @param gffColumns
 266    *          the sequence feature holding GFF data
 267    * @param align
 268    *          the alignment the sequence belongs to, where any new mappings
 269    *          should be added
 270    * @param newseqs
 271    *          a list of new 'virtual sequences' generated while parsing GFF
 272    * @param relaxedIdMatching
 273    *          if true allow fuzzy search for a matching target sequence
 274    * @return the (real or virtual) sequence(s) mapped to by this match
 275    * @throws IOException
 276    */
 277   protected SequenceFeature processProteinMatch(
 278           Map<String, List<String>> set, SequenceI seq,
 279           String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
 280           boolean relaxedIdMatching)
 281   {
 282     // This is currently tailored to InterProScan GFF output:
 283     // ID holds the ID of the matched sequence, Target references the
 284     // query sequence; this looks wrong, as ID should just be the GFF internal
 285     // ID of the GFF feature, while Target would normally reference the matched
 286     // sequence.
 287     // TODO refactor as needed if other protein-protein GFF varies
 288
 289     SequenceFeature sf = buildSequenceFeature(gffColumns, set);
 290
 291     /*
 292      * locate the mapped sequence in the alignment, or as a
 293      * (new or existing) virtual sequence in the newseqs list
 294      */
 295     List<String> targets = set.get(TARGET);
 296     if (targets != null)
 297     {
 298       for (String target : targets)
 299       {
 300
 301         SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
 302                 align, newseqs, relaxedIdMatching);
 303         SequenceI mappedSequence = mappedSequence1;
 304         if (mappedSequence == null)
 305         {
 306           continue;
 307         }
 308
 309         /*
 310          * give the mapped sequence a copy of the sequence feature, with
 311          * start/end range adjusted
 312          */
 313         int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
 314         SequenceFeature sf2 = new SequenceFeature(sf, 1,
 315                 sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
 316         mappedSequence.addSequenceFeature(sf2);
 317
 318         /*
 319          * add a property to the mapped sequence so that it can eventually be
 320          * renamed with its qualified accession id; renaming has to wait until
 321          * all sequence reference resolution is complete
 322          */
 323         String accessionId = StringUtils.listToDelimitedString(
 324                 set.get(NAME), ",");
 325         if (accessionId.length() > 0)
 326         {
 327           String database = sf.getType(); // TODO InterProScan only??
 328           String qualifiedAccId = database + "|" + accessionId;
 329           sf2.setValue(RENAME_TOKEN, qualifiedAccId);
 330         }
 331
 332         /*
 333          * get any existing mapping for these sequences (or start one),
 334          * and add this mapped range
 335          */
 336         AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
 337         int[] from = new int[] { sf.getBegin(), sf.getEnd() };
 338         int[] to = new int[] { 1, sequenceFeatureLength };
 339         MapList mapping = new MapList(from, to, 1, 1);
 340
 341         alco.addMap(seq, mappedSequence, mapping);
 342         align.addCodonFrame(alco);
 343       }
 344     }
 345
 346     return sf;
 347   }
 348
 349   /**
 350    * Return '=' as the name-value separator used in column 9 attributes.
 351    */
 352   @Override
 353   protected char getNameValueSeparator()
 354   {
 355     return '=';
 356   }
 357
 358   /**
 359    * Modifies the default SequenceFeature in order to set the Target sequence id
 360    * as the description
 361    */
 362   @Override
 363   protected SequenceFeature buildSequenceFeature(String[] gff,
 364           int typeColumn, String group,
 365           Map<String, List<String>> attributes)
 366   {
 367     SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
 368             attributes);
 369     String desc = getDescription(sf, attributes);
 370     if (desc != null)
 371     {
 372       sf.setDescription(desc);
 373     }
 374     return sf;
 375   }
 376
 377   /**
 378    * Apply heuristic rules to try to get the most useful feature description
 379    *
 380    * @param sf
 381    * @param attributes
 382    * @return
 383    */
 384   protected String getDescription(SequenceFeature sf,
 385           Map<String, List<String>> attributes)
 386   {
 387     String desc = null;
 388     String target = (String) sf.getValue(TARGET);
 389     if (target != null)
 390     {
 391       desc = target.split(" ")[0];
 392     }
 393
 394     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 395     String type = sf.getType();
 396     if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
 397     {
 398       /*
 399        * Ensembl returns dna variants as 'alleles'
 400        */
 401       desc = StringUtils.listToDelimitedString(attributes.get("alleles"),
 402               ",");
 403     }
 404
 405     /*
 406      * extract 'Name' for a transcript (to show gene name)
 407      * or an exon (so 'colour by label' shows exon boundaries)
 408      */
 409     if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
 410             || so.isA(type, SequenceOntologyI.TRANSCRIPT)
 411             || so.isA(type, SequenceOntologyI.EXON))
 412     {
 413       desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
 414     }
 415
 416     /*
 417      * if the above fails, try ID
 418      */
 419     if (desc == null)
 420     {
 421       desc = (String) sf.getValue(ID);
 422     }
 423
 424     return desc;
 425   }
 426 }