src/jalview/io/gff/ExonerateHelper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import java.util.Locale;
  24
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.MappingType;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31
  32 import java.io.IOException;
  33 import java.util.List;
  34 import java.util.Map;
  35
  36 /**
  37  * A handler to parse GFF in the format generated by the exonerate tool
  38  */
  39 public class ExonerateHelper extends Gff2Helper
  40 {
  41   private static final String SIMILARITY = "similarity";
  42
  43   private static final String GENOME2GENOME = "genome2genome";
  44
  45   private static final String CDNA2GENOME = "cdna2genome";
  46
  47   private static final String CODING2GENOME = "coding2genome";
  48
  49   private static final String CODING2CODING = "coding2coding";
  50
  51   private static final String PROTEIN2GENOME = "protein2genome";
  52
  53   private static final String PROTEIN2DNA = "protein2dna";
  54
  55   private static final String ALIGN = "Align";
  56
  57   private static final String QUERY = "Query";
  58
  59   private static final String TARGET = "Target";
  60
  61   /**
  62    * Process one GFF feature line (as modelled by SequenceFeature)
  63    *
  64    * @param seq
  65    *          the sequence with which this feature is associated
  66    * @param gffColumns
  67    *          the sequence feature with ATTRIBUTES property containing any
  68    *          additional attributes
  69    * @param align
  70    *          the alignment we are adding GFF to
  71    * @param newseqs
  72    *          any new sequences referenced by the GFF
  73    * @param relaxedIdMatching
  74    *          if true, match word tokens in sequence names
  75    * @return true if the sequence feature should be added to the sequence, else
  76    *         false (i.e. it has been processed in another way e.g. to generate a
  77    *         mapping)
  78    */
  79   @Override
  80   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  81           AlignmentI align, List<SequenceI> newseqs,
  82           boolean relaxedIdMatching)
  83   {
  84     String attr = gffColumns[ATTRIBUTES_COL];
  85     Map<String, List<String>> set = parseNameValuePairs(attr);
  86
  87     try
  88     {
  89       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  90               relaxedIdMatching);
  91     } catch (IOException ivfe)
  92     {
  93       jalview.bin.Console.errPrintln(ivfe);
  94     }
  95
  96     /*
  97      * return null to indicate we don't want to add a sequence feature for
  98      * similarity (only process it to create mappings)
  99      */
 100     return null;
 101   }
 102
 103   /**
 104    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
 105    * an exonerate GFF similarity feature; these properties define the mapping of
 106    * the annotated range to a related sequence.
 107    *
 108    * @param set
 109    *          parsed GFF column 9 key/value(s)
 110    * @param seq
 111    *          the sequence the GFF feature is on
 112    * @param gff
 113    *          the GFF column data
 114    * @param align
 115    *          the alignment the sequence belongs to, where any new mappings
 116    *          should be added
 117    * @param newseqs
 118    *          a list of new 'virtual sequences' generated while parsing GFF
 119    * @param relaxedIdMatching
 120    *          if true allow fuzzy search for a matching target sequence
 121    * @throws IOException
 122    */
 123   protected void processGffSimilarity(Map<String, List<String>> set,
 124           SequenceI seq, String[] gff, AlignmentI align,
 125           List<SequenceI> newseqs, boolean relaxedIdMatching)
 126           throws IOException
 127   {
 128     /*
 129      * exonerate may be run with
 130      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 131      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 132      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 133      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 134      * where the Align spec may repeat
 135      */
 136     // TODO handle coding2coding and similar as well
 137     boolean featureIsOnTarget = true;
 138     List<String> mapTo = set.get(QUERY);
 139     if (mapTo == null)
 140     {
 141       mapTo = set.get(TARGET);
 142       featureIsOnTarget = false;
 143     }
 144     MappingType type = getMappingType(gff[SOURCE_COL]);
 145
 146     if (type == null)
 147     {
 148       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 149     }
 150
 151     if (mapTo == null || mapTo.size() != 1)
 152     {
 153       throw new IOException(
 154               "Expecting exactly one sequence in Query or Target field (got "
 155                       + mapTo + ")");
 156     }
 157
 158     /*
 159      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 160      */
 161     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 162             relaxedIdMatching);
 163
 164     /*
 165      * If mapping is from protein to dna, we store it as dna to protein instead
 166      */
 167     SequenceI mapFromSequence = seq;
 168     SequenceI mapToSequence = mappedSequence;
 169     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 170             || (type == MappingType.PeptideToNucleotide
 171                     && !featureIsOnTarget))
 172     {
 173       mapFromSequence = mappedSequence;
 174       mapToSequence = seq;
 175     }
 176
 177     /*
 178      * Process the Align maps and create mappings.
 179      * These may be cdna-genome, cdna-protein, genome-protein.
 180      * The mapped sequences may or may not be in the alignment
 181      * (they may be included later in the GFF file).
 182      */
 183
 184     /*
 185      * get any existing mapping for these sequences (or start one),
 186      * and add this mapped range
 187      */
 188     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 189             mapToSequence);
 190
 191     /*
 192      * exonerate GFF has the strand of the target in column 7
 193      * (differs from GFF3 which has it in the Target descriptor)
 194      */
 195     String strand = gff[STRAND_COL];
 196     boolean forwardStrand = true;
 197     if ("-".equals(strand))
 198     {
 199       forwardStrand = false;
 200     }
 201     else if (!"+".equals(strand))
 202     {
 203       jalview.bin.Console
 204               .errPrintln("Strand must be specified for alignment");
 205       return;
 206     }
 207
 208     List<String> alignedRegions = set.get(ALIGN);
 209     for (String region : alignedRegions)
 210     {
 211       MapList mapping = buildMapping(region, type, forwardStrand,
 212               featureIsOnTarget, gff);
 213
 214       if (mapping == null)
 215       {
 216         continue;
 217       }
 218
 219       acf.addMap(mapFromSequence, mapToSequence, mapping);
 220     }
 221     align.addCodonFrame(acf);
 222   }
 223
 224   /**
 225    * Construct the mapping
 226    *
 227    * @param region
 228    * @param type
 229    * @param forwardStrand
 230    * @param featureIsOnTarget
 231    * @param gff
 232    * @return
 233    */
 234   protected MapList buildMapping(String region, MappingType type,
 235           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 236   {
 237     /*
 238      * process one "fromStart toStart fromCount" descriptor
 239      */
 240     String[] tokens = region.split(" ");
 241     if (tokens.length != 3)
 242     {
 243       jalview.bin.Console
 244               .errPrintln("Malformed Align descriptor: " + region);
 245       return null;
 246     }
 247
 248     /*
 249      * get start/end of from/to mappings
 250      * if feature is on the target sequence we have to invert the sense
 251      */
 252     int alignFromStart;
 253     int alignToStart;
 254     int alignCount;
 255     try
 256     {
 257       alignFromStart = Integer.parseInt(tokens[0]);
 258       alignToStart = Integer.parseInt(tokens[1]);
 259       alignCount = Integer.parseInt(tokens[2]);
 260     } catch (NumberFormatException nfe)
 261     {
 262       jalview.bin.Console.errPrintln(nfe.toString());
 263       return null;
 264     }
 265
 266     int fromStart;
 267     int fromEnd;
 268     int toStart;
 269     int toEnd;
 270
 271     if (featureIsOnTarget)
 272     {
 273       fromStart = alignToStart;
 274       toStart = alignFromStart;
 275       toEnd = forwardStrand ? toStart + alignCount - 1
 276               : toStart - (alignCount - 1);
 277       int toLength = Math.abs(toEnd - toStart) + 1;
 278       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 279       fromEnd = fromStart + fromLength - 1;
 280     }
 281     else
 282     {
 283       // we use the 'Align' values here not the feature start/end
 284       // not clear why they may differ but it seems they can
 285       fromStart = alignFromStart;
 286       fromEnd = alignFromStart + alignCount - 1;
 287       int fromLength = fromEnd - fromStart + 1;
 288       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 289       toStart = alignToStart;
 290       if (forwardStrand)
 291       {
 292         toEnd = toStart + toLength - 1;
 293       }
 294       else
 295       {
 296         toEnd = toStart - (toLength - 1);
 297       }
 298     }
 299
 300     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 301             toStart, toEnd, type);
 302     return codonmapping;
 303   }
 304
 305   /**
 306    * Returns a MappingType depending on the exonerate 'model' value.
 307    *
 308    * @param model
 309    * @return
 310    */
 311   protected static MappingType getMappingType(String model)
 312   {
 313     MappingType result = null;
 314
 315     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 316     {
 317       result = MappingType.PeptideToNucleotide;
 318     }
 319     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 320             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 321     {
 322       result = MappingType.NucleotideToNucleotide;
 323     }
 324     return result;
 325   }
 326
 327   /**
 328    * Tests whether the GFF data looks like it was generated by exonerate, and is
 329    * a format we are willing to handle
 330    *
 331    * @param columns
 332    * @return
 333    */
 334   public static boolean recognises(String[] columns)
 335   {
 336     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 337     {
 338       return false;
 339     }
 340
 341     /*
 342      * inspect alignment model
 343      */
 344     String model = columns[SOURCE_COL];
 345     // e.g. exonerate:protein2genome:local
 346     if (model != null)
 347     {
 348       String mdl = model.toLowerCase(Locale.ROOT);
 349       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 350               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 351               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 352       {
 353         return true;
 354       }
 355     }
 356     jalview.bin.Console
 357             .errPrintln("Sorry, I don't handle exonerate model " + model);
 358     return false;
 359   }
 360
 361   /**
 362    * An override to set feature group to "exonerate" instead of the default GFF
 363    * source value (column 2)
 364    */
 365   @Override
 366   protected SequenceFeature buildSequenceFeature(String[] gff,
 367           Map<String, List<String>> set)
 368   {
 369     SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
 370             "exonerate", set);
 371
 372     return sf;
 373   }
 374
 375 }