src/jalview/io/gff/ExonerateHelper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import java.util.Locale;
  24
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.MappingType;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31
  32 import java.io.IOException;
  33 import java.util.List;
  34 import java.util.Map;
  35
  36 /**
  37  * A handler to parse GFF in the format generated by the exonerate tool
  38  */
  39 public class ExonerateHelper extends Gff2Helper
  40 {
  41   private static final String SIMILARITY = "similarity";
  42
  43   private static final String GENOME2GENOME = "genome2genome";
  44
  45   private static final String CDNA2GENOME = "cdna2genome";
  46
  47   private static final String CODING2GENOME = "coding2genome";
  48
  49   private static final String CODING2CODING = "coding2coding";
  50
  51   private static final String PROTEIN2GENOME = "protein2genome";
  52
  53   private static final String PROTEIN2DNA = "protein2dna";
  54
  55   private static final String ALIGN = "Align";
  56
  57   private static final String QUERY = "Query";
  58
  59   private static final String TARGET = "Target";
  60
  61   /**
  62    * Process one GFF feature line (as modelled by SequenceFeature)
  63    *
  64    * @param seq
  65    *          the sequence with which this feature is associated
  66    * @param gffColumns
  67    *          the sequence feature with ATTRIBUTES property containing any
  68    *          additional attributes
  69    * @param align
  70    *          the alignment we are adding GFF to
  71    * @param newseqs
  72    *          any new sequences referenced by the GFF
  73    * @param relaxedIdMatching
  74    *          if true, match word tokens in sequence names
  75    * @return true if the sequence feature should be added to the sequence, else
  76    *         false (i.e. it has been processed in another way e.g. to generate a
  77    *         mapping)
  78    */
  79   @Override
  80   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  81           AlignmentI align, List<SequenceI> newseqs,
  82           boolean relaxedIdMatching)
  83   {
  84     String attr = gffColumns[ATTRIBUTES_COL];
  85     Map<String, List<String>> set = parseNameValuePairs(attr);
  86
  87     try
  88     {
  89       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  90               relaxedIdMatching);
  91     } catch (IOException ivfe)
  92     {
  93       System.err.println(ivfe);
  94     }
  95
  96     /*
  97      * return null to indicate we don't want to add a sequence feature for
  98      * similarity (only process it to create mappings)
  99      */
 100     return null;
 101   }
 102
 103   /**
 104    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
 105    * an exonerate GFF similarity feature; these properties define the mapping of
 106    * the annotated range to a related sequence.
 107    *
 108    * @param set
 109    *          parsed GFF column 9 key/value(s)
 110    * @param seq
 111    *          the sequence the GFF feature is on
 112    * @param gff
 113    *          the GFF column data
 114    * @param align
 115    *          the alignment the sequence belongs to, where any new mappings
 116    *          should be added
 117    * @param newseqs
 118    *          a list of new 'virtual sequences' generated while parsing GFF
 119    * @param relaxedIdMatching
 120    *          if true allow fuzzy search for a matching target sequence
 121    * @throws IOException
 122    */
 123   protected void processGffSimilarity(Map<String, List<String>> set,
 124           SequenceI seq, String[] gff, AlignmentI align,
 125           List<SequenceI> newseqs, boolean relaxedIdMatching)
 126           throws IOException
 127   {
 128     /*
 129      * exonerate may be run with
 130      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 131      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 132      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 133      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 134      * where the Align spec may repeat
 135      */
 136     // TODO handle coding2coding and similar as well
 137     boolean featureIsOnTarget = true;
 138     List<String> mapTo = set.get(QUERY);
 139     if (mapTo == null)
 140     {
 141       mapTo = set.get(TARGET);
 142       featureIsOnTarget = false;
 143     }
 144     MappingType type = getMappingType(gff[SOURCE_COL]);
 145
 146     if (type == null)
 147     {
 148       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 149     }
 150
 151     if (mapTo == null || mapTo.size() != 1)
 152     {
 153       throw new IOException(
 154               "Expecting exactly one sequence in Query or Target field (got "
 155                       + mapTo + ")");
 156     }
 157
 158     /*
 159      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 160      */
 161     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 162             relaxedIdMatching);
 163
 164     /*
 165      * If mapping is from protein to dna, we store it as dna to protein instead
 166      */
 167     SequenceI mapFromSequence = seq;
 168     SequenceI mapToSequence = mappedSequence;
 169     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 170             || (type == MappingType.PeptideToNucleotide
 171                     && !featureIsOnTarget))
 172     {
 173       mapFromSequence = mappedSequence;
 174       mapToSequence = seq;
 175     }
 176
 177     /*
 178      * Process the Align maps and create mappings.
 179      * These may be cdna-genome, cdna-protein, genome-protein.
 180      * The mapped sequences may or may not be in the alignment
 181      * (they may be included later in the GFF file).
 182      */
 183
 184     /*
 185      * get any existing mapping for these sequences (or start one),
 186      * and add this mapped range
 187      */
 188     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 189             mapToSequence);
 190
 191     /*
 192      * exonerate GFF has the strand of the target in column 7
 193      * (differs from GFF3 which has it in the Target descriptor)
 194      */
 195     String strand = gff[STRAND_COL];
 196     boolean forwardStrand = true;
 197     if ("-".equals(strand))
 198     {
 199       forwardStrand = false;
 200     }
 201     else if (!"+".equals(strand))
 202     {
 203       System.err.println("Strand must be specified for alignment");
 204       return;
 205     }
 206
 207     List<String> alignedRegions = set.get(ALIGN);
 208     for (String region : alignedRegions)
 209     {
 210       MapList mapping = buildMapping(region, type, forwardStrand,
 211               featureIsOnTarget, gff);
 212
 213       if (mapping == null)
 214       {
 215         continue;
 216       }
 217
 218       acf.addMap(mapFromSequence, mapToSequence, mapping);
 219     }
 220     align.addCodonFrame(acf);
 221   }
 222
 223   /**
 224    * Construct the mapping
 225    *
 226    * @param region
 227    * @param type
 228    * @param forwardStrand
 229    * @param featureIsOnTarget
 230    * @param gff
 231    * @return
 232    */
 233   protected MapList buildMapping(String region, MappingType type,
 234           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 235   {
 236     /*
 237      * process one "fromStart toStart fromCount" descriptor
 238      */
 239     String[] tokens = region.split(" ");
 240     if (tokens.length != 3)
 241     {
 242       System.err.println("Malformed Align descriptor: " + region);
 243       return null;
 244     }
 245
 246     /*
 247      * get start/end of from/to mappings
 248      * if feature is on the target sequence we have to invert the sense
 249      */
 250     int alignFromStart;
 251     int alignToStart;
 252     int alignCount;
 253     try
 254     {
 255       alignFromStart = Integer.parseInt(tokens[0]);
 256       alignToStart = Integer.parseInt(tokens[1]);
 257       alignCount = Integer.parseInt(tokens[2]);
 258     } catch (NumberFormatException nfe)
 259     {
 260       System.err.println(nfe.toString());
 261       return null;
 262     }
 263
 264     int fromStart;
 265     int fromEnd;
 266     int toStart;
 267     int toEnd;
 268
 269     if (featureIsOnTarget)
 270     {
 271       fromStart = alignToStart;
 272       toStart = alignFromStart;
 273       toEnd = forwardStrand ? toStart + alignCount - 1
 274               : toStart - (alignCount - 1);
 275       int toLength = Math.abs(toEnd - toStart) + 1;
 276       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 277       fromEnd = fromStart + fromLength - 1;
 278     }
 279     else
 280     {
 281       // we use the 'Align' values here not the feature start/end
 282       // not clear why they may differ but it seems they can
 283       fromStart = alignFromStart;
 284       fromEnd = alignFromStart + alignCount - 1;
 285       int fromLength = fromEnd - fromStart + 1;
 286       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 287       toStart = alignToStart;
 288       if (forwardStrand)
 289       {
 290         toEnd = toStart + toLength - 1;
 291       }
 292       else
 293       {
 294         toEnd = toStart - (toLength - 1);
 295       }
 296     }
 297
 298     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 299             toStart, toEnd, type);
 300     return codonmapping;
 301   }
 302
 303   /**
 304    * Returns a MappingType depending on the exonerate 'model' value.
 305    *
 306    * @param model
 307    * @return
 308    */
 309   protected static MappingType getMappingType(String model)
 310   {
 311     MappingType result = null;
 312
 313     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 314     {
 315       result = MappingType.PeptideToNucleotide;
 316     }
 317     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 318             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 319     {
 320       result = MappingType.NucleotideToNucleotide;
 321     }
 322     return result;
 323   }
 324
 325   /**
 326    * Tests whether the GFF data looks like it was generated by exonerate, and is
 327    * a format we are willing to handle
 328    *
 329    * @param columns
 330    * @return
 331    */
 332   public static boolean recognises(String[] columns)
 333   {
 334     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 335     {
 336       return false;
 337     }
 338
 339     /*
 340      * inspect alignment model
 341      */
 342     String model = columns[SOURCE_COL];
 343     // e.g. exonerate:protein2genome:local
 344     if (model != null)
 345     {
 346       String mdl = model.toLowerCase(Locale.ROOT);
 347       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 348               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 349               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 350       {
 351         return true;
 352       }
 353     }
 354     System.err.println("Sorry, I don't handle exonerate model " + model);
 355     return false;
 356   }
 357
 358   /**
 359    * An override to set feature group to "exonerate" instead of the default GFF
 360    * source value (column 2)
 361    */
 362   @Override
 363   protected SequenceFeature buildSequenceFeature(String[] gff,
 364           Map<String, List<String>> set)
 365   {
 366     SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
 367             "exonerate", set);
 368
 369     return sf;
 370   }
 371
 372 }