src/jalview/io/gff/ExonerateHelper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29
  30 import java.io.IOException;
  31 import java.util.List;
  32 import java.util.Map;
  33
  34 /**
  35  * A handler to parse GFF in the format generated by the exonerate tool
  36  */
  37 public class ExonerateHelper extends Gff2Helper
  38 {
  39   private static final String SIMILARITY = "similarity";
  40
  41   private static final String GENOME2GENOME = "genome2genome";
  42
  43   private static final String CDNA2GENOME = "cdna2genome";
  44
  45   private static final String CODING2GENOME = "coding2genome";
  46
  47   private static final String CODING2CODING = "coding2coding";
  48
  49   private static final String PROTEIN2GENOME = "protein2genome";
  50
  51   private static final String PROTEIN2DNA = "protein2dna";
  52
  53   private static final String ALIGN = "Align";
  54
  55   private static final String QUERY = "Query";
  56
  57   private static final String TARGET = "Target";
  58
  59   /**
  60    * Process one GFF feature line (as modelled by SequenceFeature)
  61    *
  62    * @param seq
  63    *          the sequence with which this feature is associated
  64    * @param gffColumns
  65    *          the sequence feature with ATTRIBUTES property containing any
  66    *          additional attributes
  67    * @param align
  68    *          the alignment we are adding GFF to
  69    * @param newseqs
  70    *          any new sequences referenced by the GFF
  71    * @param relaxedIdMatching
  72    *          if true, match word tokens in sequence names
  73    * @return true if the sequence feature should be added to the sequence, else
  74    *         false (i.e. it has been processed in another way e.g. to generate a
  75    *         mapping)
  76    */
  77   @Override
  78   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  79           AlignmentI align, List<SequenceI> newseqs,
  80           boolean relaxedIdMatching)
  81   {
  82     String attr = gffColumns[ATTRIBUTES_COL];
  83     Map<String, List<String>> set = parseNameValuePairs(attr);
  84
  85     try
  86     {
  87       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  88               relaxedIdMatching);
  89     } catch (IOException ivfe)
  90     {
  91       System.err.println(ivfe);
  92     }
  93
  94     /*
  95      * return null to indicate we don't want to add a sequence feature for
  96      * similarity (only process it to create mappings)
  97      */
  98     return null;
  99   }
 100
 101   /**
 102    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
 103    * an exonerate GFF similarity feature; these properties define the mapping of
 104    * the annotated range to a related sequence.
 105    *
 106    * @param set
 107    *          parsed GFF column 9 key/value(s)
 108    * @param seq
 109    *          the sequence the GFF feature is on
 110    * @param gff
 111    *          the GFF column data
 112    * @param align
 113    *          the alignment the sequence belongs to, where any new mappings
 114    *          should be added
 115    * @param newseqs
 116    *          a list of new 'virtual sequences' generated while parsing GFF
 117    * @param relaxedIdMatching
 118    *          if true allow fuzzy search for a matching target sequence
 119    * @throws IOException
 120    */
 121   protected void processGffSimilarity(Map<String, List<String>> set,
 122           SequenceI seq, String[] gff, AlignmentI align,
 123           List<SequenceI> newseqs, boolean relaxedIdMatching)
 124           throws IOException
 125   {
 126     /*
 127      * exonerate may be run with
 128      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 129      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 130      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 131      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 132      * where the Align spec may repeat
 133      */
 134     // TODO handle coding2coding and similar as well
 135     boolean featureIsOnTarget = true;
 136     List<String> mapTo = set.get(QUERY);
 137     if (mapTo == null)
 138     {
 139       mapTo = set.get(TARGET);
 140       featureIsOnTarget = false;
 141     }
 142     MappingType type = getMappingType(gff[SOURCE_COL]);
 143
 144     if (type == null)
 145     {
 146       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 147     }
 148
 149     if (mapTo == null || mapTo.size() != 1)
 150     {
 151       throw new IOException(
 152               "Expecting exactly one sequence in Query or Target field (got "
 153                       + mapTo + ")");
 154     }
 155
 156     /*
 157      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 158      */
 159     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 160             relaxedIdMatching);
 161
 162     /*
 163      * If mapping is from protein to dna, we store it as dna to protein instead
 164      */
 165     SequenceI mapFromSequence = seq;
 166     SequenceI mapToSequence = mappedSequence;
 167     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 168             || (type == MappingType.PeptideToNucleotide
 169                     && !featureIsOnTarget))
 170     {
 171       mapFromSequence = mappedSequence;
 172       mapToSequence = seq;
 173     }
 174
 175     /*
 176      * Process the Align maps and create mappings.
 177      * These may be cdna-genome, cdna-protein, genome-protein.
 178      * The mapped sequences may or may not be in the alignment
 179      * (they may be included later in the GFF file).
 180      */
 181
 182     /*
 183      * get any existing mapping for these sequences (or start one),
 184      * and add this mapped range
 185      */
 186     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 187             mapToSequence);
 188
 189     /*
 190      * exonerate GFF has the strand of the target in column 7
 191      * (differs from GFF3 which has it in the Target descriptor)
 192      */
 193     String strand = gff[STRAND_COL];
 194     boolean forwardStrand = true;
 195     if ("-".equals(strand))
 196     {
 197       forwardStrand = false;
 198     }
 199     else if (!"+".equals(strand))
 200     {
 201       System.err.println("Strand must be specified for alignment");
 202       return;
 203     }
 204
 205     List<String> alignedRegions = set.get(ALIGN);
 206     for (String region : alignedRegions)
 207     {
 208       MapList mapping = buildMapping(region, type, forwardStrand,
 209               featureIsOnTarget, gff);
 210
 211       if (mapping == null)
 212       {
 213         continue;
 214       }
 215
 216       acf.addMap(mapFromSequence, mapToSequence, mapping);
 217     }
 218     align.addCodonFrame(acf);
 219   }
 220
 221   /**
 222    * Construct the mapping
 223    *
 224    * @param region
 225    * @param type
 226    * @param forwardStrand
 227    * @param featureIsOnTarget
 228    * @param gff
 229    * @return
 230    */
 231   protected MapList buildMapping(String region, MappingType type,
 232           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 233   {
 234     /*
 235      * process one "fromStart toStart fromCount" descriptor
 236      */
 237     String[] tokens = region.split(" ");
 238     if (tokens.length != 3)
 239     {
 240       System.err.println("Malformed Align descriptor: " + region);
 241       return null;
 242     }
 243
 244     /*
 245      * get start/end of from/to mappings
 246      * if feature is on the target sequence we have to invert the sense
 247      */
 248     int alignFromStart;
 249     int alignToStart;
 250     int alignCount;
 251     try
 252     {
 253       alignFromStart = Integer.parseInt(tokens[0]);
 254       alignToStart = Integer.parseInt(tokens[1]);
 255       alignCount = Integer.parseInt(tokens[2]);
 256     } catch (NumberFormatException nfe)
 257     {
 258       System.err.println(nfe.toString());
 259       return null;
 260     }
 261
 262     int fromStart;
 263     int fromEnd;
 264     int toStart;
 265     int toEnd;
 266
 267     if (featureIsOnTarget)
 268     {
 269       fromStart = alignToStart;
 270       toStart = alignFromStart;
 271       toEnd = forwardStrand ? toStart + alignCount - 1
 272               : toStart - (alignCount - 1);
 273       int toLength = Math.abs(toEnd - toStart) + 1;
 274       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 275       fromEnd = fromStart + fromLength - 1;
 276     }
 277     else
 278     {
 279       // we use the 'Align' values here not the feature start/end
 280       // not clear why they may differ but it seems they can
 281       fromStart = alignFromStart;
 282       fromEnd = alignFromStart + alignCount - 1;
 283       int fromLength = fromEnd - fromStart + 1;
 284       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 285       toStart = alignToStart;
 286       if (forwardStrand)
 287       {
 288         toEnd = toStart + toLength - 1;
 289       }
 290       else
 291       {
 292         toEnd = toStart - (toLength - 1);
 293       }
 294     }
 295
 296     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 297             toStart, toEnd, type);
 298     return codonmapping;
 299   }
 300
 301   /**
 302    * Returns a MappingType depending on the exonerate 'model' value.
 303    *
 304    * @param model
 305    * @return
 306    */
 307   protected static MappingType getMappingType(String model)
 308   {
 309     MappingType result = null;
 310
 311     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 312     {
 313       result = MappingType.PeptideToNucleotide;
 314     }
 315     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 316             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 317     {
 318       result = MappingType.NucleotideToNucleotide;
 319     }
 320     return result;
 321   }
 322
 323   /**
 324    * Tests whether the GFF data looks like it was generated by exonerate, and is
 325    * a format we are willing to handle
 326    *
 327    * @param columns
 328    * @return
 329    */
 330   public static boolean recognises(String[] columns)
 331   {
 332     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 333     {
 334       return false;
 335     }
 336
 337     /*
 338      * inspect alignment model
 339      */
 340     String model = columns[SOURCE_COL];
 341     // e.g. exonerate:protein2genome:local
 342     if (model != null)
 343     {
 344       String mdl = model.toLowerCase();
 345       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 346               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 347               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 348       {
 349         return true;
 350       }
 351     }
 352     System.err.println("Sorry, I don't handle exonerate model " + model);
 353     return false;
 354   }
 355
 356   /**
 357    * An override to set feature group to "exonerate" instead of the default GFF
 358    * source value (column 2)
 359    */
 360   @Override
 361   protected SequenceFeature buildSequenceFeature(String[] gff,
 362           Map<String, List<String>> set)
 363   {
 364     SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
 365             "exonerate", set);
 366
 367     return sf;
 368   }
 369
 370 }