src/jalview/io/gff/ExonerateHelper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import jalview.datamodel.AlignedCodonFrame;
  24 import jalview.datamodel.AlignmentI;
  25 import jalview.datamodel.MappingType;
  26 import jalview.datamodel.SequenceFeature;
  27 import jalview.datamodel.SequenceI;
  28 import jalview.util.MapList;
  29
  30 import java.io.IOException;
  31 import java.util.ArrayList;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * A handler to parse GFF in the format generated by the exonerate tool
  37  */
  38 public class ExonerateHelper extends Gff2Helper
  39 {
  40   private static final String SIMILARITY = "similarity";
  41
  42   private static final String GENOME2GENOME = "genome2genome";
  43
  44   private static final String CDNA2GENOME = "cdna2genome";
  45
  46   private static final String CODING2GENOME = "coding2genome";
  47
  48   private static final String CODING2CODING = "coding2coding";
  49
  50   private static final String PROTEIN2GENOME = "protein2genome";
  51
  52   private static final String PROTEIN2DNA = "protein2dna";
  53
  54   private static final String ALIGN = "Align";
  55
  56   private static final String QUERY = "Query";
  57
  58   private static final String TARGET = "Target";
  59
  60   /**
  61    * Process one GFF feature line (as modelled by SequenceFeature)
  62    *
  63    * @param seq
  64    *          the sequence with which this feature is associated
  65    * @param gffColumns
  66    *          the sequence feature with ATTRIBUTES property containing any
  67    *          additional attributes
  68    * @param align
  69    *          the alignment we are adding GFF to
  70    * @param newseqs
  71    *          any new sequences referenced by the GFF
  72    * @param relaxedIdMatching
  73    *          if true, match word tokens in sequence names
  74    * @return true if the sequence feature should be added to the sequence, else
  75    *         false (i.e. it has been processed in another way e.g. to generate a
  76    *         mapping)
  77    */
  78   @Override
  79   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  80           AlignmentI align, List<SequenceI> newseqs,
  81           boolean relaxedIdMatching)
  82   {
  83     String attr = gffColumns[ATTRIBUTES_COL];
  84     Map<String, List<String>> set = parseNameValuePairs(attr);
  85
  86     try
  87     {
  88       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  89               relaxedIdMatching);
  90     } catch (IOException ivfe)
  91     {
  92       System.err.println(ivfe);
  93     }
  94
  95     /*
  96      * return null to indicate we don't want to add a sequence feature for
  97      * similarity (only process it to create mappings)
  98      */
  99     return null;
 100   }
 101
 102   /**
 103    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
 104    * an exonerate GFF similarity feature; these properties define the mapping of
 105    * the annotated range to a related sequence.
 106    *
 107    * @param set
 108    *          parsed GFF column 9 key/value(s)
 109    * @param seq
 110    *          the sequence the GFF feature is on
 111    * @param gff
 112    *          the GFF column data
 113    * @param align
 114    *          the alignment the sequence belongs to, where any new mappings
 115    *          should be added
 116    * @param newseqs
 117    *          a list of new 'virtual sequences' generated while parsing GFF
 118    * @param relaxedIdMatching
 119    *          if true allow fuzzy search for a matching target sequence
 120    * @throws IOException
 121    */
 122   protected void processGffSimilarity(Map<String, List<String>> set,
 123           SequenceI seq, String[] gff, AlignmentI align,
 124           List<SequenceI> newseqs, boolean relaxedIdMatching)
 125           throws IOException
 126   {
 127     /*
 128      * exonerate may be run with
 129      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 130      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 131      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 132      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 133      * where the Align spec may repeat
 134      */
 135     // TODO handle coding2coding and similar as well
 136     boolean featureIsOnTarget = true;
 137     List<String> mapTo = set.get(QUERY);
 138     if (mapTo == null)
 139     {
 140       mapTo = set.get(TARGET);
 141       featureIsOnTarget = false;
 142     }
 143     MappingType type = getMappingType(gff[SOURCE_COL]);
 144
 145     if (type == null)
 146     {
 147       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 148     }
 149
 150     if (mapTo == null || mapTo.size() != 1)
 151     {
 152       throw new IOException(
 153               "Expecting exactly one sequence in Query or Target field (got "
 154                       + mapTo + ")");
 155     }
 156
 157     /*
 158      * similarity start and end can tell us
 159      * which part of the alignment refers to which sequence
 160      */
 161     int similarityFrom,similarityTo;
 162     try {
 163       similarityFrom = Integer.parseInt(gff[START_COL]);
 164       similarityTo = Integer.parseInt(gff[END_COL]);
 165     } catch (Exception x) {
 166       throw new IOException("Couldn't parse start/end of the similarity feature",x);
 167     }
 168
 169     /*
 170      * Process the Align maps and create mappings.
 171      * These may be cdna-genome, cdna-protein, genome-protein.
 172      * The mapped sequences may or may not be in the alignment
 173      * (they may be included later in the GFF file).
 174      */
 175
 176     /*
 177      * exonerate GFF has the strand of the target in column 7
 178      * (differs from GFF3 which has it in the Target descriptor)
 179      */
 180     String strand = gff[STRAND_COL];
 181     boolean forwardStrand = true;
 182     if ("-".equals(strand))
 183     {
 184       forwardStrand = false;
 185     }
 186     else if (!"+".equals(strand))
 187     {
 188       System.err.println("Strand must be specified for alignment");
 189       return;
 190     }
 191
 192     List<String> alignedRegions = set.get(ALIGN);
 193     List<MapList> mappings = new ArrayList<MapList>();
 194     int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0;
 195     for (String region : alignedRegions)
 196     {
 197       MapList mapping = buildMapping(region, type, forwardStrand,
 198               featureIsOnTarget, gff);
 199
 200       if (mapping == null)
 201       {
 202         continue;
 203       }
 204
 205       /*
 206        * record total extent of aligned region(s) for later
 207        */
 208       if (mappings.size() == 0)
 209       {
 210         if (mapping.getFromLowest() < mapping.getFromHighest())
 211         {
 212           fromLowest = mapping.getFromLowest();
 213           fromHighest = mapping.getFromHighest();
 214         }
 215         else
 216         {
 217           fromLowest = mapping.getFromHighest();
 218           fromHighest = mapping.getFromLowest();
 219         }
 220         if (mapping.getToLowest() < mapping.getToHighest())
 221         {
 222           toLowest = mapping.getToLowest();
 223           toHighest = mapping.getToHighest();
 224         }
 225         else
 226         {
 227           toLowest = mapping.getToHighest();
 228           toHighest = mapping.getToLowest();
 229         }
 230       }
 231       else
 232       {
 233         int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(),
 234                 tl = mapping.getToLowest(), th = mapping.getToHighest();
 235         if (fl > fh)
 236         {
 237           fl = fh;
 238           fh = mapping.getFromLowest();
 239         }
 240         if (tl > th)
 241         {
 242           tl = th;
 243           th = mapping.getToLowest();
 244         }
 245         if (fromLowest > fl)
 246
 247         {
 248           fromLowest = fl;
 249         }
 250         if (fromHighest < fh)
 251         {
 252           fromHighest = fh;
 253         }
 254         if (toLowest > tl)
 255         {
 256           toLowest = tl;
 257         }
 258         if (toHighest < th)
 259         {
 260           toHighest = th;
 261         }
 262       }
 263       mappings.add(mapping);
 264     }
 265
 266     /*
 267      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 268      */
 269     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 270             relaxedIdMatching);
 271
 272     /*
 273      * finally, resolve the sense of the mapping
 274      */
 275     SequenceI mapFromSequence = seq;
 276     SequenceI mapToSequence = mappedSequence;
 277
 278     /*
 279      * If mapping is from protein to dna, we store it as dna to protein instead
 280      */
 281     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 282             || (type == MappingType.PeptideToNucleotide
 283                     && !featureIsOnTarget))
 284     {
 285       mapFromSequence = mappedSequence;
 286       mapToSequence = seq;
 287     }
 288     /*
 289      * the sense of 'align' mappings for nucleotide alignments
 290      * from exonerate seem to be ambiguous, so we need to do a bit more work
 291      */
 292     if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide)
 293     {
 294       /*
 295        *  then check whether the aligned region is contained
 296        *  by the feature to determine sense of mapping
 297        */
 298       if (fromHighest==toHighest && fromLowest==toLowest)
 299       {
 300         // ambiguous case - for simple alignments this doesn't matter, but important for rearrangements or inversions
 301         if (featureIsOnTarget)
 302         {
 303           // TODO: raise a warning since we don't have test coverage for this case
 304           mapFromSequence=mappedSequence; // Target sequence
 305           mapToSequence=seq; // annotated sequence
 306         }
 307       } else if (similarityFrom == fromLowest && similarityTo == fromHighest)
 308       {
 309         mapFromSequence = seq;
 310         mapToSequence = mappedSequence;
 311       }
 312       else if (similarityFrom == toLowest && similarityTo == toHighest)
 313       {
 314         mapFromSequence = mappedSequence;
 315         mapToSequence = seq;
 316       }
 317       else
 318       {
 319         throw new IOException(
 320                 "Couldn't determine sense for similarity feature");
 321       }
 322     }
 323
 324     /*
 325      * get any existing mapping for these sequences (or start one),
 326      * and add this mapped range
 327      */
 328     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 329             mapToSequence);
 330     for (MapList mapping : mappings)
 331     {
 332       acf.addMap(mapFromSequence, mapToSequence, mapping);
 333     }
 334     align.addCodonFrame(acf);
 335   }
 336
 337   /**
 338    * Construct the mapping
 339    *
 340    * @param region
 341    * @param type
 342    * @param forwardStrand
 343    * @param featureIsOnTarget
 344    * @param gff
 345    * @return
 346    */
 347   protected MapList buildMapping(String region, MappingType type,
 348           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 349   {
 350     /*
 351      * process one "fromStart toStart fromCount" descriptor
 352      */
 353     String[] tokens = region.split(" ");
 354     if (tokens.length != 3)
 355     {
 356       System.err.println("Malformed Align descriptor: " + region);
 357       return null;
 358     }
 359
 360     /*
 361      * get start/end of from/to mappings
 362      * if feature is on the target sequence we have to invert the sense
 363      */
 364     int alignFromStart;
 365     int alignToStart;
 366     int alignCount;
 367     try
 368     {
 369       alignFromStart = Integer.parseInt(tokens[0]);
 370       alignToStart = Integer.parseInt(tokens[1]);
 371       alignCount = Integer.parseInt(tokens[2]);
 372     } catch (NumberFormatException nfe)
 373     {
 374       System.err.println(nfe.toString());
 375       return null;
 376     }
 377
 378     int fromStart;
 379     int fromEnd;
 380     int toStart;
 381     int toEnd;
 382
 383     if (featureIsOnTarget)
 384     {
 385       fromStart = alignToStart;
 386       toStart = alignFromStart;
 387       toEnd = forwardStrand ? toStart + alignCount - 1
 388               : toStart - (alignCount - 1);
 389       int toLength = Math.abs(toEnd - toStart) + 1;
 390       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 391       fromEnd = fromStart + fromLength - 1;
 392     }
 393     else
 394     {
 395       // we use the 'Align' values here not the feature start/end
 396       // not clear why they may differ but it seems they can
 397       fromStart = alignFromStart;
 398       fromEnd = alignFromStart + alignCount - 1;
 399       int fromLength = fromEnd - fromStart + 1;
 400       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 401       toStart = alignToStart;
 402       if (forwardStrand)
 403       {
 404         toEnd = toStart + toLength - 1;
 405       }
 406       else
 407       {
 408         toEnd = toStart - (toLength - 1);
 409       }
 410     }
 411
 412     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 413             toStart, toEnd, type);
 414     return codonmapping;
 415   }
 416
 417   /**
 418    * Returns a MappingType depending on the exonerate 'model' value.
 419    *
 420    * @param model
 421    * @return
 422    */
 423   protected static MappingType getMappingType(String model)
 424   {
 425     MappingType result = null;
 426
 427     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 428     {
 429       result = MappingType.PeptideToNucleotide;
 430     }
 431     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 432             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 433     {
 434       result = MappingType.NucleotideToNucleotide;
 435     }
 436     return result;
 437   }
 438
 439   /**
 440    * Tests whether the GFF data looks like it was generated by exonerate, and is
 441    * a format we are willing to handle
 442    *
 443    * @param columns
 444    * @return
 445    */
 446   public static boolean recognises(String[] columns)
 447   {
 448     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 449     {
 450       return false;
 451     }
 452
 453     /*
 454      * inspect alignment model
 455      */
 456     String model = columns[SOURCE_COL];
 457     // e.g. exonerate:protein2genome:local
 458     if (model != null)
 459     {
 460       String mdl = model.toLowerCase();
 461       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 462               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 463               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 464       {
 465         return true;
 466       }
 467     }
 468     System.err.println("Sorry, I don't handle exonerate model " + model);
 469     return false;
 470   }
 471
 472   /**
 473    * An override to set feature group to "exonerate" instead of the default GFF
 474    * source value (column 2)
 475    */
 476   @Override
 477   protected SequenceFeature buildSequenceFeature(String[] gff,
 478           Map<String, List<String>> set)
 479   {
 480     SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
 481             "exonerate", set);
 482
 483     return sf;
 484   }
 485
 486 }