src/jalview/io/gff/ExonerateHelper.java

   1 /*
   2  * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
   3  * Copyright (C) $$Year-Rel$$ The Jalview Authors
   4  *
   5  * This file is part of Jalview.
   6  *
   7  * Jalview is free software: you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation, either version 3
  10  * of the License, or (at your option) any later version.
  11  *
  12  * Jalview is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15  * PURPOSE.  See the GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
  19  * The Jalview Authors are detailed in the 'AUTHORS' file.
  20  */
  21 package jalview.io.gff;
  22
  23 import java.util.Locale;
  24
  25 import jalview.datamodel.AlignedCodonFrame;
  26 import jalview.datamodel.AlignmentI;
  27 import jalview.datamodel.MappingType;
  28 import jalview.datamodel.SequenceFeature;
  29 import jalview.datamodel.SequenceI;
  30 import jalview.util.MapList;
  31
  32 import java.io.IOException;
  33 import java.util.ArrayList;
  34 import java.util.List;
  35 import java.util.Map;
  36
  37 /**
  38  * A handler to parse GFF in the format generated by the exonerate tool
  39  */
  40 public class ExonerateHelper extends Gff2Helper
  41 {
  42   private static final String SIMILARITY = "similarity";
  43
  44   private static final String GENOME2GENOME = "genome2genome";
  45
  46   private static final String CDNA2GENOME = "cdna2genome";
  47
  48   private static final String CODING2GENOME = "coding2genome";
  49
  50   private static final String CODING2CODING = "coding2coding";
  51
  52   private static final String PROTEIN2GENOME = "protein2genome";
  53
  54   private static final String PROTEIN2DNA = "protein2dna";
  55
  56   private static final String ALIGN = "Align";
  57
  58   private static final String QUERY = "Query";
  59
  60   private static final String TARGET = "Target";
  61
  62   /**
  63    * Process one GFF feature line (as modelled by SequenceFeature)
  64    *
  65    * @param seq
  66    *          the sequence with which this feature is associated
  67    * @param gffColumns
  68    *          the sequence feature with ATTRIBUTES property containing any
  69    *          additional attributes
  70    * @param align
  71    *          the alignment we are adding GFF to
  72    * @param newseqs
  73    *          any new sequences referenced by the GFF
  74    * @param relaxedIdMatching
  75    *          if true, match word tokens in sequence names
  76    * @return true if the sequence feature should be added to the sequence, else
  77    *         false (i.e. it has been processed in another way e.g. to generate a
  78    *         mapping)
  79    */
  80   @Override
  81   public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
  82           AlignmentI align, List<SequenceI> newseqs,
  83           boolean relaxedIdMatching)
  84   {
  85     String attr = gffColumns[ATTRIBUTES_COL];
  86     Map<String, List<String>> set = parseNameValuePairs(attr);
  87
  88     try
  89     {
  90       processGffSimilarity(set, seq, gffColumns, align, newseqs,
  91               relaxedIdMatching);
  92     } catch (IOException ivfe)
  93     {
  94       System.err.println(ivfe);
  95     }
  96
  97     /*
  98      * return null to indicate we don't want to add a sequence feature for
  99      * similarity (only process it to create mappings)
 100      */
 101     return null;
 102   }
 103
 104   /**
 105    * Processes the 'Query' (or 'Target') and 'Align' properties associated with
 106    * an exonerate GFF similarity feature; these properties define the mapping of
 107    * the annotated range to a related sequence.
 108    *
 109    * @param set
 110    *          parsed GFF column 9 key/value(s)
 111    * @param seq
 112    *          the sequence the GFF feature is on
 113    * @param gff
 114    *          the GFF column data
 115    * @param align
 116    *          the alignment the sequence belongs to, where any new mappings
 117    *          should be added
 118    * @param newseqs
 119    *          a list of new 'virtual sequences' generated while parsing GFF
 120    * @param relaxedIdMatching
 121    *          if true allow fuzzy search for a matching target sequence
 122    * @throws IOException
 123    */
 124   protected void processGffSimilarity(Map<String, List<String>> set,
 125           SequenceI seq, String[] gff, AlignmentI align,
 126           List<SequenceI> newseqs, boolean relaxedIdMatching)
 127           throws IOException
 128   {
 129     /*
 130      * exonerate may be run with
 131      * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
 132      *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
 133      * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
 134      *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
 135      * where the Align spec may repeat
 136      */
 137     // TODO handle coding2coding and similar as well
 138     boolean featureIsOnTarget = true;
 139     List<String> mapTo = set.get(QUERY);
 140     if (mapTo == null)
 141     {
 142       mapTo = set.get(TARGET);
 143       featureIsOnTarget = false;
 144     }
 145     MappingType type = getMappingType(gff[SOURCE_COL]);
 146
 147     if (type == null)
 148     {
 149       throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
 150     }
 151
 152     if (mapTo == null || mapTo.size() != 1)
 153     {
 154       throw new IOException(
 155               "Expecting exactly one sequence in Query or Target field (got "
 156                       + mapTo + ")");
 157     }
 158
 159     /*
 160      * similarity start and end can tell us
 161      * which part of the alignment refers to which sequence
 162      */
 163     int similarityFrom,similarityTo;
 164     try {
 165       similarityFrom = Integer.parseInt(gff[START_COL]);
 166       similarityTo = Integer.parseInt(gff[END_COL]);
 167     } catch (Exception x) {
 168       throw new IOException("Couldn't parse start/end of the similarity feature",x);
 169     }
 170
 171     /*
 172      * Process the Align maps and create mappings.
 173      * These may be cdna-genome, cdna-protein, genome-protein.
 174      * The mapped sequences may or may not be in the alignment
 175      * (they may be included later in the GFF file).
 176      */
 177
 178     /*
 179      * exonerate GFF has the strand of the target in column 7
 180      * (differs from GFF3 which has it in the Target descriptor)
 181      */
 182     String strand = gff[STRAND_COL];
 183     boolean forwardStrand = true;
 184     if ("-".equals(strand))
 185     {
 186       forwardStrand = false;
 187     }
 188     else if (!"+".equals(strand))
 189     {
 190       System.err.println("Strand must be specified for alignment");
 191       return;
 192     }
 193
 194     List<String> alignedRegions = set.get(ALIGN);
 195     List<MapList> mappings = new ArrayList<MapList>();
 196     int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0;
 197     for (String region : alignedRegions)
 198     {
 199       MapList mapping = buildMapping(region, type, forwardStrand,
 200               featureIsOnTarget, gff);
 201
 202       if (mapping == null)
 203       {
 204         continue;
 205       }
 206
 207       /*
 208        * record total extent of aligned region(s) for later
 209        */
 210       if (mappings.size() == 0)
 211       {
 212         if (mapping.getFromLowest() < mapping.getFromHighest())
 213         {
 214           fromLowest = mapping.getFromLowest();
 215           fromHighest = mapping.getFromHighest();
 216         }
 217         else
 218         {
 219           fromLowest = mapping.getFromHighest();
 220           fromHighest = mapping.getFromLowest();
 221         }
 222         if (mapping.getToLowest() < mapping.getToHighest())
 223         {
 224           toLowest = mapping.getToLowest();
 225           toHighest = mapping.getToHighest();
 226         }
 227         else
 228         {
 229           toLowest = mapping.getToHighest();
 230           toHighest = mapping.getToLowest();
 231         }
 232       }
 233       else
 234       {
 235         int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(),
 236                 tl = mapping.getToLowest(), th = mapping.getToHighest();
 237         if (fl > fh)
 238         {
 239           fl = fh;
 240           fh = mapping.getFromLowest();
 241         }
 242         if (tl > th)
 243         {
 244           tl = th;
 245           th = mapping.getToLowest();
 246         }
 247         if (fromLowest > fl)
 248
 249         {
 250           fromLowest = fl;
 251         }
 252         if (fromHighest < fh)
 253         {
 254           fromHighest = fh;
 255         }
 256         if (toLowest > tl)
 257         {
 258           toLowest = tl;
 259         }
 260         if (toHighest < th)
 261         {
 262           toHighest = th;
 263         }
 264       }
 265       mappings.add(mapping);
 266     }
 267
 268     /*
 269      * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
 270      */
 271     SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
 272             relaxedIdMatching);
 273
 274     /*
 275      * finally, resolve the sense of the mapping
 276      */
 277     SequenceI mapFromSequence = seq;
 278     SequenceI mapToSequence = mappedSequence;
 279
 280     /*
 281      * If mapping is from protein to dna, we store it as dna to protein instead
 282      */
 283     if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
 284             || (type == MappingType.PeptideToNucleotide
 285                     && !featureIsOnTarget))
 286     {
 287       mapFromSequence = mappedSequence;
 288       mapToSequence = seq;
 289     }
 290     /*
 291      * the sense of 'align' mappings for nucleotide alignments
 292      * from exonerate seem to be ambiguous, so we need to do a bit more work
 293      */
 294     if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide)
 295     {
 296       /*
 297        *  then check whether the aligned region is contained
 298        *  by the feature to determine sense of mapping
 299        */
 300       if (fromHighest==toHighest && fromLowest==toLowest)
 301       {
 302         // ambiguous case - for simple alignments this doesn't matter, but important for rearrangements or inversions
 303         if (featureIsOnTarget)
 304         {
 305           // TODO: raise a warning since we don't have test coverage for this case
 306           mapFromSequence=mappedSequence; // Target sequence
 307           mapToSequence=seq; // annotated sequence
 308         }
 309       } else if (similarityFrom == fromLowest && similarityTo == fromHighest)
 310       {
 311         mapFromSequence = seq;
 312         mapToSequence = mappedSequence;
 313       }
 314       else if (similarityFrom == toLowest && similarityTo == toHighest)
 315       {
 316         mapFromSequence = mappedSequence;
 317         mapToSequence = seq;
 318       }
 319       else
 320       {
 321         throw new IOException(
 322                 "Couldn't determine sense for similarity feature");
 323       }
 324     }
 325
 326     /*
 327      * get any existing mapping for these sequences (or start one),
 328      * and add this mapped range
 329      */
 330     AlignedCodonFrame acf = getMapping(align, mapFromSequence,
 331             mapToSequence);
 332     for (MapList mapping : mappings)
 333     {
 334       acf.addMap(mapFromSequence, mapToSequence, mapping);
 335     }
 336     align.addCodonFrame(acf);
 337   }
 338
 339   /**
 340    * Construct the mapping
 341    *
 342    * @param region
 343    * @param type
 344    * @param forwardStrand
 345    * @param featureIsOnTarget
 346    * @param gff
 347    * @return
 348    */
 349   protected MapList buildMapping(String region, MappingType type,
 350           boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
 351   {
 352     /*
 353      * process one "fromStart toStart fromCount" descriptor
 354      */
 355     String[] tokens = region.split(" ");
 356     if (tokens.length != 3)
 357     {
 358       System.err.println("Malformed Align descriptor: " + region);
 359       return null;
 360     }
 361
 362     /*
 363      * get start/end of from/to mappings
 364      * if feature is on the target sequence we have to invert the sense
 365      */
 366     int alignFromStart;
 367     int alignToStart;
 368     int alignCount;
 369     try
 370     {
 371       alignFromStart = Integer.parseInt(tokens[0]);
 372       alignToStart = Integer.parseInt(tokens[1]);
 373       alignCount = Integer.parseInt(tokens[2]);
 374     } catch (NumberFormatException nfe)
 375     {
 376       System.err.println(nfe.toString());
 377       return null;
 378     }
 379
 380     int fromStart;
 381     int fromEnd;
 382     int toStart;
 383     int toEnd;
 384
 385     if (featureIsOnTarget)
 386     {
 387       fromStart = alignToStart;
 388       toStart = alignFromStart;
 389       toEnd = forwardStrand ? toStart + alignCount - 1
 390               : toStart - (alignCount - 1);
 391       int toLength = Math.abs(toEnd - toStart) + 1;
 392       int fromLength = toLength * type.getFromRatio() / type.getToRatio();
 393       fromEnd = fromStart + fromLength - 1;
 394     }
 395     else
 396     {
 397       // we use the 'Align' values here not the feature start/end
 398       // not clear why they may differ but it seems they can
 399       fromStart = alignFromStart;
 400       fromEnd = alignFromStart + alignCount - 1;
 401       int fromLength = fromEnd - fromStart + 1;
 402       int toLength = fromLength * type.getToRatio() / type.getFromRatio();
 403       toStart = alignToStart;
 404       if (forwardStrand)
 405       {
 406         toEnd = toStart + toLength - 1;
 407       }
 408       else
 409       {
 410         toEnd = toStart - (toLength - 1);
 411       }
 412     }
 413
 414     MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
 415             toStart, toEnd, type);
 416     return codonmapping;
 417   }
 418
 419   /**
 420    * Returns a MappingType depending on the exonerate 'model' value.
 421    *
 422    * @param model
 423    * @return
 424    */
 425   protected static MappingType getMappingType(String model)
 426   {
 427     MappingType result = null;
 428
 429     if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
 430     {
 431       result = MappingType.PeptideToNucleotide;
 432     }
 433     else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME)
 434             || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME))
 435     {
 436       result = MappingType.NucleotideToNucleotide;
 437     }
 438     return result;
 439   }
 440
 441   /**
 442    * Tests whether the GFF data looks like it was generated by exonerate, and is
 443    * a format we are willing to handle
 444    *
 445    * @param columns
 446    * @return
 447    */
 448   public static boolean recognises(String[] columns)
 449   {
 450     if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
 451     {
 452       return false;
 453     }
 454
 455     /*
 456      * inspect alignment model
 457      */
 458     String model = columns[SOURCE_COL];
 459     // e.g. exonerate:protein2genome:local
 460     if (model != null)
 461     {
 462       String mdl = model.toLowerCase(Locale.ROOT);
 463       if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
 464               || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
 465               || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
 466       {
 467         return true;
 468       }
 469     }
 470     System.err.println("Sorry, I don't handle exonerate model " + model);
 471     return false;
 472   }
 473
 474   /**
 475    * An override to set feature group to "exonerate" instead of the default GFF
 476    * source value (column 2)
 477    */
 478   @Override
 479   protected SequenceFeature buildSequenceFeature(String[] gff,
 480           Map<String, List<String>> set)
 481   {
 482     SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL,
 483             "exonerate", set);
 484
 485     return sf;
 486   }
 487
 488 }