mungo merge

[jalview.git] / src / jalview / io / gff / ExonerateHelper.java
diff --git a/src/jalview/io/gff/ExonerateHelper.java b/src/jalview/io/gff/ExonerateHelper.java

new file mode 100644 (file)

index 0000000..f7805fd
--- /dev/null
+++ b/src/jalview/io/gff/ExonerateHelper.java
@@ -0,0 +1,348 @@
+package jalview.io.gff;
+
+import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.MappingType;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+import jalview.util.MapList;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A handler to parse GFF in the format generated by the exonerate tool
+ */
+public class ExonerateHelper extends Gff2Helper
+{
+  private static final String SIMILARITY = "similarity";
+
+  private static final String GENOME2GENOME = "genome2genome";
+
+  private static final String CDNA2GENOME = "cdna2genome";
+
+  private static final String CODING2GENOME = "coding2genome";
+
+  private static final String CODING2CODING = "coding2coding";
+
+  private static final String PROTEIN2GENOME = "protein2genome";
+
+  private static final String PROTEIN2DNA = "protein2dna";
+
+  private static final String ALIGN = "Align";
+
+  private static final String QUERY = "Query";
+
+  private static final String TARGET = "Target";
+
+  /**
+   * Process one GFF feature line (as modelled by SequenceFeature)
+   * 
+   * @param seq
+   *          the sequence with which this feature is associated
+   * @param gffColumns
+   *          the sequence feature with ATTRIBUTES property containing any
+   *          additional attributes
+   * @param align
+   *          the alignment we are adding GFF to
+   * @param newseqs
+   *          any new sequences referenced by the GFF
+   * @param relaxedIdMatching
+   *          if true, match word tokens in sequence names
+   * @return true if the sequence feature should be added to the sequence, else
+   *         false (i.e. it has been processed in another way e.g. to generate a
+   *         mapping)
+   */
+  @Override
+  public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
+          AlignmentI align, List<SequenceI> newseqs,
+          boolean relaxedIdMatching)
+  {
+    String attr = gffColumns[ATTRIBUTES_COL];
+    Map<String, List<String>> set = parseNameValuePairs(attr);
+
+    try
+    {
+      processGffSimilarity(set, seq, gffColumns,
+              align, newseqs, relaxedIdMatching);
+    } catch (IOException ivfe)
+    {
+      System.err.println(ivfe);
+    }
+
+    /*
+     * return null to indicate we don't want to add a sequence feature for
+     * similarity (only process it to create mappings)
+     */
+    return null;
+  }
+
+  /**
+   * Processes the 'Query' (or 'Target') and 'Align' properties associated with
+   * an exonerate GFF similarity feature; these properties define the mapping of
+   * the annotated range to a related sequence.
+   * 
+   * @param set
+   *          parsed GFF column 9 key/value(s)
+   * @param seq
+   *          the sequence the GFF feature is on
+   * @param gff
+   *          the GFF column data
+   * @param align
+   *          the alignment the sequence belongs to, where any new mappings
+   *          should be added
+   * @param newseqs
+   *          a list of new 'virtual sequences' generated while parsing GFF
+   * @param relaxedIdMatching
+   *          if true allow fuzzy search for a matching target sequence
+   * @throws IOException
+   */
+  protected void processGffSimilarity(
+          Map<String, List<String>> set,
+          SequenceI seq, String[] gff, AlignmentI align,
+          List<SequenceI> newseqs, boolean relaxedIdMatching)
+          throws IOException
+  {
+    /*
+     * exonerate may be run with
+     * --showquerygff - outputs 'features on the query' e.g. (protein2genome)  
+     *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount  
+     * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
+     *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
+     * where the Align spec may repeat 
+     */
+    // TODO handle coding2coding and similar as well
+    boolean featureIsOnTarget = true;
+    List<String> mapTo = set.get(QUERY);
+    if (mapTo == null)
+    {
+      mapTo = set.get(TARGET);
+      featureIsOnTarget = false;
+    }
+    MappingType type = getMappingType(gff[SOURCE_COL]);
+
+    if (type == null)
+    {
+      throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
+    }
+
+    if (mapTo == null || mapTo.size() != 1)
+    {
+      throw new IOException(
+              "Expecting exactly one sequence in Query or Target field (got "
+                      + mapTo + ")");
+    }
+
+    /*
+     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; 
+     */
+    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
+            relaxedIdMatching);
+
+    /*
+     * If mapping is from protein to dna, we store it as dna to protein instead
+     */
+    SequenceI mapFromSequence = seq;
+    SequenceI mapToSequence = mappedSequence;
+    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
+            || (type == MappingType.PeptideToNucleotide && !featureIsOnTarget))
+    {
+      mapFromSequence = mappedSequence;
+      mapToSequence = seq;
+    }
+
+    /*
+     * Process the Align maps and create mappings.
+     * These may be cdna-genome, cdna-protein, genome-protein.
+     * The mapped sequences may or may not be in the alignment
+     * (they may be included later in the GFF file).
+     */
+
+    /*
+     * get any existing mapping for these sequences (or start one),
+     * and add this mapped range
+     */
+    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
+            mapToSequence);
+
+    /*
+     * exonerate GFF has the strand of the target in column 7
+     * (differs from GFF3 which has it in the Target descriptor)
+     */
+    String strand = gff[STRAND_COL];
+    boolean forwardStrand = true;
+    if ("-".equals(strand))
+    {
+      forwardStrand = false;
+    }
+    else if (!"+".equals(strand))
+    {
+      System.err.println("Strand must be specified for alignment");
+      return;
+    }
+
+    List<String> alignedRegions = set.get(ALIGN);
+    for (String region : alignedRegions)
+    {
+      MapList mapping = buildMapping(region, type, forwardStrand,
+              featureIsOnTarget, gff);
+
+      if (mapping == null)
+      {
+        continue;
+      }
+
+      acf.addMap(mapFromSequence, mapToSequence, mapping);
+    }
+    align.addCodonFrame(acf);
+  }
+
+  /**
+   * Construct the mapping
+   * 
+   * @param region
+   * @param type
+   * @param forwardStrand
+   * @param featureIsOnTarget
+   * @param gff
+   * @return
+   */
+  protected MapList buildMapping(String region, MappingType type,
+          boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
+  {
+    /*
+     * process one "fromStart toStart fromCount" descriptor
+     */
+    String[] tokens = region.split(" ");
+    if (tokens.length != 3)
+    {
+      System.err.println("Malformed Align descriptor: " + region);
+      return null;
+    }
+
+    /*
+     * get start/end of from/to mappings
+     * if feature is on the target sequence we have to invert the sense
+     */
+    int alignFromStart;
+    int alignToStart;
+    int alignCount;
+    try {
+      alignFromStart = Integer.parseInt(tokens[0]);
+      alignToStart = Integer.parseInt(tokens[1]);
+      alignCount = Integer.parseInt(tokens[2]);
+    } catch (NumberFormatException nfe) {
+      System.err.println(nfe.toString());
+      return null;
+    }
+    
+    int fromStart;
+    int fromEnd;
+    int toStart;
+    int toEnd;
+
+    if (featureIsOnTarget)
+    {
+      fromStart = alignToStart;
+      toStart = alignFromStart;
+      toEnd = forwardStrand ? toStart + alignCount - 1 : toStart
+              - (alignCount - 1);
+      int toLength = Math.abs(toEnd - toStart) + 1;
+      int fromLength = toLength * type.getFromRatio() / type.getToRatio();
+      fromEnd = fromStart + fromLength - 1;
+    }
+    else
+    {
+      // we use the 'Align' values here not the feature start/end
+      // not clear why they may differ but it seems they can
+      fromStart = alignFromStart;
+      fromEnd = alignFromStart + alignCount - 1;
+      int fromLength = fromEnd - fromStart + 1;
+      int toLength = fromLength * type.getToRatio() / type.getFromRatio();
+      toStart = alignToStart;
+      if (forwardStrand)
+      {
+        toEnd = toStart + toLength - 1;
+      }
+      else
+      {
+        toEnd = toStart - (toLength - 1);
+      }
+    }
+
+    MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd,
+            toStart, toEnd, type);
+    return codonmapping;
+  }
+
+  /**
+   * Returns a MappingType depending on the exonerate 'model' value.
+   * 
+   * @param model
+   * @return
+   */
+  protected static MappingType getMappingType(String model)
+  {
+    MappingType result = null;
+
+    if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME))
+    {
+      result = MappingType.PeptideToNucleotide;
+    }
+    else if (model.contains(CODING2CODING)
+            || model.contains(CODING2GENOME)
+            || model.contains(CDNA2GENOME)
+            || model.contains(GENOME2GENOME))
+    {
+      result = MappingType.NucleotideToNucleotide;
+    }
+    return result;
+  }
+
+  /**
+   * Tests whether the GFF data looks like it was generated by exonerate, and is
+   * a format we are willing to handle
+   * 
+   * @param columns
+   * @return
+   */
+  public static boolean recognises(String[] columns)
+  {
+    if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
+    {
+      return false;
+    }
+
+    /*
+     * inspect alignment model
+     */
+    String model = columns[SOURCE_COL];
+    // e.g. exonerate:protein2genome:local
+    if (model != null)
+    {
+      String mdl = model.toLowerCase();
+      if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME)
+              || mdl.contains(CODING2CODING)
+              || mdl.contains(CODING2GENOME)
+              || mdl.contains(CDNA2GENOME)
+              || mdl.contains(GENOME2GENOME))
+      {
+        return true;
+      }
+    }
+    System.err.println("Sorry, I don't handle exonerate model " + model);
+    return false;
+  }
+
+  @Override
+  protected SequenceFeature buildSequenceFeature(String[] gff,
+          Map<String, List<String>> set)
+  {
+    SequenceFeature sf = super.buildSequenceFeature(gff, set);
+    sf.setFeatureGroup("exonerate");
+
+    return sf;
+  }
+
+}