JAL-1693 make exon alignment for get-xref splitframe (with CDS xref)

[jalview.git] / src / jalview / analysis / AlignmentUtils.java
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index b78afeb..a811d84 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -35,16 +35,22 @@ import java.util.TreeMap;
  
  import jalview.datamodel.AlignedCodon;
  import jalview.datamodel.AlignedCodonFrame;
+import jalview.datamodel.Alignment;
  import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
+import jalview.datamodel.FeatureProperties;
  import jalview.datamodel.Mapping;
  import jalview.datamodel.SearchResults;
  import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceGroup;
  import jalview.datamodel.SequenceI;
  import jalview.schemes.ResidueProperties;
+import jalview.util.DBRefUtils;
  import jalview.util.MapList;
+import jalview.util.MappingUtils;
  
  /**
   * grab bag of useful alignment manipulation operations Expect these to be
@@ -1258,4 +1264,124 @@ public class AlignmentUtils
      }
      return false;
    }
+
+  /**
+   * Constructs an alignment consisting of the mapped exon regions in the given
+   * nucleotide sequences, and updates mappings to match.
+   * 
+   * @param dna
+   *          aligned dna sequences
+   * @param mappings
+   *          from dna to protein; these are replaced with new mappings
+   * @return an alignment whose sequences are the exon-only parts of the dna
+   *         sequences (or null if no exons are found)
+   */
+  public static AlignmentI makeExonAlignment(SequenceI[] dna,
+          Set<AlignedCodonFrame> mappings)
+  {
+    Set<AlignedCodonFrame> newMappings = new HashSet<AlignedCodonFrame>();
+    List<SequenceI> exonSequences = new ArrayList<SequenceI>();
+    
+    for (SequenceI dnaSeq : dna)
+    {
+      final SequenceI ds = dnaSeq.getDatasetSequence();
+      List<AlignedCodonFrame> seqMappings = MappingUtils
+              .findMappingsForSequence(ds, mappings);
+      if (!seqMappings.isEmpty())
+      {
+        /*
+         * We assume here that only one protein mapping is expected per dna
+         * sequence. Mapping to multiple protein sequences is conceivable but
+         * undefined. Splitting a mapping to one protein sequence across
+         * multiple mappings is possible but pathological. Need closer
+         * constraints on the contents of AlignedCodonFrame.
+         */
+        AlignedCodonFrame newMapping = new AlignedCodonFrame();
+        final SequenceI exonSequence = makeExonSequence(ds,
+                seqMappings.get(0), newMapping);
+        exonSequences.add(exonSequence);
+        newMappings.add(newMapping);
+      }
+    }
+    AlignmentI al = new Alignment(
+            exonSequences.toArray(new SequenceI[exonSequences.size()]));
+    al.setDataset(null);
+
+    /*
+     * Replace the old mappings with the new ones
+     */
+    mappings.clear();
+    mappings.addAll(newMappings);
+
+    return al;
+  }
+
+  /**
+   * Helper method to make an exon-only sequence and populate its mapping to
+   * protein
+   * <p>
+   * For example, if ggCCaTTcGAg has mappings [3, 4, 6, 7, 9, 10] to protein
+   * then generate a sequence CCTTGA with mapping [1, 6] to the same protein
+   * residues
+   * 
+   * @param dnaSeq
+   *          a dna dataset sequence
+   * @param mapping
+   *          the current mapping of the sequence to protein
+   * @param newMapping
+   *          the new mapping to populate, from the exon-only sequence
+   * @return
+   */
+  protected static SequenceI makeExonSequence(SequenceI dnaSeq,
+          AlignedCodonFrame acf, AlignedCodonFrame newMapping)
+  {
+    Mapping mapping = acf.getMappingForSequence(dnaSeq);
+    final char[] dna = dnaSeq.getSequence();
+    StringBuilder newSequence = new StringBuilder(dnaSeq.getLength());
+
+    /*
+     * Get the codon regions as { [2, 5], [7, 12], [14, 14] etc }
+     */
+    List<int[]> exonRanges = mapping.getMap().getFromRanges();
+    for (int[] range : exonRanges)
+    {
+      for (int pos = range[0]; pos <= range[1]; pos++)
+      {
+        newSequence.append(dna[pos - 1]);
+      }
+    }
+
+    SequenceI exon = new Sequence(dnaSeq.getName(), newSequence.toString());
+
+    /*
+     * Locate any xrefs to CDS database on the protein product and attach to the
+     * CDS sequence. Also add as a sub-token of the sequence name.
+     */
+    // default to "CDS" if we can't locate an actual gene id
+    String cdsAccId = FeatureProperties.getCodingFeature(DBRefSource.EMBL);
+    DBRefEntry[] cdsRefs = DBRefUtils.selectRefs(
+            mapping.getTo().getDBRef(), DBRefSource.CODINGDBS);
+    if (cdsRefs != null)
+    {
+      for (DBRefEntry cdsRef : cdsRefs)
+      {
+        exon.addDBRef(new DBRefEntry(cdsRef));
+        cdsAccId = cdsRef.getAccessionId();
+      }
+    }
+    exon.setName(exon.getName() + "|" + cdsAccId);
+    exon.createDatasetSequence();
+
+    /*
+     * Build new mappings - from the same protein regions, but now to contiguous
+     * exons
+     */
+    List<int[]> exonRange = new ArrayList<int[]>();
+    exonRange.add(new int[]
+    { 1, newSequence.length() });
+    MapList map = new MapList(exonRange, mapping.getMap().getToRanges(), 3, 1);
+    newMapping.addMap(exon.getDatasetSequence(), mapping.getTo(), map);
+
+    return exon;
+  }
  }