From: gmungoc <g.m.carstairs@dundee.ac.uk>
Date: Wed, 2 Mar 2016 16:35:00 +0000 (+0000)
Subject: JAL-1705 include any unmapped protein start 'X' when aligning to dna
X-Git-Tag: Release_2_10_0~296^2~16
X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=3aa60eb1704441d7db522c13d6b45ab05cb43e2b;p=jalview.git

JAL-1705 include any unmapped protein start 'X' when aligning to dna
---

diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java
index bdcfc2a..e624ce7 100644
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -930,6 +930,34 @@ public class AlignmentUtils
   public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
   {
     List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
+    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
+            protein, dna, unmappedProtein);
+    return alignProteinAs(protein, alignedCodons, unmappedProtein);
+  }
+
+  /**
+   * Builds a map whose key is an aligned codon position (3 alignment column
+   * numbers base 0), and whose value is a map from protein sequence to each
+   * protein's peptide residue for that codon. The map generates an ordering of
+   * the codons, and allows us to read off the peptides at each position in
+   * order to assemble 'aligned' protein sequences.
+   * 
+   * @param protein
+   *          the protein alignment
+   * @param dna
+   *          the coding dna alignment
+   * @param unmappedProtein
+   *          any unmapped proteins are added to this list
+   * @return
+   */
+  protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
+          AlignmentI protein, AlignmentI dna,
+          List<SequenceI> unmappedProtein)
+  {
+    /*
+     * maintain a list of any proteins with no mappings - these will be
+     * rendered 'as is' in the protein alignment as we can't align them
+     */
     unmappedProtein.addAll(protein.getSequences());
 
     List<AlignedCodonFrame> mappings = protein.getCodonFrames();
@@ -939,24 +967,114 @@ public class AlignmentUtils
      * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
      * comparator keeps the codon positions ordered.
      */
-    Map<AlignedCodon, Map<SequenceI, String>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, String>>(
+    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, AlignedCodon>>(
             new CodonComparator());
+
     for (SequenceI dnaSeq : dna.getSequences())
     {
       for (AlignedCodonFrame mapping : mappings)
       {
-        Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
         SequenceI prot = mapping.findAlignedSequence(
                 dnaSeq.getDatasetSequence(), protein);
         if (prot != null)
         {
+          Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
           addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
                   seqMap, alignedCodons);
           unmappedProtein.remove(prot);
         }
       }
     }
-    return alignProteinAs(protein, alignedCodons, unmappedProtein);
+
+    /*
+     * Finally add any unmapped peptide start residues (e.g. for incomplete
+     * codons) as if at the codon position before the second residue
+     */
+    int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
+    addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
+    
+    return alignedCodons;
+  }
+
+  /**
+   * Scans for any protein mapped from position 2 (meaning unmapped start
+   * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
+   * preceding position in the alignment
+   * 
+   * @param alignedCodons
+   *          the codon-to-peptide map
+   * @param mappedSequenceCount
+   *          the number of distinct sequences in the map
+   */
+  protected static void addUnmappedPeptideStarts(
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+          int mappedSequenceCount)
+  {
+    // TODO there must be an easier way! root problem is that our mapping data
+    // model does not include phase so can't map part of a codon to a peptide
+    List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
+    AlignedCodon lastCodon = null;
+    Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
+
+    for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
+            .entrySet())
+    {
+      for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
+              .entrySet())
+      {
+        SequenceI seq = sequenceCodon.getKey();
+        if (sequencesChecked.contains(seq))
+        {
+          continue;
+        }
+        sequencesChecked.add(seq);
+        AlignedCodon codon = sequenceCodon.getValue();
+        if (codon.peptideCol > 1)
+        {
+          System.err
+                  .println("Problem mapping protein with >1 unmapped start positions: "
+                          + seq.getName());
+        }
+        else if (codon.peptideCol == 1)
+        {
+          /*
+           * first position (peptideCol == 0) was unmapped - add it
+           */
+          if (lastCodon != null)
+          {
+            AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
+                    lastCodon.pos2, lastCodon.pos3, String.valueOf(seq
+                            .getCharAt(0)), 0);
+            toAdd.put(seq, firstPeptide);
+          }
+          else
+          {
+            /*
+             * unmapped residue at start of alignment (no prior column) -
+             * 'insert' at nominal codon [0, 0, 0]
+             */
+            AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
+                    String.valueOf(seq.getCharAt(0)), 0);
+            toAdd.put(seq, firstPeptide);
+          }
+        }
+        if (sequencesChecked.size() == mappedSequenceCount)
+        {
+          // no need to check past first mapped position in all sequences
+          break;
+        }
+      }
+      lastCodon = entry.getKey();
+    }
+
+    /*
+     * add any new codons safely after iterating over the map
+     */
+    for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
+    {
+      addCodonToMap(alignedCodons, startCodon.getValue(),
+              startCodon.getKey());
+    }
   }
 
   /**
@@ -971,7 +1089,7 @@ public class AlignmentUtils
    * @return
    */
   protected static int alignProteinAs(AlignmentI protein,
-          Map<AlignedCodon, Map<SequenceI, String>> alignedCodons,
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
           List<SequenceI> unmappedProtein)
   {
     /*
@@ -993,12 +1111,13 @@ public class AlignmentUtils
     int column = 0;
     for (AlignedCodon codon : alignedCodons.keySet())
     {
-      final Map<SequenceI, String> columnResidues = alignedCodons
+      final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
               .get(codon);
-      for (Entry<SequenceI, String> entry : columnResidues.entrySet())
+      for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
       {
         // place translated codon at its column position in sequence
-        entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
+        entry.getKey().getSequence()[column] = entry.getValue().product
+                .charAt(0);
       }
       column++;
     }
@@ -1023,21 +1142,20 @@ public class AlignmentUtils
    */
   static void addCodonPositions(SequenceI dna, SequenceI protein,
           char gapChar, Mapping seqMap,
-          Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
   {
     Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
+
+    /*
+     * add codon positions, and their peptide translations, to the alignment
+     * map, while remembering the first codon mapped
+     */
     while (codons.hasNext())
     {
       try
       {
         AlignedCodon codon = codons.next();
-        Map<SequenceI, String> seqProduct = alignedCodons.get(codon);
-        if (seqProduct == null)
-        {
-          seqProduct = new HashMap<SequenceI, String>();
-          alignedCodons.put(codon, seqProduct);
-        }
-        seqProduct.put(protein, codon.product);
+        addCodonToMap(alignedCodons, codon, protein);
       } catch (IncompleteCodonException e)
       {
         // possible incomplete trailing codon - ignore
@@ -1046,6 +1164,26 @@ public class AlignmentUtils
   }
 
   /**
+   * Helper method to add a codon-to-peptide entry to the aligned codons map
+   * 
+   * @param alignedCodons
+   * @param codon
+   * @param protein
+   */
+  protected static void addCodonToMap(
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+          AlignedCodon codon, SequenceI protein)
+  {
+    Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
+    if (seqProduct == null)
+    {
+      seqProduct = new HashMap<SequenceI, AlignedCodon>();
+      alignedCodons.put(codon, seqProduct);
+    }
+    seqProduct.put(protein, codon);
+  }
+
+  /**
    * Returns true if a cDNA/Protein mapping either exists, or could be made,
    * between at least one pair of sequences in the two alignments. Currently,
    * the logic is:
@@ -1474,8 +1612,7 @@ public class AlignmentUtils
    * @return
    */
   protected static List<List<int[]>> getMappedColumns(
-          List<SequenceI> mappedSequences,
-          List<AlignedCodonFrame> mappings)
+          List<SequenceI> mappedSequences, List<AlignedCodonFrame> mappings)
   {
     List<List<int[]>> result = new ArrayList<List<int[]>>();
     for (SequenceI seq : mappedSequences)
@@ -1668,8 +1805,8 @@ public class AlignmentUtils
    * @return
    */
   protected static MapList addCdsMappings(SequenceI dnaSeq,
-          SequenceI cdsSeq,
-          Mapping dnaMapping, AlignedCodonFrame newMappings)
+          SequenceI cdsSeq, Mapping dnaMapping,
+          AlignedCodonFrame newMappings)
   {
     cdsSeq.createDatasetSequence();
 
@@ -1682,14 +1819,13 @@ public class AlignmentUtils
     cdsRanges.add(new int[] { 1, cdsDataset.getLength() });
     MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
             .getToRanges(), 3, 1);
-    newMappings.addMap(cdsDataset, dnaMapping.getTo(),
-            cdsToPeptide);
+    newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide);
 
     /*
      * dna 'from' ranges map 1:1 to the contiguous extracted CDS 
      */
-    MapList dnaToCds = new MapList(
-            dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1);
+    MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(),
+            cdsRanges, 1, 1);
     newMappings.addMap(dnaSeq, cdsDataset, dnaToCds);
     return dnaToCds;
   }
diff --git a/src/jalview/datamodel/AlignedCodon.java b/src/jalview/datamodel/AlignedCodon.java
index 6179831..39a1853 100644
--- a/src/jalview/datamodel/AlignedCodon.java
+++ b/src/jalview/datamodel/AlignedCodon.java
@@ -27,32 +27,38 @@ package jalview.datamodel;
  * 
  * Example: in "G-AT-C-GA" the aligned codons are (0, 2, 3) and (5, 7, 8).
  * 
- * JBPComment: Is this useful anywhere other than jalview.analysis.Dna ?
- * 
  * @author gmcarstairs
  *
  */
 public final class AlignedCodon
 {
+  // base 1 aligned sequence position (base 0)
   public final int pos1;
 
+  // base 2 aligned sequence position (base 0)
   public final int pos2;
 
+  // base 3 aligned sequence position (base 0)
   public final int pos3;
 
+  // peptide aligned sequence position (base 0)
+  public final int peptideCol;
+
+  // peptide coded for by this codon
   public final String product;
 
   public AlignedCodon(int i, int j, int k)
   {
-    this(i, j, k, null);
+    this(i, j, k, null, 0);
   }
 
-  public AlignedCodon(int i, int j, int k, String prod)
+  public AlignedCodon(int i, int j, int k, String prod, int prodCol)
   {
     pos1 = i;
     pos2 = j;
     pos3 = k;
     product = prod;
+    peptideCol = prodCol;
   }
 
   /**
diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java
index c4c4a2a..bd83fe9 100644
--- a/src/jalview/datamodel/Mapping.java
+++ b/src/jalview/datamodel/Mapping.java
@@ -155,8 +155,9 @@ public class Mapping
       int[] alignedCodon = getAlignedCodon(codon);
 
       String peptide = getPeptide();
+      int peptideCol = toPosition - 1 - Mapping.this.to.getStart();
       return new AlignedCodon(alignedCodon[0], alignedCodon[1],
-              alignedCodon[2], peptide);
+              alignedCodon[2], peptide, peptideCol);
     }
 
     /**
@@ -164,6 +165,8 @@ public class Mapping
      * sequence.
      * 
      * @return
+     * @throws NoSuchElementException
+     *           if the 'toRange' is exhausted (nothing to map to)
      */
     private String getPeptide()
     {
@@ -701,6 +704,15 @@ public class Mapping
     super.finalize();
   }
 
+  /**
+   * Returns an iterator which can serve up the aligned codon column positions
+   * and their corresponding peptide products
+   * 
+   * @param seq
+   *          an aligned (i.e. possibly gapped) sequence
+   * @param gapChar
+   * @return
+   */
   public Iterator<AlignedCodon> getCodonIterator(SequenceI seq, char gapChar)
   {
     return new AlignedCodonIterator(seq, gapChar);