JAL-1705 include any unmapped protein start 'X' when aligning to dna

author gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)
diff --git a/src/jalview/analysis/AlignmentUtils.java b/src/jalview/analysis/AlignmentUtils.java

index bdcfc2a..e624ce7 100644 (file)
--- a/src/jalview/analysis/AlignmentUtils.java
+++ b/src/jalview/analysis/AlignmentUtils.java
@@ -930,6 +930,34 @@ public class AlignmentUtils
    public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
    {
      List<SequenceI> unmappedProtein = new ArrayList<SequenceI>();
+    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
+            protein, dna, unmappedProtein);
+    return alignProteinAs(protein, alignedCodons, unmappedProtein);
+  }
+
+  /**
+   * Builds a map whose key is an aligned codon position (3 alignment column
+   * numbers base 0), and whose value is a map from protein sequence to each
+   * protein's peptide residue for that codon. The map generates an ordering of
+   * the codons, and allows us to read off the peptides at each position in
+   * order to assemble 'aligned' protein sequences.
+   * 
+   * @param protein
+   *          the protein alignment
+   * @param dna
+   *          the coding dna alignment
+   * @param unmappedProtein
+   *          any unmapped proteins are added to this list
+   * @return
+   */
+  protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
+          AlignmentI protein, AlignmentI dna,
+          List<SequenceI> unmappedProtein)
+  {
+    /*
+     * maintain a list of any proteins with no mappings - these will be
+     * rendered 'as is' in the protein alignment as we can't align them
+     */
      unmappedProtein.addAll(protein.getSequences());
  
      List<AlignedCodonFrame> mappings = protein.getCodonFrames();
@@ -939,24 +967,114 @@ public class AlignmentUtils
       * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
       * comparator keeps the codon positions ordered.
       */
-    Map<AlignedCodon, Map<SequenceI, String>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, String>>(
+    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<AlignedCodon, Map<SequenceI, AlignedCodon>>(
              new CodonComparator());
+
      for (SequenceI dnaSeq : dna.getSequences())
      {
        for (AlignedCodonFrame mapping : mappings)
        {
-        Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
          SequenceI prot = mapping.findAlignedSequence(
                  dnaSeq.getDatasetSequence(), protein);
          if (prot != null)
          {
+          Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
            addCodonPositions(dnaSeq, prot, protein.getGapCharacter(),
                    seqMap, alignedCodons);
            unmappedProtein.remove(prot);
          }
        }
      }
-    return alignProteinAs(protein, alignedCodons, unmappedProtein);
+
+    /*
+     * Finally add any unmapped peptide start residues (e.g. for incomplete
+     * codons) as if at the codon position before the second residue
+     */
+    int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
+    addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
+    
+    return alignedCodons;
+  }
+
+  /**
+   * Scans for any protein mapped from position 2 (meaning unmapped start
+   * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
+   * preceding position in the alignment
+   * 
+   * @param alignedCodons
+   *          the codon-to-peptide map
+   * @param mappedSequenceCount
+   *          the number of distinct sequences in the map
+   */
+  protected static void addUnmappedPeptideStarts(
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+          int mappedSequenceCount)
+  {
+    // TODO there must be an easier way! root problem is that our mapping data
+    // model does not include phase so can't map part of a codon to a peptide
+    List<SequenceI> sequencesChecked = new ArrayList<SequenceI>();
+    AlignedCodon lastCodon = null;
+    Map<SequenceI, AlignedCodon> toAdd = new HashMap<SequenceI, AlignedCodon>();
+
+    for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
+            .entrySet())
+    {
+      for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
+              .entrySet())
+      {
+        SequenceI seq = sequenceCodon.getKey();
+        if (sequencesChecked.contains(seq))
+        {
+          continue;
+        }
+        sequencesChecked.add(seq);
+        AlignedCodon codon = sequenceCodon.getValue();
+        if (codon.peptideCol > 1)
+        {
+          System.err
+                  .println("Problem mapping protein with >1 unmapped start positions: "
+                          + seq.getName());
+        }
+        else if (codon.peptideCol == 1)
+        {
+          /*
+           * first position (peptideCol == 0) was unmapped - add it
+           */
+          if (lastCodon != null)
+          {
+            AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
+                    lastCodon.pos2, lastCodon.pos3, String.valueOf(seq
+                            .getCharAt(0)), 0);
+            toAdd.put(seq, firstPeptide);
+          }
+          else
+          {
+            /*
+             * unmapped residue at start of alignment (no prior column) -
+             * 'insert' at nominal codon [0, 0, 0]
+             */
+            AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
+                    String.valueOf(seq.getCharAt(0)), 0);
+            toAdd.put(seq, firstPeptide);
+          }
+        }
+        if (sequencesChecked.size() == mappedSequenceCount)
+        {
+          // no need to check past first mapped position in all sequences
+          break;
+        }
+      }
+      lastCodon = entry.getKey();
+    }
+
+    /*
+     * add any new codons safely after iterating over the map
+     */
+    for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
+    {
+      addCodonToMap(alignedCodons, startCodon.getValue(),
+              startCodon.getKey());
+    }
    }
  
    /**
@@ -971,7 +1089,7 @@ public class AlignmentUtils
     * @return
     */
    protected static int alignProteinAs(AlignmentI protein,
-          Map<AlignedCodon, Map<SequenceI, String>> alignedCodons,
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
            List<SequenceI> unmappedProtein)
    {
      /*
@@ -993,12 +1111,13 @@ public class AlignmentUtils
      int column = 0;
      for (AlignedCodon codon : alignedCodons.keySet())
      {
-      final Map<SequenceI, String> columnResidues = alignedCodons
+      final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
                .get(codon);
-      for (Entry<SequenceI, String> entry : columnResidues.entrySet())
+      for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
        {
          // place translated codon at its column position in sequence
-        entry.getKey().getSequence()[column] = entry.getValue().charAt(0);
+        entry.getKey().getSequence()[column] = entry.getValue().product
+                .charAt(0);
        }
        column++;
      }
@@ -1023,21 +1142,20 @@ public class AlignmentUtils
     */
    static void addCodonPositions(SequenceI dna, SequenceI protein,
            char gapChar, Mapping seqMap,
-          Map<AlignedCodon, Map<SequenceI, String>> alignedCodons)
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
    {
      Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
+
+    /*
+     * add codon positions, and their peptide translations, to the alignment
+     * map, while remembering the first codon mapped
+     */
      while (codons.hasNext())
      {
        try
        {
          AlignedCodon codon = codons.next();
-        Map<SequenceI, String> seqProduct = alignedCodons.get(codon);
-        if (seqProduct == null)
-        {
-          seqProduct = new HashMap<SequenceI, String>();
-          alignedCodons.put(codon, seqProduct);
-        }
-        seqProduct.put(protein, codon.product);
+        addCodonToMap(alignedCodons, codon, protein);
        } catch (IncompleteCodonException e)
        {
          // possible incomplete trailing codon - ignore
@@ -1046,6 +1164,26 @@ public class AlignmentUtils
    }
  
    /**
+   * Helper method to add a codon-to-peptide entry to the aligned codons map
+   * 
+   * @param alignedCodons
+   * @param codon
+   * @param protein
+   */
+  protected static void addCodonToMap(
+          Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
+          AlignedCodon codon, SequenceI protein)
+  {
+    Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
+    if (seqProduct == null)
+    {
+      seqProduct = new HashMap<SequenceI, AlignedCodon>();
+      alignedCodons.put(codon, seqProduct);
+    }
+    seqProduct.put(protein, codon);
+  }
+
+  /**
     * Returns true if a cDNA/Protein mapping either exists, or could be made,
     * between at least one pair of sequences in the two alignments. Currently,
     * the logic is:
@@ -1474,8 +1612,7 @@ public class AlignmentUtils
     * @return
     */
    protected static List<List<int[]>> getMappedColumns(
-          List<SequenceI> mappedSequences,
-          List<AlignedCodonFrame> mappings)
+          List<SequenceI> mappedSequences, List<AlignedCodonFrame> mappings)
    {
      List<List<int[]>> result = new ArrayList<List<int[]>>();
      for (SequenceI seq : mappedSequences)
@@ -1668,8 +1805,8 @@ public class AlignmentUtils
     * @return
     */
    protected static MapList addCdsMappings(SequenceI dnaSeq,
-          SequenceI cdsSeq,
-          Mapping dnaMapping, AlignedCodonFrame newMappings)
+          SequenceI cdsSeq, Mapping dnaMapping,
+          AlignedCodonFrame newMappings)
    {
      cdsSeq.createDatasetSequence();
  
@@ -1682,14 +1819,13 @@ public class AlignmentUtils
      cdsRanges.add(new int[] { 1, cdsDataset.getLength() });
      MapList cdsToPeptide = new MapList(cdsRanges, dnaMapping.getMap()
              .getToRanges(), 3, 1);
-    newMappings.addMap(cdsDataset, dnaMapping.getTo(),
-            cdsToPeptide);
+    newMappings.addMap(cdsDataset, dnaMapping.getTo(), cdsToPeptide);
  
      /*
       * dna 'from' ranges map 1:1 to the contiguous extracted CDS 
       */
-    MapList dnaToCds = new MapList(
-            dnaMapping.getMap().getFromRanges(), cdsRanges, 1, 1);
+    MapList dnaToCds = new MapList(dnaMapping.getMap().getFromRanges(),
+            cdsRanges, 1, 1);
      newMappings.addMap(dnaSeq, cdsDataset, dnaToCds);
      return dnaToCds;
    }
diff --git a/src/jalview/datamodel/AlignedCodon.java b/src/jalview/datamodel/AlignedCodon.java

index 6179831..39a1853 100644 (file)
--- a/src/jalview/datamodel/AlignedCodon.java
+++ b/src/jalview/datamodel/AlignedCodon.java
@@ -27,32 +27,38 @@ package jalview.datamodel;
   * 
   * Example: in "G-AT-C-GA" the aligned codons are (0, 2, 3) and (5, 7, 8).
   * 
- * JBPComment: Is this useful anywhere other than jalview.analysis.Dna ?
- * 
   * @author gmcarstairs
   *
   */
  public final class AlignedCodon
  {
+  // base 1 aligned sequence position (base 0)
    public final int pos1;
  
+  // base 2 aligned sequence position (base 0)
    public final int pos2;
  
+  // base 3 aligned sequence position (base 0)
    public final int pos3;
  
+  // peptide aligned sequence position (base 0)
+  public final int peptideCol;
+
+  // peptide coded for by this codon
    public final String product;
  
    public AlignedCodon(int i, int j, int k)
    {
-    this(i, j, k, null);
+    this(i, j, k, null, 0);
    }
  
-  public AlignedCodon(int i, int j, int k, String prod)
+  public AlignedCodon(int i, int j, int k, String prod, int prodCol)
    {
      pos1 = i;
      pos2 = j;
      pos3 = k;
      product = prod;
+    peptideCol = prodCol;
    }
  
    /**
diff --git a/src/jalview/datamodel/Mapping.java b/src/jalview/datamodel/Mapping.java

index c4c4a2a..bd83fe9 100644 (file)
--- a/src/jalview/datamodel/Mapping.java
+++ b/src/jalview/datamodel/Mapping.java
@@ -155,8 +155,9 @@ public class Mapping
        int[] alignedCodon = getAlignedCodon(codon);
  
        String peptide = getPeptide();
+      int peptideCol = toPosition - 1 - Mapping.this.to.getStart();
        return new AlignedCodon(alignedCodon[0], alignedCodon[1],
-              alignedCodon[2], peptide);
+              alignedCodon[2], peptide, peptideCol);
      }
  
      /**
@@ -164,6 +165,8 @@ public class Mapping
       * sequence.
       * 
       * @return
+     * @throws NoSuchElementException
+     *           if the 'toRange' is exhausted (nothing to map to)
       */
      private String getPeptide()
      {
@@ -701,6 +704,15 @@ public class Mapping
      super.finalize();
    }
  
+  /**
+   * Returns an iterator which can serve up the aligned codon column positions
+   * and their corresponding peptide products
+   * 
+   * @param seq
+   *          an aligned (i.e. possibly gapped) sequence
+   * @param gapChar
+   * @return
+   */
    public Iterator<AlignedCodon> getCodonIterator(SequenceI seq, char gapChar)
    {
      return new AlignedCodonIterator(seq, gapChar);
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 2 Mar 2016 16:35:00 +0000 (16:35 +0000)
src/jalview/analysis/AlignmentUtils.java		patch \| blob \| history
src/jalview/datamodel/AlignedCodon.java		patch \| blob \| history
src/jalview/datamodel/Mapping.java		patch \| blob \| history