JAL-2110 AlignmentUtils.alignCdsAsProtein implemented
authorgmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 6 Jul 2016 15:34:01 +0000 (16:34 +0100)
committergmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 6 Jul 2016 15:34:01 +0000 (16:34 +0100)
src/jalview/analysis/AlignmentUtils.java
test/jalview/datamodel/AlignmentTest.java

index 35aa91d..20fcf25 100644 (file)
@@ -885,7 +885,8 @@ public class AlignmentUtils
     int alignedCount = 0;
     for (SequenceI dnaSeq : dna.getSequences())
     {
-      if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings))
+      if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
+              dna.getGapCharacter()))
       {
         alignedCount++;
       }
@@ -894,31 +895,124 @@ public class AlignmentUtils
   }
 
   /**
-   * Helper method to align (if possible) the dna sequence (cds only) to match
-   * the alignment of a mapped protein sequence
+   * Helper method to align (if possible) the dna sequence to match the
+   * alignment of a mapped protein sequence. This is currently limited to
+   * handling coding sequence only.
    * 
-   * @param dnaSeq
+   * @param cdsSeq
    * @param protein
    * @param mappings
+   * @param gapChar
    * @return
    */
-  static boolean alignCdsSequenceAsProtein(SequenceI dnaSeq,
-          AlignmentI protein, List<AlignedCodonFrame> mappings)
+  static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
+          AlignmentI protein, List<AlignedCodonFrame> mappings, char gapChar)
   {
+    SequenceI cdsDss = cdsSeq.getDatasetSequence();
+    if (cdsDss == null)
+    {
+      System.err
+              .println("alignCdsSequenceAsProtein needs aligned sequence!");
+      return false;
+    }
+    
     List<AlignedCodonFrame> dnaMappings = MappingUtils
-            .findMappingsForSequence(dnaSeq, mappings);
+            .findMappingsForSequence(cdsSeq, mappings);
     for (AlignedCodonFrame mapping : dnaMappings)
     {
-      SequenceI peptide = mapping.findAlignedSequence(dnaSeq, protein);
+      SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
+      int peptideLength = peptide.getLength();
       if (peptide != null)
       {
-        Mapping map = mapping.getMappingBetween(dnaSeq, peptide);
+        Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
         if (map != null)
         {
+          MapList mapList = map.getMap();
+          if (map.getTo() == peptide.getDatasetSequence())
+          {
+            mapList = mapList.getInverse();
+          }
+          int cdsLength = cdsDss.getLength();
+          int mappedFromLength = MappingUtils.getLength(mapList
+                  .getFromRanges());
+          int mappedToLength = MappingUtils
+                  .getLength(mapList.getToRanges());
+          boolean addStopCodon = (cdsLength == mappedFromLength * 3 + 3)
+                  || (peptide.getDatasetSequence().getLength() == mappedFromLength - 1);
+          if (cdsLength != mappedToLength && !addStopCodon)
+          {
+            System.err
+                    .println(String
+                            .format("Can't align cds as protein (length mismatch %d/%d): %s",
+                                    cdsLength, mappedToLength,
+                                    cdsSeq.getName()));
+          }
+
           /*
-           * traverse peptide adding gaps or codons to new cds sequence
+           * pre-fill the aligned cds sequence with gaps
            */
-          MapList mapList = map.getMap();
+          char[] alignedCds = new char[peptideLength * 3
+                  + (addStopCodon ? 3 : 0)];
+          Arrays.fill(alignedCds, gapChar);
+
+          /*
+           * walk over the aligned peptide sequence and insert mapped 
+           * codons for residues in the aligned cds sequence 
+           */
+          char[] alignedPeptide = peptide.getSequence();
+          char[] nucleotides = cdsDss.getSequence();
+          int copiedBases = 0;
+          int cdsStart = cdsDss.getStart();
+          int proteinPos = peptide.getStart() - 1;
+          int cdsCol = 0;
+          for (char residue : alignedPeptide)
+          {
+            if (Comparison.isGap(residue))
+            {
+              cdsCol += 3;
+            }
+            else
+            {
+              proteinPos++;
+              int[] codon = mapList.locateInTo(proteinPos, proteinPos);
+              if (codon == null)
+              {
+                // e.g. incomplete start codon, X in peptide
+                cdsCol += 3;
+              }
+              else
+              {
+                for (int j = codon[0]; j <= codon[1]; j++)
+                {
+                  char mappedBase = nucleotides[j - cdsStart];
+                  alignedCds[cdsCol++] = mappedBase;
+                  copiedBases++;
+                }
+              }
+            }
+          }
+
+          /*
+           * append stop codon if not mapped from protein,
+           * closing it up to the end of the mapped sequence
+           */
+          if (copiedBases == nucleotides.length - 3)
+          {
+            for (int i = alignedCds.length - 1; i >= 0; i--)
+            {
+              if (!Comparison.isGap(alignedCds[i]))
+              {
+                cdsCol = i + 1; // gap just after end of sequence
+                break;
+              }
+            }
+            for (int i = nucleotides.length - 3; i < nucleotides.length; i++)
+            {
+              alignedCds[cdsCol++] = nucleotides[i];
+            }
+          }
+          cdsSeq.setSequence(new String(alignedCds));
+          return true;
         }
       }
     }
index 08a9441..07b8abf 100644 (file)
@@ -217,7 +217,7 @@ public class AlignmentTest
    * 
    * @throws IOException
    */
-  @Test(groups = { "Functional" }, enabled = false)
+  @Test(groups = { "Functional" }, enabled = true)
   // TODO review / update this test after redesign of alignAs method
   public void testAlignAs_cdnaAsProtein() throws IOException
   {
@@ -243,7 +243,7 @@ public class AlignmentTest
    * 
    * @throws IOException
    */
-  @Test(groups = { "Functional" }, enabled = false)
+  @Test(groups = { "Functional" }, enabled = true)
   // TODO review / update this test after redesign of alignAs method
   public void testAlignAs_cdnaAsProtein_singleSequence() throws IOException
   {
@@ -315,7 +315,12 @@ public class AlignmentTest
       acf.addMap(seqFrom, seqTo, ml);
     }
 
+    /*
+     * not sure whether mappings 'belong' or protein or nucleotide
+     * alignment, so adding to both ;~)
+     */
     alFrom.addCodonFrame(acf);
+    alTo.addCodonFrame(acf);
   }
 
   /**