JAL-2049 separate protein variant per dna variant (combinations tbd)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 12 Apr 2016 09:36:51 +0000 (10:36 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 12 Apr 2016 09:36:51 +0000 (10:36 +0100)
src/jalview/analysis/AlignmentUtils.java
test/jalview/analysis/AlignmentUtilsTests.java

index 14e3907..28062c0 100644 (file)
@@ -1744,35 +1744,40 @@ public class AlignmentUtils
      * /ENSP00000288602?feature=transcript_variation;content-type=text/xml
      * which would be a bit slower but possibly more reliable
      */
-    LinkedHashMap<Integer, String[][]> variants = buildDnaVariantsMap(
+    LinkedHashMap<Integer, List<String[][]>> variants = buildDnaVariantsMap(
             dnaSeq, dnaToProtein);
 
     /*
      * scan codon variations, compute peptide variants and add to peptide sequence
      */
     int count = 0;
-    for (Entry<Integer, String[][]> variant : variants.entrySet())
+    for (Entry<Integer, List<String[][]>> variant : variants.entrySet())
     {
       int peptidePos = variant.getKey();
-      String[][] codonVariants = variant.getValue();
+      List<String[][]> codonVariants = variant.getValue();
       String residue = String.valueOf(peptide.getCharAt(peptidePos - 1)); // 0-based
-      List<String> peptideVariants = computePeptideVariants(codonVariants,
-              residue);
-      if (!peptideVariants.isEmpty())
+      for (String[][] codonVariant : codonVariants)
       {
-        String desc = residue + "," // include canonical residue in description
-                + StringUtils.listToDelimitedString(peptideVariants, ", ");
-        SequenceFeature sf = new SequenceFeature(
-                SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
-                peptidePos, 0f, null);
-        peptide.addSequenceFeature(sf);
-        count++;
+        List<String> peptideVariants = computePeptideVariants(codonVariant,
+                residue);
+        if (!peptideVariants.isEmpty())
+        {
+          String desc = residue
+                  + "->" // include canonical residue in description
+                  + StringUtils
+                          .listToDelimitedString(peptideVariants, ", ");
+          SequenceFeature sf = new SequenceFeature(
+                  SequenceOntologyI.SEQUENCE_VARIANT, desc, peptidePos,
+                  peptidePos, 0f, null);
+          peptide.addSequenceFeature(sf);
+          count++;
+        }
       }
     }
 
     /*
      * ugly sort to get sequence features in start position order
-     * - would be better to store in Sequence as a TreeSet instead?
+     * - would be better to store in Sequence as a TreeSet or NCList?
      */
     Arrays.sort(peptide.getSequenceFeatures(),
             new Comparator<SequenceFeature>()
@@ -1796,14 +1801,14 @@ public class AlignmentUtils
    * @param dnaToProtein
    * @return
    */
-  static LinkedHashMap<Integer, String[][]> buildDnaVariantsMap(
+  static LinkedHashMap<Integer, List<String[][]>> buildDnaVariantsMap(
           SequenceI dnaSeq, MapList dnaToProtein)
   {
     /*
      * map from peptide position to all variant features of the codon for it
      * LinkedHashMap ensures we add the peptide features in sequence order
      */
-    LinkedHashMap<Integer, String[][]> variants = new LinkedHashMap<Integer, String[][]>();
+    LinkedHashMap<Integer, List<String[][]>> variants = new LinkedHashMap<Integer, List<String[][]>>();
     SequenceOntologyI so = SequenceOntologyFactory.getInstance();
 
     SequenceFeature[] dnaFeatures = dnaSeq.getSequenceFeatures();
@@ -1836,10 +1841,10 @@ public class AlignmentUtils
           continue;
         }
         int peptidePosition = mapsTo[0];
-        String[][] codonVariants = variants.get(peptidePosition);
+        List<String[][]> codonVariants = variants.get(peptidePosition);
         if (codonVariants == null)
         {
-          codonVariants = new String[3][];
+          codonVariants = new ArrayList<String[][]>();
           variants.put(peptidePosition, codonVariants);
         }
 
@@ -1870,31 +1875,33 @@ public class AlignmentUtils
         /*
          * save nucleotide (and this variant) for each codon position
          */
+        String[][] codonVariant = new String[3][];
         for (int codonPos = 0; codonPos < 3; codonPos++)
         {
           String nucleotide = String.valueOf(
                   dnaSeq.getCharAt(codon[codonPos] - dnaStart))
                   .toUpperCase();
-          if (codonVariants[codonPos] == null)
+          if (codonVariant[codonPos] == null)
           {
             /*
              * record current dna base
              */
-            codonVariants[codonPos] = new String[] { nucleotide };
+            codonVariant[codonPos] = new String[] { nucleotide };
           }
           if (codon[codonPos] == dnaCol)
           {
             /*
              * add alleles to dna base (and any previously found alleles)
              */
-            String[] known = codonVariants[codonPos];
+            String[] known = codonVariant[codonPos];
             String[] dnaVariants = new String[alleles.length + known.length];
             System.arraycopy(known, 0, dnaVariants, 0, known.length);
             System.arraycopy(alleles, 0, dnaVariants, known.length,
                     alleles.length);
-            codonVariants[codonPos] = dnaVariants;
+            codonVariant[codonPos] = dnaVariants;
           }
         }
+        codonVariants.add(codonVariant);
       }
     }
     return variants;
index 7ccbf97..810ef5f 100644 (file)
@@ -1733,52 +1733,97 @@ public class AlignmentUtilsTests
     /*
      * first with no variants on dna
      */
-    LinkedHashMap<Integer, String[][]> variantsMap = AlignmentUtils
+    LinkedHashMap<Integer, List<String[][]>> variantsMap = AlignmentUtils
             .buildDnaVariantsMap(dna, map);
     assertTrue(variantsMap.isEmpty());
 
-    // single allele codon 1, on base 1
+    /*
+     * single allele codon 1, on base 1
+     */
     SequenceFeature sf = new SequenceFeature("sequence_variant", "", 1, 1,
             0f, null);
     sf.setValue("alleles", "T");
+    sf.setValue("ID", "sequence_variant:rs758803211");
     dna.addSequenceFeature(sf);
 
-    // two alleles codon 2, on bases 2 and 3
+    /*
+     * two alleles codon 2, on bases 2 and 3 (distinct variants)
+     */
     sf = new SequenceFeature("sequence_variant", "", 5, 5, 0f, null);
     sf.setValue("alleles", "T");
+    sf.setValue("ID", "sequence_variant:rs758803212");
     dna.addSequenceFeature(sf);
     sf = new SequenceFeature("sequence_variant", "", 6, 6, 0f, null);
     sf.setValue("alleles", "G");
+    sf.setValue("ID", "sequence_variant:rs758803213");
     dna.addSequenceFeature(sf);
 
-    // two alleles codon 3, both on base 2
+    /*
+     * two alleles codon 3, both on base 2 (one variant)
+     */
     sf = new SequenceFeature("sequence_variant", "", 8, 8, 0f, null);
     sf.setValue("alleles", "C, G");
+    sf.setValue("ID", "sequence_variant:rs758803214");
     dna.addSequenceFeature(sf);
 
     // no alleles on codon 4
-    // alleles on codon 5 on all 3 bases
+
+    /*
+     * alleles on codon 5 on all 3 bases (distinct variants)
+     */
     sf = new SequenceFeature("sequence_variant", "", 13, 13, 0f, null);
     sf.setValue("alleles", "C, G"); // (C duplicates given base value)
+    sf.setValue("ID", "sequence_variant:rs758803215");
     dna.addSequenceFeature(sf);
     sf = new SequenceFeature("sequence_variant", "", 14, 14, 0f, null);
     sf.setValue("alleles", "g, a"); // should force to upper-case
+    sf.setValue("ID", "sequence_variant:rs758803216");
     dna.addSequenceFeature(sf);
     sf = new SequenceFeature("sequence_variant", "", 15, 15, 0f, null);
     sf.setValue("alleles", "A, T");
+    sf.setValue("ID", "sequence_variant:rs758803217");
     dna.addSequenceFeature(sf);
 
+    /*
+     * build map - expect variants on positions 1, 2, 3, 5
+     */
     variantsMap = AlignmentUtils.buildDnaVariantsMap(dna, map);
     assertEquals(4, variantsMap.size());
+
+    /*
+     * one variant on protein position 1
+     */
+    assertEquals(1, variantsMap.get(1).size());
     assertTrue(Arrays.deepEquals(new String[][] { { "A", "T" }, { "T" },
-        { "G" } }, variantsMap.get(1)));
+        { "G" } }, variantsMap.get(1).get(0)));
+
+    /*
+     * two variants on protein position 2
+     */
+    assertEquals(2, variantsMap.get(2).size());
     assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A", "T" },
-        { "A", "G" } }, variantsMap.get(2)));
+        { "A" } }, variantsMap.get(2).get(0)));
+    assertTrue(Arrays.deepEquals(new String[][] { { "A" }, { "A" },
+        { "A", "G" } }, variantsMap.get(2).get(1)));
+
+    /*
+     * one variant on protein position 3
+     */
+    assertEquals(1, variantsMap.get(3).size());
     assertTrue(Arrays.deepEquals(new String[][] { { "T" },
-        { "T", "C", "G" }, { "T" } }, variantsMap.get(3)));
-    // duplicated bases are not removed here, handled in computePeptideVariants
+        { "T", "C", "G" }, { "T" } }, variantsMap.get(3).get(0)));
+
+    /*
+     * three variants on protein position 5
+     * duplicated bases are not removed here, handled in computePeptideVariants
+     */
+    assertEquals(3, variantsMap.get(5).size());
     assertTrue(Arrays.deepEquals(new String[][] { { "C", "C", "G" },
-        { "C", "G", "A" }, { "C", "A", "T" } }, variantsMap.get(5)));
+        { "C" }, { "C" } }, variantsMap.get(5).get(0)));
+    assertTrue(Arrays.deepEquals(new String[][] { { "C" },
+        { "C", "G", "A" }, { "C" } }, variantsMap.get(5).get(1)));
+    assertTrue(Arrays.deepEquals(new String[][] { { "C" }, { "C" },
+        { "C", "A", "T" } }, variantsMap.get(5).get(2)));
   }
 
   /**