Merge branch 'JAL-1312_ambiguity_na' into develop

author Jim Procter <jprocter@compbio.dundee.ac.uk>

Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)

committer Jim Procter <jprocter@compbio.dundee.ac.uk>

Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)
author Jim Procter <jprocter@compbio.dundee.ac.uk>
Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)
committer Jim Procter <jprocter@compbio.dundee.ac.uk>
Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)
diff --combined src/jalview/schemes/ResidueProperties.java

index 0ca155a,bfff972..d360995
--- 1/src/jalview/schemes/ResidueProperties.java
--- 2/src/jalview/schemes/ResidueProperties.java
+++ b/src/jalview/schemes/ResidueProperties.java
@@@ -18,6 -18,7 +18,7 @@@
   package jalview.schemes;
   
   import java.util.*;
+ import java.util.List;
   
   import java.awt.*;
   
@@@ -690,22 -691,135 +691,135 @@@ public class ResiduePropertie
       codonHash.put("STOP", STOP);
     }
   
-   public static Hashtable codonHash2 = new Hashtable();
+   /**
+    * Nucleotide ambiguity codes: maps each ambiguity symbol to the bases it
+    * may stand for.
+    */
+   public static final Hashtable<String,String[]> ambiguityCodes=new Hashtable<String,String[]>();
+   /**
+    * Codon triplets with additional symbols for unambiguous codons that include ambiguity codes
+    */
+   public static final Hashtable<String,String> codonHash2 = new Hashtable<String,String>();
+   
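+   /**
+    * Inverse of ambiguityCodes: for each base, the list of all ambiguity codes
+    * that can stand for it. Populated by buildAmbiguityCodonSet().
+    */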
+   public final static Hashtable<String,List<String>> _ambiguityCodes = new Hashtable<String,List<String>>();
+ 
   
     static
     {
+     /**
+      * 3.2. Purine (adenine or guanine): R
+      * 
+      * R is the symbol previously recommended [1].
+      */
+     ambiguityCodes.put("R", new String[]
+     { "A", "G" });
+ 
+     /**
+      * 3.3. Pyrimidine (thymine or cytosine): Y
+      * 
+      * Y is the symbol previously recommended [1].
+      */
+     ambiguityCodes.put("Y", new String[]
+     { "T", "C" });
+     /**
+      * 3.4. Adenine or thymine: W
+      * 
+      * Although several diverse symbols have been used for this pair, (and for
+      * the reciprocal pair G+C), only two symbols have a rational basis, L and
+      * W: L derives from DNA density (light; G+C - heavy - would thus be H); W
+      * derives from the strength of the hydrogen bonding interaction between the
+      * base pairs (weak for A+T: G +C - strong - would thus be S). However, the
+      * system recommended for the three-base series (not-A = B, etc., see below,
+      * section 3.8) rules out H as this would be not-G. W is thus recommended.
+      */
+     ambiguityCodes.put("W", new String[]
+     { "A", "T" });
+     /**
+      * 3.5. Guanine or cytosine: S
+      * 
+      * The choice of this symbol is discussed above in section 3.4.
+      */
+     ambiguityCodes.put("S", new String[]
+     { "G", "C" });
+     /**
+      * 3.6. Adenine or cytosine: M
+      * 
+      * There are few common features between A and C. The presence of an NH2
+      * group in similar positions on both bases (Fig. 1) makes possible a
+      * logically derived symbol. A and N being ruled out, M (from aMino) is
+      * recommended.
+      * 
+      * 
+      * Fig. 1. Origin of the symbols M and K The four bases are drawn so as to
+      * show the relationship between adenine and cytosine on the one hand, which
+      * both have aMino groups at the ring position most distant from the point
+      * of attachment to the sugar, and between guanine and thymine on the other,
+      * which both have Keto groups at the corresponding position. The ring atoms
+      * are numbered as recommended [24-26], although for the present purpose
+      * this has the disadvantage of giving discordant numbers to the
+      * corresponding positions.
+      */
+     ambiguityCodes.put("M", new String[]
+     { "A", "C" });
+     /**
+      * 3.7. Guanine or thymine: K By analogy with A and C (section 3.6), both G
+      * and T have Keto groups in similar positions (Fig. 1).
+      */
+     ambiguityCodes.put("K", new String[]
+     { "G", "T" });
+     /**
+      * 3.8. Adenine or thymine or cytosine: H
+      * 
+      * Not-G is the most simple means of memorising this combination and symbols
+      * logically related to G were examined. F and H would both be suitable, as
+      * the letters before and after G in the alphabet, but A would have no
+      * equivalent to F. The use of H has historical precedence [2].
+      */
+     ambiguityCodes.put("H", new String[]
+     { "A", "T", "C" });
+     /**
+      * 3.9. Guanine or cytosine or thymine: B
+      * 
+      * Not-A as above (section 3.8).
+      */
+     ambiguityCodes.put("B", new String[]
+     { "G", "T", "C" });
+     /**
+      * 3.10. Guanine or adenine or cytosine: V
+      * 
+      * Not-T by analogy with not-G (section 3.8) would be U but this is ruled
+      * out to eliminate confusion with uracil. V is the next logical choice.
+      * Note that T and U may in some cases be considered to be synonyms.
+      */
+     ambiguityCodes.put("V", new String[]
+     { "G", "A", "C" });
+     /**
+      * 3.11. Guanine or adenine or thymine: D
+      * 
+      * Not-C as above (section 3.8).
+      */
+     ambiguityCodes.put("D", new String[]
+     { "G", "A", "T" });
+     /**
+      * 3.12. Guanine or adenine or thymine or cytosine: N
+      */
+     ambiguityCodes.put("R", new String[]
+     { "G", "A", "T", "C" });
+     // Now build codon translation table
       codonHash2.put("AAA", "K");
       codonHash2.put("AAG", "K");
       codonHash2.put("AAC", "N");
       codonHash2.put("AAT", "N");
   
-     codonHash2.put("CAA", "E");
-     codonHash2.put("CAG", "E");
+     codonHash2.put("CAA", "Q");
+     codonHash2.put("CAG", "Q");
       codonHash2.put("CAC", "H");
       codonHash2.put("CAT", "H");
   
-     codonHash2.put("GAA", "Q");
-     codonHash2.put("GAG", "Q");
+     codonHash2.put("GAA", "E");
+     codonHash2.put("GAG", "E");
       codonHash2.put("GAC", "D");
       codonHash2.put("GAT", "D");
   
@@@ -713,9 -827,9 +827,9 @@@
       codonHash2.put("TAT", "Y");
   
       codonHash2.put("ACA", "T");
-     codonHash2.put("AAG", "T");
       codonHash2.put("ACC", "T");
       codonHash2.put("ACT", "T");
+     codonHash2.put("ACG", "T");
   
       codonHash2.put("CCA", "P");
       codonHash2.put("CCG", "P");
@@@ -775,6 -889,145 +889,145 @@@
   
       codonHash2.put("TTC", "F");
       codonHash2.put("TTT", "F");
+     
+     buildAmbiguityCodonSet();
+   }
+   
+   /**
+    * Programmatic generation of codons including ambiguity codes.
+    * 
+    * Step 1 inverts ambiguityCodes into _ambiguityCodes, so that each base maps
+    * to the list of ambiguity symbols that can stand for it.
+    * 
+    * Step 2 visits every codon that is in codonHash2 when the step starts. For
+    * each one it enumerates the variants obtained by replacing one or more
+    * positions with an ambiguity symbol covering the base at that position. A
+    * variant is added to codonHash2 (mapped to the source codon's residue) only
+    * if every concrete codon the variant can expand to is already in codonHash2
+    * with that same residue.
+    * 
+    * Invoked from the static initialiser that fills codonHash2. A call made
+    * when _ambiguityCodes is already populated prints a message to System.err
+    * and returns without doing anything.
+    */
+   public static void buildAmbiguityCodonSet()
+   {
+     if (_ambiguityCodes.size() > 0) // inverse table already built by an earlier call
+     {
+       System.err
+               .println("Ignoring multiple calls to buildAmbiguityCodonSet");
+       return;
+     }
+     // Invert the ambiguity code set: base -> ambiguity codes that include it
+     for (Map.Entry<String, String[]> acode : ambiguityCodes.entrySet())
+     {
+       for (String r : acode.getValue())
+       {
+         List<String> codesfor = _ambiguityCodes.get(r);
+         if (codesfor == null)
+         {
+           _ambiguityCodes.put(r, codesfor = new ArrayList<String>()); // first code seen for this base
+         }
+         if (!codesfor.contains(acode.getKey()))
+         {
+           codesfor.add(acode.getKey());
+         }
+         else
+         {
+           System.err
+                   .println("Inconsistency in the IUBMB ambiguity code nomenclature table: collision for "
+                           + acode.getKey() + " in residue " + r);
+         }
+       }
+     }
+     // and programmatically add in the ambiguity codes that yield the same amino
+     // acid (the keys are copied to an array first because codonHash2 is added to below)
+     String[] unambcodons = codonHash2.keySet().toArray(new String[codonHash2.size()]);
+     for (String codon : unambcodons)
+     {
+       String residue = codonHash2.get(codon);
+       String acodon[][] = new String[codon.length()][]; // acodon[i]: ambiguity codes covering the base at position i
+       for (int i = 0, iSize = codon.length(); i < iSize; i++)
+       {
+         String _ac = "" + codon.charAt(i);
+         List<String> acodes = _ambiguityCodes.get(_ac);
+         if (acodes != null)
+         {
+           acodon[i] = acodes.toArray(new String[acodes.size()]);
+         }
+         else
+         {
+           acodon[i] = new String[]
+           {}; // no ambiguity code covers this symbol
+         }
+       }
+       // enumerate all combinations and test for veracity of translation
+       int tpos[] = new int[codon.length()], cpos[] = new int[codon.length()];
+       for (int i = 0; i < tpos.length; i++)
+       {
+         tpos[i] = -1; // -1 means: keep the codon's own base at this position
+       }
+       tpos[acodon.length - 1] = 0; // first combination: ambiguity code 0 in the last position
+       int ipos, j;
+       while (tpos[0] < acodon[0].length)
+       {
+         // make all codons for this combination
+         char allres[][] = new char[tpos.length][]; // allres[i]: concrete bases permitted at position i
+         String _acodon = ""; // the candidate codon, built one symbol per position
+         char _anuc; // NOTE(review): declared but never used in this method
+         for (ipos = 0; ipos < tpos.length; ipos++)
+         {
+           if (acodon[ipos].length==0 || tpos[ipos] < 0)
+           {
+             _acodon += codon.charAt(ipos);
+             allres[ipos] = new char[]
+             { codon.charAt(ipos) };
+           }
+           else
+           {
+             _acodon += acodon[ipos][tpos[ipos]];
+             String[] altbase = ambiguityCodes.get(acodon[ipos][tpos[ipos]]); // bases this ambiguity code stands for
+             allres[ipos] = new char[altbase.length];
+             j = 0;
+             for (String ab : altbase)
+             {
+               allres[ipos][j++] = ab.charAt(0);
+             }
+           }
+         }
+         // test all codons for this combination
+         for (ipos = 0; ipos < cpos.length; ipos++)
+         {
+           cpos[ipos] = 0;
+         }
+         boolean valid = true;
+         do
+         {
+           String _codon = "";
+           for (j = 0; j < cpos.length; j++)
+           {
+             _codon += allres[j][cpos[j]];
+           }
+           String tr = codonHash2.get(_codon);
+           if (valid = (tr!=null && tr.equals(residue))) // assignment is intentional: records the result and tests it
+           {
+             // advance to next combination (rightmost index first, carrying leftwards)
+             ipos = acodon.length - 1;
+             while (++cpos[ipos] >= allres[ipos].length && ipos > 0)
+             {
+               cpos[ipos] = 0;
+               ipos--;
+             }
+           }
+         } while (valid && cpos[0] < allres[0].length); // stop at the first mismatch or once position 0 overflows
+         if (valid)
+         {
+           // Add this to the set of codons we will translate
+ //          System.out.println("Adding ambiguity codon: " + _acodon + " for "
+ //                  + residue);
+           codonHash2.put(_acodon, residue);
+         }
+         else
+         {
+ //          System.err.println("Rejecting ambiguity codon: " + _acodon
+ //                  + " for " + residue);
+         }
+         // next combination: bump the last position, resetting to -1 and carrying left on overflow
+         ipos = acodon.length - 1;
+         while (++tpos[ipos] >= acodon[ipos].length && ipos > 0)
+         {
+           tpos[ipos] = -1;
+           ipos--;
+         }
+       }
+     }
+ 
    }
   
     static
@@@ -1232,6 -1485,19 +1485,19 @@@
   
     public static String codonTranslate(String lccodon)
     {
+     if (false)
+     {
+       return _codonTranslate(lccodon);
+     }
+     String cdn = codonHash2.get(lccodon.toUpperCase());
+     if (cdn!=null && cdn.equals("*"))
+     {
+       return "STOP";
+     }
+     return cdn;
+   }
+   public static String _codonTranslate(String lccodon)
+   {
       String codon = lccodon.toUpperCase();
       // all base ambiguity codes yield an 'X' amino acid residue
       if (codon.indexOf('X') > -1 || codon.indexOf('N') > -1)
@@@ -1341,67 -1607,8 +1607,67 @@@
    static
    {
      toRNAssState = new Hashtable(); // raw Hashtable; every key and value stored below is a String
- -    toRNAssState.put(")", "S");
- -    toRNAssState.put("(", "S");
+ +    toRNAssState.put(")", "("); // both round brackets map to the opening one
+ +    toRNAssState.put("(", "(");
+ +    toRNAssState.put("]", "[");
+ +    toRNAssState.put("[", "[");
+ +    toRNAssState.put("{", "{");
+ +    toRNAssState.put("}", "{");
+ +    toRNAssState.put(">", ">"); // NOTE(review): angle brackets map to ">" while the other pairs map to their opening symbol - confirm intended
+ +    toRNAssState.put("<", ">");
+ +    toRNAssState.put("A", "A"); // letters: upper and lower case both map to the upper-case letter
+ +    toRNAssState.put("a", "A");
+ +    toRNAssState.put("B", "B");
+ +    toRNAssState.put("b", "B");
+ +    toRNAssState.put("C", "C");
+ +    toRNAssState.put("c", "C");
+ +    toRNAssState.put("D", "D");
+ +    toRNAssState.put("d", "D");
+ +    toRNAssState.put("1", "1"); // NOTE(review): "1"/"e" appear where the letter pattern would give "E"/"e"; no "E" key is added - confirm intended
+ +    toRNAssState.put("e", "1");
+ +    toRNAssState.put("F", "F");
+ +    toRNAssState.put("f", "F");
+ +    toRNAssState.put("G", "G");
+ +    toRNAssState.put("g", "G");
+ +    toRNAssState.put("2", "2"); // NOTE(review): "2"/"h" appear where the letter pattern would give "H"/"h"; no "H" key is added - confirm intended
+ +    toRNAssState.put("h", "2");
+ +    toRNAssState.put("I", "I");
+ +    toRNAssState.put("i", "I");
+ +    toRNAssState.put("J", "J");
+ +    toRNAssState.put("j", "J");
+ +    toRNAssState.put("K", "K");
+ +    toRNAssState.put("k", "K");
+ +    toRNAssState.put("L", "L");
+ +    toRNAssState.put("l", "L");
+ +    toRNAssState.put("M", "M");
+ +    toRNAssState.put("m", "M");
+ +    toRNAssState.put("N", "N");
+ +    toRNAssState.put("n", "N");
+ +    toRNAssState.put("O", "O");
+ +    toRNAssState.put("o", "O");
+ +    toRNAssState.put("P", "P");
+ +    toRNAssState.put("p", "P");
+ +    toRNAssState.put("Q", "Q");
+ +    toRNAssState.put("q", "Q");
+ +    toRNAssState.put("R", "R");
+ +    toRNAssState.put("r", "R");
+ +    toRNAssState.put("S", "S");
+ +    toRNAssState.put("s", "S");
+ +    toRNAssState.put("T", "T");
+ +    toRNAssState.put("t", "T");
+ +    toRNAssState.put("U", "U");
+ +    toRNAssState.put("u", "U");
+ +    toRNAssState.put("V", "V");
+ +    toRNAssState.put("v", "V");
+ +    toRNAssState.put("W", "W");
+ +    toRNAssState.put("w", "W");
+ +    toRNAssState.put("X", "X");
+ +    toRNAssState.put("x", "X");
+ +    toRNAssState.put("Y", "Y");
+ +    toRNAssState.put("y", "Y");
+ +    toRNAssState.put("Z", "Z");
+ +    toRNAssState.put("z", "Z");
+ +    
    }
   
     /**
author	Jim Procter <jprocter@compbio.dundee.ac.uk>
	Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)
committer	Jim Procter <jprocter@compbio.dundee.ac.uk>
	Mon, 17 Jun 2013 16:52:15 +0000 (17:52 +0100)