From fbaefbc8965071fd362e9f47aa3d855dcdf312e9 Mon Sep 17 00:00:00 2001
From: Jim Procter
Date: Mon, 17 Jun 2013 10:40:26 +0100
Subject: [PATCH] JAL-529 - IUBMB nucleotide ambiguity codes and programmatic
 generation of codons and inverse tables

---
 src/jalview/schemes/ResidueProperties.java |  255 +++++++++++++++++++++++++++-
 test/jalview/schemes/DnaCodonTests.java    |   49 ++++++
 2 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 test/jalview/schemes/DnaCodonTests.java

diff --git a/src/jalview/schemes/ResidueProperties.java b/src/jalview/schemes/ResidueProperties.java
index fa4260f..a5c4e84 100755
--- a/src/jalview/schemes/ResidueProperties.java
+++ b/src/jalview/schemes/ResidueProperties.java
@@ -18,6 +18,7 @@
 package jalview.schemes;
 
 import java.util.*;
+import java.util.List;
 
 import java.awt.*;
 
@@ -690,10 +691,123 @@ public class ResidueProperties
     codonHash.put("STOP", STOP);
   }
 
-  public static Hashtable codonHash2 = new Hashtable();
+  /**
+   * Nucleotide Ambiguity Codes
+   */
+  public static final Hashtable<String, String[]> ambiguityCodes = new Hashtable<String, String[]>();
+
+  /**
+   * Codon triplets with additional symbols for unambiguous codons that include ambiguity codes
+   */
+  public static final Hashtable<String, String> codonHash2 = new Hashtable<String, String>();
+
+  /**
+   * all ambiguity codes for a given base
+   */
+  public final static Hashtable<String, List<String>> _ambiguityCodes = new Hashtable<String, List<String>>();
 
   static
   {
+    /**
+     * 3.2. Purine (adenine or guanine): R
+     *
+     * R is the symbol previously recommended [1].
+     */
+    ambiguityCodes.put("R", new String[]
+    { "A", "G" });
+
+    /**
+     * 3.3. Pyrimidine (thymine or cytosine): Y
+     *
+     * Y is the symbol previously recommended [1].
+     */
+    ambiguityCodes.put("Y", new String[]
+    { "T", "C" });
+    /**
+     * 3.4. Adenine or thymine: W
+     *
+     * Although several diverse symbols have been used for this pair, (and for
+     * the reciprocal pair G+C), only two symbols have a rational basis, L and
+     * W: L derives from DNA density (light; G+C - heavy - would thus be H); W
+     * derives from the strength of the hydrogen bonding interaction between the
+     * base pairs (weak for A+T: G +C - strong - would thus be S). However, the
+     * system recommended for the three-base series (not-A = B, etc., see below,
+     * section 3.8) rules out H as this would be not-G. W is thus recommended.
+     */
+    ambiguityCodes.put("W", new String[]
+    { "A", "T" });
+    /**
+     * 3.5. Guanine or cytosine: S
+     *
+     * The choice of this symbol is discussed above in section 3.4.
+     */
+    ambiguityCodes.put("S", new String[]
+    { "G", "C" });
+    /**
+     * 3.6. Adenine or cytosine: M
+     *
+     * There are few common features between A and C. The presence of an NH2
+     * group in similar positions on both bases (Fig. 1) makes possible a
+     * logically derived symbol. A and N being ruled out, M (from aMino) is
+     * recommended.
+     *
+     *
+     * Fig. 1. Origin of the symbols M and K The four bases are drawn so as to
+     * show the relationship between adenine and cytosine on the one hand, which
+     * both have aMino groups at the ring position most distant from the point
+     * of attachment to the sugar, and between guanine and thymine on the other,
+     * which both have Keto groups at the corresponding position. The ring atoms
+     * are numbered as recommended [24-26], although for the present purpose
+     * this has the disadvantage of giving discordant numbers to the
+     * corresponding positions.
+     */
+    ambiguityCodes.put("M", new String[]
+    { "A", "C" });
+    /**
+     * 3.7. Guanine or thymine: K By analogy with A and C (section 3.6), both G
+     * and T have Keto groups in similar positions (Fig. 1).
+     */
+    ambiguityCodes.put("K", new String[]
+    { "G", "T" });
+    /**
+     * 3.8. Adenine or thymine or cytosine: H
+     *
+     * Not-G is the most simple means of memorising this combination and symbols
+     * logically related to G were examined. F and H would both be suitable, as
+     * the letters before and after G in the alphabet, but A would have no
+     * equivalent to F. The use of H has historical precedence [2].
+     */
+    ambiguityCodes.put("H", new String[]
+    { "A", "T", "C" });
+    /**
+     * 3.9. Guanine or cytosine or thymine: B
+     *
+     * Not-A as above (section 3.8).
+     */
+    ambiguityCodes.put("B", new String[]
+    { "G", "T", "C" });
+    /**
+     * 3.10. Guanine or adenine or cytosine: V
+     *
+     * Not-T by analogy with not-G (section 3.8) would be U but this is ruled
+     * out to eliminate confusion with uracil. V is the next logical choice.
+     * Note that T and U may in some cases be considered to be synonyms.
+     */
+    ambiguityCodes.put("V", new String[]
+    { "G", "A", "C" });
+    /**
+     * 3.11. Guanine or adenine or thymine: D
+     *
+     * Not-C as above (section 3.8).
+     */
+    ambiguityCodes.put("D", new String[]
+    { "G", "A", "T" });
+    /**
+     * 3.12. Guanine or adenine or thymine or cytosine: N
+     */
+    ambiguityCodes.put("N", new String[]
+    { "G", "A", "T", "C" });
+    // Now build codon translation table
     codonHash2.put("AAA", "K");
     codonHash2.put("AAG", "K");
     codonHash2.put("AAC", "N");
@@ -775,6 +889,145 @@ public class ResidueProperties
 
     codonHash2.put("TTC", "F");
     codonHash2.put("TTT", "F");
+
+    buildAmbiguityCodonSet();
+  }
+
+  /**
+   * programmatic generation of codons including ambiguity codes
+   */
+  public static void buildAmbiguityCodonSet()
+  {
+    if (_ambiguityCodes.size() > 0)
+    {
+      System.err
+              .println("Ignoring multiple calls to buildAmbiguityCodonSet");
+      return;
+    }
+    // Invert the ambiguity code set
+    for (Map.Entry<String, String[]> acode : ambiguityCodes.entrySet())
+    {
+      for (String r : acode.getValue())
+      {
+        List<String> codesfor = _ambiguityCodes.get(r);
+        if (codesfor == null)
+        {
+          _ambiguityCodes.put(r, codesfor = new ArrayList<String>());
+        }
+        if (!codesfor.contains(acode.getKey()))
+        {
+          codesfor.add(acode.getKey());
+        }
+        else
+        {
+          System.err
+                  .println("Inconsistency in the IUBMB ambiguity code nomenclature table: collision for "
+                          + acode.getKey() + " in residue " + r);
+        }
+      }
+    }
+    // and programmatically add in the ambiguity codes that yield the same amino
+    // acid
+    String[] unambcodons = codonHash2.keySet().toArray(new String[codonHash2.size()]);
+    for (String codon : unambcodons)
+    {
+      String residue = codonHash2.get(codon);
+      String acodon[][] = new String[codon.length()][];
+      for (int i = 0, iSize = codon.length(); i < iSize; i++)
+      {
+        String _ac = "" + codon.charAt(i);
+        List<String> acodes = _ambiguityCodes.get(_ac);
+        if (acodes != null)
+        {
+          acodon[i] = acodes.toArray(new String[acodes.size()]);
+        }
+        else
+        {
+          acodon[i] = new String[]
+          {};
+        }
+      }
+      // enumerate all combinations and test for veracity of translation
+      int tpos[] = new int[codon.length()], cpos[] = new int[codon.length()];
+      for (int i = 0; i < tpos.length; i++)
+      {
+        tpos[i] = -1;
+      }
+      tpos[acodon.length - 1] = 0;
+      int ipos, j;
+      while (tpos[0] < acodon[0].length)
+      {
+        // make all codons for this combination
+        char allres[][] = new char[tpos.length][];
+        String _acodon = "";
+        char _anuc;
+        for (ipos = 0; ipos < tpos.length; ipos++)
+        {
+          if (acodon[ipos].length==0 || tpos[ipos] < 0)
+          {
+            _acodon += codon.charAt(ipos);
+            allres[ipos] = new char[]
+            { codon.charAt(ipos) };
+          }
+          else
+          {
+            _acodon += acodon[ipos][tpos[ipos]];
+            String[] altbase = ambiguityCodes.get(acodon[ipos][tpos[ipos]]);
+            allres[ipos] = new char[altbase.length];
+            j = 0;
+            for (String ab : altbase)
+            {
+              allres[ipos][j++] = ab.charAt(0);
+            }
+          }
+        }
+        // test all codons for this combination
+        for (ipos = 0; ipos < cpos.length; ipos++)
+        {
+          cpos[ipos] = 0;
+        }
+        boolean valid = true;
+        do
+        {
+          String _codon = "";
+          for (j = 0; j < cpos.length; j++)
+          {
+            _codon += allres[j][cpos[j]];
+          }
+          String tr = codonHash2.get(_codon);
+          if (valid = (tr!=null && tr.equals(residue)))
+          {
+            // advance to next combination
+            ipos = acodon.length - 1;
+            while (++cpos[ipos] >= allres[ipos].length && ipos > 0)
+            {
+              cpos[ipos] = 0;
+              ipos--;
+            }
+          }
+        } while (valid && cpos[0] < allres[0].length);
+        if (valid)
+        {
+          // Add this to the set of codons we will translate
+//          System.out.println("Adding ambiguity codon: " + _acodon + " for "
+//                  + residue);
+          codonHash2.put(_acodon, residue);
+        }
+        else
+        {
+//          System.err.println("Rejecting ambiguity codon: " + _acodon
+//                  + " for " + residue);
+        }
+        // next combination
+        ipos = acodon.length - 1;
+        while (++tpos[ipos] >= acodon[ipos].length && ipos > 0)
+        {
+          tpos[ipos] = -1;
+          ipos--;
+        }
+      }
+    }
+
   }
 
   static
diff --git a/test/jalview/schemes/DnaCodonTests.java b/test/jalview/schemes/DnaCodonTests.java
new file mode 100644
index 0000000..690a7bc
--- /dev/null
+++ b/test/jalview/schemes/DnaCodonTests.java
@@ -0,0 +1,49 @@
+package jalview.schemes;
+
+import static org.junit.Assert.*;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.ColumnSelection;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class DnaCodonTests
+{
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception
+  {
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception
+  {
+  }
+
+  @Test
+  public void testAmbiguityCodeGeneration()
+  {
+    assertTrue(ResidueProperties.ambiguityCodes.size()>0);
+  }
+  @Test
+  public void testAmbiguityCodon() {
+    for (String ac:ResidueProperties.ambiguityCodes.keySet())
+    {
+      assertTrue("Couldn't resolve GG" + ac + " as glycine codon", "G".equals(ResidueProperties.codonHash2.get("GG" + ac)));
+    }
+  }
+  @Test
+  public void regenerateCodonTable() {
+    for (Map.Entry<String, String> codon:ResidueProperties.codonHash2.entrySet())
+    {
+      System.out.println("ResidueProperties.codonHash2.put(\""+codon.getKey()+"\", \""+codon.getValue()+"\");");
+    }
+  }
+}
-- 
1.7.10.2