X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fschemes%2FResidueProperties.java;h=301a410d5f2bb394433ecad649f9e031f11be4d5;hb=a8f483d04205bb8273ee311c12968b7e86d205fa;hp=99fc074c7bb60a3e6f815d679f85272a0fde11d7;hpb=a86681109f3b9be838ae3fb2dd9d6db544be4bce;p=jalview.git diff --git a/src/jalview/schemes/ResidueProperties.java b/src/jalview/schemes/ResidueProperties.java index 99fc074..301a410 100755 --- a/src/jalview/schemes/ResidueProperties.java +++ b/src/jalview/schemes/ResidueProperties.java @@ -1,29 +1,33 @@ /* - * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7) - * Copyright (C) 2011 J Procter, AM Waterhouse, J Engelhardt, LM Lui, G Barton, M Clamp, S Searle + * Jalview - A Sequence Alignment Editor and Viewer (Version 2.8.2) + * Copyright (C) 2014 The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - * + * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.schemes; -import java.util.*; +import jalview.analysis.scoremodels.PIDScoreModel; +import jalview.api.analysis.ScoreModelI; +import java.util.*; +import java.util.List; import java.awt.*; public class ResidueProperties { - public static Hashtable scoreMatrices = new Hashtable(); + public static Hashtable scoreMatrices = new Hashtable(); // Stores residue codes/names and colours and other things public static final int[] aaIndex; // aaHash version 2.1.1 and below @@ -98,14 +102,15 @@ public class ResidueProperties } /** - * maximum (gap) index for matrices involving protein alphabet + * maximum (gap) index for matrices involving protein alphabet */ - public final static int maxProteinIndex=23; + public final static int maxProteinIndex = 23; + /** - * maximum (gap) index for matrices involving nucleotide alphabet + * maximum (gap) index for matrices involving nucleotide alphabet */ - public final static int maxNucleotideIndex=10; - + public final static int maxNucleotideIndex = 10; + static { nucleotideIndex = new int[255]; @@ -586,22 +591,22 @@ public class ResidueProperties // Will equate sequences if working with mixed nucleotide sets. // treats T and U identically. R and Y weak equivalence with AG and CTU. // N matches any other base weakly - // + // static final int[][] DNA = { - { 10, -8, -8, -8, -8, 1, 1, 1, -8, 1, 1 }, // A - { -8, 10, -8, -8, -8, 1, 1, -8, 1, 1, 1 }, // C - { -8, -8, 10, -8, -8, 1, 1, 1, -8, 1, 1 }, // G - { -8, -8, -8, 10, 10, 1, 1, -8, 1, 1, 1 }, // T - { -8, -8, -8, 10, 10, 1, 1, -8, 1, 1, 1 }, // U - { 1, 1, 1, 1, 1, 10, 0, 0, 0, 1, 1 }, // I - { 1, 1, 1, 1, 1, 0, 10, 0, 0, 1, 1 }, // X - { 1, -8, 1, -8, -8, 0, 0, 10, -8, 1, 1 }, // R - { -8, 1, -8, 1, 1, 0, 0, -8, 10, 1, 1 }, // Y - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1 }, // N - { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // - + { 10, -8, -8, -8, -8, 1, 1, 1, -8, 1, 1 }, // A + { -8, 10, -8, -8, -8, 1, 1, -8, 1, 1, 1 }, // C + { -8, -8, 10, -8, -8, 1, 1, 1, -8, 1, 1 }, // G + { -8, -8, -8, 10, 10, 1, 1, -8, 1, 1, 1 }, // T + { -8, -8, -8, 10, 10, 1, 1, -8, 1, 1, 1 }, // U + { 1, 1, 1, 1, 1, 10, 0, 0, 0, 1, 1 }, // I + { 1, 1, 1, 1, 1, 0, 10, 0, 0, 1, 1 }, // X + { 1, -8, 1, -8, -8, 0, 0, 10, -8, 1, 1 }, // R + { -8, 1, -8, 1, 1, 0, 0, -8, 10, 1, 1 }, // Y + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1 }, // N + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // - }; -/** + /** * register matrices in list */ static @@ -609,7 +614,7 @@ public class ResidueProperties scoreMatrices.put("BLOSUM62", new ScoreMatrix("BLOSUM62", BLOSUM62, 0)); scoreMatrices.put("PAM250", new ScoreMatrix("PAM250", PAM250, 0)); scoreMatrices.put("DNA", new ScoreMatrix("DNA", DNA, 1)); - + } public static final Color[] pidColours = @@ -689,22 +694,135 @@ public class ResidueProperties codonHash.put("STOP", STOP); } - public static Hashtable codonHash2 = new Hashtable(); + /** + * Nucleotide Ambiguity Codes + */ + public static final Hashtable ambiguityCodes=new Hashtable(); + /** + * Codon triplets with additional symbols for unambiguous codons that include ambiguity codes + */ + public static final Hashtable codonHash2 = new Hashtable(); + + /** + * all ambiguity codes for a given base + */ + public final static Hashtable> _ambiguityCodes = new Hashtable>(); + static { + /** + * 3.2. Purine (adenine or guanine): R + * + * R is the symbol previously recommended [1]. + */ + ambiguityCodes.put("R", new String[] + { "A", "G" }); + + /** + * 3.3. Pyrimidine (thymine or cytosine): Y + * + * Y is the symbol previously recommended [1]. + */ + ambiguityCodes.put("Y", new String[] + { "T", "C" }); + /** + * 3.4. Adenine or thymine: W + * + * Although several diverse symbols have been used for this pair, (and for + * the reciprocal pair G+C), only two symbols have a rational basis, L and + * W: L derives from DNA density (light; G+C - heavy - would thus be H); W + * derives from the strength of the hydrogen bonding interaction between the + * base pairs (weak for A+T: G +C - strong - would thus be S). However, the + * system recommended for the three-base series (not-A = B, etc., see below, + * section 3.8) rules out H as this would be not-G. W is thus recommended. + */ + ambiguityCodes.put("W", new String[] + { "A", "T" }); + /** + * 3.5. Guanine or cytosine: S + * + * The choice of this symbol is discussed above in section 3.4. + */ + ambiguityCodes.put("S", new String[] + { "G", "C" }); + /** + * 3.6. Adenine or cytosine: M + * + * There are few common features between A and C. The presence of an NH2 + * group in similar positions on both bases (Fig. 1) makes possible a + * logically derived symbol. A and N being ruled out, M (from aMino) is + * recommended. + * + * + * Fig. 1. Origin of the symbols M and K The four bases are drawn so as to + * show the relationship between adenine and cytosine on the one hand, which + * both have aMino groups at the ring position most distant from the point + * of attachment to the sugar, and between guanine and thymine on the other, + * which both have Keto groups at the corresponding position. The ring atoms + * are numbered as recommended [24-26], although for the present purpose + * this has the disadvantage of giving discordant numbers to the + * corresponding positions. + */ + ambiguityCodes.put("M", new String[] + { "A", "C" }); + /** + * 3.7. Guanine or thymine: K By analogy with A and C (section 3.6), both G + * and T have Keto groups in similar positions (Fig. 1). + */ + ambiguityCodes.put("K", new String[] + { "G", "T" }); + /** + * 3.8. Adenine or thymine or cytosine: H + * + * Not-G is the most simple means of memorising this combination and symbols + * logically related to G were examined. F and H would both be suitable, as + * the letters before and after G in the alphabet, but A would have no + * equivalent to F. The use of H has historical precedence [2]. + */ + ambiguityCodes.put("H", new String[] + { "A", "T", "C" }); + /** + * 3.9. Guanine or cytosine or thymine: B + * + * Not-A as above (section 3.8). + */ + ambiguityCodes.put("B", new String[] + { "G", "T", "C" }); + /** + * 3.10. Guanine or adenine or cytosine: V + * + * Not-T by analogy with not-G (section 3.8) would be U but this is ruled + * out to eliminate confusion with uracil. V is the next logical choice. + * Note that T and U may in some cases be considered to be synonyms. + */ + ambiguityCodes.put("V", new String[] + { "G", "A", "C" }); + /** + * 3.11. Guanine or adenine or thymine: D + * + * Not-C as above (section 3.8). + */ + ambiguityCodes.put("D", new String[] + { "G", "A", "T" }); + /** + * 3.12. Guanine or adenine or thymine or cytosine: N + */ + ambiguityCodes.put("R", new String[] + { "G", "A", "T", "C" }); + // Now build codon translation table codonHash2.put("AAA", "K"); codonHash2.put("AAG", "K"); codonHash2.put("AAC", "N"); codonHash2.put("AAT", "N"); - codonHash2.put("CAA", "E"); - codonHash2.put("CAG", "E"); + codonHash2.put("CAA", "Q"); + codonHash2.put("CAG", "Q"); codonHash2.put("CAC", "H"); codonHash2.put("CAT", "H"); - codonHash2.put("GAA", "Q"); - codonHash2.put("GAG", "Q"); + codonHash2.put("GAA", "E"); + codonHash2.put("GAG", "E"); codonHash2.put("GAC", "D"); codonHash2.put("GAT", "D"); @@ -712,9 +830,9 @@ public class ResidueProperties codonHash2.put("TAT", "Y"); codonHash2.put("ACA", "T"); - codonHash2.put("AAG", "T"); codonHash2.put("ACC", "T"); codonHash2.put("ACT", "T"); + codonHash2.put("ACG", "T"); codonHash2.put("CCA", "P"); codonHash2.put("CCG", "P"); @@ -774,6 +892,145 @@ public class ResidueProperties codonHash2.put("TTC", "F"); codonHash2.put("TTT", "F"); + + buildAmbiguityCodonSet(); + } + + /** + * programmatic generation of codons including ambiguity codes + */ + public static void buildAmbiguityCodonSet() + { + if (_ambiguityCodes.size() > 0) + { + System.err + .println("Ignoring multiple calls to buildAmbiguityCodonSet"); + return; + } + // Invert the ambiguity code set + for (Map.Entry acode : ambiguityCodes.entrySet()) + { + for (String r : acode.getValue()) + { + List codesfor = _ambiguityCodes.get(r); + if (codesfor == null) + { + _ambiguityCodes.put(r, codesfor = new ArrayList()); + } + if (!codesfor.contains(acode.getKey())) + { + codesfor.add(acode.getKey()); + } + else + { + System.err + .println("Inconsistency in the IUBMB ambiguity code nomenclature table: collision for " + + acode.getKey() + " in residue " + r); + } + } + } + // and programmatically add in the ambiguity codes that yield the same amino + // acid + String[] unambcodons = codonHash2.keySet().toArray(new String[codonHash2.size()]); + for (String codon : unambcodons) + { + String residue = codonHash2.get(codon); + String acodon[][] = new String[codon.length()][]; + for (int i = 0, iSize = codon.length(); i < iSize; i++) + { + String _ac = "" + codon.charAt(i); + List acodes = _ambiguityCodes.get(_ac); + if (acodes != null) + { + acodon[i] = acodes.toArray(new String[acodes.size()]); + } + else + { + acodon[i] = new String[] + {}; + } + } + // enumerate all combinations and test for veracity of translation + int tpos[] = new int[codon.length()], cpos[] = new int[codon.length()]; + for (int i = 0; i < tpos.length; i++) + { + tpos[i] = -1; + } + tpos[acodon.length - 1] = 0; + int ipos, j; + while (tpos[0] < acodon[0].length) + { + // make all codons for this combination + char allres[][] = new char[tpos.length][]; + String _acodon = ""; + char _anuc; + for (ipos = 0; ipos < tpos.length; ipos++) + { + if (acodon[ipos].length==0 || tpos[ipos] < 0) + { + _acodon += codon.charAt(ipos); + allres[ipos] = new char[] + { codon.charAt(ipos) }; + } + else + { + _acodon += acodon[ipos][tpos[ipos]]; + String[] altbase = ambiguityCodes.get(acodon[ipos][tpos[ipos]]); + allres[ipos] = new char[altbase.length]; + j = 0; + for (String ab : altbase) + { + allres[ipos][j++] = ab.charAt(0); + } + } + } + // test all codons for this combination + for (ipos = 0; ipos < cpos.length; ipos++) + { + cpos[ipos] = 0; + } + boolean valid = true; + do + { + String _codon = ""; + for (j = 0; j < cpos.length; j++) + { + _codon += allres[j][cpos[j]]; + } + String tr = codonHash2.get(_codon); + if (valid = (tr!=null && tr.equals(residue))) + { + // advance to next combination + ipos = acodon.length - 1; + while (++cpos[ipos] >= allres[ipos].length && ipos > 0) + { + cpos[ipos] = 0; + ipos--; + } + } + } while (valid && cpos[0] < allres[0].length); + if (valid) + { + // Add this to the set of codons we will translate +// System.out.println("Adding ambiguity codon: " + _acodon + " for " +// + residue); + codonHash2.put(_acodon, residue); + } + else + { +// System.err.println("Rejecting ambiguity codon: " + _acodon +// + " for " + residue); + } + // next combination + ipos = acodon.length - 1; + while (++tpos[ipos] >= acodon[ipos].length && ipos > 0) + { + tpos[ipos] = -1; + ipos--; + } + } + } + } static @@ -1161,6 +1418,66 @@ public class ResidueProperties propHash.put("proline", proline); propHash.put("polar", polar); } + static + { + int[][] propMatrixF = new int[maxProteinIndex][maxProteinIndex], + propMatrixPos = new int[maxProteinIndex][maxProteinIndex], + propMatrixEpos = new int[maxProteinIndex][maxProteinIndex]; + for (int i=0;ii) { + ic+=aa[i]; + } + else {ic = "-";} + for (int j=i+1;jj) { + jc+=aa[j]; + } + else {jc = "-";} + propMatrixF[i][j]=0; + propMatrixPos[i][j]=0; + propMatrixEpos[i][j]=0; + for (Enumeration en= (Enumeration)propHash.keys(); en.hasMoreElements(); ) + { + String ph = en.nextElement(); + Map pph=(Map)propHash.get(ph); + if (pph.get(ic)!=null && pph.get(jc)!=null) { + int icp=pph.get(ic).intValue(),jcp=pph.get(jc).intValue(); + // Still working on these definitions. + propMatrixPos[i][j] += icp == jcp && icp>0 ? 2 : 0; + propMatrixPos[j][i] += icp == jcp && icp>0 ? 2 : 0; + propMatrixF[i][j] += icp == jcp ? 2 : 0; + propMatrixF[j][i] += icp == jcp ? 2 : 0; + propMatrixEpos[i][j] += icp == jcp ? (1+icp * 2) : 0; + propMatrixEpos[j][i] += icp == jcp ? (1+icp * 2) : 0; + }} + if (maxF -1 || codon.indexOf('N') > -1) @@ -1272,12 +1602,22 @@ public class ResidueProperties public static ScoreMatrix getScoreMatrix(String pwtype) { Object val = scoreMatrices.get(pwtype); - if (val != null) + if (val != null && val instanceof ScoreMatrix) { return (ScoreMatrix) val; } return null; } + /** + * get a ScoreModel based on its string name + * + * @param pwtype + * @return scoremodel of type pwtype or null + */ + public static ScoreModelI getScoreModel(String pwtype) + { + return scoreMatrices.get(pwtype); + } public static int getPAM250(char c, char d) { @@ -1339,9 +1679,68 @@ public class ResidueProperties public static Hashtable toRNAssState; static { - toRNAssState = new Hashtable(); - toRNAssState.put(")", "S"); - toRNAssState.put("(", "S"); + toRNAssState = new Hashtable(); + toRNAssState.put(")", "("); + toRNAssState.put("(", "("); + toRNAssState.put("]", "["); + toRNAssState.put("[", "["); + toRNAssState.put("{", "{"); + toRNAssState.put("}", "{"); + toRNAssState.put(">", ">"); + toRNAssState.put("<", ">"); + toRNAssState.put("A", "A"); + toRNAssState.put("a", "A"); + toRNAssState.put("B", "B"); + toRNAssState.put("b", "B"); + toRNAssState.put("C", "C"); + toRNAssState.put("c", "C"); + toRNAssState.put("D", "D"); + toRNAssState.put("d", "D"); + toRNAssState.put("E", "E"); + toRNAssState.put("e", "E"); + toRNAssState.put("F", "F"); + toRNAssState.put("f", "F"); + toRNAssState.put("G", "G"); + toRNAssState.put("g", "G"); + toRNAssState.put("H", "H"); + toRNAssState.put("h", "H"); + toRNAssState.put("I", "I"); + toRNAssState.put("i", "I"); + toRNAssState.put("J", "J"); + toRNAssState.put("j", "J"); + toRNAssState.put("K", "K"); + toRNAssState.put("k", "K"); + toRNAssState.put("L", "L"); + toRNAssState.put("l", "L"); + toRNAssState.put("M", "M"); + toRNAssState.put("m", "M"); + toRNAssState.put("N", "N"); + toRNAssState.put("n", "N"); + toRNAssState.put("O", "O"); + toRNAssState.put("o", "O"); + toRNAssState.put("P", "P"); + toRNAssState.put("p", "P"); + toRNAssState.put("Q", "Q"); + toRNAssState.put("q", "Q"); + toRNAssState.put("R", "R"); + toRNAssState.put("r", "R"); + toRNAssState.put("S", "S"); + toRNAssState.put("s", "S"); + toRNAssState.put("T", "T"); + toRNAssState.put("t", "T"); + toRNAssState.put("U", "U"); + toRNAssState.put("u", "U"); + toRNAssState.put("V", "V"); + toRNAssState.put("v", "V"); + toRNAssState.put("W", "W"); + toRNAssState.put("w", "W"); + toRNAssState.put("X", "X"); + toRNAssState.put("x", "X"); + toRNAssState.put("Y", "Y"); + toRNAssState.put("y", "Y"); + toRNAssState.put("Z", "Z"); + toRNAssState.put("z", "Z"); + } /**