2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;

import jalview.bin.Console;

36 * A singleton that provides instances of genetic code translation tables
39 * @see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
41 public final class GeneticCodes
43 private static final int CODON_LENGTH = 3;
45 private static final String QUOTE = "\"";
48 * nucleotides as ordered in data file
50 private static final String NUCS = "TCAG";
52 private static final int NUCS_COUNT = NUCS.length();
54 private static final int NUCS_COUNT_SQUARED = NUCS_COUNT * NUCS_COUNT;
56 private static final int NUCS_COUNT_CUBED = NUCS_COUNT * NUCS_COUNT
59 private static final String AMBIGUITY_CODES_FILE = "/AmbiguityCodes.dat";
61 private static final String RESOURCE_FILE = "/GeneticCodes.dat";
63 private static GeneticCodes instance = new GeneticCodes();
65 private Map<String, String> ambiguityCodes;
68 * loaded code tables, with keys in order of loading
70 private Map<String, GeneticCodeI> codeTables;
73 * Private constructor enforces singleton
75 private GeneticCodes()
79 ambiguityCodes = new HashMap<>();
82 * LinkedHashMap preserves order of addition of entries,
83 * so we can assume the Standard Code Table is the first
85 codeTables = new LinkedHashMap<>();
86 loadAmbiguityCodes(AMBIGUITY_CODES_FILE);
87 loadCodes(RESOURCE_FILE);
92 * Returns the singleton instance of this class
96 public static GeneticCodes getInstance()
102 * Returns the known code tables, in order of loading.
106 public Iterable<GeneticCodeI> getCodeTables()
108 return codeTables.values();
112 * Answers the code table with the given id
117 public GeneticCodeI getCodeTable(String id)
119 return codeTables.get(id);
123 * A convenience method that returns the standard code table (table 1). As
124 * implemented, this has to be the first table defined in the data file.
128 public GeneticCodeI getStandardCodeTable()
130 return codeTables.values().iterator().next();
134 * Loads the code tables from a data file
136 protected void loadCodes(String fileName)
140 InputStream is = getClass().getResourceAsStream(fileName);
143 System.err.println("Resource file not found: " + fileName);
146 BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
149 * skip comments and start of table
152 while (line != null && !line.startsWith("Genetic-code-table"))
154 line = readLine(dataIn);
156 line = readLine(dataIn);
158 while (line.startsWith("{"))
160 line = loadOneTable(dataIn);
162 } catch (IOException | NullPointerException e)
164 Console.error("Error reading genetic codes data file " + fileName
165 + ": " + e.getMessage());
167 if (codeTables.isEmpty())
170 "No genetic code tables loaded, check format of file "
176 * Reads and saves Nucleotide ambiguity codes from a data file. The file may
177 * include comment lines (starting with #), a header 'DNA', and one line per
178 * ambiguity code, for example:
182 * means that R is an ambiguity code meaning "A or G"
186 protected void loadAmbiguityCodes(String fileName)
190 InputStream is = getClass().getResourceAsStream(fileName);
193 System.err.println("Resource file not found: " + fileName);
196 BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
200 line = readLine(dataIn);
201 if (line != null && !"DNA".equals(line.toUpperCase(Locale.ROOT)))
203 String[] tokens = line.split("\\t");
204 if (tokens.length == 2)
206 ambiguityCodes.put(tokens[0].toUpperCase(Locale.ROOT),
207 tokens[1].toUpperCase(Locale.ROOT));
212 "Unexpected data in " + fileName + ": " + line);
216 } catch (IOException e)
218 Console.error("Error reading nucleotide ambiguity codes data file: "
224 * Reads up to and returns the next non-comment line, trimmed. Comment lines
225 * start with a #. Returns null at end of file.
229 * @throws IOException
231 protected String readLine(BufferedReader dataIn) throws IOException
233 String line = dataIn.readLine();
234 while (line != null && line.startsWith("#"))
236 line = readLine(dataIn);
238 return line == null ? null : line.trim();
242 * Reads the lines of the data file describing one translation table, and
243 * creates and stores an instance of GeneticCodeI. Returns the '{' line
244 * starting the next table, or the '}' line at end of all tables. Data format
249 * name "Vertebrate Mitochondrial" ,
252 * ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
253 * sncbieaa "----------**--------------------MMMM----------**---M------------"
254 * -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
255 * -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
256 * -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
260 * of which we parse the first name, the id, and the ncbieaa translations for
261 * codons as ordered by the Base1/2/3 lines. Note Base1/2/3 are included for
262 * readability and are in a fixed order, these are not parsed. The sncbieaa
263 * line marks alternative start codons, these are not parsed.
267 * @throws IOException
269 protected String loadOneTable(BufferedReader dataIn) throws IOException
273 Map<String, String> codons = new HashMap<>();
275 String line = readLine(dataIn);
277 while (line != null && !line.startsWith("}"))
279 if (line.startsWith("name") && name == null)
281 name = line.substring(line.indexOf(QUOTE) + 1,
282 line.lastIndexOf(QUOTE));
284 else if (line.startsWith("id"))
286 id = new StringTokenizer(line.substring(2)).nextToken();
288 else if (line.startsWith("ncbieaa"))
290 String aminos = line.substring(line.indexOf(QUOTE) + 1,
291 line.lastIndexOf(QUOTE));
292 if (aminos.length() != NUCS_COUNT_CUBED) // 4 * 4 * 4 combinations
294 Console.error("wrong data length in code table: " + line);
298 for (int i = 0; i < aminos.length(); i++)
300 String peptide = String.valueOf(aminos.charAt(i));
301 char codon1 = NUCS.charAt(i / NUCS_COUNT_SQUARED);
303 .charAt((i % NUCS_COUNT_SQUARED) / NUCS_COUNT);
304 char codon3 = NUCS.charAt(i % NUCS_COUNT);
305 String codon = new String(
307 { codon1, codon2, codon3 });
308 codons.put(codon, peptide);
312 line = readLine(dataIn);
315 registerCodeTable(id, name, codons);
316 return readLine(dataIn);
320 * Constructs and registers a GeneticCodeI instance with the codon
321 * translations as defined in the data file. For all instances except the
322 * first, any undeclared translations default to those in the standard code
329 protected void registerCodeTable(final String id, final String name,
330 final Map<String, String> codons)
332 codeTables.put(id, new GeneticCodeI()
335 * map of ambiguous codons to their 'product'
336 * (null if not all possible translations match)
338 Map<String, String> ambiguous = new HashMap<>();
341 public String translateCanonical(String codon)
343 return codons.get(codon.toUpperCase(Locale.ROOT));
347 public String translate(String codon)
349 String upper = codon.toUpperCase(Locale.ROOT);
350 String peptide = translateCanonical(upper);
353 * if still not translated, check for ambiguity codes
357 peptide = getAmbiguousTranslation(upper, ambiguous, this);
363 public String getId()
369 public String getName()
377 * Computes all possible translations of a codon including one or more
378 * ambiguity codes, and stores and returns the result (null if not all
379 * translations match). If the codon includes no ambiguity codes, simply
387 protected String getAmbiguousTranslation(String codon,
388 Map<String, String> ambiguous, GeneticCodeI codeTable)
390 if (codon.length() != CODON_LENGTH)
395 boolean isAmbiguous = false;
397 char[][] expanded = new char[CODON_LENGTH][];
398 for (int i = 0; i < CODON_LENGTH; i++)
400 String base = String.valueOf(codon.charAt(i));
401 if (ambiguityCodes.containsKey(base))
404 base = ambiguityCodes.get(base);
406 expanded[i] = base.toCharArray();
411 // no ambiguity code involved here
416 * generate and translate all permutations of the ambiguous codon
417 * only return the translation if they all agree, else null
419 String peptide = null;
420 for (char c1 : expanded[0])
422 for (char c2 : expanded[1])
424 for (char c3 : expanded[2])
426 char[] cdn = new char[] { c1, c2, c3 };
427 String possibleCodon = String.valueOf(cdn);
428 String pep = codeTable.translate(possibleCodon);
429 if (pep == null || (peptide != null && !pep.equals(peptide)))
431 ambiguous.put(codon, null);
440 * all translations of ambiguous codons matched!
442 ambiguous.put(codon, peptide);