X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Fjalview%2Fanalysis%2FGeneticCodes.java;h=4c826b2201ffa9ecf39acc97d5eb72b69b0a9afd;hb=8a2a0cda7067530f8481c2aec203e18d555f2dfd;hp=88d4e69f51efb4961038ea66e03426fced27a7b5;hpb=a97dbd3e6e9707de13f47c66beaa15df8ea24d0e;p=jalview.git diff --git a/src/jalview/analysis/GeneticCodes.java b/src/jalview/analysis/GeneticCodes.java index 88d4e69..4c826b2 100644 --- a/src/jalview/analysis/GeneticCodes.java +++ b/src/jalview/analysis/GeneticCodes.java @@ -1,5 +1,26 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.analysis; +import java.util.Locale; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -7,6 +28,9 @@ import java.io.InputStreamReader; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; +import java.util.StringTokenizer; + +import jalview.bin.Console; /** * A singleton that provides instances of genetic code translation tables @@ -14,8 +38,26 @@ import java.util.Map; * @author gmcarstairs * @see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi */ -public class GeneticCodes +public final class GeneticCodes { + private static final int CODON_LENGTH = 3; + + private static final String QUOTE = "\""; + + /* + * nucleotides as ordered in data file + */ + private static final String NUCS = "TCAG"; + + private static final int NUCS_COUNT = NUCS.length(); + + private static final int NUCS_COUNT_SQUARED = NUCS_COUNT * NUCS_COUNT; + + private static final int NUCS_COUNT_CUBED = NUCS_COUNT * NUCS_COUNT + * NUCS_COUNT; + + private static final String AMBIGUITY_CODES_FILE = "/AmbiguityCodes.dat"; + private static final String RESOURCE_FILE = "/GeneticCodes.dat"; private static GeneticCodes instance = new GeneticCodes(); @@ -28,16 +70,6 @@ public class GeneticCodes private Map codeTables; /** - * Returns the singleton instance of this class - * - * @return - */ - public static GeneticCodes getInstance() - { - return instance; - } - - /** * Private constructor enforces singleton */ private GeneticCodes() @@ -51,9 +83,20 @@ public class GeneticCodes * so we can assume the Standard Code Table is the first */ codeTables = new LinkedHashMap<>(); + loadAmbiguityCodes(AMBIGUITY_CODES_FILE); loadCodes(RESOURCE_FILE); } - }; + } + + /** + * Returns the singleton instance of this class + * + * @return + */ + public static GeneticCodes getInstance() + { + return instance; + } /** * Returns the known code tables, in order of loading. @@ -95,55 +138,91 @@ public class GeneticCodes try { InputStream is = getClass().getResourceAsStream(fileName); + if (is == null) + { + jalview.bin.Console.errPrintln("Resource file not found: " + fileName); + return; + } BufferedReader dataIn = new BufferedReader(new InputStreamReader(is)); - String line = loadAmbiguityCodes(dataIn); + /* + * skip comments and start of table + */ + String line = ""; + while (line != null && !line.startsWith("Genetic-code-table")) + { + line = readLine(dataIn); + } + line = readLine(dataIn); - do + while (line.startsWith("{")) { - line = loadOneTable(line, dataIn); - } while (line != null); - } catch (IOException e) + line = loadOneTable(dataIn); + } + } catch (IOException | NullPointerException e) { - System.err.println("Error reading genetic codes data file: " - + e.getMessage()); + Console.error("Error reading genetic codes data file " + fileName + + ": " + e.getMessage()); + } + if (codeTables.isEmpty()) + { + jalview.bin.Console.errPrintln( + "No genetic code tables loaded, check format of file " + + fileName); } } /** - * Reads for header line "Ambiguity Codes" and saves following data up to the - * first "Table". Returns the next ("Table") line. + * Reads and saves Nucleotide ambiguity codes from a data file. The file may + * include comment lines (starting with #), a header 'DNA', and one line per + * ambiguity code, for example: + *

+ * R<tab>AG + *

+ * means that R is an ambiguity code meaning "A or G" * - * @param dataIn - * @return - * @throws IOException + * @param fileName */ - protected String loadAmbiguityCodes(BufferedReader dataIn) - throws IOException + protected void loadAmbiguityCodes(String fileName) { - /* - * get first non-comment line - */ - String line = readLine(dataIn); - if (line == null || !line.toUpperCase().startsWith("AMBIGUITY")) - { - return line; - } - while (true) + try { - line = readLine(dataIn); - if (line == null || line.toUpperCase().startsWith("TABLE")) + InputStream is = getClass().getResourceAsStream(fileName); + if (is == null) { - return line; + jalview.bin.Console.errPrintln("Resource file not found: " + fileName); + return; + } + BufferedReader dataIn = new BufferedReader(new InputStreamReader(is)); + String line = ""; + while (line != null) + { + line = readLine(dataIn); + if (line != null && !"DNA".equals(line.toUpperCase(Locale.ROOT))) + { + String[] tokens = line.split("\\t"); + if (tokens.length == 2) + { + ambiguityCodes.put(tokens[0].toUpperCase(Locale.ROOT), + tokens[1].toUpperCase(Locale.ROOT)); + } + else + { + jalview.bin.Console.errPrintln( + "Unexpected data in " + fileName + ": " + line); + } + } } - String[] tokens = line.split("\\t"); - ambiguityCodes.put(tokens[0].toUpperCase(), tokens[1].toUpperCase()); + } catch (IOException e) + { + Console.error("Error reading nucleotide ambiguity codes data file: " + + e.getMessage()); } } /** - * Reads up to and returns the next non-comment line. Comment lines start with - * a #. + * Reads up to and returns the next non-comment line, trimmed. Comment lines + * start with a #. Returns null at end of file. * * @param dataIn * @return @@ -156,63 +235,85 @@ public class GeneticCodes { line = readLine(dataIn); } - return line; + return line == null ? null : line.trim(); } /** - * Reads the next lines of the data file describing one translation table, and - * creates an instance of GeneticCodeI for it. Returns the next line of the - * file (or null at end of file). + * Reads the lines of the data file describing one translation table, and + * creates and stores an instance of GeneticCodeI. Returns the '{' line + * starting the next table, or the '}' line at end of all tables. Data format + * is + * + *

+   * {
+   *   name "Vertebrate Mitochondrial" ,
+   *   name "SGC1" ,
+   *   id 2 ,
+   *   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
+   *   sncbieaa "----------**--------------------MMMM----------**---M------------"
+   *   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+   *   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+   *   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+   * },
+   * 
* - * @param nextLine + * of which we parse the first name, the id, and the ncbieaa translations for + * codons as ordered by the Base1/2/3 lines. Note Base1/2/3 are included for + * readability and are in a fixed order, these are not parsed. The sncbieaa + * line marks alternative start codons, these are not parsed. * * @param dataIn * @return * @throws IOException */ - protected String loadOneTable(String nextLine, BufferedReader dataIn) throws IOException + protected String loadOneTable(BufferedReader dataIn) throws IOException { - String line = nextLine; - if (line == null) - { - return null; - } - - /* - * next line should be tab-delimited "Table", id and description - */ - String[] tokens = line.split("\\t"); - String id = tokens[1]; - String name = tokens[2]; - - /* - * followed by codon translations - * - the full set for the first (Standard) code - * - variations (if any) for other codes - */ + String name = null; + String id = null; Map codons = new HashMap<>(); - while (true) + + String line = readLine(dataIn); + + while (line != null && !line.startsWith("}")) { - line = readLine(dataIn); - if (line == null) + if (line.startsWith("name") && name == null) { - registerCodeTable(id, name, codons); - return null; + name = line.substring(line.indexOf(QUOTE) + 1, + line.lastIndexOf(QUOTE)); } - tokens = line.split("\\t"); - String codon = tokens[0]; - String peptide = tokens[1]; - if ("Table".equalsIgnoreCase(codon)) + else if (line.startsWith("id")) { - /* - * start of next code table - construct this one, - * and return the next line of the data file - */ - registerCodeTable(id, name, codons); - return line; + id = new StringTokenizer(line.substring(2)).nextToken(); + } + else if (line.startsWith("ncbieaa")) + { + String aminos = line.substring(line.indexOf(QUOTE) + 1, + line.lastIndexOf(QUOTE)); + if (aminos.length() != NUCS_COUNT_CUBED) // 4 * 4 * 4 combinations + { + Console.error("wrong data length in code table: " + line); + } + else + { + for (int i = 0; i < aminos.length(); i++) + { + String peptide = String.valueOf(aminos.charAt(i)); + char codon1 = NUCS.charAt(i / NUCS_COUNT_SQUARED); + char codon2 = NUCS + .charAt((i % NUCS_COUNT_SQUARED) / NUCS_COUNT); + char codon3 = NUCS.charAt(i % NUCS_COUNT); + String codon = new String( + new char[] + { codon1, codon2, codon3 }); + codons.put(codon, peptide); + } + } } - codons.put(codon.toUpperCase(), peptide.toUpperCase()); + line = readLine(dataIn); } + + registerCodeTable(id, name, codons); + return readLine(dataIn); } /** @@ -239,38 +340,22 @@ public class GeneticCodes @Override public String translateCanonical(String codon) { - codon = codon.toUpperCase(); - String peptide = codons.get(codon); - if (peptide == null) - { - /* - * delegate an unspecified codon to the Standard Table, - * (unless this is the Standard Table!) - * but don't delegate ambiguity resolution - */ - GeneticCodeI standardCodeTable = getStandardCodeTable(); - if (this != standardCodeTable) - { - peptide = standardCodeTable.translateCanonical(codon); - } - } - return peptide; + return codons.get(codon.toUpperCase(Locale.ROOT)); } @Override public String translate(String codon) { - codon = codon.toUpperCase(); - String peptide = translateCanonical(codon); + String upper = codon.toUpperCase(Locale.ROOT); + String peptide = translateCanonical(upper); /* * if still not translated, check for ambiguity codes */ if (peptide == null) { - peptide = getAmbiguousTranslation(codon, ambiguous, this); + peptide = getAmbiguousTranslation(upper, ambiguous, this); } - return peptide; } @@ -302,29 +387,23 @@ public class GeneticCodes protected String getAmbiguousTranslation(String codon, Map ambiguous, GeneticCodeI codeTable) { - if (codon.length() != 3) + if (codon.length() != CODON_LENGTH) { return null; } boolean isAmbiguous = false; - String base1 = String.valueOf(codon.charAt(0)); - if (ambiguityCodes.containsKey(base1)) - { - isAmbiguous = true; - base1 = ambiguityCodes.get(base1); - } - String base2 = String.valueOf(codon.charAt(1)); - if (ambiguityCodes.containsKey(base2)) - { - isAmbiguous = true; - base2 = ambiguityCodes.get(base2); - } - String base3 = String.valueOf(codon.charAt(2)); - if (ambiguityCodes.containsKey(base3)) + + char[][] expanded = new char[CODON_LENGTH][]; + for (int i = 0; i < CODON_LENGTH; i++) { - isAmbiguous = true; - base3 = ambiguityCodes.get(base3); + String base = String.valueOf(codon.charAt(i)); + if (ambiguityCodes.containsKey(base)) + { + isAmbiguous = true; + base = ambiguityCodes.get(base); + } + expanded[i] = base.toCharArray(); } if (!isAmbiguous) @@ -338,11 +417,11 @@ public class GeneticCodes * only return the translation if they all agree, else null */ String peptide = null; - for (char c1 : base1.toCharArray()) + for (char c1 : expanded[0]) { - for (char c2 : base2.toCharArray()) + for (char c2 : expanded[1]) { - for (char c3 : base3.toCharArray()) + for (char c3 : expanded[2]) { char[] cdn = new char[] { c1, c2, c3 }; String possibleCodon = String.valueOf(cdn);