JAL-3748 pass a list to findAlignedSequence to return mapping that relates CDS and...

[jalview.git] / src / jalview / analysis / GeneticCodes.java
diff --git a/src/jalview/analysis/GeneticCodes.java b/src/jalview/analysis/GeneticCodes.java

index 88d4e69..df1dd82 100644 (file)
--- a/src/jalview/analysis/GeneticCodes.java
+++ b/src/jalview/analysis/GeneticCodes.java
@@ -1,5 +1,27 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ * 
+ * This file is part of Jalview.
+ * 
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License 
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *  
+ * Jalview is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty 
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
+ * PURPOSE.  See the GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
  package jalview.analysis;
  
+import jalview.bin.Cache;
+
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.InputStream;
@@ -7,6 +29,7 @@ import java.io.InputStreamReader;
  import java.util.HashMap;
  import java.util.LinkedHashMap;
  import java.util.Map;
+import java.util.StringTokenizer;
  
  /**
   * A singleton that provides instances of genetic code translation tables
@@ -14,8 +37,26 @@ import java.util.Map;
   * @author gmcarstairs
   * @see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
   */
-public class GeneticCodes
+public final class GeneticCodes
  {
+  private static final int CODON_LENGTH = 3;
+
+  private static final String QUOTE = "\"";
+
+  /*
+   * nucleotides as ordered in data file
+   */
+  private static final String NUCS = "TCAG";
+
+  private static final int NUCS_COUNT = NUCS.length();
+
+  private static final int NUCS_COUNT_SQUARED = NUCS_COUNT * NUCS_COUNT;
+
+  private static final int NUCS_COUNT_CUBED = NUCS_COUNT * NUCS_COUNT
+          * NUCS_COUNT;
+
+  private static final String AMBIGUITY_CODES_FILE = "/AmbiguityCodes.dat";
+
    private static final String RESOURCE_FILE = "/GeneticCodes.dat";
  
    private static GeneticCodes instance = new GeneticCodes();
@@ -28,16 +69,6 @@ public class GeneticCodes
    private Map<String, GeneticCodeI> codeTables;
  
    /**
-   * Returns the singleton instance of this class
-   * 
-   * @return
-   */
-  public static GeneticCodes getInstance()
-  {
-    return instance;
-  }
-
-  /**
     * Private constructor enforces singleton
     */
    private GeneticCodes()
@@ -51,9 +82,20 @@ public class GeneticCodes
         * so we can assume the Standard Code Table is the first
         */
        codeTables = new LinkedHashMap<>();
+      loadAmbiguityCodes(AMBIGUITY_CODES_FILE);
        loadCodes(RESOURCE_FILE);
      }
-  };
+  }
+
+  /**
+   * Returns the singleton instance of this class
+   * 
+   * @return the single shared GeneticCodes instance
+   */
+  public static GeneticCodes getInstance()
+  {
+    return instance;
+  }
  
    /**
     * Returns the known code tables, in order of loading.
@@ -95,55 +137,93 @@ public class GeneticCodes
      try
      {
        InputStream is = getClass().getResourceAsStream(fileName);
+      if (is == null)
+      {
+        System.err.println("Resource file not found: " + fileName);
+        return;
+      }
        BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
  
-      String line = loadAmbiguityCodes(dataIn);
+      /*
+       * skip comments and start of table
+       */
+      String line = "";
+      while (line != null && !line.startsWith("Genetic-code-table"))
+      {
+        line = readLine(dataIn);
+      }
+      line = readLine(dataIn);
  
-      do
+      while (line.startsWith("{"))
        {
-        line = loadOneTable(line, dataIn);
-      } while (line != null);
-    } catch (IOException e)
+        line = loadOneTable(dataIn);
+      }
+    } catch (IOException | NullPointerException e)
      {
-      System.err.println("Error reading genetic codes data file: "
+      Cache.log.error(
+              "Error reading genetic codes data file " + fileName + ": "
                + e.getMessage());
      }
+    if (codeTables.isEmpty())
+    {
+      System.err.println(
+              "No genetic code tables loaded, check format of file "
+                      + fileName);
+    }
    }
  
    /**
-   * Reads for header line "Ambiguity Codes" and saves following data up to the
-   * first "Table". Returns the next ("Table") line.
+   * Reads and saves nucleotide ambiguity codes from a data file. The file may
+   * include comment lines (starting with #), a header 'DNA', and one line per
+   * ambiguity code, for example:
+   * <p>
+   * R&lt;tab&gt;AG
+   * <p>
+   * which means that R is an ambiguity code for "A or G"
     * 
-   * @param dataIn
-   * @return
-   * @throws IOException
+   * @param fileName name of the resource file, located with getResourceAsStream
     */
-  protected String loadAmbiguityCodes(BufferedReader dataIn)
-          throws IOException
+  protected void loadAmbiguityCodes(String fileName)
    {
-    /*
-     * get first non-comment line
-     */
-    String line = readLine(dataIn);
-    if (line == null || !line.toUpperCase().startsWith("AMBIGUITY"))
-    {
-      return line;
-    }
-    while (true)
+    try
      {
-      line = readLine(dataIn);
-      if (line == null || line.toUpperCase().startsWith("TABLE"))
+      InputStream is = getClass().getResourceAsStream(fileName);
+      if (is == null)
        {
-        return line;
+        System.err.println("Resource file not found: " + fileName);
+        return;
+      }
+      BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
+      String line = "";
+      while (line != null)
+      {
+        line = readLine(dataIn);
+        if (line != null && !"DNA".equals(line.toUpperCase()))
+        {
+          String[] tokens = line.split("\\t");
+          if (tokens.length == 2)
+          {
+          ambiguityCodes.put(tokens[0].toUpperCase(),
+                  tokens[1].toUpperCase());
+          }
+          else
+          {
+            System.err.println(
+                    "Unexpected data in " + fileName + ": " + line);
+          }
+        }
        }
-      String[] tokens = line.split("\\t");
-      ambiguityCodes.put(tokens[0].toUpperCase(), tokens[1].toUpperCase());
+    } catch (IOException e)
+    {
+      Cache.log.error(
+              "Error reading nucleotide ambiguity codes data file: "
+                      + e.getMessage());
      }
    }
  
    /**
-   * Reads up to and returns the next non-comment line. Comment lines start with
-   * a #.
+   * Reads up to and returns the next non-comment line, trimmed. Comment lines
+   * start with a #. Returns null at end of file.
     * 
     * @param dataIn
     * @return
@@ -156,63 +236,85 @@ public class GeneticCodes
      {
        line = readLine(dataIn);
      }
-    return line;
+    return line == null ? null : line.trim();
    }
  
    /**
-   * Reads the next lines of the data file describing one translation table, and
-   * creates an instance of GeneticCodeI for it. Returns the next line of the
-   * file (or null at end of file).
+   * Reads the lines of the data file describing one translation table, and
+   * creates and stores an instance of GeneticCodeI. Returns the '{' line
+   * starting the next table, or the '}' line at end of all tables. Data format
+   * is
     * 
-   * @param nextLine
+   * <pre>
+   * {
+   *   name "Vertebrate Mitochondrial" ,
+   *   name "SGC1" ,
+   *   id 2 ,
+   *   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
+   *   sncbieaa "----------**--------------------MMMM----------**---M------------"
+   *   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+   *   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+   *   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+   * },
+   * </pre>
+   * 
+   * of which we parse the first name, the id, and the ncbieaa translations for
+   * codons as ordered by the Base1/2/3 lines. Note that Base1/2/3 are included
+   * for readability and are in a fixed order; they are not parsed. The sncbieaa
+   * line marks alternative start codons; it is not parsed either.
     * 
     * @param dataIn
     * @return
     * @throws IOException
     */
-  protected String loadOneTable(String nextLine, BufferedReader dataIn) throws IOException
+  protected String loadOneTable(BufferedReader dataIn) throws IOException
    {
-    String line = nextLine;
-    if (line == null)
-    {
-      return null;
-    }
-    
-    /*
-     * next line should be tab-delimited "Table", id and description
-     */
-    String[] tokens = line.split("\\t");
-    String id = tokens[1];
-    String name = tokens[2];
-
-    /*
-     * followed by codon translations
-     * - the full set for the first (Standard) code
-     * - variations (if any) for other codes
-     */
+    String name = null;
+    String id = null;
      Map<String, String> codons = new HashMap<>();
-    while (true)
+
+    String line = readLine(dataIn);
+
+    while (line != null && !line.startsWith("}"))
      {
-      line = readLine(dataIn);
-      if (line == null)
+      if (line.startsWith("name") && name == null)
        {
-        registerCodeTable(id, name, codons);
-        return null;
+        name = line.substring(line.indexOf(QUOTE) + 1,
+                line.lastIndexOf(QUOTE));
        }
-      tokens = line.split("\\t");
-      String codon = tokens[0];
-      String peptide = tokens[1];
-      if ("Table".equalsIgnoreCase(codon))
+      else if (line.startsWith("id"))
        {
-        /*
-         * start of next code table - construct this one,
-         * and return the next line of the data file
-         */
-        registerCodeTable(id, name, codons);
-        return line;
+        id = new StringTokenizer(line.substring(2)).nextToken();
        }
-      codons.put(codon.toUpperCase(), peptide.toUpperCase());
+      else if (line.startsWith("ncbieaa"))
+      {
+        String aminos = line.substring(line.indexOf(QUOTE) + 1,
+                line.lastIndexOf(QUOTE));
+        if (aminos.length() != NUCS_COUNT_CUBED) // 4 * 4 * 4 combinations
+        {
+          Cache.log.error("wrong data length in code table: " + line);
+        }
+        else
+        {
+          for (int i = 0; i < aminos.length(); i++)
+          {
+            String peptide = String.valueOf(aminos.charAt(i));
+            char codon1 = NUCS.charAt(i / NUCS_COUNT_SQUARED);
+            char codon2 = NUCS
+                    .charAt((i % NUCS_COUNT_SQUARED) / NUCS_COUNT);
+            char codon3 = NUCS.charAt(i % NUCS_COUNT);
+            String codon = new String(
+                    new char[]
+                    { codon1, codon2, codon3 });
+            codons.put(codon, peptide);
+          }
+        }
+      }
+      line = readLine(dataIn);
      }
+
+    registerCodeTable(id, name, codons);
+    return readLine(dataIn);
    }
  
    /**
@@ -239,38 +341,22 @@ public class GeneticCodes
        @Override
        public String translateCanonical(String codon)
        {
-        codon = codon.toUpperCase();
-        String peptide = codons.get(codon);
-        if (peptide == null)
-        {
-          /*
-           * delegate an unspecified codon to the Standard Table, 
-           * (unless this is the Standard Table!)
-           * but don't delegate ambiguity resolution
-           */
-          GeneticCodeI standardCodeTable = getStandardCodeTable();
-          if (this != standardCodeTable)
-          {
-            peptide = standardCodeTable.translateCanonical(codon);
-          }
-        }
-        return peptide;
+        return codons.get(codon.toUpperCase());
        }
  
        @Override
        public String translate(String codon)
        {
-        codon = codon.toUpperCase();
-        String peptide = translateCanonical(codon);
+        String upper = codon.toUpperCase();
+        String peptide = translateCanonical(upper);
  
          /*
           * if still not translated, check for ambiguity codes
           */
          if (peptide == null)
          {
-          peptide = getAmbiguousTranslation(codon, ambiguous, this);
+          peptide = getAmbiguousTranslation(upper, ambiguous, this);
          }
-
          return peptide;
        }
  
@@ -302,29 +388,23 @@ public class GeneticCodes
    protected String getAmbiguousTranslation(String codon,
            Map<String, String> ambiguous, GeneticCodeI codeTable)
    {
-    if (codon.length() != 3)
+    if (codon.length() != CODON_LENGTH)
      {
        return null;
      }
  
      boolean isAmbiguous = false;
-    String base1 = String.valueOf(codon.charAt(0));
-    if (ambiguityCodes.containsKey(base1))
-    {
-      isAmbiguous = true;
-      base1 = ambiguityCodes.get(base1);
-    }
-    String base2 = String.valueOf(codon.charAt(1));
-    if (ambiguityCodes.containsKey(base2))
-    {
-      isAmbiguous = true;
-      base2 = ambiguityCodes.get(base2);
-    }
-    String base3 = String.valueOf(codon.charAt(2));
-    if (ambiguityCodes.containsKey(base3))
+
+    char[][] expanded = new char[CODON_LENGTH][];
+    for (int i = 0; i < CODON_LENGTH; i++)
      {
-      isAmbiguous = true;
-      base3 = ambiguityCodes.get(base3);
+      String base = String.valueOf(codon.charAt(i));
+      if (ambiguityCodes.containsKey(base))
+      {
+        isAmbiguous = true;
+        base = ambiguityCodes.get(base);
+      }
+      expanded[i] = base.toCharArray();
      }
  
      if (!isAmbiguous)
@@ -338,11 +418,11 @@ public class GeneticCodes
       * only return the translation if they all agree, else null
       */
      String peptide = null;
-    for (char c1 : base1.toCharArray())
+    for (char c1 : expanded[0])
      {
-      for (char c2 : base2.toCharArray())
+      for (char c2 : expanded[1])
        {
-        for (char c3 : base3.toCharArray())
+        for (char c3 : expanded[2])
          {
            char[] cdn = new char[] { c1, c2, c3 };
            String possibleCodon = String.valueOf(cdn);