JAL-2620 parse genetic code tables from standard file format
author gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 19 Mar 2018 15:40:00 +0000 (15:40 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 19 Mar 2018 15:40:00 +0000 (15:40 +0000)
resources/AmbiguityCodes.dat [new file with mode: 0644]
resources/GeneticCodes.dat
src/jalview/analysis/GeneticCodes.java
test/jalview/analysis/GeneticCodesTest.java

diff --git a/resources/AmbiguityCodes.dat b/resources/AmbiguityCodes.dat
new file mode 100644 (file)
index 0000000..9372c03
--- /dev/null
@@ -0,0 +1,13 @@
+# source: IUPAC codes as per http://www.insdc.org/documents/feature_table.html#7.4.1
+DNA
+R      AG
+Y      TC
+W      AT
+S      GC
+M      AC
+K      GT
+H      ATC
+B      GTC
+V      GAC
+D      GAT
+N      GATC
index 4a739b7..ca1ed99 100644 (file)
-#
-# Genetic code translation tables
-# Standard code comes first
-# Other codes only list deviations from the standard
-# Columns are tab separated
-# source: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi (July 2017)
-#
-Ambiguity Codes
-R      AG
-Y      TC
-W      AT
-S      GC
-M      AC
-K      GT
-H      ATC
-B      GTC
-V      GAC
-D      GAT
-N      GATC
-Table  1       Standard
-AAA    K
-AAG    K
-AAC    N
-AAT    N
-CAA    Q
-CAG    Q
-CAC    H
-CAT    H
-GAA    E
-GAG    E
-GAC    D
-GAT    D
-TAC    Y
-TAT    Y
-ACA    T
-ACC    T
-ACT    T
-ACG    T
-CCA    P
-CCG    P
-CCC    P
-CCT    P
-GCA    A
-GCG    A
-GCC    A
-GCT    A
-TCA    S
-TCG    S
-TCC    S
-TCT    S
-AGC    S
-AGT    S
-AGA    R
-AGG    R
-CGA    R
-CGG    R
-CGC    R
-CGT    R
-GGA    G
-GGG    G
-GGC    G
-GGT    G
-TGA    *
-TAA    *
-TAG    *
-TGG    W
-TGC    C
-TGT    C
-ATA    I
-ATC    I
-ATT    I
-ATG    M
-CTA    L
-CTG    L
-CTC    L
-CTT    L
-TTA    L
-TTG    L
-GTA    V
-GTG    V
-GTC    V
-GTT    V
-TTC    F
-TTT    F
-Table  2       Vertebrate Mitochondrial
-AGA    *       # R
-AGG    *       # R
-ATA    M       # I
-TGA    W       # *
-Table  3       Yeast Mitochondrial
-ATA    M       # I
-CTT    T       # L
-CTC    T       # L
-CTA    T       # L
-CTG    T       # L
-TGA    W       # *
-Table  4       Mold, Protozoan, and Coelenterate Mitochondrial
-TGA    W       # *
-Table  5       Invertebrate Mitochondrial
-AGA    S       # R
-AGG    S       # R
-ATA    M       # I
-TGA    W       # *
-Table  6       Ciliate, Dasycladacean and Hexamita Nuclear
-TAA    Q       # *
-TAG    Q       # * 
-Table  9       Echinoderm and Flatworm Mitochondrial
-AAA    N       # K
-AGA    S       # R
-AGG    S       # R
-TGA    W       # *
-Table  10      Euplotid Nuclear
-TGA    C       #  *
-Table  11      Bacterial, Archaeal and Plant Plastid
-Table  12      Alternative Yeast Nuclear
-CTG    S       # L
-Table  13      Ascidian Mitochondrial
-AGA    G       # R
-AGG    G       # R
-ATA    M       # I
-TGA    W       # *
-Table  14      Alternative Flatworm Mitochondrial
-AAA    N       # K
-AGA    S       # R
-AGG    S       # R
-TAA    Y       # *
-TGA    W       # *
-Table  16      Chlorophycean Mitochondrial
-TAG    L       # *
-Table  21      Trematode Mitochondrial
-TGA    W       # *
-ATA    M       # I
-AGA    S       # R
-AGG    S       # R
-AAA    N       # K
-Table  22      Scenedesmus obliquus Mitochondrial
-TCA    *       # S
-TAG    L       # *
-Table  23      Thraustochytrium Mitochondrial
-TTA    *       # L
-Table  24      Pterobranchia Mitochondrial
-AGA    S       # R
-AGG    K       # R
-TGA    W       # *
-Table  25      Candidate Division SR1 and Gracilibacteria 
-TGA    G       # *
-Table  26      Pachysolen tannophilus Nuclear
-CTG    A       # L
-Table  27      Karyorelict Nuclear
-TAG    Q       # *
-TAA    Q       # *
-TGA    W       # or STOP       # * 
-Table  28      Condylostoma Nuclear
-TAA    Q       # or STOP       # *
-TAG    Q       # or STOP       # *
-TGA    W       # or STOP       # *
-Table  29      Mesodinium Nuclear
-TAA    Y       # *
-TAG    Y       # *
-Table  30      Peritrich Nuclear
-TAA    E       # *
-TAG    E       # *
-Table  31      Blastocrithidia Nuclear
-TGA    W       # *
-TAG    E       # or STOP       # * 
-TAA    E       # or STOP       # *
+-- source: ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt (19th March 2018)
+-- SGC3 edited so name is all on one line
+--**************************************************************************
+--  This is the NCBI genetic code table
+--  Initial base data set from Andrzej Elzanowski while at PIR International
+--  Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
+--  Base 1-3 of each codon have been added as comments to facilitate
+--    readability at the suggestion of Peter Rice, EMBL
+--  Later additions by Taxonomy Group staff at NCBI
+--
+--  Version 4.2
+--     Added Karyorelict nuclear genetic code 27
+--     Added Condylostoma nuclear genetic code 28
+--     Added Mesodinium nuclear genetic code 29
+--     Added Peritrich nuclear genetic code 30
+--     Added Blastocrithidia nuclear genetic code 31
+--
+--  Version 4.1
+--     Added Pachysolen tannophilus nuclear genetic code 26
+--
+--  Version 4.0
+--     Updated version to reflect numerous undocumented changes:
+--     Corrected start codons for genetic code 25
+--     Name of new genetic code is Candidate Division SR1 and Gracilibacteria
+--     Added candidate division SR1 nuclear genetic code 25
+--     Added GTG as start codon for genetic code 24
+--     Corrected Pterobranchia Mitochondrial genetic code (24)
+--     Added genetic code 24, Pterobranchia Mitochondrial
+--     Genetic code 11 is now Bacterial, Archaeal and Plant Plastid
+--     Fixed capitalization of mitochondrial in codes 22 and 23
+--     Added GTG, ATA, and TTG as alternative start codons to code 13
+--
+--  Version 3.9
+--     Code 14 differs from code 9 only by translating UAA to Tyr rather than
+--     STOP.  A recent study (Telford et al, 2000) has found no evidence that
+--     the codon UAA codes for Tyr in the flatworms, but other opinions exist.
+--     There are very few GenBank records that are translated with code 14,
+--     but a test translation shows that retranslating these records with code
+--     9 can cause premature terminations.  Therefore, GenBank will maintain
+--     code 14 until further information becomes available.
+--
+--  Version 3.8
+--     Added GTG start to Echinoderm mitochondrial code, code 9
+--
+--  Version 3.7
+--     Added code 23 Thraustochytrium mitochondrial code
+--        formerly OGMP code 93
+--        submitted by Gertraude Berger, Ph.D.
+--
+--  Version 3.6
+--     Added code 22 TAG-Leu, TCA-stop
+--        found in mitochondrial DNA of Scenedesmus obliquus
+--        submitted by Gertraude Berger, Ph.D.
+--        Organelle Genome Megasequencing Program, Univ Montreal
+--
+--  Version 3.5
+--     Added code 21, Trematode Mitochondrial
+--       (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
+--     Added code 16, Chlorophycean Mitochondrial
+--       (TAG can be translated to Leucine instead of STOP in chlorophyceans
+--        and fungi)
+--
+--  Version 3.4
+--     Added CTG,TTG as allowed alternate start codons in Standard code.
+--        Prats et al. 1989, Hann et al. 1992
+--
+--  Version 3.3 - 10/13/95
+--     Added alternate initiation codon ATC to code 5
+--        based on complete mitochondrial genome of honeybee
+--        Crozier and Crozier (1993)
+--
+--  Version 3.2 - 6/24/95
+--  Code       Comments
+--   10        Alternative Ciliate Macronuclear renamed to Euplotid Macro...
+--   15        Blepharisma Macro.. code added
+--    5        Invertebrate Mito.. GTG allowed as alternate initiator
+--   11        Eubacterial renamed to Bacterial as most alternate starts
+--               have been found in Archaea
+--
+--
+--  Version 3.1 - 1995
+--  Updated as per Andrzej Elzanowski at NCBI
+--     Complete documentation in NCBI toolkit documentation
+--  Note: 2 genetic codes have been deleted
+--
+--   Old id   Use id     - Notes
+--
+--   id 7      id 4      - Kinetoplast code now merged in code id 4
+--   id 8      id 1      - all plant chloroplast differences due to RNA edit
+--
+--*************************************************************************
+
+Genetic-code-table ::= {
+ {
+  name "Standard" ,
+  name "SGC0" ,
+  id 1 ,
+  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**--*----M---------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Vertebrate Mitochondrial" ,
+  name "SGC1" ,
+  id 2 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
+  sncbieaa "----------**--------------------MMMM----------**---M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Yeast Mitochondrial" ,
+  name "SGC2" ,
+  id 3 ,
+  ncbieaa  "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**----------------------MM----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+    name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" ,
+  name "SGC3" ,
+  id 4 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--MM------**-------M------------MMMM---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Invertebrate Mitochondrial" ,
+  name "SGC4" ,
+  id 5 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**--------------------MMMM---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
+  name "SGC5" ,
+  id 6 ,
+  ncbieaa  "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--------------*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
+  name "SGC8" ,
+  id 9 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
+  sncbieaa "----------**-----------------------M---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Euplotid Nuclear" ,
+  name "SGC9" ,
+  id 10 ,
+  ncbieaa  "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**-----------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Bacterial, Archaeal and Plant Plastid" ,
+  id 11 ,
+  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**--*----M------------MMMM---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Alternative Yeast Nuclear" ,
+  id 12 ,
+  ncbieaa  "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**--*----M---------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Ascidian Mitochondrial" ,
+  id 13 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**----------------------MM---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ },
+ {
+  name "Alternative Flatworm Mitochondrial" ,
+  id 14 ,
+  ncbieaa  "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
+  sncbieaa "-----------*-----------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Blepharisma Macronuclear" ,
+  id 15 ,
+  ncbieaa  "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------*---*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Chlorophycean Mitochondrial" ,
+  id 16 ,
+  ncbieaa  "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------*---*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Trematode Mitochondrial" ,
+  id 21 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
+  sncbieaa "----------**-----------------------M---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Scenedesmus obliquus Mitochondrial" ,
+  id 22 ,
+  ncbieaa  "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "------*---*---*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Thraustochytrium Mitochondrial" ,
+  id 23 ,
+  ncbieaa  "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--*-------**--*-----------------M--M---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Pterobranchia Mitochondrial" ,
+  id 24 ,
+  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**-------M---------------M---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Candidate Division SR1 and Gracilibacteria" ,
+  id 25 ,
+  ncbieaa  "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "---M------**-----------------------M---------------M------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Pachysolen tannophilus Nuclear" ,
+  id 26 ,
+  ncbieaa  "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**--*----M---------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Karyorelict Nuclear" ,
+  id 27 ,
+  ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--------------*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Condylostoma Nuclear" ,
+  id 28 ,
+  ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**--*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Mesodinium Nuclear" ,
+  id 29 ,
+  ncbieaa  "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--------------*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Peritrich Nuclear" ,
+  id 30 ,
+  ncbieaa  "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "--------------*--------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ } ,
+ {
+  name "Blastocrithidia Nuclear" ,
+  id 31 ,
+  ncbieaa  "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+  sncbieaa "----------**-----------------------M----------------------------"
+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+ }
+}
index 88d4e69..d07253e 100644 (file)
@@ -1,5 +1,7 @@
 package jalview.analysis;
 
+import jalview.bin.Cache;
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
@@ -7,6 +9,7 @@ import java.io.InputStreamReader;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.Map;
+import java.util.StringTokenizer;
 
 /**
  * A singleton that provides instances of genetic code translation tables
@@ -14,8 +17,26 @@ import java.util.Map;
  * @author gmcarstairs
  * @see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
  */
-public class GeneticCodes
+public final class GeneticCodes
 {
+  private static final int CODON_LENGTH = 3;
+
+  private static final String QUOTE = "\"";
+
+  /*
+   * nucleotides as ordered in data file
+   */
+  private static final String NUCS = "TCAG";
+
+  private static final int NUCS_COUNT = NUCS.length();
+
+  private static final int NUCS_COUNT_SQUARED = NUCS_COUNT * NUCS_COUNT;
+
+  private static final int NUCS_COUNT_CUBED = NUCS_COUNT * NUCS_COUNT
+          * NUCS_COUNT;
+
+  private static final String AMBIGUITY_CODES_FILE = "/AmbiguityCodes.dat";
+
   private static final String RESOURCE_FILE = "/GeneticCodes.dat";
 
   private static GeneticCodes instance = new GeneticCodes();
@@ -28,16 +49,6 @@ public class GeneticCodes
   private Map<String, GeneticCodeI> codeTables;
 
   /**
-   * Returns the singleton instance of this class
-   * 
-   * @return
-   */
-  public static GeneticCodes getInstance()
-  {
-    return instance;
-  }
-
-  /**
    * Private constructor enforces singleton
    */
   private GeneticCodes()
@@ -51,11 +62,22 @@ public class GeneticCodes
        * so we can assume the Standard Code Table is the first
        */
       codeTables = new LinkedHashMap<>();
+      loadAmbiguityCodes(AMBIGUITY_CODES_FILE);
       loadCodes(RESOURCE_FILE);
     }
   };
 
   /**
+   * Returns the singleton instance of this class
+   * 
+   * @return
+   */
+  public static GeneticCodes getInstance()
+  {
+    return instance;
+  }
+
+  /**
    * Returns the known code tables, in order of loading.
    * 
    * @return
@@ -97,53 +119,67 @@ public class GeneticCodes
       InputStream is = getClass().getResourceAsStream(fileName);
       BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
 
-      String line = loadAmbiguityCodes(dataIn);
+      /*
+       * skip comments and start of table
+       */
+      String line = "";
+      while (line != null && !line.startsWith("Genetic-code-table"))
+      {
+        line = readLine(dataIn);
+      }
+      line = readLine(dataIn);
 
-      do
+      while (line.startsWith("{"))
       {
-        line = loadOneTable(line, dataIn);
-      } while (line != null);
-    } catch (IOException e)
+        line = loadOneTable(dataIn);
+      }
+    } catch (IOException | NullPointerException e)
     {
-      System.err.println("Error reading genetic codes data file: "
+      Cache.log.error(
+              "Error reading genetic codes data file: "
               + e.getMessage());
     }
   }
 
   /**
-   * Reads for header line "Ambiguity Codes" and saves following data up to the
-   * first "Table". Returns the next ("Table") line.
+   * Reads and saves Nucleotide ambiguity codes from a data file. The file may
+   * include comment lines (starting with #), a header 'DNA', and one line per
+   * ambiguity code, for example:
+   * <p>
+   * R&lt;tab&gt;AG
+   * <p>
+   * means that R is an ambiguity code meaning "A or G"
    * 
-   * @param dataIn
-   * @return
-   * @throws IOException
+   * @param fileName
    */
-  protected String loadAmbiguityCodes(BufferedReader dataIn)
-          throws IOException
+  protected void loadAmbiguityCodes(String fileName)
   {
-    /*
-     * get first non-comment line
-     */
-    String line = readLine(dataIn);
-    if (line == null || !line.toUpperCase().startsWith("AMBIGUITY"))
-    {
-      return line;
-    }
-    while (true)
+    try
     {
-      line = readLine(dataIn);
-      if (line == null || line.toUpperCase().startsWith("TABLE"))
+      InputStream is = getClass().getResourceAsStream(fileName);
+      BufferedReader dataIn = new BufferedReader(new InputStreamReader(is));
+      String line = "";
+      while (line != null)
       {
-        return line;
+        line = readLine(dataIn);
+        if (line != null && !"DNA".equals(line.toUpperCase()))
+        {
+          String[] tokens = line.split("\\t");
+          ambiguityCodes.put(tokens[0].toUpperCase(),
+                  tokens[1].toUpperCase());
+        }
       }
-      String[] tokens = line.split("\\t");
-      ambiguityCodes.put(tokens[0].toUpperCase(), tokens[1].toUpperCase());
+    } catch (IOException e)
+    {
+      Cache.log.error(
+              "Error reading nucleotide ambiguity codes data file: "
+                      + e.getMessage());
     }
   }
 
   /**
-   * Reads up to and returns the next non-comment line. Comment lines start with
-   * a #.
+   * Reads up to and returns the next non-comment line, trimmed. Comment lines
+   * start with a #. Returns null at end of file.
    * 
    * @param dataIn
    * @return
@@ -156,63 +192,85 @@ public class GeneticCodes
     {
       line = readLine(dataIn);
     }
-    return line;
+    return line == null ? null : line.trim();
   }
 
   /**
-   * Reads the next lines of the data file describing one translation table, and
-   * creates an instance of GeneticCodeI for it. Returns the next line of the
-   * file (or null at end of file).
+   * Reads the lines of the data file describing one translation table, and
+   * creates and stores an instance of GeneticCodeI. Returns the '{' line
+   * starting the next table, or the '}' line at end of all tables. Data format
+   * is
+   * 
+   * <pre>
+   * {
+   *   name "Vertebrate Mitochondrial" ,
+   *   name "SGC1" ,
+   *   id 2 ,
+   *   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
+   *   sncbieaa "----------**--------------------MMMM----------**---M------------"
+   *   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+   *   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+   *   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+   * },
+   * </pre>
    * 
-   * @param nextLine
+   * of which we parse the first name, the id, and the ncbieaa translations for
+   * codons as ordered by the Base1/2/3 lines. Note Base1/2/3 are included for
+   * readability and are in a fixed order; these are not parsed. The sncbieaa
+   * line marks alternative start codons; these are not parsed.
    * 
    * @param dataIn
    * @return
    * @throws IOException
    */
-  protected String loadOneTable(String nextLine, BufferedReader dataIn) throws IOException
+  protected String loadOneTable(BufferedReader dataIn) throws IOException
   {
-    String line = nextLine;
-    if (line == null)
-    {
-      return null;
-    }
-    
-    /*
-     * next line should be tab-delimited "Table", id and description
-     */
-    String[] tokens = line.split("\\t");
-    String id = tokens[1];
-    String name = tokens[2];
-
-    /*
-     * followed by codon translations
-     * - the full set for the first (Standard) code
-     * - variations (if any) for other codes
-     */
+    String name = null;
+    String id = null;
     Map<String, String> codons = new HashMap<>();
-    while (true)
+
+    String line = readLine(dataIn);
+
+    while (line != null && !line.startsWith("}"))
     {
-      line = readLine(dataIn);
-      if (line == null)
+      if (line.startsWith("name") && name == null)
       {
-        registerCodeTable(id, name, codons);
-        return null;
+        name = line.substring(line.indexOf(QUOTE) + 1,
+                line.lastIndexOf(QUOTE));
       }
-      tokens = line.split("\\t");
-      String codon = tokens[0];
-      String peptide = tokens[1];
-      if ("Table".equalsIgnoreCase(codon))
+      else if (line.startsWith("id"))
       {
-        /*
-         * start of next code table - construct this one,
-         * and return the next line of the data file
-         */
-        registerCodeTable(id, name, codons);
-        return line;
+        id = new StringTokenizer(line.substring(2)).nextToken();
+      }
+      else if (line.startsWith("ncbieaa"))
+      {
+        String aminos = line.substring(line.indexOf(QUOTE) + 1,
+                line.lastIndexOf(QUOTE));
+        if (aminos.length() != NUCS_COUNT_CUBED) // 4 * 4 * 4 combinations
+        {
+          Cache.log.error("wrong data length in code table: " + line);
+        }
+        else
+        {
+          for (int i = 0; i < aminos.length(); i++)
+          {
+            String peptide = String.valueOf(aminos.charAt(i));
+            char codon1 = NUCS.charAt(i / NUCS_COUNT_SQUARED);
+            char codon2 = NUCS
+                    .charAt((i % NUCS_COUNT_SQUARED) / NUCS_COUNT);
+            char codon3 = NUCS.charAt(i % NUCS_COUNT);
+            String codon = new String(
+                    new char[]
+                    { codon1, codon2, codon3 });
+            codons.put(codon, peptide);
+          }
+        }
       }
-      codons.put(codon.toUpperCase(), peptide.toUpperCase());
+      line = readLine(dataIn);
     }
+
+    registerCodeTable(id, name, codons);
+    return readLine(dataIn);
   }
 
   /**
@@ -239,38 +297,22 @@ public class GeneticCodes
       @Override
       public String translateCanonical(String codon)
       {
-        codon = codon.toUpperCase();
-        String peptide = codons.get(codon);
-        if (peptide == null)
-        {
-          /*
-           * delegate an unspecified codon to the Standard Table, 
-           * (unless this is the Standard Table!)
-           * but don't delegate ambiguity resolution
-           */
-          GeneticCodeI standardCodeTable = getStandardCodeTable();
-          if (this != standardCodeTable)
-          {
-            peptide = standardCodeTable.translateCanonical(codon);
-          }
-        }
-        return peptide;
+        return codons.get(codon.toUpperCase());
       }
 
       @Override
       public String translate(String codon)
       {
-        codon = codon.toUpperCase();
-        String peptide = translateCanonical(codon);
+        String upper = codon.toUpperCase();
+        String peptide = translateCanonical(upper);
 
         /*
          * if still not translated, check for ambiguity codes
          */
         if (peptide == null)
         {
-          peptide = getAmbiguousTranslation(codon, ambiguous, this);
+          peptide = getAmbiguousTranslation(upper, ambiguous, this);
         }
-
         return peptide;
       }
 
@@ -302,29 +344,23 @@ public class GeneticCodes
   protected String getAmbiguousTranslation(String codon,
           Map<String, String> ambiguous, GeneticCodeI codeTable)
   {
-    if (codon.length() != 3)
+    if (codon.length() != CODON_LENGTH)
     {
       return null;
     }
 
     boolean isAmbiguous = false;
-    String base1 = String.valueOf(codon.charAt(0));
-    if (ambiguityCodes.containsKey(base1))
-    {
-      isAmbiguous = true;
-      base1 = ambiguityCodes.get(base1);
-    }
-    String base2 = String.valueOf(codon.charAt(1));
-    if (ambiguityCodes.containsKey(base2))
-    {
-      isAmbiguous = true;
-      base2 = ambiguityCodes.get(base2);
-    }
-    String base3 = String.valueOf(codon.charAt(2));
-    if (ambiguityCodes.containsKey(base3))
+
+    char[][] expanded = new char[CODON_LENGTH][];
+    for (int i = 0; i < CODON_LENGTH; i++)
     {
-      isAmbiguous = true;
-      base3 = ambiguityCodes.get(base3);
+      String base = String.valueOf(codon.charAt(i));
+      if (ambiguityCodes.containsKey(base))
+      {
+        isAmbiguous = true;
+        base = ambiguityCodes.get(base);
+      }
+      expanded[i] = base.toCharArray();
     }
 
     if (!isAmbiguous)
@@ -338,11 +374,11 @@ public class GeneticCodes
      * only return the translation if they all agree, else null
      */
     String peptide = null;
-    for (char c1 : base1.toCharArray())
+    for (char c1 : expanded[0])
     {
-      for (char c2 : base2.toCharArray())
+      for (char c2 : expanded[1])
       {
-        for (char c3 : base3.toCharArray())
+        for (char c3 : expanded[2])
         {
           char[] cdn = new char[] { c1, c2, c3 };
           String possibleCodon = String.valueOf(cdn);
index d5634db..5f49092 100644 (file)
@@ -32,8 +32,8 @@ public class GeneticCodesTest
     GeneticCodes codes = GeneticCodes.getInstance();
     Iterator<GeneticCodeI> tableIterator = codes.getCodeTables().iterator();
     String[] ids = new String[] { "1", "2", "3", "4", "5", "6", "9", "10",
-        "11", "12", "13", "14", "16", "21", "22", "23", "24", "25", "26",
-        "27", "28", "29", "30", "31" };
+        "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25",
+        "26", "27", "28", "29", "30", "31" };
     for (int i = 0; i < ids.length; i++)
     {
       assertEquals(tableIterator.next().getId(), ids[i]);