From af65be61ff37a024b694150efed2c6c9b81382ef Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 19 Mar 2018 15:40:00 +0000 Subject: [PATCH] JAL-2620 parse genetic code tables from standard file format --- resources/AmbiguityCodes.dat | 13 + resources/GeneticCodes.dat | 493 ++++++++++++++++++--------- src/jalview/analysis/GeneticCodes.java | 276 ++++++++------- test/jalview/analysis/GeneticCodesTest.java | 4 +- 4 files changed, 498 insertions(+), 288 deletions(-) create mode 100644 resources/AmbiguityCodes.dat diff --git a/resources/AmbiguityCodes.dat b/resources/AmbiguityCodes.dat new file mode 100644 index 0000000..9372c03 --- /dev/null +++ b/resources/AmbiguityCodes.dat @@ -0,0 +1,13 @@ +# source: IUPAC codes as per http://www.insdc.org/documents/feature_table.html#7.4.1 +DNA +R AG +Y TC +W AT +S GC +M AC +K GT +H ATC +B GTC +V GAC +D GAT +N GATC diff --git a/resources/GeneticCodes.dat b/resources/GeneticCodes.dat index 4a739b7..ca1ed99 100644 --- a/resources/GeneticCodes.dat +++ b/resources/GeneticCodes.dat @@ -1,166 +1,327 @@ -# -# Genetic code translation tables -# Standard code comes first -# Other codes only list deviations from the standard -# Columns are tab separated -# source: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi (July 2017) -# -Ambiguity Codes -R AG -Y TC -W AT -S GC -M AC -K GT -H ATC -B GTC -V GAC -D GAT -N GATC -Table 1 Standard -AAA K -AAG K -AAC N -AAT N -CAA Q -CAG Q -CAC H -CAT H -GAA E -GAG E -GAC D -GAT D -TAC Y -TAT Y -ACA T -ACC T -ACT T -ACG T -CCA P -CCG P -CCC P -CCT P -GCA A -GCG A -GCC A -GCT A -TCA S -TCG S -TCC S -TCT S -AGC S -AGT S -AGA R -AGG R -CGA R -CGG R -CGC R -CGT R -GGA G -GGG G -GGC G -GGT G -TGA * -TAA * -TAG * -TGG W -TGC C -TGT C -ATA I -ATC I -ATT I -ATG M -CTA L -CTG L -CTC L -CTT L -TTA L -TTG L -GTA V -GTG V -GTC V -GTT V -TTC F -TTT F -Table 2 Vertebrate Mitochondrial -AGA * # R -AGG * # R -ATA M # I -TGA W # * -Table 3 Yeast Mitochondrial -ATA M # I -CTT T # L -CTC T # L -CTA T # L -CTG T # L -TGA W # * -Table 4 Mold, Protozoan, and Coelenterate Mitochondrial -TGA W # * -Table 5 Invertebrate Mitochondrial -AGA S # R -AGG S # R -ATA M # I -TGA W # * -Table 6 Ciliate, Dasycladacean and Hexamita Nuclear -TAA Q # * -TAG Q # * -Table 9 Echinoderm and Flatworm Mitochondrial -AAA N # K -AGA S # R -AGG S # R -TGA W # * -Table 10 Euplotid Nuclear -TGA C # * -Table 11 Bacterial, Archaeal and Plant Plastid -Table 12 Alternative Yeast Nuclear -CTG S # L -Table 13 Ascidian Mitochondrial -AGA G # R -AGG G # R -ATA M # I -TGA W # * -Table 14 Alternative Flatworm Mitochondrial -AAA N # K -AGA S # R -AGG S # R -TAA Y # * -TGA W # * -Table 16 Chlorophycean Mitochondrial -TAG L # * -Table 21 Trematode Mitochondrial -TGA W # * -ATA M # I -AGA S # R -AGG S # R -AAA N # K -Table 22 Scenedesmus obliquus Mitochondrial -TCA * # S -TAG L # * -Table 23 Thraustochytrium Mitochondrial -TTA * # L -Table 24 Pterobranchia Mitochondrial -AGA S # R -AGG K # R -TGA W # * -Table 25 Candidate Division SR1 and Gracilibacteria -TGA G # * -Table 26 Pachysolen tannophilus Nuclear -CTG A # L -Table 27 Karyorelict Nuclear -TAG Q # * -TAA Q # * -TGA W # or STOP # * -Table 28 Condylostoma Nuclear -TAA Q # or STOP # * -TAG Q # or STOP # * -TGA W # or STOP # * -Table 29 Mesodinium Nuclear -TAA Y # * -TAG Y # * -Table 30 Peritrich Nuclear -TAA E # * -TAG E # * -Table 31 Blastocrithidia Nuclear -TGA W # * -TAG E # or STOP # * -TAA E # or STOP # * +-- source: ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt (19th March 2018) +-- SGC3 edited so name is all on one line +--************************************************************************** +-- This is the NCBI genetic code table +-- Initial base data set from Andrzej Elzanowski while at PIR International +-- Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI +-- Base 1-3 of each codon have been added as comments to facilitate +-- readability at the suggestion of Peter Rice, EMBL +-- Later additions by Taxonomy Group staff at NCBI +-- +-- Version 4.2 +-- Added Karyorelict nuclear genetic code 27 +-- Added Condylostoma nuclear genetic code 28 +-- Added Mesodinium nuclear genetic code 29 +-- Added Peritrich nuclear genetic code 30 +-- Added Blastocrithidia nuclear genetic code 31 +-- +-- Version 4.1 +-- Added Pachysolen tannophilus nuclear genetic code 26 +-- +-- Version 4.0 +-- Updated version to reflect numerous undocumented changes: +-- Corrected start codons for genetic code 25 +-- Name of new genetic code is Candidate Division SR1 and Gracilibacteria +-- Added candidate division SR1 nuclear genetic code 25 +-- Added GTG as start codon for genetic code 24 +-- Corrected Pterobranchia Mitochondrial genetic code (24) +-- Added genetic code 24, Pterobranchia Mitochondrial +-- Genetic code 11 is now Bacterial, Archaeal and Plant Plastid +-- Fixed capitalization of mitochondrial in codes 22 and 23 +-- Added GTG, ATA, and TTG as alternative start codons to code 13 +-- +-- Version 3.9 +-- Code 14 differs from code 9 only by translating UAA to Tyr rather than +-- STOP. A recent study (Telford et al, 2000) has found no evidence that +-- the codon UAA codes for Tyr in the flatworms, but other opinions exist. +-- There are very few GenBank records that are translated with code 14, +-- but a test translation shows that retranslating these records with code +-- 9 can cause premature terminations. Therefore, GenBank will maintain +-- code 14 until further information becomes available. +-- +-- Version 3.8 +-- Added GTG start to Echinoderm mitochondrial code, code 9 +-- +-- Version 3.7 +-- Added code 23 Thraustochytrium mitochondrial code +-- formerly OGMP code 93 +-- submitted by Gertraude Berger, Ph.D. +-- +-- Version 3.6 +-- Added code 22 TAG-Leu, TCA-stop +-- found in mitochondrial DNA of Scenedesmus obliquus +-- submitted by Gertraude Berger, Ph.D. +-- Organelle Genome Megasequencing Program, Univ Montreal +-- +-- Version 3.5 +-- Added code 21, Trematode Mitochondrial +-- (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990) +-- Added code 16, Chlorophycean Mitochondrial +-- (TAG can translated to Leucine instaed to STOP in chlorophyceans +-- and fungi) +-- +-- Version 3.4 +-- Added CTG,TTG as allowed alternate start codons in Standard code. +-- Prats et al. 1989, Hann et al. 1992 +-- +-- Version 3.3 - 10/13/95 +-- Added alternate intiation codon ATC to code 5 +-- based on complete mitochondrial genome of honeybee +-- Crozier and Crozier (1993) +-- +-- Version 3.2 - 6/24/95 +-- Code Comments +-- 10 Alternative Ciliate Macronuclear renamed to Euplotid Macro... +-- 15 Blepharisma Macro.. code added +-- 5 Invertebrate Mito.. GTG allowed as alternate initiator +-- 11 Eubacterial renamed to Bacterial as most alternate starts +-- have been found in Archea +-- +-- +-- Version 3.1 - 1995 +-- Updated as per Andrzej Elzanowski at NCBI +-- Complete documentation in NCBI toolkit documentation +-- Note: 2 genetic codes have been deleted +-- +-- Old id Use id - Notes +-- +-- id 7 id 4 - Kinetoplast code now merged in code id 4 +-- id 8 id 1 - all plant chloroplast differences due to RNA edit +-- +--************************************************************************* + +Genetic-code-table ::= { + { + name "Standard" , + name "SGC0" , + id 1 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M------**--*----M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Vertebrate Mitochondrial" , + name "SGC1" , + id 2 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", + sncbieaa "----------**--------------------MMMM----------**---M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Yeast Mitochondrial" , + name "SGC2" , + id 3 , + ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**----------------------MM----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" , + name "SGC3" , + id 4 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--MM------**-------M------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Invertebrate Mitochondrial" , + name "SGC4" , + id 5 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", + sncbieaa "---M------**--------------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , + name "SGC5" , + id 6 , + ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , + name "SGC8" , + id 9 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "----------**-----------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Euplotid Nuclear" , + name "SGC9" , + id 10 , + ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**-----------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Bacterial, Archaeal and Plant Plastid" , + id 11 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M------**--*----M------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Yeast Nuclear" , + id 12 , + ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**--*----M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ascidian Mitochondrial" , + id 13 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", + sncbieaa "---M------**----------------------MM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Flatworm Mitochondrial" , + id 14 , + ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------*-----------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Blepharisma Macronuclear" , + id 15 , + ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------*---*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Chlorophycean Mitochondrial" , + id 16 , + ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------*---*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Trematode Mitochondrial" , + id 21 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "----------**-----------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Scenedesmus obliquus Mitochondrial" , + id 22 , + ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "------*---*---*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Thraustochytrium Mitochondrial" , + id 23 , + ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--*-------**--*-----------------M--M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Pterobranchia Mitochondrial" , + id 24 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG", + sncbieaa "---M------**-------M---------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Candidate Division SR1 and Gracilibacteria" , + id 25 , + ncbieaa "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M------**-----------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Pachysolen tannophilus Nuclear" , + id 26 , + ncbieaa "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**--*----M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Karyorelict Nuclear" , + id 27 , + ncbieaa "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Condylostoma Nuclear" , + id 28 , + ncbieaa "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**--*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Mesodinium Nuclear" , + id 29 , + ncbieaa "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Peritrich Nuclear" , + id 30 , + ncbieaa "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------*--------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Blastocrithidia Nuclear" , + id 31 , + ncbieaa "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------**-----------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } +} diff --git a/src/jalview/analysis/GeneticCodes.java b/src/jalview/analysis/GeneticCodes.java index 88d4e69..d07253e 100644 --- a/src/jalview/analysis/GeneticCodes.java +++ b/src/jalview/analysis/GeneticCodes.java @@ -1,5 +1,7 @@ package jalview.analysis; +import jalview.bin.Cache; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -7,6 +9,7 @@ import java.io.InputStreamReader; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; +import java.util.StringTokenizer; /** * A singleton that provides instances of genetic code translation tables @@ -14,8 +17,26 @@ import java.util.Map; * @author gmcarstairs * @see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi */ -public class GeneticCodes +public final class GeneticCodes { + private static final int CODON_LENGTH = 3; + + private static final String QUOTE = "\""; + + /* + * nucleotides as ordered in data file + */ + private static final String NUCS = "TCAG"; + + private static final int NUCS_COUNT = NUCS.length(); + + private static final int NUCS_COUNT_SQUARED = NUCS_COUNT * NUCS_COUNT; + + private static final int NUCS_COUNT_CUBED = NUCS_COUNT * NUCS_COUNT + * NUCS_COUNT; + + private static final String AMBIGUITY_CODES_FILE = "/AmbiguityCodes.dat"; + private static final String RESOURCE_FILE = "/GeneticCodes.dat"; private static GeneticCodes instance = new GeneticCodes(); @@ -28,16 +49,6 @@ public class GeneticCodes private Map codeTables; /** - * Returns the singleton instance of this class - * - * @return - */ - public static GeneticCodes getInstance() - { - return instance; - } - - /** * Private constructor enforces singleton */ private GeneticCodes() @@ -51,11 +62,22 @@ public class GeneticCodes * so we can assume the Standard Code Table is the first */ codeTables = new LinkedHashMap<>(); + loadAmbiguityCodes(AMBIGUITY_CODES_FILE); loadCodes(RESOURCE_FILE); } }; /** + * Returns the singleton instance of this class + * + * @return + */ + public static GeneticCodes getInstance() + { + return instance; + } + + /** * Returns the known code tables, in order of loading. * * @return @@ -97,53 +119,67 @@ public class GeneticCodes InputStream is = getClass().getResourceAsStream(fileName); BufferedReader dataIn = new BufferedReader(new InputStreamReader(is)); - String line = loadAmbiguityCodes(dataIn); + /* + * skip comments and start of table + */ + String line = ""; + while (line != null && !line.startsWith("Genetic-code-table")) + { + line = readLine(dataIn); + } + line = readLine(dataIn); - do + while (line.startsWith("{")) { - line = loadOneTable(line, dataIn); - } while (line != null); - } catch (IOException e) + line = loadOneTable(dataIn); + } + } catch (IOException | NullPointerException e) { - System.err.println("Error reading genetic codes data file: " + Cache.log.error( + "Error reading genetic codes data file: " + e.getMessage()); } } /** - * Reads for header line "Ambiguity Codes" and saves following data up to the - * first "Table". Returns the next ("Table") line. + * Reads and saves Nucleotide ambiguity codes from a data file. The file may + * include comment lines (starting with #), a header 'DNA', and one line per + * ambiguity code, for example: + *

+ * R<tab>AG + *

+ * means that R is an ambiguity code meaning "A or G" * - * @param dataIn - * @return - * @throws IOException + * @param fileName */ - protected String loadAmbiguityCodes(BufferedReader dataIn) - throws IOException + protected void loadAmbiguityCodes(String fileName) { - /* - * get first non-comment line - */ - String line = readLine(dataIn); - if (line == null || !line.toUpperCase().startsWith("AMBIGUITY")) - { - return line; - } - while (true) + try { - line = readLine(dataIn); - if (line == null || line.toUpperCase().startsWith("TABLE")) + InputStream is = getClass().getResourceAsStream(fileName); + BufferedReader dataIn = new BufferedReader(new InputStreamReader(is)); + String line = ""; + while (line != null) { - return line; + line = readLine(dataIn); + if (line != null && !"DNA".equals(line.toUpperCase())) + { + String[] tokens = line.split("\\t"); + ambiguityCodes.put(tokens[0].toUpperCase(), + tokens[1].toUpperCase()); + } } - String[] tokens = line.split("\\t"); - ambiguityCodes.put(tokens[0].toUpperCase(), tokens[1].toUpperCase()); + } catch (IOException e) + { + Cache.log.error( + "Error reading nucleotide ambiguity codes data file: " + + e.getMessage()); } } /** - * Reads up to and returns the next non-comment line. Comment lines start with - * a #. + * Reads up to and returns the next non-comment line, trimmed. Comment lines + * start with a #. Returns null at end of file. * * @param dataIn * @return @@ -156,63 +192,85 @@ public class GeneticCodes { line = readLine(dataIn); } - return line; + return line == null ? null : line.trim(); } /** - * Reads the next lines of the data file describing one translation table, and - * creates an instance of GeneticCodeI for it. Returns the next line of the - * file (or null at end of file). + * Reads the lines of the data file describing one translation table, and + * creates and stores an instance of GeneticCodeI. Returns the '{' line + * starting the next table, or the '}' line at end of all tables. Data format + * is + * + *

+   * {
+   *   name "Vertebrate Mitochondrial" ,
+   *   name "SGC1" ,
+   *   id 2 ,
+   *   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
+   *   sncbieaa "----------**--------------------MMMM----------**---M------------"
+   *   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+   *   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+   *   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+   * },
+   * 
* - * @param nextLine + * of which we parse the first name, the id, and the ncbieaa translations for + * codons as ordered by the Base1/2/3 lines. Note Base1/2/3 are included for + * readability and are in a fixed order, these are not parsed. The sncbieaa + * line marks alternative start codons, these are not parsed. * * @param dataIn * @return * @throws IOException */ - protected String loadOneTable(String nextLine, BufferedReader dataIn) throws IOException + protected String loadOneTable(BufferedReader dataIn) throws IOException { - String line = nextLine; - if (line == null) - { - return null; - } - - /* - * next line should be tab-delimited "Table", id and description - */ - String[] tokens = line.split("\\t"); - String id = tokens[1]; - String name = tokens[2]; - - /* - * followed by codon translations - * - the full set for the first (Standard) code - * - variations (if any) for other codes - */ + String name = null; + String id = null; Map codons = new HashMap<>(); - while (true) + + String line = readLine(dataIn); + + while (line != null && !line.startsWith("}")) { - line = readLine(dataIn); - if (line == null) + if (line.startsWith("name") && name == null) { - registerCodeTable(id, name, codons); - return null; + name = line.substring(line.indexOf(QUOTE) + 1, + line.lastIndexOf(QUOTE)); } - tokens = line.split("\\t"); - String codon = tokens[0]; - String peptide = tokens[1]; - if ("Table".equalsIgnoreCase(codon)) + else if (line.startsWith("id")) { - /* - * start of next code table - construct this one, - * and return the next line of the data file - */ - registerCodeTable(id, name, codons); - return line; + id = new StringTokenizer(line.substring(2)).nextToken(); + } + else if (line.startsWith("ncbieaa")) + { + String aminos = line.substring(line.indexOf(QUOTE) + 1, + line.lastIndexOf(QUOTE)); + if (aminos.length() != NUCS_COUNT_CUBED) // 4 * 4 * 4 combinations + { + Cache.log.error("wrong data length in code table: " + line); + } + else + { + for (int i = 0; i < aminos.length(); i++) + { + String peptide = String.valueOf(aminos.charAt(i)); + char codon1 = NUCS.charAt(i / NUCS_COUNT_SQUARED); + char codon2 = NUCS + .charAt((i % NUCS_COUNT_SQUARED) / NUCS_COUNT); + char codon3 = NUCS.charAt(i % NUCS_COUNT); + String codon = new String( + new char[] + { codon1, codon2, codon3 }); + codons.put(codon, peptide); + } + } } - codons.put(codon.toUpperCase(), peptide.toUpperCase()); + line = readLine(dataIn); } + + registerCodeTable(id, name, codons); + return readLine(dataIn); } /** @@ -239,38 +297,22 @@ public class GeneticCodes @Override public String translateCanonical(String codon) { - codon = codon.toUpperCase(); - String peptide = codons.get(codon); - if (peptide == null) - { - /* - * delegate an unspecified codon to the Standard Table, - * (unless this is the Standard Table!) - * but don't delegate ambiguity resolution - */ - GeneticCodeI standardCodeTable = getStandardCodeTable(); - if (this != standardCodeTable) - { - peptide = standardCodeTable.translateCanonical(codon); - } - } - return peptide; + return codons.get(codon.toUpperCase()); } @Override public String translate(String codon) { - codon = codon.toUpperCase(); - String peptide = translateCanonical(codon); + String upper = codon.toUpperCase(); + String peptide = translateCanonical(upper); /* * if still not translated, check for ambiguity codes */ if (peptide == null) { - peptide = getAmbiguousTranslation(codon, ambiguous, this); + peptide = getAmbiguousTranslation(upper, ambiguous, this); } - return peptide; } @@ -302,29 +344,23 @@ public class GeneticCodes protected String getAmbiguousTranslation(String codon, Map ambiguous, GeneticCodeI codeTable) { - if (codon.length() != 3) + if (codon.length() != CODON_LENGTH) { return null; } boolean isAmbiguous = false; - String base1 = String.valueOf(codon.charAt(0)); - if (ambiguityCodes.containsKey(base1)) - { - isAmbiguous = true; - base1 = ambiguityCodes.get(base1); - } - String base2 = String.valueOf(codon.charAt(1)); - if (ambiguityCodes.containsKey(base2)) - { - isAmbiguous = true; - base2 = ambiguityCodes.get(base2); - } - String base3 = String.valueOf(codon.charAt(2)); - if (ambiguityCodes.containsKey(base3)) + + char[][] expanded = new char[CODON_LENGTH][]; + for (int i = 0; i < CODON_LENGTH; i++) { - isAmbiguous = true; - base3 = ambiguityCodes.get(base3); + String base = String.valueOf(codon.charAt(i)); + if (ambiguityCodes.containsKey(base)) + { + isAmbiguous = true; + base = ambiguityCodes.get(base); + } + expanded[i] = base.toCharArray(); } if (!isAmbiguous) @@ -338,11 +374,11 @@ public class GeneticCodes * only return the translation if they all agree, else null */ String peptide = null; - for (char c1 : base1.toCharArray()) + for (char c1 : expanded[0]) { - for (char c2 : base2.toCharArray()) + for (char c2 : expanded[1]) { - for (char c3 : base3.toCharArray()) + for (char c3 : expanded[2]) { char[] cdn = new char[] { c1, c2, c3 }; String possibleCodon = String.valueOf(cdn); diff --git a/test/jalview/analysis/GeneticCodesTest.java b/test/jalview/analysis/GeneticCodesTest.java index d5634db..5f49092 100644 --- a/test/jalview/analysis/GeneticCodesTest.java +++ b/test/jalview/analysis/GeneticCodesTest.java @@ -32,8 +32,8 @@ public class GeneticCodesTest GeneticCodes codes = GeneticCodes.getInstance(); Iterator tableIterator = codes.getCodeTables().iterator(); String[] ids = new String[] { "1", "2", "3", "4", "5", "6", "9", "10", - "11", "12", "13", "14", "16", "21", "22", "23", "24", "25", "26", - "27", "28", "29", "30", "31" }; + "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31" }; for (int i = 0; i < ids.length; i++) { assertEquals(tableIterator.next().getId(), ids[i]); -- 1.7.10.2