+++ /dev/null
-#TODO - Remove this work around once we drop python 2.3 support
-try:
- set = set
-except NameError:
- from sets import Set as set
-
-from Bio import Alphabet
-from Bio.Alphabet import IUPAC
-from Bio.Data import IUPACData
-
-unambiguous_dna_by_name = {}
-unambiguous_dna_by_id = {}
-unambiguous_rna_by_name = {}
-unambiguous_rna_by_id = {}
-generic_by_name = {} # unambiguous DNA or RNA
-generic_by_id = {} # unambiguous DNA or RNA
-ambiguous_generic_by_name = {} # ambiguous DNA or RNA
-ambiguous_generic_by_id = {} # ambiguous DNA or RNA
-
-# standard IUPAC unambiguous codons
-standard_dna_table = None
-standard_rna_table = None
-
-# In the future, the back_table could return a statistically
-# appropriate distribution of codons, so do not cache the results of
-# back_table lookups!
-
-class TranslationError(Exception):
- pass
-
-class CodonTable:
- nucleotide_alphabet = Alphabet.generic_nucleotide
- protein_alphabet = Alphabet.generic_protein
-
- forward_table = {} # only includes codons which actually code
- back_table = {} # for back translations
- start_codons = []
- stop_codons = []
- # Not always called from derived classes!
- def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
- protein_alphabet = protein_alphabet,
- forward_table = forward_table, back_table = back_table,
- start_codons = start_codons, stop_codons = stop_codons):
- self.nucleotide_alphabet = nucleotide_alphabet
- self.protein_alphabet = protein_alphabet
- self.forward_table = forward_table
- self.back_table = back_table
- self.start_codons = start_codons
- self.stop_codons = stop_codons
-
- def __str__(self) :
- """Returns a simple text representation of the codon table
-
- e.g.
- >>> import Bio.Data.CodonTable
- >>> print Bio.Data.CodonTable.standard_dna_table
- >>> print Bio.Data.CodonTable.generic_by_id[1]"""
-
- if self.id :
- answer = "Table %i" % self.id
- else :
- answer = "Table ID unknown"
- if self.names :
- answer += " " + ", ".join(filter(None, self.names))
-
- #Use the main four letters (and the conventional ordering)
- #even for ambiguous tables
- letters = self.nucleotide_alphabet.letters
- if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
- or (letters is not None and "T" in letters) :
- letters = "TCAG"
- else :
- #Should be either RNA or generic nucleotides,
- #e.g. Bio.Data.CodonTable.generic_by_id[1]
- letters = "UCAG"
-
- #Build the table...
- answer=answer + "\n\n |" + "|".join( \
- [" %s " % c2 for c2 in letters] \
- ) + "|"
- answer=answer + "\n--+" \
- + "+".join(["---------" for c2 in letters]) + "+--"
- for c1 in letters :
- for c3 in letters :
- line = c1 + " |"
- for c2 in letters :
- codon = c1+c2+c3
- line = line + " %s" % codon
- if codon in self.stop_codons :
- line = line + " Stop|"
- else :
- try :
- amino = self.forward_table[codon]
- except KeyError :
- amino = "?"
- except TranslationError :
- amino = "?"
- if codon in self.start_codons :
- line = line + " %s(s)|" % amino
- else :
- line = line + " %s |" % amino
- line = line + " " + c3
- answer = answer + "\n"+ line
- answer=answer + "\n--+" \
- + "+".join(["---------" for c2 in letters]) + "+--"
- return answer
-
-def make_back_table(table, default_stop_codon):
- # ONLY RETURNS A SINGLE CODON
- # Do the sort so changes in the hash implementation won't affect
- # the result when one amino acid is coded by more than one codon.
- back_table = {}
- keys = table.keys() ; keys.sort()
- for key in keys:
- back_table[table[key]] = key
- back_table[None] = default_stop_codon
- return back_table
-
-
-class NCBICodonTable(CodonTable):
- nucleotide_alphabet = Alphabet.generic_nucleotide
- protein_alphabet = IUPAC.protein
-
- def __init__(self, id, names, table, start_codons, stop_codons):
- self.id = id
- self.names = names
- self.forward_table = table
- self.back_table = make_back_table(table, stop_codons[0])
- self.start_codons = start_codons
- self.stop_codons = stop_codons
-
-
-class NCBICodonTableDNA(NCBICodonTable):
- nucleotide_alphabet = IUPAC.unambiguous_dna
-
-class NCBICodonTableRNA(NCBICodonTable):
- nucleotide_alphabet = IUPAC.unambiguous_rna
-
-
-
-def register_ncbi_table(name, alt_name, id,
- table, start_codons, stop_codons):
- names = name.split("; ")
-
- dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
- stop_codons)
- # replace all T's with U's for the RNA tables
- rna_table = {}
- generic_table = {}
- for codon, val in table.items():
- generic_table[codon] = val
- codon = codon.replace("T", "U")
- generic_table[codon] = val
- rna_table[codon] = val
- rna_start_codons = []
- generic_start_codons = []
- for codon in start_codons:
- generic_start_codons.append(codon)
- codon = codon.replace("T", "U")
- generic_start_codons.append(codon)
- rna_start_codons.append(codon)
- rna_stop_codons = []
- generic_stop_codons = []
- for codon in stop_codons:
- generic_stop_codons.append(codon)
- codon = codon.replace("T", "U")
- generic_stop_codons.append(codon)
- rna_stop_codons.append(codon)
-
- generic = NCBICodonTable(id, names + [alt_name], generic_table,
- generic_start_codons, generic_stop_codons)
- rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
- rna_start_codons, rna_stop_codons)
-
- if id == 1:
- global standard_dna_table, standard_rna_table
- standard_dna_table = dna
- standard_rna_table = rna
-
- unambiguous_dna_by_id[id] = dna
- unambiguous_rna_by_id[id] = rna
- generic_by_id[id] = generic
-
- if alt_name is not None:
- names.append(alt_name)
-
- for name in names:
- unambiguous_dna_by_name[name] = dna
- unambiguous_rna_by_name[name] = rna
- generic_by_name[name] = generic
-
-### These tables created from the data file
-### ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
-### using the following:
-##import re
-##for line in open("gc.prt").readlines():
-## if line[:2] == " {":
-## names = []
-## id = None
-## aa = None
-## start = None
-## bases = []
-## elif line[:6] == " name":
-## names.append(re.search('"([^"]*)"', line).group(1))
-## elif line[:8] == " name":
-## names.append(re.search('"(.*)$', line).group(1))
-## elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
-## names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
-## elif line[:4] == " id":
-## id = int(re.search('(\d+)', line).group(1))
-## elif line[:10] == " ncbieaa ":
-## aa = line[12:12+64]
-## elif line[:10] == " sncbieaa":
-## start = line[12:12+64]
-## elif line[:9] == " -- Base":
-## bases.append(line[12:12+64])
-## elif line[:2] == " }":
-## assert names != [] and id is not None and aa is not None
-## assert start is not None and bases != []
-## if len(names) == 1:
-## names.append(None)
-## print "register_ncbi_table(name = %s," % repr(names[0])
-## print " alt_name = %s, id = %d", % \
-## (repr(names[1]), id)
-## print " table = {"
-## s = " "
-## for i in range(64):
-## if aa[i] != "*":
-## t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
-## bases[2][i], aa[i])
-## if len(s) + len(t) > 75:
-## print s
-## s = " " + t
-## else:
-## s = s + t
-## print s, "},"
-
-## s = " stop_codons = ["
-## for i in range(64):
-## if aa[i] == "*":
-## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
-## if len(s) + len(t) > 75:
-## print s
-## s = " " + t
-## else:
-## s = s + t
-## print s, "],"
-
-## s = " start_codons = ["
-## for i in range(64):
-## if start[i] == "M":
-## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
-## if len(s) + len(t) > 75:
-## print s
-## s = " " + t
-## else:
-## s = s + t
-## print s, "]"
-## print " )"
-## elif line[:2] == "--" or line == "\n" or line == "}\n" or \
-## line == 'Genetic-code-table ::= {\n':
-## pass
-## else:
-## raise Exception("Unparsed: " + repr(line))
-
-register_ncbi_table(name = 'Standard',
- alt_name = 'SGC0', id = 1,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
- 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
- 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
- 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
- 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
- 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
- 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
- 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
- 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
- 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
- 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', 'TGA', ],
- start_codons = [ 'TTG', 'CTG', 'ATG', ]
- )
-register_ncbi_table(name = 'Vertebrate Mitochondrial',
- alt_name = 'SGC1', id = 2,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V',
- 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A',
- 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E',
- 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
- start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
- )
-register_ncbi_table(name = 'Yeast Mitochondrial',
- alt_name = 'SGC2', id = 3,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T',
- 'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
- 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
- alt_name = 'SGC3', id = 4,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
- 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
- 'ATA', 'ATG', 'GTG', ]
- )
-register_ncbi_table(name = 'Invertebrate Mitochondrial',
- alt_name = 'SGC4', id = 5,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
- 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
- 'GTG', ]
- )
-register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
- alt_name = 'SGC5', id = 6,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
- 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
- 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
- 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
- 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
- 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
- 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
- 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
- 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
- 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
- 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TGA', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Echinoderm Mitochondrial',
- alt_name = 'SGC8', id = 9,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
- 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Euplotid Nuclear',
- alt_name = 'SGC9', id = 10,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
- 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Bacterial',
- alt_name = None, id = 11,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
- 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
- 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
- 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
- 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
- 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
- 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
- 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
- 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
- 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
- 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', 'TGA', ],
- start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
- 'ATG', 'GTG', ]
- )
-register_ncbi_table(name = 'Alternative Yeast Nuclear',
- alt_name = None, id = 12,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
- 'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
- 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
- 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
- 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
- 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
- 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
- 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
- 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
- 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
- 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', 'TGA', ],
- start_codons = [ 'CTG', 'ATG', ]
- )
-register_ncbi_table(name = 'Ascidian Mitochondrial',
- alt_name = None, id = 13,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G',
- 'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TAG', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Flatworm Mitochondrial',
- alt_name = None, id = 14,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
- 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
- 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
- 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
- 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
- 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
- 'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
- 'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
- 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
- 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
- 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAG', ],
- start_codons = [ 'ATG', ]
- )
-register_ncbi_table(name = 'Blepharisma Macronuclear',
- alt_name = None, id = 15,
- table = {
- 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
- 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
- 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L',
- 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
- 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
- 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
- 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
- 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
- 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
- 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
- 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
- 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
- 'GGA': 'G', 'GGG': 'G', },
- stop_codons = [ 'TAA', 'TGA', ],
- start_codons = [ 'ATG', ]
- )
-
-######### Deal with ambiguous forward translations
-
-class AmbiguousCodonTable(CodonTable):
- def __init__(self, codon_table,
- ambiguous_nucleotide_alphabet,
- ambiguous_nucleotide_values,
- ambiguous_protein_alphabet,
- ambiguous_protein_values):
- CodonTable.__init__(self,
- ambiguous_nucleotide_alphabet,
- ambiguous_protein_alphabet,
- AmbiguousForwardTable(codon_table.forward_table,
- ambiguous_nucleotide_values,
- ambiguous_protein_values),
- codon_table.back_table,
-
- # These two are WRONG! I need to get the
- # list of ambiguous codons which code for
- # the stop codons XXX
- list_ambiguous_codons(codon_table.start_codons, ambiguous_nucleotide_values),
- list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values)
- )
- self._codon_table = codon_table
-
- # Be sneaky and forward attribute lookups to the original table.
- # This lets us get the names, if the original table is an NCBI
- # table.
- def __getattr__(self, name):
- return getattr(self._codon_table, name)
-
-def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
- c1, c2, c3 = codon
- x1 = ambiguous_nucleotide_values[c1]
- x2 = ambiguous_nucleotide_values[c2]
- x3 = ambiguous_nucleotide_values[c3]
- possible = {}
- stops = []
- for y1 in x1:
- for y2 in x2:
- for y3 in x3:
- try:
- possible[forward_table[y1+y2+y3]] = 1
- except KeyError:
- # If tripping over a stop codon
- stops.append(y1+y2+y3)
- if stops:
- if possible.keys():
- raise TranslationError("ambiguous codon '%s' codes " % codon \
- + "for both proteins and stop codons")
- # This is a true stop codon - tell the caller about it
- raise KeyError(codon)
- return possible.keys()
-
-def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
- """Extends a codon list to include all possible ambigous codons.
-
- e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
- ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']
-
- Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
- Thus only two more codons are added in the following:
-
- e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TRA', 'TAR']
-
- Returns a new (longer) list of codon strings.
- """
-
- #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
- #This will generate things like 'TRR' from ['TAG', 'TGA'], which
- #we don't want to include:
- c1_list = [letter for (letter, meanings) \
- in ambiguous_nucleotide_values.iteritems() \
- if set([codon[0] for codon in codons]).issuperset(set(meanings))]
- c2_list = [letter for (letter, meanings) \
- in ambiguous_nucleotide_values.iteritems() \
- if set([codon[1] for codon in codons]).issuperset(set(meanings))]
- c3_list = [letter for (letter, meanings) \
- in ambiguous_nucleotide_values.iteritems() \
- if set([codon[2] for codon in codons]).issuperset(set(meanings))]
- set2 = set([codon[1] for codon in codons])
- set3 = set([codon[2] for codon in codons])
- candidates = set([c1+c2+c3 for c1 in c1_list for c2 in c2_list for c3 in c3_list])
- candidates.difference_update(codons)
- answer = codons[:] #copy
- #print "Have %i new candidates" % len(candidates)
- for ambig_codon in candidates :
- wanted = True
- #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
- for codon in [c1+c2+c3 \
- for c1 in ambiguous_nucleotide_values[ambig_codon[0]] \
- for c2 in ambiguous_nucleotide_values[ambig_codon[1]] \
- for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]:
- if codon not in codons :
- #This ambiguous codon can code for a non-stop, exclude it!
- wanted=False
- #print "Rejecting %s" % ambig_codon
- continue
- if wanted :
- answer.append(ambig_codon)
- return answer
-assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA']
-assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA']
-assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR']
-assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR']
-assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']
-
-# Forward translation is "onto", that is, any given codon always maps
-# to the same protein, or it doesn't map at all. Thus, I can build
-# off of an existing table to produce the ambiguous mappings.
-#
-# This handles the general case. Perhaps it's overkill?
-# >>> t = CodonTable.ambiguous_dna_by_id[1]
-# >>> t.forward_table["AAT"]
-# 'N'
-# >>> t.forward_table["GAT"]
-# 'D'
-# >>> t.forward_table["RAT"]
-# 'B'
-# >>> t.forward_table["YTA"]
-# 'L'
-
-class AmbiguousForwardTable:
- def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
- self.forward_table = forward_table
-
- self.ambiguous_nucleotide = ambiguous_nucleotide
- self.ambiguous_protein = ambiguous_protein
-
- inverted = {}
- for name, val in ambiguous_protein.items():
- for c in val:
- x = inverted.get(c, {})
- x[name] = 1
- inverted[c] = x
- for name, val in inverted.items():
- inverted[name] = val.keys()
- self._inverted = inverted
-
- self._cache = {}
-
- def get(self, codon, failobj = None):
- try:
- return self.__getitem__(codon)
- except KeyError:
- return failobj
-
- def __getitem__(self, codon):
- try:
- x = self._cache[codon]
- except KeyError:
- pass
- else:
- if x is TranslationError:
- raise TranslationError(codon) # no unique translation
- if x is KeyError:
- raise KeyError(codon) # it's a stop codon
- return x
- try:
- x = self.forward_table[codon]
- self._cache[codon] = x
- return x
- except KeyError:
- pass
-
- # XXX Need to make part of this into a method which returns
- # a list of all possible encodings for a codon!
- try:
- possible = list_possible_proteins(codon,
- self.forward_table,
- self.ambiguous_nucleotide)
- except KeyError:
- self._cache[codon] = KeyError
- raise KeyError(codon) # stop codon
- except TranslationError:
- self._cache[codon] = TranslationError
- raise TranslationError(codon) # does not code
- assert len(possible) > 0, "unambiguous codons must code"
-
- # Hah! Only one possible protein, so use it
- if len(possible) == 1:
- self._cache[codon] = possible[0]
- return possible[0]
-
- # See if there's an ambiguous protein encoding for the multiples.
- # Find residues which exist in every coding set.
- ambiguous_possible = {}
- for amino in possible:
- for term in self._inverted[amino]:
- ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1
-
- n = len(possible)
- possible = []
- for amino, val in ambiguous_possible.items():
- if val == n:
- possible.append(amino)
-
- # No amino acid encoding for the results
- if len(possible) == 0:
- self._cache[codon] = TranslationError
- raise TranslationError(codon) # no valid translation
-
- # All of these are valid, so choose one
- # To be unique, sort by smallet ambiguity then alphabetically
- # Can get this if "X" encodes for everything.
- def _sort(x, y, table = self.ambiguous_protein):
- a = cmp(len(table[x]), len(table[y]))
- if a == 0:
- return cmp(x, y)
- return a
- possible.sort(_sort)
-
- x = possible[0]
- self._cache[codon] = x
- return x
-
-#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
-ambiguous_dna_by_name = {}
-for key, val in unambiguous_dna_by_name.items():
- ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
- IUPAC.ambiguous_dna,
- IUPACData.ambiguous_dna_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-ambiguous_dna_by_id = {}
-for key, val in unambiguous_dna_by_id.items():
- ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
- IUPAC.ambiguous_dna,
- IUPACData.ambiguous_dna_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-
-ambiguous_rna_by_name = {}
-for key, val in unambiguous_rna_by_name.items():
- ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
- IUPAC.ambiguous_rna,
- IUPACData.ambiguous_rna_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-ambiguous_rna_by_id = {}
-for key, val in unambiguous_rna_by_id.items():
- ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
- IUPAC.ambiguous_rna,
- IUPACData.ambiguous_rna_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-
-#The following isn't very elegant, but seems to work nicely.
-_merged_values = dict(IUPACData.ambiguous_rna_values.iteritems())
-_merged_values["T"] = "U"
-
-for key, val in generic_by_name.items():
- ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
- Alphabet.NucleotideAlphabet(),
- _merged_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-
-for key, val in generic_by_id.items():
- ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
- Alphabet.NucleotideAlphabet(),
- _merged_values,
- IUPAC.extended_protein,
- IUPACData.extended_protein_values)
-del _merged_values
-del key, val
-
-#Basic sanity test,
-for n in ambiguous_generic_by_id.keys() :
- assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
- assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
- assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
- #R = A or G, so URR = UAA or UGA / TRA = TAA or TGA = stop codons
- if "UAA" in unambiguous_rna_by_id[n].stop_codons \
- and "UGA" in unambiguous_rna_by_id[n].stop_codons :
- try :
- print ambiguous_dna_by_id[n].forward_table["TRA"]
- assert False, "Should be a stop only"
- except KeyError :
- pass
- assert "URA" in ambiguous_generic_by_id[n].stop_codons
- assert "URA" in ambiguous_rna_by_id[n].stop_codons
- assert "TRA" in ambiguous_generic_by_id[n].stop_codons
- assert "TRA" in ambiguous_dna_by_id[n].stop_codons
-del n
-assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
-assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
-assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons