#TODO - Remove this workaround once we drop python 2.3 support
5 from sets import Set as set
7 from Bio import Alphabet
8 from Bio.Alphabet import IUPAC
9 from Bio.Data import IUPACData
# Module-level registries of codon tables.  They start out empty and are
# filled in by register_ncbi_table(); each pair is keyed by table name and
# by numeric table id respectively.
unambiguous_dna_by_name = {}
unambiguous_dna_by_id = {}
unambiguous_rna_by_name = {}
unambiguous_rna_by_id = {}
generic_by_name = {}  # unambiguous DNA or RNA
generic_by_id = {}  # unambiguous DNA or RNA
ambiguous_generic_by_name = {}  # ambiguous DNA or RNA
ambiguous_generic_by_id = {}  # ambiguous DNA or RNA

# standard IUPAC unambiguous codons
standard_dna_table = None
standard_rna_table = None
# In the future, the back_table could return a statistically
# appropriate distribution of codons, so do not cache the results of
# back_table lookups.
class TranslationError(Exception):
    """Raised when a codon has no valid, or no unique, translation."""
    pass
class CodonTable:
    """Base class for a genetic code (codon to amino acid mapping).

    The class attributes below act as defaults.  Instances normally replace
    them, either through __init__ or, in subclasses which do not call
    __init__, by assigning the attributes directly.
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = Alphabet.generic_protein

    forward_table = {}    # only includes codons which actually code
    back_table = {}       # for back translations
    start_codons = []
    stop_codons = []

    # Not always called from derived classes!
    # NOTE: the default values are the shared class-level objects defined
    # above, so an instance created with the defaults must not mutate them
    # in place.
    def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
                 protein_alphabet = protein_alphabet,
                 forward_table = forward_table, back_table = back_table,
                 start_codons = start_codons, stop_codons = stop_codons):
        self.nucleotide_alphabet = nucleotide_alphabet
        self.protein_alphabet = protein_alphabet
        self.forward_table = forward_table
        self.back_table = back_table
        self.start_codons = start_codons
        self.stop_codons = stop_codons

    def __str__(self):
        """Returns a simple text representation of the codon table

        e.g.
        >>> import Bio.Data.CodonTable
        >>> print Bio.Data.CodonTable.standard_dna_table
        >>> print Bio.Data.CodonTable.generic_by_id[1]"""
        # The id and names attributes are not set by this base class, so
        # look them up defensively rather than assuming they exist.
        table_id = getattr(self, "id", None)
        if table_id:
            answer = "Table %i" % table_id
        else:
            answer = "Table ID unknown"
        names = getattr(self, "names", None)
        if names:
            # Skip empty entries (a missing alternative name is None).
            answer += " " + ", ".join([name for name in names if name])

        #Use the main four letters (and the conventional ordering)
        #even for ambiguous tables
        letters = self.nucleotide_alphabet.letters
        if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
        or (letters is not None and "T" in letters):
            letters = "TCAG"
        else:
            #Should be either RNA or generic nucleotides,
            #e.g. Bio.Data.CodonTable.generic_by_id[1]
            letters = "UCAG"

        # Every cell is nine characters wide so that the columns line up
        # with the nine-dash rulers.
        ruler = "\n--+" + "+".join(["---------" for c2 in letters]) + "+--"
        answer = answer + "\n\n  |" + "|".join(
            ["  %s      " % c2 for c2 in letters]) + "|"
        answer = answer + ruler
        for c1 in letters:
            for c3 in letters:
                line = c1 + " |"
                for c2 in letters:
                    codon = c1 + c2 + c3
                    line = line + " %s" % codon
                    if codon in self.stop_codons:
                        line = line + " Stop|"
                    else:
                        try:
                            amino = self.forward_table[codon]
                        except KeyError:
                            # Codon is not in the table at all.
                            amino = "?"
                        except TranslationError:
                            # Ambiguous tables raise this for codons
                            # without a usable translation.
                            amino = "?"
                        if codon in self.start_codons:
                            line = line + " %s(s)|" % amino
                        else:
                            line = line + " %s   |" % amino
                line = line + " " + c3
                answer = answer + "\n" + line
            # Close each block of four rows with a ruler.
            answer = answer + ruler
        return answer
def make_back_table(table, default_stop_codon):
    """Invert a forward table into an amino acid -> codon dictionary.

    table maps codons to amino acids.  The result maps each amino acid to
    a single codon, and maps None to default_stop_codon.
    """
    # ONLY RETURNS A SINGLE CODON
    # Do the sort so changes in the hash implementation won't affect
    # the result when one amino acid is coded by more than one codon.
    back_table = {}
    keys = list(table.keys())
    keys.sort()
    for key in keys:
        # Later (alphabetically greater) codons overwrite earlier ones.
        back_table[table[key]] = key
    back_table[None] = default_stop_codon
    return back_table
class NCBICodonTable(CodonTable):
    """Codon table built from NCBI genetic code data.

    Unlike the base class this records the table id and its list of names.
    The base class __init__ is deliberately not called; every attribute is
    assigned here.
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = IUPAC.protein

    def __init__(self, id, names, table, start_codons, stop_codons):
        self.id = id
        self.names = names
        self.forward_table = table
        # The first stop codon is used as the back translation of "stop".
        self.back_table = make_back_table(table, stop_codons[0])
        self.start_codons = start_codons
        self.stop_codons = stop_codons
class NCBICodonTableDNA(NCBICodonTable):
    """NCBI codon table using the IUPAC unambiguous DNA alphabet."""

    nucleotide_alphabet = IUPAC.unambiguous_dna
class NCBICodonTableRNA(NCBICodonTable):
    """NCBI codon table using the IUPAC unambiguous RNA alphabet."""

    nucleotide_alphabet = IUPAC.unambiguous_rna
def register_ncbi_table(name, alt_name, id,
                        table, start_codons, stop_codons):
    """Turn NCBI codon table data into objects and register them.

    name is one or more table names separated by "; ".  alt_name is an
    optional extra name (may be None).  table maps DNA codons to amino
    acids; start_codons and stop_codons are lists of DNA codons.

    Three tables are built (DNA, RNA and a generic one accepting both T
    and U codons) and stored in the module-level *_by_id and *_by_name
    dictionaries.  Table 1 is additionally stored as the standard DNA and
    RNA tables.
    """
    global standard_dna_table, standard_rna_table

    names = name.split("; ")

    dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
                            stop_codons)

    # replace all T's with U's for the RNA tables
    rna_table = {}
    generic_table = {}
    for codon, val in table.items():
        generic_table[codon] = val
        codon = codon.replace("T", "U")
        generic_table[codon] = val
        rna_table[codon] = val

    rna_start_codons = []
    generic_start_codons = []
    for codon in start_codons:
        generic_start_codons.append(codon)
        rna_codon = codon.replace("T", "U")
        # A codon without any T is its own RNA form; list it only once.
        if rna_codon != codon:
            generic_start_codons.append(rna_codon)
        rna_start_codons.append(rna_codon)

    rna_stop_codons = []
    generic_stop_codons = []
    for codon in stop_codons:
        generic_stop_codons.append(codon)
        rna_codon = codon.replace("T", "U")
        if rna_codon != codon:
            generic_stop_codons.append(rna_codon)
        rna_stop_codons.append(rna_codon)

    generic = NCBICodonTable(id, names + [alt_name], generic_table,
                             generic_start_codons, generic_stop_codons)
    rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
                            rna_start_codons, rna_stop_codons)

    # Table 1 is the 'Standard' genetic code.
    if id == 1:
        standard_dna_table = dna
        standard_rna_table = rna

    unambiguous_dna_by_id[id] = dna
    unambiguous_rna_by_id[id] = rna
    generic_by_id[id] = generic

    if alt_name is not None:
        names.append(alt_name)

    for name in names:
        unambiguous_dna_by_name[name] = dna
        unambiguous_rna_by_name[name] = rna
        generic_by_name[name] = generic
192 ### These tables created from the data file
193 ### ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
194 ### using the following:
196 ##for line in open("gc.prt").readlines():
197 ## if line[:2] == " {":
203 ## elif line[:6] == " name":
204 ## names.append(re.search('"([^"]*)"', line).group(1))
205 ## elif line[:8] == " name":
206 ## names.append(re.search('"(.*)$', line).group(1))
207 ## elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
208 ## names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
209 ## elif line[:4] == " id":
210 ## id = int(re.search('(\d+)', line).group(1))
211 ## elif line[:10] == " ncbieaa ":
212 ## aa = line[12:12+64]
213 ## elif line[:10] == " sncbieaa":
214 ## start = line[12:12+64]
215 ## elif line[:9] == " -- Base":
216 ## bases.append(line[12:12+64])
217 ## elif line[:2] == " }":
218 ## assert names != [] and id is not None and aa is not None
219 ## assert start is not None and bases != []
220 ## if len(names) == 1:
221 ## names.append(None)
222 ## print "register_ncbi_table(name = %s," % repr(names[0])
223 ## print " alt_name = %s, id = %d", % \
224 ## (repr(names[1]), id)
225 ## print " table = {"
227 ## for i in range(64):
229 ## t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
230 ## bases[2][i], aa[i])
231 ## if len(s) + len(t) > 75:
238 ## s = " stop_codons = ["
239 ## for i in range(64):
241 ## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
242 ## if len(s) + len(t) > 75:
249 ## s = " start_codons = ["
250 ## for i in range(64):
251 ## if start[i] == "M":
252 ## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
253 ## if len(s) + len(t) > 75:
260 ## elif line[:2] == "--" or line == "\n" or line == "}\n" or \
261 ## line == 'Genetic-code-table ::= {\n':
264 ## raise Exception("Unparsed: " + repr(line))
266 register_ncbi_table(name = 'Standard',
267 alt_name = 'SGC0', id = 1,
269 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
270 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
271 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
272 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
273 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
274 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
275 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
276 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
277 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
278 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
279 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
280 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
282 stop_codons = [ 'TAA', 'TAG', 'TGA', ],
283 start_codons = [ 'TTG', 'CTG', 'ATG', ]
285 register_ncbi_table(name = 'Vertebrate Mitochondrial',
286 alt_name = 'SGC1', id = 2,
288 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
289 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
290 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
291 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
292 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
293 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
294 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
295 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
296 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V',
297 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A',
298 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E',
299 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
300 stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
301 start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
303 register_ncbi_table(name = 'Yeast Mitochondrial',
304 alt_name = 'SGC2', id = 3,
306 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
307 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
308 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T',
309 'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P',
310 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
311 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
312 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
313 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
314 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
315 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
316 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
317 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
318 'GGA': 'G', 'GGG': 'G', },
319 stop_codons = [ 'TAA', 'TAG', ],
320 start_codons = [ 'ATG', ]
322 register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
323 alt_name = 'SGC3', id = 4,
325 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
326 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
327 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
328 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
329 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
330 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
331 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
332 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
333 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
334 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
335 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
336 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
337 'GGA': 'G', 'GGG': 'G', },
338 stop_codons = [ 'TAA', 'TAG', ],
339 start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
340 'ATA', 'ATG', 'GTG', ]
342 register_ncbi_table(name = 'Invertebrate Mitochondrial',
343 alt_name = 'SGC4', id = 5,
345 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
346 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
347 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
348 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
349 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
350 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
351 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
352 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
353 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
354 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
355 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
356 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
357 'GGA': 'G', 'GGG': 'G', },
358 stop_codons = [ 'TAA', 'TAG', ],
359 start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
362 register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
363 alt_name = 'SGC5', id = 6,
365 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
366 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
367 'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
368 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
369 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
370 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
371 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
372 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
373 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
374 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
375 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
376 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
377 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
378 stop_codons = [ 'TGA', ],
379 start_codons = [ 'ATG', ]
381 register_ncbi_table(name = 'Echinoderm Mitochondrial',
382 alt_name = 'SGC8', id = 9,
384 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
385 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
386 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
387 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
388 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
389 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
390 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
391 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
392 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
393 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
394 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
395 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
396 'GGA': 'G', 'GGG': 'G', },
397 stop_codons = [ 'TAA', 'TAG', ],
398 start_codons = [ 'ATG', ]
400 register_ncbi_table(name = 'Euplotid Nuclear',
401 alt_name = 'SGC9', id = 10,
403 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
404 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
405 'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L',
406 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
407 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
408 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
409 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
410 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
411 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
412 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
413 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
414 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
415 'GGA': 'G', 'GGG': 'G', },
416 stop_codons = [ 'TAA', 'TAG', ],
417 start_codons = [ 'ATG', ]
419 register_ncbi_table(name = 'Bacterial',
420 alt_name = None, id = 11,
422 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
423 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
424 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
425 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
426 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
427 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
428 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
429 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
430 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
431 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
432 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
433 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
435 stop_codons = [ 'TAA', 'TAG', 'TGA', ],
436 start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
439 register_ncbi_table(name = 'Alternative Yeast Nuclear',
440 alt_name = None, id = 12,
442 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
443 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
444 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
445 'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
446 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
447 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
448 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
449 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
450 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
451 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
452 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
453 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
455 stop_codons = [ 'TAA', 'TAG', 'TGA', ],
456 start_codons = [ 'CTG', 'ATG', ]
458 register_ncbi_table(name = 'Ascidian Mitochondrial',
459 alt_name = None, id = 13,
461 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
462 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
463 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
464 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
465 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
466 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
467 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
468 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
469 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G',
470 'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
471 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
472 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
473 'GGA': 'G', 'GGG': 'G', },
474 stop_codons = [ 'TAA', 'TAG', ],
475 start_codons = [ 'ATG', ]
477 register_ncbi_table(name = 'Flatworm Mitochondrial',
478 alt_name = None, id = 14,
480 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
481 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
482 'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
483 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
484 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
485 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
486 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
487 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
488 'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
489 'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
490 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
491 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
492 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
493 stop_codons = [ 'TAG', ],
494 start_codons = [ 'ATG', ]
496 register_ncbi_table(name = 'Blepharisma Macronuclear',
497 alt_name = None, id = 15,
499 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
500 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
501 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L',
502 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
503 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
504 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
505 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
506 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
507 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
508 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
509 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
510 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
511 'GGA': 'G', 'GGG': 'G', },
512 stop_codons = [ 'TAA', 'TGA', ],
513 start_codons = [ 'ATG', ]
516 ######### Deal with ambiguous forward translations
class AmbiguousCodonTable(CodonTable):
    """Codon table which also accepts ambiguous nucleotide letters.

    Wraps an existing (unambiguous) codon table.  Forward lookups go
    through an AmbiguousForwardTable, and the start and stop codon lists
    are extended with the ambiguous codons that can only be start or stop
    codons respectively.
    """

    def __init__(self, codon_table,
                 ambiguous_nucleotide_alphabet,
                 ambiguous_nucleotide_values,
                 ambiguous_protein_alphabet,
                 ambiguous_protein_values):
        CodonTable.__init__(self,
                            ambiguous_nucleotide_alphabet,
                            ambiguous_protein_alphabet,
                            AmbiguousForwardTable(codon_table.forward_table,
                                                  ambiguous_nucleotide_values,
                                                  ambiguous_protein_values),
                            codon_table.back_table,

                            # These two are WRONG!  I need to get the
                            # list of ambiguous codons which code for
                            # the stop codons  XXX
                            list_ambiguous_codons(codon_table.start_codons,
                                                  ambiguous_nucleotide_values),
                            list_ambiguous_codons(codon_table.stop_codons,
                                                  ambiguous_nucleotide_values)
                            )
        self._codon_table = codon_table

    # Be sneaky and forward attribute lookups to the original table.
    # This lets us get the names, if the original table is an NCBI
    # table.
    def __getattr__(self, name):
        # __getattr__ is only called when normal lookup fails.  If
        # _codon_table itself is missing (instance not yet initialised,
        # e.g. while being copied or unpickled) forwarding would recurse
        # forever, so fail cleanly instead.
        if name == "_codon_table":
            raise AttributeError(name)
        return getattr(self._codon_table, name)
def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
    """Return the amino acids an (ambiguous) three letter codon can encode.

    codon is expanded using ambiguous_nucleotide_values (letter -> string
    of concrete letters) and every concrete codon is looked up in
    forward_table.

    Returns a list of the distinct amino acids found.
    Raises KeyError if every concrete codon is missing from forward_table
    (a true stop codon), and TranslationError if only some of them are.
    """
    c1, c2, c3 = codon
    x1 = ambiguous_nucleotide_values[c1]
    x2 = ambiguous_nucleotide_values[c2]
    x3 = ambiguous_nucleotide_values[c3]
    possible = {}
    stops = []
    for y1 in x1:
        for y2 in x2:
            for y3 in x3:
                try:
                    possible[forward_table[y1+y2+y3]] = 1
                except KeyError:
                    # If tripping over a stop codon
                    stops.append(y1+y2+y3)
    if stops:
        if possible:
            raise TranslationError("ambiguous codon '%s' codes " % codon \
                                   + "for both proteins and stop codons")
        # This is a true stop codon - tell the caller about it
        raise KeyError(codon)
    # Return a real list; callers take its length and index into it.
    return list(possible.keys())
def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
    """Extends a codon list to include all possible ambiguous codons.

    e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
         ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']

    Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
    Thus only two more codons are added in the following:

    e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

    Returns a new (longer) list of codon strings.  The original codons
    come first, in their given order, followed by the added ambiguous
    codons in alphabetical order.
    """
    #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
    #This will generate things like 'TRR' from ['TAG', 'TGA'], which
    #we don't want to include:
    letter_lists = []
    for position in range(3):
        # Letters seen at this codon position ...
        seen = set([codon[position] for codon in codons])
        # ... and every (ambiguous) letter fully covered by them.
        letter_lists.append([letter for (letter, meanings)
                             in ambiguous_nucleotide_values.items()
                             if seen.issuperset(set(meanings))])
    c1_list, c2_list, c3_list = letter_lists
    candidates = set([c1+c2+c3 for c1 in c1_list
                      for c2 in c2_list for c3 in c3_list])
    candidates.difference_update(codons)
    answer = list(codons)  #copy
    # Visit the candidates in sorted order so the result does not depend
    # on set iteration order.
    ordered = list(candidates)
    ordered.sort()
    for ambig_codon in ordered:
        wanted = True
        #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
        for codon in [c1+c2+c3
                      for c1 in ambiguous_nucleotide_values[ambig_codon[0]]
                      for c2 in ambiguous_nucleotide_values[ambig_codon[1]]
                      for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]:
            if codon not in codons:
                #This ambiguous codon can code for a non-stop, exclude it!
                wanted = False
                break
        if wanted:
            answer.append(ambig_codon)
    return answer
# Import-time self checks for list_ambiguous_codons, using the IUPAC
# ambiguity tables.
assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA']
assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA']
assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR']
assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR']
assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']
622 # Forward translation is "onto", that is, any given codon always maps
623 # to the same protein, or it doesn't map at all. Thus, I can build
624 # off of an existing table to produce the ambiguous mappings.
626 # This handles the general case. Perhaps it's overkill?
627 # >>> t = CodonTable.ambiguous_dna_by_id[1]
628 # >>> t.forward_table["AAT"]
630 # >>> t.forward_table["GAT"]
632 # >>> t.forward_table["RAT"]
634 # >>> t.forward_table["YTA"]
637 class AmbiguousForwardTable:
638 def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
639 self.forward_table = forward_table
641 self.ambiguous_nucleotide = ambiguous_nucleotide
642 self.ambiguous_protein = ambiguous_protein
645 for name, val in ambiguous_protein.items():
647 x = inverted.get(c, {})
650 for name, val in inverted.items():
651 inverted[name] = val.keys()
652 self._inverted = inverted
656 def get(self, codon, failobj = None):
658 return self.__getitem__(codon)
662 def __getitem__(self, codon):
664 x = self._cache[codon]
668 if x is TranslationError:
669 raise TranslationError(codon) # no unique translation
671 raise KeyError(codon) # it's a stop codon
674 x = self.forward_table[codon]
675 self._cache[codon] = x
680 # XXX Need to make part of this into a method which returns
681 # a list of all possible encodings for a codon!
683 possible = list_possible_proteins(codon,
685 self.ambiguous_nucleotide)
687 self._cache[codon] = KeyError
688 raise KeyError(codon) # stop codon
689 except TranslationError:
690 self._cache[codon] = TranslationError
691 raise TranslationError(codon) # does not code
692 assert len(possible) > 0, "unambiguous codons must code"
694 # Hah! Only one possible protein, so use it
695 if len(possible) == 1:
696 self._cache[codon] = possible[0]
699 # See if there's an ambiguous protein encoding for the multiples.
700 # Find residues which exist in every coding set.
701 ambiguous_possible = {}
702 for amino in possible:
703 for term in self._inverted[amino]:
704 ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1
708 for amino, val in ambiguous_possible.items():
710 possible.append(amino)
712 # No amino acid encoding for the results
713 if len(possible) == 0:
714 self._cache[codon] = TranslationError
715 raise TranslationError(codon) # no valid translation
717 # All of these are valid, so choose one
718 # To be unique, sort by smallet ambiguity then alphabetically
719 # Can get this if "X" encodes for everything.
720 def _sort(x, y, table = self.ambiguous_protein):
721 a = cmp(len(table[x]), len(table[y]))
728 self._cache[codon] = x
#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
# Each unambiguous table registered so far gets an ambiguous counterpart,
# stored under the same key.
ambiguous_dna_by_name = {}
for key, val in unambiguous_dna_by_name.items():
    ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_dna_by_id = {}
for key, val in unambiguous_dna_by_id.items():
    ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

ambiguous_rna_by_name = {}
for key, val in unambiguous_rna_by_name.items():
    ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_rna_by_id = {}
for key, val in unambiguous_rna_by_id.items():
    ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

#The following isn't very elegant, but seems to work nicely.
# Start from a copy of the RNA ambiguity values and let "T" stand for "U",
# so the generic tables accept both DNA and RNA spellings.
_merged_values = dict(IUPACData.ambiguous_rna_values.items())
_merged_values["T"] = "U"

for key, val in generic_by_name.items():
    ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

for key, val in generic_by_id.items():
    ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
# Import-time sanity checks on the tables built above.
for n in ambiguous_generic_by_id.keys():
    assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
    #R = A or G, so URA = UAA or UGA / TRA = TAA or TGA = stop codons
    if "UAA" in unambiguous_rna_by_id[n].stop_codons \
    and "UGA" in unambiguous_rna_by_id[n].stop_codons:
        # Looking up a pure stop codon must raise KeyError.
        try:
            amino = ambiguous_dna_by_id[n].forward_table["TRA"]
        except KeyError:
            pass
        else:
            assert False, "Should be a stop only, got %r" % (amino,)
        assert "URA" in ambiguous_generic_by_id[n].stop_codons
        assert "URA" in ambiguous_rna_by_id[n].stop_codons
        assert "TRA" in ambiguous_generic_by_id[n].stop_codons
        assert "TRA" in ambiguous_dna_by_id[n].stop_codons

# Lookups by id and by name must agree.
assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons