#TODO - Remove this work around once we drop python 2.3 support
"""Codon tables (forward and back translation) built from the NCBI data.

The module registers, at import time, one table per NCBI genetic code in
three flavours (DNA, RNA and "generic", i.e. accepting both T and U), and
for each of those an ambiguous variant that also understands IUPAC
ambiguity letters.  The tables are reachable through the ``*_by_id`` and
``*_by_name`` dictionaries defined below.
"""
try:
    set = set
except NameError:
    from sets import Set as set

from Bio import Alphabet
from Bio.Alphabet import IUPAC
from Bio.Data import IUPACData

# Registries filled in by register_ncbi_table() and by the loops at the
# end of the module.
unambiguous_dna_by_name = {}
unambiguous_dna_by_id = {}
unambiguous_rna_by_name = {}
unambiguous_rna_by_id = {}
generic_by_name = {} # unambiguous DNA or RNA
generic_by_id = {} # unambiguous DNA or RNA
ambiguous_generic_by_name = {} # ambiguous DNA or RNA
ambiguous_generic_by_id = {} # ambiguous DNA or RNA

# standard IUPAC unambiguous codons
standard_dna_table = None
standard_rna_table = None

# In the future, the back_table could return a statistically
# appropriate distribution of codons, so do not cache the results of
# back_table lookups!


class TranslationError(Exception):
    """Raised when a codon has no single valid translation."""


class CodonTable:
    """Base class holding the data that describes one genetic code.

    Attributes:
     - nucleotide_alphabet, protein_alphabet - the alphabets used
     - forward_table - dict, codon -> amino acid (coding codons only)
     - back_table - dict, amino acid -> one codon
     - start_codons, stop_codons - lists of codon strings
    """
    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = Alphabet.generic_protein

    forward_table = {}    # only includes codons which actually code
    back_table = {}       # for back translations
    start_codons = []
    stop_codons = []

    # Not always called from derived classes!
    def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
                 protein_alphabet = protein_alphabet,
                 forward_table = None, back_table = None,
                 start_codons = None, stop_codons = None):
        """Store the given alphabets, tables and codon lists.

        Omitted tables/lists default to fresh empty containers, so that
        instances created with defaults never share (and never mutate)
        the class-level placeholders.
        """
        if forward_table is None:
            forward_table = {}
        if back_table is None:
            back_table = {}
        if start_codons is None:
            start_codons = []
        if stop_codons is None:
            stop_codons = []
        self.nucleotide_alphabet = nucleotide_alphabet
        self.protein_alphabet = protein_alphabet
        self.forward_table = forward_table
        self.back_table = back_table
        self.start_codons = start_codons
        self.stop_codons = stop_codons

    def __str__(self):
        """Returns a simple text representation of the codon table

        e.g.
        >>> import Bio.Data.CodonTable
        >>> print(Bio.Data.CodonTable.standard_dna_table)
        >>> print(Bio.Data.CodonTable.generic_by_id[1])

        Stop codons are shown as "Stop", start codons are flagged with
        "(s)", and codons which cannot be translated are shown as "?".
        """
        # Only the NCBI derived tables define id and names, so do not
        # assume that they exist.
        table_id = getattr(self, "id", None)
        names = getattr(self, "names", None)
        if table_id:
            answer = "Table %i" % table_id
        else:
            answer = "Table ID unknown"
        if names:
            # The list of names may contain None (missing alt_name)
            answer += " " + ", ".join([name for name in names if name])

        #Use the main four letters (and the conventional ordering)
        #even for ambiguous tables
        letters = self.nucleotide_alphabet.letters
        if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
        or (letters is not None and "T" in letters):
            letters = "TCAG"
        else:
            #Should be either RNA or generic nucleotides,
            #e.g. Bio.Data.CodonTable.generic_by_id[1]
            letters = "UCAG"

        #Build the table...
        # Every cell is padded to the width of the horizontal rule so
        # that the column separators line up.
        dashes = "---------"
        width = len(dashes)
        rule = "--+" + "+".join([dashes for c2 in letters]) + "+--"
        header = "  |" \
                 + "|".join([("  %s" % c2).ljust(width) for c2 in letters]) \
                 + "|"
        lines = [answer, "", header, rule]
        for c1 in letters:
            for c3 in letters:
                line = c1 + " |"
                for c2 in letters:
                    codon = c1 + c2 + c3
                    if codon in self.stop_codons:
                        label = "Stop"
                    else:
                        try:
                            amino = self.forward_table[codon]
                        except (KeyError, TranslationError):
                            amino = "?"
                        if codon in self.start_codons:
                            label = "%s(s)" % amino
                        else:
                            label = "%s" % amino
                    line = line + (" %s %s" % (codon, label)).ljust(width) \
                           + "|"
                lines.append(line + " " + c3)
            lines.append(rule)
        return "\n".join(lines)


def make_back_table(table, default_stop_codon):
    """Return a dict mapping each amino acid to a single codon.

    table is a forward table (codon -> amino acid).  The key None is
    mapped to default_stop_codon.
    """
    #  ONLY RETURNS A SINGLE CODON
    # Do the sort so changes in the hash implementation won't affect
    # the result when one amino acid is coded by more than one codon.
    back_table = {}
    keys = list(table.keys())
    keys.sort()
    for key in keys:
        # Later (alphabetically greater) codons overwrite earlier ones
        back_table[table[key]] = key
    back_table[None] = default_stop_codon
    return back_table


class NCBICodonTable(CodonTable):
    """Codon table with the id and names used in the NCBI data file."""
    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = IUPAC.protein

    def __init__(self, id, names, table, start_codons, stop_codons):
        """Store the table; the back table uses the first stop codon."""
        self.id = id
        self.names = names
        self.forward_table = table
        self.back_table = make_back_table(table, stop_codons[0])
        self.start_codons = start_codons
        self.stop_codons = stop_codons


class NCBICodonTableDNA(NCBICodonTable):
    """NCBI codon table over the unambiguous DNA alphabet."""
    nucleotide_alphabet = IUPAC.unambiguous_dna


class NCBICodonTableRNA(NCBICodonTable):
    """NCBI codon table over the unambiguous RNA alphabet."""
    nucleotide_alphabet = IUPAC.unambiguous_rna


def register_ncbi_table(name, alt_name, id, table,
                        start_codons, stop_codons):
    """Build the DNA, RNA and generic tables and add them to the registries.

    Arguments:
     - name - one or more names, separated by "; "
     - alt_name - alternative (short) name, or None
     - id - the table number
     - table - dict mapping DNA codons to amino acids
     - start_codons, stop_codons - lists of DNA codons

    The tables are registered under id and under every name (including
    alt_name when it is not None).  Table 1 also becomes
    standard_dna_table / standard_rna_table.
    """
    names = name.split("; ")

    dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
                            stop_codons)

    # replace all T's with U's for the RNA tables
    # (the generic tables hold both spellings)
    rna_table = {}
    generic_table = {}
    for codon, val in table.items():
        generic_table[codon] = val
        codon = codon.replace("T", "U")
        generic_table[codon] = val
        rna_table[codon] = val
    rna_start_codons = []
    generic_start_codons = []
    for codon in start_codons:
        generic_start_codons.append(codon)
        codon = codon.replace("T", "U")
        generic_start_codons.append(codon)
        rna_start_codons.append(codon)
    rna_stop_codons = []
    generic_stop_codons = []
    for codon in stop_codons:
        generic_stop_codons.append(codon)
        codon = codon.replace("T", "U")
        generic_stop_codons.append(codon)
        rna_stop_codons.append(codon)

    generic = NCBICodonTable(id, names + [alt_name], generic_table,
                             generic_start_codons, generic_stop_codons)
    rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
                            rna_start_codons, rna_stop_codons)

    if id == 1:
        global standard_dna_table, standard_rna_table
        standard_dna_table = dna
        standard_rna_table = rna

    unambiguous_dna_by_id[id] = dna
    unambiguous_rna_by_id[id] = rna
    generic_by_id[id] = generic

    if alt_name is not None:
        names.append(alt_name)

    for name in names:
        unambiguous_dna_by_name[name] = dna
        unambiguous_rna_by_name[name] = rna
        generic_by_name[name] = generic

### These tables created from the data file
###  ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
### using the following:
### (NOTE: the amount of white space inside the string literals of this
###  script was reconstructed from the slice lengths and from the layout
###  of the calls below - verify it against gc.prt before reusing it.)
##import re
##for line in open("gc.prt").readlines():
##    if line[:2] == " {":
##        names = []
##        id = None
##        aa = None
##        start = None
##        bases = []
##    elif line[:6] == "  name":
##        names.append(re.search('"([^"]*)"', line).group(1))
##    elif line[:8] == "    name":
##        names.append(re.search('"(.*)$', line).group(1))
##    elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
##        names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
##    elif line[:4] == "  id":
##        id = int(re.search('(\d+)', line).group(1))
##    elif line[:10] == "  ncbieaa ":
##        aa = line[12:12+64]
##    elif line[:10] == "  sncbieaa":
##        start = line[12:12+64]
##    elif line[:9] == "  -- Base":
##        bases.append(line[12:12+64])
##    elif line[:2] == " }":
##        assert names != [] and id is not None and aa is not None
##        assert start is not None and bases != []
##        if len(names) == 1:
##            names.append(None)
##        print "register_ncbi_table(name = %s," % repr(names[0])
##        print "                    alt_name = %s, id = %d," % \
##              (repr(names[1]), id)
##        print "                    table = {"
##        s = "    "
##        for i in range(64):
##            if aa[i] != "*":
##                t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
##                                          bases[2][i], aa[i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = "    " + t
##                else:
##                    s = s + t
##        print s, "},"
##        s = "                    stop_codons = ["
##        for i in range(64):
##            if aa[i] == "*":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = "                                    " + t
##                else:
##                    s = s + t
##        print s, "],"
##        s = "                    start_codons = ["
##        for i in range(64):
##            if start[i] == "M":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = "                                    " + t
##                else:
##                    s = s + t
##        print s, "]"
##        print "                    )"
##    elif line[:2] == "--" or line == "\n" or line == "}\n" or \
##         line == 'Genetic-code-table ::= {\n':
##        pass
##    else:
##        raise Exception("Unparsed: " + repr(line))

# In the tables below each source line holds the codons sharing their
# first two bases; stop codons are simply absent from the dictionary.

register_ncbi_table(name = 'Standard',
                    alt_name = 'SGC0', id = 1,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATG', ]
                    )

register_ncbi_table(name = 'Vertebrate Mitochondrial',
                    alt_name = 'SGC1', id = 2,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
                    start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
                    )

register_ncbi_table(name = 'Yeast Mitochondrial',
                    alt_name = 'SGC2', id = 3,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'T', 'CTC': 'T', 'CTA': 'T', 'CTG': 'T',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
                    alt_name = 'SGC3', id = 4,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
                                     'ATA', 'ATG', 'GTG', ]
                    )

register_ncbi_table(name = 'Invertebrate Mitochondrial',
                    alt_name = 'SGC4', id = 5,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
                                     'GTG', ]
                    )

register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
                    alt_name = 'SGC5', id = 6,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Echinoderm Mitochondrial',
                    alt_name = 'SGC8', id = 9,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'N', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Euplotid Nuclear',
                    alt_name = 'SGC9', id = 10,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Bacterial',
                    alt_name = None, id = 11,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
                                     'ATG', 'GTG', ]
                    )

register_ncbi_table(name = 'Alternative Yeast Nuclear',
                    alt_name = None, id = 12,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'S',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'CTG', 'ATG', ]
                    )

register_ncbi_table(name = 'Ascidian Mitochondrial',
                    alt_name = None, id = 13,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'G', 'AGG': 'G',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Flatworm Mitochondrial',
                    alt_name = None, id = 14,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'N', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )

register_ncbi_table(name = 'Blepharisma Macronuclear',
                    alt_name = None, id = 15,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
     'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
     'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
     'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
     'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
     'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
     'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
     'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )

#########  Deal with ambiguous forward translations

class AmbiguousCodonTable(CodonTable):
    """Codon table which also accepts ambiguous nucleotide letters.

    Wraps an existing (unambiguous) codon table; any attribute not set
    on this object is looked up on the wrapped table.
    """
    def __init__(self, codon_table,
                 ambiguous_nucleotide_alphabet,
                 ambiguous_nucleotide_values,
                 ambiguous_protein_alphabet,
                 ambiguous_protein_values):
        """Build the ambiguous forward table and extended codon lists."""
        CodonTable.__init__(self,
                            ambiguous_nucleotide_alphabet,
                            ambiguous_protein_alphabet,
                            AmbiguousForwardTable(codon_table.forward_table,
                                                  ambiguous_nucleotide_values,
                                                  ambiguous_protein_values),
                            codon_table.back_table,

                            # These two are WRONG!  I need to get the
                            # list of ambiguous codons which code for
                            # the stop codons  XXX
                            list_ambiguous_codons(codon_table.start_codons,
                                                  ambiguous_nucleotide_values),
                            list_ambiguous_codons(codon_table.stop_codons,
                                                  ambiguous_nucleotide_values)
                            )
        self._codon_table = codon_table

    # Be sneaky and forward attribute lookups to the original table.
    # This lets us get the names, if the original table is an NCBI
    # table.
    def __getattr__(self, name):
        """Forward unknown attribute lookups to the wrapped table."""
        if name == "_codon_table":
            # Not set yet (e.g. the object is being copied or
            # unpickled); forwarding would recurse without end.
            raise AttributeError(name)
        return getattr(self._codon_table, name)


def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
    """Return a list of the amino acids an ambiguous codon can code for.

    Raises KeyError if every expansion of the codon is missing from
    forward_table (i.e. a true stop codon), and TranslationError if the
    codon expands to both coding and non-coding codons.
    """
    c1, c2, c3 = codon
    x1 = ambiguous_nucleotide_values[c1]
    x2 = ambiguous_nucleotide_values[c2]
    x3 = ambiguous_nucleotide_values[c3]
    possible = {}
    stops = []
    for y1 in x1:
        for y2 in x2:
            for y3 in x3:
                try:
                    possible[forward_table[y1+y2+y3]] = 1
                except KeyError:
                    # If tripping over a stop codon
                    stops.append(y1+y2+y3)
    if stops:
        if possible:
            raise TranslationError("ambiguous codon '%s' codes " % codon \
                                   + "for both proteins and stop codons")
        # This is a true stop codon - tell the caller about it
        raise KeyError(codon)
    # A real list, so that callers can index it
    return list(possible.keys())


def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
    """Extends a codon list to include all possible ambigous codons.

    e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
         ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']

    Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
    Thus only two more codons are added in the following:

    e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

    Returns a new (longer) list of codon strings.  The given codons come
    first (in their original order), followed by the additional
    ambiguous codons in alphabetical order.
    """
    #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
    #This will generate things like 'TRR' from ['TAG', 'TGA'], which
    #we don't want to include:
    known = set(codons)
    per_position = []
    for index in range(3):
        seen = set([codon[index] for codon in codons])
        letters = [letter for (letter, meanings) \
                   in ambiguous_nucleotide_values.items() \
                   if seen.issuperset(set(meanings))]
        # Sorted so that the result does not depend on the iteration
        # order of dictionaries or sets.
        letters.sort()
        per_position.append(letters)
    c1_list, c2_list, c3_list = per_position

    answer = codons[:] #copy
    for c1 in c1_list:
        for c2 in c2_list:
            for c3 in c3_list:
                ambig_codon = c1 + c2 + c3
                if ambig_codon in known:
                    continue
                wanted = True
                #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
                for y1 in ambiguous_nucleotide_values[c1]:
                    for y2 in ambiguous_nucleotide_values[c2]:
                        for y3 in ambiguous_nucleotide_values[c3]:
                            if y1 + y2 + y3 not in known:
                                #This ambiguous codon can code for a
                                #codon not in the list, exclude it!
                                wanted = False
                                break
                        if not wanted:
                            break
                    if not wanted:
                        break
                if wanted:
                    answer.append(ambig_codon)
    return answer

assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA']
assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA']
assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR']
assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR']
assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

# Forward translation is "onto", that is, any given codon always maps
# to the same protein, or it doesn't map at all.  Thus, I can build
# off of an existing table to produce the ambiguous mappings.
#
# This handles the general case.  Perhaps it's overkill?
#  >>> t = CodonTable.ambiguous_dna_by_id[1]
#  >>> t.forward_table["AAT"]
#  'N'
#  >>> t.forward_table["GAT"]
#  'D'
#  >>> t.forward_table["RAT"]
#  'B'
#  >>> t.forward_table["YTA"]
#  'L'

class AmbiguousForwardTable:
    """Dictionary-like forward table which resolves ambiguous codons.

    Lookups return an amino acid (possibly an ambiguous one), raise
    KeyError for stop codons and TranslationError for codons without a
    unique translation.  All results, including failures, are cached.
    """
    def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
        """Store the tables and invert the ambiguous protein mapping."""
        self.forward_table = forward_table

        self.ambiguous_nucleotide = ambiguous_nucleotide
        self.ambiguous_protein = ambiguous_protein

        # Map each amino acid letter to the list of (ambiguous) protein
        # letters which include it.
        inverted = {}
        for name, val in ambiguous_protein.items():
            for c in val:
                inverted.setdefault(c, {})[name] = 1
        for c in list(inverted.keys()):
            inverted[c] = list(inverted[c].keys())
        self._inverted = inverted

        # codon -> amino acid, or the exception class to raise for it
        self._cache = {}

    def get(self, codon, failobj = None):
        """Return the translation, or failobj for a stop codon.

        TranslationError is not caught here.
        """
        try:
            return self.__getitem__(codon)
        except KeyError:
            return failobj

    def __getitem__(self, codon):
        """Translate codon; see the class documentation for the errors."""
        try:
            x = self._cache[codon]
        except KeyError:
            pass
        else:
            if x is TranslationError:
                raise TranslationError(codon)   # no unique translation
            if x is KeyError:
                raise KeyError(codon)  # it's a stop codon
            return x
        try:
            x = self.forward_table[codon]
            self._cache[codon] = x
            return x
        except KeyError:
            pass

        # XXX Need to make part of this into a method which returns
        # a list of all possible encodings for a codon!
        try:
            possible = list_possible_proteins(codon,
                                              self.forward_table,
                                              self.ambiguous_nucleotide)
        except KeyError:
            self._cache[codon] = KeyError
            raise KeyError(codon)  # stop codon
        except TranslationError:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)  # does not code
        assert len(possible) > 0, "unambiguous codons must code"

        # Hah!  Only one possible protein, so use it
        if len(possible) == 1:
            self._cache[codon] = possible[0]
            return possible[0]

        # See if there's an ambiguous protein encoding for the multiples.
        # Find residues which exist in every coding set.
        ambiguous_possible = {}
        for amino in possible:
            for term in self._inverted[amino]:
                ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1

        n = len(possible)
        possible = []
        for amino, val in ambiguous_possible.items():
            if val == n:
                possible.append(amino)

        # No amino acid encoding for the results
        if len(possible) == 0:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)   # no valid translation

        # All of these are valid, so choose one
        # To be unique, sort by smallest ambiguity then alphabetically
        # Can get this if "X" encodes for everything.
        ranked = [(len(self.ambiguous_protein[amino]), amino) \
                  for amino in possible]
        ranked.sort()

        x = ranked[0][1]
        self._cache[codon] = x
        return x

#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
ambiguous_dna_by_name = {}
for key, val in unambiguous_dna_by_name.items():
    ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_dna_by_id = {}
for key, val in unambiguous_dna_by_id.items():
    ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

ambiguous_rna_by_name = {}
for key, val in unambiguous_rna_by_name.items():
    ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_rna_by_id = {}
for key, val in unambiguous_rna_by_id.items():
    ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

#The following isn't very elegant, but seems to work nicely.
# Ambiguity values for the generic (DNA or RNA) tables: the RNA values,
# plus "T" treated as meaning "U".
_merged_values = dict(IUPACData.ambiguous_rna_values.items())
_merged_values["T"] = "U"

for key, val in generic_by_name.items():
    ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

for key, val in generic_by_id.items():
    ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
# Do not leave the temporary names in the module namespace
del _merged_values
del key, val

#Basic sanity test,
for n in ambiguous_generic_by_id.keys():
    assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
    #R = A or G, so URA = UAA or UGA / TRA = TAA or TGA = stop codons
    if "UAA" in unambiguous_rna_by_id[n].stop_codons \
    and "UGA" in unambiguous_rna_by_id[n].stop_codons:
        # Looking up a stop codon must raise KeyError; any value coming
        # back is reported in the assertion message rather than printed.
        try:
            unexpected = ambiguous_dna_by_id[n].forward_table["TRA"]
        except KeyError:
            pass
        else:
            assert False, "Should be a stop only, got %r" % (unexpected,)
        assert "URA" in ambiguous_generic_by_id[n].stop_codons
        assert "URA" in ambiguous_rna_by_id[n].stop_codons
        assert "TRA" in ambiguous_generic_by_id[n].stop_codons
        assert "TRA" in ambiguous_dna_by_id[n].stop_codons
del n
# Lookups by id and by name must give tables with the same stop codons
assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons