1 # Copyright 2000-2002 by Andrew Dalke.
2 # Revisions copyright 2007-2008 by Peter Cock.
4 # This code is part of the Biopython distribution and governed by its
5 # license. Please see the LICENSE file that should have been included
6 # as part of this package.
8 """Alphabets used in Seq objects etc to declare sequence type and letters.
10 This is used by sequences which contain a finite number of similar words.
14 size = None # no fixed size for words
15 letters = None # no fixed alphabet; implement as a list-like
18 return self.__class__.__name__ + "()"
def contains(self, other):
    """Report whether ``other`` falls under this alphabet (OBSOLETE?).

    The answer is a boolean derived purely from the Alphabet class
    hierarchy: ``other`` qualifies when it is an instance of this
    object's class (or of one of its subclasses).  The ``letters``
    property is never consulted.  This isn't ideal, and doesn't seem
    to work as intended with the AlphabetEncoder classes.
    """
    # Deliberately self.__class__ rather than type(self): the two differ
    # for classic (old-style) classes.
    own_class = self.__class__
    return isinstance(other, own_class)
29 generic_alphabet = Alphabet()
31 class SingleLetterAlphabet(Alphabet):
33 letters = None # string of all letters in the alphabet
35 single_letter_alphabet = SingleLetterAlphabet()
39 class ProteinAlphabet(SingleLetterAlphabet):
42 generic_protein = ProteinAlphabet()
45 class NucleotideAlphabet(SingleLetterAlphabet):
48 generic_nucleotide = NucleotideAlphabet()
50 class DNAAlphabet(NucleotideAlphabet):
53 generic_dna = DNAAlphabet()
58 class RNAAlphabet(NucleotideAlphabet):
61 generic_rna = RNAAlphabet()
65 ########### Other per-sequence encodings
67 class SecondaryStructure(SingleLetterAlphabet):
70 class ThreeLetterProtein(Alphabet):
73 "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
74 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
75 "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
78 ###### Non per-sequence modifications
80 # (These are Decorator classes)
82 class AlphabetEncoder:
83 def __init__(self, alphabet, new_letters):
84 self.alphabet = alphabet
85 self.new_letters = new_letters
86 if alphabet.letters is not None:
87 self.letters = alphabet.letters + new_letters
90 def __getattr__(self, key):
91 if key[:2] == "__" and key[-2:] == "__":
92 raise AttributeError(key)
93 return getattr(self.alphabet, key)
96 return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
99 def contains(self, other):
100 """Does this alphabet 'contain' the other (OBSOLETE?).
This isn't implemented for the base AlphabetEncoder,
103 which will always return 0 (False)."""
106 class Gapped(AlphabetEncoder):
def __init__(self, alphabet, gap_char="-"):
    """Wrap ``alphabet`` and register ``gap_char`` as its gap symbol.

    The gap character is handed to AlphabetEncoder as the new letters
    and is also kept on the instance as ``gap_char``.
    """
    AlphabetEncoder.__init__(self, alphabet, new_letters=gap_char)
    self.gap_char = gap_char
def contains(self, other):
    """Report whether this gapped alphabet 'contains' ``other`` (OBSOLETE?).

    Returns a boolean.  Both alphabets must use the same gap character,
    and the wrapped alphabet must in turn contain the alphabet wrapped
    by ``other`` (which relies on the Alphabet subclassing hierarchy).
    This fails if the other alphabet does not have a gap character!
    """
    same_gap = other.gap_char == self.gap_char
    if not same_gap:
        # Hand back the comparison result itself, as ``and`` would.
        return same_gap
    return self.alphabet.contains(other.alphabet)
121 class HasStopCodon(AlphabetEncoder):
def __init__(self, alphabet, stop_symbol="*"):
    """Wrap ``alphabet`` and register ``stop_symbol`` as its stop marker.

    The stop symbol is handed to AlphabetEncoder as the new letters
    and is also kept on the instance as ``stop_symbol``.
    """
    AlphabetEncoder.__init__(self, alphabet, new_letters=stop_symbol)
    self.stop_symbol = stop_symbol
126 def __cmp__(self, other):
127 x = cmp(self.alphabet, other.alphabet)
129 return cmp(self.stop_symbol, other.stop_symbol)
def contains(self, other):
    """Report whether this alphabet 'contains' ``other`` (OBSOLETE?).

    Returns a boolean.  Both alphabets must use the same stop symbol,
    and the wrapped alphabet must in turn contain the alphabet wrapped
    by ``other`` (which relies on the Alphabet subclassing hierarchy).
    This fails if the other alphabet does not have a stop symbol!
    """
    same_stop = other.stop_symbol == self.stop_symbol
    if not same_stop:
        # Hand back the comparison result itself, as ``and`` would.
        return same_stop
    return self.alphabet.contains(other.alphabet)
142 def _get_base_alphabet(alphabet) :
143 """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE)."""
145 while isinstance(a, AlphabetEncoder) :
147 assert isinstance(a, Alphabet), \
148 "Invalid alphabet found, %s" % repr(a)
151 def _consensus_base_alphabet(alphabets) :
152 """Returns a common but often generic base alphabet object (PRIVATE).
154 This throws away any AlphabetEncoder information, e.g. Gapped alphabets.
156 Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
157 letter. These DO NOT raise an exception!"""
159 for alpha in alphabets :
160 a = _get_base_alphabet(alpha)
165 elif isinstance(a, common.__class__) :
167 elif isinstance(common, a.__class__) :
169 elif isinstance(a, NucleotideAlphabet) \
170 and isinstance(common, NucleotideAlphabet) :
171 #e.g. Give a mix of RNA and DNA alphabets
172 common = generic_nucleotide
173 elif isinstance(a, SingleLetterAlphabet) \
174 and isinstance(common, SingleLetterAlphabet) :
175 #This is a pretty big mis-match!
176 common = single_letter_alphabet
178 #We have a major mis-match... take the easy way out!
179 return generic_alphabet
182 return generic_alphabet
185 def _consensus_alphabet(alphabets) :
186 """Returns a common but often generic alphabet object (PRIVATE).
188 Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
189 letter. These DO NOT raise an exception!
191 This is aware of Gapped and HasStopCodon and new letters added by
192 other AlphabetEncoders. This WILL raise an exception if more than
193 one gap character or stop symbol is present."""
194 base = _consensus_base_alphabet(alphabets)
198 for alpha in alphabets :
200 if not hasattr(alpha, "gap_char") :
204 elif gap == alpha.gap_char :
207 raise ValueError("More than one gap character present")
209 if not hasattr(alpha, "stop_symbol") :
212 stop = alpha.stop_symbol
213 elif stop == alpha.stop_symbol :
216 raise ValueError("More than one stop symbol present")
218 if hasattr(alpha, "new_letters") :
219 for letter in alpha.new_letters :
220 if letter not in new_letters \
221 and letter != gap and letter != stop :
222 new_letters += letter
226 alpha = AlphabetEncoder(alpha, new_letters)
228 alpha = Gapped(alpha, gap_char=gap)
230 alpha = HasStopCodon(alpha, stop_symbol=stop)
233 def _check_type_compatible(alphabets) :
234 """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).
236 This relies on the Alphabet subclassing hierarchy. It does not
237 check things like gap characters or stop symbols."""
238 dna, rna, nucl, protein = False, False, False, False
239 for alpha in alphabets :
240 a = _get_base_alphabet(alpha)
241 if isinstance(a, DNAAlphabet) :
244 if rna or protein : return False
245 elif isinstance(a, RNAAlphabet) :
248 if dna or protein : return False
249 elif isinstance(a, NucleotideAlphabet) :
251 if protein : return False
252 elif isinstance(a, ProteinAlphabet) :
254 if nucl : return False