1 # Copyright 2000-2002 Brad Chapman.
2 # Copyright 2004-2005 by M de Hoon.
3 # Copyright 2007-2009 by Peter Cock.
5 # This code is part of the Biopython distribution and governed by its
6 # license. Please see the LICENSE file that should have been included
7 # as part of this package.
8 """Provides objects to represent biological sequences with alphabets.
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
14 __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages!
16 import string #for maketrans only
20 #TODO - Remove this work around once we drop python 2.3 support
24 from sets import Set as set
27 from Alphabet import IUPAC
28 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
29 from Bio.Data import CodonTable
31 def _maketrans(complement_mapping) :
32 """Makes a python string translation table (PRIVATE).
35 - complement_mapping - a dictionary such as ambiguous_dna_complement
36 and ambiguous_rna_complement from Data.IUPACData.
38 Returns a translation table (a string of length 256) for use with the
39 python string's translate method to use in a (reverse) complement.
41 Compatible with lower case and upper case sequences.
43 For internal use only.
45 before = ''.join(complement_mapping.keys())
46 after = ''.join(complement_mapping.values())
47 before = before + before.lower()
48 after = after + after.lower()
49 return string.maketrans(before, after)
51 _dna_complement_table = _maketrans(ambiguous_dna_complement)
52 _rna_complement_table = _maketrans(ambiguous_rna_complement)
55 """A read-only sequence object (essentially a string with an alphabet).
57 Like normal python strings, our basic sequence object is immutable.
58 This prevents you from doing my_seq[5] = "A" for example, but does allow
59 Seq objects to be used as dictionary keys.
61 The Seq object provides a number of string like methods (such as count,
62 find, split and strip), which are alphabet aware where appropriate.
64 The Seq object also provides some biological methods, such as complement,
65 reverse_complement, transcribe, back_transcribe and translate (which are
66 not applicable to sequences with a protein alphabet).
68 def __init__(self, data, alphabet = Alphabet.generic_alphabet):
69 """Create a Seq object.
72 - seq - Sequence, required (string)
73 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
75 You will typically use Bio.SeqIO to read in sequences from files as
76 SeqRecord objects, whose sequence will be exposed as a Seq object via
79 However, will often want to create your own Seq objects directly:
81 >>> from Bio.Seq import Seq
82 >>> from Bio.Alphabet import IUPAC
83 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
86 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
90 # Enforce string storage
91 assert (type(data) == type("") or # must use a string
92 type(data) == type(u"")) # but can be a unicode string
94 self.alphabet = alphabet # Seq API requirement
96 # A data property is/was a Seq API requirement
97 def _set_data(self, value) :
98 #TODO - In the next release, actually raise an exception?
99 #The Seq object is like a python string, it should be read only!
101 warnings.warn("Writing to the Seq object's .data propery is deprecated.",
104 data = property(fget= lambda self : str(self),
106 doc="Sequence as a string (DEPRECATED)")
109 """Returns a (truncated) representation of the sequence for debugging."""
111 #Shows the last three letters as it is often useful to see if there
112 #is a stop codon at the end of a sequence.
113 #Note total length is 54+3+3=60
114 return "%s('%s...%s', %s)" % (self.__class__.__name__,
115 str(self)[:54], str(self)[-3:],
118 return "%s(%s, %s)" % (self.__class__.__name__,
122 """Returns the full sequence as a python string.
124 Note that Biopython 1.44 and earlier would give a truncated
125 version of repr(my_seq) for str(my_seq). If you are writing code
126 which need to be backwards compatible with old Biopython, you
127 should continue to use my_seq.tostring() rather than str(my_seq).
132 TODO - Work out why this breaks test_Restriction.py
133 (Comparing Seq objects would be nice to have. May need to think about
134 hashes and the in operator for when have list/dictionary of Seq objects...)
135 def __cmp__(self, other):
136 if hasattr(other, "alphabet") :
137 #other should be a Seq or a MutableSeq
138 if not Alphabet._check_type_compatible([self.alphabet,
140 raise TypeError("Incompatable alphabets %s and %s" \
141 % (repr(self.alphabet), repr(other.alphabet)))
142 #They should be the same sequence type (or one of them is generic)
143 return cmp(str(self), str(other))
144 elif isinstance(other, basestring) :
145 return cmp(str(self), other)
150 def __len__(self): return len(self._data) # Seq API requirement
152 def __getitem__(self, index) : # Seq API requirement
153 #Note since Python 2.0, __getslice__ is deprecated
154 #and __getitem__ is used instead.
155 #See http://docs.python.org/ref/sequence-methods.html
156 if isinstance(index, int) :
157 #Return a single letter as a string
158 return self._data[index]
160 #Return the (sub)sequence as another Seq object
161 return Seq(self._data[index], self.alphabet)
163 def __add__(self, other):
164 """Add another sequence or string to this sequence."""
165 if hasattr(other, "alphabet") :
166 #other should be a Seq or a MutableSeq
167 if not Alphabet._check_type_compatible([self.alphabet,
169 raise TypeError("Incompatable alphabets %s and %s" \
170 % (repr(self.alphabet), repr(other.alphabet)))
171 #They should be the same sequence type (or one of them is generic)
172 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
173 return self.__class__(str(self) + str(other), a)
174 elif isinstance(other, basestring) :
175 #other is a plain string - use the current alphabet
176 return self.__class__(str(self) + other, self.alphabet)
180 def __radd__(self, other):
181 if hasattr(other, "alphabet") :
182 #other should be a Seq or a MutableSeq
183 if not Alphabet._check_type_compatible([self.alphabet,
185 raise TypeError("Incompatable alphabets %s and %s" \
186 % (repr(self.alphabet), repr(other.alphabet)))
187 #They should be the same sequence type (or one of them is generic)
188 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
189 return self.__class__(str(other) + str(self), a)
190 elif isinstance(other, basestring) :
191 #other is a plain string - use the current alphabet
192 return self.__class__(other + str(self), self.alphabet)
196 def tostring(self): # Seq API requirement
197 """Returns the full sequence as a python string.
199 Although not formally deprecated, you are now encouraged to use
200 str(my_seq) instead of my_seq.tostring()."""
203 def tomutable(self): # Needed? Or use a function?
204 """Returns the full sequence as a MutableSeq object.
206 >>> from Bio.Seq import Seq
207 >>> from Bio.Alphabet import IUPAC
208 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
211 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
212 >>> my_seq.tomutable()
213 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
215 Note that the alphabet is preserved.
217 return MutableSeq(str(self), self.alphabet)
219 def _get_seq_str_and_check_alphabet(self, other_sequence) :
220 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
222 For a string argument, returns the string.
224 For a Seq or MutableSeq, it checks the alphabet is compatible
225 (raising an exception if it isn't), and then returns a string.
228 other_alpha = other_sequence.alphabet
229 except AttributeError :
230 #Assume other_sequence is a string
231 return other_sequence
233 #Other should be a Seq or a MutableSeq
234 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
235 raise TypeError("Incompatable alphabets %s and %s" \
236 % (repr(self.alphabet), repr(other_alpha)))
238 return str(other_sequence)
240 def count(self, sub, start=0, end=sys.maxint):
241 """Non-overlapping count method, like that of a python string.
243 This behaves like the python string method of the same name,
244 which does a non-overlapping count!
246 Returns an integer, the number of occurrences of substring
247 argument sub in the (sub)sequence given by [start:end].
248 Optional arguments start and end are interpreted as in slice
252 - sub - a string or another Seq object to look for
253 - start - optional integer, slice start
254 - end - optional integer, slice end
258 >>> from Bio.Seq import Seq
259 >>> my_seq = Seq("AAAATGA")
260 >>> print my_seq.count("A")
262 >>> print my_seq.count("ATG")
264 >>> print my_seq.count(Seq("AT"))
266 >>> print my_seq.count("AT", 2, -1)
269 HOWEVER, please note because python strings and Seq objects (and
270 MutableSeq objects) do a non-overlapping search, this may not give
271 the answer you expect:
273 >>> "AAAA".count("AA")
275 >>> print Seq("AAAA").count("AA")
278 A non-overlapping search would give the answer as three!
280 #If it has one, check the alphabet:
281 sub_str = self._get_seq_str_and_check_alphabet(sub)
282 return str(self).count(sub_str, start, end)
284 def find(self, sub, start=0, end=sys.maxint):
285 """Find method, like that of a python string.
287 This behaves like the python string method of the same name.
289 Returns an integer, the index of the first occurrence of substring
290 argument sub in the (sub)sequence given by [start:end].
293 - sub - a string or another Seq object to look for
294 - start - optional integer, slice start
295 - end - optional integer, slice end
297 Returns -1 if the subsequence is NOT found.
299 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
301 >>> from Bio.Seq import Seq
302 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
303 >>> my_rna.find("AUG")
306 #If it has one, check the alphabet:
307 sub_str = self._get_seq_str_and_check_alphabet(sub)
308 return str(self).find(sub_str, start, end)
310 def rfind(self, sub, start=0, end=sys.maxint):
311 """Find from right method, like that of a python string.
313 This behaves like the python string method of the same name.
315 Returns an integer, the index of the last (right most) occurrence of
316 substring argument sub in the (sub)sequence given by [start:end].
319 - sub - a string or another Seq object to look for
320 - start - optional integer, slice start
321 - end - optional integer, slice end
323 Returns -1 if the subsequence is NOT found.
325 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
327 >>> from Bio.Seq import Seq
328 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
329 >>> my_rna.rfind("AUG")
332 #If it has one, check the alphabet:
333 sub_str = self._get_seq_str_and_check_alphabet(sub)
334 return str(self).rfind(sub_str, start, end)
336 def startswith(self, prefix, start=0, end=sys.maxint) :
337 """Does the Seq start with the given prefix? Returns True/False.
339 This behaves like the python string method of the same name.
341 Return True if the sequence starts with the specified prefix
342 (a string or another Seq object), False otherwise.
343 With optional start, test sequence beginning at that position.
344 With optional end, stop comparing sequence at that position.
345 prefix can also be a tuple of strings to try. e.g.
347 >>> from Bio.Seq import Seq
348 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
349 >>> my_rna.startswith("GUC")
351 >>> my_rna.startswith("AUG")
353 >>> my_rna.startswith("AUG", 3)
355 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
358 #If it has one, check the alphabet:
359 if isinstance(prefix, tuple) :
360 #TODO - Once we drop support for Python 2.4, instead of this
361 #loop offload to the string method (requires Python 2.5+).
362 #Check all the alphabets first...
363 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
365 for prefix_str in prefix_strings :
366 if str(self).startswith(prefix_str, start, end) :
370 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
371 return str(self).startswith(prefix_str, start, end)
373 def endswith(self, suffix, start=0, end=sys.maxint) :
374 """Does the Seq end with the given suffix? Returns True/False.
376 This behaves like the python string method of the same name.
378 Return True if the sequence ends with the specified suffix
379 (a string or another Seq object), False otherwise.
380 With optional start, test sequence beginning at that position.
381 With optional end, stop comparing sequence at that position.
382 suffix can also be a tuple of strings to try. e.g.
384 >>> from Bio.Seq import Seq
385 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
386 >>> my_rna.endswith("UUG")
388 >>> my_rna.endswith("AUG")
390 >>> my_rna.endswith("AUG", 0, 18)
392 >>> my_rna.endswith(("UCC","UCA","UUG"))
395 #If it has one, check the alphabet:
396 if isinstance(suffix, tuple) :
397 #TODO - Once we drop support for Python 2.4, instead of this
398 #loop offload to the string method (requires Python 2.5+).
399 #Check all the alphabets first...
400 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
402 for suffix_str in suffix_strings :
403 if str(self).endswith(suffix_str, start, end) :
407 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
408 return str(self).endswith(suffix_str, start, end)
411 def split(self, sep=None, maxsplit=-1) :
412 """Split method, like that of a python string.
414 This behaves like the python string method of the same name.
416 Return a list of the 'words' in the string (as Seq objects),
417 using sep as the delimiter string. If maxsplit is given, at
418 most maxsplit splits are done. If maxsplit is ommited, all
421 Following the python string method, sep will by default be any
422 white space (tabs, spaces, newlines) but this is unlikely to
423 apply to biological sequences.
427 >>> from Bio.Seq import Seq
428 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
429 >>> my_aa = my_rna.translate()
431 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
433 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
434 >>> my_aa.split("*",1)
435 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
437 See also the rsplit method:
439 >>> my_aa.rsplit("*",1)
440 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
442 #If it has one, check the alphabet:
443 sep_str = self._get_seq_str_and_check_alphabet(sep)
444 #TODO - If the sep is the defined stop symbol, or gap char,
445 #should we adjust the alphabet?
446 return [Seq(part, self.alphabet) \
447 for part in str(self).split(sep_str, maxsplit)]
449 def rsplit(self, sep=None, maxsplit=-1) :
450 """Right split method, like that of a python string.
452 This behaves like the python string method of the same name.
454 Return a list of the 'words' in the string (as Seq objects),
455 using sep as the delimiter string. If maxsplit is given, at
456 most maxsplit splits are done COUNTING FROM THE RIGHT.
457 If maxsplit is ommited, all splits are made.
459 Following the python string method, sep will by default be any
460 white space (tabs, spaces, newlines) but this is unlikely to
461 apply to biological sequences.
463 e.g. print my_seq.rsplit("*",1)
465 See also the split method.
467 #If it has one, check the alphabet:
468 sep_str = self._get_seq_str_and_check_alphabet(sep)
470 return [Seq(part, self.alphabet) \
471 for part in str(self).rsplit(sep_str, maxsplit)]
472 except AttributeError :
473 #Python 2.3 doesn't have a string rsplit method, which we can
474 #word around by reversing the sequence, using (left) split,
475 #and then reversing the answer. Not very efficient!
476 words = [Seq(word[::-1], self.alphabet) for word \
477 in str(self)[::-1].split(sep_str[::-1], maxsplit)]
481 def strip(self, chars=None) :
482 """Returns a new Seq object with leading and trailing ends stripped.
484 This behaves like the python string method of the same name.
486 Optional argument chars defines which characters to remove. If
487 ommitted or None (default) then as for the python string method,
488 this defaults to removing any white space.
490 e.g. print my_seq.strip("-")
492 See also the lstrip and rstrip methods.
494 #If it has one, check the alphabet:
495 strip_str = self._get_seq_str_and_check_alphabet(chars)
496 return Seq(str(self).strip(strip_str), self.alphabet)
498 def lstrip(self, chars=None) :
499 """Returns a new Seq object with leading (left) end stripped.
501 This behaves like the python string method of the same name.
503 Optional argument chars defines which characters to remove. If
504 ommitted or None (default) then as for the python string method,
505 this defaults to removing any white space.
507 e.g. print my_seq.lstrip("-")
509 See also the strip and rstrip methods.
511 #If it has one, check the alphabet:
512 strip_str = self._get_seq_str_and_check_alphabet(chars)
513 return Seq(str(self).lstrip(strip_str), self.alphabet)
515 def rstrip(self, chars=None) :
516 """Returns a new Seq object with trailing (right) end stripped.
518 This behaves like the python string method of the same name.
520 Optional argument chars defines which characters to remove. If
521 ommitted or None (default) then as for the python string method,
522 this defaults to removing any white space.
524 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
526 >>> from Bio.Alphabet import IUPAC
527 >>> from Bio.Seq import Seq
528 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
530 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
531 >>> my_seq.rstrip("A")
532 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
534 See also the strip and lstrip methods.
536 #If it has one, check the alphabet:
537 strip_str = self._get_seq_str_and_check_alphabet(chars)
538 return Seq(str(self).rstrip(strip_str), self.alphabet)
540 def complement(self):
541 """Returns the complement sequence. New Seq object.
543 >>> from Bio.Seq import Seq
544 >>> from Bio.Alphabet import IUPAC
545 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
547 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
548 >>> my_dna.complement()
549 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
551 You can of course used mixed case sequences,
553 >>> from Bio.Seq import Seq
554 >>> from Bio.Alphabet import generic_dna
555 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
557 Seq('CCCCCgatA-GD', DNAAlphabet())
558 >>> my_dna.complement()
559 Seq('GGGGGctaT-CH', DNAAlphabet())
561 Note in the above example, ambiguous character D denotes
562 G, A or T so its complement is H (for C, T or A).
564 Trying to complement a protein sequence raises an exception.
566 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
567 >>> my_protein.complement()
568 Traceback (most recent call last):
570 ValueError: Proteins do not have complements!
572 base = Alphabet._get_base_alphabet(self.alphabet)
573 if isinstance(base, Alphabet.ProteinAlphabet) :
574 raise ValueError("Proteins do not have complements!")
575 if isinstance(base, Alphabet.DNAAlphabet) :
576 ttable = _dna_complement_table
577 elif isinstance(base, Alphabet.RNAAlphabet) :
578 ttable = _rna_complement_table
579 elif ('U' in self._data or 'u' in self._data) \
580 and ('T' in self._data or 't' in self._data):
581 #TODO - Handle this cleanly?
582 raise ValueError("Mixed RNA/DNA found")
583 elif 'U' in self._data or 'u' in self._data:
584 ttable = _rna_complement_table
586 ttable = _dna_complement_table
587 #Much faster on really long sequences than the previous loop based one.
588 #thx to Michael Palmer, University of Waterloo
589 return Seq(str(self).translate(ttable), self.alphabet)
591 def reverse_complement(self):
592 """Returns the reverse complement sequence. New Seq object.
594 >>> from Bio.Seq import Seq
595 >>> from Bio.Alphabet import IUPAC
596 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
598 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
599 >>> my_dna.reverse_complement()
600 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
602 Note in the above example, since R = G or A, its complement
603 is Y (which denotes C or T).
605 You can of course used mixed case sequences,
607 >>> from Bio.Seq import Seq
608 >>> from Bio.Alphabet import generic_dna
609 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
611 Seq('CCCCCgatA-G', DNAAlphabet())
612 >>> my_dna.reverse_complement()
613 Seq('C-TatcGGGGG', DNAAlphabet())
615 Trying to complement a protein sequence raises an exception:
617 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
618 >>> my_protein.reverse_complement()
619 Traceback (most recent call last):
621 ValueError: Proteins do not have complements!
623 #Use -1 stride/step to reverse the complement
624 return self.complement()[::-1]
626 def transcribe(self):
627 """Returns the RNA sequence from a DNA sequence. New Seq object.
629 >>> from Bio.Seq import Seq
630 >>> from Bio.Alphabet import IUPAC
631 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
632 ... IUPAC.unambiguous_dna)
634 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
635 >>> coding_dna.transcribe()
636 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
638 Trying to transcribe a protein or RNA sequence raises an exception:
640 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
641 >>> my_protein.transcribe()
642 Traceback (most recent call last):
644 ValueError: Proteins cannot be transcribed!
646 base = Alphabet._get_base_alphabet(self.alphabet)
647 if isinstance(base, Alphabet.ProteinAlphabet) :
648 raise ValueError("Proteins cannot be transcribed!")
649 if isinstance(base, Alphabet.RNAAlphabet) :
650 raise ValueError("RNA cannot be transcribed!")
652 if self.alphabet==IUPAC.unambiguous_dna:
653 alphabet = IUPAC.unambiguous_rna
654 elif self.alphabet==IUPAC.ambiguous_dna:
655 alphabet = IUPAC.ambiguous_rna
657 alphabet = Alphabet.generic_rna
658 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
660 def back_transcribe(self):
661 """Returns the DNA sequence from an RNA sequence. New Seq object.
663 >>> from Bio.Seq import Seq
664 >>> from Bio.Alphabet import IUPAC
665 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
666 ... IUPAC.unambiguous_rna)
668 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
669 >>> messenger_rna.back_transcribe()
670 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
672 Trying to back-transcribe a protein or DNA sequence raises an
675 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
676 >>> my_protein.back_transcribe()
677 Traceback (most recent call last):
679 ValueError: Proteins cannot be back transcribed!
681 base = Alphabet._get_base_alphabet(self.alphabet)
682 if isinstance(base, Alphabet.ProteinAlphabet) :
683 raise ValueError("Proteins cannot be back transcribed!")
684 if isinstance(base, Alphabet.DNAAlphabet) :
685 raise ValueError("DNA cannot be back transcribed!")
687 if self.alphabet==IUPAC.unambiguous_rna:
688 alphabet = IUPAC.unambiguous_dna
689 elif self.alphabet==IUPAC.ambiguous_rna:
690 alphabet = IUPAC.ambiguous_dna
692 alphabet = Alphabet.generic_dna
693 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
695 def translate(self, table="Standard", stop_symbol="*", to_stop=False):
696 """Turns a nucleotide sequence into a protein sequence. New Seq object.
698 This method will translate DNA or RNA sequences, and those with a
699 nucleotide or generic alphabet. Trying to translate a protein
700 sequence raises an exception.
703 - table - Which codon table to use? This can be either a name
704 (string) or an NCBI identifier (integer). This defaults
705 to the "Standard" table.
706 - stop_symbol - Single character string, what to use for terminators.
707 This defaults to the asterisk, "*".
708 - to_stop - Boolean, defaults to False meaning do a full translation
709 continuing on past any stop codons (translated as the
710 specified stop_symbol). If True, translation is
711 terminated at the first in frame stop codon (and the
712 stop_symbol is not appended to the returned protein
715 e.g. Using the standard table:
717 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
718 >>> coding_dna.translate()
719 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
720 >>> coding_dna.translate(stop_symbol="@")
721 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
722 >>> coding_dna.translate(to_stop=True)
723 Seq('VAIVMGR', ExtendedIUPACProtein())
725 Now using NCBI table 2, where TGA is not a stop codon:
727 >>> coding_dna.translate(table=2)
728 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
729 >>> coding_dna.translate(table=2, to_stop=True)
730 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
732 If the sequence has no in-frame stop codon, then the to_stop argument
735 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
736 >>> coding_dna2.translate()
737 Seq('LAIVMGR', ExtendedIUPACProtein())
738 >>> coding_dna2.translate(to_stop=True)
739 Seq('LAIVMGR', ExtendedIUPACProtein())
741 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
742 or a stop codon. These are translated as "X". Any invalid codon
743 (e.g. "TA?" or "T-A") will throw a TranslationError.
745 NOTE - Does NOT support gapped sequences.
747 NOTE - This does NOT behave like the python string's translate
748 method. For that use str(my_seq).translate(...) instead.
751 table_id = int(table)
754 if isinstance(table, str) and len(table)==256 :
755 raise ValueError("The Seq object translate method DOES NOT take " \
756 + "a 256 character string mapping table like " \
757 + "the python string object's translate method. " \
758 + "Use str(my_seq).translate(...) instead.")
759 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
760 Alphabet.ProteinAlphabet) :
761 raise ValueError("Proteins cannot be translated!")
762 if self.alphabet==IUPAC.unambiguous_dna:
763 #Will use standard IUPAC protein alphabet, no need for X
765 codon_table = CodonTable.unambiguous_dna_by_name[table]
767 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
768 #elif self.alphabet==IUPAC.ambiguous_dna:
769 # if table_id is None:
770 # codon_table = CodonTable.ambiguous_dna_by_name[table]
772 # codon_table = CodonTable.ambiguous_dna_by_id[table_id]
773 elif self.alphabet==IUPAC.unambiguous_rna:
774 #Will use standard IUPAC protein alphabet, no need for X
776 codon_table = CodonTable.unambiguous_rna_by_name[table]
778 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
779 #elif self.alphabet==IUPAC.ambiguous_rna:
780 # if table_id is None:
781 # codon_table = CodonTable.ambiguous_rna_by_name[table]
783 # codon_table = CodonTable.ambiguous_rna_by_id[table_id]
785 #This will use the extend IUPAC protein alphabet with X etc.
786 #The same table can be used for RNA or DNA (we use this for
787 #translating strings).
789 codon_table = CodonTable.ambiguous_generic_by_name[table]
791 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
792 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
793 if stop_symbol in protein :
794 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
795 stop_symbol = stop_symbol)
797 alphabet = codon_table.protein_alphabet
798 return Seq(protein, alphabet)
800 class UnknownSeq(Seq):
801 """A read-only sequence object of known length but unknown contents.
803 If you have an unknown sequence, you can represent this with a normal
804 Seq object, for example:
806 >>> my_seq = Seq("N"*5)
808 Seq('NNNNN', Alphabet())
814 However, this is rather wasteful of memory (especially for large
815 sequences), which is where this class is most usefull:
817 >>> unk_five = UnknownSeq(5)
819 UnknownSeq(5, alphabet = Alphabet(), character = '?')
825 You can add unknown sequence together, provided their alphabets and
826 characters are compatible, and get another memory saving UnknownSeq:
828 >>> unk_four = UnknownSeq(4)
830 UnknownSeq(4, alphabet = Alphabet(), character = '?')
831 >>> unk_four + unk_five
832 UnknownSeq(9, alphabet = Alphabet(), character = '?')
834 If the alphabet or characters don't match up, the addition gives an
837 >>> unk_nnnn = UnknownSeq(4, character = "N")
839 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
840 >>> unk_nnnn + unk_four
841 Seq('NNNN????', Alphabet())
843 Combining with a real Seq gives a new Seq object:
845 >>> known_seq = Seq("ACGT")
846 >>> unk_four + known_seq
847 Seq('????ACGT', Alphabet())
848 >>> known_seq + unk_four
849 Seq('ACGT????', Alphabet())
851 def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) :
852 """Create a new UnknownSeq object.
854 If character is ommited, it is determed from the alphabet, "N" for
855 nucleotides, "X" for proteins, and "?" otherwise.
857 self._length = int(length)
858 if self._length < 0 :
859 #TODO - Block zero length UnknownSeq? You can just use a Seq!
860 raise ValueError("Length must not be negative.")
861 self.alphabet = alphabet
863 if len(character) != 1 :
864 raise ValueError("character argument should be a single letter string.")
865 self._character = character
867 base = Alphabet._get_base_alphabet(alphabet)
868 #TODO? Check the case of the letters in the alphabet?
869 #We may have to use "n" instead of "N" etc.
870 if isinstance(base, Alphabet.NucleotideAlphabet) :
871 self._character = "N"
872 elif isinstance(base, Alphabet.ProteinAlphabet) :
873 self._character = "X"
875 self._character = "?"
878 """Returns the stated length of the unknown sequence."""
882 """Returns the unknown sequence as full string of the given length."""
883 return self._character * self._length
886 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
887 % (self._length, repr(self.alphabet), repr(self._character))
889 def __add__(self, other) :
890 if isinstance(other, UnknownSeq) \
891 and other._character == self._character :
892 #TODO - Check the alphabets match
893 return UnknownSeq(len(self)+len(other),
894 self.alphabet, self._character)
895 #Offload to the base class...
896 return Seq(str(self), self.alphabet) + other
898 def __radd__(self, other) :
899 if isinstance(other, UnknownSeq) \
900 and other._character == self._character :
901 #TODO - Check the alphabets match
902 return UnknownSeq(len(self)+len(other),
903 self.alphabet, self._character)
904 #Offload to the base class...
905 return other + Seq(str(self), self.alphabet)
907 def __getitem__(self, index):
908 if isinstance(index, int) :
909 #TODO - Check the bounds without wasting memory
910 return str(self)[index]
912 #TODO - Work out the length without wasting memory
913 return UnknownSeq(len(("#"*self._length)[index]),
914 self.alphabet, self._character)
916 def count(self, sub, start=0, end=sys.maxint):
917 """Non-overlapping count method, like that of a python string.
919 This behaves like the python string (and Seq object) method of the
920 same name, which does a non-overlapping count!
922 Returns an integer, the number of occurrences of substring
923 argument sub in the (sub)sequence given by [start:end].
924 Optional arguments start and end are interpreted as in slice
928 - sub - a string or another Seq object to look for
929 - start - optional integer, slice start
930 - end - optional integer, slice end
932 >>> "NNNN".count("N")
934 >>> Seq("NNNN").count("N")
936 >>> UnknownSeq(4, character="N").count("N")
938 >>> UnknownSeq(4, character="N").count("A")
940 >>> UnknownSeq(4, character="N").count("AA")
943 HOWEVER, please note because that python strings and Seq objects (and
944 MutableSeq objects) do a non-overlapping search, this may not give
945 the answer you expect:
947 >>> UnknownSeq(4, character="N").count("NN")
949 >>> UnknownSeq(4, character="N").count("NNN")
952 sub_str = self._get_seq_str_and_check_alphabet(sub)
953 if len(sub_str) == 1 :
954 if str(sub_str) == self._character :
955 if start==0 and end >= self._length :
958 #This could be done more cleverly...
959 return str(self).count(sub_str, start, end)
963 if set(sub_str) == set(self._character) :
964 if start==0 and end >= self._length :
965 return self._length // len(sub_str)
967 #This could be done more cleverly...
968 return str(self).count(sub_str, start, end)
972 def complement(self) :
973 """The complement of an unknown nucleotide equals itself.
975 >>> my_nuc = UnknownSeq(8)
977 UnknownSeq(8, alphabet = Alphabet(), character = '?')
980 >>> my_nuc.complement()
981 UnknownSeq(8, alphabet = Alphabet(), character = '?')
982 >>> print my_nuc.complement()
985 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
986 Alphabet.ProteinAlphabet) :
987 raise ValueError("Proteins do not have complements!")
990 def reverse_complement(self) :
991 """The reverse complement of an unknown nucleotide equals itself.
993 >>> my_nuc = UnknownSeq(10)
995 UnknownSeq(10, alphabet = Alphabet(), character = '?')
998 >>> my_nuc.reverse_complement()
999 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1000 >>> print my_nuc.reverse_complement()
1003 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1004 Alphabet.ProteinAlphabet) :
1005 raise ValueError("Proteins do not have complements!")
1008 def transcribe(self) :
1009 """Returns unknown RNA sequence from an unknown DNA sequence.
1011 >>> my_dna = UnknownSeq(10, character="N")
1013 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1016 >>> my_rna = my_dna.transcribe()
1018 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1022 #Offload the alphabet stuff
1023 s = Seq(self._character, self.alphabet).transcribe()
1024 return UnknownSeq(self._length, s.alphabet, self._character)
1026 def back_transcribe(self) :
1027 """Returns unknown DNA sequence from an unknown RNA sequence.
1029 >>> my_rna = UnknownSeq(20, character="N")
1031 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1033 NNNNNNNNNNNNNNNNNNNN
1034 >>> my_dna = my_rna.back_transcribe()
1036 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1038 NNNNNNNNNNNNNNNNNNNN
1040 #Offload the alphabet stuff
1041 s = Seq(self._character, self.alphabet).back_transcribe()
1042 return UnknownSeq(self._length, s.alphabet, self._character)
1044 def translate(self, **kwargs) :
1045 """Translate an unknown nucleotide sequence into an unknown protein.
1049 >>> my_seq = UnknownSeq(11, character="N")
1052 >>> my_protein = my_seq.translate()
1054 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1055 >>> print my_protein
1058 In comparison, using a normal Seq object:
1060 >>> my_seq = Seq("NNNNNNNNNNN")
1063 >>> my_protein = my_seq.translate()
1065 Seq('XXX', ExtendedIUPACProtein())
1066 >>> print my_protein
1070 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1071 Alphabet.ProteinAlphabet) :
1072 raise ValueError("Proteins cannot be translated!")
1073 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1076 class MutableSeq(object):
1077 """An editable sequence object (with an alphabet).
1079 Unlike normal python strings and our basic sequence object (the Seq class)
1080 which are immuatable, the MutableSeq lets you edit the sequence in place.
1081 However, this means you cannot use a MutableSeq object as a dictionary key.
1083 >>> from Bio.Seq import MutableSeq
1084 >>> from Bio.Alphabet import generic_dna
1085 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1087 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1092 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1095 >>> my_seq[5:8] = "NNN"
1097 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1101 Note that the MutableSeq object does not support as many string-like
1102 or biological methods as the Seq object.
1104 def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1105 if type(data) == type(""):
1106 self.data = array.array("c", data)
1108 self.data = data # assumes the input is an array
1109 self.alphabet = alphabet
1112 """Returns a (truncated) representation of the sequence for debugging."""
1114 #Shows the last three letters as it is often useful to see if there
1115 #is a stop codon at the end of a sequence.
1116 #Note total length is 54+3+3=60
1117 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1118 str(self[:54]), str(self[-3:]),
1119 repr(self.alphabet))
1121 return "%s('%s', %s)" % (self.__class__.__name__,
1123 repr(self.alphabet))
1126 """Returns the full sequence as a python string.
1128 Note that Biopython 1.44 and earlier would give a truncated
1129 version of repr(my_seq) for str(my_seq). If you are writing code
1130 which needs to be backwards compatible with old Biopython, you
1131 should continue to use my_seq.tostring() rather than str(my_seq).
1133 #See test_GAQueens.py for an historic usage of a non-string alphabet!
1134 return "".join(self.data)
1136 def __cmp__(self, other):
1137 """Compare the sequence for to another sequence or a string.
1139 If compared to another sequence the alphabets must be compatible.
1140 Comparing DNA to RNA, or Nucleotide to Protein will raise an
1143 Otherwise only the sequence itself is compared, not the precise
1146 This method indirectly supports ==, < , etc."""
1147 if hasattr(other, "alphabet") :
1148 #other should be a Seq or a MutableSeq
1149 if not Alphabet._check_type_compatible([self.alphabet,
1151 raise TypeError("Incompatable alphabets %s and %s" \
1152 % (repr(self.alphabet), repr(other.alphabet)))
1153 #They should be the same sequence type (or one of them is generic)
1154 if isinstance(other, MutableSeq):
1155 #See test_GAQueens.py for an historic usage of a non-string
1156 #alphabet! Comparing the arrays supports this.
1157 return cmp(self.data, other.data)
1159 return cmp(str(self), str(other))
1160 elif isinstance(other, basestring) :
1161 return cmp(str(self), other)
1165 def __len__(self): return len(self.data)
1167 def __getitem__(self, index) :
1168 #Note since Python 2.0, __getslice__ is deprecated
1169 #and __getitem__ is used instead.
1170 #See http://docs.python.org/ref/sequence-methods.html
1171 if isinstance(index, int) :
1172 #Return a single letter as a string
1173 return self.data[index]
1175 #Return the (sub)sequence as another Seq object
1176 return MutableSeq(self.data[index], self.alphabet)
1178 def __setitem__(self, index, value):
1179 #Note since Python 2.0, __setslice__ is deprecated
1180 #and __setitem__ is used instead.
1181 #See http://docs.python.org/ref/sequence-methods.html
1182 if isinstance(index, int) :
1183 #Replacing a single letter with a new string
1184 self.data[index] = value
1186 #Replacing a sub-sequence
1187 if isinstance(value, MutableSeq):
1188 self.data[index] = value.data
1189 elif isinstance(value, type(self.data)):
1190 self.data[index] = value
1192 self.data[index] = array.array("c", str(value))
1194 def __delitem__(self, index):
1195 #Note since Python 2.0, __delslice__ is deprecated
1196 #and __delitem__ is used instead.
1197 #See http://docs.python.org/ref/sequence-methods.html
1199 #Could be deleting a single letter, or a slice
1200 del self.data[index]
1202 def __add__(self, other):
1203 """Add another sequence or string to this sequence.
1205 Returns a new MutableSeq object."""
1206 if hasattr(other, "alphabet") :
1207 #other should be a Seq or a MutableSeq
1208 if not Alphabet._check_type_compatible([self.alphabet,
1210 raise TypeError("Incompatable alphabets %s and %s" \
1211 % (repr(self.alphabet), repr(other.alphabet)))
1212 #They should be the same sequence type (or one of them is generic)
1213 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1214 if isinstance(other, MutableSeq):
1215 #See test_GAQueens.py for an historic usage of a non-string
1216 #alphabet! Adding the arrays should support this.
1217 return self.__class__(self.data + other.data, a)
1219 return self.__class__(str(self) + str(other), a)
1220 elif isinstance(other, basestring) :
1221 #other is a plain string - use the current alphabet
1222 return self.__class__(str(self) + str(other), self.alphabet)
1226 def __radd__(self, other):
1227 if hasattr(other, "alphabet") :
1228 #other should be a Seq or a MutableSeq
1229 if not Alphabet._check_type_compatible([self.alphabet,
1231 raise TypeError("Incompatable alphabets %s and %s" \
1232 % (repr(self.alphabet), repr(other.alphabet)))
1233 #They should be the same sequence type (or one of them is generic)
1234 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1235 if isinstance(other, MutableSeq):
1236 #See test_GAQueens.py for an historic usage of a non-string
1237 #alphabet! Adding the arrays should support this.
1238 return self.__class__(other.data + self.data, a)
1240 return self.__class__(str(other) + str(self), a)
1241 elif isinstance(other, basestring) :
1242 #other is a plain string - use the current alphabet
1243 return self.__class__(str(other) + str(self), self.alphabet)
1247 def append(self, c):
1250 def insert(self, i, c):
1251 self.data.insert(i, c)
1253 def pop(self, i = (-1)):
1258 def remove(self, item):
1259 for i in range(len(self.data)):
1260 if self.data[i] == item:
1263 raise ValueError("MutableSeq.remove(x): x not in list")
1265 def count(self, sub, start=0, end=sys.maxint):
1266 """Non-overlapping count method, like that of a python string.
1268 This behaves like the python string method of the same name,
1269 which does a non-overlapping count!
1271 Returns an integer, the number of occurrences of substring
1272 argument sub in the (sub)sequence given by [start:end].
1273 Optional arguments start and end are interpreted as in slice
1277 - sub - a string or another Seq object to look for
1278 - start - optional integer, slice start
1279 - end - optional integer, slice end
1283 >>> from Bio.Seq import MutableSeq
1284 >>> my_mseq = MutableSeq("AAAATGA")
1285 >>> print my_mseq.count("A")
1287 >>> print my_mseq.count("ATG")
1289 >>> print my_mseq.count(Seq("AT"))
1291 >>> print my_mseq.count("AT", 2, -1)
1294 HOWEVER, please note because that python strings, Seq objects and
1295 MutableSeq objects do a non-overlapping search, this may not give
1296 the answer you expect:
1298 >>> "AAAA".count("AA")
1300 >>> print MutableSeq("AAAA").count("AA")
1303 A non-overlapping search would give the answer as three!
1306 #TODO - Should we check the alphabet?
1307 search = sub.tostring()
1308 except AttributeError :
1311 if not isinstance(search, basestring) :
1312 raise TypeError("expected a string, Seq or MutableSeq")
1314 if len(search) == 1 :
1315 #Try and be efficient and work directly from the array.
1317 for c in self.data[start:end]:
1318 if c == search: count += 1
1321 #TODO - Can we do this more efficiently?
1322 return self.tostring().count(search, start, end)
1324 def index(self, item):
1325 for i in range(len(self.data)):
1326 if self.data[i] == item:
1328 raise ValueError("MutableSeq.index(x): x not in list")
1331 """Modify the mutable sequence to reverse itself.
1337 def complement(self):
1338 """Modify the mutable sequence to take on its complement.
1340 Trying to complement a protein sequence raises an exception.
1344 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1345 Alphabet.ProteinAlphabet) :
1346 raise ValueError("Proteins do not have complements!")
1347 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
1348 d = ambiguous_dna_complement
1349 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
1350 d = ambiguous_rna_complement
1351 elif 'U' in self.data and 'T' in self.data :
1352 #TODO - Handle this cleanly?
1353 raise ValueError("Mixed RNA/DNA found")
1354 elif 'U' in self.data:
1355 d = ambiguous_rna_complement
1357 d = ambiguous_dna_complement
1358 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
1360 self.data = map(lambda c: d[c], self.data)
1361 self.data = array.array('c', self.data)
1363 def reverse_complement(self):
1364 """Modify the mutable sequence to take on its reverse complement.
1366 Trying to reverse complement a protein sequence raises an exception.
1373 ## Sorting a sequence makes no sense.
1374 # def sort(self, *args): self.data.sort(*args)
1376 def extend(self, other):
1377 if isinstance(other, MutableSeq):
1378 for c in other.data:
1385 """Returns the full sequence as a python string.
1387 Although not formally deprecated, you are now encouraged to use
1388 str(my_seq) instead of my_seq.tostring().
1390 Because str(my_seq) will give you the full sequence as a python string,
1391 there is often no need to make an explicit conversion. For example,
1393 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1395 On Biopython 1.44 or older you would have to have done this:
1397 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1399 return "".join(self.data)
1402 """Returns the full sequence as a new immutable Seq object.
1404 >>> from Bio.Seq import Seq
1405 >>> from Bio.Alphabet import IUPAC
1406 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1409 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1411 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1413 Note that the alphabet is preserved.
1415 return Seq("".join(self.data), self.alphabet)
1417 # The transcribe, backward_transcribe, and translate functions are
1418 # user-friendly versions of the corresponding functions in Bio.Transcribe
1419 # and Bio.Translate. The functions work both on Seq objects, and on strings.
1421 def transcribe(dna):
1422 """Transcribes a DNA sequence into RNA.
1424 If given a string, returns a new string object.
1426 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1428 Trying to transcribe a protein or RNA sequence raises an exception.
1432 >>> transcribe("ACTGN")
1435 if isinstance(dna, Seq) :
1436 return dna.transcribe()
1437 elif isinstance(dna, MutableSeq):
1438 return dna.toseq().transcribe()
1440 return dna.replace('T','U').replace('t','u')
1442 def back_transcribe(rna):
1443 """Back-transcribes an RNA sequence into DNA.
1445 If given a string, returns a new string object.
1447 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1449 Trying to transcribe a protein or DNA sequence raises an exception.
1453 >>> back_transcribe("ACUGN")
1456 if isinstance(rna, Seq) :
1457 return rna.back_transcribe()
1458 elif isinstance(rna, MutableSeq):
1459 return rna.toseq().back_transcribe()
1461 return rna.replace('U','T').replace('u','t')
1463 def _translate_str(sequence, table, stop_symbol="*",
1464 to_stop=False, pos_stop="X") :
1465 """Helper function to translate a nucleotide string (PRIVATE).
1468 - sequence - a string
1469 - table - a CodonTable object (NOT a table name or id number)
1470 - stop_symbol - a single character string, what to use for terminators.
1471 - to_stop - boolean, should translation terminate at the first
1472 in frame stop codon? If there is no in-frame stop codon
1473 then translation continues to the end.
1474 - pos_stop - a single character string for a possible stop codon
1481 >>> from Bio.Data import CodonTable
1482 >>> table = CodonTable.ambiguous_dna_by_id[1]
1483 >>> _translate_str("AAA", table)
1485 >>> _translate_str("TAR", table)
1487 >>> _translate_str("TAN", table)
1489 >>> _translate_str("TAN", table, pos_stop="@")
1491 >>> _translate_str("TA?", table)
1492 Traceback (most recent call last):
1494 TranslationError: Codon 'TA?' is invalid
1496 sequence = sequence.upper()
1498 forward_table = table.forward_table
1499 stop_codons = table.stop_codons
1500 if table.nucleotide_alphabet.letters is not None :
1501 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1503 #Assume the worst case, ambiguous DNA or RNA:
1504 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1505 IUPAC.ambiguous_rna.letters.upper())
1508 for i in xrange(0,n-n%3,3) :
1509 codon = sequence[i:i+3]
1511 amino_acids.append(forward_table[codon])
1512 except (KeyError, CodonTable.TranslationError) :
1513 #Todo? Treat "---" as a special case (gapped translation)
1514 if codon in table.stop_codons :
1516 amino_acids.append(stop_symbol)
1517 elif valid_letters.issuperset(set(codon)) :
1518 #Possible stop codon (e.g. NNN or TAN)
1519 amino_acids.append(pos_stop)
1521 raise CodonTable.TranslationError(\
1522 "Codon '%s' is invalid" % codon)
1523 return "".join(amino_acids)
1525 def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1526 """Translate a nucleotide sequence into amino acids.
1528 If given a string, returns a new string object. Given a Seq or
1529 MutableSeq, returns a Seq object with a protein alphabet.
1532 - table - Which codon table to use? This can be either a name
1533 (string) or an NCBI identifier (integer). Defaults
1534 to the "Standard" table.
1535 - stop_symbol - Single character string, what to use for any
1536 terminators, defaults to the asterisk, "*".
1537 - to_stop - Boolean, defaults to False meaning do a full
1538 translation continuing on past any stop codons
1539 (translated as the specified stop_symbol). If
1540 True, translation is terminated at the first in
1541 frame stop codon (and the stop_symbol is not
1542 appended to the returned protein sequence).
1544 A simple string example using the default (standard) genetic code:
1546 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1547 >>> translate(coding_dna)
1549 >>> translate(coding_dna, stop_symbol="@")
1551 >>> translate(coding_dna, to_stop=True)
1554 Now using NCBI table 2, where TGA is not a stop codon:
1556 >>> translate(coding_dna, table=2)
1558 >>> translate(coding_dna, table=2, to_stop=True)
1561 Note that if the sequence has no in-frame stop codon, then the to_stop
1562 argument has no effect:
1564 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1565 >>> translate(coding_dna2)
1567 >>> translate(coding_dna2, to_stop=True)
1570 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1571 or a stop codon. These are translated as "X". Any invalid codon
1572 (e.g. "TA?" or "T-A") will throw a TranslationError.
1574 NOTE - Does NOT support gapped sequences.
1576 It will however translate either DNA or RNA.
1578 if isinstance(sequence, Seq) :
1579 return sequence.translate(table, stop_symbol, to_stop)
1580 elif isinstance(sequence, MutableSeq):
1581 #Return a Seq object
1582 return sequence.toseq().translate(table, stop_symbol, to_stop)
1584 #Assume its a string, return a string
1586 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1588 codon_table = CodonTable.ambiguous_generic_by_name[table]
1589 return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1591 def reverse_complement(sequence):
1592 """Returns the reverse complement sequence of a nucleotide string.
1594 If given a string, returns a new string object.
1595 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
1597 Supports unambiguous and ambiguous nucleotide sequences.
1601 >>> reverse_complement("ACTG-NH")
1604 if isinstance(sequence, Seq) :
1606 return sequence.reverse_complement()
1607 elif isinstance(sequence, MutableSeq) :
1609 #Don't use the MutableSeq reverse_complement method as it is 'in place'.
1610 return sequence.toseq().reverse_complement()
1612 #Assume its a string.
1613 #In order to avoid some code duplication, the old code would turn the string
1614 #into a Seq, use the reverse_complement method, and convert back to a string.
1615 #This worked, but is over five times slower on short sequences!
1616 if ('U' in sequence or 'u' in sequence) \
1617 and ('T' in sequence or 't' in sequence):
1618 raise ValueError("Mixed RNA/DNA found")
1619 elif 'U' in sequence or 'u' in sequence:
1620 ttable = _rna_complement_table
1622 ttable = _dna_complement_table
1623 return sequence.translate(ttable)[::-1]
1626 """Run the Bio.Seq module's doctests."""
1627 print "Runing doctests..."
1632 if __name__ == "__main__":