--- /dev/null
+# Copyright 2000-2002 Brad Chapman.
+# Copyright 2004-2005 by M de Hoon.
+# Copyright 2007-2009 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Provides objects to represent biological sequences with alphabets.
+
+See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
+"""
+__docformat__ ="epytext en" #Don't just use plain text in epydoc API pages!
+
+import string #for maketrans only
+import array
+import sys
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+import Alphabet
+from Alphabet import IUPAC
+from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
+from Bio.Data import CodonTable
+
+def _maketrans(complement_mapping) :
+ """Makes a python string translation table (PRIVATE).
+
+ Arguments:
+ - complement_mapping - a dictionary such as ambiguous_dna_complement
+ and ambiguous_rna_complement from Data.IUPACData.
+
+ Returns a translation table (a string of length 256) for use with the
+ python string's translate method to use in a (reverse) complement.
+
+ Compatible with lower case and upper case sequences.
+
+ For internal use only.
+ """
+ before = ''.join(complement_mapping.keys())
+ after = ''.join(complement_mapping.values())
+ before = before + before.lower()
+ after = after + after.lower()
+ return string.maketrans(before, after)
+
+_dna_complement_table = _maketrans(ambiguous_dna_complement)
+_rna_complement_table = _maketrans(ambiguous_rna_complement)
+
+class Seq(object):
+ """A read-only sequence object (essentially a string with an alphabet).
+
+ Like normal python strings, our basic sequence object is immutable.
+ This prevents you from doing my_seq[5] = "A" for example, but does allow
+ Seq objects to be used as dictionary keys.
+
+ The Seq object provides a number of string like methods (such as count,
+ find, split and strip), which are alphabet aware where appropriate.
+
+ The Seq object also provides some biological methods, such as complement,
+ reverse_complement, transcribe, back_transcribe and translate (which are
+ not applicable to sequences with a protein alphabet).
+ """
+ def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+ """Create a Seq object.
+
+ Arguments:
+ - seq - Sequence, required (string)
+ - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
+
+ You will typically use Bio.SeqIO to read in sequences from files as
+ SeqRecord objects, whose sequence will be exposed as a Seq object via
+ the seq property.
+
+ However, will often want to create your own Seq objects directly:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+ ... IUPAC.protein)
+ >>> my_seq
+ Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+ >>> print my_seq
+ MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+ """
+ # Enforce string storage
+ assert (type(data) == type("") or # must use a string
+ type(data) == type(u"")) # but can be a unicode string
+ self._data = data
+ self.alphabet = alphabet # Seq API requirement
+
+ # A data property is/was a Seq API requirement
+ def _set_data(self, value) :
+ #TODO - In the next release, actually raise an exception?
+ #The Seq object is like a python string, it should be read only!
+ import warnings
+ warnings.warn("Writing to the Seq object's .data propery is deprecated.",
+ DeprecationWarning)
+ self._data = value
+ data = property(fget= lambda self : str(self),
+ fset=_set_data,
+ doc="Sequence as a string (DEPRECATED)")
+
+ def __repr__(self):
+ """Returns a (truncated) representation of the sequence for debugging."""
+ if len(self) > 60 :
+ #Shows the last three letters as it is often useful to see if there
+ #is a stop codon at the end of a sequence.
+ #Note total length is 54+3+3=60
+ return "%s('%s...%s', %s)" % (self.__class__.__name__,
+ str(self)[:54], str(self)[-3:],
+ repr(self.alphabet))
+ else :
+ return "%s(%s, %s)" % (self.__class__.__name__,
+ repr(self.data),
+ repr(self.alphabet))
+ def __str__(self):
+ """Returns the full sequence as a python string.
+
+ Note that Biopython 1.44 and earlier would give a truncated
+ version of repr(my_seq) for str(my_seq). If you are writing code
+ which need to be backwards compatible with old Biopython, you
+ should continue to use my_seq.tostring() rather than str(my_seq).
+ """
+ return self._data
+
+ """
+ TODO - Work out why this breaks test_Restriction.py
+ (Comparing Seq objects would be nice to have. May need to think about
+ hashes and the in operator for when have list/dictionary of Seq objects...)
+ def __cmp__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ return cmp(str(self), str(other))
+ elif isinstance(other, basestring) :
+ return cmp(str(self), other)
+ else :
+ raise TypeError
+ """
+
+ def __len__(self): return len(self._data) # Seq API requirement
+
+ def __getitem__(self, index) : # Seq API requirement
+ #Note since Python 2.0, __getslice__ is deprecated
+ #and __getitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Return a single letter as a string
+ return self._data[index]
+ else :
+ #Return the (sub)sequence as another Seq object
+ return Seq(self._data[index], self.alphabet)
+
+ def __add__(self, other):
+ """Add another sequence or string to this sequence."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ return self.__class__(str(self) + str(other), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(self) + other, self.alphabet)
+ else :
+ raise TypeError
+
+ def __radd__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ return self.__class__(str(other) + str(self), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(other + str(self), self.alphabet)
+ else :
+ raise TypeError
+
+ def tostring(self): # Seq API requirement
+ """Returns the full sequence as a python string.
+
+ Although not formally deprecated, you are now encouraged to use
+ str(my_seq) instead of my_seq.tostring()."""
+ return str(self)
+
+ def tomutable(self): # Needed? Or use a function?
+ """Returns the full sequence as a MutableSeq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
+ ... IUPAC.protein)
+ >>> my_seq
+ Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+ >>> my_seq.tomutable()
+ MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+
+ Note that the alphabet is preserved.
+ """
+ return MutableSeq(str(self), self.alphabet)
+
+ def _get_seq_str_and_check_alphabet(self, other_sequence) :
+ """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
+
+ For a string argument, returns the string.
+
+ For a Seq or MutableSeq, it checks the alphabet is compatible
+ (raising an exception if it isn't), and then returns a string.
+ """
+ try :
+ other_alpha = other_sequence.alphabet
+ except AttributeError :
+ #Assume other_sequence is a string
+ return other_sequence
+
+ #Other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other_alpha)))
+ #Return as a string
+ return str(other_sequence)
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string method of the same name,
+ which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("AAAATGA")
+ >>> print my_seq.count("A")
+ 5
+ >>> print my_seq.count("ATG")
+ 1
+ >>> print my_seq.count(Seq("AT"))
+ 1
+ >>> print my_seq.count("AT", 2, -1)
+ 1
+
+ HOWEVER, please note because python strings and Seq objects (and
+ MutableSeq objects) do a non-overlapping search, this may not give
+ the answer you expect:
+
+ >>> "AAAA".count("AA")
+ 2
+ >>> print Seq("AAAA").count("AA")
+ 2
+
+ A non-overlapping search would give the answer as three!
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).count(sub_str, start, end)
+
+ def find(self, sub, start=0, end=sys.maxint):
+ """Find method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Returns an integer, the index of the first occurrence of substring
+ argument sub in the (sub)sequence given by [start:end].
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ Returns -1 if the subsequence is NOT found.
+
+ e.g. Locating the first typical start codon, AUG, in an RNA sequence:
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.find("AUG")
+ 3
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).find(sub_str, start, end)
+
+ def rfind(self, sub, start=0, end=sys.maxint):
+ """Find from right method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Returns an integer, the index of the last (right most) occurrence of
+ substring argument sub in the (sub)sequence given by [start:end].
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ Returns -1 if the subsequence is NOT found.
+
+ e.g. Locating the last typical start codon, AUG, in an RNA sequence:
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.rfind("AUG")
+ 15
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).rfind(sub_str, start, end)
+
+ def startswith(self, prefix, start=0, end=sys.maxint) :
+ """Does the Seq start with the given prefix? Returns True/False.
+
+ This behaves like the python string method of the same name.
+
+ Return True if the sequence starts with the specified prefix
+ (a string or another Seq object), False otherwise.
+ With optional start, test sequence beginning at that position.
+ With optional end, stop comparing sequence at that position.
+ prefix can also be a tuple of strings to try. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.startswith("GUC")
+ True
+ >>> my_rna.startswith("AUG")
+ False
+ >>> my_rna.startswith("AUG", 3)
+ True
+ >>> my_rna.startswith(("UCC","UCA","UCG"),1)
+ True
+ """
+ #If it has one, check the alphabet:
+ if isinstance(prefix, tuple) :
+ #TODO - Once we drop support for Python 2.4, instead of this
+ #loop offload to the string method (requires Python 2.5+).
+ #Check all the alphabets first...
+ prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
+ for p in prefix]
+ for prefix_str in prefix_strings :
+ if str(self).startswith(prefix_str, start, end) :
+ return True
+ return False
+ else :
+ prefix_str = self._get_seq_str_and_check_alphabet(prefix)
+ return str(self).startswith(prefix_str, start, end)
+
+ def endswith(self, suffix, start=0, end=sys.maxint) :
+ """Does the Seq end with the given suffix? Returns True/False.
+
+ This behaves like the python string method of the same name.
+
+ Return True if the sequence ends with the specified suffix
+ (a string or another Seq object), False otherwise.
+ With optional start, test sequence beginning at that position.
+ With optional end, stop comparing sequence at that position.
+ suffix can also be a tuple of strings to try. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.endswith("UUG")
+ True
+ >>> my_rna.endswith("AUG")
+ False
+ >>> my_rna.endswith("AUG", 0, 18)
+ True
+ >>> my_rna.endswith(("UCC","UCA","UUG"))
+ True
+ """
+ #If it has one, check the alphabet:
+ if isinstance(suffix, tuple) :
+ #TODO - Once we drop support for Python 2.4, instead of this
+ #loop offload to the string method (requires Python 2.5+).
+ #Check all the alphabets first...
+ suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
+ for p in suffix]
+ for suffix_str in suffix_strings :
+ if str(self).endswith(suffix_str, start, end) :
+ return True
+ return False
+ else :
+ suffix_str = self._get_seq_str_and_check_alphabet(suffix)
+ return str(self).endswith(suffix_str, start, end)
+
+
+ def split(self, sep=None, maxsplit=-1) :
+ """Split method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Return a list of the 'words' in the string (as Seq objects),
+ using sep as the delimiter string. If maxsplit is given, at
+ most maxsplit splits are done. If maxsplit is ommited, all
+ splits are made.
+
+ Following the python string method, sep will by default be any
+ white space (tabs, spaces, newlines) but this is unlikely to
+ apply to biological sequences.
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_aa = my_rna.translate()
+ >>> my_aa
+ Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> my_aa.split("*")
+ [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+ >>> my_aa.split("*",1)
+ [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+
+ See also the rsplit method:
+
+ >>> my_aa.rsplit("*",1)
+ [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+ """
+ #If it has one, check the alphabet:
+ sep_str = self._get_seq_str_and_check_alphabet(sep)
+ #TODO - If the sep is the defined stop symbol, or gap char,
+ #should we adjust the alphabet?
+ return [Seq(part, self.alphabet) \
+ for part in str(self).split(sep_str, maxsplit)]
+
+ def rsplit(self, sep=None, maxsplit=-1) :
+ """Right split method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Return a list of the 'words' in the string (as Seq objects),
+ using sep as the delimiter string. If maxsplit is given, at
+ most maxsplit splits are done COUNTING FROM THE RIGHT.
+ If maxsplit is ommited, all splits are made.
+
+ Following the python string method, sep will by default be any
+ white space (tabs, spaces, newlines) but this is unlikely to
+ apply to biological sequences.
+
+ e.g. print my_seq.rsplit("*",1)
+
+ See also the split method.
+ """
+ #If it has one, check the alphabet:
+ sep_str = self._get_seq_str_and_check_alphabet(sep)
+ try :
+ return [Seq(part, self.alphabet) \
+ for part in str(self).rsplit(sep_str, maxsplit)]
+ except AttributeError :
+ #Python 2.3 doesn't have a string rsplit method, which we can
+ #word around by reversing the sequence, using (left) split,
+ #and then reversing the answer. Not very efficient!
+ words = [Seq(word[::-1], self.alphabet) for word \
+ in str(self)[::-1].split(sep_str[::-1], maxsplit)]
+ words.reverse()
+ return words
+
+ def strip(self, chars=None) :
+ """Returns a new Seq object with leading and trailing ends stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ ommitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. print my_seq.strip("-")
+
+ See also the lstrip and rstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).strip(strip_str), self.alphabet)
+
+ def lstrip(self, chars=None) :
+ """Returns a new Seq object with leading (left) end stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ ommitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. print my_seq.lstrip("-")
+
+ See also the strip and rstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).lstrip(strip_str), self.alphabet)
+
+ def rstrip(self, chars=None) :
+ """Returns a new Seq object with trailing (right) end stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ ommitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
+
+ >>> from Bio.Alphabet import IUPAC
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
+ >>> my_seq
+ Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
+ >>> my_seq.rstrip("A")
+ Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
+
+ See also the strip and lstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).rstrip(strip_str), self.alphabet)
+
+ def complement(self):
+ """Returns the complement sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
+ >>> my_dna
+ Seq('CCCCCGATAG', IUPACUnambiguousDNA())
+ >>> my_dna.complement()
+ Seq('GGGGGCTATC', IUPACUnambiguousDNA())
+
+ You can of course used mixed case sequences,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
+ >>> my_dna
+ Seq('CCCCCgatA-GD', DNAAlphabet())
+ >>> my_dna.complement()
+ Seq('GGGGGctaT-CH', DNAAlphabet())
+
+ Note in the above example, ambiguous character D denotes
+ G, A or T so its complement is H (for C, T or A).
+
+ Trying to complement a protein sequence raises an exception.
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.complement()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins do not have complements!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ if isinstance(base, Alphabet.DNAAlphabet) :
+ ttable = _dna_complement_table
+ elif isinstance(base, Alphabet.RNAAlphabet) :
+ ttable = _rna_complement_table
+ elif ('U' in self._data or 'u' in self._data) \
+ and ('T' in self._data or 't' in self._data):
+ #TODO - Handle this cleanly?
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in self._data or 'u' in self._data:
+ ttable = _rna_complement_table
+ else:
+ ttable = _dna_complement_table
+ #Much faster on really long sequences than the previous loop based one.
+ #thx to Michael Palmer, University of Waterloo
+ return Seq(str(self).translate(ttable), self.alphabet)
+
+ def reverse_complement(self):
+ """Returns the reverse complement sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
+ >>> my_dna
+ Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
+ >>> my_dna.reverse_complement()
+ Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
+
+ Note in the above example, since R = G or A, its complement
+ is Y (which denotes C or T).
+
+ You can of course used mixed case sequences,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
+ >>> my_dna
+ Seq('CCCCCgatA-G', DNAAlphabet())
+ >>> my_dna.reverse_complement()
+ Seq('C-TatcGGGGG', DNAAlphabet())
+
+ Trying to complement a protein sequence raises an exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.reverse_complement()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins do not have complements!
+ """
+ #Use -1 stride/step to reverse the complement
+ return self.complement()[::-1]
+
+ def transcribe(self):
+ """Returns the RNA sequence from a DNA sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
+ ... IUPAC.unambiguous_dna)
+ >>> coding_dna
+ Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
+ >>> coding_dna.transcribe()
+ Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
+
+ Trying to transcribe a protein or RNA sequence raises an exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.transcribe()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins cannot be transcribed!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be transcribed!")
+ if isinstance(base, Alphabet.RNAAlphabet) :
+ raise ValueError("RNA cannot be transcribed!")
+
+ if self.alphabet==IUPAC.unambiguous_dna:
+ alphabet = IUPAC.unambiguous_rna
+ elif self.alphabet==IUPAC.ambiguous_dna:
+ alphabet = IUPAC.ambiguous_rna
+ else:
+ alphabet = Alphabet.generic_rna
+ return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
+
+ def back_transcribe(self):
+ """Returns the DNA sequence from an RNA sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
+ ... IUPAC.unambiguous_rna)
+ >>> messenger_rna
+ Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
+ >>> messenger_rna.back_transcribe()
+ Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
+
+ Trying to back-transcribe a protein or DNA sequence raises an
+ exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.back_transcribe()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins cannot be back transcribed!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be back transcribed!")
+ if isinstance(base, Alphabet.DNAAlphabet) :
+ raise ValueError("DNA cannot be back transcribed!")
+
+ if self.alphabet==IUPAC.unambiguous_rna:
+ alphabet = IUPAC.unambiguous_dna
+ elif self.alphabet==IUPAC.ambiguous_rna:
+ alphabet = IUPAC.ambiguous_dna
+ else:
+ alphabet = Alphabet.generic_dna
+ return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
+
+ def translate(self, table="Standard", stop_symbol="*", to_stop=False):
+ """Turns a nucleotide sequence into a protein sequence. New Seq object.
+
+ This method will translate DNA or RNA sequences, and those with a
+ nucleotide or generic alphabet. Trying to translate a protein
+ sequence raises an exception.
+
+ Arguments:
+ - table - Which codon table to use? This can be either a name
+ (string) or an NCBI identifier (integer). This defaults
+ to the "Standard" table.
+ - stop_symbol - Single character string, what to use for terminators.
+ This defaults to the asterisk, "*".
+ - to_stop - Boolean, defaults to False meaning do a full translation
+ continuing on past any stop codons (translated as the
+ specified stop_symbol). If True, translation is
+ terminated at the first in frame stop codon (and the
+ stop_symbol is not appended to the returned protein
+ sequence).
+
+ e.g. Using the standard table:
+
+ >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
+ >>> coding_dna.translate()
+ Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> coding_dna.translate(stop_symbol="@")
+ Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
+ >>> coding_dna.translate(to_stop=True)
+ Seq('VAIVMGR', ExtendedIUPACProtein())
+
+ Now using NCBI table 2, where TGA is not a stop codon:
+
+ >>> coding_dna.translate(table=2)
+ Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> coding_dna.translate(table=2, to_stop=True)
+ Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
+
+ If the sequence has no in-frame stop codon, then the to_stop argument
+ has no effect:
+
+ >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
+ >>> coding_dna2.translate()
+ Seq('LAIVMGR', ExtendedIUPACProtein())
+ >>> coding_dna2.translate(to_stop=True)
+ Seq('LAIVMGR', ExtendedIUPACProtein())
+
+ NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+ or a stop codon. These are translated as "X". Any invalid codon
+ (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+ NOTE - Does NOT support gapped sequences.
+
+ NOTE - This does NOT behave like the python string's translate
+ method. For that use str(my_seq).translate(...) instead.
+ """
+ try:
+ table_id = int(table)
+ except ValueError:
+ table_id = None
+ if isinstance(table, str) and len(table)==256 :
+ raise ValueError("The Seq object translate method DOES NOT take " \
+ + "a 256 character string mapping table like " \
+ + "the python string object's translate method. " \
+ + "Use str(my_seq).translate(...) instead.")
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be translated!")
+ if self.alphabet==IUPAC.unambiguous_dna:
+ #Will use standard IUPAC protein alphabet, no need for X
+ if table_id is None:
+ codon_table = CodonTable.unambiguous_dna_by_name[table]
+ else:
+ codon_table = CodonTable.unambiguous_dna_by_id[table_id]
+ #elif self.alphabet==IUPAC.ambiguous_dna:
+ # if table_id is None:
+ # codon_table = CodonTable.ambiguous_dna_by_name[table]
+ # else:
+ # codon_table = CodonTable.ambiguous_dna_by_id[table_id]
+ elif self.alphabet==IUPAC.unambiguous_rna:
+ #Will use standard IUPAC protein alphabet, no need for X
+ if table_id is None:
+ codon_table = CodonTable.unambiguous_rna_by_name[table]
+ else:
+ codon_table = CodonTable.unambiguous_rna_by_id[table_id]
+ #elif self.alphabet==IUPAC.ambiguous_rna:
+ # if table_id is None:
+ # codon_table = CodonTable.ambiguous_rna_by_name[table]
+ # else:
+ # codon_table = CodonTable.ambiguous_rna_by_id[table_id]
+ else:
+ #This will use the extend IUPAC protein alphabet with X etc.
+ #The same table can be used for RNA or DNA (we use this for
+ #translating strings).
+ if table_id is None:
+ codon_table = CodonTable.ambiguous_generic_by_name[table]
+ else:
+ codon_table = CodonTable.ambiguous_generic_by_id[table_id]
+ protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
+ if stop_symbol in protein :
+ alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
+ stop_symbol = stop_symbol)
+ else :
+ alphabet = codon_table.protein_alphabet
+ return Seq(protein, alphabet)
+
+class UnknownSeq(Seq):
+ """A read-only sequence object of known length but unknown contents.
+
+ If you have an unknown sequence, you can represent this with a normal
+ Seq object, for example:
+
+ >>> my_seq = Seq("N"*5)
+ >>> my_seq
+ Seq('NNNNN', Alphabet())
+ >>> len(my_seq)
+ 5
+ >>> print my_seq
+ NNNNN
+
+ However, this is rather wasteful of memory (especially for large
+ sequences), which is where this class is most usefull:
+
+ >>> unk_five = UnknownSeq(5)
+ >>> unk_five
+ UnknownSeq(5, alphabet = Alphabet(), character = '?')
+ >>> len(unk_five)
+ 5
+ >>> print(unk_five)
+ ?????
+
+ You can add unknown sequence together, provided their alphabets and
+ characters are compatible, and get another memory saving UnknownSeq:
+
+ >>> unk_four = UnknownSeq(4)
+ >>> unk_four
+ UnknownSeq(4, alphabet = Alphabet(), character = '?')
+ >>> unk_four + unk_five
+ UnknownSeq(9, alphabet = Alphabet(), character = '?')
+
+ If the alphabet or characters don't match up, the addition gives an
+ ordinary Seq object:
+
+ >>> unk_nnnn = UnknownSeq(4, character = "N")
+ >>> unk_nnnn
+ UnknownSeq(4, alphabet = Alphabet(), character = 'N')
+ >>> unk_nnnn + unk_four
+ Seq('NNNN????', Alphabet())
+
+ Combining with a real Seq gives a new Seq object:
+
+ >>> known_seq = Seq("ACGT")
+ >>> unk_four + known_seq
+ Seq('????ACGT', Alphabet())
+ >>> known_seq + unk_four
+ Seq('ACGT????', Alphabet())
+ """
+ def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) :
+ """Create a new UnknownSeq object.
+
+ If character is ommited, it is determed from the alphabet, "N" for
+ nucleotides, "X" for proteins, and "?" otherwise.
+ """
+ self._length = int(length)
+ if self._length < 0 :
+ #TODO - Block zero length UnknownSeq? You can just use a Seq!
+ raise ValueError("Length must not be negative.")
+ self.alphabet = alphabet
+ if character :
+ if len(character) != 1 :
+ raise ValueError("character argument should be a single letter string.")
+ self._character = character
+ else :
+ base = Alphabet._get_base_alphabet(alphabet)
+ #TODO? Check the case of the letters in the alphabet?
+ #We may have to use "n" instead of "N" etc.
+ if isinstance(base, Alphabet.NucleotideAlphabet) :
+ self._character = "N"
+ elif isinstance(base, Alphabet.ProteinAlphabet) :
+ self._character = "X"
+ else :
+ self._character = "?"
+
+ def __len__(self) :
+ """Returns the stated length of the unknown sequence."""
+ return self._length
+
+ def __str__(self) :
+ """Returns the unknown sequence as full string of the given length."""
+ return self._character * self._length
+
+ def __repr__(self):
+ return "UnknownSeq(%i, alphabet = %s, character = %s)" \
+ % (self._length, repr(self.alphabet), repr(self._character))
+
+ def __add__(self, other) :
+ if isinstance(other, UnknownSeq) \
+ and other._character == self._character :
+ #TODO - Check the alphabets match
+ return UnknownSeq(len(self)+len(other),
+ self.alphabet, self._character)
+ #Offload to the base class...
+ return Seq(str(self), self.alphabet) + other
+
+ def __radd__(self, other) :
+ if isinstance(other, UnknownSeq) \
+ and other._character == self._character :
+ #TODO - Check the alphabets match
+ return UnknownSeq(len(self)+len(other),
+ self.alphabet, self._character)
+ #Offload to the base class...
+ return other + Seq(str(self), self.alphabet)
+
+ def __getitem__(self, index):
+ if isinstance(index, int) :
+ #TODO - Check the bounds without wasting memory
+ return str(self)[index]
+ else :
+ #TODO - Work out the length without wasting memory
+ return UnknownSeq(len(("#"*self._length)[index]),
+ self.alphabet, self._character)
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string (and Seq object) method of the
+ same name, which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ >>> "NNNN".count("N")
+ 4
+ >>> Seq("NNNN").count("N")
+ 4
+ >>> UnknownSeq(4, character="N").count("N")
+ 4
+ >>> UnknownSeq(4, character="N").count("A")
+ 0
+ >>> UnknownSeq(4, character="N").count("AA")
+ 0
+
+ HOWEVER, please note because that python strings and Seq objects (and
+ MutableSeq objects) do a non-overlapping search, this may not give
+ the answer you expect:
+
+ >>> UnknownSeq(4, character="N").count("NN")
+ 2
+ >>> UnknownSeq(4, character="N").count("NNN")
+ 1
+ """
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ if len(sub_str) == 1 :
+ if str(sub_str) == self._character :
+ if start==0 and end >= self._length :
+ return self._length
+ else :
+ #This could be done more cleverly...
+ return str(self).count(sub_str, start, end)
+ else :
+ return 0
+ else :
+ if set(sub_str) == set(self._character) :
+ if start==0 and end >= self._length :
+ return self._length // len(sub_str)
+ else :
+ #This could be done more cleverly...
+ return str(self).count(sub_str, start, end)
+ else :
+ return 0
+
+ def complement(self) :
+ """The complement of an unknown nucleotide equals itself.
+
+ >>> my_nuc = UnknownSeq(8)
+ >>> my_nuc
+ UnknownSeq(8, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc
+ ????????
+ >>> my_nuc.complement()
+ UnknownSeq(8, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc.complement()
+ ????????
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ return self
+
+ def reverse_complement(self) :
+ """The reverse complement of an unknown nucleotide equals itself.
+
+ >>> my_nuc = UnknownSeq(10)
+ >>> my_nuc
+ UnknownSeq(10, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc
+ ??????????
+ >>> my_nuc.reverse_complement()
+ UnknownSeq(10, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc.reverse_complement()
+ ??????????
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ return self
+
+ def transcribe(self) :
+ """Returns unknown RNA sequence from an unknown DNA sequence.
+
+ >>> my_dna = UnknownSeq(10, character="N")
+ >>> my_dna
+ UnknownSeq(10, alphabet = Alphabet(), character = 'N')
+ >>> print my_dna
+ NNNNNNNNNN
+ >>> my_rna = my_dna.transcribe()
+ >>> my_rna
+ UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
+ >>> print my_rna
+ NNNNNNNNNN
+ """
+ #Offload the alphabet stuff
+ s = Seq(self._character, self.alphabet).transcribe()
+ return UnknownSeq(self._length, s.alphabet, self._character)
+
+ def back_transcribe(self) :
+ """Returns unknown DNA sequence from an unknown RNA sequence.
+
+ >>> my_rna = UnknownSeq(20, character="N")
+ >>> my_rna
+ UnknownSeq(20, alphabet = Alphabet(), character = 'N')
+ >>> print my_rna
+ NNNNNNNNNNNNNNNNNNNN
+ >>> my_dna = my_rna.back_transcribe()
+ >>> my_dna
+ UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
+ >>> print my_dna
+ NNNNNNNNNNNNNNNNNNNN
+ """
+ #Offload the alphabet stuff
+ s = Seq(self._character, self.alphabet).back_transcribe()
+ return UnknownSeq(self._length, s.alphabet, self._character)
+
+ def translate(self, **kwargs) :
+ """Translate an unknown nucleotide sequence into an unknown protein.
+
+ e.g.
+
+ >>> my_seq = UnknownSeq(11, character="N")
+ >>> print my_seq
+ NNNNNNNNNNN
+ >>> my_protein = my_seq.translate()
+ >>> my_protein
+ UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
+ >>> print my_protein
+ XXX
+
+ In comparison, using a normal Seq object:
+
+ >>> my_seq = Seq("NNNNNNNNNNN")
+ >>> print my_seq
+ NNNNNNNNNNN
+ >>> my_protein = my_seq.translate()
+ >>> my_protein
+ Seq('XXX', ExtendedIUPACProtein())
+ >>> print my_protein
+ XXX
+
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be translated!")
+ return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
+
+
+class MutableSeq(object):
+ """An editable sequence object (with an alphabet).
+
+ Unlike normal python strings and our basic sequence object (the Seq class)
+ which are immuatable, the MutableSeq lets you edit the sequence in place.
+ However, this means you cannot use a MutableSeq object as a dictionary key.
+
+ >>> from Bio.Seq import MutableSeq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
+ >>> my_seq
+ MutableSeq('ACTCGTCGTCG', DNAAlphabet())
+ >>> my_seq[5]
+ 'T'
+ >>> my_seq[5] = "A"
+ >>> my_seq
+ MutableSeq('ACTCGACGTCG', DNAAlphabet())
+ >>> my_seq[5]
+ 'A'
+ >>> my_seq[5:8] = "NNN"
+ >>> my_seq
+ MutableSeq('ACTCGNNNTCG', DNAAlphabet())
+ >>> len(my_seq)
+ 11
+
+ Note that the MutableSeq object does not support as many string-like
+ or biological methods as the Seq object.
+ """
+ def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+ if type(data) == type(""):
+ self.data = array.array("c", data)
+ else:
+ self.data = data # assumes the input is an array
+ self.alphabet = alphabet
+
+ def __repr__(self):
+ """Returns a (truncated) representation of the sequence for debugging."""
+ if len(self) > 60 :
+ #Shows the last three letters as it is often useful to see if there
+ #is a stop codon at the end of a sequence.
+ #Note total length is 54+3+3=60
+ return "%s('%s...%s', %s)" % (self.__class__.__name__,
+ str(self[:54]), str(self[-3:]),
+ repr(self.alphabet))
+ else :
+ return "%s('%s', %s)" % (self.__class__.__name__,
+ str(self),
+ repr(self.alphabet))
+
+ def __str__(self):
+ """Returns the full sequence as a python string.
+
+ Note that Biopython 1.44 and earlier would give a truncated
+ version of repr(my_seq) for str(my_seq). If you are writing code
+ which needs to be backwards compatible with old Biopython, you
+ should continue to use my_seq.tostring() rather than str(my_seq).
+ """
+ #See test_GAQueens.py for an historic usage of a non-string alphabet!
+ return "".join(self.data)
+
+ def __cmp__(self, other):
+ """Compare the sequence for to another sequence or a string.
+
+ If compared to another sequence the alphabets must be compatible.
+ Comparing DNA to RNA, or Nucleotide to Protein will raise an
+ exception.
+
+ Otherwise only the sequence itself is compared, not the precise
+ alphabet.
+
+ This method indirectly supports ==, < , etc."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Comparing the arrays supports this.
+ return cmp(self.data, other.data)
+ else :
+ return cmp(str(self), str(other))
+ elif isinstance(other, basestring) :
+ return cmp(str(self), other)
+ else :
+ raise TypeError
+
+ def __len__(self): return len(self.data)
+
+ def __getitem__(self, index) :
+ #Note since Python 2.0, __getslice__ is deprecated
+ #and __getitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Return a single letter as a string
+ return self.data[index]
+ else :
+ #Return the (sub)sequence as another Seq object
+ return MutableSeq(self.data[index], self.alphabet)
+
+ def __setitem__(self, index, value):
+ #Note since Python 2.0, __setslice__ is deprecated
+ #and __setitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Replacing a single letter with a new string
+ self.data[index] = value
+ else :
+ #Replacing a sub-sequence
+ if isinstance(value, MutableSeq):
+ self.data[index] = value.data
+ elif isinstance(value, type(self.data)):
+ self.data[index] = value
+ else:
+ self.data[index] = array.array("c", str(value))
+
+ def __delitem__(self, index):
+ #Note since Python 2.0, __delslice__ is deprecated
+ #and __delitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+
+ #Could be deleting a single letter, or a slice
+ del self.data[index]
+
+ def __add__(self, other):
+ """Add another sequence or string to this sequence.
+
+ Returns a new MutableSeq object."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Adding the arrays should support this.
+ return self.__class__(self.data + other.data, a)
+ else :
+ return self.__class__(str(self) + str(other), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(self) + str(other), self.alphabet)
+ else :
+ raise TypeError
+
+ def __radd__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Adding the arrays should support this.
+ return self.__class__(other.data + self.data, a)
+ else :
+ return self.__class__(str(other) + str(self), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(other) + str(self), self.alphabet)
+ else :
+ raise TypeError
+
+ def append(self, c):
+ self.data.append(c)
+
+ def insert(self, i, c):
+ self.data.insert(i, c)
+
+ def pop(self, i = (-1)):
+ c = self.data[i]
+ del self.data[i]
+ return c
+
+ def remove(self, item):
+ for i in range(len(self.data)):
+ if self.data[i] == item:
+ del self.data[i]
+ return
+ raise ValueError("MutableSeq.remove(x): x not in list")
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string method of the same name,
+ which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ e.g.
+
+ >>> from Bio.Seq import MutableSeq
+ >>> my_mseq = MutableSeq("AAAATGA")
+ >>> print my_mseq.count("A")
+ 5
+ >>> print my_mseq.count("ATG")
+ 1
+ >>> print my_mseq.count(Seq("AT"))
+ 1
+ >>> print my_mseq.count("AT", 2, -1)
+ 1
+
+ HOWEVER, please note because that python strings, Seq objects and
+ MutableSeq objects do a non-overlapping search, this may not give
+ the answer you expect:
+
+ >>> "AAAA".count("AA")
+ 2
+ >>> print MutableSeq("AAAA").count("AA")
+ 2
+
+ A non-overlapping search would give the answer as three!
+ """
+ try :
+ #TODO - Should we check the alphabet?
+ search = sub.tostring()
+ except AttributeError :
+ search = sub
+
+ if not isinstance(search, basestring) :
+ raise TypeError("expected a string, Seq or MutableSeq")
+
+ if len(search) == 1 :
+ #Try and be efficient and work directly from the array.
+ count = 0
+ for c in self.data[start:end]:
+ if c == search: count += 1
+ return count
+ else :
+ #TODO - Can we do this more efficiently?
+ return self.tostring().count(search, start, end)
+
+ def index(self, item):
+ for i in range(len(self.data)):
+ if self.data[i] == item:
+ return i
+ raise ValueError("MutableSeq.index(x): x not in list")
+
+ def reverse(self):
+ """Modify the mutable sequence to reverse itself.
+
+ No return value.
+ """
+ self.data.reverse()
+
+ def complement(self):
+ """Modify the mutable sequence to take on its complement.
+
+ Trying to complement a protein sequence raises an exception.
+
+ No return value.
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
+ d = ambiguous_dna_complement
+ elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
+ d = ambiguous_rna_complement
+ elif 'U' in self.data and 'T' in self.data :
+ #TODO - Handle this cleanly?
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in self.data:
+ d = ambiguous_rna_complement
+ else:
+ d = ambiguous_dna_complement
+ c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
+ d.update(c)
+ self.data = map(lambda c: d[c], self.data)
+ self.data = array.array('c', self.data)
+
+ def reverse_complement(self):
+ """Modify the mutable sequence to take on its reverse complement.
+
+ Trying to reverse complement a protein sequence raises an exception.
+
+ No return value.
+ """
+ self.complement()
+ self.data.reverse()
+
+ ## Sorting a sequence makes no sense.
+ # def sort(self, *args): self.data.sort(*args)
+
+ def extend(self, other):
+ if isinstance(other, MutableSeq):
+ for c in other.data:
+ self.data.append(c)
+ else:
+ for c in other:
+ self.data.append(c)
+
+ def tostring(self):
+ """Returns the full sequence as a python string.
+
+ Although not formally deprecated, you are now encouraged to use
+ str(my_seq) instead of my_seq.tostring().
+
+ Because str(my_seq) will give you the full sequence as a python string,
+ there is often no need to make an explicit conversion. For example,
+
+ print "ID={%s}, sequence={%s}" % (my_name, my_seq)
+
+ On Biopython 1.44 or older you would have to have done this:
+
+ print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
+ """
+ return "".join(self.data)
+
+ def toseq(self):
+ """Returns the full sequence as a new immutable Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
+ IUPAC.protein)
+ >>> my_mseq
+ MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+ >>> my_mseq.toseq()
+ Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+
+ Note that the alphabet is preserved.
+ """
+ return Seq("".join(self.data), self.alphabet)
+
+# The transcribe, backward_transcribe, and translate functions are
+# user-friendly versions of the corresponding functions in Bio.Transcribe
+# and Bio.Translate. The functions work both on Seq objects, and on strings.
+
+def transcribe(dna):
+ """Transcribes a DNA sequence into RNA.
+
+ If given a string, returns a new string object.
+
+ Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
+
+ Trying to transcribe a protein or RNA sequence raises an exception.
+
+ e.g.
+
+ >>> transcribe("ACTGN")
+ 'ACUGN'
+ """
+ if isinstance(dna, Seq) :
+ return dna.transcribe()
+ elif isinstance(dna, MutableSeq):
+ return dna.toseq().transcribe()
+ else:
+ return dna.replace('T','U').replace('t','u')
+
+def back_transcribe(rna):
+ """Back-transcribes an RNA sequence into DNA.
+
+ If given a string, returns a new string object.
+
+ Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
+
+ Trying to transcribe a protein or DNA sequence raises an exception.
+
+ e.g.
+
+ >>> back_transcribe("ACUGN")
+ 'ACTGN'
+ """
+ if isinstance(rna, Seq) :
+ return rna.back_transcribe()
+ elif isinstance(rna, MutableSeq):
+ return rna.toseq().back_transcribe()
+ else:
+ return rna.replace('U','T').replace('u','t')
+
+def _translate_str(sequence, table, stop_symbol="*",
+ to_stop=False, pos_stop="X") :
+ """Helper function to translate a nucleotide string (PRIVATE).
+
+ Arguments:
+ - sequence - a string
+ - table - a CodonTable object (NOT a table name or id number)
+ - stop_symbol - a single character string, what to use for terminators.
+ - to_stop - boolean, should translation terminate at the first
+ in frame stop codon? If there is no in-frame stop codon
+ then translation continues to the end.
+ - pos_stop - a single character string for a possible stop codon
+ (e.g. TAN or NNN)
+
+ Returns a string.
+
+ e.g.
+
+ >>> from Bio.Data import CodonTable
+ >>> table = CodonTable.ambiguous_dna_by_id[1]
+ >>> _translate_str("AAA", table)
+ 'K'
+ >>> _translate_str("TAR", table)
+ '*'
+ >>> _translate_str("TAN", table)
+ 'X'
+ >>> _translate_str("TAN", table, pos_stop="@")
+ '@'
+ >>> _translate_str("TA?", table)
+ Traceback (most recent call last):
+ ...
+ TranslationError: Codon 'TA?' is invalid
+ """
+ sequence = sequence.upper()
+ amino_acids = []
+ forward_table = table.forward_table
+ stop_codons = table.stop_codons
+ if table.nucleotide_alphabet.letters is not None :
+ valid_letters = set(table.nucleotide_alphabet.letters.upper())
+ else :
+ #Assume the worst case, ambiguous DNA or RNA:
+ valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
+ IUPAC.ambiguous_rna.letters.upper())
+
+ n = len(sequence)
+ for i in xrange(0,n-n%3,3) :
+ codon = sequence[i:i+3]
+ try :
+ amino_acids.append(forward_table[codon])
+ except (KeyError, CodonTable.TranslationError) :
+ #Todo? Treat "---" as a special case (gapped translation)
+ if codon in table.stop_codons :
+ if to_stop : break
+ amino_acids.append(stop_symbol)
+ elif valid_letters.issuperset(set(codon)) :
+ #Possible stop codon (e.g. NNN or TAN)
+ amino_acids.append(pos_stop)
+ else :
+ raise CodonTable.TranslationError(\
+ "Codon '%s' is invalid" % codon)
+ return "".join(amino_acids)
+
+def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
+ """Translate a nucleotide sequence into amino acids.
+
+ If given a string, returns a new string object. Given a Seq or
+ MutableSeq, returns a Seq object with a protein alphabet.
+
+ Arguments:
+ - table - Which codon table to use? This can be either a name
+ (string) or an NCBI identifier (integer). Defaults
+ to the "Standard" table.
+ - stop_symbol - Single character string, what to use for any
+ terminators, defaults to the asterisk, "*".
+ - to_stop - Boolean, defaults to False meaning do a full
+ translation continuing on past any stop codons
+ (translated as the specified stop_symbol). If
+ True, translation is terminated at the first in
+ frame stop codon (and the stop_symbol is not
+ appended to the returned protein sequence).
+
+ A simple string example using the default (standard) genetic code:
+
+ >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
+ >>> translate(coding_dna)
+ 'VAIVMGR*KGAR*'
+ >>> translate(coding_dna, stop_symbol="@")
+ 'VAIVMGR@KGAR@'
+ >>> translate(coding_dna, to_stop=True)
+ 'VAIVMGR'
+
+ Now using NCBI table 2, where TGA is not a stop codon:
+
+ >>> translate(coding_dna, table=2)
+ 'VAIVMGRWKGAR*'
+ >>> translate(coding_dna, table=2, to_stop=True)
+ 'VAIVMGRWKGAR'
+
+ Note that if the sequence has no in-frame stop codon, then the to_stop
+ argument has no effect:
+
+ >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
+ >>> translate(coding_dna2)
+ 'VAIVMGR'
+ >>> translate(coding_dna2, to_stop=True)
+ 'VAIVMGR'
+
+ NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+ or a stop codon. These are translated as "X". Any invalid codon
+ (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+ NOTE - Does NOT support gapped sequences.
+
+ It will however translate either DNA or RNA.
+ """
+ if isinstance(sequence, Seq) :
+ return sequence.translate(table, stop_symbol, to_stop)
+ elif isinstance(sequence, MutableSeq):
+ #Return a Seq object
+ return sequence.toseq().translate(table, stop_symbol, to_stop)
+ else:
+ #Assume its a string, return a string
+ try :
+ codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
+ except ValueError :
+ codon_table = CodonTable.ambiguous_generic_by_name[table]
+ return _translate_str(sequence, codon_table, stop_symbol, to_stop)
+
+def reverse_complement(sequence):
+ """Returns the reverse complement sequence of a nucleotide string.
+
+ If given a string, returns a new string object.
+ Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
+
+ Supports unambiguous and ambiguous nucleotide sequences.
+
+ e.g.
+
+ >>> reverse_complement("ACTG-NH")
+ 'DN-CAGT'
+ """
+ if isinstance(sequence, Seq) :
+ #Return a Seq
+ return sequence.reverse_complement()
+ elif isinstance(sequence, MutableSeq) :
+ #Return a Seq
+ #Don't use the MutableSeq reverse_complement method as it is 'in place'.
+ return sequence.toseq().reverse_complement()
+
+ #Assume its a string.
+ #In order to avoid some code duplication, the old code would turn the string
+ #into a Seq, use the reverse_complement method, and convert back to a string.
+ #This worked, but is over five times slower on short sequences!
+ if ('U' in sequence or 'u' in sequence) \
+ and ('T' in sequence or 't' in sequence):
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in sequence or 'u' in sequence:
+ ttable = _rna_complement_table
+ else:
+ ttable = _dna_complement_table
+ return sequence.translate(ttable)[::-1]
+
+def _test():
+ """Run the Bio.Seq module's doctests."""
+ print "Runing doctests..."
+ import doctest
+ doctest.testmod()
+ print "Done"
+
+if __name__ == "__main__":
+ _test()