binaries/src/disembl/biopython-1.50/Bio/Seq.py

   1 # Copyright 2000-2002 Brad Chapman.
   2 # Copyright 2004-2005 by M de Hoon.
   3 # Copyright 2007-2009 by Peter Cock.
   4 # All rights reserved.
   5 # This code is part of the Biopython distribution and governed by its
   6 # license.  Please see the LICENSE file that should have been included
   7 # as part of this package.
   8 """Provides objects to represent biological sequences with alphabets.
   9
  10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
  11  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
  12  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
  13 """
  14 __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages!
  15
  16 import string #for maketrans only
  17 import array
  18 import sys
  19
  20 #TODO - Remove this work around once we drop python 2.3 support
  21 try:
  22     set = set
  23 except NameError:
  24     from sets import Set as set
  25
  26 import Alphabet
  27 from Alphabet import IUPAC
  28 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
  29 from Bio.Data import CodonTable
  30
  31 def _maketrans(complement_mapping) :
  32     """Makes a python string translation table (PRIVATE).
  33
  34     Arguments:
  35      - complement_mapping - a dictionary such as ambiguous_dna_complement
  36        and ambiguous_rna_complement from Data.IUPACData.
  37
  38     Returns a translation table (a string of length 256) for use with the
  39     python string's translate method to use in a (reverse) complement.
  40
  41     Compatible with lower case and upper case sequences.
  42
  43     For internal use only.
  44     """
  45     before = ''.join(complement_mapping.keys())
  46     after  = ''.join(complement_mapping.values())
  47     before = before + before.lower()
  48     after  = after + after.lower()
  49     return string.maketrans(before, after)
  50
  51 _dna_complement_table = _maketrans(ambiguous_dna_complement)
  52 _rna_complement_table = _maketrans(ambiguous_rna_complement)
  53
  54 class Seq(object):
  55     """A read-only sequence object (essentially a string with an alphabet).
  56
  57     Like normal python strings, our basic sequence object is immutable.
  58     This prevents you from doing my_seq[5] = "A" for example, but does allow
  59     Seq objects to be used as dictionary keys.
  60
  61     The Seq object provides a number of string like methods (such as count,
  62     find, split and strip), which are alphabet aware where appropriate.
  63
  64     The Seq object also provides some biological methods, such as complement,
  65     reverse_complement, transcribe, back_transcribe and translate (which are
  66     not applicable to sequences with a protein alphabet).
  67     """
  68     def __init__(self, data, alphabet = Alphabet.generic_alphabet):
  69         """Create a Seq object.
  70
  71         Arguments:
  72          - seq      - Sequence, required (string)
  73          - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
  74
  75         You will typically use Bio.SeqIO to read in sequences from files as
  76         SeqRecord objects, whose sequence will be exposed as a Seq object via
  77         the seq property.
  78
  79         However, will often want to create your own Seq objects directly:
  80
  81         >>> from Bio.Seq import Seq
  82         >>> from Bio.Alphabet import IUPAC
  83         >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
  84         ...              IUPAC.protein)
  85         >>> my_seq
  86         Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
  87         >>> print my_seq
  88         MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
  89         """
  90         # Enforce string storage
  91         assert (type(data) == type("") or # must use a string
  92                 type(data) == type(u""))  # but can be a unicode string
  93         self._data = data
  94         self.alphabet = alphabet  # Seq API requirement
  95
  96     # A data property is/was a Seq API requirement
  97     def _set_data(self, value) :
  98         #TODO - In the next release, actually raise an exception?
  99         #The Seq object is like a python string, it should be read only!
 100         import warnings
 101         warnings.warn("Writing to the Seq object's .data propery is deprecated.",
 102                       DeprecationWarning)
 103         self._data = value
 104     data = property(fget= lambda self : str(self),
 105                     fset=_set_data,
 106                     doc="Sequence as a string (DEPRECATED)")
 107
 108     def __repr__(self):
 109         """Returns a (truncated) representation of the sequence for debugging."""
 110         if len(self) > 60 :
 111             #Shows the last three letters as it is often useful to see if there
 112             #is a stop codon at the end of a sequence.
 113             #Note total length is 54+3+3=60
 114             return "%s('%s...%s', %s)" % (self.__class__.__name__,
 115                                    str(self)[:54], str(self)[-3:],
 116                                    repr(self.alphabet))
 117         else :
 118             return "%s(%s, %s)" % (self.__class__.__name__,
 119                                   repr(self.data),
 120                                    repr(self.alphabet))
 121     def __str__(self):
 122         """Returns the full sequence as a python string.
 123
 124         Note that Biopython 1.44 and earlier would give a truncated
 125         version of repr(my_seq) for str(my_seq).  If you are writing code
 126         which need to be backwards compatible with old Biopython, you
 127         should continue to use my_seq.tostring() rather than str(my_seq).
 128         """
 129         return self._data
 130
 131     """
 132     TODO - Work out why this breaks test_Restriction.py
 133     (Comparing Seq objects would be nice to have.  May need to think about
 134     hashes and the in operator for when have list/dictionary of Seq objects...)
 135     def __cmp__(self, other):
 136         if hasattr(other, "alphabet") :
 137             #other should be a Seq or a MutableSeq
 138             if not Alphabet._check_type_compatible([self.alphabet,
 139                                                     other.alphabet]) :
 140                 raise TypeError("Incompatable alphabets %s and %s" \
 141                                 % (repr(self.alphabet), repr(other.alphabet)))
 142             #They should be the same sequence type (or one of them is generic)
 143             return cmp(str(self), str(other))
 144         elif isinstance(other, basestring) :
 145             return cmp(str(self), other)
 146         else :
 147             raise TypeError
 148     """
 149
 150     def __len__(self): return len(self._data)       # Seq API requirement
 151
 152     def __getitem__(self, index) :                 # Seq API requirement
 153         #Note since Python 2.0, __getslice__ is deprecated
 154         #and __getitem__ is used instead.
 155         #See http://docs.python.org/ref/sequence-methods.html
 156         if isinstance(index, int) :
 157             #Return a single letter as a string
 158             return self._data[index]
 159         else :
 160             #Return the (sub)sequence as another Seq object
 161             return Seq(self._data[index], self.alphabet)
 162
 163     def __add__(self, other):
 164         """Add another sequence or string to this sequence."""
 165         if hasattr(other, "alphabet") :
 166             #other should be a Seq or a MutableSeq
 167             if not Alphabet._check_type_compatible([self.alphabet,
 168                                                     other.alphabet]) :
 169                 raise TypeError("Incompatable alphabets %s and %s" \
 170                                 % (repr(self.alphabet), repr(other.alphabet)))
 171             #They should be the same sequence type (or one of them is generic)
 172             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
 173             return self.__class__(str(self) + str(other), a)
 174         elif isinstance(other, basestring) :
 175             #other is a plain string - use the current alphabet
 176             return self.__class__(str(self) + other, self.alphabet)
 177         else :
 178             raise TypeError
 179
 180     def __radd__(self, other):
 181         if hasattr(other, "alphabet") :
 182             #other should be a Seq or a MutableSeq
 183             if not Alphabet._check_type_compatible([self.alphabet,
 184                                                     other.alphabet]) :
 185                 raise TypeError("Incompatable alphabets %s and %s" \
 186                                 % (repr(self.alphabet), repr(other.alphabet)))
 187             #They should be the same sequence type (or one of them is generic)
 188             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
 189             return self.__class__(str(other) + str(self), a)
 190         elif isinstance(other, basestring) :
 191             #other is a plain string - use the current alphabet
 192             return self.__class__(other + str(self), self.alphabet)
 193         else :
 194             raise TypeError
 195
 196     def tostring(self):                            # Seq API requirement
 197         """Returns the full sequence as a python string.
 198
 199         Although not formally deprecated, you are now encouraged to use
 200         str(my_seq) instead of my_seq.tostring()."""
 201         return str(self)
 202
 203     def tomutable(self):   # Needed?  Or use a function?
 204         """Returns the full sequence as a MutableSeq object.
 205
 206         >>> from Bio.Seq import Seq
 207         >>> from Bio.Alphabet import IUPAC
 208         >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
 209         ...              IUPAC.protein)
 210         >>> my_seq
 211         Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
 212         >>> my_seq.tomutable()
 213         MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
 214
 215         Note that the alphabet is preserved.
 216         """
 217         return MutableSeq(str(self), self.alphabet)
 218
 219     def _get_seq_str_and_check_alphabet(self, other_sequence) :
 220         """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
 221
 222         For a string argument, returns the string.
 223
 224         For a Seq or MutableSeq, it checks the alphabet is compatible
 225         (raising an exception if it isn't), and then returns a string.
 226         """
 227         try :
 228             other_alpha = other_sequence.alphabet
 229         except AttributeError :
 230             #Assume other_sequence is a string
 231             return other_sequence
 232
 233         #Other should be a Seq or a MutableSeq
 234         if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
 235             raise TypeError("Incompatable alphabets %s and %s" \
 236                             % (repr(self.alphabet), repr(other_alpha)))
 237         #Return as a string
 238         return str(other_sequence)
 239
 240     def count(self, sub, start=0, end=sys.maxint):
 241         """Non-overlapping count method, like that of a python string.
 242
 243         This behaves like the python string method of the same name,
 244         which does a non-overlapping count!
 245
 246         Returns an integer, the number of occurrences of substring
 247         argument sub in the (sub)sequence given by [start:end].
 248         Optional arguments start and end are interpreted as in slice
 249         notation.
 250
 251         Arguments:
 252          - sub - a string or another Seq object to look for
 253          - start - optional integer, slice start
 254          - end - optional integer, slice end
 255
 256         e.g.
 257
 258         >>> from Bio.Seq import Seq
 259         >>> my_seq = Seq("AAAATGA")
 260         >>> print my_seq.count("A")
 261         5
 262         >>> print my_seq.count("ATG")
 263         1
 264         >>> print my_seq.count(Seq("AT"))
 265         1
 266         >>> print my_seq.count("AT", 2, -1)
 267         1
 268
 269         HOWEVER, please note because python strings and Seq objects (and
 270         MutableSeq objects) do a non-overlapping search, this may not give
 271         the answer you expect:
 272
 273         >>> "AAAA".count("AA")
 274         2
 275         >>> print Seq("AAAA").count("AA")
 276         2
 277
 278         A non-overlapping search would give the answer as three!
 279         """
 280         #If it has one, check the alphabet:
 281         sub_str = self._get_seq_str_and_check_alphabet(sub)
 282         return str(self).count(sub_str, start, end)
 283
 284     def find(self, sub, start=0, end=sys.maxint):
 285         """Find method, like that of a python string.
 286
 287         This behaves like the python string method of the same name.
 288
 289         Returns an integer, the index of the first occurrence of substring
 290         argument sub in the (sub)sequence given by [start:end].
 291
 292         Arguments:
 293          - sub - a string or another Seq object to look for
 294          - start - optional integer, slice start
 295          - end - optional integer, slice end
 296
 297         Returns -1 if the subsequence is NOT found.
 298
 299         e.g. Locating the first typical start codon, AUG, in an RNA sequence:
 300
 301         >>> from Bio.Seq import Seq
 302         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
 303         >>> my_rna.find("AUG")
 304         3
 305         """
 306         #If it has one, check the alphabet:
 307         sub_str = self._get_seq_str_and_check_alphabet(sub)
 308         return str(self).find(sub_str, start, end)
 309
 310     def rfind(self, sub, start=0, end=sys.maxint):
 311         """Find from right method, like that of a python string.
 312
 313         This behaves like the python string method of the same name.
 314
 315         Returns an integer, the index of the last (right most) occurrence of
 316         substring argument sub in the (sub)sequence given by [start:end].
 317
 318         Arguments:
 319          - sub - a string or another Seq object to look for
 320          - start - optional integer, slice start
 321          - end - optional integer, slice end
 322
 323         Returns -1 if the subsequence is NOT found.
 324
 325         e.g. Locating the last typical start codon, AUG, in an RNA sequence:
 326
 327         >>> from Bio.Seq import Seq
 328         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
 329         >>> my_rna.rfind("AUG")
 330         15
 331         """
 332         #If it has one, check the alphabet:
 333         sub_str = self._get_seq_str_and_check_alphabet(sub)
 334         return str(self).rfind(sub_str, start, end)
 335
 336     def startswith(self, prefix, start=0, end=sys.maxint) :
 337         """Does the Seq start with the given prefix?  Returns True/False.
 338
 339         This behaves like the python string method of the same name.
 340
 341         Return True if the sequence starts with the specified prefix
 342         (a string or another Seq object), False otherwise.
 343         With optional start, test sequence beginning at that position.
 344         With optional end, stop comparing sequence at that position.
 345         prefix can also be a tuple of strings to try.  e.g.
 346
 347         >>> from Bio.Seq import Seq
 348         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
 349         >>> my_rna.startswith("GUC")
 350         True
 351         >>> my_rna.startswith("AUG")
 352         False
 353         >>> my_rna.startswith("AUG", 3)
 354         True
 355         >>> my_rna.startswith(("UCC","UCA","UCG"),1)
 356         True
 357         """
 358         #If it has one, check the alphabet:
 359         if isinstance(prefix, tuple) :
 360             #TODO - Once we drop support for Python 2.4, instead of this
 361             #loop offload to the string method (requires Python 2.5+).
 362             #Check all the alphabets first...
 363             prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
 364                               for p in prefix]
 365             for prefix_str in prefix_strings :
 366                 if str(self).startswith(prefix_str, start, end) :
 367                     return True
 368             return False
 369         else :
 370             prefix_str = self._get_seq_str_and_check_alphabet(prefix)
 371             return str(self).startswith(prefix_str, start, end)
 372
 373     def endswith(self, suffix, start=0, end=sys.maxint) :
 374         """Does the Seq end with the given suffix?  Returns True/False.
 375
 376         This behaves like the python string method of the same name.
 377
 378         Return True if the sequence ends with the specified suffix
 379         (a string or another Seq object), False otherwise.
 380         With optional start, test sequence beginning at that position.
 381         With optional end, stop comparing sequence at that position.
 382         suffix can also be a tuple of strings to try.  e.g.
 383
 384         >>> from Bio.Seq import Seq
 385         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
 386         >>> my_rna.endswith("UUG")
 387         True
 388         >>> my_rna.endswith("AUG")
 389         False
 390         >>> my_rna.endswith("AUG", 0, 18)
 391         True
 392         >>> my_rna.endswith(("UCC","UCA","UUG"))
 393         True
 394         """
 395         #If it has one, check the alphabet:
 396         if isinstance(suffix, tuple) :
 397             #TODO - Once we drop support for Python 2.4, instead of this
 398             #loop offload to the string method (requires Python 2.5+).
 399             #Check all the alphabets first...
 400             suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
 401                               for p in suffix]
 402             for suffix_str in suffix_strings :
 403                 if str(self).endswith(suffix_str, start, end) :
 404                     return True
 405             return False
 406         else :
 407             suffix_str = self._get_seq_str_and_check_alphabet(suffix)
 408             return str(self).endswith(suffix_str, start, end)
 409
 410
 411     def split(self, sep=None, maxsplit=-1) :
 412         """Split method, like that of a python string.
 413
 414         This behaves like the python string method of the same name.
 415
 416         Return a list of the 'words' in the string (as Seq objects),
 417         using sep as the delimiter string.  If maxsplit is given, at
 418         most maxsplit splits are done.  If maxsplit is ommited, all
 419         splits are made.
 420
 421         Following the python string method, sep will by default be any
 422         white space (tabs, spaces, newlines) but this is unlikely to
 423         apply to biological sequences.
 424
 425         e.g.
 426
 427         >>> from Bio.Seq import Seq
 428         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
 429         >>> my_aa = my_rna.translate()
 430         >>> my_aa
 431         Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
 432         >>> my_aa.split("*")
 433         [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
 434         >>> my_aa.split("*",1)
 435         [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
 436
 437         See also the rsplit method:
 438
 439         >>> my_aa.rsplit("*",1)
 440         [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
 441         """
 442         #If it has one, check the alphabet:
 443         sep_str = self._get_seq_str_and_check_alphabet(sep)
 444         #TODO - If the sep is the defined stop symbol, or gap char,
 445         #should we adjust the alphabet?
 446         return [Seq(part, self.alphabet) \
 447                 for part in str(self).split(sep_str, maxsplit)]
 448
 449     def rsplit(self, sep=None, maxsplit=-1) :
 450         """Right split method, like that of a python string.
 451
 452         This behaves like the python string method of the same name.
 453
 454         Return a list of the 'words' in the string (as Seq objects),
 455         using sep as the delimiter string.  If maxsplit is given, at
 456         most maxsplit splits are done COUNTING FROM THE RIGHT.
 457         If maxsplit is ommited, all splits are made.
 458
 459         Following the python string method, sep will by default be any
 460         white space (tabs, spaces, newlines) but this is unlikely to
 461         apply to biological sequences.
 462
 463         e.g. print my_seq.rsplit("*",1)
 464
 465         See also the split method.
 466         """
 467         #If it has one, check the alphabet:
 468         sep_str = self._get_seq_str_and_check_alphabet(sep)
 469         try :
 470             return [Seq(part, self.alphabet) \
 471                     for part in str(self).rsplit(sep_str, maxsplit)]
 472         except AttributeError :
 473             #Python 2.3 doesn't have a string rsplit method, which we can
 474             #word around by reversing the sequence, using (left) split,
 475             #and then reversing the answer. Not very efficient!
 476             words = [Seq(word[::-1], self.alphabet) for word \
 477                      in str(self)[::-1].split(sep_str[::-1], maxsplit)]
 478             words.reverse()
 479             return words
 480
 481     def strip(self, chars=None) :
 482         """Returns a new Seq object with leading and trailing ends stripped.
 483
 484         This behaves like the python string method of the same name.
 485
 486         Optional argument chars defines which characters to remove.  If
 487         ommitted or None (default) then as for the python string method,
 488         this defaults to removing any white space.
 489
 490         e.g. print my_seq.strip("-")
 491
 492         See also the lstrip and rstrip methods.
 493         """
 494         #If it has one, check the alphabet:
 495         strip_str = self._get_seq_str_and_check_alphabet(chars)
 496         return Seq(str(self).strip(strip_str), self.alphabet)
 497
 498     def lstrip(self, chars=None) :
 499         """Returns a new Seq object with leading (left) end stripped.
 500
 501         This behaves like the python string method of the same name.
 502
 503         Optional argument chars defines which characters to remove.  If
 504         ommitted or None (default) then as for the python string method,
 505         this defaults to removing any white space.
 506
 507         e.g. print my_seq.lstrip("-")
 508
 509         See also the strip and rstrip methods.
 510         """
 511         #If it has one, check the alphabet:
 512         strip_str = self._get_seq_str_and_check_alphabet(chars)
 513         return Seq(str(self).lstrip(strip_str), self.alphabet)
 514
 515     def rstrip(self, chars=None) :
 516         """Returns a new Seq object with trailing (right) end stripped.
 517
 518         This behaves like the python string method of the same name.
 519
 520         Optional argument chars defines which characters to remove.  If
 521         ommitted or None (default) then as for the python string method,
 522         this defaults to removing any white space.
 523
 524         e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
 525
 526         >>> from Bio.Alphabet import IUPAC
 527         >>> from Bio.Seq import Seq
 528         >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
 529         >>> my_seq
 530         Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
 531         >>> my_seq.rstrip("A")
 532         Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
 533
 534         See also the strip and lstrip methods.
 535         """
 536         #If it has one, check the alphabet:
 537         strip_str = self._get_seq_str_and_check_alphabet(chars)
 538         return Seq(str(self).rstrip(strip_str), self.alphabet)
 539
 540     def complement(self):
 541         """Returns the complement sequence. New Seq object.
 542
 543         >>> from Bio.Seq import Seq
 544         >>> from Bio.Alphabet import IUPAC
 545         >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
 546         >>> my_dna
 547         Seq('CCCCCGATAG', IUPACUnambiguousDNA())
 548         >>> my_dna.complement()
 549         Seq('GGGGGCTATC', IUPACUnambiguousDNA())
 550
 551         You can of course used mixed case sequences,
 552
 553         >>> from Bio.Seq import Seq
 554         >>> from Bio.Alphabet import generic_dna
 555         >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
 556         >>> my_dna
 557         Seq('CCCCCgatA-GD', DNAAlphabet())
 558         >>> my_dna.complement()
 559         Seq('GGGGGctaT-CH', DNAAlphabet())
 560
 561         Note in the above example, ambiguous character D denotes
 562         G, A or T so its complement is H (for C, T or A).
 563
 564         Trying to complement a protein sequence raises an exception.
 565
 566         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
 567         >>> my_protein.complement()
 568         Traceback (most recent call last):
 569            ...
 570         ValueError: Proteins do not have complements!
 571         """
 572         base = Alphabet._get_base_alphabet(self.alphabet)
 573         if isinstance(base, Alphabet.ProteinAlphabet) :
 574             raise ValueError("Proteins do not have complements!")
 575         if isinstance(base, Alphabet.DNAAlphabet) :
 576             ttable = _dna_complement_table
 577         elif isinstance(base, Alphabet.RNAAlphabet) :
 578             ttable = _rna_complement_table
 579         elif ('U' in self._data or 'u' in self._data) \
 580         and ('T' in self._data or 't' in self._data):
 581             #TODO - Handle this cleanly?
 582             raise ValueError("Mixed RNA/DNA found")
 583         elif 'U' in self._data or 'u' in self._data:
 584             ttable = _rna_complement_table
 585         else:
 586             ttable = _dna_complement_table
 587         #Much faster on really long sequences than the previous loop based one.
 588         #thx to Michael Palmer, University of Waterloo
 589         return Seq(str(self).translate(ttable), self.alphabet)
 590
 591     def reverse_complement(self):
 592         """Returns the reverse complement sequence. New Seq object.
 593
 594         >>> from Bio.Seq import Seq
 595         >>> from Bio.Alphabet import IUPAC
 596         >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
 597         >>> my_dna
 598         Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
 599         >>> my_dna.reverse_complement()
 600         Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
 601
 602         Note in the above example, since R = G or A, its complement
 603         is Y (which denotes  C or T).
 604
 605         You can of course used mixed case sequences,
 606
 607         >>> from Bio.Seq import Seq
 608         >>> from Bio.Alphabet import generic_dna
 609         >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
 610         >>> my_dna
 611         Seq('CCCCCgatA-G', DNAAlphabet())
 612         >>> my_dna.reverse_complement()
 613         Seq('C-TatcGGGGG', DNAAlphabet())
 614
 615         Trying to complement a protein sequence raises an exception:
 616
 617         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
 618         >>> my_protein.reverse_complement()
 619         Traceback (most recent call last):
 620            ...
 621         ValueError: Proteins do not have complements!
 622         """
 623         #Use -1 stride/step to reverse the complement
 624         return self.complement()[::-1]
 625
 626     def transcribe(self):
 627         """Returns the RNA sequence from a DNA sequence. New Seq object.
 628
 629         >>> from Bio.Seq import Seq
 630         >>> from Bio.Alphabet import IUPAC
 631         >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
 632         ...                  IUPAC.unambiguous_dna)
 633         >>> coding_dna
 634         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
 635         >>> coding_dna.transcribe()
 636         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
 637
 638         Trying to transcribe a protein or RNA sequence raises an exception:
 639
 640         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
 641         >>> my_protein.transcribe()
 642         Traceback (most recent call last):
 643            ...
 644         ValueError: Proteins cannot be transcribed!
 645         """
 646         base = Alphabet._get_base_alphabet(self.alphabet)
 647         if isinstance(base, Alphabet.ProteinAlphabet) :
 648             raise ValueError("Proteins cannot be transcribed!")
 649         if isinstance(base, Alphabet.RNAAlphabet) :
 650             raise ValueError("RNA cannot be transcribed!")
 651
 652         if self.alphabet==IUPAC.unambiguous_dna:
 653             alphabet = IUPAC.unambiguous_rna
 654         elif self.alphabet==IUPAC.ambiguous_dna:
 655             alphabet = IUPAC.ambiguous_rna
 656         else:
 657             alphabet = Alphabet.generic_rna
 658         return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
 659
 660     def back_transcribe(self):
 661         """Returns the DNA sequence from an RNA sequence. New Seq object.
 662
 663         >>> from Bio.Seq import Seq
 664         >>> from Bio.Alphabet import IUPAC
 665         >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
 666         ...                     IUPAC.unambiguous_rna)
 667         >>> messenger_rna
 668         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
 669         >>> messenger_rna.back_transcribe()
 670         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
 671
 672         Trying to back-transcribe a protein or DNA sequence raises an
 673         exception:
 674
 675         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
 676         >>> my_protein.back_transcribe()
 677         Traceback (most recent call last):
 678            ...
 679         ValueError: Proteins cannot be back transcribed!
 680         """
 681         base = Alphabet._get_base_alphabet(self.alphabet)
 682         if isinstance(base, Alphabet.ProteinAlphabet) :
 683             raise ValueError("Proteins cannot be back transcribed!")
 684         if isinstance(base, Alphabet.DNAAlphabet) :
 685             raise ValueError("DNA cannot be back transcribed!")
 686
 687         if self.alphabet==IUPAC.unambiguous_rna:
 688             alphabet = IUPAC.unambiguous_dna
 689         elif self.alphabet==IUPAC.ambiguous_rna:
 690             alphabet = IUPAC.ambiguous_dna
 691         else:
 692             alphabet = Alphabet.generic_dna
 693         return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
 694
 695     def translate(self, table="Standard", stop_symbol="*", to_stop=False):
 696         """Turns a nucleotide sequence into a protein sequence. New Seq object.
 697
 698         This method will translate DNA or RNA sequences, and those with a
 699         nucleotide or generic alphabet.  Trying to translate a protein
 700         sequence raises an exception.
 701
 702         Arguments:
 703          - table - Which codon table to use?  This can be either a name
 704                    (string) or an NCBI identifier (integer).  This defaults
 705                    to the "Standard" table.
 706          - stop_symbol - Single character string, what to use for terminators.
 707                          This defaults to the asterisk, "*".
 708          - to_stop - Boolean, defaults to False meaning do a full translation
 709                      continuing on past any stop codons (translated as the
 710                      specified stop_symbol).  If True, translation is
 711                      terminated at the first in frame stop codon (and the
 712                      stop_symbol is not appended to the returned protein
 713                      sequence).
 714
 715         e.g. Using the standard table:
 716
 717         >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
 718         >>> coding_dna.translate()
 719         Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
 720         >>> coding_dna.translate(stop_symbol="@")
 721         Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
 722         >>> coding_dna.translate(to_stop=True)
 723         Seq('VAIVMGR', ExtendedIUPACProtein())
 724
 725         Now using NCBI table 2, where TGA is not a stop codon:
 726
 727         >>> coding_dna.translate(table=2)
 728         Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
 729         >>> coding_dna.translate(table=2, to_stop=True)
 730         Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
 731
 732         If the sequence has no in-frame stop codon, then the to_stop argument
 733         has no effect:
 734
 735         >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
 736         >>> coding_dna2.translate()
 737         Seq('LAIVMGR', ExtendedIUPACProtein())
 738         >>> coding_dna2.translate(to_stop=True)
 739         Seq('LAIVMGR', ExtendedIUPACProtein())
 740
 741         NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
 742         or a stop codon.  These are translated as "X".  Any invalid codon
 743         (e.g. "TA?" or "T-A") will throw a TranslationError.
 744
 745         NOTE - Does NOT support gapped sequences.
 746
 747         NOTE - This does NOT behave like the python string's translate
 748         method.  For that use str(my_seq).translate(...) instead.
 749         """
 750         try:
 751             table_id = int(table)
 752         except ValueError:
 753             table_id = None
 754         if isinstance(table, str) and len(table)==256 :
 755             raise ValueError("The Seq object translate method DOES NOT take " \
 756                              + "a 256 character string mapping table like " \
 757                              + "the python string object's translate method. " \
 758                              + "Use str(my_seq).translate(...) instead.")
 759         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
 760                       Alphabet.ProteinAlphabet) :
 761             raise ValueError("Proteins cannot be translated!")
 762         if self.alphabet==IUPAC.unambiguous_dna:
 763             #Will use standard IUPAC protein alphabet, no need for X
 764             if table_id is None:
 765                 codon_table = CodonTable.unambiguous_dna_by_name[table]
 766             else:
 767                 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
 768         #elif self.alphabet==IUPAC.ambiguous_dna:
 769         #    if table_id is None:
 770         #        codon_table = CodonTable.ambiguous_dna_by_name[table]
 771         #    else:
 772         #        codon_table = CodonTable.ambiguous_dna_by_id[table_id]
 773         elif self.alphabet==IUPAC.unambiguous_rna:
 774             #Will use standard IUPAC protein alphabet, no need for X
 775             if table_id is None:
 776                 codon_table = CodonTable.unambiguous_rna_by_name[table]
 777             else:
 778                 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
 779         #elif self.alphabet==IUPAC.ambiguous_rna:
 780         #    if table_id is None:
 781         #        codon_table = CodonTable.ambiguous_rna_by_name[table]
 782         #    else:
 783         #        codon_table = CodonTable.ambiguous_rna_by_id[table_id]
 784         else:
 785             #This will use the extend IUPAC protein alphabet with X etc.
 786             #The same table can be used for RNA or DNA (we use this for
 787             #translating strings).
 788             if table_id is None:
 789                 codon_table = CodonTable.ambiguous_generic_by_name[table]
 790             else:
 791                 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
 792         protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
 793         if stop_symbol in protein :
 794             alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
 795                                              stop_symbol = stop_symbol)
 796         else :
 797             alphabet = codon_table.protein_alphabet
 798         return Seq(protein, alphabet)
 799
 800 class UnknownSeq(Seq):
 801     """A read-only sequence object of known length but unknown contents.
 802
 803     If you have an unknown sequence, you can represent this with a normal
 804     Seq object, for example:
 805
 806     >>> my_seq = Seq("N"*5)
 807     >>> my_seq
 808     Seq('NNNNN', Alphabet())
 809     >>> len(my_seq)
 810     5
 811     >>> print my_seq
 812     NNNNN
 813
 814     However, this is rather wasteful of memory (especially for large
 815     sequences), which is where this class is most usefull:
 816
 817     >>> unk_five = UnknownSeq(5)
 818     >>> unk_five
 819     UnknownSeq(5, alphabet = Alphabet(), character = '?')
 820     >>> len(unk_five)
 821     5
 822     >>> print(unk_five)
 823     ?????
 824
 825     You can add unknown sequence together, provided their alphabets and
 826     characters are compatible, and get another memory saving UnknownSeq:
 827
 828     >>> unk_four = UnknownSeq(4)
 829     >>> unk_four
 830     UnknownSeq(4, alphabet = Alphabet(), character = '?')
 831     >>> unk_four + unk_five
 832     UnknownSeq(9, alphabet = Alphabet(), character = '?')
 833
 834     If the alphabet or characters don't match up, the addition gives an
 835     ordinary Seq object:
 836
 837     >>> unk_nnnn = UnknownSeq(4, character = "N")
 838     >>> unk_nnnn
 839     UnknownSeq(4, alphabet = Alphabet(), character = 'N')
 840     >>> unk_nnnn + unk_four
 841     Seq('NNNN????', Alphabet())
 842
 843     Combining with a real Seq gives a new Seq object:
 844
 845     >>> known_seq = Seq("ACGT")
 846     >>> unk_four + known_seq
 847     Seq('????ACGT', Alphabet())
 848     >>> known_seq + unk_four
 849     Seq('ACGT????', Alphabet())
 850     """
 851     def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) :
 852         """Create a new UnknownSeq object.
 853
 854         If character is ommited, it is determed from the alphabet, "N" for
 855         nucleotides, "X" for proteins, and "?" otherwise.
 856         """
 857         self._length = int(length)
 858         if self._length < 0 :
 859             #TODO - Block zero length UnknownSeq?  You can just use a Seq!
 860             raise ValueError("Length must not be negative.")
 861         self.alphabet = alphabet
 862         if character :
 863             if len(character) != 1 :
 864                 raise ValueError("character argument should be a single letter string.")
 865             self._character = character
 866         else :
 867             base = Alphabet._get_base_alphabet(alphabet)
 868             #TODO? Check the case of the letters in the alphabet?
 869             #We may have to use "n" instead of "N" etc.
 870             if isinstance(base, Alphabet.NucleotideAlphabet) :
 871                 self._character = "N"
 872             elif isinstance(base, Alphabet.ProteinAlphabet) :
 873                 self._character = "X"
 874             else :
 875                 self._character = "?"
 876
 877     def __len__(self) :
 878         """Returns the stated length of the unknown sequence."""
 879         return self._length
 880
 881     def __str__(self) :
 882         """Returns the unknown sequence as full string of the given length."""
 883         return self._character * self._length
 884
 885     def __repr__(self):
 886         return "UnknownSeq(%i, alphabet = %s, character = %s)" \
 887                % (self._length, repr(self.alphabet), repr(self._character))
 888
 889     def __add__(self, other) :
 890         if isinstance(other, UnknownSeq) \
 891         and other._character == self._character :
 892             #TODO - Check the alphabets match
 893             return UnknownSeq(len(self)+len(other),
 894                               self.alphabet, self._character)
 895         #Offload to the base class...
 896         return Seq(str(self), self.alphabet) + other
 897
 898     def __radd__(self, other) :
 899         if isinstance(other, UnknownSeq) \
 900         and other._character == self._character :
 901             #TODO - Check the alphabets match
 902             return UnknownSeq(len(self)+len(other),
 903                               self.alphabet, self._character)
 904         #Offload to the base class...
 905         return other + Seq(str(self), self.alphabet)
 906
 907     def __getitem__(self, index):
 908         if isinstance(index, int) :
 909             #TODO - Check the bounds without wasting memory
 910             return str(self)[index]
 911         else :
 912             #TODO - Work out the length without wasting memory
 913             return UnknownSeq(len(("#"*self._length)[index]),
 914                               self.alphabet, self._character)
 915
 916     def count(self, sub, start=0, end=sys.maxint):
 917         """Non-overlapping count method, like that of a python string.
 918
 919         This behaves like the python string (and Seq object) method of the
 920         same name, which does a non-overlapping count!
 921
 922         Returns an integer, the number of occurrences of substring
 923         argument sub in the (sub)sequence given by [start:end].
 924         Optional arguments start and end are interpreted as in slice
 925         notation.
 926
 927         Arguments:
 928          - sub - a string or another Seq object to look for
 929          - start - optional integer, slice start
 930          - end - optional integer, slice end
 931
 932         >>> "NNNN".count("N")
 933         4
 934         >>> Seq("NNNN").count("N")
 935         4
 936         >>> UnknownSeq(4, character="N").count("N")
 937         4
 938         >>> UnknownSeq(4, character="N").count("A")
 939         0
 940         >>> UnknownSeq(4, character="N").count("AA")
 941         0
 942
 943         HOWEVER, please note because that python strings and Seq objects (and
 944         MutableSeq objects) do a non-overlapping search, this may not give
 945         the answer you expect:
 946
 947         >>> UnknownSeq(4, character="N").count("NN")
 948         2
 949         >>> UnknownSeq(4, character="N").count("NNN")
 950         1
 951         """
 952         sub_str = self._get_seq_str_and_check_alphabet(sub)
 953         if len(sub_str) == 1 :
 954             if str(sub_str) == self._character :
 955                 if start==0 and end >= self._length :
 956                     return self._length
 957                 else :
 958                     #This could be done more cleverly...
 959                     return str(self).count(sub_str, start, end)
 960             else :
 961                 return 0
 962         else :
 963             if set(sub_str) == set(self._character) :
 964                 if start==0 and end >= self._length :
 965                     return self._length // len(sub_str)
 966                 else :
 967                     #This could be done more cleverly...
 968                     return str(self).count(sub_str, start, end)
 969             else :
 970                 return 0
 971
 972     def complement(self) :
 973         """The complement of an unknown nucleotide equals itself.
 974
 975         >>> my_nuc = UnknownSeq(8)
 976         >>> my_nuc
 977         UnknownSeq(8, alphabet = Alphabet(), character = '?')
 978         >>> print my_nuc
 979         ????????
 980         >>> my_nuc.complement()
 981         UnknownSeq(8, alphabet = Alphabet(), character = '?')
 982         >>> print my_nuc.complement()
 983         ????????
 984         """
 985         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
 986                       Alphabet.ProteinAlphabet) :
 987             raise ValueError("Proteins do not have complements!")
 988         return self
 989
 990     def reverse_complement(self) :
 991         """The reverse complement of an unknown nucleotide equals itself.
 992
 993         >>> my_nuc = UnknownSeq(10)
 994         >>> my_nuc
 995         UnknownSeq(10, alphabet = Alphabet(), character = '?')
 996         >>> print my_nuc
 997         ??????????
 998         >>> my_nuc.reverse_complement()
 999         UnknownSeq(10, alphabet = Alphabet(), character = '?')
1000         >>> print my_nuc.reverse_complement()
1001         ??????????
1002         """
1003         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1004                       Alphabet.ProteinAlphabet) :
1005             raise ValueError("Proteins do not have complements!")
1006         return self
1007
1008     def transcribe(self) :
1009         """Returns unknown RNA sequence from an unknown DNA sequence.
1010
1011         >>> my_dna = UnknownSeq(10, character="N")
1012         >>> my_dna
1013         UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1014         >>> print my_dna
1015         NNNNNNNNNN
1016         >>> my_rna = my_dna.transcribe()
1017         >>> my_rna
1018         UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1019         >>> print my_rna
1020         NNNNNNNNNN
1021         """
1022         #Offload the alphabet stuff
1023         s = Seq(self._character, self.alphabet).transcribe()
1024         return UnknownSeq(self._length, s.alphabet, self._character)
1025
1026     def back_transcribe(self) :
1027         """Returns unknown DNA sequence from an unknown RNA sequence.
1028
1029         >>> my_rna = UnknownSeq(20, character="N")
1030         >>> my_rna
1031         UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1032         >>> print my_rna
1033         NNNNNNNNNNNNNNNNNNNN
1034         >>> my_dna = my_rna.back_transcribe()
1035         >>> my_dna
1036         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1037         >>> print my_dna
1038         NNNNNNNNNNNNNNNNNNNN
1039         """
1040         #Offload the alphabet stuff
1041         s = Seq(self._character, self.alphabet).back_transcribe()
1042         return UnknownSeq(self._length, s.alphabet, self._character)
1043
1044     def translate(self, **kwargs) :
1045         """Translate an unknown nucleotide sequence into an unknown protein.
1046
1047         e.g.
1048
1049         >>> my_seq = UnknownSeq(11, character="N")
1050         >>> print my_seq
1051         NNNNNNNNNNN
1052         >>> my_protein = my_seq.translate()
1053         >>> my_protein
1054         UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1055         >>> print my_protein
1056         XXX
1057
1058         In comparison, using a normal Seq object:
1059
1060         >>> my_seq = Seq("NNNNNNNNNNN")
1061         >>> print my_seq
1062         NNNNNNNNNNN
1063         >>> my_protein = my_seq.translate()
1064         >>> my_protein
1065         Seq('XXX', ExtendedIUPACProtein())
1066         >>> print my_protein
1067         XXX
1068
1069         """
1070         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1071                       Alphabet.ProteinAlphabet) :
1072             raise ValueError("Proteins cannot be translated!")
1073         return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1074
1075
1076 class MutableSeq(object):
1077     """An editable sequence object (with an alphabet).
1078
1079     Unlike normal python strings and our basic sequence object (the Seq class)
1080     which are immuatable, the MutableSeq lets you edit the sequence in place.
1081     However, this means you cannot use a MutableSeq object as a dictionary key.
1082
1083     >>> from Bio.Seq import MutableSeq
1084     >>> from Bio.Alphabet import generic_dna
1085     >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1086     >>> my_seq
1087     MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1088     >>> my_seq[5]
1089     'T'
1090     >>> my_seq[5] = "A"
1091     >>> my_seq
1092     MutableSeq('ACTCGACGTCG', DNAAlphabet())
1093     >>> my_seq[5]
1094     'A'
1095     >>> my_seq[5:8] = "NNN"
1096     >>> my_seq
1097     MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1098     >>> len(my_seq)
1099     11
1100
1101     Note that the MutableSeq object does not support as many string-like
1102     or biological methods as the Seq object.
1103     """
1104     def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1105         if type(data) == type(""):
1106             self.data = array.array("c", data)
1107         else:
1108             self.data = data   # assumes the input is an array
1109         self.alphabet = alphabet
1110
1111     def __repr__(self):
1112         """Returns a (truncated) representation of the sequence for debugging."""
1113         if len(self) > 60 :
1114             #Shows the last three letters as it is often useful to see if there
1115             #is a stop codon at the end of a sequence.
1116             #Note total length is 54+3+3=60
1117             return "%s('%s...%s', %s)" % (self.__class__.__name__,
1118                                    str(self[:54]), str(self[-3:]),
1119                                    repr(self.alphabet))
1120         else :
1121             return "%s('%s', %s)" % (self.__class__.__name__,
1122                                    str(self),
1123                                    repr(self.alphabet))
1124
1125     def __str__(self):
1126         """Returns the full sequence as a python string.
1127
1128         Note that Biopython 1.44 and earlier would give a truncated
1129         version of repr(my_seq) for str(my_seq).  If you are writing code
1130         which needs to be backwards compatible with old Biopython, you
1131         should continue to use my_seq.tostring() rather than str(my_seq).
1132         """
1133         #See test_GAQueens.py for an historic usage of a non-string alphabet!
1134         return "".join(self.data)
1135
1136     def __cmp__(self, other):
1137         """Compare the sequence for to another sequence or a string.
1138
1139         If compared to another sequence the alphabets must be compatible.
1140         Comparing DNA to RNA, or Nucleotide to Protein will raise an
1141         exception.
1142
1143         Otherwise only the sequence itself is compared, not the precise
1144         alphabet.
1145
1146         This method indirectly supports ==, < , etc."""
1147         if hasattr(other, "alphabet") :
1148             #other should be a Seq or a MutableSeq
1149             if not Alphabet._check_type_compatible([self.alphabet,
1150                                                     other.alphabet]) :
1151                 raise TypeError("Incompatable alphabets %s and %s" \
1152                                 % (repr(self.alphabet), repr(other.alphabet)))
1153             #They should be the same sequence type (or one of them is generic)
1154             if isinstance(other, MutableSeq):
1155                 #See test_GAQueens.py for an historic usage of a non-string
1156                 #alphabet!  Comparing the arrays supports this.
1157                 return cmp(self.data, other.data)
1158             else :
1159                 return cmp(str(self), str(other))
1160         elif isinstance(other, basestring) :
1161             return cmp(str(self), other)
1162         else :
1163             raise TypeError
1164
1165     def __len__(self): return len(self.data)
1166
1167     def __getitem__(self, index) :
1168         #Note since Python 2.0, __getslice__ is deprecated
1169         #and __getitem__ is used instead.
1170         #See http://docs.python.org/ref/sequence-methods.html
1171         if isinstance(index, int) :
1172             #Return a single letter as a string
1173             return self.data[index]
1174         else :
1175             #Return the (sub)sequence as another Seq object
1176             return MutableSeq(self.data[index], self.alphabet)
1177
1178     def __setitem__(self, index, value):
1179         #Note since Python 2.0, __setslice__ is deprecated
1180         #and __setitem__ is used instead.
1181         #See http://docs.python.org/ref/sequence-methods.html
1182         if isinstance(index, int) :
1183             #Replacing a single letter with a new string
1184             self.data[index] = value
1185         else :
1186             #Replacing a sub-sequence
1187             if isinstance(value, MutableSeq):
1188                 self.data[index] = value.data
1189             elif isinstance(value, type(self.data)):
1190                 self.data[index] = value
1191             else:
1192                 self.data[index] = array.array("c", str(value))
1193
1194     def __delitem__(self, index):
1195         #Note since Python 2.0, __delslice__ is deprecated
1196         #and __delitem__ is used instead.
1197         #See http://docs.python.org/ref/sequence-methods.html
1198
1199         #Could be deleting a single letter, or a slice
1200         del self.data[index]
1201
1202     def __add__(self, other):
1203         """Add another sequence or string to this sequence.
1204
1205         Returns a new MutableSeq object."""
1206         if hasattr(other, "alphabet") :
1207             #other should be a Seq or a MutableSeq
1208             if not Alphabet._check_type_compatible([self.alphabet,
1209                                                     other.alphabet]) :
1210                 raise TypeError("Incompatable alphabets %s and %s" \
1211                                 % (repr(self.alphabet), repr(other.alphabet)))
1212             #They should be the same sequence type (or one of them is generic)
1213             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1214             if isinstance(other, MutableSeq):
1215                 #See test_GAQueens.py for an historic usage of a non-string
1216                 #alphabet!  Adding the arrays should support this.
1217                 return self.__class__(self.data + other.data, a)
1218             else :
1219                 return self.__class__(str(self) + str(other), a)
1220         elif isinstance(other, basestring) :
1221             #other is a plain string - use the current alphabet
1222             return self.__class__(str(self) + str(other), self.alphabet)
1223         else :
1224             raise TypeError
1225
1226     def __radd__(self, other):
1227         if hasattr(other, "alphabet") :
1228             #other should be a Seq or a MutableSeq
1229             if not Alphabet._check_type_compatible([self.alphabet,
1230                                                     other.alphabet]) :
1231                 raise TypeError("Incompatable alphabets %s and %s" \
1232                                 % (repr(self.alphabet), repr(other.alphabet)))
1233             #They should be the same sequence type (or one of them is generic)
1234             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1235             if isinstance(other, MutableSeq):
1236                 #See test_GAQueens.py for an historic usage of a non-string
1237                 #alphabet!  Adding the arrays should support this.
1238                 return self.__class__(other.data + self.data, a)
1239             else :
1240                 return self.__class__(str(other) + str(self), a)
1241         elif isinstance(other, basestring) :
1242             #other is a plain string - use the current alphabet
1243             return self.__class__(str(other) + str(self), self.alphabet)
1244         else :
1245             raise TypeError
1246
1247     def append(self, c):
1248         self.data.append(c)
1249
1250     def insert(self, i, c):
1251         self.data.insert(i, c)
1252
1253     def pop(self, i = (-1)):
1254         c = self.data[i]
1255         del self.data[i]
1256         return c
1257
1258     def remove(self, item):
1259         for i in range(len(self.data)):
1260             if self.data[i] == item:
1261                 del self.data[i]
1262                 return
1263         raise ValueError("MutableSeq.remove(x): x not in list")
1264
1265     def count(self, sub, start=0, end=sys.maxint):
1266         """Non-overlapping count method, like that of a python string.
1267
1268         This behaves like the python string method of the same name,
1269         which does a non-overlapping count!
1270
1271         Returns an integer, the number of occurrences of substring
1272         argument sub in the (sub)sequence given by [start:end].
1273         Optional arguments start and end are interpreted as in slice
1274         notation.
1275
1276         Arguments:
1277          - sub - a string or another Seq object to look for
1278          - start - optional integer, slice start
1279          - end - optional integer, slice end
1280
1281         e.g.
1282
1283         >>> from Bio.Seq import MutableSeq
1284         >>> my_mseq = MutableSeq("AAAATGA")
1285         >>> print my_mseq.count("A")
1286         5
1287         >>> print my_mseq.count("ATG")
1288         1
1289         >>> print my_mseq.count(Seq("AT"))
1290         1
1291         >>> print my_mseq.count("AT", 2, -1)
1292         1
1293
1294         HOWEVER, please note because that python strings, Seq objects and
1295         MutableSeq objects do a non-overlapping search, this may not give
1296         the answer you expect:
1297
1298         >>> "AAAA".count("AA")
1299         2
1300         >>> print MutableSeq("AAAA").count("AA")
1301         2
1302
1303         A non-overlapping search would give the answer as three!
1304         """
1305         try :
1306             #TODO - Should we check the alphabet?
1307             search = sub.tostring()
1308         except AttributeError :
1309             search = sub
1310
1311         if not isinstance(search, basestring) :
1312             raise TypeError("expected a string, Seq or MutableSeq")
1313
1314         if len(search) == 1 :
1315             #Try and be efficient and work directly from the array.
1316             count = 0
1317             for c in self.data[start:end]:
1318                 if c == search: count += 1
1319             return count
1320         else :
1321             #TODO - Can we do this more efficiently?
1322             return self.tostring().count(search, start, end)
1323
1324     def index(self, item):
1325         for i in range(len(self.data)):
1326             if self.data[i] == item:
1327                 return i
1328         raise ValueError("MutableSeq.index(x): x not in list")
1329
1330     def reverse(self):
1331         """Modify the mutable sequence to reverse itself.
1332
1333         No return value.
1334         """
1335         self.data.reverse()
1336
1337     def complement(self):
1338         """Modify the mutable sequence to take on its complement.
1339
1340         Trying to complement a protein sequence raises an exception.
1341
1342         No return value.
1343         """
1344         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1345                       Alphabet.ProteinAlphabet) :
1346             raise ValueError("Proteins do not have complements!")
1347         if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
1348             d = ambiguous_dna_complement
1349         elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
1350             d = ambiguous_rna_complement
1351         elif 'U' in self.data and 'T' in self.data :
1352             #TODO - Handle this cleanly?
1353             raise ValueError("Mixed RNA/DNA found")
1354         elif 'U' in self.data:
1355             d = ambiguous_rna_complement
1356         else:
1357             d = ambiguous_dna_complement
1358         c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
1359         d.update(c)
1360         self.data = map(lambda c: d[c], self.data)
1361         self.data = array.array('c', self.data)
1362
1363     def reverse_complement(self):
1364         """Modify the mutable sequence to take on its reverse complement.
1365
1366         Trying to reverse complement a protein sequence raises an exception.
1367
1368         No return value.
1369         """
1370         self.complement()
1371         self.data.reverse()
1372
1373     ## Sorting a sequence makes no sense.
1374     # def sort(self, *args): self.data.sort(*args)
1375
1376     def extend(self, other):
1377         if isinstance(other, MutableSeq):
1378             for c in other.data:
1379                 self.data.append(c)
1380         else:
1381             for c in other:
1382                 self.data.append(c)
1383
1384     def tostring(self):
1385         """Returns the full sequence as a python string.
1386
1387         Although not formally deprecated, you are now encouraged to use
1388         str(my_seq) instead of my_seq.tostring().
1389
1390         Because str(my_seq) will give you the full sequence as a python string,
1391         there is often no need to make an explicit conversion.  For example,
1392
1393         print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1394
1395         On Biopython 1.44 or older you would have to have done this:
1396
1397         print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1398         """
1399         return "".join(self.data)
1400
1401     def toseq(self):
1402         """Returns the full sequence as a new immutable Seq object.
1403
1404         >>> from Bio.Seq import Seq
1405         >>> from Bio.Alphabet import IUPAC
1406         >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1407                                  IUPAC.protein)
1408         >>> my_mseq
1409         MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1410         >>> my_mseq.toseq()
1411         Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1412
1413         Note that the alphabet is preserved.
1414         """
1415         return Seq("".join(self.data), self.alphabet)
1416
1417 # The transcribe, backward_transcribe, and translate functions are
1418 # user-friendly versions of the corresponding functions in Bio.Transcribe
1419 # and Bio.Translate. The functions work both on Seq objects, and on strings.
1420
1421 def transcribe(dna):
1422     """Transcribes a DNA sequence into RNA.
1423
1424     If given a string, returns a new string object.
1425
1426     Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1427
1428     Trying to transcribe a protein or RNA sequence raises an exception.
1429
1430     e.g.
1431
1432     >>> transcribe("ACTGN")
1433     'ACUGN'
1434     """
1435     if isinstance(dna, Seq) :
1436         return dna.transcribe()
1437     elif isinstance(dna, MutableSeq):
1438         return dna.toseq().transcribe()
1439     else:
1440         return dna.replace('T','U').replace('t','u')
1441
1442 def back_transcribe(rna):
1443     """Back-transcribes an RNA sequence into DNA.
1444
1445     If given a string, returns a new string object.
1446
1447     Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1448
1449     Trying to transcribe a protein or DNA sequence raises an exception.
1450
1451     e.g.
1452
1453     >>> back_transcribe("ACUGN")
1454     'ACTGN'
1455     """
1456     if isinstance(rna, Seq) :
1457         return rna.back_transcribe()
1458     elif isinstance(rna, MutableSeq):
1459         return rna.toseq().back_transcribe()
1460     else:
1461         return rna.replace('U','T').replace('u','t')
1462
1463 def _translate_str(sequence, table, stop_symbol="*",
1464                    to_stop=False, pos_stop="X") :
1465     """Helper function to translate a nucleotide string (PRIVATE).
1466
1467     Arguments:
1468      - sequence    - a string
1469      - table       - a CodonTable object (NOT a table name or id number)
1470      - stop_symbol - a single character string, what to use for terminators.
1471      - to_stop     - boolean, should translation terminate at the first
1472                      in frame stop codon?  If there is no in-frame stop codon
1473                      then translation continues to the end.
1474      - pos_stop    - a single character string for a possible stop codon
1475                      (e.g. TAN or NNN)
1476
1477     Returns a string.
1478
1479     e.g.
1480
1481     >>> from Bio.Data import CodonTable
1482     >>> table = CodonTable.ambiguous_dna_by_id[1]
1483     >>> _translate_str("AAA", table)
1484     'K'
1485     >>> _translate_str("TAR", table)
1486     '*'
1487     >>> _translate_str("TAN", table)
1488     'X'
1489     >>> _translate_str("TAN", table, pos_stop="@")
1490     '@'
1491     >>> _translate_str("TA?", table)
1492     Traceback (most recent call last):
1493        ...
1494     TranslationError: Codon 'TA?' is invalid
1495     """
1496     sequence = sequence.upper()
1497     amino_acids = []
1498     forward_table = table.forward_table
1499     stop_codons = table.stop_codons
1500     if table.nucleotide_alphabet.letters is not None :
1501         valid_letters = set(table.nucleotide_alphabet.letters.upper())
1502     else :
1503         #Assume the worst case, ambiguous DNA or RNA:
1504         valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1505                             IUPAC.ambiguous_rna.letters.upper())
1506
1507     n = len(sequence)
1508     for i in xrange(0,n-n%3,3) :
1509         codon = sequence[i:i+3]
1510         try :
1511             amino_acids.append(forward_table[codon])
1512         except (KeyError, CodonTable.TranslationError) :
1513             #Todo? Treat "---" as a special case (gapped translation)
1514             if codon in table.stop_codons :
1515                 if to_stop : break
1516                 amino_acids.append(stop_symbol)
1517             elif valid_letters.issuperset(set(codon)) :
1518                 #Possible stop codon (e.g. NNN or TAN)
1519                 amino_acids.append(pos_stop)
1520             else :
1521                 raise CodonTable.TranslationError(\
1522                     "Codon '%s' is invalid" % codon)
1523     return "".join(amino_acids)
1524
1525 def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1526     """Translate a nucleotide sequence into amino acids.
1527
1528     If given a string, returns a new string object. Given a Seq or
1529     MutableSeq, returns a Seq object with a protein alphabet.
1530
1531     Arguments:
1532      - table - Which codon table to use?  This can be either a name
1533                (string) or an NCBI identifier (integer).  Defaults
1534                to the "Standard" table.
1535      - stop_symbol - Single character string, what to use for any
1536                      terminators, defaults to the asterisk, "*".
1537      - to_stop - Boolean, defaults to False meaning do a full
1538                  translation continuing on past any stop codons
1539                  (translated as the specified stop_symbol).  If
1540                  True, translation is terminated at the first in
1541                  frame stop codon (and the stop_symbol is not
1542                  appended to the returned protein sequence).
1543
1544     A simple string example using the default (standard) genetic code:
1545
1546     >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1547     >>> translate(coding_dna)
1548     'VAIVMGR*KGAR*'
1549     >>> translate(coding_dna, stop_symbol="@")
1550     'VAIVMGR@KGAR@'
1551     >>> translate(coding_dna, to_stop=True)
1552     'VAIVMGR'
1553
1554     Now using NCBI table 2, where TGA is not a stop codon:
1555
1556     >>> translate(coding_dna, table=2)
1557     'VAIVMGRWKGAR*'
1558     >>> translate(coding_dna, table=2, to_stop=True)
1559     'VAIVMGRWKGAR'
1560
1561     Note that if the sequence has no in-frame stop codon, then the to_stop
1562     argument has no effect:
1563
1564     >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1565     >>> translate(coding_dna2)
1566     'VAIVMGR'
1567     >>> translate(coding_dna2, to_stop=True)
1568     'VAIVMGR'
1569
1570     NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1571     or a stop codon.  These are translated as "X".  Any invalid codon
1572     (e.g. "TA?" or "T-A") will throw a TranslationError.
1573
1574     NOTE - Does NOT support gapped sequences.
1575
1576     It will however translate either DNA or RNA.
1577     """
1578     if isinstance(sequence, Seq) :
1579         return sequence.translate(table, stop_symbol, to_stop)
1580     elif isinstance(sequence, MutableSeq):
1581         #Return a Seq object
1582         return sequence.toseq().translate(table, stop_symbol, to_stop)
1583     else:
1584         #Assume its a string, return a string
1585         try :
1586             codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1587         except ValueError :
1588             codon_table = CodonTable.ambiguous_generic_by_name[table]
1589         return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1590
1591 def reverse_complement(sequence):
1592     """Returns the reverse complement sequence of a nucleotide string.
1593
1594     If given a string, returns a new string object.
1595     Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
1596
1597     Supports unambiguous and ambiguous nucleotide sequences.
1598
1599     e.g.
1600
1601     >>> reverse_complement("ACTG-NH")
1602     'DN-CAGT'
1603     """
1604     if isinstance(sequence, Seq) :
1605         #Return a Seq
1606         return sequence.reverse_complement()
1607     elif isinstance(sequence, MutableSeq) :
1608         #Return a Seq
1609         #Don't use the MutableSeq reverse_complement method as it is 'in place'.
1610         return sequence.toseq().reverse_complement()
1611
1612     #Assume its a string.
1613     #In order to avoid some code duplication, the old code would turn the string
1614     #into a Seq, use the reverse_complement method, and convert back to a string.
1615     #This worked, but is over five times slower on short sequences!
1616     if ('U' in sequence or 'u' in sequence) \
1617     and ('T' in sequence or 't' in sequence):
1618         raise ValueError("Mixed RNA/DNA found")
1619     elif 'U' in sequence or 'u' in sequence:
1620         ttable = _rna_complement_table
1621     else:
1622         ttable = _dna_complement_table
1623     return sequence.translate(ttable)[::-1]
1624
1625 def _test():
1626     """Run the Bio.Seq module's doctests."""
1627     print "Runing doctests..."
1628     import doctest
1629     doctest.testmod()
1630     print "Done"
1631
1632 if __name__ == "__main__":
1633     _test()