--- /dev/null
+# Copyright 2000-2001 by Andrew Dalke.
+# Revisions copyright 2008 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Standard nucleotide and protein alphabets defined by IUPAC."""
+
+from Bio import Alphabet
+from Bio.Data import IUPACData
+
+##################### Protein
+
+# From the IUPAC definition at:
+# http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21
+
+assert IUPACData.extended_protein_letters == IUPACData.extended_protein_letters.upper()
+class ExtendedIUPACProtein(Alphabet.ProteinAlphabet):
+ """Extended uppercase IUPAC protein single letter alphabet including X etc.
+
+ In addition to the standard 20 single letter protein codes, this includes:
+
+ B = "Asx"; Aspartic acid (R) or Asparagine (N)
+ X = "Xxx"; Unknown or 'other' amino acid
+ Z = "Glx"; Glutamic acid (E) or Glutamine (Q)
+ J = "Xle"; Leucine (L) or Isoleucine (I), used in mass-spec (NMR)
+ U = "Sec"; Selenocysteine
+ O = "Pyl"; Pyrrolysine
+
+ This alphabet is not intended to be used with X for Selenocysteine
+ (an ad-hoc standard prior to the IUPAC adoption of U instead).
+ """
+ letters = IUPACData.extended_protein_letters
+
+extended_protein = ExtendedIUPACProtein()
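+
+# For example, a quick doctest-style check of the letters this exposes:
+# >>> from Bio.Alphabet import IUPAC
+# >>> IUPAC.extended_protein.letters
+# 'ACDEFGHIKLMNPQRSTVWYBXZJUO'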
+
+assert IUPACData.protein_letters == IUPACData.protein_letters.upper()
+class IUPACProtein(ExtendedIUPACProtein):
+ """Uppercase IUPAC protein single letter alphabet of the 20 standard amino acids."""
+ letters = IUPACData.protein_letters
+
+protein = IUPACProtein()
+
+##################### DNA
+
+# The next two are the IUPAC definitions, from:
+# http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
+class IUPACAmbiguousDNA(Alphabet.DNAAlphabet):
+ """Uppercase IUPAC ambiguous DNA."""
+ letters = IUPACData.ambiguous_dna_letters
+
+ambiguous_dna = IUPACAmbiguousDNA()
+
+class IUPACUnambiguousDNA(IUPACAmbiguousDNA):
+ """Uppercase IUPAC unambiguous DNA (letters GATC only)."""
+ letters = IUPACData.unambiguous_dna_letters
+
+unambiguous_dna = IUPACUnambiguousDNA()
+
+
+# Also from the URL, but not part of the standard
+class ExtendedIUPACDNA(Alphabet.DNAAlphabet):
+ """Extended IUPAC DNA alphabet.
+
+ In addition to the standard letter codes GATC, this includes:
+
+ B = 5-bromouridine
+ D = 5,6-dihydrouridine
+ S = thiouridine
+ W = wyosine
+ """
+ letters = IUPACData.extended_dna_letters
+
+extended_dna = ExtendedIUPACDNA()
+
+##################### RNA
+
+class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
+ """Uppercase IUPAC ambiguous RNA."""
+ letters = IUPACData.ambiguous_rna_letters
+
+ambiguous_rna = IUPACAmbiguousRNA()
+
+class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
+ """Uppercase IUPAC unambiguous RNA (letters GAUC only)."""
+ letters = IUPACData.unambiguous_rna_letters
+
+unambiguous_rna = IUPACUnambiguousRNA()
+
+# are there extended forms?
+#class ExtendedIUPACRNA(Alphabet.RNAAlphabet):
+# letters = extended_rna_letters
+# # B == 5-bromouridine
+# # D == 5,6-dihydrouridine
+# # S == thiouridine
+# # W == wyosine
+
+
+# We need to load the property resolution information, but we need to
+# wait until after the systems have been loaded. (There's a nasty loop
+# where, e.g., translation objects need an alphabet, which needs to be
+# associated with translators.)
+
+from Bio.PropertyManager import default_manager
+
+def _bootstrap(manager, klass, property):
+ assert manager is default_manager
+ del default_manager.class_resolver[IUPACProtein]
+ del default_manager.class_resolver[ExtendedIUPACProtein]
+ del default_manager.class_resolver[IUPACAmbiguousDNA]
+ del default_manager.class_resolver[IUPACUnambiguousDNA]
+ del default_manager.class_resolver[ExtendedIUPACDNA]
+ del default_manager.class_resolver[IUPACAmbiguousRNA]
+ del default_manager.class_resolver[IUPACUnambiguousRNA]
+
+ from Bio.Encodings import IUPACEncoding
+
+ return manager.resolve_class(klass, property)
+
+default_manager.class_resolver[IUPACProtein] = _bootstrap
+default_manager.class_resolver[ExtendedIUPACProtein] = _bootstrap
+default_manager.class_resolver[IUPACAmbiguousDNA] = _bootstrap
+default_manager.class_resolver[IUPACUnambiguousDNA] = _bootstrap
+default_manager.class_resolver[ExtendedIUPACDNA] = _bootstrap
+default_manager.class_resolver[IUPACAmbiguousRNA] = _bootstrap
+default_manager.class_resolver[IUPACUnambiguousRNA] = _bootstrap
--- /dev/null
+# Copyright 2004 by Iddo Friedberg.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Reduced alphabets which lump together several amino-acids into one letter.
+
+Reduced (redundant or simplified) alphabets represent protein sequences using
+an alternative alphabet that lumps several amino acids into one letter, based
+on physico-chemical traits. For example, the aliphatics (I, L, V) are usually
+quite interchangeable, so many sequence studies group them into one letter.
+
+Examples of reduced alphabets are available in:
+
+http://viscose.ifg.uni-muenster.de/html/alphabets.html
+
+Bio.utils.reduce_sequence is used to take a protein sequence and reduce it
+using one of the tables here, or a user-defined table.
+"""
+
+from Bio import Alphabet
+
+# The Murphy tables are from here:
+# Murphy L.R., Wallqvist A, Levy RM. (2000) Simplified amino acid alphabets for protein
+# fold recognition and implications for folding. Protein Eng. 13(3):149-152
+
+murphy_15_tab = {"L": "L",
+ "V": "L",
+ "I": "L",
+ "M": "L",
+ "C": "C",
+ "A": "A",
+ "G": "G",
+ "S": "S",
+ "T": "T",
+ "P": "P",
+ "F": "F",
+ "Y": "F",
+ "W": "W",
+ "E": "E",
+ "D": "D",
+ "N": "N",
+ "Q": "Q",
+ "K": "K",
+ "R": "K",
+ "H": "H"}
+
+class Murphy15(Alphabet.ProteinAlphabet):
+ letters = "LCAGSTPFWEDNQKH"
+ size = 15
+murphy_15 = Murphy15()
+
+murphy_10_tab = {"L": "L",
+ "V": "L",
+ "I": "L",
+ "M": "L",
+ "C": "C",
+ "A": "A",
+ "G": "G",
+ "S": "S",
+ "T": "S",
+ "P": "P",
+ "F": "F",
+ "Y": "F",
+ "W": "F",
+ "E": "E",
+ "D": "E",
+ "N": "E",
+ "Q": "E",
+ "K": "K",
+ "R": "K",
+ "H": "H"}
+class Murphy10(Alphabet.ProteinAlphabet):
+ letters = "LCAGSPFEKH"
+ size = 10
+murphy_10 = Murphy10()
+
+murphy_8_tab = {"L": "L",
+ "V": "L",
+ "I": "L",
+ "M": "L",
+ "C": "L",
+ "A": "A",
+ "G": "A",
+ "S": "S",
+ "T": "S",
+ "P": "P",
+ "F": "F",
+ "Y": "F",
+ "W": "F",
+ "E": "E",
+ "D": "E",
+ "N": "E",
+ "Q": "E",
+ "K": "K",
+ "R": "K",
+ "H": "H"}
+
+class Murphy8(Alphabet.ProteinAlphabet):
+ letters = "LASPFEKH"
+ size = 8
+murphy_8 = Murphy8()
+
+murphy_4_tab = {"L": "L",
+ "V": "L",
+ "I": "L",
+ "M": "L",
+ "C": "L",
+ "A": "A",
+ "G": "A",
+ "S": "A",
+ "T": "A",
+ "P": "A",
+ "F": "F",
+ "Y": "F",
+ "W": "F",
+ "E": "E",
+ "D": "E",
+ "N": "E",
+ "Q": "E",
+ "K": "E",
+ "R": "E",
+ "H": "E"}
+
+class Murphy4(Alphabet.ProteinAlphabet):
+ letters = "LAFE"
+ size = 4
+murphy_4 = Murphy4()
+
+hp_model_tab = {"A": "P", # Hydrophilic
+ "G": "P",
+ "T": "P",
+ "S": "P",
+ "N": "P",
+ "Q": "P",
+ "D": "P",
+ "E": "P",
+ "H": "P",
+ "R": "P",
+ "K": "P",
+ "P": "P",
+ "C": "H", # Hydrophobic
+ "M": "H",
+ "F": "H",
+ "I": "H",
+ "L": "H",
+ "V": "H",
+ "W": "H",
+ "Y": "H"}
+
+class HPModel(Alphabet.ProteinAlphabet):
+ letters = "HP"
+ size = 2
+hp_model = HPModel()
+
+pc_5_table = {"I": "A", # Aliphatic
+ "V": "A",
+ "L": "A",
+ "F": "R", # Aromatic
+ "Y": "R",
+ "W": "R",
+ "H": "R",
+ "K": "C", # Charged
+ "R": "C",
+ "D": "C",
+ "E": "C",
+ "G": "T", # Tiny
+ "A": "T",
+ "C": "T",
+ "S": "T",
+ "T": "D", # Diverse
+ "M": "D",
+ "Q": "D",
+ "N": "D",
+ "P": "D"}
+
+class PC5(Alphabet.ProteinAlphabet):
+ letters = "ARCTD"
+ size = 5
+pc_5 = PC5()
--- /dev/null
+# Copyright 2000-2002 by Andrew Dalke.
+# Revisions copyright 2007-2008 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Alphabets used in Seq objects etc to declare sequence type and letters.
+
+This is used by sequences which contain a finite number of similar words.
+"""
+
+class Alphabet:
+ size = None # no fixed size for words
+ letters = None # no fixed alphabet; implement as a list-like interface
+ def __repr__(self):
+ return self.__class__.__name__ + "()"
+
+ def contains(self, other):
+ """Does this alphabet 'contain' the other (OBSOLETE?).
+
+ Returns a boolean. This relies on the Alphabet subclassing
+ hierarchy only, and does not check the letters property.
+ This isn't ideal, and doesn't seem to work as intended
+ with the AlphabetEncoder classes."""
+ return isinstance(other, self.__class__)
+
+generic_alphabet = Alphabet()
+
+class SingleLetterAlphabet(Alphabet):
+ size = 1
+ letters = None # string of all letters in the alphabet
+
+single_letter_alphabet = SingleLetterAlphabet()
+
+########### Protein
+
+class ProteinAlphabet(SingleLetterAlphabet):
+ pass
+
+generic_protein = ProteinAlphabet()
+
+########### DNA
+class NucleotideAlphabet(SingleLetterAlphabet):
+ pass
+
+generic_nucleotide = NucleotideAlphabet()
+
+class DNAAlphabet(NucleotideAlphabet):
+ pass
+
+generic_dna = DNAAlphabet()
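+
+# e.g. a doctest-style sketch of the subclass-based contains() check:
+# >>> generic_nucleotide.contains(generic_dna)
+# True
+# >>> generic_dna.contains(generic_nucleotide)
+# False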
+
+
+########### RNA
+
+class RNAAlphabet(NucleotideAlphabet):
+ pass
+
+generic_rna = RNAAlphabet()
+
+
+
+########### Other per-sequence encodings
+
+class SecondaryStructure(SingleLetterAlphabet):
+ letters = "HSTC"
+
+class ThreeLetterProtein(Alphabet):
+ size = 3
+ letters = [
+ "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
+ "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
+ "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
+ ]
+
+###### Non per-sequence modifications
+
+# (These are Decorator classes)
+
+class AlphabetEncoder:
+ def __init__(self, alphabet, new_letters):
+ self.alphabet = alphabet
+ self.new_letters = new_letters
+ if alphabet.letters is not None:
+ self.letters = alphabet.letters + new_letters
+ else:
+ self.letters = None
+ def __getattr__(self, key):
+ if key[:2] == "__" and key[-2:] == "__":
+ raise AttributeError(key)
+ return getattr(self.alphabet, key)
+
+ def __repr__(self):
+ return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
+ self.new_letters)
+
+ def contains(self, other):
+ """Does this alphabet 'contain' the other (OBSOLETE?).
+
+ This isn't implemented for the base AlphabetEncoder,
+ which will always return 0 (False)."""
+ return 0
+
+class Gapped(AlphabetEncoder):
+ def __init__(self, alphabet, gap_char = "-"):
+ AlphabetEncoder.__init__(self, alphabet, gap_char)
+ self.gap_char = gap_char
+
+ def contains(self, other):
+ """Does this alphabet 'contain' the other (OBSOLETE?).
+
+ Returns a boolean. This relies on the Alphabet subclassing
+ hierarchy, and attempts to check the gap character. This fails
+ if the other alphabet does not have a gap character!
+ """
+ return other.gap_char == self.gap_char and \
+ self.alphabet.contains(other.alphabet)
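+
+# e.g. repr() shows the decorator wrapping (a doctest-style sketch):
+# >>> Gapped(generic_protein)
+# Gapped(ProteinAlphabet(), '-')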
+
+class HasStopCodon(AlphabetEncoder):
+ def __init__(self, alphabet, stop_symbol = "*"):
+ AlphabetEncoder.__init__(self, alphabet, stop_symbol)
+ self.stop_symbol = stop_symbol
+
+ def __cmp__(self, other):
+ x = cmp(self.alphabet, other.alphabet)
+ if x == 0:
+ return cmp(self.stop_symbol, other.stop_symbol)
+ return x
+
+ def contains(self, other):
+ """Does this alphabet 'contain' the other (OBSOLETE?).
+
+ Returns a boolean. This relies on the Alphabet subclassing
+ hierarchy, and attempts to check the stop symbol. This fails
+ if the other alphabet does not have a stop symbol!
+ """
+ return other.stop_symbol == self.stop_symbol and \
+ self.alphabet.contains(other.alphabet)
+
+def _get_base_alphabet(alphabet) :
+ """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE)."""
+ a = alphabet
+ while isinstance(a, AlphabetEncoder) :
+ a = a.alphabet
+ assert isinstance(a, Alphabet), \
+ "Invalid alphabet found, %s" % repr(a)
+ return a
+
+def _consensus_base_alphabet(alphabets) :
+ """Returns a common but often generic base alphabet object (PRIVATE).
+
+ This throws away any AlphabetEncoder information, e.g. Gapped alphabets.
+
+ Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein -> generic single
+ letter. These DO NOT raise an exception!"""
+ common = None
+ for alpha in alphabets :
+ a = _get_base_alphabet(alpha)
+ if common is None :
+ common = a
+ elif common == a :
+ pass
+ elif isinstance(a, common.__class__) :
+ pass
+ elif isinstance(common, a.__class__) :
+ common = a
+ elif isinstance(a, NucleotideAlphabet) \
+ and isinstance(common, NucleotideAlphabet) :
+ #e.g. Give a mix of RNA and DNA alphabets
+ common = generic_nucleotide
+ elif isinstance(a, SingleLetterAlphabet) \
+ and isinstance(common, SingleLetterAlphabet) :
+ #This is a pretty big mis-match!
+ common = single_letter_alphabet
+ else :
+ #We have a major mis-match... take the easy way out!
+ return generic_alphabet
+ if common is None :
+ #Given NO alphabets!
+ return generic_alphabet
+ return common
+
+def _consensus_alphabet(alphabets) :
+ """Returns a common but often generic alphabet object (PRIVATE).
+
+ Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein -> generic single
+ letter. These DO NOT raise an exception!
+
+ This is aware of Gapped and HasStopCodon and new letters added by
+ other AlphabetEncoders. This WILL raise an exception if more than
+ one gap character or stop symbol is present."""
+ base = _consensus_base_alphabet(alphabets)
+ gap = None
+ stop = None
+ new_letters = ""
+ for alpha in alphabets :
+ #Gaps...
+ if not hasattr(alpha, "gap_char") :
+ pass
+ elif gap is None :
+ gap = alpha.gap_char
+ elif gap == alpha.gap_char :
+ pass
+ else :
+ raise ValueError("More than one gap character present")
+ #Stops...
+ if not hasattr(alpha, "stop_symbol") :
+ pass
+ elif stop is None :
+ stop = alpha.stop_symbol
+ elif stop == alpha.stop_symbol :
+ pass
+ else :
+ raise ValueError("More than one stop symbol present")
+ #New letters...
+ if hasattr(alpha, "new_letters") :
+ for letter in alpha.new_letters :
+ if letter not in new_letters \
+ and letter != gap and letter != stop :
+ new_letters += letter
+
+ alpha = base
+ if new_letters :
+ alpha = AlphabetEncoder(alpha, new_letters)
+ if gap :
+ alpha = Gapped(alpha, gap_char=gap)
+ if stop :
+ alpha = HasStopCodon(alpha, stop_symbol=stop)
+ return alpha
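+
+# e.g. mixing DNA and RNA falls back to the common nucleotide base class
+# (a doctest-style sketch):
+# >>> _consensus_alphabet([generic_dna, generic_rna])
+# NucleotideAlphabet()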
+
+def _check_type_compatible(alphabets) :
+ """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).
+
+ This relies on the Alphabet subclassing hierarchy. It does not
+ check things like gap characters or stop symbols."""
+ dna, rna, nucl, protein = False, False, False, False
+ for alpha in alphabets :
+ a = _get_base_alphabet(alpha)
+ if isinstance(a, DNAAlphabet) :
+ dna = True
+ nucl = True
+ if rna or protein : return False
+ elif isinstance(a, RNAAlphabet) :
+ rna = True
+ nucl = True
+ if dna or protein : return False
+ elif isinstance(a, NucleotideAlphabet) :
+ nucl = True
+ if protein : return False
+ elif isinstance(a, ProteinAlphabet) :
+ protein = True
+ if nucl : return False
+ return True
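+
+# e.g. (doctest-style sketch):
+# >>> _check_type_compatible([generic_dna, generic_nucleotide])
+# True
+# >>> _check_type_compatible([generic_dna, generic_rna])
+# False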
--- /dev/null
+"""General mechanisms to access applications in biopython.
+"""
+import os, sys
+import StringIO
+
+from Bio import File
+
+def generic_run(commandline):
+ """Run an application with the given commandline.
+
+ This expects a pre-built commandline that derives from
+ AbstractCommandline, and returns an ApplicationResult object
+ to get results from a program, along with handles to the
+ standard output and standard error.
+
+ WARNING - This will read the full program output into memory!
+ This may be an issue when the program writes a large amount of
+ data to standard output.
+ """
+ # print str(commandline)
+
+ #Try and use subprocess (available in python 2.4+)
+ try :
+ import subprocess, sys
+ #We don't need to supply any piped input, but we setup the
+ #standard input pipe anyway as a work around for a python
+ #bug if this is called from a Windows GUI program. For
+ #details, see http://bugs.python.org/issue1124861
+ child = subprocess.Popen(str(commandline),
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=(sys.platform!="win32"))
+ child.stdin.close()
+ r = child.stdout
+ e = child.stderr
+
+ r_out = r.read()
+ e_out = e.read()
+ r.close()
+ e.close()
+
+ # capture error code
+ error_code = child.wait()
+
+ except ImportError :
+ #For python 2.3 can't use subprocess, using popen2 instead
+ #(deprecated in python 2.6)
+ import popen2
+ if sys.platform[:3]=='win':
+ # Windows does not have popen2.Popen3
+ r, w, e = popen2.popen3(str(commandline))
+
+ r_out = r.read()
+ e_out = e.read()
+ w.close()
+ r.close()
+ e.close()
+
+ # No way to get the error code; setting it to a dummy variable
+ error_code = 0
+
+ else:
+ child = popen2.Popen3(str(commandline), 1)
+ # get information and close the files, so if we call this function
+ # repeatedly we won't end up with too many open files
+
+ # here are the file descriptors
+ r = child.fromchild
+ w = child.tochild
+ e = child.childerr
+
+ r_out = r.read()
+ e_out = e.read()
+ w.close()
+ r.close()
+ e.close()
+
+ # capture error code
+ error_code = os.WEXITSTATUS(child.wait())
+
+ return ApplicationResult(commandline, error_code), \
+ File.UndoHandle(StringIO.StringIO(r_out)), \
+ File.UndoHandle(StringIO.StringIO(e_out))
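+
+# Typical usage is a sketch along these lines, where 'cline' is assumed
+# to be an instance of some AbstractCommandline subclass:
+# result, r_handle, e_handle = generic_run(cline)
+# print result.return_code
+# print r_handle.read()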
+
+class ApplicationResult:
+ """Make results of a program available through a standard interface.
+
+ This tries to pick up output information available from the program
+ and make it available programmatically.
+ """
+ def __init__(self, application_cl, return_code):
+ """Intialize with the commandline from the program.
+ """
+ self._cl = application_cl
+
+ # provide the return code of the application
+ self.return_code = return_code
+
+ # get the application dependent results we can provide
+ # right now the only results we handle are output files
+ self._results = {}
+
+ for parameter in self._cl.parameters:
+ if "file" in parameter.param_types and \
+ "output" in parameter.param_types:
+ if parameter.is_set:
+ self._results[parameter.names[-1]] = parameter.value
+
+ def get_result(self, output_name):
+ """Retrieve result information for the given output.
+ """
+ return self._results[output_name]
+
+ def available_results(self):
+ """Retrieve a list of all available results.
+ """
+ result_names = self._results.keys()
+ result_names.sort()
+ return result_names
+
+class AbstractCommandline:
+ """Generic interface for running applications from biopython.
+
+ This class shouldn't be called directly; it should be subclassed to
+ provide an implementation for a specific application.
+ """
+ def __init__(self):
+ self.program_name = ""
+ self.parameters = []
+
+ def __str__(self):
+ """Make the commandline with the currently set options.
+ """
+ commandline = "%s " % self.program_name
+ for parameter in self.parameters:
+ if parameter.is_required and not(parameter.is_set):
+ raise ValueError("Parameter %s is not set." % parameter.names)
+ if parameter.is_set:
+ commandline += str(parameter)
+
+ return commandline
+
+ def set_parameter(self, name, value = None):
+ """Set a commandline option for a program.
+ """
+ set_option = 0
+ for parameter in self.parameters:
+ if name in parameter.names:
+ if value is not None:
+ self._check_value(value, name, parameter.checker_function)
+ parameter.value = value
+ parameter.is_set = 1
+ set_option = 1
+
+ if set_option == 0:
+ raise ValueError("Option name %s was not found." % name)
+
+ def _check_value(self, value, name, check_function):
+ """Check whether the given value is valid.
+
+ This uses the passed function 'check_function', which can either
+ return a [0, 1] (bad, good) value or raise an error. Either way
+ this function will raise an error if the value is not valid, or
+ finish silently otherwise.
+ """
+ if check_function is not None:
+ is_good = check_function(value)
+ if is_good in [0, 1]: # if we are dealing with a good/bad check
+ if not(is_good):
+ raise ValueError(
+ "Invalid parameter value %r for parameter %s" %
+ (value, name))
+
+class _AbstractParameter:
+ """A class to hold information about a parameter for a commandline.
+
+ Do not use this directly, instead use one of the subclasses.
+
+ Attributes:
+
+ o names -- a list of string names by which the parameter can be
+ referenced (e.g. ["-a", "--append", "append"]). The first name in
+ the list is considered to be the one that goes on the commandline,
+ for those parameters that print the option. The last name in the list
+ is assumed to be a "human readable" name describing the option in one
+ word.
+
+ o param_types -- a list of strings describing the type of parameter,
+ which can help let programs know how to use it. Example descriptions
+ include 'input', 'output', 'file'.
+
+ o checker_function -- a reference to a function that will determine
+ if a given value is valid for this parameter. This function can either
+ raise an error when given a bad value, or return a [0, 1] decision on
+ whether the value is correct.
+
+ o description -- a description of the option.
+
+ o is_required -- a flag to indicate if the parameter must be set for
+ the program to be run.
+
+ o is_set -- if the parameter has been set
+
+ o value -- the value of a parameter
+ """
+ def __init__(self, names = [], types = [], checker_function = None,
+ is_required = 0, description = ""):
+ self.names = names
+ self.param_types = types
+ self.checker_function = checker_function
+ self.description = description
+ self.is_required = is_required
+
+ self.is_set = 0
+ self.value = None
+
+class _Option(_AbstractParameter):
+ """Represent an option that can be set for a program.
+
+ This holds UNIXish options like --append=yes and -a yes
+ """
+ def __str__(self):
+ """Return the value of this option for the commandline.
+ """
+ # first deal with long options
+ if self.names[0].find("--") >= 0:
+ output = "%s" % self.names[0]
+ if self.value is not None:
+ output += "=%s " % self.value
+ else:
+ output += " "
+ # now short options
+ elif self.names[0].find("-") >= 0:
+ output = "%s " % self.names[0]
+ if self.value is not None:
+ output += "%s " % self.value
+ else:
+ raise ValueError("Unrecognized option type: %s" % self.names[0])
+
+ return output
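+
+# e.g. (a sketch) an option with names ["--append", "append"] and value
+# "yes" renders as "--append=yes ", while one with names ["-a", "append"]
+# renders as "-a yes " (note the trailing space used when concatenating).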
+
+class _Argument(_AbstractParameter):
+ """Represent an argument on a commandline.
+ """
+ def __str__(self):
+ if self.value is not None:
+ return "%s " % self.value
+ else:
+ return " "
--- /dev/null
+#TODO - Remove this workaround once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+from Bio import Alphabet
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData
+
+unambiguous_dna_by_name = {}
+unambiguous_dna_by_id = {}
+unambiguous_rna_by_name = {}
+unambiguous_rna_by_id = {}
+generic_by_name = {} # unambiguous DNA or RNA
+generic_by_id = {} # unambiguous DNA or RNA
+ambiguous_generic_by_name = {} # ambiguous DNA or RNA
+ambiguous_generic_by_id = {} # ambiguous DNA or RNA
+
+# The standard genetic code tables (set below when table id 1 is registered)
+standard_dna_table = None
+standard_rna_table = None
+
+# In the future, the back_table could return a statistically
+# appropriate distribution of codons, so do not cache the results of
+# back_table lookups!
+
+class TranslationError(Exception):
+ pass
+
+class CodonTable:
+ nucleotide_alphabet = Alphabet.generic_nucleotide
+ protein_alphabet = Alphabet.generic_protein
+
+ forward_table = {} # only includes codons which actually code
+ back_table = {} # for back translations
+ start_codons = []
+ stop_codons = []
+ # Not always called from derived classes!
+ def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
+ protein_alphabet = protein_alphabet,
+ forward_table = forward_table, back_table = back_table,
+ start_codons = start_codons, stop_codons = stop_codons):
+ self.nucleotide_alphabet = nucleotide_alphabet
+ self.protein_alphabet = protein_alphabet
+ self.forward_table = forward_table
+ self.back_table = back_table
+ self.start_codons = start_codons
+ self.stop_codons = stop_codons
+
+ def __str__(self) :
+ """Returns a simple text representation of the codon table
+
+ e.g.
+ >>> import Bio.Data.CodonTable
+ >>> print Bio.Data.CodonTable.standard_dna_table
+ >>> print Bio.Data.CodonTable.generic_by_id[1]"""
+
+ if self.id :
+ answer = "Table %i" % self.id
+ else :
+ answer = "Table ID unknown"
+ if self.names :
+ answer += " " + ", ".join(filter(None, self.names))
+
+ #Use the main four letters (and the conventional ordering)
+ #even for ambiguous tables
+ letters = self.nucleotide_alphabet.letters
+ if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
+ or (letters is not None and "T" in letters) :
+ letters = "TCAG"
+ else :
+ #Should be either RNA or generic nucleotides,
+ #e.g. Bio.Data.CodonTable.generic_by_id[1]
+ letters = "UCAG"
+
+ #Build the table...
+ answer=answer + "\n\n |" + "|".join( \
+ [" %s " % c2 for c2 in letters] \
+ ) + "|"
+ answer=answer + "\n--+" \
+ + "+".join(["---------" for c2 in letters]) + "+--"
+ for c1 in letters :
+ for c3 in letters :
+ line = c1 + " |"
+ for c2 in letters :
+ codon = c1+c2+c3
+ line = line + " %s" % codon
+ if codon in self.stop_codons :
+ line = line + " Stop|"
+ else :
+ try :
+ amino = self.forward_table[codon]
+ except KeyError :
+ amino = "?"
+ except TranslationError :
+ amino = "?"
+ if codon in self.start_codons :
+ line = line + " %s(s)|" % amino
+ else :
+ line = line + " %s |" % amino
+ line = line + " " + c3
+ answer = answer + "\n"+ line
+ answer=answer + "\n--+" \
+ + "+".join(["---------" for c2 in letters]) + "+--"
+ return answer
+
+def make_back_table(table, default_stop_codon):
+ # ONLY RETURNS A SINGLE CODON
+ # Do the sort so changes in the hash implementation won't affect
+ # the result when one amino acid is coded by more than one codon.
+ back_table = {}
+ keys = table.keys() ; keys.sort()
+ for key in keys:
+ back_table[table[key]] = key
+ back_table[None] = default_stop_codon
+ return back_table
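+
+# e.g. (doctest-style sketch; real tables hold all 64 codons, and when
+# several codons code for one amino acid the last in sorted order wins):
+# >>> make_back_table({'TTC': 'F', 'TTT': 'F'}, 'TAA')['F']
+# 'TTT'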
+
+
+class NCBICodonTable(CodonTable):
+ nucleotide_alphabet = Alphabet.generic_nucleotide
+ protein_alphabet = IUPAC.protein
+
+ def __init__(self, id, names, table, start_codons, stop_codons):
+ self.id = id
+ self.names = names
+ self.forward_table = table
+ self.back_table = make_back_table(table, stop_codons[0])
+ self.start_codons = start_codons
+ self.stop_codons = stop_codons
+
+
+class NCBICodonTableDNA(NCBICodonTable):
+ nucleotide_alphabet = IUPAC.unambiguous_dna
+
+class NCBICodonTableRNA(NCBICodonTable):
+ nucleotide_alphabet = IUPAC.unambiguous_rna
+
+
+
+def register_ncbi_table(name, alt_name, id,
+ table, start_codons, stop_codons):
+ names = name.split("; ")
+
+ dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
+ stop_codons)
+ # replace all T's with U's for the RNA tables
+ rna_table = {}
+ generic_table = {}
+ for codon, val in table.items():
+ generic_table[codon] = val
+ codon = codon.replace("T", "U")
+ generic_table[codon] = val
+ rna_table[codon] = val
+ rna_start_codons = []
+ generic_start_codons = []
+ for codon in start_codons:
+ generic_start_codons.append(codon)
+ codon = codon.replace("T", "U")
+ generic_start_codons.append(codon)
+ rna_start_codons.append(codon)
+ rna_stop_codons = []
+ generic_stop_codons = []
+ for codon in stop_codons:
+ generic_stop_codons.append(codon)
+ codon = codon.replace("T", "U")
+ generic_stop_codons.append(codon)
+ rna_stop_codons.append(codon)
+
+ generic = NCBICodonTable(id, names + [alt_name], generic_table,
+ generic_start_codons, generic_stop_codons)
+ rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
+ rna_start_codons, rna_stop_codons)
+
+ if id == 1:
+ global standard_dna_table, standard_rna_table
+ standard_dna_table = dna
+ standard_rna_table = rna
+
+ unambiguous_dna_by_id[id] = dna
+ unambiguous_rna_by_id[id] = rna
+ generic_by_id[id] = generic
+
+ if alt_name is not None:
+ names.append(alt_name)
+
+ for name in names:
+ unambiguous_dna_by_name[name] = dna
+ unambiguous_rna_by_name[name] = rna
+ generic_by_name[name] = generic
+
+### These tables created from the data file
+### ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
+### using the following:
+##import re
+##for line in open("gc.prt").readlines():
+## if line[:2] == " {":
+## names = []
+## id = None
+## aa = None
+## start = None
+## bases = []
+## elif line[:6] == "  name":
+## names.append(re.search('"([^"]*)"', line).group(1))
+## elif line[:8] == "    name":
+## names.append(re.search('"(.*)$', line).group(1))
+## elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
+## names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
+## elif line[:4] == "  id":
+## id = int(re.search('(\d+)', line).group(1))
+## elif line[:10] == "  ncbieaa ":
+## aa = line[12:12+64]
+## elif line[:10] == "  sncbieaa":
+## start = line[12:12+64]
+## elif line[:9] == "  -- Base":
+## bases.append(line[12:12+64])
+## elif line[:2] == " }":
+## assert names != [] and id is not None and aa is not None
+## assert start is not None and bases != []
+## if len(names) == 1:
+## names.append(None)
+## print "register_ncbi_table(name = %s," % repr(names[0])
+## print " alt_name = %s, id = %d", % \
+## (repr(names[1]), id)
+## print " table = {"
+## s = " "
+## for i in range(64):
+## if aa[i] != "*":
+## t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
+## bases[2][i], aa[i])
+## if len(s) + len(t) > 75:
+## print s
+## s = " " + t
+## else:
+## s = s + t
+## print s, "},"
+
+## s = " stop_codons = ["
+## for i in range(64):
+## if aa[i] == "*":
+## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
+## if len(s) + len(t) > 75:
+## print s
+## s = " " + t
+## else:
+## s = s + t
+## print s, "],"
+
+## s = " start_codons = ["
+## for i in range(64):
+## if start[i] == "M":
+## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
+## if len(s) + len(t) > 75:
+## print s
+## s = " " + t
+## else:
+## s = s + t
+## print s, "]"
+## print " )"
+## elif line[:2] == "--" or line == "\n" or line == "}\n" or \
+## line == 'Genetic-code-table ::= {\n':
+## pass
+## else:
+## raise Exception("Unparsed: " + repr(line))
+
+register_ncbi_table(name = 'Standard',
+ alt_name = 'SGC0', id = 1,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
+ 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
+ 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+ 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
+ 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
+ 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
+ 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+ 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
+ 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
+ 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
+ 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', 'TGA', ],
+ start_codons = [ 'TTG', 'CTG', 'ATG', ]
+ )
+register_ncbi_table(name = 'Vertebrate Mitochondrial',
+ alt_name = 'SGC1', id = 2,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V',
+ 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A',
+ 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E',
+ 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
+ start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
+ )
+register_ncbi_table(name = 'Yeast Mitochondrial',
+ alt_name = 'SGC2', id = 3,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T',
+ 'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
+ 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
+ alt_name = 'SGC3', id = 4,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
+ 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
+ 'ATA', 'ATG', 'GTG', ]
+ )
+register_ncbi_table(name = 'Invertebrate Mitochondrial',
+ alt_name = 'SGC4', id = 5,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
+ 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
+ 'GTG', ]
+ )
+register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
+ alt_name = 'SGC5', id = 6,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
+ 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
+ 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
+ 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
+ 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+ 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
+ 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
+ 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
+ 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+ 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
+ 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TGA', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Echinoderm Mitochondrial',
+ alt_name = 'SGC8', id = 9,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
+ 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Euplotid Nuclear',
+ alt_name = 'SGC9', id = 10,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
+ 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Bacterial',
+ alt_name = None, id = 11,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
+ 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
+ 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+ 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
+ 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
+ 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
+ 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+ 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
+ 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
+ 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
+ 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', 'TGA', ],
+ start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
+ 'ATG', 'GTG', ]
+ )
+register_ncbi_table(name = 'Alternative Yeast Nuclear',
+ alt_name = None, id = 12,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
+ 'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
+ 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+ 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
+ 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
+ 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
+ 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+ 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
+ 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
+ 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
+ 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', 'TGA', ],
+ start_codons = [ 'CTG', 'ATG', ]
+ )
+register_ncbi_table(name = 'Ascidian Mitochondrial',
+ alt_name = None, id = 13,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G',
+ 'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TAG', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Flatworm Mitochondrial',
+ alt_name = None, id = 14,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
+ 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
+ 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
+ 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
+ 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+ 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
+ 'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
+ 'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
+ 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+ 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
+ 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAG', ],
+ start_codons = [ 'ATG', ]
+ )
+register_ncbi_table(name = 'Blepharisma Macronuclear',
+ alt_name = None, id = 15,
+ table = {
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
+ 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
+ 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L',
+ 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
+ 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
+ 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
+ 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
+ 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
+ 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
+ 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
+ 'GGA': 'G', 'GGG': 'G', },
+ stop_codons = [ 'TAA', 'TGA', ],
+ start_codons = [ 'ATG', ]
+ )
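+
+# e.g. once registered, the tables can be used directly
+# (a doctest-style sketch):
+# >>> standard_dna_table.forward_table["ATG"]
+# 'M'
+# >>> standard_dna_table.stop_codons
+# ['TAA', 'TAG', 'TGA']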
+
+######### Deal with ambiguous forward translations
+
+class AmbiguousCodonTable(CodonTable):
+ def __init__(self, codon_table,
+ ambiguous_nucleotide_alphabet,
+ ambiguous_nucleotide_values,
+ ambiguous_protein_alphabet,
+ ambiguous_protein_values):
+ CodonTable.__init__(self,
+ ambiguous_nucleotide_alphabet,
+ ambiguous_protein_alphabet,
+ AmbiguousForwardTable(codon_table.forward_table,
+ ambiguous_nucleotide_values,
+ ambiguous_protein_values),
+ codon_table.back_table,
+
+ # These two are WRONG! I need to get the
+ # list of ambiguous codons which code for
+ # the stop codons XXX
+ list_ambiguous_codons(codon_table.start_codons, ambiguous_nucleotide_values),
+ list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values)
+ )
+ self._codon_table = codon_table
+
+ # Be sneaky and forward attribute lookups to the original table.
+ # This lets us get the names, if the original table is an NCBI
+ # table.
+ def __getattr__(self, name):
+ return getattr(self._codon_table, name)
+
+def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
+ c1, c2, c3 = codon
+ x1 = ambiguous_nucleotide_values[c1]
+ x2 = ambiguous_nucleotide_values[c2]
+ x3 = ambiguous_nucleotide_values[c3]
+ possible = {}
+ stops = []
+ for y1 in x1:
+ for y2 in x2:
+ for y3 in x3:
+ try:
+ possible[forward_table[y1+y2+y3]] = 1
+ except KeyError:
+ # If tripping over a stop codon
+ stops.append(y1+y2+y3)
+ if stops:
+ if possible.keys():
+ raise TranslationError("ambiguous codon '%s' codes " % codon \
+ + "for both proteins and stop codons")
+ # This is a true stop codon - tell the caller about it
+ raise KeyError(codon)
+ return possible.keys()
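+
+# e.g. "RAT" (R = A or G) can give AAT (N) or GAT (D); a doctest-style
+# sketch, with sorted() just fixing the dictionary ordering:
+# >>> sorted(list_possible_proteins("RAT", standard_dna_table.forward_table,
+# ... IUPACData.ambiguous_dna_values))
+# ['D', 'N']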
+
+def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
+ """Extends a codon list to include all possible ambigous codons.
+
+ e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
+ ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']
+
+ Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
+ Thus only two more codons are added in the following:
+
+ e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TRA', 'TAR']
+
+ Returns a new (longer) list of codon strings.
+ """
+
+ #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
+ #This will generate things like 'TRR' from ['TAG', 'TGA'], which
+ #we don't want to include:
+ c1_list = [letter for (letter, meanings) \
+ in ambiguous_nucleotide_values.iteritems() \
+ if set([codon[0] for codon in codons]).issuperset(set(meanings))]
+ c2_list = [letter for (letter, meanings) \
+ in ambiguous_nucleotide_values.iteritems() \
+ if set([codon[1] for codon in codons]).issuperset(set(meanings))]
+ c3_list = [letter for (letter, meanings) \
+ in ambiguous_nucleotide_values.iteritems() \
+ if set([codon[2] for codon in codons]).issuperset(set(meanings))]
+ candidates = set([c1+c2+c3 for c1 in c1_list for c2 in c2_list for c3 in c3_list])
+ candidates.difference_update(codons)
+ answer = codons[:] #copy
+ #print "Have %i new candidates" % len(candidates)
+ for ambig_codon in candidates :
+ wanted = True
+ #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
+ for codon in [c1+c2+c3 \
+ for c1 in ambiguous_nucleotide_values[ambig_codon[0]] \
+ for c2 in ambiguous_nucleotide_values[ambig_codon[1]] \
+ for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]:
+ if codon not in codons :
+ #This ambiguous codon can code for a non-stop, exclude it!
+ wanted=False
+ #print "Rejecting %s" % ambig_codon
+ continue
+ if wanted :
+ answer.append(ambig_codon)
+ return answer
+assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA']
+assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA']
+assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR']
+assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR']
+assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']
+
+# Forward translation is "onto", that is, any given codon always maps
+# to the same protein, or it doesn't map at all. Thus, I can build
+# off of an existing table to produce the ambiguous mappings.
+#
+# This handles the general case. Perhaps it's overkill?
+# >>> t = CodonTable.ambiguous_dna_by_id[1]
+# >>> t.forward_table["AAT"]
+# 'N'
+# >>> t.forward_table["GAT"]
+# 'D'
+# >>> t.forward_table["RAT"]
+# 'B'
+# >>> t.forward_table["YTA"]
+# 'L'
+
+class AmbiguousForwardTable:
+ def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
+ self.forward_table = forward_table
+
+ self.ambiguous_nucleotide = ambiguous_nucleotide
+ self.ambiguous_protein = ambiguous_protein
+
+ inverted = {}
+ for name, val in ambiguous_protein.items():
+ for c in val:
+ x = inverted.get(c, {})
+ x[name] = 1
+ inverted[c] = x
+ for name, val in inverted.items():
+ inverted[name] = val.keys()
+ self._inverted = inverted
+
+ self._cache = {}
+
+ def get(self, codon, failobj = None):
+ try:
+ return self.__getitem__(codon)
+ except KeyError:
+ return failobj
+
+ def __getitem__(self, codon):
+ try:
+ x = self._cache[codon]
+ except KeyError:
+ pass
+ else:
+ if x is TranslationError:
+ raise TranslationError(codon) # no unique translation
+ if x is KeyError:
+ raise KeyError(codon) # it's a stop codon
+ return x
+ try:
+ x = self.forward_table[codon]
+ self._cache[codon] = x
+ return x
+ except KeyError:
+ pass
+
+ # XXX Need to make part of this into a method which returns
+ # a list of all possible encodings for a codon!
+ try:
+ possible = list_possible_proteins(codon,
+ self.forward_table,
+ self.ambiguous_nucleotide)
+ except KeyError:
+ self._cache[codon] = KeyError
+ raise KeyError(codon) # stop codon
+ except TranslationError:
+ self._cache[codon] = TranslationError
+ raise TranslationError(codon) # does not code
+ assert len(possible) > 0, "unambiguous codons must code"
+
+ # Hah! Only one possible protein, so use it
+ if len(possible) == 1:
+ self._cache[codon] = possible[0]
+ return possible[0]
+
+ # See if there's an ambiguous protein encoding for the multiples.
+ # Find residues which exist in every coding set.
+ ambiguous_possible = {}
+ for amino in possible:
+ for term in self._inverted[amino]:
+ ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1
+
+ n = len(possible)
+ possible = []
+ for amino, val in ambiguous_possible.items():
+ if val == n:
+ possible.append(amino)
+
+ # No amino acid encoding for the results
+ if len(possible) == 0:
+ self._cache[codon] = TranslationError
+ raise TranslationError(codon) # no valid translation
+
+ # All of these are valid, so choose one
+ # To be unique, sort by smallest ambiguity then alphabetically
+ # Can get this if "X" encodes for everything.
+ def _sort(x, y, table = self.ambiguous_protein):
+ a = cmp(len(table[x]), len(table[y]))
+ if a == 0:
+ return cmp(x, y)
+ return a
+ possible.sort(_sort)
+
+ x = possible[0]
+ self._cache[codon] = x
+ return x
+
+#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
+ambiguous_dna_by_name = {}
+for key, val in unambiguous_dna_by_name.items():
+ ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
+ IUPAC.ambiguous_dna,
+ IUPACData.ambiguous_dna_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+ambiguous_dna_by_id = {}
+for key, val in unambiguous_dna_by_id.items():
+ ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
+ IUPAC.ambiguous_dna,
+ IUPACData.ambiguous_dna_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+
+ambiguous_rna_by_name = {}
+for key, val in unambiguous_rna_by_name.items():
+ ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
+ IUPAC.ambiguous_rna,
+ IUPACData.ambiguous_rna_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+ambiguous_rna_by_id = {}
+for key, val in unambiguous_rna_by_id.items():
+ ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
+ IUPAC.ambiguous_rna,
+ IUPACData.ambiguous_rna_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+
+#The following isn't very elegant, but seems to work nicely.
+_merged_values = dict(IUPACData.ambiguous_rna_values.iteritems())
+_merged_values["T"] = "U"
+
+for key, val in generic_by_name.items():
+ ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
+ Alphabet.NucleotideAlphabet(),
+ _merged_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+
+for key, val in generic_by_id.items():
+ ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
+ Alphabet.NucleotideAlphabet(),
+ _merged_values,
+ IUPAC.extended_protein,
+ IUPACData.extended_protein_values)
+del _merged_values
+del key, val
+
+#Basic sanity tests:
+for n in ambiguous_generic_by_id.keys() :
+ assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
+ assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
+ assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
+ #R = A or G, so URA = UAA or UGA / TRA = TAA or TGA = stop codons
+ if "UAA" in unambiguous_rna_by_id[n].stop_codons \
+ and "UGA" in unambiguous_rna_by_id[n].stop_codons :
+ try :
+ print ambiguous_dna_by_id[n].forward_table["TRA"]
+ assert False, "Should be a stop only"
+ except KeyError :
+ pass
+ assert "URA" in ambiguous_generic_by_id[n].stop_codons
+ assert "URA" in ambiguous_rna_by_id[n].stop_codons
+ assert "TRA" in ambiguous_generic_by_id[n].stop_codons
+ assert "TRA" in ambiguous_dna_by_id[n].stop_codons
+del n
+assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
+assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
+assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons
--- /dev/null
+# Information about the IUPAC alphabets
+
+protein_letters = "ACDEFGHIKLMNPQRSTVWY"
+extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
+# B = "Asx"; aspartic acid or asparagine (D or N)
+# X = "Xxx"; unknown or 'other' amino acid
+# Z = "Glx"; glutamic acid or glutamine (E or Q)
+# J = "Xle"; leucine or isoleucine (L or I, used in mass-spec)
+# U = "Sec"; selenocysteine
+# O = "Pyl"; pyrrolysine
+ambiguous_dna_letters = "GATCRYWSMKHBVDN"
+unambiguous_dna_letters = "GATC"
+ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
+unambiguous_rna_letters = "GAUC"
+
+# B == 5-bromouridine
+# D == 5,6-dihydrouridine
+# S == thiouridine
+# W == wyosine
+extended_dna_letters = "GATCBDSW"
+
+# are there extended forms?
+#extended_rna_letters = "GAUCBDSW"
+
+ambiguous_dna_values = {
+ "A": "A",
+ "C": "C",
+ "G": "G",
+ "T": "T",
+ "M": "AC",
+ "R": "AG",
+ "W": "AT",
+ "S": "CG",
+ "Y": "CT",
+ "K": "GT",
+ "V": "ACG",
+ "H": "ACT",
+ "D": "AGT",
+ "B": "CGT",
+ "X": "GATC",
+ "N": "GATC",
+ }
+ambiguous_rna_values = {
+ "A": "A",
+ "C": "C",
+ "G": "G",
+ "U": "U",
+ "M": "AC",
+ "R": "AG",
+ "W": "AU",
+ "S": "CG",
+ "Y": "CU",
+ "K": "GU",
+ "V": "ACG",
+ "H": "ACU",
+ "D": "AGU",
+ "B": "CGU",
+ "X": "GAUC",
+ "N": "GAUC",
+ }
+
+ambiguous_dna_complement = {
+ "A": "T",
+ "C": "G",
+ "G": "C",
+ "T": "A",
+ "M": "K",
+ "R": "Y",
+ "W": "W",
+ "S": "S",
+ "Y": "R",
+ "K": "M",
+ "V": "B",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "X": "X",
+ "N": "N",
+ }
+
+ambiguous_rna_complement = {
+ "A": "U",
+ "C": "G",
+ "G": "C",
+ "U": "A",
+ "M": "K",
+ "R": "Y",
+ "W": "W",
+ "S": "S",
+ "Y": "R",
+ "K": "M",
+ "V": "B",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "X": "X",
+ "N": "N",
+ }
+
+
+def _make_ranges(dict):
+ d = {}
+ for key, value in dict.items():
+ d[key] = (value, value)
+ return d
+
+# From bioperl's SeqStats.pm
+unambiguous_dna_weights = {
+ "A": 347.,
+ "C": 323.,
+ "G": 363.,
+ "T": 322.,
+ }
+unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
+
+unambiguous_rna_weights = {
+ "A": unambiguous_dna_weights["A"] + 16., # 16 for the oxygen
+ "C": unambiguous_dna_weights["C"] + 16.,
+ "G": unambiguous_dna_weights["G"] + 16.,
+ "U": 340.,
+}
+unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
+
+def _make_ambiguous_ranges(dict, weight_table):
+ range_d = {}
+ avg_d = {}
+ for letter, values in dict.items():
+ #Following line is a quick hack to skip undefined weights for U and O
+ if len(values)==1 and values[0] not in weight_table : continue
+ weights = map(weight_table.get, values)
+ range_d[letter] = (min(weights), max(weights))
+ total_w = 0.0
+ for w in weights:
+ total_w = total_w + w
+ avg_d[letter] = total_w / len(weights)
+ return range_d, avg_d
+
+ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = \
+ _make_ambiguous_ranges(ambiguous_dna_values,
+ unambiguous_dna_weights)
+
+ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = \
+ _make_ambiguous_ranges(ambiguous_rna_values,
+ unambiguous_rna_weights)
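+
+# e.g. the DNA letter "R" (A or G) spans the A and G weights
+# (a doctest-style sketch):
+# >>> ambiguous_dna_weight_ranges["R"]
+# (347.0, 363.0)
+# >>> avg_ambiguous_dna_weights["R"]
+# 355.0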
+
+protein_weights = {
+ "A": 89.09,
+ "C": 121.16,
+ "D": 133.10,
+ "E": 147.13,
+ "F": 165.19,
+ "G": 75.07,
+ "H": 155.16,
+ "I": 131.18,
+ "K": 146.19,
+ "L": 131.18,
+ "M": 149.21,
+ "N": 132.12,
+ #"O": 0.0, # Needs to be recorded!
+ "P": 115.13,
+ "Q": 146.15,
+ "R": 174.20,
+ "S": 105.09,
+ "T": 119.12,
+ #"U": 168.05, # To be confirmed
+ "V": 117.15,
+ "W": 204.23,
+ "Y": 181.19
+ }
+
+extended_protein_values = {
+ "A": "A",
+ "B": "ND",
+ "C": "C",
+ "D": "D",
+ "E": "E",
+ "F": "F",
+ "G": "G",
+ "H": "H",
+ "I": "I",
+ "J": "IL",
+ "K": "K",
+ "L": "L",
+ "M": "M",
+ "N": "N",
+ "O": "O",
+ "P": "P",
+ "Q": "Q",
+ "R": "R",
+ "S": "S",
+ "T": "T",
+ "U": "U",
+ "V": "V",
+ "W": "W",
+ "X": "ACDEFGHIKLMNPQRSTVWY",
+ #TODO - Include U and O in the possible values of X?
+ #This could alter the extended_protein_weight_ranges ...
+ "Y": "Y",
+ "Z": "QE",
+}
+
+protein_weight_ranges = _make_ranges(protein_weights)
+
+extended_protein_weight_ranges, avg_extended_protein_weights = \
+ _make_ambiguous_ranges(extended_protein_values,
+ protein_weights)
+
+
+
--- /dev/null
+# This is a Python module
+"""Collections of various bits of useful biological data.
+"""
--- /dev/null
+# Copyright 2002 by Andrew Dalke.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Decode elements from a Std/Martel parsed XML stream (OBSOLETE).
+
+Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules
+(and therefore Bio.Decode) have been deprecated. They are no longer used in
+any of the current Biopython parsers, and are likely to be removed in a
+future release."""
+
+import warnings
+warnings.warn("Martel and those parts of Biopython depending on it" \
+ +" directly (such as Bio.Mindy and Bio.Decode) are now" \
+ +" deprecated, and will be removed in a future release of"\
+ +" Biopython. If you want to continue to use this code,"\
+ +" please get in contact with the Biopython developers via"\
+ +" the mailing lists to avoid its permanent removal from"\
+ +" Biopython.", \
+ DeprecationWarning)
+
+import string
+from Bio.Parsers.spark import GenericScanner, GenericParser
+
+def unescape_C(s):
+    # Walk the string with an explicit index so that the characters making
+    # up an escape sequence are consumed and not processed a second time.
+    result = []
+    i = 0
+    while i < len(s):
+        if s[i] != "\\":
+            result.append(s[i])
+            i = i + 1
+            continue
+        c = s[i+1:i+2]
+        if c == "x":
+            x = s[i+2:i+4]
+            if len(x) != 2:
+                raise ValueError("invalid \\x escape")
+            result.append(chr(int(x, 16)))
+            i = i + 4
+            continue
+        if c and c in "01234567":
+            # \octals don't do a length assertion check
+            x = s[i+1:i+4]
+            result.append(chr(int(x, 8)))
+            i = i + 1 + len(x)
+            continue
+        result.append(c)
+        i = i + 2
+    return "".join(result)
+
+def join_english(fields):
+ if not fields:
+ return ""
+ s = fields[0]
+ for field in fields[1:]:
+ if s[-1:] == "-" and s[-3:-2] == "-":
+ s = s + field
+ continue
+ if s.find(" ") == -1 and field.find(" ") == -1:
+ s = s + field
+ continue
+ s = s + " " + field
+ return (" ".join(s.split())).strip()
+
+
+
+def chomp(s, c):
+ if s[-1:] == c:
+ return s[:-1]
+ return s
+
+def lchomp(s, c):
+ if s[:1] == c:
+ return s[1:]
+ return s
+
+def chompchomp(s, c):
+ if s[:1] == c and s[-1:] == c:
+ return s[1:-1]
+ return s
+
+def fixspaces(s):
+ # s.split breaks down to a list of words
+ # " ".join puts them together
+ # strip removes leading and trailing spaces
+ return " ".join(s.split()).strip()
+
+def join_fixspaces(lines):
+ return " ".join((" ".join(lines)).split()).strip()
+
+def tr(s, frm, to):
+ table = string.maketrans(frm, to)
+ return s.translate(table)
+
+def safe_int(s):
+ """converts to int if the number is small, long if it's large"""
+ try:
+ return int(s)
+ except ValueError:
+ return long(s)
+
+decode_functions = {
+ "chomp": (chomp, str, str),
+ "chompchomp": (chompchomp, str, str),
+ "chop": (lambda s: s[:-1], str, str),
+ "chopchop": (lambda s: s[1:-1], str, str),
+ "fixspaces": (fixspaces, str, str),
+ "lchomp": (lchomp, str, str),
+ "lchop": (lambda s: s[1:], str, str),
+ "lower": (lambda s: s.lower(), str, str),
+ "lstrip": (lambda s: s.lstrip(), str, str),
+ "replace": (lambda s, old, new: s.replace(old, new), str, str),
+ "rstrip": (lambda s: s.rstrip(), str, str),
+ "str": (str, str, str),
+ "strip": (lambda s: s.strip(), str, str),
+ "tr": (tr, str, str),
+ "unescape.c": (unescape_C, str, str),
+ "unescape.doublequote": (lambda s: s.replace('""', '"'), str, str),
+ "unescape.singlequote": (lambda s: s.replace("''", "'"), str, str),
+ "upper": (lambda s: s.upper(), str, str),
+
+ # List operations
+ "join": (lambda lst, s = " ": s.join(lst), list, str),
+ "join.english": (join_english, list, str),
+
+ # Integer operations
+ "int": (safe_int, [float, str, int], int),
+ "int.comma": (lambda s: safe_int(s.replace(",", "")),
+ [float, str, int], int),
+ "hex": (hex, str, int),
+ "oct": (oct, str, int),
+ "add": ((lambda i, j: i+j), int, int),
+
+ # Float operations
+ "float": (float, (float, str, int), float),
+
+ }
+
+def _fixup_defs():
+ # Normalize so the 2nd and 3rd terms are tuples
+ for k, v in decode_functions.items():
+ f, in_types, out_types = v
+ if isinstance(in_types, type([])):
+ in_types = tuple(in_types)
+ elif not isinstance(in_types, type( () )):
+ in_types = (in_types,)
+
+ if isinstance(out_types, type([])):
+ out_types = tuple(out_types)
+ elif not isinstance(out_types, type( () )):
+ out_types = (out_types,)
+
+ decode_functions[k] = (f, in_types, out_types)
+_fixup_defs()
+
+class Token:
+ def __init__(self, type):
+ self.type = type
+ def __cmp__(self, other):
+ return cmp(self.type, other)
+ def __repr__(self):
+ return "Token(%r)" % (self.type,)
+
+class ValueToken(Token):
+ def __init__(self, type, val):
+ Token.__init__(self, type)
+ self.val = val
+ def __cmp__(self, other):
+ return cmp(self.type, other)
+ def __repr__(self):
+ return "%s(%r)" % (self.__class__.__name__, self.val)
+ def __str__(self):
+ return str(self.val)
+
+class Integer(ValueToken):
+ def __init__(self, val):
+ ValueToken.__init__(self, "integer", val)
+
+class Float(ValueToken):
+ def __init__(self, val):
+ ValueToken.__init__(self, "float", val)
+
+class String(ValueToken):
+ def __init__(self, val):
+ ValueToken.__init__(self, "string", val)
+
+class FunctionName(ValueToken):
+ def __init__(self, val):
+ ValueToken.__init__(self, "functionname", val)
+
+class DecodeScanner(GenericScanner):
+ def __init__(self):
+ GenericScanner.__init__(self)
+
+ def tokenize(self, input):
+ self.rv = []
+ GenericScanner.tokenize(self, input)
+ return self.rv
+
+ def t_functionname(self, input):
+ r" \w+(\.\w+)*"
+ self.rv.append(FunctionName(input))
+
+ def t_pipe(self, input):
+ r" \| "
+ self.rv.append(Token("pipe"))
+
+ def t_open_paren(self, input):
+ r" \( "
+ self.rv.append(Token("open_paren"))
+
+ def t_close_paren(self, input):
+ r" \) "
+ self.rv.append(Token("close_paren"))
+
+ def t_comma(self, input):
+ r" , "
+ self.rv.append(Token("comma"))
+
+ def t_whitespace(self, input):
+ r" \s+ "
+ pass
+
+ def t_string(self, input):
+ r""" "([^"\\]+|\\.)*"|'([^'\\]+|\\.)*' """
+ # "' # emacs cruft
+ s = input[1:-1]
+ s = unescape_C(s)
+
+ self.rv.append(String(s))
+
+ def t_float(self, input):
+ r""" [+-]?((\d+(\.\d*)?)|\.\d+)([eE][+-]?[0-9]+)? """
+ # See if this is an integer
+ try:
+ self.rv.append(Integer(safe_int(input)))
+ except ValueError:
+ self.rv.append(Float(float(input)))
+
+class Function:
+ def __init__(self, name, args = ()):
+ self.name = name
+ self.args = args
+ def __str__(self):
+ args = self.args
+ if not args:
+ s = ""
+ else:
+ s = str(args)[1:-1]
+ return "%s(x, %s)" % (self.name, s)
+ __repr__ = __str__
+
+class DecodeParser(GenericParser):
+ def __init__(self, start = "expression"):
+ GenericParser.__init__(self, start)
+ self.begin_pos = 0
+
+ def p_expression(self, args):
+ """
+ expression ::= term
+ expression ::= term pipe expression
+ """
+ if len(args) == 1:
+ return [args[0]]
+ return [args[0]] + args[2]
+
+ def p_term(self, args):
+ """
+ term ::= functionname
+ term ::= functionname open_paren args close_paren
+ """
+ if len(args) == 1:
+ return Function(args[0].val)
+ return Function(args[0].val, tuple([x.val for x in args[2]]))
+
+ def p_args(self, args):
+ """
+ args ::= arg
+ args ::= arg comma args
+ """
+ if len(args) == 1:
+ return [args[0]]
+ return [args[0]] + args[2]
+
+ def p_arg(self, args):
+ """
+ arg ::= string
+ arg ::= integer
+ arg ::= float
+ """
+ return args[0]
+
+def scan(input):
+ scanner = DecodeScanner()
+ return scanner.tokenize(input)
+
+def parse(tokens):
+ parser = DecodeParser()
+ return parser.parse(tokens)
+
+_decoder_cache = {}
+
+class FunctionCall:
+ def __init__(self, f, args):
+ self.f = f
+ self.args = args
+ def __call__(self, x):
+ return self.f(x, *self.args)
+
+class FunctionCallChain:
+ def __init__(self, inner_f, f, args):
+ self.inner_f = inner_f
+ self.f = f
+ self.args = args
+ def __call__(self, x):
+ return self.f(self.inner_f(x), *self.args)
+
+#### I don't think this is the right way to do things
+##class CheckTypes:
+## def __init__(self, f, call_types, return_types):
+## self.f = f
+## self.call_types = call_types
+## self.return_types = return_types
+## def __call__(self, x):
+## if self.call_types is not None:
+## for T in self.call_types:
+## if isinstance(x, T):
+## break
+## else:
+## raise TypeError(
+## "Call value %s of type %s, expecting one of %s" %
+## (x, type(x).__name__,
+## [T.name for T in self.call_types]))
+## y = self.f(x)
+
+## if not self.return_types:
+## return y
+
+## for T in self.return_types:
+## if isinstance(y, T):
+## return y
+## raise TypeError("Return value %s of type %s, expecting one of %s" %
+## (y, type(y).__name__,
+## [T.name for T in self.return_types]))
+
+def make_decoder(s):
+ try:
+ return _decoder_cache[s]
+ except KeyError:
+ pass
+
+ functions = parse(scan(s))
+
+ f = functions[0]
+ fc = decode_functions[f.name][0]
+ args = f.args
+ if args:
+ fc = FunctionCall(fc, args)
+ for f in functions[1:]:
+ fc = FunctionCallChain(fc, decode_functions[f.name][0], f.args)
+ _decoder_cache[s] = fc
+ return fc
+
+def _verify_subtypes(subset, total, old_name, new_name):
+ for x in subset:
+ if x not in total:
+ raise TypeError("%s can produce a %r value not accepted by %s" %
+ (old_name, x.__name__, new_name))
+
+_typechecked_decoder_cache = {}
+def make_typechecked_decoder(s, input_types = None, output_types = None):
+ cache_lookup = (s, input_types, output_types)
+ try:
+ return _typechecked_decoder_cache[cache_lookup]
+ except KeyError:
+ pass
+ if input_types is not None and not isinstance(input_types, type( () )):
+ input_types = (input_types,)
+ if output_types is not None and not isinstance(output_types, type( () )):
+ output_types = (output_types,)
+
+ functions = parse(scan(s))
+
+ # Make sure the input type(s) are allowed
+ f = functions[0]
+ fc, in_types, out_types = decode_functions[f.name]
+ if input_types is not None:
+ for x in input_types:
+ if x not in in_types:
+ raise TypeError(
+ "the input type includes %r which isn't supported by %s" %
+ (x.__name__, f.name))
+
+ # Do the composition
+ old_name = f.name
+ input_types = out_types
+ args = functions[0].args
+ if args:
+ fc = FunctionCall(fc, args)
+
+ for f in functions[1:]:
+ transform_func, in_types, out_types = decode_functions[f.name]
+ _verify_subtypes(input_types, in_types, old_name, f.name)
+ old_name = f.name
+ input_types = out_types
+ fc = FunctionCallChain(fc, transform_func, f.args)
+
+ if output_types is not None:
+ _verify_subtypes(input_types, output_types, old_name, "the output")
+ _typechecked_decoder_cache[cache_lookup] = fc
+ return fc
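+
+# Example usage (a sketch): the decoder string "strip|int" strips
+# whitespace and then applies safe_int, and the declared types are
+# checked while the chain is composed:
+#
+# >>> make_typechecked_decoder("strip|int", input_types=str,
+# ...                          output_types=int)("  42  ")
+# 42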
+
+
+def test():
+ assert make_decoder("chop")("Andrew") == "Andre"
+ assert make_decoder("int")("9") == 9
+ assert make_decoder('join(" ")')(["Andrew", "Dalke"]) == \
+ "Andrew Dalke"
+ assert make_decoder('chomp("|")')("|test|") == "|test"
+ assert make_decoder('chomp("|")')("|test") == "|test"
+ assert make_decoder('chomp("A")|chop')("BA") == ""
+ assert make_decoder('chomp("A")|chop')("AB") == "A"
+ assert make_decoder('chop|chomp("A")')("AB") == ""
+ assert make_decoder('chop|chomp("A")')("BA") == "B"
+ assert make_decoder('add(5)')(2) == 7
+ assert make_decoder('add(-2)')(5) == 3
+
+if __name__ == "__main__":
+ test()
--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright 2002-2003 by Michael Hoffman. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""
+Bio.DocSQL: easy access to DB API databases.
+
+>>> import DocSQL, MySQLdb, os
+>>> db=MySQLdb.connect(passwd='', db='test')
+>>> class CreatePeople(DocSQL.Create):
+... \"""
+... CREATE TEMPORARY TABLE people
+... (id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
+... last_name TINYTEXT,
+... first_name TINYTEXT)
+... \"""
+...
+>>> CreatePeople(connection=db)
+CreatePeople(message=Success)
+"""
+
+__version__ = "$Revision: 1.13 $"
+# $Source: /home/repository/biopython/biopython/Bio/DocSQL.py,v $
+
+import exceptions
+import sys
+
+from Bio import MissingExternalDependencyError
+
+try:
+ import MySQLdb
+except ImportError:
+ raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.DocSQL.")
+
+connection = None
+
+class NoInsertionError(exceptions.Exception):
+ pass
+
+def _check_is_public(name):
+ if name[:6] == "_names":
+ raise AttributeError
+
+class QueryRow(list):
+ def __init__(self, cursor):
+ try:
+ row = cursor.fetchone()
+ super(QueryRow, self).__init__(row)
+ except TypeError:
+ raise StopIteration
+
+ object.__setattr__(self, "_names", [x[0] for x in cursor.description]) # FIXME: legacy
+ object.__setattr__(self, "_names_hash", {})
+
+ for i, name in enumerate(self._names):
+ self._names_hash[name] = i
+
+ def __getattr__(self, name):
+ _check_is_public(name)
+ try:
+ return self[self._names_hash[name]]
+ except (KeyError, AttributeError) :
+ raise AttributeError("'%s' object has no attribute '%s'" \
+ % (self.__class__.__name__, name))
+
+ def __setattr__(self, name, value):
+ try:
+ self._names_hash
+ except AttributeError:
+ return object.__setattr__(self, name, value)
+
+ _check_is_public(name)
+ try:
+ index = self._names_hash[name]
+ self[index] = value
+ except KeyError:
+ return object.__setattr__(self, name, value)
+
+class Query(object):
+ """
+ SHOW TABLES
+ """
+ MSG_FAILURE = "Failure"
+ MSG_SUCCESS = "Success"
+ message = "not executed"
+ error_message = ""
+ prefix = ""
+ suffix = ""
+ row_class = QueryRow
+
+ def __init__(self, *args, **keywds):
+ try:
+ self.connection = keywds['connection']
+ except KeyError:
+ self.connection = connection
+ try:
+ self.diagnostics = keywds['diagnostics']
+ except KeyError:
+ self.diagnostics = 0
+
+ self.statement = self.prefix + self.__doc__ + self.suffix
+ self.params = args
+
+ def __iter__(self):
+ return IterationCursor(self, self.connection)
+
+ def __repr__(self):
+ return "%s(message=%s)" % (self.__class__.__name__, self.message)
+
+ def cursor(self):
+ return iter(self).cursor
+
+ def dump(self):
+ for item in self:
+ print item
+
+class QueryGeneric(Query):
+ def __init__(self, statement, *args, **keywds):
+ Query.__init__(self, *args, **keywds)
+        self.statement = statement
+
+class IterationCursor(object):
+ def __init__(self, query, connection=connection):
+ if connection is None:
+ raise TypeError("database connection is None")
+ self.cursor = connection.cursor()
+ self.row_class = query.row_class
+ if query.diagnostics:
+ print >>sys.stderr, query.statement
+ print >>sys.stderr, query.params
+ self.cursor.execute(query.statement, query.params)
+
+ def next(self):
+ return self.row_class(self.cursor)
+
+class QuerySingle(Query, QueryRow):
+ ignore_warnings = 0
+ def __init__(self, *args, **keywds):
+ message = self.MSG_FAILURE
+ Query.__init__(self, *args, **keywds)
+ try:
+ self.single_cursor = Query.cursor(self)
+ except MySQLdb.Warning:
+ if not self.ignore_warnings:
+ raise
+ self.row_class.__init__(self, self.cursor())
+ object.__setattr__(self, "message", self.MSG_SUCCESS)
+
+ def cursor(self):
+ return self.single_cursor
+
+class QueryAll(list, Query):
+ def __init__(self, *args, **keywds):
+ Query.__init__(self, *args, **keywds)
+ list.__init__(self, map(self.process_row, self.cursor().fetchall()))
+
+ def process_row(self, row):
+ return row
+
+class QueryAllFirstItem(QueryAll):
+ def process_row(self, row):
+ return row[0]
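+
+# A hypothetical usage sketch: as with the module docstring's CreatePeople
+# example, the SQL lives in the subclass docstring, and QueryAllFirstItem
+# flattens a single-column result:
+#
+# class LastNames(QueryAllFirstItem):
+#     """
+#     SELECT last_name FROM people
+#     """
+#
+# for last_name in LastNames(connection=db):
+#     print last_name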
+
+class Create(QuerySingle):
+ def __init__(self, *args, **keywds):
+ try:
+ QuerySingle.__init__(self, *args, **keywds)
+ except StopIteration:
+ self.message = self.MSG_SUCCESS
+
+class Update(Create):
+ pass
+
+class Insert(Create):
+ MSG_INTEGRITY_ERROR = "Couldn't insert: %s. "
+
+ def __init__(self, *args, **keywds):
+ try:
+ Create.__init__(self, *args, **keywds)
+ except MySQLdb.IntegrityError, error_data:
+ self.error_message += self.MSG_INTEGRITY_ERROR % error_data[1]
+ try:
+ self.total_count
+ except AttributeError:
+ self.total_count = 0
+
+ raise MySQLdb.IntegrityError(self.error_message)
+
+ self.id = self.cursor().insert_id()
+ try:
+ self.total_count += self.cursor().rowcount
+ except AttributeError:
+ self.total_count = self.cursor().rowcount
+
+ if self.cursor().rowcount == 0:
+ raise NoInsertionError
+
+def _test(*args, **keywds):
+ import doctest, sys
+ doctest.testmod(sys.modules[__name__], *args, **keywds)
+
+if __name__ == "__main__":
+ if __debug__:
+ _test()
--- /dev/null
+# Set up the IUPAC alphabet properties
+
+
+from Bio.PropertyManager import default_manager
+from Bio import Alphabet
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData
+
+from Bio import Transcribe, Translate
+
+set_prop = default_manager.class_property
+
+# weight tables
+set_prop[IUPAC.IUPACUnambiguousDNA]["weight_table"] = \
+ IUPACData.unambiguous_dna_weights
+set_prop[IUPAC.IUPACAmbiguousDNA]["weight_table"] = \
+ IUPACData.avg_ambiguous_dna_weights
+set_prop[IUPAC.IUPACUnambiguousRNA]["weight_table"] = \
+ IUPACData.unambiguous_rna_weights
+set_prop[IUPAC.IUPACAmbiguousRNA]["weight_table"] = \
+ IUPACData.avg_ambiguous_rna_weights
+set_prop[IUPAC.IUPACProtein]["weight_table"] = \
+ IUPACData.protein_weights
+set_prop[IUPAC.ExtendedIUPACProtein]["weight_table"] = \
+ IUPACData.avg_extended_protein_weights
+
+set_prop[IUPAC.IUPACUnambiguousDNA]["weight_range_table"] = \
+ IUPACData.unambiguous_dna_weight_ranges
+set_prop[IUPAC.IUPACAmbiguousDNA]["weight_range_table"] = \
+ IUPACData.ambiguous_dna_weight_ranges
+set_prop[IUPAC.IUPACUnambiguousRNA]["weight_range_table"] = \
+ IUPACData.unambiguous_rna_weight_ranges
+set_prop[IUPAC.IUPACAmbiguousRNA]["weight_range_table"] = \
+ IUPACData.ambiguous_rna_weight_ranges
+set_prop[IUPAC.IUPACProtein]["weight_range_table"] = \
+ IUPACData.protein_weight_ranges
+set_prop[IUPAC.ExtendedIUPACProtein]["weight_range_table"] = \
+ IUPACData.extended_protein_weight_ranges
+
+
+
+# transcriber objects
+
+set_prop[Alphabet.DNAAlphabet]["transcriber"] = \
+ Transcribe.generic_transcriber
+
+set_prop[IUPAC.IUPACAmbiguousDNA]["transcriber"] = \
+ Transcribe.ambiguous_transcriber
+
+set_prop[IUPAC.IUPACUnambiguousDNA]["transcriber"] = \
+ Transcribe.unambiguous_transcriber
+
+
+set_prop[Alphabet.RNAAlphabet]["transcriber"] = \
+ Transcribe.generic_transcriber
+
+set_prop[IUPAC.IUPACAmbiguousRNA]["transcriber"] = \
+ Transcribe.ambiguous_transcriber
+
+set_prop[IUPAC.IUPACUnambiguousRNA]["transcriber"] = \
+ Transcribe.unambiguous_transcriber
+
+
+# translator objects
+for name, obj in Translate.unambiguous_dna_by_name.items():
+ property = "translator.name." + name
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+
+for name, obj in Translate.unambiguous_rna_by_name.items():
+ property = "translator.name." + name
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "rna_translator.name." + name
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+
+
+for id, obj in Translate.unambiguous_dna_by_id.items():
+ property = "translator.id.%d" % id
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+ if id == 1:
+ set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj
+ set_prop[obj.table.protein_alphabet.__class__]["translator"] = obj
+
+
+for id, obj in Translate.unambiguous_rna_by_id.items():
+ property = "translator.id.%d" % id
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "rna_translator.id.%d" % id
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+ if id == 1:
+ set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj
+ set_prop[obj.table.protein_alphabet.__class__]["rna_translator"] = obj
+
+# ambiguous translator objects
+for name, obj in Translate.ambiguous_dna_by_name.items():
+ property = "translator.name." + name
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "ambiguous_translator.name." + name
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+
+for name, obj in Translate.ambiguous_rna_by_name.items():
+ property = "translator.name." + name
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "ambiguous_rna_translator.name." + name
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+
+
+for id, obj in Translate.ambiguous_dna_by_id.items():
+ property = "translator.id.%d" % id
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "ambiguous_translator.id.%d" % id
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+ if id == 1:
+ set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj
+ set_prop[obj.table.protein_alphabet.__class__]["ambiguous_translator"] = obj
+
+
+for id, obj in Translate.ambiguous_rna_by_id.items():
+ property = "translator.id.%d" % id
+ set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj
+ property = "ambiguous_rna_translator.id.%d" % id
+ set_prop[obj.table.protein_alphabet.__class__][property] = obj
+ if id == 1:
+ set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj
+ set_prop[obj.table.protein_alphabet.__class__]["ambiguous_rna_translator"] = obj
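+
+# Example (a sketch, assuming Bio.PropertyManager's resolve_class method):
+# the default "translator" property on unambiguous DNA resolves to the
+# standard table (id 1) registered above.
+#
+# >>> default_manager.resolve_class(IUPAC.IUPACUnambiguousDNA, "translator")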
--- /dev/null
+# This is a Python module.
+"""Properties for functionality such as transcription and translation.
+"""
--- /dev/null
+"""
+Code to deal with alignments written in Fasta format (OBSOLETE).
+
+This module is considered obsolete and likely to be deprecated. Please use
+Bio.AlignIO instead for reading and writing alignments in FASTA format.
+
+This mostly just uses the regular Fasta parsing stuff written by Jeff
+to deal with all of the input and output formats.
+
+functions:
+o parse_file()
+
+classes:
+FastaAlignment"""
+# standard library
+import os
+
+# biopython
+from Bio.Align.Generic import Alignment
+from Bio import Alphabet
+from Bio.Alphabet import IUPAC
+from Bio import Fasta
+
+def parse_file(file_name, type = 'DNA'):
+ """Parse the given file into a FastaAlignment object.
+
+ Arguments:
+ o file_name - The location of the file to parse.
+ o type - The type of information contained in the file.
+ """
+ if type.upper() == 'DNA':
+ alphabet = IUPAC.ambiguous_dna
+ elif type.upper() == 'RNA':
+ alphabet = IUPAC.ambiguous_rna
+ elif type.upper() == 'PROTEIN':
+ alphabet = IUPAC.protein
+ else:
+ raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
+ % type)
+
+ # create a new alignment object
+ fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))
+
+ # now parse the file and fill up the alignment object
+ align_file = open(file_name, 'r')
+
+ parser = Fasta.RecordParser()
+ iterator = Fasta.Iterator(align_file, parser)
+
+ cur_align = iterator.next()
+ while cur_align:
+ fasta_align.add_sequence(cur_align.title, cur_align.sequence)
+
+ cur_align = iterator.next()
+
+ return fasta_align
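+
+# Example usage (a sketch, assuming an aligned FASTA file "example.aln"):
+#
+# >>> fasta_align = parse_file("example.aln", type = 'DNA')
+# >>> print fasta_align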
+
+class FastaAlignment(Alignment):
+ """Work with the Fasta Alignment format.
+
+ The fasta alignment format is basically the same as the regular ol'
+ Fasta format we know and love, except the sequences have gaps
+ (represented by -'s).
+ """
+ def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
+ Alignment.__init__(self, alphabet)
+
+ def __str__(self):
+ """Print out a fasta version of the alignment info."""
+ return_string = ''
+ for item in self._records:
+ new_f_record = Fasta.Record()
+ new_f_record.title = item.description
+ new_f_record.sequence = item.seq.data
+
+ return_string = return_string + str(new_f_record) + os.linesep + os.linesep
+
+        # the string ends with extra newlines, so strip them off and add
+        # a single one back before returning
+ return return_string.rstrip() + os.linesep
+
--- /dev/null
+"""Utilities for working with FASTA-formatted sequences (OBSOLETE).
+
+Classes:
+Record Holds FASTA sequence data.
+Iterator Iterates over sequence data in a FASTA file.
+RecordParser Parses FASTA sequence data into a Record object.
+SequenceParser Parses FASTA sequence data into a SeqRecord object.
+
+For a long time this module was the most commonly used and best documented
+FASTA parser in Biopython. However, we now recommend using Bio.SeqIO instead.
+
+In view of this, while you can continue to use Bio.Fasta for the moment, it is
+considered to be a legacy module and should not be used if you are writing new
+code. At some point Bio.Fasta may be officially deprecated (with warning
+messages when used) before finally being removed.
+
+If you are already using Bio.Fasta with the SequenceParser to get SeqRecord
+objects, then you should be able to switch to the more recent Bio.SeqIO module
+very easily as that too uses SeqRecord objects. For example,
+
+from Bio import Fasta
+handle = open("example.fas")
+for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) :
+ print seq_record.description
+ print seq_record.seq
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+ print seq_record.description
+ print seq_record.seq
+handle.close()
+
+Converting an existing code which uses the RecordParser is a little more
+complicated as the Bio.Fasta.Record object differs from the SeqRecord.
+
+from Bio import Fasta
+handle = open("example.fas")
+for record in Fasta.Iterator(handle, Fasta.RecordParser()) :
+ #record is a Bio.Fasta.Record object
+ print record.title #The full title line as a string
+ print record.sequence #The sequence as a string
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+ print seq_record.description #The full title line as a string
+ print seq_record.seq.tostring() #The sequence as a string
+handle.close()
+
+
+
+"""
+from Bio import Seq
+from Bio import SeqRecord
+from Bio import Alphabet
+
+
+class Record:
+ """Holds information from a FASTA record.
+
+ Members:
+ title Title line ('>' character not included).
+ sequence The sequence.
+
+ """
+ def __init__(self, colwidth=60):
+ """__init__(self, colwidth=60)
+
+ Create a new Record. colwidth specifies the number of residues
+ to put on each line when generating FASTA format.
+
+ """
+ self.title = ''
+ self.sequence = ''
+ self._colwidth = colwidth
+
+ def __str__(self):
+ s = []
+ s.append('>%s' % self.title)
+ i = 0
+ while i < len(self.sequence):
+ s.append(self.sequence[i:i+self._colwidth])
+ i = i + self._colwidth
+ #Was having a problem getting the tests to pass on windows...
+ #return os.linesep.join(s)
+ return "\n".join(s)
+
+class Iterator:
+ """Returns one record at a time from a FASTA file.
+ """
+ def __init__(self, handle, parser = None, debug = 0):
+ """Initialize a new iterator.
+ """
+ self.handle = handle
+ self._parser = parser
+ self._debug = debug
+
+ #Skip any text before the first record (e.g. blank lines)
+ while True :
+ line = handle.readline()
+ if not line or line[0] == ">" :
+ break
+ if debug : print "Skipping: " + line
+ self._lookahead = line
+
+ def __iter__(self):
+ return iter(self.next, None)
+
+ def next(self):
+ """Return the next record in the file"""
+ line = self._lookahead
+ if not line:
+ return None
+ assert line[0]==">", line
+ lines = [line.rstrip()]
+ line = self.handle.readline()
+ while line:
+ if line[0] == ">": break
+ if line[0] == "#" :
+ if self._debug : print "Ignoring comment line"
+ pass
+ else :
+ lines.append(line.rstrip())
+ line = self.handle.readline()
+ self._lookahead = line
+        if self._debug : print "Debug: '%s' and '%s'" % (lines[0], "".join(lines))
+ if self._parser is None:
+ return "\n".join(lines)
+ else :
+ return self._parser.parse_string("\n".join(lines))
+
+class RecordParser:
+ """Parses FASTA sequence data into a Fasta.Record object.
+ """
+ def __init__(self, debug = 0):
+ pass
+
+ def parse_string(self, text) :
+ text = text.replace("\r\n","\n") #Crude way of dealing with \r\n
+ assert text[0] == ">", text
+ text = text.split("\n>",1)[0] # Only do the first record if more than one
+ title, sequence = text.split("\n", 1)
+ title = title[1:]
+ rec = Record()
+ rec.title = title
+ rec.sequence = sequence.replace("\n","")
+ return rec
+
+ def parse(self, handle):
+ return self.parse_string(handle.read())
+
+class SequenceParser:
+ """Parses FASTA sequence data into a SeqRecord object.
+ """
+ def __init__(self, alphabet = Alphabet.generic_alphabet, title2ids = None,
+ debug = 0):
+ """Initialize a Scanner and Sequence Consumer.
+
+ Arguments:
+ o alphabet - The alphabet of the sequences to be parsed. If not
+ passed, this will be set as generic_alphabet.
+ o title2ids - A function that, when given the title of the FASTA
+ file (without the beginning >), will return the id, name and
+ description (in that order) for the record. If this is not given,
+ then the entire title line will be used as the description.
+ """
+ self.alphabet = alphabet
+ self.title2ids = title2ids
+
+ def parse_string(self, text) :
+ text = text.replace("\r\n","\n") #Crude way of dealing with \r\n
+ assert text[0] == ">", text
+ text = text.split("\n>",1)[0] # Only do the first record if more than one
+ title, sequence = text.split("\n", 1)
+ title = title[1:]
+
+ seq = Seq.Seq(sequence.replace("\n",""), self.alphabet)
+ rec = SeqRecord.SeqRecord(seq)
+
+ if self.title2ids:
+ seq_id, name, descr = self.title2ids(title)
+ rec.id = seq_id
+ rec.name = name
+ rec.description = descr
+ else:
+ rec.description = title
+
+ return rec
+
+ def parse(self, handle):
+ return self.parse_string(handle.read())
--- /dev/null
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code for more fancy file handles.
+
+
+Classes:
+UndoHandle File object decorator with support for undo-like operations.
+StringHandle Wraps a file object around a string.
+SGMLHandle File object that automatically strips SGML tags from data.
+
+SGMLStripper Object that strips SGML. This is now considered OBSOLETE, and
+ is likely to be deprecated in a future release of Biopython,
+ and later removed.
+
+"""
+import os
+import StringIO
+import sgmllib
+
+class UndoHandle:
+ """A Python handle that adds functionality for saving lines.
+
+ Saves lines in a LIFO fashion.
+
+ Added methods:
+ saveline Save a line to be returned next time.
+ peekline Peek at the next line without consuming it.
+
+ """
+ def __init__(self, handle):
+ self._handle = handle
+ self._saved = []
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ next = self.readline()
+ if not next:
+ raise StopIteration
+ return next
+
+ def readlines(self, *args, **keywds):
+ lines = self._saved + self._handle.readlines(*args,**keywds)
+ self._saved = []
+ return lines
+
+ def readline(self, *args, **keywds):
+ if self._saved:
+ line = self._saved.pop(0)
+ else:
+ line = self._handle.readline(*args,**keywds)
+ return line
+
+ def read(self, size=-1):
+ if size == -1:
+ saved = "".join(self._saved)
+ self._saved[:] = []
+ else:
+ saved = ''
+ while size > 0 and self._saved:
+ if len(self._saved[0]) <= size:
+ size = size - len(self._saved[0])
+ saved = saved + self._saved.pop(0)
+ else:
+ saved = saved + self._saved[0][:size]
+ self._saved[0] = self._saved[0][size:]
+ size = 0
+ return saved + self._handle.read(size)
+
+ def saveline(self, line):
+ if line:
+ self._saved = [line] + self._saved
+
+ def peekline(self):
+ if self._saved:
+ line = self._saved[0]
+ else:
+ line = self._handle.readline()
+ self.saveline(line)
+ return line
+
+ def tell(self):
+ lengths = map(len, self._saved)
+ sum = reduce(lambda x, y: x+y, lengths, 0)
+ return self._handle.tell() - sum
+
+ def seek(self, *args):
+ self._saved = []
+ self._handle.seek(*args)
+
+ def __getattr__(self, attr):
+ return getattr(self._handle, attr)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ self._handle.close()
+
+
+# I could make this faster by using cStringIO.
+# However, cStringIO (in v1.52) does not implement the
+# readlines method.
+StringHandle = StringIO.StringIO
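+
+# Example (a sketch): peekline() uses saveline() to push the line back,
+# so the following readline() still returns it:
+#
+# >>> h = UndoHandle(StringHandle(">seq1\nACGT\n"))
+# >>> h.peekline()
+# '>seq1\n'
+# >>> h.readline()
+# '>seq1\n'
+# >>> h.readline()
+# 'ACGT\n'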
+
+
+
+class SGMLHandle:
+ """A Python handle that automatically strips SGML tags from data (OBSOLETE).
+
+ This module is now considered to be obsolete, and is likely to be
+ deprecated in a future release of Biopython, and later removed.
+ """
+ def __init__(self, handle):
+ """SGMLStripper(handle)
+
+ handle is a file handle to SGML-formatted data.
+
+ """
+ self._handle = handle
+ self._stripper = SGMLStripper()
+
+ def read(self, *args, **keywds):
+ data = self._handle.read(*args, **keywds)
+ return self._stripper.strip(data)
+
+ def readline(self, *args, **keywds):
+ line = self._handle.readline(*args, **keywds)
+ return self._stripper.strip(line)
+
+ def readlines(self, *args, **keywds):
+ lines = self._handle.readlines(*args, **keywds)
+ for i in range(len(lines)):
+            lines[i] = self._stripper.strip(lines[i])
+ return lines
+
+ def __getattr__(self, attr):
+ return getattr(self._handle, attr)
+
+
+class SGMLStripper:
+ class MyParser(sgmllib.SGMLParser):
+ def __init__(self):
+ sgmllib.SGMLParser.__init__(self)
+ self.data = ''
+ def handle_data(self, data):
+ self.data = self.data + data
+
+ def __init__(self):
+ self._parser = SGMLStripper.MyParser()
+
+ def strip(self, str):
+ """S.strip(str) -> string
+
+ Strip the SGML tags from str.
+
+ """
+ if not str: # empty string, don't do anything.
+ return ''
+ # I need to make sure that I don't return an empty string if
+ # the buffer is not empty. This can happen if there's a newline
+ # character embedded within a tag. Thus, I'll first check to
+ # see if the last character is a newline. If it is, and it's stripped
+ # away, I'll add it back.
+ is_newline = str[-1] in ['\n', '\r']
+
+ self._parser.data = '' # clear the parser's data (don't reset)
+ self._parser.feed(str)
+ if self._parser.data:
+ str = self._parser.data
+ elif is_newline:
+ str = '\n'
+ else:
+ str = ''
+ return str
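+
+# Example (a sketch; sgmllib reports the text between the tags via
+# handle_data):
+#
+# >>> SGMLStripper().strip("<b>Results</b> found\n")
+# 'Results found\n'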
+
--- /dev/null
+# Copyright 2001 by Katharine Lindner. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code for more fancy file handles (OBSOLETE).
+
+Classes:
+FilteredReader is a decorator for File that allows the user to filter the
+output on a line by line basis.
+
+The FilteredReader module reads a file and applies a sequence of filters
+to the input. The constructor sets a default filter chain, but the user
+can select another filter chain by setting Bio.FilteredReader.filter_chain.
+
+handle = open( "filename" )
+filtered_reader = Bio.FilteredReader( handle )
+filtered_reader.filter_chain = [ remove_asterisks, replace_dot_with_dash ]
+filtered_reader.read()
+
+All filters in the chain must provide the same interface with a line of
+text as the single input parameter and altered text as the return value.
+
+This module is now considered to be obsolete, and is likely to be deprecated
+in a future release of Biopython, and later removed.
+"""
+
+
+def dump_saved( name, text, j ):
+    """Used for debugging."""
+    dump_file = open( name + '%d' % j, "w" )
+    k = 0
+    for i in range ( 0, len( text ), 80 ):
+        dump_file.write( '%s\n' % text[ i : i + 80 ] )
+    dump_file.close()
+
+def remove_leading_whitespace( line ):
+    return line.lstrip()
+
+
+def remove_empty_line( line ):
+    stripped_line = line.strip()
+    if( stripped_line ):
+        return line[ : ]
+    else:
+        return ''
+
+def remove_useless_dot( line ):
+    before = line
+    while( 1 ):
+        after = before.replace( "\t.\t", "\t\t" )
+        if( len( before ) == len( after ) ):
+            break
+        before = after
+    if( after.endswith( '.' ) ):
+        after = after[ :-1 ]
+    return after
+
+def fix_punctuation( line ):
+    line = line.replace( "'", '' )
+    line = line.replace( '"', '' )
+    line = line.replace( ';', '\t' )
+    line = line.replace( 'entryname', 'id' )
+#    line = line.lower( )
+    if( line ):
+        return line[ : ]
+    else:
+        return ''
+
+
+class FilteredReader:
+    def __init__(self, handle ):
+        self._handle = handle
+        self._start_line = ''
+        self._debug_count = 0
+        self.filter_chain = [ remove_empty_line, remove_useless_dot, fix_punctuation ]
+
+    def __getattr__(self, attr):
+        return getattr(self._handle, attr)
+
+
+
+    def close(self, *args, **keywds ):
+        return self._handle.close( *args, **keywds)
+
+    def read( self, *args, **keywds ):
+        len_expected = self._get_len_expected( args, keywds )
+        if( len_expected ):
+            filtered_text = self.read_block( len_expected )
+        else:
+            filtered_text = self.read_to_end()
+        return filtered_text
+
+    def read_block( self, len_expected ):
+
+        len_filtered = 0
+        len_adjusted = len_expected - len( self._start_line )
+        filtered_text = ''
+        while( len_filtered < len_expected ):
+
+            text_read = self._handle.read( len_adjusted )
+            full_text = self._start_line + text_read
+            lines = full_text.splitlines( 1 )
+            if( text_read == '' ):
+                # at end of file, filter whatever is left, including the
+                # saved partial line
+                filtered_text = filtered_text + self.filter( lines )
+                self._start_line = ''
+                break
+            else:
+                all_but_last_line = lines[ :-1 ]
+                self._start_line = lines[ -1 ]
+                filtered_text = filtered_text + self.filter( all_but_last_line )
+            len_filtered = len( filtered_text )
+            len_adjusted = len_adjusted - len_filtered
+        return filtered_text[ : ]
+
+    def read_to_end( self ):
+        filtered_text = ''
+        text_read = self._handle.read()
+        full_text = self._start_line + text_read
+        lines = full_text.splitlines( 1 )
+        filtered_text += self.filter( lines[:] )
+        return filtered_text[ : ]
+
+    def _get_len_expected( self, args, keywds ):
+
+        if( len( args) > 0 ):
+            len_expected = args[ 0 ]
+            if( len_expected < 0 ):
+                len_expected = None
+        elif 'size' in keywds:
+            len_expected = keywds['size']
+        else:
+            len_expected = None
+        return len_expected
+
+    def filter( self, lines ):
+        filter_chain = self.filter_chain
+        filtered_text = ''
+        for line in lines:
+            for filter in filter_chain:
+                line = filter( *( line, ) )
+            filtered_text += line
+
+        return filtered_text
+
+def has_trailing_linefeed( line ):
+    if( line.endswith( chr( 13 ) ) or \
+        line.endswith( chr( 10 ) ) ):
+        return 1
+    else:
+        return 0
--- /dev/null
+# Copyright 2002 by Katharine Lindner. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""handles true random numbers supplied from the the web server of fourmilab. Based on atmospheric noise. The motivation is to support biosimulations that rely on random numbers.
+"""
+
+import urllib
+
+
+def hex_convert(text):
+ import warnings
+ warnings.warn("The function Bio.HotRand.hex_convert is deprecated. Instead of Bio.HotRand.hex_convert(text), please use int(text, 16) instead", DeprecationWarning)
+ return int(text, 16)
+
+def byte_concat( text ):
+ val = 0
+ numbytes = len( text )
+ for i in range( 0, numbytes ):
+ val = val * 256
+ val = val + ord( text[ i ] )
+
+ return val
+
+class HotCache:
+
+ def __init__( self ):
+# self.url = 'http://www.fourmilab.ch/cgi-bin/uncgi/Hotbits?num=5000&min=1&max=6&col=1'
+ self.url = 'http://www.random.org/cgi-bin/randbyte?'
+ self.query = { 'nbytes' : 128, 'fmt' : 'h' }
+ self.fill_hot_cache()
+
+ def fill_hot_cache( self ):
+ url = self.url + urllib.urlencode( self.query )
+ fh = urllib.urlopen( url )
+ self.hot_cache = fh.read()
+ fh.close()
+
+ def next_num( self, num_digits = 4 ):
+ cache = self.hot_cache
+ numbytes = num_digits / 2
+ if( len( cache ) % numbytes != 0 ):
+ print 'len_cache is %d' % len( cache )
+ raise ValueError
+ if( cache == '' ):
+ self.fill_hot_cache()
+ cache = self.hot_cache
+ hexdigits = cache[ :numbytes ]
+ self.hot_cache = cache[ numbytes: ]
+ return byte_concat( hexdigits )
+
+
+
+class HotRandom:
+
+ def __init__( self ):
+ self.hot_cache = HotCache( )
+
+ def hot_rand( self, high, low = 0 ):
+ span = high - low
+ val = self.hot_cache.next_num()
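+        # next_num() builds a 16-bit value from two bytes of the cache;
+        # multiplying by span and dropping the low 16 bits scales it
+        # into the range [low, high).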
+ val = ( span * val ) >> 16
+ val = val + low
+ return val
+
+
+if( __name__ == '__main__' ):
+ hot_random = HotRandom()
+ for j in range ( 0, 130 ):
+ print hot_random.hot_rand( 25 )
+ nums = [ '0000', 'abcd', '1234', '5555', '4321', 'aaaa', 'ffff' ]
+ for num in nums:
+ print hex_convert( num )
+
+
+
--- /dev/null
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Index.py
+
+This module provides a way to create indexes to text files.
+
+Classes:
+Index Dictionary-like class used to store index information.
+
+_ShelveIndex An Index class based on the shelve module.
+_InMemoryIndex An in-memory Index class.
+
+"""
+import os
+import array
+import cPickle
+import shelve
+
+class _ShelveIndex(dict):
+ """An index file wrapped around shelve.
+
+ """
+ # Without a good dbm module installed, this is pretty slow and
+ # generates large files. When generating an index on a FASTA-
+ # formatted file with 82000 sequences (37Mb), the
+ # index 'dat' file is 42Mb and 'dir' file is 8Mb.
+
+ __version = 2
+ __version_key = '__version'
+
+ def __init__(self, indexname, truncate=None):
+ dict.__init__(self)
+ try:
+ if truncate:
+ # In python 1.52 and before, dumbdbm (under shelve)
+ # doesn't clear the old database.
+ files = [indexname + '.dir',
+ indexname + '.dat',
+ indexname + '.bak'
+ ]
+ for file in files:
+ if os.path.exists(file):
+ os.unlink(file)
+ raise Exception("open a new shelf")
+ self.data = shelve.open(indexname, flag='r')
+ except:
+ # No database exists.
+ self.data = shelve.open(indexname, flag='n')
+ self.data[self.__version_key] = self.__version
+ else:
+ # Check to make sure the database is the correct version.
+ version = self.data.get(self.__version_key, None)
+ if version is None:
+ raise IOError("Unrecognized index format")
+ elif version != self.__version:
+ raise IOError("Version %s doesn't match my version %s" \
+ % (version, self.__version))
+
+ def __del__(self):
+ if self.__dict__.has_key('data'):
+ self.data.close()
+
+class _InMemoryIndex(dict):
+ """This creates an in-memory index file.
+
+ """
+ # File Format:
+ # version
+ # key value
+ # [...]
+
+ __version = 3
+ __version_key = '__version'
+
+ def __init__(self, indexname, truncate=None):
+ self._indexname = indexname
+ dict.__init__(self)
+ self.__changed = 0 # the index hasn't changed
+
+ # Remove the database if truncate is true.
+ if truncate and os.path.exists(indexname):
+ os.unlink(indexname)
+ self.__changed = 1
+
+ # Load the database if it exists
+ if os.path.exists(indexname):
+ handle = open(indexname)
+ version = self._toobj(handle.readline().rstrip())
+ if version != self.__version:
+ raise IOError("Version %s doesn't match my version %s" \
+ % (version, self.__version))
+ for line in handle:
+ key, value = line.split()
+ key, value = self._toobj(key), self._toobj(value)
+ self[key] = value
+ self.__changed = 0
+
+    def update(self, d):
+        self.__changed = 1
+        dict.update(self, d)
+ def __setitem__(self, key, value):
+ self.__changed = 1
+ dict.__setitem__(self, key, value)
+ def __delitem__(self, key):
+ self.__changed = 1
+ dict.__delitem__(self, key)
+ def clear(self):
+ self.__changed = 1
+ dict.clear(self)
+
+ def __del__(self):
+ if self.__changed:
+ handle = open(self._indexname, 'w')
+ handle.write("%s\n" % self._tostr(self.__version))
+ for key, value in self.items():
+ handle.write("%s %s\n" %
+ (self._tostr(key), self._tostr(value)))
+ handle.close()
+
+ def _tostr(self, obj):
+ # I need a representation of the object that's saveable to
+ # a file that uses whitespace as delimiters. Thus, I'm
+ # going to pickle the object, and then convert each character of
+ # the string to its ASCII integer value. Then, I'm going to convert
+ # the integers into strings and join them together with commas.
+ # It's not the most efficient way of storing things, but it's
+ # relatively fast.
+ s = cPickle.dumps(obj)
+ intlist = array.array('b', s)
+ strlist = map(str, intlist)
+ return ','.join(strlist)
+
+ def _toobj(self, str):
+ intlist = map(int, str.split(','))
+ intlist = array.array('b', intlist)
+ strlist = map(chr, intlist)
+ return cPickle.loads(''.join(strlist))
+
+Index = _InMemoryIndex
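+
+# Example usage (a sketch; the index is written back to disk when the
+# object is garbage collected):
+#
+# >>> index = Index("my.index", truncate=1)   # hypothetical filename
+# >>> index["sequence47"] = 10283             # e.g. a file offset
+# >>> index["sequence47"]
+# 10283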
--- /dev/null
+#!/usr/bin/env python
+
+"""
+This module provides code for doing logistic regressions.
+
+
+Classes:
+LogisticRegression Holds information for a LogisticRegression classifier.
+
+
+Functions:
+train Train a new classifier.
+calculate Calculate the probabilities of each class, given an observation.
+classify Classify an observation into a class.
+"""
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+#from numpy import *
+#from numpy.linalg import *
+import numpy
+import numpy.linalg
+
+class LogisticRegression:
+ """Holds information necessary to do logistic regression
+ classification.
+
+ Members:
+ beta List of the weights for each dimension.
+
+ """
+ def __init__(self):
+ """LogisticRegression()"""
+ self.beta = []
+
+def train(xs, ys, update_fn=None, typecode=None):
+ """train(xs, ys[, update_fn]) -> LogisticRegression
+
+ Train a logistic regression classifier on a training set. xs is a
+ list of observations and ys is a list of the class assignments,
+ which should be 0 or 1. xs and ys should contain the same number
+ of elements. update_fn is an optional callback function that
+    takes as parameters the iteration number and log likelihood.
+
+ """
+ if len(xs) != len(ys):
+ raise ValueError("xs and ys should be the same length.")
+ classes = set(ys)
+ if classes != set([0, 1]):
+ raise ValueError("Classes should be 0's and 1's")
+ if typecode is None:
+ typecode = 'd'
+
+ # Dimensionality of the data is the dimensionality of the
+ # observations plus a constant dimension.
+ N, ndims = len(xs), len(xs[0]) + 1
+ if N==0 or ndims==1:
+ raise ValueError("No observations or observation of 0 dimension.")
+
+ # Make an X array, with a constant first dimension.
+ X = numpy.ones((N, ndims), typecode)
+ X[:, 1:] = xs
+ Xt = numpy.transpose(X)
+ y = numpy.asarray(ys, typecode)
+
+ # Initialize the beta parameter to 0.
+ beta = numpy.zeros(ndims, typecode)
+
+ MAX_ITERATIONS = 500
+ CONVERGE_THRESHOLD = 0.01
+ stepsize = 1.0
+ # Now iterate using Newton-Raphson until the log-likelihoods
+ # converge.
+ iter = 0
+ old_beta = old_llik = None
+ while iter < MAX_ITERATIONS:
+ # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X))
+ ebetaX = numpy.exp(numpy.dot(beta, Xt))
+ p = ebetaX / (1+ebetaX)
+
+ # Find the log likelihood score and see if I've converged.
+ logp = y*numpy.log(p) + (1-y)*numpy.log(1-p)
+ llik = sum(logp)
+ if update_fn is not None:
+ update_fn(iter, llik)
+ # Check to see if the likelihood decreased. If it did, then
+        # restore the old beta parameters and halve the step size.
+ if llik < old_llik:
+ stepsize = stepsize / 2.0
+ beta = old_beta
+ # If I've converged, then stop.
+ if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
+ break
+ old_llik, old_beta = llik, beta
+ iter += 1
+
+ W = numpy.identity(N) * p
+ Xtyp = numpy.dot(Xt, y-p) # Calculate the first derivative.
+ XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative.
+ #u, s, vt = singular_value_decomposition(XtWX)
+ #print "U", u
+ #print "S", s
+ delta = numpy.linalg.solve(XtWX, Xtyp)
+ if numpy.fabs(stepsize-1.0) > 0.001:
+ delta = delta * stepsize
+ beta = beta + delta # Update beta.
+ else:
+ raise RuntimeError("Didn't converge.")
+
+ lr = LogisticRegression()
+ lr.beta = map(float, beta) # Convert back to regular array.
+ return lr
+
+def calculate(lr, x):
+ """calculate(lr, x) -> list of probabilities
+
+ Calculate the probability for each class. lr is a
+ LogisticRegression object. x is the observed data. Returns a
+ list of the probability that it fits each class.
+
+ """
+ # Insert a constant term for x.
+ x = numpy.asarray([1.0] + x)
+ # Calculate the probability. p = e^(beta X) / (1+e^(beta X))
+ ebetaX = numpy.exp(numpy.dot(lr.beta, x))
+ p = ebetaX / (1+ebetaX)
+ return [1-p, p]
+
+def classify(lr, x):
+ """classify(lr, x) -> 1 or 0
+
+ Classify an observation into a class.
+
+ """
+ probs = calculate(lr, x)
+ if probs[0] > probs[1]:
+ return 0
+ return 1
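+
+# Example usage (a sketch with toy, linearly separated data):
+#
+# >>> xs = [[0.0], [1.0], [2.0], [3.0]]
+# >>> ys = [0, 0, 1, 1]
+# >>> lr = train(xs, ys)
+# >>> classify(lr, [0.5])
+# 0
+# >>> classify(lr, [2.5])
+# 1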
--- /dev/null
+# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code to support writing parsers.
+
+
+
+Classes:
+AbstractParser Base class for parsers.
+AbstractConsumer Base class of all Consumers.
+TaggingConsumer Consumer that tags output with its event. For debugging
+SGMLStrippingConsumer Consumer that strips SGML tags from output.
+EventGenerator Generate Biopython Events from Martel XML output
+ (note that Martel is now DEPRECATED)
+
+Functions:
+safe_readline Read a line from a handle, with check for EOF.
+safe_peekline Peek at next line, with check for EOF.
+read_and_call Read a line from a handle and pass it to a method.
+read_and_call_while Read many lines, as long as a condition is met.
+read_and_call_until Read many lines, until a condition is met.
+attempt_read_and_call Like read_and_call, but forgiving of errors.
+is_blank_line Test whether a line is blank.
+
+"""
+
+import sys
+import traceback
+from types import *
+
+from Bio import File
+
+# XML from python 2.0
+try:
+ from xml.sax import handler
+ xml_support = 1
+except ImportError:
+ sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" +
+ "This causes problems with some ParserSupport modules\n")
+ xml_support = 0
+
+class AbstractParser:
+ """Base class for other parsers.
+
+ """
+ def parse(self, handle):
+ raise NotImplementedError("Please implement in a derived class")
+
+ def parse_str(self, string):
+ return self.parse(File.StringHandle(string))
+
+ def parse_file(self, filename):
+ h = open(filename)
+ try:
+ retval = self.parse(h)
+ finally:
+ h.close()
+ return retval
+
+class AbstractConsumer:
+ """Base class for other Consumers.
+
+ Derive Consumers from this class and implement appropriate
+ methods for each event that you want to receive.
+
+ """
+ def _unhandled_section(self):
+ pass
+ def _unhandled(self, data):
+ pass
+ def __getattr__(self, attr):
+ if attr[:6] == 'start_' or attr[:4] == 'end_':
+ method = self._unhandled_section
+ else:
+ method = self._unhandled
+ return method
+
+class TaggingConsumer(AbstractConsumer):
+ """A Consumer that tags the data stream with the event and
+ prints it to a handle. Useful for debugging.
+
+ """
+ def __init__(self, handle=None, colwidth=15, maxwidth=80):
+ """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)"""
+ # I can't assign sys.stdout to handle in the argument list.
+ # If I do that, handle will be assigned the value of sys.stdout
+ # the first time this function is called. This will fail if
+ # the user has assigned sys.stdout to some other file, which may
+ # be closed or invalid at a later time.
+ if handle is None:
+ handle = sys.stdout
+ self._handle = handle
+ self._colwidth = colwidth
+ self._maxwidth = maxwidth
+
+ def unhandled_section(self):
+ self._print_name('unhandled_section')
+
+ def unhandled(self, data):
+ self._print_name('unhandled', data)
+
+ def _print_name(self, name, data=None):
+ if data is None:
+ # Write the name of a section.
+ self._handle.write("%s %s\n" % ("*"*self._colwidth, name))
+ else:
+ # Write the tag and line.
+ self._handle.write("%-*s: %s\n" % (
+ self._colwidth, name[:self._colwidth],
+ data[:self._maxwidth-self._colwidth-2].rstrip()))
+
+ def __getattr__(self, attr):
+ if attr[:6] == 'start_' or attr[:4] == 'end_':
+ method = lambda a=attr, s=self: s._print_name(a)
+ else:
+ method = lambda x, a=attr, s=self: s._print_name(a, x)
+ return method
+
+class SGMLStrippingConsumer:
+ """A consumer that strips off SGML tags.
+
+ This is meant to be used as a decorator for other consumers.
+
+ """
+ def __init__(self, consumer):
+ if type(consumer) is not InstanceType:
+ raise ValueError("consumer should be an instance")
+ self._consumer = consumer
+ self._prev_attr = None
+ self._stripper = File.SGMLStripper()
+
+ def _apply_clean_data(self, data):
+ clean = self._stripper.strip(data)
+ self._prev_attr(clean)
+
+ def __getattr__(self, name):
+ if name in ['_prev_attr', '_stripper']:
+            # these are set in __init__; raising here avoids the infinite
+            # recursion that getattr(self, name) would cause
+            raise AttributeError(name)
+ attr = getattr(self._consumer, name)
+ # If this is not a method, then return it as is.
+ if type(attr) is not MethodType:
+ return attr
+ # If it's a section method, then return it.
+ if name[:6] == 'start_' or name[:4] == 'end_':
+ return attr
+ # Otherwise, it's an info event, and return my method.
+ self._prev_attr = attr
+ return self._apply_clean_data
+
+# only use the Event Generator if XML handling is okay
+if xml_support:
+ class EventGenerator(handler.ContentHandler):
+ """Handler to generate events associated with a Martel parsed file.
+
+ This acts like a normal SAX handler, and accepts XML generated by
+ Martel during parsing. These events are then converted into
+ 'Biopython events', which can then be caught by a standard
+ biopython consumer.
+
+ Note that Martel is now DEPRECATED.
+ """
+ def __init__(self, consumer, interest_tags, callback_finalizer = None,
+ exempt_tags = []):
+ """Initialize to begin catching and firing off events.
+
+ Arguments:
+ o consumer - The consumer that we'll send Biopython events to.
+
+ o interest_tags - A listing of all the tags we are interested in.
+
+ o callback_finalizer - A function to deal with the collected
+ information before passing it on to the consumer. By default
+ the collected information is a list of all of the lines read
+ for a particular tag -- if there are multiple tags in a row
+ like:
+
+             <some_info>Spam</some_info>
+             <some_info>More Spam</some_info>
+
+ In this case the list of information would be:
+
+ ['Spam', 'More Spam']
+
+ This list of lines will be passed to the callback finalizer if
+ it is present. Otherwise the consumer will be called with the
+ list of content information.
+
+ o exempt_tags - A listing of particular tags that are exempt from
+ being processed by the callback_finalizer. This allows you to
+ use a finalizer to deal with most tags, but leave those you don't
+ want touched.
+ """
+ self._consumer = consumer
+ self.interest_tags = interest_tags
+ self._finalizer = callback_finalizer
+ self._exempt_tags = exempt_tags
+
+ # a dictionary of content for each tag of interest
+ # the information for each tag is held as a list of the lines.
+ # This allows us to collect information from multiple tags
+ # in a row, and return it all at once.
+ self.info = {}
+ for tag in self.interest_tags:
+ self.info[tag] = []
+
+ # the previous tag we were collecting information for.
+ # We set a delay in sending info to the consumer so that we can
+ # collect a bunch of tags in a row and append all of the info
+ # together.
+ self._previous_tag = ''
+
+ # the current character information for a tag
+ self._cur_content = []
+ # whether we should be collecting information
+ self._collect_characters = 0
+
+ def startElement(self, name, attrs):
+ """Determine if we should collect characters from this tag.
+ """
+ if name in self.interest_tags:
+ self._collect_characters = 1
+
+ def characters(self, content):
+ """Extract the information if we are interested in it.
+ """
+ if self._collect_characters:
+ self._cur_content.append(content)
+
+ def endElement(self, name):
+ """Send the information to the consumer.
+
+ Once we've got the end element we've collected up all of the
+ character information we need, and we need to send this on to
+ the consumer to do something with it.
+
+ We have a delay of one tag on doing this, so that we can collect
+ all of the info from multiple calls to the same element at once.
+ """
+ # only deal with the tag if it is something we are
+ # interested in and potentially have information for
+ if self._collect_characters:
+ # add all of the information collected inside this tag
+ self.info[name].append("".join(self._cur_content))
+ # reset our information and flags
+ self._cur_content = []
+ self._collect_characters = 0
+
+ # if we are at a new tag, pass on the info from the last tag
+ if self._previous_tag and self._previous_tag != name:
+ self._make_callback(self._previous_tag)
+
+ # set this tag as the next to be passed
+ self._previous_tag = name
+
+ def _make_callback(self, name):
+ """Call the callback function with the info with the given name.
+ """
+ # strip off whitespace and call the consumer
+ callback_function = getattr(self._consumer, name)
+
+ # --- pass back the information
+ # if there is a finalizer, use that
+ if self._finalizer is not None and name not in self._exempt_tags:
+ info_to_pass = self._finalizer(self.info[name])
+ # otherwise pass back the entire list of information
+ else:
+ info_to_pass = self.info[name]
+
+ callback_function(info_to_pass)
+
+ # reset the information for the tag
+ self.info[name] = []
+
+ def endDocument(self):
+ """Make sure all of our information has been passed.
+
+ This just flushes out any stored tags that need to be passed.
+ """
+ if self._previous_tag:
+ self._make_callback(self._previous_tag)
+
+def read_and_call(uhandle, method, **keywds):
+ """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])
+
+ Read a line from uhandle, check it, and pass it to the method.
+ Raises a ValueError if the line does not pass the checks.
+
+ start, end, contains, blank, and has_re specify optional conditions
+ that the line must pass. start and end specify what the line must
+ begin or end with (not counting EOL characters). contains
+ specifies a substring that must be found in the line. If blank
+ is a true value, then the line must be blank. has_re should be
+ a regular expression object with a pattern that the line must match
+ somewhere.
+
+ """
+ line = safe_readline(uhandle)
+ errmsg = _fails_conditions(line, **keywds)
+ if errmsg is not None:
+ raise ValueError(errmsg)
+ method(line)
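+
+# Illustrative sketch (hypothetical data): these helpers expect an
+# UndoHandle (see Bio.File) so that read_and_call_while/until and
+# attempt_read_and_call can push an unconsumed line back with saveline:
+#
+#     from StringIO import StringIO
+#     from Bio.File import UndoHandle
+#     uhandle = UndoHandle(StringIO("ID abc123\nSQ acgt\n"))
+#     lines = []
+#     read_and_call(uhandle, lines.append, start="ID")
+#     read_and_call(uhandle, lines.append, start="SQ")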
+
+def read_and_call_while(uhandle, method, **keywds):
+ """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines
+
+ Read a line from uhandle and pass it to the method as long as
+ some condition is true. Returns the number of lines that were read.
+
+ See the docstring for read_and_call for a description of the parameters.
+
+ """
+ nlines = 0
+ while 1:
+ line = safe_readline(uhandle)
+ # If I've failed the condition, then stop reading the line.
+ if _fails_conditions(line, **keywds):
+ uhandle.saveline(line)
+ break
+ method(line)
+ nlines = nlines + 1
+ return nlines
+
+def read_and_call_until(uhandle, method, **keywds):
+ """read_and_call_until(uhandle, method,
+ start=None, end=None, contains=None, blank=None) -> number of lines
+
+ Read a line from uhandle and pass it to the method until
+ some condition is true. Returns the number of lines that were read.
+
+ See the docstring for read_and_call for a description of the parameters.
+
+ """
+ nlines = 0
+ while 1:
+ line = safe_readline(uhandle)
+ # If I've met the condition, then stop reading the line.
+ if not _fails_conditions(line, **keywds):
+ uhandle.saveline(line)
+ break
+ method(line)
+ nlines = nlines + 1
+ return nlines
+
+def attempt_read_and_call(uhandle, method, **keywds):
+ """attempt_read_and_call(uhandle, method, **keywds) -> boolean
+
+ Similar to read_and_call, but returns a boolean specifying
+ whether the line has passed the checks. Does not raise
+ exceptions.
+
+ See docs for read_and_call for a description of the function
+ arguments.
+
+ """
+ line = safe_readline(uhandle)
+ passed = not _fails_conditions(line, **keywds)
+ if passed:
+ method(line)
+ else:
+ uhandle.saveline(line)
+ return passed
+
+def _fails_conditions(line, start=None, end=None, contains=None, blank=None,
+ has_re=None):
+ if start is not None:
+ if line[:len(start)] != start:
+ return "Line does not start with '%s':\n%s" % (start, line)
+ if end is not None:
+ if line.rstrip()[-len(end):] != end:
+ return "Line does not end with '%s':\n%s" % (end, line)
+ if contains is not None:
+ if line.find(contains) == -1:
+ return "Line does not contain '%s':\n%s" % (contains, line)
+ if blank is not None:
+ if blank:
+ if not is_blank_line(line):
+ return "Expected blank line, but got:\n%s" % line
+ else:
+ if is_blank_line(line):
+ return "Expected non-blank line, but got a blank one"
+ if has_re is not None:
+ if has_re.search(line) is None:
+ return "Line does not match regex '%s':\n%s" % (
+ has_re.pattern, line)
+ return None
+
+def is_blank_line(line, allow_spaces=0):
+ """is_blank_line(line, allow_spaces=0) -> boolean
+
+ Return whether a line is blank. allow_spaces specifies whether to
+ allow whitespaces in a blank line. A true value signifies that a
+ line containing whitespaces as well as end-of-line characters
+ should be considered blank.
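+
+ For example:
+
+ >>> is_blank_line('  ')
+ False
+ >>> is_blank_line('  ', allow_spaces=1)
+ True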
+
+ """
+ if not line:
+ return 1
+ if allow_spaces:
+ return line.rstrip() == ''
+ return line[0] == '\n' or line[0] == '\r'
+
+def safe_readline(handle):
+ """safe_readline(handle) -> line
+
+ Read a line from an UndoHandle and return it. If there are no more
+ lines to read, I will raise a ValueError.
+
+ """
+ line = handle.readline()
+ if not line:
+ raise ValueError("Unexpected end of stream.")
+ return line
+
+def safe_peekline(handle):
+ """safe_peekline(handle) -> line
+
+ Peek at the next line in an UndoHandle and return it. If there are no
+ more lines to peek, I will raise a ValueError.
+
+ """
+ line = handle.peekline()
+ if not line:
+ raise ValueError("Unexpected end of stream.")
+ return line
--- /dev/null
+"""Third party and other parsers useful internally to Biopython.
+"""
--- /dev/null
+# Copyright (c) 1998-2000 John Aycock
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+__version__ = 'SPARK-0.6.1'
+
+import re
+import sys
+
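+# Collect the attribute names reachable from an instance's class and all of
+# its base classes, without duplicates; used below to find the t_* and p_*
+# methods by reflection.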
+def _namelist(instance):
+ namelist, namedict, classlist = [], {}, [instance.__class__]
+ for c in classlist:
+ for b in c.__bases__:
+ classlist.append(b)
+ for name in dir(c):
+ if name not in namedict:
+ namelist.append(name)
+ namedict[name] = 1
+ return namelist
+
+class GenericScanner:
+ def __init__(self):
+ pattern = self.reflect()
+ self.re = re.compile(pattern, re.VERBOSE)
+
+ self.index2func = {}
+ for name, number in self.re.groupindex.items():
+ self.index2func[number-1] = getattr(self, 't_' + name)
+
+ def makeRE(self, name):
+ doc = getattr(self, name).__doc__
+ rv = '(?P<%s>%s)' % (name[2:], doc)
+ return rv
+
+ def reflect(self):
+ rv = []
+ for name in _namelist(self):
+ if name[:2] == 't_' and name != 't_default':
+ rv.append(self.makeRE(name))
+
+ rv.append(self.makeRE('t_default'))
+ return '|'.join(rv)
+
+ def error(self, s, pos):
+ print "Lexical error at position %s" % pos
+ raise SystemExit
+
+ def tokenize(self, s):
+ pos = 0
+ n = len(s)
+ while pos < n:
+ m = self.re.match(s, pos)
+ if m is None:
+ self.error(s, pos)
+
+ groups = m.groups()
+ for i in range(len(groups)):
+ if groups[i] and i in self.index2func:
+ self.index2func[i](groups[i])
+ pos = m.end()
+
+ def t_default(self, s):
+ r'( . | \n )+'
+ pass
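+
+# Illustrative sketch (hypothetical subclass): GenericScanner finds t_*
+# methods by reflection and uses each method's docstring as the regular
+# expression for that token (compiled with re.VERBOSE):
+#
+#     class SimpleScanner(GenericScanner):
+#         def t_number(self, s):
+#             r' \d+ '
+#             print 'NUMBER', s
+#
+#     SimpleScanner().tokenize('12 345')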
+
+class GenericParser:
+ def __init__(self, start):
+ self.rules = {}
+ self.rule2func = {}
+ self.rule2name = {}
+ self.collectRules()
+ self.startRule = self.augment(start)
+ self.ruleschanged = 1
+
+ _START = 'START'
+ _EOF = 'EOF'
+
+ #
+ # A hook for GenericASTBuilder and GenericASTMatcher.
+ #
+ def preprocess(self, rule, func): return rule, func
+
+ def addRule(self, doc, func):
+ rules = doc.split()
+
+ index = []
+ for i in range(len(rules)):
+ if rules[i] == '::=':
+ index.append(i-1)
+ index.append(len(rules))
+
+ for i in range(len(index)-1):
+ lhs = rules[index[i]]
+ rhs = rules[index[i]+2:index[i+1]]
+ rule = (lhs, tuple(rhs))
+
+ rule, fn = self.preprocess(rule, func)
+
+ if lhs in self.rules:
+ self.rules[lhs].append(rule)
+ else:
+ self.rules[lhs] = [ rule ]
+ self.rule2func[rule] = fn
+ self.rule2name[rule] = func.__name__[2:]
+ self.ruleschanged = 1
+
+ def collectRules(self):
+ for name in _namelist(self):
+ if name[:2] == 'p_':
+ func = getattr(self, name)
+ doc = func.__doc__
+ self.addRule(doc, func)
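+
+ # Illustrative sketch (hypothetical method): each p_* method's
+ # docstring holds one or more '::=' productions, e.g.
+ #
+ #     def p_expr(self, args):
+ #         '''
+ #             expr ::= expr + term
+ #             expr ::= term
+ #         '''
+ #         return args[0]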
+
+ def augment(self, start):
+ #
+ # Tempting though it is, this isn't made into a call
+ # to self.addRule() because the start rule shouldn't
+ # be subject to preprocessing.
+ #
+ startRule = (self._START, ( start, self._EOF ))
+ self.rule2func[startRule] = lambda args: args[0]
+ self.rules[self._START] = [ startRule ]
+ self.rule2name[startRule] = ''
+ return startRule
+
+ def makeFIRST(self):
+ union = {}
+ self.first = {}
+
+ for rulelist in self.rules.values():
+ for lhs, rhs in rulelist:
+ if lhs not in self.first:
+ self.first[lhs] = {}
+
+ if len(rhs) == 0:
+ self.first[lhs][None] = 1
+ continue
+
+ sym = rhs[0]
+ if sym not in self.rules:
+ self.first[lhs][sym] = 1
+ else:
+ union[(sym, lhs)] = 1
+ changes = 1
+ while changes:
+ changes = 0
+ for src, dest in union.keys():
+ destlen = len(self.first[dest])
+ self.first[dest].update(self.first[src])
+ if len(self.first[dest]) != destlen:
+ changes = 1
+
+ #
+ # An Earley parser, as per J. Earley, "An Efficient Context-Free
+ # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
+ # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
+ # Carnegie-Mellon University, August 1968, p. 27.
+ #
+
+ def typestring(self, token):
+ return None
+
+ def error(self, token):
+ print "Syntax error at or near `%s' token" % token
+ raise SystemExit
+
+ def parse(self, tokens):
+ tree = {}
+ tokens.append(self._EOF)
+ states = { 0: [ (self.startRule, 0, 0) ] }
+
+ if self.ruleschanged:
+ self.makeFIRST()
+
+ for i in xrange(len(tokens)):
+ states[i+1] = []
+
+ if states[i] == []:
+ break
+ self.buildState(tokens[i], states, i, tree)
+
+ #_dump(tokens, states)
+
+ if i < len(tokens)-1 or states[i+1] != [(self.startRule, 2, 0)]:
+ del tokens[-1]
+ self.error(tokens[i-1])
+ rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i+1))
+ del tokens[-1]
+ return rv
+
+ def buildState(self, token, states, i, tree):
+ needsCompletion = {}
+ state = states[i]
+ predicted = {}
+
+ for item in state:
+ rule, pos, parent = item
+ lhs, rhs = rule
+
+ #
+ # A -> a . (completer)
+ #
+ if pos == len(rhs):
+ if len(rhs) == 0:
+ needsCompletion[lhs] = (item, i)
+
+ for pitem in states[parent]:
+ if pitem is item:
+ break
+
+ prule, ppos, pparent = pitem
+ plhs, prhs = prule
+
+ if prhs[ppos:ppos+1] == (lhs,):
+ new = (prule,
+ ppos+1,
+ pparent)
+ if new not in state:
+ state.append(new)
+ tree[(new, i)] = [(item, i)]
+ else:
+ tree[(new, i)].append((item, i))
+ continue
+
+ nextSym = rhs[pos]
+
+ #
+ # A -> a . B (predictor)
+ #
+ if nextSym in self.rules:
+ #
+ # Work on completer step some more; for rules
+ # with empty RHS, the "parent state" is the
+ # current state we're adding Earley items to,
+ # so the Earley items the completer step needs
+ # may not all be present when it runs.
+ #
+ if nextSym in needsCompletion:
+ new = (rule, pos+1, parent)
+ olditem_i = needsCompletion[nextSym]
+ if new not in state:
+ state.append(new)
+ tree[(new, i)] = [olditem_i]
+ else:
+ tree[(new, i)].append(olditem_i)
+
+ #
+ # Has this been predicted already?
+ #
+ if nextSym in predicted:
+ continue
+ predicted[nextSym] = 1
+
+ ttype = token is not self._EOF and \
+ self.typestring(token) or \
+ None
+ if ttype is not None:
+ #
+ # Even smarter predictor, when the
+ # token's type is known. The code is
+ # grungy, but runs pretty fast. Three
+ # cases are looked for: rules with
+ # empty RHS; first symbol on RHS is a
+ # terminal; first symbol on RHS is a
+ # nonterminal (and isn't nullable).
+ #
+ for prule in self.rules[nextSym]:
+ new = (prule, 0, i)
+ prhs = prule[1]
+ if len(prhs) == 0:
+ state.append(new)
+ continue
+ prhs0 = prhs[0]
+ if prhs0 not in self.rules:
+ if prhs0 != ttype:
+ continue
+ else:
+ state.append(new)
+ continue
+ first = self.first[prhs0]
+ if None not in first and \
+ ttype not in first:
+ continue
+ state.append(new)
+ continue
+
+ for prule in self.rules[nextSym]:
+ #
+ # Smarter predictor, as per Grune &
+ # Jacobs' _Parsing Techniques_. Not
+ # as good as FIRST sets though.
+ #
+ prhs = prule[1]
+ if len(prhs) > 0 and \
+ prhs[0] not in self.rules and \
+ token != prhs[0]:
+ continue
+ state.append((prule, 0, i))
+
+ #
+ # A -> a . c (scanner)
+ #
+ elif token == nextSym:
+ #assert new not in states[i+1]
+ states[i+1].append((rule, pos+1, parent))
+
+ def buildTree(self, tokens, tree, root):
+ stack = []
+ self.buildTree_r(stack, tokens, -1, tree, root)
+ return stack[0]
+
+ def buildTree_r(self, stack, tokens, tokpos, tree, root):
+ (rule, pos, parent), state = root
+
+ while pos > 0:
+ want = ((rule, pos, parent), state)
+ if want not in tree:
+ #
+ # Since pos > 0, it didn't come from closure,
+ # and if it isn't in tree[], then there must
+ # be a terminal symbol to the left of the dot.
+ # (It must be from a "scanner" step.)
+ #
+ pos = pos - 1
+ state = state - 1
+ stack.insert(0, tokens[tokpos])
+ tokpos = tokpos - 1
+ else:
+ #
+ # There's a NT to the left of the dot.
+ # Follow the tree pointer recursively (>1
+ # tree pointers from it indicates ambiguity).
+ # Since the item must have come about from a
+ # "completer" step, the state where the item
+ # came from must be the parent state of the
+ # item the tree pointer points to.
+ #
+ children = tree[want]
+ if len(children) > 1:
+ child = self.ambiguity(children)
+ else:
+ child = children[0]
+
+ tokpos = self.buildTree_r(stack,
+ tokens, tokpos,
+ tree, child)
+ pos = pos - 1
+ (crule, cpos, cparent), cstate = child
+ state = cparent
+
+ lhs, rhs = rule
+ result = self.rule2func[rule](stack[:len(rhs)])
+ stack[:len(rhs)] = [result]
+ return tokpos
+
+ def ambiguity(self, children):
+ #
+ # XXX - problem here and in collectRules() if the same
+ # rule appears in >1 method. But in that case the
+ # user probably gets what they deserve :-) Also
+ # undefined results if rules causing the ambiguity
+ # appear in the same method.
+ #
+ sortlist = []
+ name2index = {}
+ for i in range(len(children)):
+ ((rule, pos, parent), index) = children[i]
+ lhs, rhs = rule
+ name = self.rule2name[rule]
+ sortlist.append((len(rhs), name))
+ name2index[name] = i
+ sortlist.sort()
+ list = [b for (a, b) in sortlist]
+ return children[name2index[self.resolve(list)]]
+
+ def resolve(self, list):
+ #
+ # Resolve ambiguity in favor of the shortest RHS.
+ # Since we walk the tree from the top down, this
+ # should effectively resolve in favor of a "shift".
+ #
+ return list[0]
+
+#
+# GenericASTBuilder automagically constructs a concrete/abstract syntax tree
+# for a given input. The extra argument is a class (not an instance!)
+# which supports the "__setslice__" and "__len__" methods.
+#
+# XXX - silently overrides any user code in methods.
+#
+
+class GenericASTBuilder(GenericParser):
+ def __init__(self, AST, start):
+ GenericParser.__init__(self, start)
+ self.AST = AST
+
+ def preprocess(self, rule, func):
+ rebind = lambda lhs, self=self: \
+ lambda args, lhs=lhs, self=self: \
+ self.buildASTNode(args, lhs)
+ lhs, rhs = rule
+ return rule, rebind(lhs)
+
+ def buildASTNode(self, args, lhs):
+ children = []
+ for arg in args:
+ if isinstance(arg, self.AST):
+ children.append(arg)
+ else:
+ children.append(self.terminal(arg))
+ return self.nonterminal(lhs, children)
+
+ def terminal(self, token): return token
+
+ def nonterminal(self, type, args):
+ rv = self.AST(type)
+ rv[:len(args)] = args
+ return rv
+
+#
+# GenericASTTraversal is a Visitor pattern according to Design Patterns. For
+# each node it attempts to invoke the method n_<node type>, falling
+# back onto the default() method if the n_* can't be found. The preorder
+# traversal also looks for an exit hook named n_<node type>_exit (no default
+# routine is called if it's not found). To prematurely halt traversal
+# of a subtree, call the prune() method -- this only makes sense for a
+# preorder traversal. Node type is determined via the typestring() method.
+#
+
+class GenericASTTraversalPruningException:
+ pass
+
+class GenericASTTraversal:
+ def __init__(self, ast):
+ self.ast = ast
+
+ def typestring(self, node):
+ return node.type
+
+ def prune(self):
+ raise GenericASTTraversalPruningException
+
+ def preorder(self, node=None):
+ if node is None:
+ node = self.ast
+
+ try:
+ name = 'n_' + self.typestring(node)
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+ else:
+ self.default(node)
+ except GenericASTTraversalPruningException:
+ return
+
+ for kid in node:
+ self.preorder(kid)
+
+ name = name + '_exit'
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+
+ def postorder(self, node=None):
+ if node is None:
+ node = self.ast
+
+ for kid in node:
+ self.postorder(kid)
+
+ name = 'n_' + self.typestring(node)
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+ else:
+ self.default(node)
+
+
+ def default(self, node):
+ pass
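+
+# Illustrative sketch (hypothetical subclass): a node whose typestring()
+# is 'expr' is dispatched to n_expr; other node types fall back to the
+# default() method.
+#
+#     class Printer(GenericASTTraversal):
+#         def n_expr(self, node):
+#             print 'saw an expr node'
+#
+#     Printer(ast).preorder()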
+
+#
+# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
+# implemented.
+#
+# XXX - makes assumptions about how GenericParser walks the parse tree.
+#
+
+class GenericASTMatcher(GenericParser):
+ def __init__(self, start, ast):
+ GenericParser.__init__(self, start)
+ self.ast = ast
+
+ def preprocess(self, rule, func):
+ rebind = lambda func, self=self: \
+ lambda args, func=func, self=self: \
+ self.foundMatch(args, func)
+ lhs, rhs = rule
+ rhslist = list(rhs)
+ rhslist.reverse()
+
+ return (lhs, tuple(rhslist)), rebind(func)
+
+ def foundMatch(self, args, func):
+ func(args[-1])
+ return args[-1]
+
+ def match_r(self, node):
+ self.input.insert(0, node)
+ children = 0
+
+ for child in node:
+ if children == 0:
+ self.input.insert(0, '(')
+ children = children + 1
+ self.match_r(child)
+
+ if children > 0:
+ self.input.insert(0, ')')
+
+ def match(self, ast=None):
+ if ast is None:
+ ast = self.ast
+ self.input = []
+
+ self.match_r(ast)
+ self.parse(self.input)
+
+ def resolve(self, list):
+ #
+ # Resolve ambiguity in favor of the longest RHS.
+ #
+ return list[-1]
+
+def _dump(tokens, states):
+ for i in range(len(states)):
+ print 'state', i
+ for (lhs, rhs), pos, parent in states[i]:
+ print '\t', lhs, '::=',
+ print ' '.join(rhs[:pos]),
+ print '.',
+ print ' '.join(rhs[pos:]),
+ print ',', parent
+ if i < len(tokens):
+ print
+ print 'token', str(tokens[i])
+ print
--- /dev/null
+# Stores properties associated with the class of an object.
+
+
+# Would it be nice to have support for more than one resolver per
+# class? In the meanwhile, they could collude using a dispatch
+# object.
+
+# Do you need access to the actual resolver?
+
+# Resolvers get the sequence because they may do a per-object lookup.
+
+# Could cache search results for better performance.
+
+
+# Dictionary which creates dictionary elements, so lookups never fail.
+# The new elements are always dictionaries.
+class CreateDict(dict):
+ def __getitem__(self, key):
+ return self.setdefault(key,{})
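+
+# e.g. (illustrative):
+#
+#     d = CreateDict()
+#     d["anything"]["key"] = 1   # never raises KeyError; the inner
+#                                # dictionary is created on first access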
+
+class PropertyManager:
+ def __init__(self):
+ self.class_property = CreateDict()
+ self.class_property_resolver = CreateDict()
+ self.class_resolver = {}
+
+ def resolve(self, obj, property):
+ try:
+ klass = obj.__class__
+ except AttributeError:
+ raise KeyError("built-in instance")
+
+ return self.resolve_class(klass, property)
+
+ def resolve_class(self, klass, property):
+ # Hopefully, we'll find the hit right away
+ try:
+ return self.class_property[klass][property]
+ except KeyError:
+ pass
+
+ # Is there a property resolver?
+ try:
+ return self.class_property_resolver[klass][property](
+ self, klass, property)
+ except KeyError:
+ pass
+
+ # What about the class resolver?
+ try:
+ return self.class_resolver[klass](self, klass, property)
+ except KeyError:
+ pass
+
+ # That failed, so we walk up the class tree, breadth-first and
+ # right-to-left (the base list is popped from the end and new
+ # bases are prepended below). For each class, check if the
+ # property exists, then check if the property resolver exists,
+ # and finally, check for the class resolver.
+
+ bases = list(klass.__bases__)
+ while bases:
+ base = bases.pop()
+ try:
+ return self.class_property[base][property]
+ except KeyError:
+ pass
+ try:
+ return self.class_property_resolver[base][property](
+ self, klass, property)
+ except KeyError:
+ pass
+ try:
+ return self.class_resolver[base](self, klass, property)
+ except KeyError:
+ pass
+
+ # this prepend, plus pop() from the end, gives the
+ # breadth-first/right-to-left search order
+ bases[:0] = list(base.__bases__)
+ raise KeyError("cannot find property %s for class %s" \
+ % (property, klass))
+
+
+default_manager = PropertyManager()
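+
+# Illustrative sketch (hypothetical class): registering a per-class
+# property and resolving it through the default manager.
+#
+#     class Widget:
+#         pass
+#     default_manager.class_property[Widget]["colour"] = "blue"
+#     default_manager.resolve(Widget(), "colour")   # -> "blue"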
--- /dev/null
+# BLASTN 2.0a19MP-WashU [05-Feb-1998] [Build decunix3.2 01:53:29 05-Feb-1998]
+# BLASTP 2.0.4 [Feb-24-1998]
+class Algorithm:
+ def __init__(self, name, version, description = ""):
+ self.name = name # 'blastx', 'blastn', etc.
+ self.version = version # '2.1.2' or '2.0a19MP-WashU'
+ self.description = description # '[05-Feb-1998] [Build dec ...1998]'
+
+# Query= YAL001C YAL001C, Chr I from 147596 to 147665, and 147756 to 151168,
+# reverse complement
+# (3483 letters)
+class Query:
+ def __init__(self, name, accession, description, length):
+ self.name = name # 'YAL001C'
+ self.accession = accession # or None if missing
+ self.description = description # 'YAL001C, Chr I from 147596 to ... '
+ self.length = length # 3483
+
+# Database: ArabidopsisN
+# 66,211 sequences; 69,074,155 total letters.
+class Database:
+ def __init__(self, name, letters, entries):
+ self.name = name # ArabidopsisN
+ self.letters = letters # 69074155
+ self.entries = entries # 66211
+
+class TableInfo:
+ def __init__(self, full_description, info):
+ self.__dict__.update(info)
+ self.full_description = full_description
+
+
+class Search:
+ def __init__(self, algorithm, query, database, table, hits,
+ parameters, statistics):
+ self.algorithm = algorithm
+ self.query = query
+ self.database = database
+ self.table = table
+ self.hits = hits
+ self.parameters = parameters
+ self.statistics = statistics
+
+class Hit:
+ def __init__(self, name, description, accession, length,
+ algorithm, hsps = None):
+ self.name = name
+ self.description = description
+ self.accession = accession
+ self.length = length
+ self.algorithm = algorithm
+ if hsps is None:
+ hsps = []
+ self.hsps = hsps
+
+ def __len__(self):
+ return self.length
+
+
+
+# >GB_PL:ATF18F4 AL021637 Arabidopsis thaliana DNA chromosome 4, BAC clone
+# F18F4 (ESSAII project). 2/98
+# Length = 93,646
+#
+# Minus Strand HSPs:
+#
+# Score = 226 (33.9 bits), Expect = 0.80, P = 0.55
+# Identities = 98/142 (69%), Positives = 98/142 (69%), Strand = Minus / Plus
+# [...lines deleted...]
+# Query: 2486 ATATCAAGCAATTTGATAAGATCTAG 2461
+# A AT A C ATT GA AAGATC AG
+# Sbjct: 85387 AGATTTACCTATT-GAGAAGATCAAG 85411
+
+# computed from the strings
+class _SeqLength:
+ def __init__(self, length, identical, positives, gaps):
+ self.length = length
+ self.identical = identical
+ self.positives = positives
+ self.gaps = gaps
+ def __len__(self):
+ return self.length
+ def __getattr__(self, name):
+ if name == "frac_identical":
+ return float(self.identical) / self.length
+ elif name == "frac_positives":
+ return float(self.positives) / self.length
+ raise AttributeError(name)
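+
+# e.g. (illustrative): _SeqLength(142, 98, 98, 0).frac_identical is computed
+# on demand by __getattr__ above, giving 98.0 / 142.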
+
+
+class HomologySeq(_SeqLength):
+ def __init__(self, seq, identical, positives, gaps):
+ _SeqLength.__init__(self, len(seq), identical, positives, gaps)
+ self.seq = seq
+
+class HSPSeq(_SeqLength):
+ def __init__(self, name, seq, location, identical, positives, gaps):
+ _SeqLength.__init__(self, len(seq), identical, positives, gaps)
+ self.name = name
+ self.seq = seq
+ self.location = location
+
+
+class HSP(_SeqLength):
+ def __init__(self,
+ query_seq, # ATATCAAGCAATTTGATAAGATCTAG
+ homology_seq, # A AT A C ATT GA AAGATC AG
+ subject_seq, # AGATTTACCTATT-GAGAAGATCAAG
+
+ query_location, # (2486, 2461, negative strand)
+ subject_location, # (85387, 85411)
+
+ query_name, # Query (or None)
+ subject_name, # Sbjct (or None)
+
+ algorithm, # an Algorithm
+ info, # contains key/value pairs
+ homology_gaps = None, # Is this needed?
+ ):
+ assert len(query_seq) == len(homology_seq) == len(subject_seq), \
+ (query_seq, homology_seq, subject_seq)
+ self.algorithm = algorithm
+
+ query_gaps = query_seq.count("-")
+ subject_gaps = subject_seq.count("-")
+ if homology_gaps is None:
+ homology_gaps = query_gaps + subject_gaps
+ self.info = info
+
+ identical = info["identical"]
+ # bioperl calls this 'conserved'
+ positives = info.get("positives", identical)
+
+ _SeqLength.__init__(self, len(query_seq), identical,
+ positives, homology_gaps)
+
+ self.query = HSPSeq(name = query_name,
+ seq = query_seq,
+ location = query_location,
+ identical = identical,
+ positives = positives,
+ gaps = query_gaps)
+
+ self.subject = HSPSeq(name = subject_name,
+ seq = subject_seq,
+ location = subject_location,
+ identical = identical,
+ positives = positives,
+ gaps = subject_gaps)
+ self.homology = HomologySeq(seq = homology_seq,
+ identical = identical,
+ positives = positives,
+ gaps = homology_gaps)
--- /dev/null
+# Copyright 2000-2002 Brad Chapman.
+# Copyright 2004-2005 by M de Hoon.
+# Copyright 2007-2009 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Provides objects to represent biological sequences with alphabets.
+
+See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
+"""
+__docformat__ ="epytext en" #Don't just use plain text in epydoc API pages!
+
+import string #for maketrans only
+import array
+import sys
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+import Alphabet
+from Alphabet import IUPAC
+from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
+from Bio.Data import CodonTable
+
+def _maketrans(complement_mapping) :
+ """Makes a python string translation table (PRIVATE).
+
+ Arguments:
+ - complement_mapping - a dictionary such as ambiguous_dna_complement
+ and ambiguous_rna_complement from Data.IUPACData.
+
+ Returns a translation table (a string of length 256) for use with the
+ python string's translate method to use in a (reverse) complement.
+
+ Compatible with lower case and upper case sequences.
+
+ For internal use only.
+ """
+ before = ''.join(complement_mapping.keys())
+ after = ''.join(complement_mapping.values())
+ before = before + before.lower()
+ after = after + after.lower()
+ return string.maketrans(before, after)
+
+_dna_complement_table = _maketrans(ambiguous_dna_complement)
+_rna_complement_table = _maketrans(ambiguous_rna_complement)
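+
+# e.g. (illustrative): "ACGTacgt".translate(_dna_complement_table) gives
+# "TGCAtgca"; IUPAC ambiguity codes are mapped too (e.g. "R" <-> "Y").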
+
+class Seq(object):
+ """A read-only sequence object (essentially a string with an alphabet).
+
+ Like normal python strings, our basic sequence object is immutable.
+ This prevents you from doing my_seq[5] = "A" for example, but does allow
+ Seq objects to be used as dictionary keys.
+
+ The Seq object provides a number of string like methods (such as count,
+ find, split and strip), which are alphabet aware where appropriate.
+
+ The Seq object also provides some biological methods, such as complement,
+ reverse_complement, transcribe, back_transcribe and translate (which are
+ not applicable to sequences with a protein alphabet).
+ """
+ def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+ """Create a Seq object.
+
+ Arguments:
+ - data - Sequence, required (string)
+ - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
+
+ You will typically use Bio.SeqIO to read in sequences from files as
+ SeqRecord objects, whose sequence will be exposed as a Seq object via
+ the seq property.
+
+ However, you will often want to create your own Seq objects directly:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+ ... IUPAC.protein)
+ >>> my_seq
+ Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+ >>> print my_seq
+ MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+ """
+ # Enforce string storage
+ assert (type(data) == type("") or # must use a string
+ type(data) == type(u"")) # but can be a unicode string
+ self._data = data
+ self.alphabet = alphabet # Seq API requirement
+
+ # A data property is/was a Seq API requirement
+ def _set_data(self, value) :
+ #TODO - In the next release, actually raise an exception?
+ #The Seq object is like a python string, it should be read only!
+ import warnings
+ warnings.warn("Writing to the Seq object's .data propery is deprecated.",
+ DeprecationWarning)
+ self._data = value
+ data = property(fget= lambda self : str(self),
+ fset=_set_data,
+ doc="Sequence as a string (DEPRECATED)")
+
+ def __repr__(self):
+ """Returns a (truncated) representation of the sequence for debugging."""
+ if len(self) > 60 :
+ #Shows the last three letters as it is often useful to see if there
+ #is a stop codon at the end of a sequence.
+ #Note total length is 54+3+3=60
+ return "%s('%s...%s', %s)" % (self.__class__.__name__,
+ str(self)[:54], str(self)[-3:],
+ repr(self.alphabet))
+ else :
+ return "%s(%s, %s)" % (self.__class__.__name__,
+ repr(self.data),
+ repr(self.alphabet))
+ def __str__(self):
+ """Returns the full sequence as a python string.
+
+ Note that Biopython 1.44 and earlier would give a truncated
+ version of repr(my_seq) for str(my_seq). If you are writing code
+ which need to be backwards compatible with old Biopython, you
+ should continue to use my_seq.tostring() rather than str(my_seq).
+ """
+ return self._data
+
+ """
+ TODO - Work out why this breaks test_Restriction.py
+ (Comparing Seq objects would be nice to have. May need to think about
+ hashes and the in operator for when we have list/dictionary of Seq objects...)
+ def __cmp__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ return cmp(str(self), str(other))
+ elif isinstance(other, basestring) :
+ return cmp(str(self), other)
+ else :
+ raise TypeError
+ """
+
+ def __len__(self): return len(self._data) # Seq API requirement
+
+ def __getitem__(self, index) : # Seq API requirement
+ #Note since Python 2.0, __getslice__ is deprecated
+ #and __getitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Return a single letter as a string
+ return self._data[index]
+ else :
+ #Return the (sub)sequence as another Seq object
+ return Seq(self._data[index], self.alphabet)
+
+ def __add__(self, other):
+ """Add another sequence or string to this sequence."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ return self.__class__(str(self) + str(other), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(self) + other, self.alphabet)
+ else :
+ raise TypeError
+
+ def __radd__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ return self.__class__(str(other) + str(self), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(other + str(self), self.alphabet)
+ else :
+ raise TypeError
+
+ def tostring(self): # Seq API requirement
+ """Returns the full sequence as a python string.
+
+ Although not formally deprecated, you are now encouraged to use
+ str(my_seq) instead of my_seq.tostring()."""
+ return str(self)
+
+ def tomutable(self): # Needed? Or use a function?
+ """Returns the full sequence as a MutableSeq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
+ ... IUPAC.protein)
+ >>> my_seq
+ Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+ >>> my_seq.tomutable()
+ MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+
+ Note that the alphabet is preserved.
+ """
+ return MutableSeq(str(self), self.alphabet)
+
+ def _get_seq_str_and_check_alphabet(self, other_sequence) :
+ """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
+
+ For a string argument, returns the string.
+
+ For a Seq or MutableSeq, it checks the alphabet is compatible
+ (raising an exception if it isn't), and then returns a string.
+ """
+ try :
+ other_alpha = other_sequence.alphabet
+ except AttributeError :
+ #Assume other_sequence is a string
+ return other_sequence
+
+ #Other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other_alpha)))
+ #Return as a string
+ return str(other_sequence)
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string method of the same name,
+ which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("AAAATGA")
+ >>> print my_seq.count("A")
+ 5
+ >>> print my_seq.count("ATG")
+ 1
+ >>> print my_seq.count(Seq("AT"))
+ 1
+ >>> print my_seq.count("AT", 2, -1)
+ 1
+
+ HOWEVER, please note that because python strings and Seq objects (and
+ MutableSeq objects) do a non-overlapping search, this may not give
+ the answer you expect:
+
+ >>> "AAAA".count("AA")
+ 2
+ >>> print Seq("AAAA").count("AA")
+ 2
+
+ An overlapping search would give the answer as three!
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).count(sub_str, start, end)
+
+ def find(self, sub, start=0, end=sys.maxint):
+ """Find method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Returns an integer, the index of the first occurrence of substring
+ argument sub in the (sub)sequence given by [start:end].
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ Returns -1 if the subsequence is NOT found.
+
+ e.g. Locating the first typical start codon, AUG, in an RNA sequence:
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.find("AUG")
+ 3
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).find(sub_str, start, end)
+
+ def rfind(self, sub, start=0, end=sys.maxint):
+ """Find from right method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Returns an integer, the index of the last (right most) occurrence of
+ substring argument sub in the (sub)sequence given by [start:end].
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ Returns -1 if the subsequence is NOT found.
+
+ e.g. Locating the last typical start codon, AUG, in an RNA sequence:
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.rfind("AUG")
+ 15
+ """
+ #If it has one, check the alphabet:
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ return str(self).rfind(sub_str, start, end)
+
+ def startswith(self, prefix, start=0, end=sys.maxint) :
+ """Does the Seq start with the given prefix? Returns True/False.
+
+ This behaves like the python string method of the same name.
+
+ Return True if the sequence starts with the specified prefix
+ (a string or another Seq object), False otherwise.
+ With optional start, test sequence beginning at that position.
+ With optional end, stop comparing sequence at that position.
+ prefix can also be a tuple of strings to try. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.startswith("GUC")
+ True
+ >>> my_rna.startswith("AUG")
+ False
+ >>> my_rna.startswith("AUG", 3)
+ True
+ >>> my_rna.startswith(("UCC","UCA","UCG"),1)
+ True
+ """
+ #If it has one, check the alphabet:
+ if isinstance(prefix, tuple) :
+ #TODO - Once we drop support for Python 2.4, instead of this
+ #loop offload to the string method (requires Python 2.5+).
+ #Check all the alphabets first...
+ prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
+ for p in prefix]
+ for prefix_str in prefix_strings :
+ if str(self).startswith(prefix_str, start, end) :
+ return True
+ return False
+ else :
+ prefix_str = self._get_seq_str_and_check_alphabet(prefix)
+ return str(self).startswith(prefix_str, start, end)
+
+ def endswith(self, suffix, start=0, end=sys.maxint) :
+ """Does the Seq end with the given suffix? Returns True/False.
+
+ This behaves like the python string method of the same name.
+
+ Return True if the sequence ends with the specified suffix
+ (a string or another Seq object), False otherwise.
+ With optional start, test sequence beginning at that position.
+ With optional end, stop comparing sequence at that position.
+ suffix can also be a tuple of strings to try. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_rna.endswith("UUG")
+ True
+ >>> my_rna.endswith("AUG")
+ False
+ >>> my_rna.endswith("AUG", 0, 18)
+ True
+ >>> my_rna.endswith(("UCC","UCA","UUG"))
+ True
+ """
+ #If it has one, check the alphabet:
+ if isinstance(suffix, tuple) :
+ #TODO - Once we drop support for Python 2.4, instead of this
+ #loop offload to the string method (requires Python 2.5+).
+ #Check all the alphabets first...
+ suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
+ for p in suffix]
+ for suffix_str in suffix_strings :
+ if str(self).endswith(suffix_str, start, end) :
+ return True
+ return False
+ else :
+ suffix_str = self._get_seq_str_and_check_alphabet(suffix)
+ return str(self).endswith(suffix_str, start, end)
+
+
+ def split(self, sep=None, maxsplit=-1) :
+ """Split method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Return a list of the 'words' in the string (as Seq objects),
+ using sep as the delimiter string. If maxsplit is given, at
+ most maxsplit splits are done. If maxsplit is omitted, all
+ splits are made.
+
+ Following the python string method, sep will by default be any
+ white space (tabs, spaces, newlines) but this is unlikely to
+ apply to biological sequences.
+
+ e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+ >>> my_aa = my_rna.translate()
+ >>> my_aa
+ Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> my_aa.split("*")
+ [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+ >>> my_aa.split("*",1)
+ [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+
+ See also the rsplit method:
+
+ >>> my_aa.rsplit("*",1)
+ [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+ """
+ #If it has one, check the alphabet:
+ sep_str = self._get_seq_str_and_check_alphabet(sep)
+ #TODO - If the sep is the defined stop symbol, or gap char,
+ #should we adjust the alphabet?
+ return [Seq(part, self.alphabet) \
+ for part in str(self).split(sep_str, maxsplit)]
+
+ def rsplit(self, sep=None, maxsplit=-1) :
+ """Right split method, like that of a python string.
+
+ This behaves like the python string method of the same name.
+
+ Return a list of the 'words' in the string (as Seq objects),
+ using sep as the delimiter string. If maxsplit is given, at
+ most maxsplit splits are done COUNTING FROM THE RIGHT.
+ If maxsplit is omitted, all splits are made.
+
+ Following the python string method, sep will by default be any
+ white space (tabs, spaces, newlines) but this is unlikely to
+ apply to biological sequences.
+
+ e.g. print my_seq.rsplit("*",1)
+
+ See also the split method.
+ """
+ #If it has one, check the alphabet:
+ sep_str = self._get_seq_str_and_check_alphabet(sep)
+ try :
+ return [Seq(part, self.alphabet) \
+ for part in str(self).rsplit(sep_str, maxsplit)]
+ except AttributeError :
+ #Python 2.3 doesn't have a string rsplit method, which we can
+ #work around by reversing the sequence, using (left) split,
+ #and then reversing the answer. Not very efficient!
+ words = [Seq(word[::-1], self.alphabet) for word \
+ in str(self)[::-1].split(sep_str[::-1], maxsplit)]
+ words.reverse()
+ return words
+
+ def strip(self, chars=None) :
+ """Returns a new Seq object with leading and trailing ends stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ omitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. print my_seq.strip("-")
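+
+ For example:
+
+ >>> from Bio.Seq import Seq
+ >>> Seq("-ACGT-").strip("-")
+ Seq('ACGT', Alphabet())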
+
+ See also the lstrip and rstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).strip(strip_str), self.alphabet)
+
+ def lstrip(self, chars=None) :
+ """Returns a new Seq object with leading (left) end stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ omitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. print my_seq.lstrip("-")
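+
+ For example:
+
+ >>> from Bio.Seq import Seq
+ >>> Seq("-ACGT-").lstrip("-")
+ Seq('ACGT-', Alphabet())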
+
+ See also the strip and rstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).lstrip(strip_str), self.alphabet)
+
+ def rstrip(self, chars=None) :
+ """Returns a new Seq object with trailing (right) end stripped.
+
+ This behaves like the python string method of the same name.
+
+ Optional argument chars defines which characters to remove. If
+ omitted or None (default) then as for the python string method,
+ this defaults to removing any white space.
+
+ e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
+
+ >>> from Bio.Alphabet import IUPAC
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
+ >>> my_seq
+ Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
+ >>> my_seq.rstrip("A")
+ Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
+
+ See also the strip and lstrip methods.
+ """
+ #If it has one, check the alphabet:
+ strip_str = self._get_seq_str_and_check_alphabet(chars)
+ return Seq(str(self).rstrip(strip_str), self.alphabet)
+
+ def complement(self):
+ """Returns the complement sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
+ >>> my_dna
+ Seq('CCCCCGATAG', IUPACUnambiguousDNA())
+ >>> my_dna.complement()
+ Seq('GGGGGCTATC', IUPACUnambiguousDNA())
+
+ You can of course use mixed case sequences,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
+ >>> my_dna
+ Seq('CCCCCgatA-GD', DNAAlphabet())
+ >>> my_dna.complement()
+ Seq('GGGGGctaT-CH', DNAAlphabet())
+
+ Note in the above example, ambiguous character D denotes
+ G, A or T so its complement is H (for C, T or A).
+
+ Trying to complement a protein sequence raises an exception.
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.complement()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins do not have complements!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ if isinstance(base, Alphabet.DNAAlphabet) :
+ ttable = _dna_complement_table
+ elif isinstance(base, Alphabet.RNAAlphabet) :
+ ttable = _rna_complement_table
+ elif ('U' in self._data or 'u' in self._data) \
+ and ('T' in self._data or 't' in self._data):
+ #TODO - Handle this cleanly?
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in self._data or 'u' in self._data:
+ ttable = _rna_complement_table
+ else:
+ ttable = _dna_complement_table
+ #Much faster on really long sequences than the previous loop based one.
+ #thx to Michael Palmer, University of Waterloo
+ return Seq(str(self).translate(ttable), self.alphabet)
+
+ def reverse_complement(self):
+ """Returns the reverse complement sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
+ >>> my_dna
+ Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
+ >>> my_dna.reverse_complement()
+ Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
+
+ Note in the above example, since R = G or A, its complement
+ is Y (which denotes C or T).
+
+ You can of course use mixed case sequences,
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
+ >>> my_dna
+ Seq('CCCCCgatA-G', DNAAlphabet())
+ >>> my_dna.reverse_complement()
+ Seq('C-TatcGGGGG', DNAAlphabet())
+
+ Trying to complement a protein sequence raises an exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.reverse_complement()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins do not have complements!
+ """
+ #Use -1 stride/step to reverse the complement
+ return self.complement()[::-1]
+
+ def transcribe(self):
+ """Returns the RNA sequence from a DNA sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
+ ... IUPAC.unambiguous_dna)
+ >>> coding_dna
+ Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
+ >>> coding_dna.transcribe()
+ Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
+
+ Trying to transcribe a protein or RNA sequence raises an exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.transcribe()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins cannot be transcribed!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be transcribed!")
+ if isinstance(base, Alphabet.RNAAlphabet) :
+ raise ValueError("RNA cannot be transcribed!")
+
+ if self.alphabet==IUPAC.unambiguous_dna:
+ alphabet = IUPAC.unambiguous_rna
+ elif self.alphabet==IUPAC.ambiguous_dna:
+ alphabet = IUPAC.ambiguous_rna
+ else:
+ alphabet = Alphabet.generic_rna
+ return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
+
+ def back_transcribe(self):
+ """Returns the DNA sequence from an RNA sequence. New Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
+ ... IUPAC.unambiguous_rna)
+ >>> messenger_rna
+ Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
+ >>> messenger_rna.back_transcribe()
+ Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
+
+ Trying to back-transcribe a protein or DNA sequence raises an
+ exception:
+
+ >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
+ >>> my_protein.back_transcribe()
+ Traceback (most recent call last):
+ ...
+ ValueError: Proteins cannot be back transcribed!
+ """
+ base = Alphabet._get_base_alphabet(self.alphabet)
+ if isinstance(base, Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be back transcribed!")
+ if isinstance(base, Alphabet.DNAAlphabet) :
+ raise ValueError("DNA cannot be back transcribed!")
+
+ if self.alphabet==IUPAC.unambiguous_rna:
+ alphabet = IUPAC.unambiguous_dna
+ elif self.alphabet==IUPAC.ambiguous_rna:
+ alphabet = IUPAC.ambiguous_dna
+ else:
+ alphabet = Alphabet.generic_dna
+ return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
+
+ def translate(self, table="Standard", stop_symbol="*", to_stop=False):
+ """Turns a nucleotide sequence into a protein sequence. New Seq object.
+
+ This method will translate DNA or RNA sequences, and those with a
+ nucleotide or generic alphabet. Trying to translate a protein
+ sequence raises an exception.
+
+ Arguments:
+ - table - Which codon table to use? This can be either a name
+ (string) or an NCBI identifier (integer). This defaults
+ to the "Standard" table.
+ - stop_symbol - Single character string, what to use for terminators.
+ This defaults to the asterisk, "*".
+ - to_stop - Boolean, defaults to False meaning do a full translation
+ continuing on past any stop codons (translated as the
+ specified stop_symbol). If True, translation is
+ terminated at the first in frame stop codon (and the
+ stop_symbol is not appended to the returned protein
+ sequence).
+
+ e.g. Using the standard table:
+
+ >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
+ >>> coding_dna.translate()
+ Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> coding_dna.translate(stop_symbol="@")
+ Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
+ >>> coding_dna.translate(to_stop=True)
+ Seq('VAIVMGR', ExtendedIUPACProtein())
+
+ Now using NCBI table 2, where TGA is not a stop codon:
+
+ >>> coding_dna.translate(table=2)
+ Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
+ >>> coding_dna.translate(table=2, to_stop=True)
+ Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
+
+ If the sequence has no in-frame stop codon, then the to_stop argument
+ has no effect:
+
+ >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
+ >>> coding_dna2.translate()
+ Seq('LAIVMGR', ExtendedIUPACProtein())
+ >>> coding_dna2.translate(to_stop=True)
+ Seq('LAIVMGR', ExtendedIUPACProtein())
+
+ NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+ or a stop codon. These are translated as "X". Any invalid codon
+ (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+ NOTE - Does NOT support gapped sequences.
+
+ NOTE - This does NOT behave like the python string's translate
+ method. For that use str(my_seq).translate(...) instead.
+ """
+ try:
+ table_id = int(table)
+ except ValueError:
+ table_id = None
+ if isinstance(table, str) and len(table)==256 :
+ raise ValueError("The Seq object translate method DOES NOT take " \
+ + "a 256 character string mapping table like " \
+ + "the python string object's translate method. " \
+ + "Use str(my_seq).translate(...) instead.")
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be translated!")
+ if self.alphabet==IUPAC.unambiguous_dna:
+ #Will use standard IUPAC protein alphabet, no need for X
+ if table_id is None:
+ codon_table = CodonTable.unambiguous_dna_by_name[table]
+ else:
+ codon_table = CodonTable.unambiguous_dna_by_id[table_id]
+ #elif self.alphabet==IUPAC.ambiguous_dna:
+ # if table_id is None:
+ # codon_table = CodonTable.ambiguous_dna_by_name[table]
+ # else:
+ # codon_table = CodonTable.ambiguous_dna_by_id[table_id]
+ elif self.alphabet==IUPAC.unambiguous_rna:
+ #Will use standard IUPAC protein alphabet, no need for X
+ if table_id is None:
+ codon_table = CodonTable.unambiguous_rna_by_name[table]
+ else:
+ codon_table = CodonTable.unambiguous_rna_by_id[table_id]
+ #elif self.alphabet==IUPAC.ambiguous_rna:
+ # if table_id is None:
+ # codon_table = CodonTable.ambiguous_rna_by_name[table]
+ # else:
+ # codon_table = CodonTable.ambiguous_rna_by_id[table_id]
+ else:
+ #This will use the extended IUPAC protein alphabet with X etc.
+ #The same table can be used for RNA or DNA (we use this for
+ #translating strings).
+ if table_id is None:
+ codon_table = CodonTable.ambiguous_generic_by_name[table]
+ else:
+ codon_table = CodonTable.ambiguous_generic_by_id[table_id]
+ protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
+ if stop_symbol in protein :
+ alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
+ stop_symbol = stop_symbol)
+ else :
+ alphabet = codon_table.protein_alphabet
+ return Seq(protein, alphabet)
+
+class UnknownSeq(Seq):
+ """A read-only sequence object of known length but unknown contents.
+
+ If you have an unknown sequence, you can represent this with a normal
+ Seq object, for example:
+
+ >>> my_seq = Seq("N"*5)
+ >>> my_seq
+ Seq('NNNNN', Alphabet())
+ >>> len(my_seq)
+ 5
+ >>> print my_seq
+ NNNNN
+
+ However, this is rather wasteful of memory (especially for large
+ sequences), which is where this class is most useful:
+
+ >>> unk_five = UnknownSeq(5)
+ >>> unk_five
+ UnknownSeq(5, alphabet = Alphabet(), character = '?')
+ >>> len(unk_five)
+ 5
+ >>> print unk_five
+ ?????
+
+ You can add unknown sequences together, provided their alphabets and
+ characters are compatible, and get another memory saving UnknownSeq:
+
+ >>> unk_four = UnknownSeq(4)
+ >>> unk_four
+ UnknownSeq(4, alphabet = Alphabet(), character = '?')
+ >>> unk_four + unk_five
+ UnknownSeq(9, alphabet = Alphabet(), character = '?')
+
+ If the alphabet or characters don't match up, the addition gives an
+ ordinary Seq object:
+
+ >>> unk_nnnn = UnknownSeq(4, character = "N")
+ >>> unk_nnnn
+ UnknownSeq(4, alphabet = Alphabet(), character = 'N')
+ >>> unk_nnnn + unk_four
+ Seq('NNNN????', Alphabet())
+
+ Combining with a real Seq gives a new Seq object:
+
+ >>> known_seq = Seq("ACGT")
+ >>> unk_four + known_seq
+ Seq('????ACGT', Alphabet())
+ >>> known_seq + unk_four
+ Seq('ACGT????', Alphabet())
+ """
+ def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) :
+ """Create a new UnknownSeq object.
+
+ If character is omitted, it is determined from the alphabet, "N" for
+ nucleotides, "X" for proteins, and "?" otherwise.
+ """
+ self._length = int(length)
+ if self._length < 0 :
+ #TODO - Block zero length UnknownSeq? You can just use a Seq!
+ raise ValueError("Length must not be negative.")
+ self.alphabet = alphabet
+ if character :
+ if len(character) != 1 :
+ raise ValueError("character argument should be a single letter string.")
+ self._character = character
+ else :
+ base = Alphabet._get_base_alphabet(alphabet)
+ #TODO? Check the case of the letters in the alphabet?
+ #We may have to use "n" instead of "N" etc.
+ if isinstance(base, Alphabet.NucleotideAlphabet) :
+ self._character = "N"
+ elif isinstance(base, Alphabet.ProteinAlphabet) :
+ self._character = "X"
+ else :
+ self._character = "?"
+
+ def __len__(self) :
+ """Returns the stated length of the unknown sequence."""
+ return self._length
+
+ def __str__(self) :
+ """Returns the unknown sequence as full string of the given length."""
+ return self._character * self._length
+
+ def __repr__(self):
+ return "UnknownSeq(%i, alphabet = %s, character = %s)" \
+ % (self._length, repr(self.alphabet), repr(self._character))
+
+ def __add__(self, other) :
+ if isinstance(other, UnknownSeq) \
+ and other._character == self._character :
+ #TODO - Check the alphabets match
+ return UnknownSeq(len(self)+len(other),
+ self.alphabet, self._character)
+ #Offload to the base class...
+ return Seq(str(self), self.alphabet) + other
+
+ def __radd__(self, other) :
+ if isinstance(other, UnknownSeq) \
+ and other._character == self._character :
+ #TODO - Check the alphabets match
+ return UnknownSeq(len(self)+len(other),
+ self.alphabet, self._character)
+ #Offload to the base class...
+ return other + Seq(str(self), self.alphabet)
+
+ def __getitem__(self, index):
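+ """Get a subsequence from the UnknownSeq object.
+
+ >>> unk = UnknownSeq(8, character="N")
+ >>> unk[3]
+ 'N'
+ >>> unk[2:5]
+ UnknownSeq(3, alphabet = Alphabet(), character = 'N')
+ """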
+ if isinstance(index, int) :
+ #TODO - Check the bounds without wasting memory
+ return str(self)[index]
+ else :
+ #TODO - Work out the length without wasting memory
+ return UnknownSeq(len(("#"*self._length)[index]),
+ self.alphabet, self._character)
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string (and Seq object) method of the
+ same name, which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ >>> "NNNN".count("N")
+ 4
+ >>> Seq("NNNN").count("N")
+ 4
+ >>> UnknownSeq(4, character="N").count("N")
+ 4
+ >>> UnknownSeq(4, character="N").count("A")
+ 0
+ >>> UnknownSeq(4, character="N").count("AA")
+ 0
+
+ HOWEVER, please note that because python strings and Seq objects (and
+ MutableSeq objects) do a non-overlapping count, this may not give
+ the answer you expect:
+
+ >>> UnknownSeq(4, character="N").count("NN")
+ 2
+ >>> UnknownSeq(4, character="N").count("NNN")
+ 1
+ """
+ sub_str = self._get_seq_str_and_check_alphabet(sub)
+ if len(sub_str) == 1 :
+ if str(sub_str) == self._character :
+ if start==0 and end >= self._length :
+ return self._length
+ else :
+ #This could be done more cleverly...
+ return str(self).count(sub_str, start, end)
+ else :
+ return 0
+ else :
+ if set(sub_str) == set(self._character) :
+ if start==0 and end >= self._length :
+ return self._length // len(sub_str)
+ else :
+ #This could be done more cleverly...
+ return str(self).count(sub_str, start, end)
+ else :
+ return 0
+
+ def complement(self) :
+ """The complement of an unknown nucleotide equals itself.
+
+ >>> my_nuc = UnknownSeq(8)
+ >>> my_nuc
+ UnknownSeq(8, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc
+ ????????
+ >>> my_nuc.complement()
+ UnknownSeq(8, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc.complement()
+ ????????
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ return self
+
+ def reverse_complement(self) :
+ """The reverse complement of an unknown nucleotide equals itself.
+
+ >>> my_nuc = UnknownSeq(10)
+ >>> my_nuc
+ UnknownSeq(10, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc
+ ??????????
+ >>> my_nuc.reverse_complement()
+ UnknownSeq(10, alphabet = Alphabet(), character = '?')
+ >>> print my_nuc.reverse_complement()
+ ??????????
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ return self
+
+ def transcribe(self) :
+ """Returns unknown RNA sequence from an unknown DNA sequence.
+
+ >>> my_dna = UnknownSeq(10, character="N")
+ >>> my_dna
+ UnknownSeq(10, alphabet = Alphabet(), character = 'N')
+ >>> print my_dna
+ NNNNNNNNNN
+ >>> my_rna = my_dna.transcribe()
+ >>> my_rna
+ UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
+ >>> print my_rna
+ NNNNNNNNNN
+ """
+ #Offload the alphabet stuff
+ s = Seq(self._character, self.alphabet).transcribe()
+ return UnknownSeq(self._length, s.alphabet, self._character)
+
+ def back_transcribe(self) :
+ """Returns unknown DNA sequence from an unknown RNA sequence.
+
+ >>> my_rna = UnknownSeq(20, character="N")
+ >>> my_rna
+ UnknownSeq(20, alphabet = Alphabet(), character = 'N')
+ >>> print my_rna
+ NNNNNNNNNNNNNNNNNNNN
+ >>> my_dna = my_rna.back_transcribe()
+ >>> my_dna
+ UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
+ >>> print my_dna
+ NNNNNNNNNNNNNNNNNNNN
+ """
+ #Offload the alphabet stuff
+ s = Seq(self._character, self.alphabet).back_transcribe()
+ return UnknownSeq(self._length, s.alphabet, self._character)
+
+ def translate(self, **kwargs) :
+ """Translate an unknown nucleotide sequence into an unknown protein.
+
+ e.g.
+
+ >>> my_seq = UnknownSeq(11, character="N")
+ >>> print my_seq
+ NNNNNNNNNNN
+ >>> my_protein = my_seq.translate()
+ >>> my_protein
+ UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
+ >>> print my_protein
+ XXX
+
+ In comparison, using a normal Seq object:
+
+ >>> my_seq = Seq("NNNNNNNNNNN")
+ >>> print my_seq
+ NNNNNNNNNNN
+ >>> my_protein = my_seq.translate()
+ >>> my_protein
+ Seq('XXX', ExtendedIUPACProtein())
+ >>> print my_protein
+ XXX
+
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins cannot be translated!")
+ return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
+
+
+class MutableSeq(object):
+ """An editable sequence object (with an alphabet).
+
+ Unlike normal python strings and our basic sequence object (the Seq class)
+ which are immutable, the MutableSeq lets you edit the sequence in place.
+ However, this means you cannot use a MutableSeq object as a dictionary key.
+
+ >>> from Bio.Seq import MutableSeq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
+ >>> my_seq
+ MutableSeq('ACTCGTCGTCG', DNAAlphabet())
+ >>> my_seq[5]
+ 'T'
+ >>> my_seq[5] = "A"
+ >>> my_seq
+ MutableSeq('ACTCGACGTCG', DNAAlphabet())
+ >>> my_seq[5]
+ 'A'
+ >>> my_seq[5:8] = "NNN"
+ >>> my_seq
+ MutableSeq('ACTCGNNNTCG', DNAAlphabet())
+ >>> len(my_seq)
+ 11
+
+ Note that the MutableSeq object does not support as many string-like
+ or biological methods as the Seq object.
+ """
+ def __init__(self, data, alphabet = Alphabet.generic_alphabet):
+ if type(data) == type(""):
+ self.data = array.array("c", data)
+ else:
+ self.data = data # assumes the input is an array
+ self.alphabet = alphabet
+
+ def __repr__(self):
+ """Returns a (truncated) representation of the sequence for debugging."""
+ if len(self) > 60 :
+ #Shows the last three letters as it is often useful to see if there
+ #is a stop codon at the end of a sequence.
+ #Note total length is 54+3+3=60
+ return "%s('%s...%s', %s)" % (self.__class__.__name__,
+ str(self[:54]), str(self[-3:]),
+ repr(self.alphabet))
+ else :
+ return "%s('%s', %s)" % (self.__class__.__name__,
+ str(self),
+ repr(self.alphabet))
+
+ def __str__(self):
+ """Returns the full sequence as a python string.
+
+ Note that Biopython 1.44 and earlier would give a truncated
+ version of repr(my_seq) for str(my_seq). If you are writing code
+ which needs to be backwards compatible with old Biopython, you
+ should continue to use my_seq.tostring() rather than str(my_seq).
+ """
+ #See test_GAQueens.py for an historic usage of a non-string alphabet!
+ return "".join(self.data)
+
+ def __cmp__(self, other):
+ """Compare the sequence for to another sequence or a string.
+
+ If compared to another sequence the alphabets must be compatible.
+ Comparing DNA to RNA, or Nucleotide to Protein will raise an
+ exception.
+
+ Otherwise only the sequence itself is compared, not the precise
+ alphabet.
+
+ This method indirectly supports ==, < , etc."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Comparing the arrays supports this.
+ return cmp(self.data, other.data)
+ else :
+ return cmp(str(self), str(other))
+ elif isinstance(other, basestring) :
+ return cmp(str(self), other)
+ else :
+ raise TypeError
+
+ def __len__(self): return len(self.data)
+
+ def __getitem__(self, index) :
+ #Note since Python 2.0, __getslice__ is deprecated
+ #and __getitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Return a single letter as a string
+ return self.data[index]
+ else :
+ #Return the (sub)sequence as another Seq object
+ return MutableSeq(self.data[index], self.alphabet)
+
+ def __setitem__(self, index, value):
+ #Note since Python 2.0, __setslice__ is deprecated
+ #and __setitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+ if isinstance(index, int) :
+ #Replacing a single letter with a new string
+ self.data[index] = value
+ else :
+ #Replacing a sub-sequence
+ if isinstance(value, MutableSeq):
+ self.data[index] = value.data
+ elif isinstance(value, type(self.data)):
+ self.data[index] = value
+ else:
+ self.data[index] = array.array("c", str(value))
+
+ def __delitem__(self, index):
+ #Note since Python 2.0, __delslice__ is deprecated
+ #and __delitem__ is used instead.
+ #See http://docs.python.org/ref/sequence-methods.html
+
+ #Could be deleting a single letter, or a slice
+ del self.data[index]
+
+ def __add__(self, other):
+ """Add another sequence or string to this sequence.
+
+ Returns a new MutableSeq object."""
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Adding the arrays should support this.
+ return self.__class__(self.data + other.data, a)
+ else :
+ return self.__class__(str(self) + str(other), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(self) + str(other), self.alphabet)
+ else :
+ raise TypeError
+
+ def __radd__(self, other):
+ if hasattr(other, "alphabet") :
+ #other should be a Seq or a MutableSeq
+ if not Alphabet._check_type_compatible([self.alphabet,
+ other.alphabet]) :
+ raise TypeError("Incompatable alphabets %s and %s" \
+ % (repr(self.alphabet), repr(other.alphabet)))
+ #They should be the same sequence type (or one of them is generic)
+ a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+ if isinstance(other, MutableSeq):
+ #See test_GAQueens.py for an historic usage of a non-string
+ #alphabet! Adding the arrays should support this.
+ return self.__class__(other.data + self.data, a)
+ else :
+ return self.__class__(str(other) + str(self), a)
+ elif isinstance(other, basestring) :
+ #other is a plain string - use the current alphabet
+ return self.__class__(str(other) + str(self), self.alphabet)
+ else :
+ raise TypeError
+
+ def append(self, c):
+ """Add a single letter to the end of the sequence."""
+ self.data.append(c)
+
+ def insert(self, i, c):
+ """Insert a single letter before the given position."""
+ self.data.insert(i, c)
+
+ def pop(self, i = -1):
+ """Remove and return the letter at the given position (default the last)."""
+ c = self.data[i]
+ del self.data[i]
+ return c
+
+ def remove(self, item):
+ """Remove the first occurrence of the given letter from the sequence."""
+ for i in range(len(self.data)):
+ if self.data[i] == item:
+ del self.data[i]
+ return
+ raise ValueError("MutableSeq.remove(x): x not in list")
+
+ def count(self, sub, start=0, end=sys.maxint):
+ """Non-overlapping count method, like that of a python string.
+
+ This behaves like the python string method of the same name,
+ which does a non-overlapping count!
+
+ Returns an integer, the number of occurrences of substring
+ argument sub in the (sub)sequence given by [start:end].
+ Optional arguments start and end are interpreted as in slice
+ notation.
+
+ Arguments:
+ - sub - a string or another Seq object to look for
+ - start - optional integer, slice start
+ - end - optional integer, slice end
+
+ e.g.
+
+ >>> from Bio.Seq import MutableSeq
+ >>> my_mseq = MutableSeq("AAAATGA")
+ >>> print my_mseq.count("A")
+ 5
+ >>> print my_mseq.count("ATG")
+ 1
+ >>> print my_mseq.count(Seq("AT"))
+ 1
+ >>> print my_mseq.count("AT", 2, -1)
+ 1
+
+ HOWEVER, please note that because python strings, Seq objects and
+ MutableSeq objects do a non-overlapping count, this may not give
+ the answer you expect:
+
+ >>> "AAAA".count("AA")
+ 2
+ >>> print MutableSeq("AAAA").count("AA")
+ 2
+
+ An overlapping search would give the answer as three!
+ """
+ try :
+ #TODO - Should we check the alphabet?
+ search = sub.tostring()
+ except AttributeError :
+ search = sub
+
+ if not isinstance(search, basestring) :
+ raise TypeError("expected a string, Seq or MutableSeq")
+
+ if len(search) == 1 :
+ #Try and be efficient and work directly from the array.
+ count = 0
+ for c in self.data[start:end]:
+ if c == search: count += 1
+ return count
+ else :
+ #TODO - Can we do this more efficiently?
+ return self.tostring().count(search, start, end)
+
+ def index(self, item):
+ """Return the index of the first occurrence of the given letter."""
+ for i in range(len(self.data)):
+ if self.data[i] == item:
+ return i
+ raise ValueError("MutableSeq.index(x): x not in list")
+
+ def reverse(self):
+ """Modify the mutable sequence to reverse itself.
+
+ No return value.
+ """
+ self.data.reverse()
+
+ def complement(self):
+ """Modify the mutable sequence to take on its complement.
+
+ Trying to complement a protein sequence raises an exception.
+
+ No return value.
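+
+ For example:
+
+ >>> from Bio.Seq import MutableSeq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_seq = MutableSeq("ACTGN", generic_dna)
+ >>> my_seq.complement()
+ >>> print my_seq
+ TGACN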
+ """
+ if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+ Alphabet.ProteinAlphabet) :
+ raise ValueError("Proteins do not have complements!")
+ if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
+ d = ambiguous_dna_complement
+ elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
+ d = ambiguous_rna_complement
+ elif 'U' in self.data and 'T' in self.data :
+ #TODO - Handle this cleanly?
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in self.data:
+ d = ambiguous_rna_complement
+ else:
+ d = ambiguous_dna_complement
+ c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
+ d.update(c)
+ self.data = map(lambda c: d[c], self.data)
+ self.data = array.array('c', self.data)
+
+ def reverse_complement(self):
+ """Modify the mutable sequence to take on its reverse complement.
+
+ Trying to reverse complement a protein sequence raises an exception.
+
+ No return value.
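+
+ For example:
+
+ >>> from Bio.Seq import MutableSeq
+ >>> from Bio.Alphabet import generic_dna
+ >>> my_seq = MutableSeq("ACTGN", generic_dna)
+ >>> my_seq.reverse_complement()
+ >>> print my_seq
+ NCAGT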
+ """
+ self.complement()
+ self.data.reverse()
+
+ ## Sorting a sequence makes no sense.
+ # def sort(self, *args): self.data.sort(*args)
+
+ def extend(self, other):
+ """Add a string, Seq or MutableSeq to the end of the sequence."""
+ if isinstance(other, MutableSeq):
+ for c in other.data:
+ self.data.append(c)
+ else:
+ for c in other:
+ self.data.append(c)
+
+ def tostring(self):
+ """Returns the full sequence as a python string.
+
+ Although not formally deprecated, you are now encouraged to use
+ str(my_seq) instead of my_seq.tostring().
+
+ Because str(my_seq) will give you the full sequence as a python string,
+ there is often no need to make an explicit conversion. For example,
+
+ print "ID={%s}, sequence={%s}" % (my_name, my_seq)
+
+ On Biopython 1.44 or older you would have to have done this:
+
+ print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
+ """
+ return "".join(self.data)
+
+ def toseq(self):
+ """Returns the full sequence as a new immutable Seq object.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.Alphabet import IUPAC
+ >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
+ IUPAC.protein)
+ >>> my_mseq
+ MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+ >>> my_mseq.toseq()
+ Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+
+ Note that the alphabet is preserved.
+ """
+ return Seq("".join(self.data), self.alphabet)
+
+# The transcribe, back_transcribe, and translate functions are
+# user-friendly versions of the corresponding functions in Bio.Transcribe
+# and Bio.Translate. The functions work both on Seq objects, and on strings.
+
+def transcribe(dna):
+ """Transcribes a DNA sequence into RNA.
+
+ If given a string, returns a new string object.
+
+ Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
+
+ Trying to transcribe a protein or RNA sequence raises an exception.
+
+ e.g.
+
+ >>> transcribe("ACTGN")
+ 'ACUGN'
+ """
+ if isinstance(dna, Seq) :
+ return dna.transcribe()
+ elif isinstance(dna, MutableSeq):
+ return dna.toseq().transcribe()
+ else:
+ return dna.replace('T','U').replace('t','u')
+
+def back_transcribe(rna):
+ """Back-transcribes an RNA sequence into DNA.
+
+ If given a string, returns a new string object.
+
+ Given a Seq or MutableSeq, returns a new Seq object with a DNA alphabet.
+
+ Trying to back-transcribe a protein or DNA sequence raises an exception.
+
+ e.g.
+
+ >>> back_transcribe("ACUGN")
+ 'ACTGN'
+ """
+ if isinstance(rna, Seq) :
+ return rna.back_transcribe()
+ elif isinstance(rna, MutableSeq):
+ return rna.toseq().back_transcribe()
+ else:
+ return rna.replace('U','T').replace('u','t')
+
+def _translate_str(sequence, table, stop_symbol="*",
+ to_stop=False, pos_stop="X") :
+ """Helper function to translate a nucleotide string (PRIVATE).
+
+ Arguments:
+ - sequence - a string
+ - table - a CodonTable object (NOT a table name or id number)
+ - stop_symbol - a single character string, what to use for terminators.
+ - to_stop - boolean, should translation terminate at the first
+ in frame stop codon? If there is no in-frame stop codon
+ then translation continues to the end.
+ - pos_stop - a single character string for a possible stop codon
+ (e.g. TAN or NNN)
+
+ Returns a string.
+
+ e.g.
+
+ >>> from Bio.Data import CodonTable
+ >>> table = CodonTable.ambiguous_dna_by_id[1]
+ >>> _translate_str("AAA", table)
+ 'K'
+ >>> _translate_str("TAR", table)
+ '*'
+ >>> _translate_str("TAN", table)
+ 'X'
+ >>> _translate_str("TAN", table, pos_stop="@")
+ '@'
+ >>> _translate_str("TA?", table)
+ Traceback (most recent call last):
+ ...
+ TranslationError: Codon 'TA?' is invalid
+ """
+ sequence = sequence.upper()
+ amino_acids = []
+ forward_table = table.forward_table
+ stop_codons = table.stop_codons
+ if table.nucleotide_alphabet.letters is not None :
+ valid_letters = set(table.nucleotide_alphabet.letters.upper())
+ else :
+ #Assume the worst case, ambiguous DNA or RNA:
+ valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
+ IUPAC.ambiguous_rna.letters.upper())
+
+ n = len(sequence)
+ for i in xrange(0,n-n%3,3) :
+ codon = sequence[i:i+3]
+ try :
+ amino_acids.append(forward_table[codon])
+ except (KeyError, CodonTable.TranslationError) :
+ #TODO? Treat "---" as a special case (gapped translation)
+ if codon in stop_codons :
+ if to_stop : break
+ amino_acids.append(stop_symbol)
+ elif valid_letters.issuperset(set(codon)) :
+ #Possible stop codon (e.g. NNN or TAN)
+ amino_acids.append(pos_stop)
+ else :
+ raise CodonTable.TranslationError(\
+ "Codon '%s' is invalid" % codon)
+ return "".join(amino_acids)
+
+def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
+ """Translate a nucleotide sequence into amino acids.
+
+ If given a string, returns a new string object. Given a Seq or
+ MutableSeq, returns a Seq object with a protein alphabet.
+
+ Arguments:
+ - table - Which codon table to use? This can be either a name
+ (string) or an NCBI identifier (integer). Defaults
+ to the "Standard" table.
+ - stop_symbol - Single character string, what to use for any
+ terminators, defaults to the asterisk, "*".
+ - to_stop - Boolean, defaults to False meaning do a full
+ translation continuing on past any stop codons
+ (translated as the specified stop_symbol). If
+ True, translation is terminated at the first in
+ frame stop codon (and the stop_symbol is not
+ appended to the returned protein sequence).
+
+ A simple string example using the default (standard) genetic code:
+
+ >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
+ >>> translate(coding_dna)
+ 'VAIVMGR*KGAR*'
+ >>> translate(coding_dna, stop_symbol="@")
+ 'VAIVMGR@KGAR@'
+ >>> translate(coding_dna, to_stop=True)
+ 'VAIVMGR'
+
+ Now using NCBI table 2, where TGA is not a stop codon:
+
+ >>> translate(coding_dna, table=2)
+ 'VAIVMGRWKGAR*'
+ >>> translate(coding_dna, table=2, to_stop=True)
+ 'VAIVMGRWKGAR'
+
+ Note that if the sequence has no in-frame stop codon, then the to_stop
+ argument has no effect:
+
+ >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
+ >>> translate(coding_dna2)
+ 'VAIVMGR'
+ >>> translate(coding_dna2, to_stop=True)
+ 'VAIVMGR'
+
+ NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
+ or a stop codon. These are translated as "X". Any invalid codon
+ (e.g. "TA?" or "T-A") will throw a TranslationError.
+
+ NOTE - Does NOT support gapped sequences.
+
+ It will however translate either DNA or RNA.
+ """
+ if isinstance(sequence, Seq) :
+ return sequence.translate(table, stop_symbol, to_stop)
+ elif isinstance(sequence, MutableSeq):
+ #Return a Seq object
+ return sequence.toseq().translate(table, stop_symbol, to_stop)
+ else:
+ #Assume it's a string, return a string
+ try :
+ codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
+ except ValueError :
+ codon_table = CodonTable.ambiguous_generic_by_name[table]
+ return _translate_str(sequence, codon_table, stop_symbol, to_stop)
+
+def reverse_complement(sequence):
+ """Returns the reverse complement sequence of a nucleotide string.
+
+ If given a string, returns a new string object.
+ Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
+
+ Supports unambiguous and ambiguous nucleotide sequences.
+
+ e.g.
+
+ >>> reverse_complement("ACTG-NH")
+ 'DN-CAGT'
+ """
+ if isinstance(sequence, Seq) :
+ #Return a Seq
+ return sequence.reverse_complement()
+ elif isinstance(sequence, MutableSeq) :
+ #Return a Seq
+ #Don't use the MutableSeq reverse_complement method as it is 'in place'.
+ return sequence.toseq().reverse_complement()
+
+ #Assume it's a string.
+ #In order to avoid some code duplication, the old code would turn the string
+ #into a Seq, use the reverse_complement method, and convert back to a string.
+ #This worked, but is over five times slower on short sequences!
+ if ('U' in sequence or 'u' in sequence) \
+ and ('T' in sequence or 't' in sequence):
+ raise ValueError("Mixed RNA/DNA found")
+ elif 'U' in sequence or 'u' in sequence:
+ ttable = _rna_complement_table
+ else:
+ ttable = _dna_complement_table
+ return sequence.translate(ttable)[::-1]
+
+def _test():
+ """Run the Bio.Seq module's doctests."""
+ print "Runing doctests..."
+ import doctest
+ doctest.testmod()
+ print "Done"
+
+if __name__ == "__main__":
+ _test()
--- /dev/null
+"""Represent a Sequence Feature holding info about a part of a sequence.
+
+This is heavily modeled after the Biocorba SeqFeature objects, and
+may be pretty biased towards GenBank stuff since I'm writing it
+for the GenBank parser output...
+
+What's here:
+
+Base class to hold a Feature.
+----------------------------
+classes:
+o SeqFeature
+
+Hold information about a Reference.
+----------------------------------
+
+This is an attempt to create a General class to hold Reference type
+information.
+
+classes:
+o Reference
+
+Specify locations of a feature on a Sequence.
+---------------------------------------------
+
+This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
+much the same way as Biocorba. This has the advantages of allowing us
+to handle fuzzy stuff in case anyone needs it, and also be compatible
+with Biocorba.
+
+classes:
+o FeatureLocation - Specify the start and end location of a feature.
+
+o ExactPosition - Specify the position as being exact.
+o WithinPosition - Specify a position occurring within some range.
+o BetweenPosition - Specify a position occurring between two coordinates.
+o BeforePosition - Specify the position as being found before some base.
+o AfterPosition - Specify the position as being found after some base.
+"""
+
+class SeqFeature:
+ """Represent a Sequence Feature on an object.
+
+ Attributes:
+ o location - the location of the feature on the sequence
+ o type - the specified type of the feature (e.g. CDS, exon, repeat...)
+ o location_operator - a string specifying how this SeqFeature may
+ be related to others. For example, in the example GenBank feature
+ shown below, the location_operator would be "join"
+ o strand - A value specifying which strand (of a DNA sequence, for
+ instance) the feature is located on. 1 indicates the plus strand, -1
+ indicates the minus strand, 0 indicates both strands, and None indicates
+ that strand doesn't apply (i.e. for proteins) or is not known.
+ o id - A string identifier for the feature.
+ o ref - A reference to another sequence. This could be an accession
+ number for some different sequence.
+ o ref_db - A different database for the reference accession number.
+ o qualifiers - A dictionary of qualifiers on the feature. These are
+ analogous to the qualifiers from a GenBank feature table. The keys of
+ the dictionary are qualifier names, the values are the qualifier
+ values.
+ o sub_features - Additional SeqFeatures which fall under this 'parent'
+ feature. For instance, if we have something like:
+
+ CDS join(1..10,30..40,50..60)
+
+ Then the top level feature would be a CDS from 1 to 60, and the sub
+ features would be of 'CDS_join' type and would be from 1 to 10, 30 to
+ 40 and 50 to 60, respectively.
+ """
+ def __init__(self, location = None, type = '', location_operator = '',
+ strand = None, id = "<unknown id>",
+ qualifiers = {}, sub_features = [],
+ ref = None, ref_db = None):
+ """Initialize a SeqFeature on a Sequence.
+ """
+ self.location = location
+
+ self.type = type
+ self.location_operator = location_operator
+ self.strand = strand
+ self.id = id
+ # XXX right now sub_features and qualifiers cannot be set
+ # from the initializer because this causes all kinds
+ # of recursive import problems. I can't understand why this
+ # happens at all :-<
+ self.qualifiers = {}
+ self.sub_features = []
+ self.ref = ref
+ self.ref_db = ref_db
+
+ def __repr__(self):
+ """A string representation of the record for debugging."""
+ answer = "%s(%s" % (self.__class__, repr(self.location))
+ if self.type :
+ answer += ", type=%s" % repr(self.type)
+ if self.location_operator :
+ answer += ", location_operator=%s" % repr(self.location_operator)
+ if self.strand :
+ answer += ", strand=%s" % repr(self.strand)
+ if self.id and self.id != "<unknown id>" :
+ answer += ", id=%s" % repr(self.id)
+ if self.ref :
+ answer += ", ref=%s" % repr(self.ref)
+ if self.ref_db :
+ answer += ", ref_db=%s" % repr(self.ref_db)
+ answer += ")"
+ return answer
+
+ def __str__(self):
+ """A readable summary of the feature intended to be printed to screen.
+ """
+ out = "type: %s\n" % self.type
+ out += "location: %s\n" % self.location
+ out += "ref: %s:%s\n" % (self.ref, self.ref_db)
+ out += "strand: %s\n" % self.strand
+ out += "qualifiers: \n"
+ qualifier_keys = self.qualifiers.keys()
+ qualifier_keys.sort()
+ for qual_key in qualifier_keys:
+ out += "\tKey: %s, Value: %s\n" % (qual_key,
+ self.qualifiers[qual_key])
+ if len(self.sub_features) != 0:
+ out += "Sub-Features\n"
+ for sub_feature in self.sub_features:
+ out +="%s\n" % sub_feature
+
+ return out
+
+ def _shift(self, offset) :
+ """Returns a copy of the feature with its location shifted (PRIVATE).
+
+ The annotation qualifiers are copied."""
+ answer = SeqFeature(location = self.location._shift(offset),
+ type = self.type,
+ location_operator = self.location_operator,
+ strand = self.strand,
+ id = self.id,
+ #qualifiers = dict(self.qualifiers.iteritems()),
+ #sub_features = [f._shift(offset) for f in self.sub_features],
+ ref = self.ref,
+ ref_db = self.ref_db)
+ #TODO - Sort out the use of sub_features and qualifiers in __init__
+ answer.sub_features = [f._shift(offset) for f in self.sub_features]
+ answer.qualifiers = dict(self.qualifiers.iteritems())
+ return answer
+
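+#For example, a simple CDS feature on the forward strand (an
+#illustrative sketch; FeatureLocation is defined later in this module):
+#
+#    >>> f = SeqFeature(FeatureLocation(122, 150), type="CDS", strand=1)
+#    >>> print f.location
+#    [122:150]
+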
+# --- References
+
+# TODO -- Will this hold PubMed and Medline information decently?
+class Reference:
+ """Represent a Generic Reference object.
+
+ Attributes:
+ o location - A list of Location objects specifying regions of
+ the sequence that the reference corresponds to. If no locations are
+ specified, the entire sequence is assumed.
+ o authors - A big old string, or a list split by author, of authors
+ for the reference.
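+ o consrtm - Consortium the authors belong to.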
+ o title - The title of the reference.
+ o journal - Journal the reference was published in.
+ o medline_id - A medline reference for the article.
+ o pubmed_id - A pubmed reference for the article.
+ o comment - A place to stick any comments about the reference.
+ """
+ def __init__(self):
+ self.location = []
+ self.authors = ''
+ self.consrtm = ''
+ self.title = ''
+ self.journal = ''
+ self.medline_id = ''
+ self.pubmed_id = ''
+ self.comment = ''
+
+ def __str__(self):
+ """Output an informative string for debugging.
+ """
+ out = ""
+ for single_location in self.location:
+ out += "location: %s\n" % single_location
+ out += "authors: %s\n" % self.authors
+ if self.consrtm:
+ out += "consrtm: %s\n" % self.consrtm
+ out += "title: %s\n" % self.title
+ out += "journal: %s\n" % self.journal
+ out += "medline id: %s\n" % self.medline_id
+ out += "pubmed id: %s\n" % self.pubmed_id
+ out += "comment: %s\n" % self.comment
+
+ return out
+
+# --- Handling feature locations
+
+class FeatureLocation:
+ """Specify the location of a feature along a sequence.
+
+ This attempts to deal with fuzziness of position ends, but also
+ makes it easy to get the start and end in the 'normal' case (no
+ fuzziness).
+
+ You should access the start and end attributes with
+ your_location.start and your_location.end. If the start and
+ end are exact, this will return the positions; if not, we'll return
+ the appropriate Fuzzy class with info about the position and fuzziness.
+
+ Note that the start and end location numbering follow Python's scheme,
+ thus a GenBank entry of 123..150 (one based counting) becomes a location
+ of [122:150] (zero based counting).
+ """
+ def __init__(self, start, end):
+ """Specify the start and end of a sequence feature.
+
+ start and end arguments specify the values where the feature begins
+ and ends. These can either be any of the *Position objects that
+ inherit from AbstractPosition, or can just be integers specifying the
+ position. In the case of integers, the values are assumed to be
+ exact and are converted into ExactPosition objects. This is meant
+ to make it easy to deal with non-fuzzy ends.
+ """
+ if isinstance(start, AbstractPosition):
+ self._start = start
+ else:
+ self._start = ExactPosition(start)
+
+ if isinstance(end, AbstractPosition):
+ self._end = end
+ else:
+ self._end = ExactPosition(end)
+
+ def __str__(self):
+ """Returns a representation of the location (with python counting).
+
+ For the simple case this uses the python slicing syntax, [122:150]
+ (zero based counting) which GenBank would call 123..150 (one based
+ counting).
+ """
+ return "[%s:%s]" % (self._start, self._end)
+
+ def __repr__(self):
+ """A string representation of the location for debugging."""
+ return "%s(%s,%s)" \
+ % (self.__class__, repr(self.start), repr(self.end))
+
+ def _shift(self, offset) :
+ """Returns a copy of the location shifted by the offset (PRIVATE)."""
+ return FeatureLocation(start = self._start._shift(offset),
+ end = self._end._shift(offset))
+
+ def __getattr__(self, attr):
+ """Make it easy to get non-fuzzy starts and ends.
+
+ We override __getattr__ here so that in non-fuzzy cases we
+ can just return the start and end position without any hassle.
+
+ To get fuzzy start and ends, just ask for item.start and
+ item.end. To get non-fuzzy attributes (ie. the position only)
+ ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
+ the largest range of the fuzzy position. So something like:
+ (10.20)..(30.40) should return 10 for start, and 40 for end.
+
+ The special tricky case is when we have a single between position
+ like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
+ to give a reasonable approximation of what this really means, which
+ is an empty region between two bases -- so the same position for both.
+ Handling this as a special case is ugly, but there is really no
+ general rule you can apply to this.
+ """
+ #TODO - these are not currently implemented as properties, this means
+ #they do not show up via dir(...)
+ if attr == 'start':
+ return self._start
+ elif attr == 'end':
+ return self._end
+ elif attr == 'nofuzzy_start':
+ if ((self._start == self._end) and isinstance(self._start,
+ BetweenPosition)):
+ return self._start.position
+ else:
+ return min(self._start.position,
+ self._start.position + self._start.extension)
+ elif attr == 'nofuzzy_end':
+ if ((self._start == self._end) and isinstance(self._start,
+ BetweenPosition)):
+ return self._end.position
+ else:
+ return max(self._end.position,
+ self._end.position + self._end.extension)
+ else:
+ raise AttributeError("Cannot evaluate attribute %s." % attr)
+
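+#For example, mixing a fuzzy start with an exact end (an illustrative
+#sketch; WithinPosition and ExactPosition are defined below):
+#
+#    >>> loc = FeatureLocation(WithinPosition(10, 10), ExactPosition(40))
+#    >>> print loc
+#    [(10.20):40]
+#    >>> loc.nofuzzy_start, loc.nofuzzy_end
+#    (10, 40)
+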
+class AbstractPosition:
+ """Abstract base class representing a position.
+ """
+ def __init__(self, position, extension):
+ self.position = position
+ self.extension = extension
+
+ def __repr__(self) :
+ """String representation of the location for debugging."""
+ return "%s(%s,%s)" \
+ % (self.__class__, repr(self.position), repr(self.extension))
+
+ def __cmp__(self, other):
+ """A simple comparison function for positions.
+
+ This is very simple-minded and just compares the position attribute
+ of the features; extensions are not considered at all. This could
+ potentially be expanded to try to take advantage of extensions.
+ """
+ assert isinstance(other, AbstractPosition), \
+ "We can only do comparisons between Biopython Position objects."
+
+ return cmp(self.position, other.position)
+
+ def _shift(self, offset) :
+ #We want this to maintain the subclass when called from a subclass
+ return self.__class__(self.position + offset, self.extension)
+
+class ExactPosition(AbstractPosition):
+ """Specify the specific position of a boundary.
+
+ o position - The position of the boundary.
+ o extension - An optional argument which must be zero since we don't
+ have an extension. The argument is provided so that the same number of
+ arguments can be passed to all position types.
+
+ In this case, there is no fuzziness associated with the position.
+ """
+ def __init__(self, position, extension = 0):
+ if extension != 0:
+ raise AttributeError("Non-zero extension %s for exact position."
+ % extension)
+ AbstractPosition.__init__(self, position, 0)
+
+ def __repr__(self) :
+ """String representation of the ExactPosition location for debugging."""
+ assert self.extension == 0
+ return "%s(%s)" % (self.__class__, repr(self.position))
+
+ def __str__(self):
+ return str(self.position)
+
+class WithinPosition(AbstractPosition):
+ """Specify the position of a boundary within some coordinates.
+
+ Arguments:
+ o position - The start position of the boundary
+ o extension - The range to which the boundary can extend.
+
+ This allows dealing with a position like ((1.4)..100). This
+ indicates that the start of the sequence is somewhere between 1
+ and 4. To represent that with this class we would set position as
+ 1 and extension as 3.
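+
+ For example:
+
+ >>> str(WithinPosition(1, 3))
+ '(1.4)'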
+ """
+ def __init__(self, position, extension = 0):
+ AbstractPosition.__init__(self, position, extension)
+
+ def __str__(self):
+ return "(%s.%s)" % (self.position, self.position + self.extension)
+
+class BetweenPosition(AbstractPosition):
+ """Specify the position of a boundary between two coordinates.
+
+ Arguments:
+ o position - The start position of the boundary.
+ o extension - The range to the other position of a boundary.
+
+ This specifies a coordinate which is found between the two positions.
+ So this allows us to deal with a position like ((1^2)..100). To
+ represent that with this class we set position as 1 and the
+ extension as 1.
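+
+ For example:
+
+ >>> str(BetweenPosition(1, 1))
+ '(1^2)'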
+ """
+ def __init__(self, position, extension = 0):
+ AbstractPosition.__init__(self, position, extension)
+
+ def __str__(self):
+ return "(%s^%s)" % (self.position, self.position + self.extension)
+
+class BeforePosition(AbstractPosition):
+ """Specify a position where the actual location occurs before it.
+
+ Arguments:
+ o position - The upper boundary of where the location can occur.
+ o extension - An optional argument which must be zero since we don't
+ have an extension. The argument is provided so that the same number of
+ arguments can be passed to all position types.
+
+ This is used to specify positions like (<10..100) where the location
+ occurs somewhere before position 10.
+ """
+ def __init__(self, position, extension = 0):
+ if extension != 0:
+ raise AttributeError("Non-zero extension %s for exact position."
+ % extension)
+ AbstractPosition.__init__(self, position, 0)
+
+ def __repr__(self) :
+ """A string representation of the location for debugging."""
+ assert self.extension == 0
+ return "%s(%s)" % (self.__class__, repr(self.position))
+
+ def __str__(self):
+ return "<%s" % self.position
+
+class AfterPosition(AbstractPosition):
+ """Specify a position where the actual location is found after it.
+
+ Arguments:
+ o position - The lower boundary of where the location can occur.
+ o extension - An optional argument which must be zero since we don't
+ have an extension. The argument is provided so that the same number of
+ arguments can be passed to all position types.
+
+ This is used to specify positions like (>10..100) where the location
+ occurs somewhere after position 10.
+ """
+ def __init__(self, position, extension = 0):
+ if extension != 0:
+ raise AttributeError("Non-zero extension %s for exact position."
+ % extension)
+ AbstractPosition.__init__(self, position, 0)
+
+ def __repr__(self) :
+ """A string representation of the location for debugging."""
+ assert self.extension == 0
+ return "%s(%s)" % (self.__class__, repr(self.position))
+
+ def __str__(self):
+ return ">%s" % self.position
+
+class OneOfPosition(AbstractPosition):
+ """Specify a position where the location can be multiple positions.
+
+ This models the GenBank 'one-of(1888,1901)' function, and tries
+ to make this fit within the Biopython Position models. In our case
+ the position of the "one-of" is set as the lowest choice, and the
+ extension is the range to the highest choice.
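+
+ For example, one-of(1888,1901) would give a position of 1888 and
+ an extension of 13:
+
+ >>> p = OneOfPosition([ExactPosition(1888), ExactPosition(1901)])
+ >>> p.position, p.extension
+ (1888, 13)
+ >>> str(p)
+ 'one-of(1888,1901)'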
+ """
+ def __init__(self, position_list):
+ """Initialize with a set of posssible positions.
+
+ position_list is a list of AbstractPosition derived objects,
+ specifying possible locations.
+ """
+ # unique attribute for this type of position
+ self.position_choices = position_list
+ # find the smallest and largest position in the choices
+ smallest = None
+ largest = None
+ for position_choice in self.position_choices:
+ assert isinstance(position_choice, AbstractPosition), \
+ "Expected position objects, got %r" % position_choice
+ if smallest is None and largest is None:
+ smallest = position_choice.position
+ largest = position_choice.position
+ elif position_choice.position > largest:
+ largest = position_choice.position
+ elif position_choice.position < smallest:
+ smallest = position_choice.position
+ # initialize with our definition of position and extension
+ AbstractPosition.__init__(self, smallest, largest - smallest)
+
+ def __repr__(self) :
+ """String representation of the OneOfPosition location for debugging."""
+ return "%s(%s)" % (self.__class__, repr(self.position_choices))
+
+ def __str__(self):
+ out = "one-of("
+ for position in self.position_choices:
+ out += "%s," % position
+ # replace the last comma with the closing parenthesis
+ out = out[:-1] + ")"
+ return out
+
+class PositionGap:
+ """Simple class to hold information about a gap between positions.
+ """
+ def __init__(self, gap_size):
+ """Intialize with a position object containing the gap information.
+ """
+ self.gap_size = gap_size
+
+ def __repr__(self) :
+ """A string representation of the position gap for debugging."""
+ return "%s(%s)" % (self.__class__, repr(self.gap_size))
+
+ def __str__(self):
+ out = "gap(%s)" % self.gap_size
+ return out
--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "ace" file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.Sequencing.Ace module which offers more than just accessing
+the contig consensus sequences in an ACE file as SeqRecord objects."""
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
+from Bio.Sequencing import Ace
+
+#This is a generator function!
+def AceIterator(handle) :
+ """Returns SeqRecord objects from an ACE file.
+
+ This uses the Bio.Sequencing.Ace module to do the hard work. Note that
+ by iterating over the file in a single pass, we are forced to ignore any
+ WA, CT, RT or WR footer tags."""
+
+ for ace_contig in Ace.parse(handle) :
+ #Convert the ACE contig record into a SeqRecord...
+ consensus_seq_str = ace_contig.sequence
+ #Assume it's DNA unless there is a U in it,
+ if "U" in consensus_seq_str :
+ if "T" in consensus_seq_str :
+ #Very odd! Error?
+ alpha = generic_nucleotide
+ else :
+ alpha = generic_rna
+ else :
+ alpha = generic_dna
+
+ if "*" in consensus_seq_str :
+ #For consistency with most other file formats, map
+ #any * gaps onto - gaps.
+ assert "-" not in consensus_seq_str
+ consensus_seq = Seq(consensus_seq_str.replace("*","-"),
+ Gapped(alpha, gap_char="-"))
+ else :
+ consensus_seq = Seq(consensus_seq_str, alpha)
+
+ #TODO - Consensus base quality (BQ lines). Note that any gaps
+ #(* character) in the consensus do not get a quality entry.
+ #This really needs Biopython support for per-letter-annotation.
+
+ #TODO? - Base segments (BS lines) which indicate which read
+ #phrap has chosen to be the consensus at a particular position.
+ #Perhaps as SeqFeature objects?
+
+ #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
+ #Perhaps as SeqFeature objects?
+
+ seq_record = SeqRecord(consensus_seq,
+ id = ace_contig.name,
+ name = ace_contig.name)
+ yield seq_record
+ #All done
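+
+#Example usage via Bio.SeqIO (an illustrative sketch; "contigs.ace" is a
+#hypothetical filename):
+#
+#    from Bio import SeqIO
+#    for record in SeqIO.parse(open("contigs.ace"), "ace") :
+#        print record.id, len(record)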
--- /dev/null
+# Copyright 2006-2009 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing FASTA format files as SeqRecord
+# objects. The code is partly inspired by earlier Biopython modules,
+# Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA
+
+"""Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
+
+You are expected to use this module via the Bio.SeqIO functions."""
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Interfaces import SequentialSequenceWriter
+
+#This is a generator function!
+def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
+ """Generator function to iterate over Fasta records (as SeqRecord objects).
+
+ handle - input file
+ alphabet - optional alphabet
+ title2ids - A function that, when given the title line of a FASTA
+ record (without the beginning >), will return the id, name and
+ description (in that order) for the record as a tuple of strings.
+
+ If this is not given, then the entire title line will be used
+ as the description, and the first word as the id and name.
+
+ Note that use of title2ids matches that of Bio.Fasta.SequenceParser
+ but the defaults are slightly different.
+ """
+ #Skip any text before the first record (e.g. blank lines, comments)
+ while True :
+ line = handle.readline()
+ if line == "" : return #Premature end of file, or just empty?
+ if line[0] == ">" :
+ break
+
+ while True :
+ if line[0]!=">" :
+ raise ValueError("Records in Fasta files should start with '>' character")
+ if title2ids :
+ id, name, descr = title2ids(line[1:].rstrip())
+ else :
+ descr = line[1:].rstrip()
+ id = descr.split()[0]
+ name = id
+
+ lines = []
+ line = handle.readline()
+ while True:
+ if not line : break
+ if line[0] == ">": break
+ #Remove trailing whitespace, and any internal spaces
+ #(and any embedded \r which are possible in mangled files
+ #when not opened in universal read lines mode)
+ lines.append(line.rstrip().replace(" ","").replace("\r",""))
+ line = handle.readline()
+
+ #Return the record and then continue...
+ yield SeqRecord(Seq("".join(lines), alphabet),
+ id = id, name = name, description = descr)
+
+ if not line : return #StopIteration
+ assert False, "Should not reach this line"
+
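+#Example usage via Bio.SeqIO (an illustrative sketch; the filenames are
+#hypothetical):
+#
+#    from Bio import SeqIO
+#    records = SeqIO.parse(open("example.fasta"), "fasta")
+#    SeqIO.write(records, open("copy.fasta", "w"), "fasta")
+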
+class FastaWriter(SequentialSequenceWriter):
+ """Class to write Fasta format files."""
+ def __init__(self, handle, wrap=60, record2title=None):
+ """Create a Fasta writer.
+
+ handle - Handle to an output file, e.g. as returned
+ by open(filename, "w")
+ wrap - Optional line length used to wrap sequence lines.
+ Defaults to wrapping the sequence at 60 characters.
+ Use zero (or None) for no wrapping, giving a single
+ long line for the sequence.
+ record2title - Optional function to return the text to be
+ used for the title line of each record. By default
+ a combination of the record.id and record.description
+ is used. If the record.description starts with the
+ record.id, then just the record.description is used.
+
+ You can either use:
+
+ myWriter = FastaWriter(open(filename,"w"))
+ myWriter.write_file(myRecords)
+
+ Or, follow the sequential file writer system, for example:
+
+ myWriter = FastaWriter(open(filename,"w"))
+ myWriter.write_header() # does nothing for Fasta files
+ ...
+ Multiple calls to myWriter.write_record() and/or myWriter.write_records()
+ ...
+ myWriter.write_footer() # does nothing for Fasta files
+ myWriter.close()
+ """
+ SequentialSequenceWriter.__init__(self, handle)
+ self.wrap = None
+ if wrap :
+ if wrap < 1 :
+ raise ValueError("wrap should be None, zero, or a positive integer")
+ self.wrap = wrap
+ self.record2title = record2title
+
+ def write_record(self, record):
+ """Write a single Fasta record to the file."""
+ assert self._header_written
+ assert not self._footer_written
+ self._record_written = True
+
+ if self.record2title :
+ title=self.clean(self.record2title(record))
+ else :
+ id = self.clean(record.id)
+ description = self.clean(record.description)
+
+ #if description[:len(id)]==id :
+ if description and description.split(None,1)[0]==id :
+ #The description includes the id at the start
+ title = description
+ else :
+ title = "%s %s" % (id, description)
+
+ assert "\n" not in title
+ assert "\r" not in title
+ self.handle.write(">%s\n" % title)
+
+ data = self._get_seq_string(record) #Catches sequence being None
+
+ assert "\n" not in data
+ assert "\r" not in data
+
+ if self.wrap :
+ for i in range(0, len(data), self.wrap):
+ self.handle.write(data[i:i+self.wrap] + "\n")
+ else :
+ self.handle.write(data + "\n")
+
+if __name__ == "__main__" :
+ print "Running quick self test"
+
+ import os
+ from Bio.Alphabet import generic_protein, generic_nucleotide
+
+ #Download the files from here:
+ #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
+ fna_filename = "NC_005213.fna"
+ faa_filename = "NC_005213.faa"
+
+ def genbank_name_function(text) :
+ text, descr = text.split(None,1)
+ id = text.split("|")[3]
+ name = id.split(".",1)[0]
+ return id, name, descr
+
+ def print_record(record) :
+ #See also bug 2057
+ #http://bugzilla.open-bio.org/show_bug.cgi?id=2057
+ print "ID:" + record.id
+ print "Name:" + record.name
+ print "Descr:" + record.description
+ print record.seq
+ for feature in record.annotations :
+ print '/%s=%s' % (feature, record.annotations[feature])
+ if record.dbxrefs :
+ print "Database cross references:"
+ for x in record.dbxrefs : print " - %s" % x
+
+ if os.path.isfile(fna_filename) :
+ print "--------"
+ print "FastaIterator (single sequence)"
+ iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
+ count=0
+ for record in iterator :
+ count=count+1
+ print_record(record)
+ assert count == 1
+ print str(record.__class__)
+
+ if os.path.isfile(faa_filename) :
+ print "--------"
+ print "FastaIterator (multiple sequences)"
+ iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
+ count=0
+ for record in iterator :
+ count=count+1
+ print_record(record)
+ break
+ assert count>0
+ print str(record.__class__)
+
+ from cStringIO import StringIO
+ print "--------"
+ print "FastaIterator (empty input file)"
+ #Just to make sure no errors happen
+ iterator = FastaIterator(StringIO(""))
+ count = 0
+ for record in iterator :
+ count = count+1
+ assert count==0
+
+ print "Done"
--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing IntelliGenetics format files as
+# SeqRecord objects. This file format appears to be the same as the MASE
+# multiple sequence alignment format.
+
+"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
+
+You are expected to use this module via the Bio.SeqIO functions."""
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+#This is a generator function!
+def IgIterator(handle, alphabet = single_letter_alphabet) :
+ """Iterate over IntelliGenetics records (as SeqRecord objects).
+
+ handle - input file
+ alphabet - optional alphabet
+
+ The optional free format file header lines (which start with two
+ semi-colons) are ignored.
+
+ The free format commentary lines at the start of each record (which
+ start with a semi-colon) are recorded as a single string with embedded
+ new line characters in the SeqRecord's annotations dictionary under the
+ key 'comment'.
+ """
+ #Skip any file header text before the first record (;; lines)
+ while True :
+ line = handle.readline()
+ if not line : break #Premature end of file, or just empty?
+ if not line.startswith(";;") : break
+
+ while line :
+ #Now iterate over the records
+ if line[0]!=";" :
+ raise ValueError( \
+ "Records should start with ';' and not:\n%s" % repr(line))
+
+ #Try and agree with SeqRecord convention from the GenBank parser,
+ #(and followed in the SwissProt parser) which stores the comments
+ #as a long string with newlines under annotations key 'comment'.
+
+ #Note some examples use "; ..." and others ";..."
+ comment_lines = []
+ while line.startswith(";") :
+ #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
+ comment_lines.append(line[1:].strip())
+ line = handle.readline()
+ title = line.rstrip()
+
+ seq_lines = []
+ while True:
+ line = handle.readline()
+ if not line : break
+ if line[0] == ";": break
+ #Remove trailing whitespace, and any internal spaces
+ seq_lines.append(line.rstrip().replace(" ",""))
+ seq_str = "".join(seq_lines)
+ if seq_str.endswith("1") :
+ #Remove the optional terminator (digit one)
+ seq_str = seq_str[:-1]
+ if "1" in seq_str :
+ raise ValueError("Potential terminator digit one found within sequence.")
+
+ #Return the record and then continue...
+ record = SeqRecord(Seq(seq_str, alphabet),
+ id = title, name = title)
+ record.annotations['comment'] = "\n".join(comment_lines)
+ yield record
+
+ #We should be at the end of the file now
+ assert not line
+
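+#Example usage via Bio.SeqIO (an illustrative sketch; "example.txt" is a
+#hypothetical filename):
+#
+#    from Bio import SeqIO
+#    for record in SeqIO.parse(open("example.txt"), "ig") :
+#        print record.id, record.annotations["comment"]
+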
+if __name__ == "__main__" :
+ print "Running quick self test"
+
+ import os
+ path = "../../Tests/IntelliGenetics/"
+ if os.path.isdir(path) :
+ for filename in os.listdir(path) :
+ if os.path.splitext(filename)[-1] == ".txt" :
+ print
+ print filename
+ print "-"*len(filename)
+ handle = open(os.path.join(path, filename))
+ for record in IgIterator(handle) :
+ print record.id, len(record)
+ handle.close()
+ print "Done"
+ else :
+ print "Could not find input files"
--- /dev/null
+# Copyright 2007-2009 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "genbank" and "embl" file formats.
+
+You are expected to use this module via the Bio.SeqIO functions.
+Note that internally this module calls Bio.GenBank to do the actual
+parsing of both GenBank and EMBL files.
+
+See also:
+
+International Nucleotide Sequence Database Collaboration
+http://www.insdc.org/
+
+GenBank
+http://www.ncbi.nlm.nih.gov/Genbank/
+
+EMBL Nucleotide Sequence Database
+http://www.ebi.ac.uk/embl/
+
+DDBJ (DNA Data Bank of Japan)
+http://www.ddbj.nig.ac.jp/
+"""
+
+from Bio.Seq import UnknownSeq
+from Bio.GenBank.Scanner import GenBankScanner, EmblScanner
+from Bio import Alphabet
+from Interfaces import SequentialSequenceWriter
+
+# NOTE
+# ====
+# The "brains" for parsing GenBank and EMBL files (and any
+# other flat file variants from the INSDC in future) is in
+# Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank)
+
+def GenBankIterator(handle) :
+ """Breaks up a Genbank file into SeqRecord objects.
+
+ Every section from the LOCUS line to the terminating // becomes
+ a single SeqRecord with associated annotation and features.
+
+ Note that for genomes or chromosomes, there is typically only
+ one record."""
+ #This calls a generator function:
+ return GenBankScanner(debug=0).parse_records(handle)
+
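+#Example usage via Bio.SeqIO (an illustrative sketch; "example.gbk" is a
+#hypothetical filename):
+#
+#    from Bio import SeqIO
+#    for record in SeqIO.parse(open("example.gbk"), "genbank") :
+#        print record.id, len(record.features)
+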
+def EmblIterator(handle) :
+ """Breaks up an EMBL file into SeqRecord objects.
+
+ Every section from the ID line to the terminating // becomes
+ a single SeqRecord with associated annotation and features.
+
+ Note that for genomes or chromosomes, there is typically only
+ one record."""
+ #This calls a generator function:
+ return EmblScanner(debug=0).parse_records(handle)
+
+def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) :
+ """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
+
+ Every section from the LOCUS line to the terminating // can contain
+ many CDS features. These are returned as SeqRecord objects, with the
+ stated amino acid translation as their sequence (if given).
+ """
+ #This calls a generator function:
+ return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
+
+def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) :
+ """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
+
+ Every section from the LOCUS line to the terminating // can contain
+ many CDS features. These are returned as with the stated amino acid
+ translation sequence (if given).
+ """
+ #This calls a generator function:
+ return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
+
+class GenBankWriter(SequentialSequenceWriter) :
+ HEADER_WIDTH = 12
+ MAX_WIDTH = 80
+
+ def _write_single_line(self, tag, text) :
+ "Used in the the 'header' of each GenBank record."""
+ assert len(tag) < self.HEADER_WIDTH
+ assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \
+ "Annotation %s too long for %s line" % (repr(text), tag)
+ self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
+ text.replace("\n"," ")))
+
+ def _write_multi_line(self, tag, text) :
+ "Used in the the 'header' of each GenBank record."""
+ #TODO - Do the line splitting while preserving white space?
+ max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+ assert len(tag) < self.HEADER_WIDTH
+ text = text.strip()
+ if len(text) < max_len :
+ self._write_single_line(tag, text)
+ return
+
+ words = text.split()
+ assert max([len(w) for w in words]) < max_len, \
+ "Your description cannot be broken into nice lines!"
+ text = ""
+ while words and len(text) + 1 + len(words[0]) < max_len :
+ text += " " + words.pop(0)
+ text = text.strip()
+ assert len(text) < max_len
+ self._write_single_line(tag, text)
+ while words :
+ text = ""
+ while words and len(text) + 1 + len(words[0]) < max_len :
+ text += " " + words.pop(0)
+ text = text.strip()
+ assert len(text) < max_len
+ self._write_single_line("", text)
+ assert not words
+
+ def _write_the_first_line(self, record) :
+ """Write the LOCUS line."""
+
+ locus = record.name
+ if not locus or locus == "<unknown name>" :
+ locus = record.id
+ if not locus or locus == "<unknown id>" :
+ locus = self._get_annotation_str(record, "accession", just_first=True)
+ if len(locus) > 16 :
+ raise ValueError("Locus identifier %s is too long" % repr(locus))
+
+ if len(record) > 99999999999 :
+ #Currently GenBank only officially supports up to 350000, but
+ #the length field can take eleven digits
+ raise ValueError("Sequence too long!")
+
+ #Get the base alphabet (underneath any Gapped or StopCodon encoding)
+ a = Alphabet._get_base_alphabet(record.seq.alphabet)
+ if not isinstance(a, Alphabet.Alphabet) :
+ raise TypeError("Invalid alphabet")
+ elif isinstance(a, Alphabet.ProteinAlphabet) :
+ units = "aa"
+ elif isinstance(a, Alphabet.NucleotideAlphabet) :
+ units = "bp"
+ else :
+ #Must be something like the generic Alphabet
+ #(default for fasta files)
+ raise ValueError("Need a Nucleotide or Protein alphabet")
+
+ #Get the molecule type
+ #TODO - record this explicitly in the parser?
+ if isinstance(a, Alphabet.ProteinAlphabet) :
+ mol_type = ""
+ elif isinstance(a, Alphabet.DNAAlphabet) :
+ mol_type = "DNA"
+ elif isinstance(a, Alphabet.RNAAlphabet) :
+ mol_type = "RNA"
+ else :
+ #Must be something like NucleotideAlphabet or
+ #just the generic Alphabet (default for fasta files)
+ raise ValueError("Need a DNA, RNA or Protein alphabet")
+
+ try :
+ division = record.annotations["data_file_division"]
+ except KeyError :
+ division = "UNK"
+ if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT",
+ "VRL","PHG","SYN","UNA","EST","PAT","STS",
+ "GSS","HTG","HTC","ENV"] :
+ division = "UNK"
+
+ assert len(units) == 2
+ assert len(division) == 3
+ #TODO - date
+ #TODO - mol_type
+ line = "LOCUS %s %s %s %s %s 01-JAN-1980\n" \
+ % (locus.ljust(16),
+ str(len(record)).rjust(11),
+ units,
+ mol_type.ljust(6),
+ division)
+ assert len(line) == 79+1, repr(line) #plus one for new line
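+ #As an illustration, with hypothetical values (accession U49845, a
+ #5028 bp DNA sequence, PLN division) the line looks like this - the
+ #column positions are checked by the assertions below:
+ #
+ #LOCUS       U49845                  5028 bp    DNA              PLN 01-JAN-1980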
+
+ assert line[12:28].rstrip() == locus, \
+ 'LOCUS line does not contain the locus at the expected position:\n' + line
+ assert line[28:29] == " "
+ assert line[29:40].lstrip() == str(len(record)), \
+ 'LOCUS line does not contain the length at the expected position:\n' + line
+
+ #Tests copied from Bio.GenBank.Scanner
+ assert line[40:44] in [' bp ', ' aa '] , \
+ 'LOCUS line does not contain size units at expected position:\n' + line
+ assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
+ 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
+ assert line[47:54].strip() == "" \
+ or line[47:54].strip().find('DNA') != -1 \
+ or line[47:54].strip().find('RNA') != -1, \
+ 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
+ assert line[54:55] == ' ', \
+ 'LOCUS line does not contain space at position 55:\n' + line
+ assert line[55:63].strip() in ['','linear','circular'], \
+ 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
+ assert line[63:64] == ' ', \
+ 'LOCUS line does not contain space at position 64:\n' + line
+ assert line[67:68] == ' ', \
+ 'LOCUS line does not contain space at position 68:\n' + line
+ assert line[70:71] == '-', \
+ 'LOCUS line does not contain - at position 71 in date:\n' + line
+ assert line[74:75] == '-', \
+ 'LOCUS line does not contain - at position 75 in date:\n' + line
+
+ self.handle.write(line)
+
+ def _get_annotation_str(self, record, key, default=".", just_first=False) :
+ """Get an annotation dictionary entry (as a string).
+
+ Some entries are lists, in which case if just_first=True the first entry
+ is returned. If just_first=False (default) this verifies there is only
+ one entry before returning it."""
+ try :
+ answer = record.annotations[key]
+ except KeyError :
+ return default
+ if isinstance(answer, list) :
+ if not just_first : assert len(answer) == 1
+ return str(answer[0])
+ else :
+ return str(answer)
+
+ def _write_sequence(self, record):
+ #Loosely based on code from Howard Salis
+ #TODO - Force lower case?
+ LETTERS_PER_LINE = 60
+ SEQUENCE_INDENT = 9
+
+ if isinstance(record.seq, UnknownSeq) :
+ #We have already recorded the length, and there is no need
+ #to record a long sequence of NNNNNNN...NNN or whatever.
+ return
+
+ data = self._get_seq_string(record) #Catches sequence being None
+ seq_len = len(data)
+ for line_start in range(0, seq_len, LETTERS_PER_LINE):
+ self.handle.write(str(line_start+1).rjust(SEQUENCE_INDENT))
+ for block_start in range(line_start, min(line_start+LETTERS_PER_LINE, seq_len), 10):
+ self.handle.write(" %s" % data[block_start:block_start+10])
+ self.handle.write("\n")
+
+ def write_record(self, record):
+ """Write a single record to the output file."""
+ handle = self.handle
+ self._write_the_first_line(record)
+
+ accession = self._get_annotation_str(record, "accession",
+ record.id.split(".",1)[0],
+ just_first=True)
+ acc_with_version = accession
+ if record.id.startswith(accession+".") :
+ try :
+ acc_with_version = "%s.%i" \
+ % (accession, int(record.id.split(".",1)[1]))
+ except ValueError :
+ pass
+ gi = self._get_annotation_str(record, "gi", just_first=True)
+
+ descr = record.description
+ if descr == "<unknown description>" : descr = "."
+ self._write_multi_line("DEFINITION", descr)
+
+ self._write_single_line("ACCESSION", accession)
+ if gi != "." :
+ self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi))
+ else :
+ self._write_single_line("VERSION", "%s" % (acc_with_version))
+
+ try :
+ #List of strings
+ keywords = "; ".join(record.annotations["keywords"])
+ except KeyError :
+ keywords = "."
+ self._write_multi_line("KEYWORDS", keywords)
+
+ self._write_multi_line("SOURCE", \
+ self._get_annotation_str(record, "source"))
+ #The ORGANISM line MUST be a single line, as any continuation is the taxonomy
+ org = self._get_annotation_str(record, "organism")
+ if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH :
+ org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
+ self._write_single_line(" ORGANISM", org)
+ try :
+ #List of strings
+ taxonomy = "; ".join(record.annotations["taxonomy"])
+ except KeyError :
+ taxonomy = "."
+ self._write_multi_line("", taxonomy)
+
+ #TODO - References...
+ handle.write("FEATURES Location/Qualifiers\n")
+ for feature in record.features :
+ self._write_feature(feature)
+ handle.write("ORIGIN\n")
+ self._write_sequence(record)
+ handle.write("//\n")
+
+ def _write_feature(self, feature):
+ """Write a single SeqFeature object to features table.
+
+ Not implemented yet, but this stub exists in the short term to
+ facilitate working on writing GenBank files with a sub-class."""
+ #TODO - Features...
+ pass
+
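+#A minimal usage sketch for the writer (hypothetical filenames; note that
+#_write_feature is still a stub, so the features table will be empty):
+#
+# records = list(GenBankIterator(open("example.gbk")))
+# out_handle = open("rewritten.gbk", "w")
+# GenBankWriter(out_handle).write_file(records)
+# out_handle.close()
+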
+if __name__ == "__main__" :
+ print "Quick self test"
+ import os
+ from StringIO import StringIO
+
+ def check_genbank_writer(records) :
+ handle = StringIO()
+ GenBankWriter(handle).write_file(records)
+ handle.seek(0)
+
+ records2 = list(GenBankIterator(handle))
+
+ assert len(records) == len(records2)
+ for r1, r2 in zip(records, records2) :
+ #The SwissProt parser may leave \n in the description...
+ assert r1.description.replace("\n", " ") == r2.description
+ assert r1.id == r2.id
+ assert r1.name == r2.name
+ assert str(r1.seq) == str(r2.seq)
+ for key in ["gi", "keywords", "source", "taxonomy"] :
+ if key in r1.annotations :
+ assert r1.annotations[key] == r2.annotations[key], key
+ for key in ["organism"] :
+ if key in r1.annotations :
+ v1 = r1.annotations[key]
+ v2 = r2.annotations[key]
+ assert isinstance(v1, str) and isinstance(v2, str)
+ #SwissProt organism can be too long to record in GenBank format
+ assert v1 == v2 or \
+ (v2.endswith("...") and v1.startswith(v2[:-3])), key
+
+ for filename in os.listdir("../../Tests/GenBank") :
+ if not filename.endswith(".gbk") and not filename.endswith(".gb") :
+ continue
+ print filename
+
+ handle = open("../../Tests/GenBank/%s" % filename)
+ records = list(GenBankIterator(handle))
+ handle.close()
+
+ check_genbank_writer(records)
+
+ for filename in os.listdir("../../Tests/EMBL") :
+ if not filename.endswith(".embl") :
+ continue
+ print filename
+
+ handle = open("../../Tests/EMBL/%s" % filename)
+ records = list(EmblIterator(handle))
+ handle.close()
+
+ check_genbank_writer(records)
+
+ from Bio import SeqIO
+ for filename in os.listdir("../../Tests/SwissProt") :
+ if not filename.startswith("sp") :
+ continue
+ print filename
+
+ handle = open("../../Tests/SwissProt/%s" % filename)
+ records = list(SeqIO.parse(handle,"swiss"))
+ handle.close()
+
+ check_genbank_writer(records)
+
--- /dev/null
+# Copyright 2006-2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""
+Bio.SeqIO support module (not for general use).
+
+Unless you are writing a new parser or writer for Bio.SeqIO, you should not
+use this module. It provides base classes to try to simplify things.
+"""
+
+from Bio.Alphabet import generic_alphabet
+
+class SequenceIterator :
+ """Base class for building SeqRecord iterators.
+
+ You should write a next() method to return SeqRecord
+ objects. You may wish to redefine the __init__
+ method as well.
+ """
+ def __init__(self, handle, alphabet=generic_alphabet) :
+ """Create a SequenceIterator object.
+
+ handle - input file
+ alphabet - optional, e.g. Bio.Alphabet.generic_protein
+
+ Note when subclassing:
+ - there should be a single non-optional argument,
+ the handle.
+ - you do not have to require an alphabet.
+ - you can add additional optional arguments."""
+ self.handle = handle
+ self.alphabet = alphabet
+ #####################################################
+ # You may want to subclass this, for example #
+ # to read through the file to find the first record,#
+ # or if additional arguments are required. #
+ #####################################################
+
+ def next(self) :
+ """Return the next record in the file.
+
+ This method should be replaced by any derived class to do something useful."""
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this, to split the file up #
+ # into your individual records, and convert these #
+ # into useful objects, e.g. return SeqRecord object #
+ #####################################################
+
+ def __iter__(self):
+ """Iterate over the entries as a SeqRecord objects.
+
+ Example usage for Fasta files:
+
+ myFile = open("example.fasta","r")
+ myFastaReader = FastaIterator(myFile)
+ for record in myFastaReader :
+ print record.id
+ print record.seq
+ myFile.close()"""
+ return iter(self.next, None)
+
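+#As a sketch of the subclassing contract (a made-up format with one plain
+#sequence per line; not a real parser), only next() needs overriding,
+#returning None when the file is exhausted:
+#
+# from Bio.Seq import Seq
+# from Bio.SeqRecord import SeqRecord
+#
+# class OneLinePerSeqIterator(SequenceIterator) :
+#     def next(self) :
+#         line = self.handle.readline().strip()
+#         if not line : return None
+#         return SeqRecord(Seq(line, self.alphabet))
+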
+class InterlacedSequenceIterator(SequenceIterator) :
+ """Base class for any iterator of a non-sequential file type.
+
+ This object is not intended for use directly.
+
+ When writing a parser for any interlaced sequence file where the whole
+ file must be read in order to extract any single record, then you should
+ subclass this object.
+
+ All you need to do is to define your own:
+ (1) __init__ method to parse the file and call self.move_start()
+ (2) __len__ method to return the number of records
+ (3) __getitem__ to return any requested record.
+
+ This class will then provide the iterator methods including next(), but relies
+ on knowing the total number of records and tracking the pending record index
+ as self._n.
+
+ It is up to the subclassed object to decide if it wants to generate a cache of
+ SeqRecords when initialised, or simply use its own lists and dicts and create
+ SeqRecords on request.
+ """
+
+ def __init__(self) :
+ """Create the object.
+
+ This method should be replaced by any derived class to do something useful."""
+ #We assume that your implementation of __init__ will ensure self._n=0
+ self.move_start()
+ raise NotImplementedError("This object method should be subclassed")
+ #####################################################
+ # You SHOULD subclass this #
+ #####################################################
+
+ def __len__(self) :
+ """Return the number of records.
+
+ This method should be replaced by any derived class to do something useful."""
+ raise NotImplementedError("This object method should be subclassed")
+ #####################################################
+ # You SHOULD subclass this #
+ #####################################################
+
+ def __getitem__(self, i) :
+ """Return the requested record.
+
+ This method should be replaced by any derived class to do something
+ useful.
+
+ It should NOT touch the value of self._n"""
+ raise NotImplementedError("This object method should be subclassed")
+ #####################################################
+ # You SHOULD subclass this #
+ #####################################################
+
+ def move_start(self) :
+ self._n = 0
+
+ def next(self) :
+ next_record = self._n
+ if next_record < len(self) :
+ self._n = next_record+1
+ return self[next_record]
+ else :
+ #StopIteration
+ return None
+
+ def __iter__(self):
+ return iter(self.next, None)
+
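+#A sketch of the contract described above (the parsing helper is
+#hypothetical): cache all the records up front, then provide __len__
+#and __getitem__ while leaving next() and __iter__ to this base class:
+#
+# class MyInterlacedIterator(InterlacedSequenceIterator) :
+#     def __init__(self, handle) :
+#         self._records = _parse_whole_file(handle) #list of SeqRecords
+#         self.move_start() #sets self._n = 0
+#     def __len__(self) :
+#         return len(self._records)
+#     def __getitem__(self, i) :
+#         return self._records[i]
+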
+class SequenceWriter:
+ """This class should be subclassed.
+
+ Interlaced file formats (e.g. Clustal) should subclass directly.
+
+ Sequential file formats (e.g. Fasta, GenBank) should subclass
+ the SequentialSequenceWriter class instead.
+ """
+ def __init__(self, handle):
+ """Creates the writer object.
+
+ Use the method write_file() to actually record your sequence records."""
+ self.handle = handle
+
+ def _get_seq_string(self, record):
+ """Use this to catch errors like the sequence being None."""
+ try :
+ #The tostring() method is part of the Seq API, we could instead
+ #use str(record.seq) but that would give a string "None" if the
+ #sequence was None, and unpredictable output if an unexpected
+ #object was present.
+ return record.seq.tostring()
+ except AttributeError :
+ if record.seq is None :
+ #We could silently treat this as an empty sequence, Seq(""),
+ #but that would be an implicit assumption we should avoid.
+ raise TypeError("SeqRecord (id=%s) has None for its sequence." \
+ % record.id)
+ else :
+ raise TypeError("SeqRecord (id=%s) has an invalid sequence." \
+ % record.id)
+
+ def clean(self, text) :
+ """Use this to avoid getting newlines in the output."""
+ answer = text
+ for x in ["\n", "\r"] :
+ answer = answer.replace(x, " ")
+ return answer.replace("  ", " ")
+
+ def write_file(self, records) :
+ """Use this to write an entire file containing the given records.
+
+ records - A list or iterator returning SeqRecord objects
+
+ Should return the number of records (as an integer).
+
+ This method can only be called once."""
+ #Note when implementing this, you should close the file at the end.
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this #
+ #####################################################
+
+class SequentialSequenceWriter(SequenceWriter):
+ """This class should be subclassed.
+
+ It is intended for sequential file formats with an (optional)
+ header, repeated records, and an (optional) footer.
+
+ In this case (as with interlaced file formats), the user may
+ simply call the write_file() method and be done.
+
+ However, they may also call the write_header(), followed
+ by multiple calls to write_record() and/or write_records()
+ followed finally by write_footer().
+
+ Users must call write_header() and write_footer() even when
+ the file format concerned doesn't have a header or footer.
+ This is to try and make life as easy as possible when
+ switching the output format.
+
+ Note that write_header() cannot require any assumptions about
+ the number of records.
+ """
+ def __init__(self, handle):
+ self.handle = handle
+ self._header_written = False
+ self._record_written = False
+ self._footer_written = False
+
+ def write_header(self) :
+ assert not self._header_written, "You have aleady called write_header()"
+ assert not self._record_written, "You have aleady called write_record() or write_records()"
+ assert not self._footer_written, "You have aleady called write_footer()"
+ self._header_written = True
+
+ def write_footer(self) :
+ assert self._header_written, "You must call write_header() first"
+ assert self._record_written, "You have not called write_record() or write_records() yet"
+ assert not self._footer_written, "You have aleady called write_footer()"
+ self._footer_written = True
+
+ def write_record(self, record):
+ """Write a single record to the output file.
+
+ record - a SeqRecord object
+
+ Once you have called write_header() you can call write_record()
+ and/or write_records() as many times as needed. Then call
+ write_footer() and close()."""
+ assert self._header_written, "You must call write_header() first"
+ assert not self._footer_written, "You have already called write_footer()"
+ self._record_written = True
+ raise NotImplementedError("This object should be subclassed")
+ #####################################################
+ # You SHOULD subclass this #
+ #####################################################
+
+ def write_records(self, records):
+ """Write multiple record to the output file.
+
+ records - A list or iterator returning SeqRecord objects
+
+ Once you have called write_header() you can call write_record()
+ and/or write_records() as many times as needed. Then call
+ write_footer() and close().
+
+ Returns the number of records written.
+ """
+ #Default implementation:
+ assert self._header_written, "You must call write_header() first"
+ assert not self._footer_written, "You have already called write_footer()"
+ count = 0
+ for record in records :
+ self.write_record(record)
+ count += 1
+ #Mark as true, even if there were no records
+ self._record_written = True
+ return count
+
+ def write_file(self, records) :
+ """Use this to write an entire file containing the given records.
+
+ records - A list or iterator returning SeqRecord objects
+
+ This method can only be called once. Returns the number of records
+ written.
+ """
+ self.write_header()
+ count = self.write_records(records)
+ self.write_footer()
+ return count
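+
+#To illustrate the header/record/footer protocol described above, an
+#explicit (if verbose) equivalent of write_file() would be this sketch,
+#where SomeWriterSubclass stands in for any concrete subclass:
+#
+# writer = SomeWriterSubclass(open("output.txt", "w"))
+# writer.write_header()
+# for record in records :
+#     writer.write_record(record)
+# writer.write_footer()
+# writer.handle.close()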
--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "phd" file format.
+
+PHD files are output by PHRED and used by PHRAP and CONSED.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the underlying Bio.Sequencing.Phd module."""
+
+from Bio.SeqRecord import SeqRecord
+from Bio.Sequencing import Phd
+
+#This is a generator function!
+def PhdIterator(handle) :
+ """Returns SeqRecord objects from a PHD file.
+
+ This uses the Bio.Sequencing.Phd module to do the hard work.
+ """
+
+ phd_records = Phd.parse(handle)
+ for phd_record in phd_records:
+ #Convert the PHD record into a SeqRecord...
+ seq_record = SeqRecord(phd_record.seq,
+ id = phd_record.file_name,
+ name = phd_record.file_name)
+ #Just re-use the comments dictionary as the SeqRecord's annotations
+ seq_record.annotations = phd_record.comments
+ yield seq_record
+ #All done
+
+if __name__ == "__main__" :
+ print "Quick self test"
+ handle = open("../../Tests/Phd/Phd1")
+ for record in PhdIterator(handle) :
+ print record
+ handle.close()
+ print "Done"
+
+
--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing PIR or NBRF format files as
+# SeqRecord objects. The code is based on Bio.SeqIO.FastaIO
+
+"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
+
+You are expected to use this module via the Bio.SeqIO functions, or if
+the file contains a sequence alignment, optionally via Bio.AlignIO instead.
+
+This format was introduced for the Protein Information Resource (PIR), a
+project of the National Biomedical Research Foundation (NBRF). The PIR
+database itself is now part of UniProt.
+
+The file format is described online at:
+http://www.ebi.ac.uk/help/pir_frame.html
+http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
+
+An example file in this format would be:
+
+>P1;CRAB_ANAPL
+ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+ MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
+ SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
+ GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
+ SDVPERSIPI TREEKPAIAG AQRK*
+
+>P1;CRAB_BOVIN
+ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
+ MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
+ PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
+ HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
+ QASGPERTIP ITREEKPAVT AAPKK*
+
+Or, an example of a multiple sequence alignment:
+
+>P1;S27231
+rhodopsin - northern leopard frog
+MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
+VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
+GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
+EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
+ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
+YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
+
+>P1;I51200
+rhodopsin - African clawed frog
+MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
+VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
+GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
+EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
+LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
+YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
+
+>P1;JN0120
+rhodopsin - Japanese lamprey
+MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
+VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
+GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
+EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
+ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
+YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
+
+
+As with the FASTA format, each record starts with a line beginning with the
+">" character. There is then a two letter sequence type code (one of the
+codes listed below), a semicolon, and the identification code. The second
+line is a free text description. The remaining lines contain the sequence
+itself, terminating in an asterisk. Space separated blocks of ten letters
+as shown above are typical.
+
+Sequence codes and their meanings:
+
+P1 - Protein (complete)
+F1 - Protein (fragment)
+D1 - DNA (e.g. EMBOSS seqret output)
+DL - DNA (linear)
+DC - DNA (circular)
+RL - RNA (linear)
+RC - RNA (circular)
+N3 - tRNA
+N1 - Other functional RNA
+XX - Unknown
+"""
+
+from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+_pir_alphabets = {"P1" : generic_protein,
+ "F1" : generic_protein,
+ "D1" : generic_dna,
+ "DL" : generic_dna,
+ "DC" : generic_dna,
+ "RL" : generic_rna,
+ "RC" : generic_rna,
+ "N3" : generic_rna,
+ "XX" : single_letter_alphabet,
+ }
+
+#This is a generator function!
+def PirIterator(handle) :
+ """Generator function to iterate over Fasta records (as SeqRecord objects).
+
+ handle - input file
+ alphabet - optional alphabet
+ title2ids - A function that, when given the title of the FASTA
+ file (without the beginning >), will return the id, name and
+ description (in that order) for the record as a tuple of strings.
+
+ If this is not given, then the entire title line will be used
+ as the description, and the first word as the id and name.
+
+ Note that use of title2ids matches that of Bio.Fasta.SequenceParser
+ but the defaults are slightly different.
+ """
+ #Skip any text before the first record (e.g. blank lines, comments)
+ while True :
+ line = handle.readline()
+ if line == "" : return #Premature end of file, or just empty?
+ if line[0] == ">" :
+ break
+
+ while True :
+ if line[0]!=">" :
+ raise ValueError("Records in PIR files should start with '>' character")
+ pir_type = line[1:3]
+ if pir_type not in _pir_alphabets or line[3] != ";" :
+ raise ValueError("Records should start with '>XX;' where XX is a valid sequence type")
+ identifier = line[4:].strip()
+ description = handle.readline().strip()
+
+
+ lines = []
+ line = handle.readline()
+ while True:
+ if not line : break
+ if line[0] == ">": break
+ #Remove trailing whitespace, and any internal spaces
+ lines.append(line.rstrip().replace(" ",""))
+ line = handle.readline()
+ seq = "".join(lines)
+ if seq[-1] != "*" :
+ #Note the * terminator is present on nucleotide sequences too,
+ #it is not a stop codon!
+ raise ValueError("Sequences in PIR files should include a * terminator!")
+
+ #Return the record and then continue...
+ record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
+ id = identifier, name = identifier,
+ description = description)
+ record.annotations["PIR-type"] = pir_type
+ yield record
+
+ if not line : return #StopIteration
+ assert False, "Should not reach this line"
+
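+#A short usage sketch (assuming a PIR format file "crab.pir" exists):
+#
+# for record in PirIterator(open("crab.pir")) :
+#     print record.id, record.annotations["PIR-type"], len(record)
+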
+if __name__ == "__main__" :
+ print "Running quick self test"
+
+ from StringIO import StringIO
+ import os
+
+ for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] :
+ print name
+ filename = "../../Tests/NBRF/%s.pir" % name
+ if not os.path.isfile(filename) :
+ print "Missing %s" % filename
+ continue
+
+ records = list(PirIterator(open(filename)))
+ count = 0
+ for record in records :
+ count += 1
+ parts = record.description.split()
+ if "bases," in parts :
+ assert len(record) == int(parts[parts.index("bases,")-1])
+ print "Could read %s (%i records)" % (name, count)
+
--- /dev/null
+# Copyright 2009 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing FASTQ and QUAL format files as
+# SeqRecord objects, and is expected to be used via the Bio.SeqIO API.
+
+"""Bio.SeqIO support for the "fastq" and "qual" file formats.
+
+Note that you are expected to use this code via the Bio.SeqIO interface, as
+shown below.
+
+The FASTQ file format is used frequently at the Wellcome Trust Sanger Institute
+to bundle a FASTA sequence and its PHRED quality data (integers between 0 and
+90). Rather than using a single FASTQ file, often paired FASTA and QUAL files
+are used containing the sequence and the quality information separately.
+
+The PHRED software reads DNA sequencing trace files, calls bases, and
+assigns a quality value between 0 and 90 to each called base using a log
+transformation of the error probability, Q = -10 log10( Pe ), for example::
+
+ Pe = 1.0, Q = 0
+ Pe = 0.1, Q = 10
+ Pe = 0.01, Q = 20
+ ...
+ Pe = 0.00000001, Q = 80
+ Pe = 0.000000001, Q = 90
+
+In the QUAL format these quality values are held as space separated text in
+a FASTA like file format. In the FASTQ format, each quality value is encoded
+with a single ASCII character using chr(Q+33), meaning zero maps to the
+character "!" and for example 80 maps to "q". The sequences and quality are
+then stored in pairs in a FASTA like format.
+
+Unfortunately there is no official document describing the FASTQ file format,
+and worse, several related but different variants exist. Reasonable
+documentation exists at: http://maq.sourceforge.net/fastq.shtml
+
+Solexa/Illumina quality scores use Q = - 10 log10 ( Pe / (1-Pe) ), which can
+be negative or easily exceed 90. PHRED scores and Solexa scores are NOT
+interchangeable (but a reasonable mapping can be achieved between them).
+Confusingly Solexa produces a FASTQ like file but using their own score
+mapping instead.
+
+Also note that Roche 454 sequencers can output files in the QUAL format, and
+thankfully they use PHRED style scores like Sanger. To extract QUAL files from
+a Roche 454 SFF binary file, use the Roche off-instrument command line tool
+"sffinfo" with the -q or -qual argument. You can extract a matching FASTA file
+using the -s or -seq argument instead.
+
+You are expected to use this module via the Bio.SeqIO functions, with the
+following format names:
+ - "fastq" means Sanger style FASTQ files using PHRED scores.
+ - "fastq-solexa" means Solexa/Illumina style FASTQ files.
+ - "qual" means simple quality files using PHRED scores.
+
+For example, consider the following short FASTQ file (extracted from a real
+NCBI dataset)::
+
+ @EAS54_6_R1_2_1_413_324
+ CCCTTCTTGTCTTCAGCGTTTCTCC
+ +
+ ;;3;;;;;;;;;;;;7;;;;;;;88
+ @EAS54_6_R1_2_1_540_792
+ TTGGCAGGCCAAGGCCGATGGATCA
+ +
+ ;;;;;;;;;;;7;;;;;-;;;3;83
+ @EAS54_6_R1_2_1_443_348
+ GTTGCTTCTGGCGTGGGTGGGGGGG
+ +
+ ;;;;;;;;;;;9;7;;.7;393333
+
+This contains three reads of length 25. From the read length these were
+probably originally from an early Solexa/Illumina sequencer but NCBI have
+followed the Sanger FASTQ convention and this actually uses PHRED style
+qualities. This means we can parse this file using Bio.SeqIO using "fastq"
+as the format name:
+
+ >>> from Bio import SeqIO
+ >>> for record in SeqIO.parse(open("Quality/example.fastq"), "fastq") :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+ EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+ EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+The qualities are held as a list of integers in each record's annotation:
+
+ >>> print record
+ ID: EAS54_6_R1_2_1_443_348
+ Name: EAS54_6_R1_2_1_443_348
+ Description: EAS54_6_R1_2_1_443_348
+ Number of features: 0
+ Per letter annotation for: phred_quality
+ Seq('GTTGCTTCTGGCGTGGGTGGGGGGG', SingleLetterAlphabet())
+ >>> print record.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+Using the SeqRecord's format method you can show this in the QUAL format:
+
+ >>> print record.format("qual")
+ >EAS54_6_R1_2_1_443_348
+ 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+ 24 18 18 18 18
+ <BLANKLINE>
+
+Or go back to the FASTQ format,
+
+ >>> print record.format("fastq")
+ @EAS54_6_R1_2_1_443_348
+ GTTGCTTCTGGCGTGGGTGGGGGGG
+ +
+ ;;;;;;;;;;;9;7;;.7;393333
+ <BLANKLINE>
+
+You can also get Biopython to convert the scores and show a Solexa style
+FASTQ file:
+
+ >>> print record.format("fastq-solexa")
+ @EAS54_6_R1_2_1_443_348
+ GTTGCTTCTGGCGTGGGTGGGGGGG
+ +
+ ZZZZZZZZZZZXZVZZMVZRXRRRR
+ <BLANKLINE>
+
+If you wanted to trim your sequences (perhaps to remove low quality regions,
+or to remove a primer sequence), try slicing the SeqRecord objects. e.g.
+
+ >>> sub_rec = record[5:15]
+ >>> print sub_rec
+ ID: EAS54_6_R1_2_1_443_348
+ Name: EAS54_6_R1_2_1_443_348
+ Description: EAS54_6_R1_2_1_443_348
+ Number of features: 0
+ Per letter annotation for: phred_quality
+ Seq('TTCTGGCGTG', SingleLetterAlphabet())
+ >>> print sub_rec.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 24, 26, 22, 26]
+ >>> print sub_rec.format("fastq")
+ @EAS54_6_R1_2_1_443_348
+ TTCTGGCGTG
+ +
+ ;;;;;;9;7;
+ <BLANKLINE>
+
+If you wanted to, you could read in this FASTQ file, and save it as a QUAL file:
+
+ >>> from Bio import SeqIO
+ >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq")
+ >>> out_handle = open("Quality/temp.qual", "w")
+ >>> SeqIO.write(record_iterator, out_handle, "qual")
+ 3
+ >>> out_handle.close()
+
+You can of course read in a QUAL file, such as the one we just created:
+
+ >>> from Bio import SeqIO
+ >>> for record in SeqIO.parse(open("Quality/temp.qual"), "qual") :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 ?????????????????????????
+ EAS54_6_R1_2_1_540_792 ?????????????????????????
+ EAS54_6_R1_2_1_443_348 ?????????????????????????
+
+Notice that QUAL files don't have a proper sequence present! But the quality
+information is there:
+
+ >>> print record
+ ID: EAS54_6_R1_2_1_443_348
+ Name: EAS54_6_R1_2_1_443_348
+ Description: EAS54_6_R1_2_1_443_348
+ Number of features: 0
+ Per letter annotation for: phred_quality
+ UnknownSeq(25, alphabet = SingleLetterAlphabet(), character = '?')
+ >>> print record.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+Just to keep things tidy, if you are following this example yourself, you can
+delete this temporary file now:
+
+ >>> import os
+ >>> os.remove("Quality/temp.qual")
+
+Sometimes you won't have a FASTQ file, but rather just a pair of FASTA and QUAL
+files. Because the Bio.SeqIO system is designed for reading single files, you
+would have to read the two in separately and then combine the data. However,
+since this is such a common thing to want to do, there is a helper iterator
+defined in this module that does this for you - PairedFastaQualIterator.
+
+Alternatively, if you have enough RAM to hold all the records in memory at once,
+then a simple dictionary approach would work:
+
+ >>> from Bio import SeqIO
+ >>> reads = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fasta"), "fasta"))
+ >>> for rec in SeqIO.parse(open("Quality/example.qual"), "qual") :
+ ... reads[rec.id].letter_annotations["phred_quality"]=rec.letter_annotations["phred_quality"]
+
+You can then access any record by its key, and get both the sequence and the
+quality scores.
+
+ >>> print reads["EAS54_6_R1_2_1_540_792"].format("fastq")
+ @EAS54_6_R1_2_1_540_792
+ TTGGCAGGCCAAGGCCGATGGATCA
+ +
+ ;;;;;;;;;;;7;;;;;-;;;3;83
+ <BLANKLINE>
+
+It is important that you explicitly tell Bio.SeqIO which FASTQ variant you are
+using ("fastq" for the Sanger standard using PHRED values, or "fastq-solexa"
+for the Solexa/Illumina variant), as this cannot be detected reliably
+automatically.
+"""
+__docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
+
+#See also http://blog.malde.org/index.php/2008/09/09/the-fastq-file-format-for-sequences/
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq, UnknownSeq
+from Bio.SeqRecord import SeqRecord
+from Interfaces import SequentialSequenceWriter
+from math import log
+
+# define score offsets. See discussion for differences between Sanger and
+# Solexa offsets.
+SANGER_SCORE_OFFSET = 33
+SOLEXA_SCORE_OFFSET = 64
+
+def solexa_quality_from_phred(phred_quality) :
+ """Covert a PHRED quality (range 0 to about 90) to a Solexa quality.
+
+ This will return a floating point number, it is up to you to round this to
+ the nearest integer if appropriate. e.g.
+
+ >>> print "%0.2f" % round(solexa_quality_from_phred(80),2)
+ 80.00
+ >>> print "%0.2f" % round(solexa_quality_from_phred(50),2)
+ 50.00
+ >>> print "%0.2f" % round(solexa_quality_from_phred(20),2)
+ 19.96
+ >>> print "%0.2f" % round(solexa_quality_from_phred(10),2)
+ 9.54
+ >>> print "%0.2f" % round(solexa_quality_from_phred(1),2)
+ -5.87
+ """
+ return 10*log(10**(phred_quality/10.0) - 1, 10)
+
+def phred_quality_from_solexa(solexa_quality) :
+ """Convert a Solexa quality (which can be negative) to a PHRED quality.
+
+ This will return a floating point number, it is up to you to round this to
+ the nearest integer if appropriate. e.g.
+
+ >>> print "%0.2f" % round(phred_quality_from_solexa(80),2)
+ 80.00
+ >>> print "%0.2f" % round(phred_quality_from_solexa(20),2)
+ 20.04
+ >>> print "%0.2f" % round(phred_quality_from_solexa(10),2)
+ 10.41
+ >>> print "%0.2f" % round(phred_quality_from_solexa(0),2)
+ 3.01
+ >>> print "%0.2f" % round(phred_quality_from_solexa(-10),2)
+ 0.41
+ """
+ return 10*log(10**(solexa_quality/10.0) + 1, 10)
+
+def _get_phred_quality(record) :
+ """Extract PHRED qualities from a SeqRecord's letter_annotations (PRIVATE).
+
+ If there are no PHRED qualities, but there are Solexa qualities, those are
+ used instead after conversion.
+ """
+ try :
+ return record.letter_annotations["phred_quality"]
+ except KeyError :
+ pass
+ try :
+ return [phred_quality_from_solexa(q) for \
+ q in record.letter_annotations["solexa_quality"]]
+ except KeyError :
+ raise ValueError("No suitable quality scores found in letter_annotations "
+ "of SeqRecord (id=%s)." % record.id)
+
+def _get_solexa_quality(record) :
+ """Extract Solexa qualities from a SeqRecord's letter_annotations (PRIVATE).
+
+ If there are no Solexa qualities, but there are PHRED qualities, those are
+ used instead after conversion.
+ """
+ try :
+ return record.letter_annotations["solexa_quality"]
+ except KeyError :
+ pass
+ try :
+ return [solexa_quality_from_phred(q) for \
+ q in record.letter_annotations["phred_quality"]]
+ except KeyError :
+ raise ValueError("No suitable quality scores found in letter_annotation "
+ "of SeqRecord (id=%s)." % record.id)
+
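+#To illustrate the fallback conversion (using a hypothetical record, not
+#one from the test suite):
+#
+# from Bio.Seq import Seq
+# from Bio.SeqRecord import SeqRecord
+# rec = SeqRecord(Seq("ACGT"), id="demo")
+# rec.letter_annotations["solexa_quality"] = [40, 30, 20, 10]
+# print ["%0.1f" % q for q in _get_phred_quality(rec)]
+# #Expected output (approximately): ['40.0', '30.0', '20.0', '10.4']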
+
+#TODO - Default to nucleotide or even DNA?
+def FastqGeneralIterator(handle) :
+ """Iterate over Fastq records as string tuples (not as SeqRecord objects).
+
+ This code does not try to interpret the quality string numerically. It
+ just returns tuples of the title, sequence and quality as strings. For
+ the sequence and quality, any whitespace (such as new lines) is removed.
+
+ Our SeqRecord based FASTQ iterators call this function internally, and then
+ turn the strings into SeqRecord objects, mapping the quality string into
+ a list of numerical scores. If you want to do a custom quality mapping,
+ then you might consider calling this function directly.
+
+ For parsing FASTQ files, the title string from the "@" line at the start
+ of each record can optionally be omitted on the "+" lines. If it is
+ repeated, it must be identical.
+
+ The sequence string and the quality string can optionally be split over
+ multiple lines, although several sources discourage this. In comparison,
+ for the FASTA file format line breaks between 60 and 80 characters are
+ the norm.
+
+ WARNING - Because the "@" character can appear in the quality string,
+ this can cause problems as this is also the marker for the start of
+ a new sequence. In fact, the "+" sign can also appear as well. Some
+ sources recommended having no line breaks in the quality to avoid this,
+ but even that is not enough, consider this example::
+
+ @071113_EAS56_0053:1:1:998:236
+ TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA
+ +071113_EAS56_0053:1:1:998:236
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III
+ @071113_EAS56_0053:1:1:182:712
+ ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG
+ +
+ @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+
+ @071113_EAS56_0053:1:1:153:10
+ TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT
+ +
+ IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6
+ @071113_EAS56_0053:1:3:990:501
+ TGGGAGGTTTTATGTGGA
+ AAGCAGCAATGTACAAGA
+ +
+ IIIIIII.IIIIII1@44
+ @-7.%<&+/$/%4(++(%
+
+ This is four PHRED encoded FASTQ entries originally from an NCBI source
+ (given the read length of 36, these are probably Solexa/Illumina reads where
+ the quality has been mapped onto the PHRED values).
+
+ This example has been edited to illustrate some of the nasty things allowed
+ in the FASTQ format. Firstly, on the "+" lines most but not all of the
+ (redundant) identifiers are omitted. In real files it is likely that all or
+ none of these extra identifiers will be present.
+
+ Secondly, while the first three sequences have been shown without line
+ breaks, the last has been split over multiple lines. In real files any line
+ breaks are likely to be consistent.
+
+ Thirdly, some of the quality string lines start with an "@" character. For
+ the second record this is unavoidable. However for the fourth sequence this
+ only happens because its quality string is split over two lines. A naive
+ parser could wrongly treat any line starting with an "@" as the beginning of
+ a new sequence! This code copes with this possible ambiguity by keeping track
+ of the length of the sequence which gives the expected length of the quality
+ string.
+
+ Using this tricky example file as input, this short bit of code demonstrates
+ what this parsing function would return:
+
+ >>> handle = open("Quality/tricky.fastq", "rU")
+ >>> for (title, sequence, quality) in FastqGeneralIterator(handle) :
+ ... print title
+ ... print sequence, quality
+ 071113_EAS56_0053:1:1:998:236
+ TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III
+ 071113_EAS56_0053:1:1:182:712
+ ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+
+ 071113_EAS56_0053:1:1:153:10
+ TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6
+ 071113_EAS56_0053:1:3:990:501
+ TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA IIIIIII.IIIIII1@44@-7.%<&+/$/%4(++(%
+ >>> handle.close()
+
+ Finally we note that some sources state that the quality string should
+ start with "!" (which using the PHRED mapping means the first letter always
+ has a quality score of zero). This rather restrictive rule is not widely
+ observed, so is therefore ignored here. One plus point about this "!" rule
+ is that (provided there are no line breaks in the quality sequence) it
+ would prevent the above problem with the "@" character.
+ """
+ #Skip any text before the first record (e.g. blank lines, comments?)
+ while True :
+ line = handle.readline()
+ if line == "" : return #Premature end of file, or just empty?
+ if line[0] == "@" :
+ break
+
+ while True :
+ if line[0]!="@" :
+ raise ValueError("Records in Fastq files should start with '@' character")
+ title_line = line[1:].rstrip()
+
+ seq_lines = []
+ line = handle.readline()
+ while True:
+ if not line :
+ raise ValueError("End of file without quality information.")
+ if line[0] == "+":
+ #The title here is optional, but if present must match!
+ if line[1:].rstrip() and line[1:].rstrip() != title_line :
+ raise ValueError("Sequence and quality captions differ.")
+ break
+ seq_lines.extend(line.split()) #removes any whitespace
+ line = handle.readline()
+
+ seq_string = "".join(seq_lines)
+ del seq_lines
+
+ quality_lines = []
+ line = handle.readline()
+ while True:
+ if not line : break
+ if line[0] == "@":
+ #This COULD be the start of a new sequence. However, it MAY just
+ #be a line of quality data which starts with a "@" character. We
+ #should be able to check this by looking at the sequence length
+ #and the amount of quality data found so far.
+ if len("".join(quality_lines)) >= len(seq_string) :
+ #We expect it to be equal if this is the start of a new record.
+ #If the quality data is longer, we'll raise an error below.
+ break
+ #Continue - it's just some (more) sequence data.
+
+ quality_lines.extend(line.split()) #removes any whitespace
+ line = handle.readline()
+
+ quality_string = "".join(quality_lines)
+ del quality_lines
+
+ if len(seq_string) != len(quality_string) :
+ raise ValueError("Lengths of sequence and quality values differs "
+ " for %s (%i and %i)." \
+ % (title_line, len(seq_string), len(quality_string)))
+
+ #Return the record and then continue...
+ yield (title_line, seq_string, quality_string)
+ if not line : return #StopIteration at end of file
+ assert False, "Should not reach this line"
+
+#This is a generator function!
+def FastqPhredIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
+ """Generator function to iterate over FASTQ records (as SeqRecord objects).
+
+ - handle - input file
+ - alphabet - optional alphabet
+ - title2ids - A function that, when given the title line from the FASTQ
+ file (without the beginning @), will return the id, name and
+ description (in that order) for the record as a tuple of
+ strings. If this is not given, then the entire title line
+ will be used as the description, and the first word as the
+ id and name.
+
+ Note that use of title2ids matches that of Bio.SeqIO.FastaIO.
+
+ For each sequence in a (Sanger style) FASTQ file there is a matching string
+ encoding the PHRED qualities (integers between 0 and about 90) using ASCII
+ values with an offset of 33.
+
+ For example, consider a file containing three short reads::
+
+ @EAS54_6_R1_2_1_413_324
+ CCCTTCTTGTCTTCAGCGTTTCTCC
+ +
+ ;;3;;;;;;;;;;;;7;;;;;;;88
+ @EAS54_6_R1_2_1_540_792
+ TTGGCAGGCCAAGGCCGATGGATCA
+ +
+ ;;;;;;;;;;;7;;;;;-;;;3;83
+ @EAS54_6_R1_2_1_443_348
+ GTTGCTTCTGGCGTGGGTGGGGGGG
+ +
+ ;;;;;;;;;;;9;7;;.7;393333
+
+ For each sequence (e.g. "CCCTTCTTGTCTTCAGCGTTTCTCC") there is a matching
+ string encoding the PHRED qualities using ASCII values with an offset of
+ 33 (e.g. ";;3;;;;;;;;;;;;7;;;;;;;88").
+
+ Using this module directly you might run:
+
+ >>> handle = open("Quality/example.fastq", "rU")
+ >>> for record in FastqPhredIterator(handle) :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+ EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+ EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+ >>> handle.close()
+
+ Typically however, you would call this via Bio.SeqIO instead with "fastq" as
+ the format:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Quality/example.fastq", "rU")
+ >>> for record in SeqIO.parse(handle, "fastq") :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+ EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+ EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+ >>> handle.close()
+
+ If you want to look at the qualities, they are recorded in each record's
+ per-letter-annotation dictionary as a simple list of integers:
+
+ >>> print record.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+ """
+ for title_line, seq_string, quality_string in FastqGeneralIterator(handle) :
+ if title2ids :
+ id, name, descr = title2ids(title_line)
+ else :
+ descr = title_line
+ id = descr.split()[0]
+ name = id
+ record = SeqRecord(Seq(seq_string, alphabet),
+ id=id, name=name, description=descr)
+
+ assert SANGER_SCORE_OFFSET == ord("!")
+ #According to BioPerl documentation at least, the first character should
+ #be an "!" (and therefore quality zero). This seems crazy - what if the
+ #sequence has been trimmed to remove any poor quality sequence? In any
+ #case real examples from the NCBI don't follow this practice, so we
+ #won't enforce it here.
+ #e.g. ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/SRA000271/fastq/200x36x36-071113_EAS56_0053-s_1_1.fastq.gz
+ #
+ #if quality_string[0] != "!" :
+ # raise ValueError("The quality string should always start with a ! character.")
+ qualities = [ord(letter)-SANGER_SCORE_OFFSET for letter in quality_string]
+ if qualities :
+ if min(qualities) < 0 or max(qualities) > 90 :
+ raise ValueError("Quality score outside 0 to 90 found - these are perhaps "
+ "in a Solexa/Illumina format, not the Sanger FASTQ format "
+ "which uses PHRED scores.")
+ record.letter_annotations["phred_quality"] = qualities
+ yield record
+
+#This is a generator function!
+def FastqSolexaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
+ """Parsing the Solexa/Illumina FASTQ like files (which differ in the quality mapping).
+
+ The optional arguments are the same as those for the FastqPhredIterator.
+
+ For each sequence in Solexa/Illumina FASTQ files there is a matching string
+ encoding the Solexa integer qualities using ASCII values with an offset
+ of 64. Solexa scores are scaled differently to PHRED scores, and Biopython
+ will NOT perform any automatic conversion when loading.
+
+ For example, consider a file containing these five records::
+
+ @SLXA-B3_649_FC8437_R1_1_1_610_79
+ GATGTGCAATACCTTTGTAGAGGAA
+ +SLXA-B3_649_FC8437_R1_1_1_610_79
+ YYYYYYYYYYYYYYYYYYWYWYYSU
+ @SLXA-B3_649_FC8437_R1_1_1_397_389
+ GGTTTGAGAAAGAGAAATGAGATAA
+ +SLXA-B3_649_FC8437_R1_1_1_397_389
+ YYYYYYYYYWYYYYWWYYYWYWYWW
+ @SLXA-B3_649_FC8437_R1_1_1_850_123
+ GAGGGTGTTGATCATGATGATGGCG
+ +SLXA-B3_649_FC8437_R1_1_1_850_123
+ YYYYYYYYYYYYYWYYWYYSYYYSY
+ @SLXA-B3_649_FC8437_R1_1_1_362_549
+ GGAAACAAAGTTTTTCTCAACATAG
+ +SLXA-B3_649_FC8437_R1_1_1_362_549
+ YYYYYYYYYYYYYYYYYYWWWWYWY
+ @SLXA-B3_649_FC8437_R1_1_1_183_714
+ GTATTATTTAATGGCATACACTCAA
+ +SLXA-B3_649_FC8437_R1_1_1_183_714
+ YYYYYYYYYYWYYYYWYWWUWWWQQ
+
+ Using this module directly you might run:
+
+ >>> handle = open("Quality/solexa_example.fastq", "rU")
+ >>> for record in FastqSolexaIterator(handle) :
+ ... print record.id, record.seq
+ SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA
+ SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA
+ SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG
+ SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG
+ SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA
+ >>> handle.close()
+
+ Typically however, you would call this via Bio.SeqIO instead with "fastq" as
+ the format:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Quality/solexa_example.fastq", "rU")
+ >>> for record in SeqIO.parse(handle, "fastq-solexa") :
+ ... print record.id, record.seq
+ SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA
+ SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA
+ SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG
+ SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG
+ SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA
+ >>> handle.close()
+
+ If you want to look at the qualities, they are recorded in each record's
+ per-letter-annotation dictionary as a simple list of integers:
+
+ >>> print record.letter_annotations["solexa_quality"]
+ [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 23, 25, 25, 25, 25, 23, 25, 23, 23, 21, 23, 23, 23, 17, 17]
+
+ These scores aren't very good, but they are high enough that they map
+ almost exactly onto PHRED scores:
+
+ >>> print "%0.2f" % phred_quality_from_solexa(25)
+ 25.01
+
+ Let's look at another example read which is even worse, where there are
+ more noticeable differences between the Solexa and PHRED scores::
+
+ @slxa_0013_1_0001_24
+ ACAAAAATCACAAGCATTCTTATACACC
+ +slxa_0013_1_0001_24
+ ??????????????????:??<?<-6%.
+
+ Again, you would typically use Bio.SeqIO to read this file in (rather than
+ calling the Bio.SeqIO.QualityIO module directly). Most FASTQ files will
+ contain thousands of reads, so you would normally use Bio.SeqIO.parse()
+ as shown above. This example has only one entry, so instead we can
+ use the Bio.SeqIO.read() function:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Quality/solexa.fastq", "rU")
+ >>> record = SeqIO.read(handle, "fastq-solexa")
+ >>> handle.close()
+ >>> print record.id, record.seq
+ slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+ >>> print record.letter_annotations["solexa_quality"]
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+ These quality scores are so low that when converted from the Solexa scheme
+ into PHRED scores they look quite different:
+
+ >>> print "%0.2f" % phred_quality_from_solexa(-1)
+ 2.54
+
+ Note you can use the Bio.SeqIO.write() function or the SeqRecord's format
+ method to output the record(s):
+
+ >>> print record.format("fastq-solexa")
+ @slxa_0013_1_0001_24
+ ACAAAAATCACAAGCATTCTTATACACC
+ +
+ ??????????????????:??<?<-6%.
+ <BLANKLINE>
+
+ Note this output is slightly different from the input file as Biopython
+ has left out the optional repetition of the sequence identifier on the "+"
+ line. If you want to use PHRED scores, use "fastq" or "qual" as the
+ output format instead, and Biopython will do the conversion for you:
+
+ >>> print record.format("fastq")
+ @slxa_0013_1_0001_24
+ ACAAAAATCACAAGCATTCTTATACACC
+ +
+ $$$$$$$$$$$$$$$$$$"$$"$"!!!!
+ <BLANKLINE>
+
+ >>> print record.format("qual")
+ >slxa_0013_1_0001_24
+ 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 3 1 0 0 0 0
+ <BLANKLINE>
+ """
+ for title_line, seq_string, quality_string in FastqGeneralIterator(handle) :
+ if title2ids :
+ id, name, descr = title2ids(title_line)
+ else :
+ descr = title_line
+ id = descr.split()[0]
+ name = id
+ record = SeqRecord(Seq(seq_string, alphabet),
+ id=id, name=name, description=descr)
+ qualities = [ord(letter)-SOLEXA_SCORE_OFFSET for letter in quality_string]
+ #DO NOT convert these into PHRED qualities automatically!
+ record.letter_annotations["solexa_quality"] = qualities
+ yield record
+
+def QualPhredIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
+ """For QUAL files which include PHRED quality scores, but no sequence.
+
+ For example, consider this short QUAL file::
+
+ >EAS54_6_R1_2_1_413_324
+ 26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
+ 26 26 26 23 23
+ >EAS54_6_R1_2_1_540_792
+ 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
+ 26 18 26 23 18
+ >EAS54_6_R1_2_1_443_348
+ 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+ 24 18 18 18 18
+
+ Using this module directly you might run:
+
+ >>> handle = open("Quality/example.qual", "rU")
+ >>> for record in QualPhredIterator(handle) :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 ?????????????????????????
+ EAS54_6_R1_2_1_540_792 ?????????????????????????
+ EAS54_6_R1_2_1_443_348 ?????????????????????????
+ >>> handle.close()
+
+ Typically however, you would call this via Bio.SeqIO instead with "qual"
+ as the format:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Quality/example.qual", "rU")
+ >>> for record in SeqIO.parse(handle, "qual") :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 ?????????????????????????
+ EAS54_6_R1_2_1_540_792 ?????????????????????????
+ EAS54_6_R1_2_1_443_348 ?????????????????????????
+ >>> handle.close()
+
+ Because QUAL files don't contain the sequence string itself, the seq
+ property is set to an UnknownSeq object. As no alphabet was given, this
+ has defaulted to a generic single letter alphabet and the character "?"
+ is used.
+
+ By specifying a nucleotide alphabet, "N" is used instead:
+
+ >>> from Bio import SeqIO
+ >>> from Bio.Alphabet import generic_dna
+ >>> handle = open("Quality/example.qual", "rU")
+ >>> for record in SeqIO.parse(handle, "qual", alphabet=generic_dna) :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 NNNNNNNNNNNNNNNNNNNNNNNNN
+ EAS54_6_R1_2_1_540_792 NNNNNNNNNNNNNNNNNNNNNNNNN
+ EAS54_6_R1_2_1_443_348 NNNNNNNNNNNNNNNNNNNNNNNNN
+ >>> handle.close()
+
+ However, the quality scores themselves are available as a list of integers
+ in each record's per-letter-annotation:
+
+ >>> print record.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+ You can still slice one of these SeqRecord objects with an UnknownSeq:
+
+ >>> sub_record = record[5:10]
+ >>> print sub_record.id, sub_record.letter_annotations["phred_quality"]
+ EAS54_6_R1_2_1_443_348 [26, 26, 26, 26, 26]
+ """
+ #Skip any text before the first record (e.g. blank lines, comments)
+ while True :
+ line = handle.readline()
+ if line == "" : return #Premature end of file, or just empty?
+ if line[0] == ">" :
+ break
+
+ while True :
+ if line[0]!=">" :
+ raise ValueError("Records in Fasta files should start with '>' character")
+ if title2ids :
+ id, name, descr = title2ids(line[1:].rstrip())
+ else :
+ descr = line[1:].rstrip()
+ id = descr.split()[0]
+ name = id
+
+ qualities = []
+ line = handle.readline()
+ while True:
+ if not line : break
+ if line[0] == ">": break
+ qualities.extend([int(word) for word in line.split()])
+ line = handle.readline()
+
+ if qualities :
+ if min(qualities) < 0 or max(qualities) > 90 :
+ raise ValueError(("Quality score range for %s is %i to %i, outside the " \
+ +"expected 0 to 90. Perhaps these are Solexa/Illumina " \
+ +"scores, and not PHRED scores?") \
+ % (id, min(qualities), max(qualities)))
+
+ #Return the record and then continue...
+ record = SeqRecord(UnknownSeq(len(qualities), alphabet),
+ id = id, name = name, description = descr)
+ record.letter_annotations["phred_quality"] = qualities
+ yield record
+
+ if not line : return #StopIteration
+ assert False, "Should not reach this line"
+
+class FastqPhredWriter(SequentialSequenceWriter):
+ """Class to write FASTQ format files (using PHRED quality scores).
+
+ Although you can use this class directly, you are strongly encouraged
+ to use the Bio.SeqIO.write() function instead. For example, this code
+ reads in a FASTQ (PHRED) file and re-saves it as another FASTQ (PHRED)
+ file:
+
+ >>> from Bio import SeqIO
+ >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq")
+ >>> out_handle = open("Quality/temp.fastq", "w")
+ >>> SeqIO.write(record_iterator, out_handle, "fastq")
+ 3
+ >>> out_handle.close()
+
+ You might want to do this if the original file included extra line breaks,
+ which while valid may not be supported by all tools. The output file from
+ Biopython will have each sequence on a single line, and each quality
+ string on a single line (which is considered desirable for maximum
+ compatibility).
+
+ In this next example, a Solexa FASTQ file is converted into a standard
+ Sanger style FASTQ file using PHRED qualities:
+
+ >>> from Bio import SeqIO
+ >>> record_iterator = SeqIO.parse(open("Quality/solexa.fastq"), "fastq-solexa")
+ >>> out_handle = open("Quality/temp.fastq", "w")
+ >>> SeqIO.write(record_iterator, out_handle, "fastq")
+ 1
+ >>> out_handle.close()
+
+ This code is also called if you use the .format("fastq") method of a
+ SeqRecord.
+
+ P.S. To avoid cluttering up your working directory, you can delete this
+ temporary file now:
+
+ >>> import os
+ >>> os.remove("Quality/temp.fastq")
+
+ """
+ def write_record(self, record):
+ """Write a single FASTQ record to the file."""
+ assert self._header_written
+ assert not self._footer_written
+ self._record_written = True
+
+ #TODO - Is an empty sequence allowed in FASTQ format?
+ assert SANGER_SCORE_OFFSET == ord("!")
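+ #Sanger style FASTQ files encode each PHRED quality as a single
+ #ASCII character using an offset of 33, so a quality of zero is
+ #the "!" character.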
+ #This rounds to the nearest integer:
+ qualities = "".join([chr(int(round(q+SANGER_SCORE_OFFSET,0))) for q \
+ in _get_phred_quality(record)])
+ if record.seq is None:
+ raise ValueError("No sequence for record %s" % record.id)
+ if len(qualities) != len(record) :
+ raise ValueError("Record %s has sequence length %i but %i quality scores" \
+ % (record.id, len(record), len(qualities)))
+
+ title = self.clean(record.id) #TODO - add the description too? cf Fasta output
+ self.handle.write("@%s\n%s\n+\n%s\n" % (title, record.seq, qualities))
+
+class QualPhredWriter(SequentialSequenceWriter):
+ """Class to write QUAL format files (using PHRED quality scores).
+
+ Although you can use this class directly, you are strongly encouraged
+ to use the Bio.SeqIO.write() function instead. For example, this code
+ reads in a FASTQ file and saves the quality scores into a QUAL file:
+
+ >>> from Bio import SeqIO
+ >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq")
+ >>> out_handle = open("Quality/temp.qual", "w")
+ >>> SeqIO.write(record_iterator, out_handle, "qual")
+ 3
+ >>> out_handle.close()
+
+ This code is also called if you use the .format("qual") method of a
+ SeqRecord.
+
+ P.S. Don't forget to clean up the temp file if you don't need it anymore:
+
+ >>> import os
+ >>> os.remove("Quality/temp.qual")
+ """
+ def __init__(self, handle, wrap=60, record2title=None):
+ """Create a QUAL writer.
+
+ Arguments:
+ - handle - Handle to an output file, e.g. as returned
+ by open(filename, "w")
+ - wrap - Optional line length used to wrap sequence lines.
+ Defaults to wrapping the sequence at 60 characters
+ Use zero (or None) for no wrapping, giving a single
+ long line for the sequence.
+ - record2title - Optional function to return the text to be
+ used for the title line of each record. By default
+ a combination of the record.id and record.description
+ is used. If the record.description starts with the
+ record.id, then just the record.description is used.
+
+ The record2title argument is present for consistency with the
+ Bio.SeqIO.FastaIO writer class.
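+
+ As a sketch (not run as a doctest), using the writer directly with
+ wrapping disabled might look like this, where records is assumed to
+ be a list of SeqRecord objects:
+
+ handle = open("unwrapped.qual", "w")
+ writer = QualPhredWriter(handle, wrap=None)
+ writer.write_file(records)
+ handle.close()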
+ """
+ SequentialSequenceWriter.__init__(self, handle)
+ #self.handle = handle
+ self.wrap = None
+ if wrap :
+ if wrap < 1 :
+ raise ValueError
+ self.wrap = wrap
+ self.record2title = record2title
+
+ def write_record(self, record):
+ """Write a single QUAL record to the file."""
+ assert self._header_written
+ assert not self._footer_written
+ self._record_written = True
+
+ if self.record2title :
+ title=self.clean(self.record2title(record))
+ else :
+ id = self.clean(record.id)
+ description = self.clean(record.description)
+
+ #if description[:len(id)]==id :
+ if description and description.split(None,1)[0]==id :
+ #The description includes the id at the start
+ title = description
+ else :
+ title = "%s %s" % (id, description)
+
+ assert "\n" not in title
+ assert "\r" not in title
+ self.handle.write(">%s\n" % title)
+
+ #This rounds to the nearest integer.
+ #TODO - can we put a float in a qual file?
+ qualities = [("%i" % round(q,0)) for q in _get_phred_quality(record)]
+
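+ #Greedily add scores to each line, keeping every line shorter
+ #than the requested wrap length (scores separated by spaces):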
+ if self.wrap :
+ while qualities :
+ line=qualities.pop(0)
+ while qualities \
+ and len(line) + 1 + len(qualities[0]) < self.wrap :
+ line += " " + qualities.pop(0)
+ self.handle.write(line + "\n")
+ else :
+ data = " ".join(qualities)
+ self.handle.write(data + "\n")
+
+class FastqSolexaWriter(SequentialSequenceWriter):
+ """Class to write FASTQ format files (using Solexa quality scores).
+
+ Although you can use this class directly, you are strongly encouraged
+ to use the Bio.SeqIO.write() function instead. For example, this code
+ reads in a FASTQ file and re-saves it as another FASTQ file:
+
+ >>> from Bio import SeqIO
+ >>> record_iterator = SeqIO.parse(open("Quality/solexa.fastq"), "fastq-solexa")
+ >>> out_handle = open("Quality/temp.fastq", "w")
+ >>> SeqIO.write(record_iterator, out_handle, "fastq-solexa")
+ 1
+ >>> out_handle.close()
+
+ You might want to do this if the original file included extra line
+ breaks, which (while valid) may not be supported by all tools. The
+ output file from Biopython will have each sequence on a single line, and
+ each quality string on a single line (which is considered desirable for
+ maximum compatibility).
+
+ This code is also called if you use the .format("fastq-solexa") method of
+ a SeqRecord.
+
+ P.S. Don't forget to delete the temp file if you don't need it anymore:
+
+ >>> import os
+ >>> os.remove("Quality/temp.fastq")
+ """
+ def write_record(self, record):
+ """Write a single FASTQ record to the file."""
+ assert self._header_written
+ assert not self._footer_written
+ self._record_written = True
+
+ #TODO - Is an empty sequence allowed in FASTQ format?
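+ #Solexa style FASTQ files encode each Solexa quality (which may
+ #be negative) as a single ASCII character using an offset of 64.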
+ qualities = "".join([chr(int(round(q+SOLEXA_SCORE_OFFSET,0))) for q \
+ in _get_solexa_quality(record)])
+ if record.seq is None:
+ raise ValueError("No sequence for record %s" % record.id)
+ if len(qualities) != len(record) :
+ raise ValueError("Record %s has sequence length %i but %i quality scores" \
+ % (record.id, len(record), len(qualities)))
+
+ title = self.clean(record.id) #TODO - add the description too? cf Fasta output
+ self.handle.write("@%s\n%s\n+\n%s\n" % (title, record.seq, qualities))
+
+def PairedFastaQualIterator(fasta_handle, qual_handle, alphabet = single_letter_alphabet, title2ids = None) :
+ """Iterate over matched FASTA and QUAL files as SeqRecord objects.
+
+ For example, consider this short QUAL file::
+
+ >EAS54_6_R1_2_1_413_324
+ 26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
+ 26 26 26 23 23
+ >EAS54_6_R1_2_1_540_792
+ 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
+ 26 18 26 23 18
+ >EAS54_6_R1_2_1_443_348
+ 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+ 24 18 18 18 18
+
+ And a matching FASTA file::
+
+ >EAS54_6_R1_2_1_413_324
+ CCCTTCTTGTCTTCAGCGTTTCTCC
+ >EAS54_6_R1_2_1_540_792
+ TTGGCAGGCCAAGGCCGATGGATCA
+ >EAS54_6_R1_2_1_443_348
+ GTTGCTTCTGGCGTGGGTGGGGGGG
+
+ You can parse these separately using Bio.SeqIO with the "qual" and
+ "fasta" formats, but then you'll get a group of SeqRecord objects with
+ no sequence, and a matching group with the sequence but not the
+ qualities. Because it only deals with one input file handle, Bio.SeqIO
+ can't be used to read the two files together - but this function can!
+ For example,
+
+ >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
+ ... open("Quality/example.qual", "rU"))
+ >>> for record in rec_iter :
+ ... print record.id, record.seq
+ EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+ EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+ EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+ As with the FASTQ or QUAL parsers, if you want to look at the qualities,
+ they are in each record's per-letter-annotation dictionary as a simple
+ list of integers:
+
+ >>> print record.letter_annotations["phred_quality"]
+ [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+ If you have access to data as a FASTQ format file, using that directly
+ would be simpler and more straightforward. Note that you can easily use
+ this function to convert paired FASTA and QUAL files into FASTQ files:
+
+ >>> from Bio import SeqIO
+ >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
+ ... open("Quality/example.qual", "rU"))
+ >>> out_handle = open("Quality/temp.fastq", "w")
+ >>> SeqIO.write(rec_iter, out_handle, "fastq")
+ 3
+ >>> out_handle.close()
+
+ And don't forget to clean up the temp file if you don't need it anymore:
+
+ >>> import os
+ >>> os.remove("Quality/temp.fastq")
+ """
+ from Bio.SeqIO.FastaIO import FastaIterator
+ fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet, \
+ title2ids=title2ids)
+ qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet, \
+ title2ids=title2ids)
+
+ #Using zip(...) would create a list loading everything into memory!
+ #It would also not catch any extra records found in only one file.
+ while True :
+ try :
+ f_rec = fasta_iter.next()
+ except StopIteration :
+ f_rec = None
+ try :
+ q_rec = qual_iter.next()
+ except StopIteration :
+ q_rec = None
+ if f_rec is None and q_rec is None :
+ #End of both files
+ break
+ if f_rec is None :
+ raise ValueError("FASTA file has more entries than the QUAL file.")
+ if q_rec is None :
+ raise ValueError("QUAL file has more entries than the FASTA file.")
+ if f_rec.id != q_rec.id :
+ raise ValueError("FASTA and QUAL entries do not match (%s vs %s)." \
+ % (f_rec.id, q_rec.id))
+ if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]) :
+ raise ValueError("Sequence length and number of quality scores disagree for %s" \
+ % f_rec.id)
+ #Merge the data....
+ f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations["phred_quality"]
+ yield f_rec
+ #Done
+
+
+def _test():
+ """Run the Bio.SeqIO module's doctests.
+
+ This will try and locate the unit tests directory, and run the doctests
+ from there in order that the relative paths used in the examples work.
+ """
+ import doctest
+ import os
+ if os.path.isdir(os.path.join("..","..","Tests")) :
+ print "Runing doctests..."
+ cur_dir = os.path.abspath(os.curdir)
+ os.chdir(os.path.join("..","..","Tests"))
+ assert os.path.isfile("Quality/example.fastq")
+ assert os.path.isfile("Quality/example.fasta")
+ assert os.path.isfile("Quality/example.qual")
+ assert os.path.isfile("Quality/tricky.fastq")
+ assert os.path.isfile("Quality/solexa.fastq")
+ doctest.testmod()
+ os.chdir(cur_dir)
+ del cur_dir
+ print "Done"
+
+if __name__ == "__main__" :
+ _test()
+
--- /dev/null
+# Copyright 2006 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.SwissProt module which offers more than just accessing
+the sequences as SeqRecord objects."""
+
+from Bio.SwissProt import SProt
+import cStringIO
+
+#This is a generator function!
+def SwissIterator(handle) :
+ """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
+
+ Every section from the ID line to the terminating // becomes
+ a single SeqRecord with associated annotation and features.
+
+ This parser is for the flat file "swiss" format as used by:
+ * Swiss-Prot aka SwissProt
+ * TrEMBL
+ * UniProtKB aka UniProt Knowledgebase
+
+ It does NOT read their new XML file format.
+ http://www.expasy.org/sprot/
+
+ For consistency with BioPerl and EMBOSS we call this the "swiss"
+ format.
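+
+ Typical usage is via Bio.SeqIO, e.g. as a sketch (with a
+ hypothetical filename):
+
+ from Bio import SeqIO
+ handle = open("uniprot_sample.dat")
+ for record in SeqIO.parse(handle, "swiss") :
+ print record.id, record.description
+ handle.close()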
+ """
+ parser = SProt.SequenceParser()
+ lines = []
+ for line in handle:
+ lines.append(line)
+ if line[:2]=='//':
+ handle = cStringIO.StringIO("".join(lines))
+ record = parser.parse(handle)
+ lines = []
+ yield record
+ #If there are more lines, it could only be a partial record.
+ #Should we try and parse them anyway?
+
+
+if __name__ == "__main__" :
+ print "Quick self test..."
+
+ example_filename = "../../Tests/SwissProt/sp008"
+
+ import os
+ if not os.path.isfile(example_filename):
+ print "Missing test file %s" % example_filename
+ else :
+ #Try parsing it!
+ handle = open(example_filename)
+ records = SwissIterator(handle)
+ for record in records:
+ print record.name
+ print record.id
+ print record.annotations['keywords']
+ print repr(record.annotations['organism'])
+ print record.seq.tostring()[:20] + "..."
+ handle.close()
--- /dev/null
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "tab" (simple tab separated) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+
+The "tab" format is an ad-hoc plain text file format where each sequence is
+on one (long) line. Each line contains the identifier/description, followed
+by a tab, followed by the sequence. For example, consider the following
+short FASTA format file:
+
+>ID123456 possible binding site?
+CATCNAGATGACACTACGACTACGACTCAGACTAC
+>ID123457 random sequence
+ACACTACGACTACGACTCAGACTACAAN
+
+Apart from the descriptions, this can be represented in the simple two column
+tab separated format as follows:
+
+ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
+ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
+
+When reading this file, "ID123456" or "ID123457" will be taken as the record's
+.id and .name properties. There is no other information to record.
+
+Similarly, when writing to this format, Biopython will ONLY record the record's
+.id and .seq (and not the description or any other information) as in the example
+above.
+"""
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Interfaces import SequentialSequenceWriter
+
+#This is a generator function!
+def TabIterator(handle, alphabet = single_letter_alphabet) :
+ """Iterates over tab separated lines (as SeqRecord objects).
+
+ Each line of the file should contain one tab only, dividing the line
+ into an identifier and the full sequence.
+
+ handle - input file
+ alphabet - optional alphabet
+
+ The first field is taken as the record's .id and .name (regardless of
+ any spaces within the text) and the second field is the sequence.
+
+ Any blank lines are ignored.
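+
+ For example, as a sketch (assuming the two column data shown in the
+ module docstring was saved as "example.tab"):
+
+ from Bio import SeqIO
+ for record in SeqIO.parse(open("example.tab"), "tab") :
+ print record.id, record.seq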
+ """
+ for line in handle :
+ try :
+ title, seq = line.split("\t") #will fail if more than one tab!
+ except ValueError :
+ if line.strip() == "" :
+ #It's a blank line, ignore it
+ continue
+ raise ValueError("Each line should have one tab separating the" + \
+ " title and sequence, this line has %i tabs: %s" \
+ % (line.count("\t"), repr(line)))
+ title = title.strip()
+ seq = seq.strip() #removes the trailing new line
+ yield SeqRecord(Seq(seq, alphabet), id = title, name = title)
+
+class TabWriter(SequentialSequenceWriter):
+ """Class to write simple tab separated format files.
+
+ Each line consists of "id(tab)sequence" only.
+
+ Any description, name or other annotation is not recorded.
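+
+ For example, as a sketch (where records is assumed to be a list of
+ SeqRecord objects already in memory):
+
+ from Bio import SeqIO
+ handle = open("example.tab", "w")
+ SeqIO.write(records, handle, "tab")
+ handle.close()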
+ """
+ def write_record(self, record):
+ """Write a single tab line to the file."""
+ assert self._header_written
+ assert not self._footer_written
+ self._record_written = True
+
+ title = self.clean(record.id)
+ seq = self._get_seq_string(record) #Catches sequence being None
+ assert "\t" not in title
+ assert "\n" not in title
+ assert "\r" not in title
+ assert "\t" not in seq
+ assert "\n" not in seq
+ assert "\r" not in seq
+ self.handle.write("%s\t%s\n" % (title, seq))
+
+
+if __name__ == "__main__" :
+ print "Running quick self test"
+ from StringIO import StringIO
+
+ #This example has a trailing blank line which should be ignored
+ handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n")
+ records = list(TabIterator(handle))
+ assert len(records) == 2
+
+ handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n")
+ try :
+ records = list(TabIterator(handle))
+ assert False, "Should have reject this invalid example!"
+ except ValueError :
+ #Good!
+ pass
+
+ print "Done"
--- /dev/null
+# Copyright 2006-2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+#Nice link:
+# http://www.ebi.ac.uk/help/formats_frame.html
+
+"""Sequence input/output as SeqRecord objects.
+
+Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
+a whole chapter in our tutorial:
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
+
+Input
+=====
+The main function is Bio.SeqIO.parse(...) which takes an input file handle,
+and format string. This returns an iterator giving SeqRecord objects:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Fasta/f002", "rU")
+ >>> for record in SeqIO.parse(handle, "fasta") :
+ ... print record.id, len(record)
+ gi|1348912|gb|G26680|G26680 633
+ gi|1348917|gb|G26685|G26685 413
+ gi|1592936|gb|G29385|G29385 471
+ >>> handle.close()
+
+Note that the parse() function will invoke the relevant parser for the
+format with its default settings. You may want more control, in which case
+you need to create a format specific sequence iterator directly.
+
+For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
+using a sequence iterator can save you a lot of memory (RAM). There is
+less benefit for interlaced file formats (e.g. most multiple alignment file
+formats). However, an iterator only lets you access the records one by one.
+
+If you want random access to the records by number, turn this into a list:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Fasta/f002", "rU")
+ >>> records = list(SeqIO.parse(handle, "fasta"))
+ >>> handle.close()
+ >>> print records[1].id
+ gi|1348917|gb|G26685|G26685
+
+If you want random access to the records by a key such as the record id,
+turn the iterator into a dictionary:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Fasta/f002", "rU")
+ >>> record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
+ >>> handle.close()
+ >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
+ 413
+
+If you expect your file to contain one-and-only-one record, then we provide
+the following 'helper' function which will return a single SeqRecord, or
+raise an exception if there are no records or more than one record:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Fasta/f001", "rU")
+ >>> record = SeqIO.read(handle, "fasta")
+ >>> handle.close()
+ >>> print record.id, len(record)
+ gi|3318709|pdb|1A91| 79
+
+This style is useful when you expect a single record only (and would
+consider multiple records an error). For example, when dealing with GenBank
+files for bacterial genomes or chromosomes, there is normally only a single
+record. Alternatively, use this with a handle when downloading a single
+record from the internet.
+
+However, if you just want the first record from a file containing multiple
+records, use the iterator's next() method:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Fasta/f002", "rU")
+ >>> record = SeqIO.parse(handle, "fasta").next()
+ >>> handle.close()
+ >>> print record.id, len(record)
+ gi|1348912|gb|G26680|G26680 633
+
+The above code will work as long as the file contains at least one record.
+Note that if there is more than one record, the remaining records will be
+silently ignored.
+
+Input - Alignments
+==================
+You can read in alignment files as Alignment objects using Bio.AlignIO.
+Alternatively, reading in an alignment file format via Bio.SeqIO will give
+you a SeqRecord for each row of each alignment:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Clustalw/hedgehog.aln", "rU")
+ >>> for record in SeqIO.parse(handle, "clustal") :
+ ... print record.id, len(record)
+ gi|167877390|gb|EDS40773.1| 447
+ gi|167234445|ref|NP_001107837. 447
+ gi|74100009|gb|AAZ99217.1| 447
+ gi|13990994|dbj|BAA33523.2| 447
+ gi|56122354|gb|AAV74328.1| 447
+ >>> handle.close()
+
+Output
+======
+Use the function Bio.SeqIO.write(...), which takes a complete set of
+SeqRecord objects (either as a list, or an iterator), an output file handle
+and of course the file format::
+
+ from Bio import SeqIO
+ records = ...
+ handle = open("example.faa", "w")
+ SeqIO.write(records, handle, "fasta")
+ handle.close()
+
+In general, you are expected to call this function once (with all your
+records) and then close the file handle.
+
+Output - Advanced
+=================
+The effect of calling write() multiple times on a single file will vary
+depending on the file format, and is best avoided unless you have a strong
+reason to do so.
+
+Trying this for certain alignment formats (e.g. phylip, clustal, stockholm)
+would have the effect of concatenating several multiple sequence alignments
+together. Such files are created by the PHYLIP suite of programs for
+bootstrap analysis.
+
+For sequential file formats (e.g. fasta, genbank) each "record block" holds
+a single sequence. For these files it would probably be safe to call
+write() multiple times.
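+
+As a sketch (where first_batch and second_batch are assumed to be lists
+of SeqRecord objects), appending records to one FASTA file would be::
+
+ from Bio import SeqIO
+ handle = open("example.faa", "w")
+ SeqIO.write(first_batch, handle, "fasta")
+ SeqIO.write(second_batch, handle, "fasta")
+ handle.close()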
+
+File Formats
+============
+When specifying the file format, use lowercase strings. The same format
+names are also used in Bio.AlignIO and include the following:
+
+ - ace - Reads the contig sequences from an ACE assembly file.
+ - embl - The EMBL flat file format. Uses Bio.GenBank internally.
+ - fasta - The generic sequence file format where each record starts with
+ an identifier line starting with a ">" character, followed by
+ lines of sequence.
+ - fastq - A "FASTA like" format used by Sanger which also stores PHRED
+ sequence quality values.
+ - fastq-solexa - The Solexa/Illumina variant of the Sanger FASTQ format which
+ encodes Solexa quality scores (not PHRED quality scores).
+ - genbank - The GenBank or GenPept flat file format.
+ - gb - An alias for "genbank", for consistency with NCBI Entrez Utilities.
+ - ig - The IntelliGenetics file format, apparently the same as the
+ MASE alignment format.
+ - phd - Output from PHRED, used by PHRAP and CONSED for input.
+ - pir - A "FASTA like" format introduced by the National Biomedical
+ Research Foundation (NBRF) for the Protein Information Resource
+ (PIR) database, now part of UniProt.
+ - swiss - Plain text Swiss-Prot aka UniProt format.
+ - tab - Simple two column tab separated sequence files, where each
+ line holds a record's identifier and sequence. For example,
+ this is used by Agilent's eArray software when saving
+ microarray probes in a minimal tab delimited text file.
+ - qual - A "FASTA like" format holding PHRED quality values from
+ sequencing DNA, but no actual sequences (usually provided
+ in separate FASTA files).
+
+Note that while Bio.SeqIO can read all the above file formats, it cannot
+write to all of them.
+
+You can also use any file format supported by Bio.AlignIO, such as "nexus",
+"phlip" and "stockholm", which gives you access to the individual sequences
+making up each alignment as SeqRecords.
+"""
+__docformat__ = "epytext en" #not just plaintext
+
+#TODO
+# - define policy on reading aligned sequences with gaps in
+# (e.g. - and . characters) including how the alphabet interacts
+#
+# - Can we build the to_alignment(...) functionality
+# into the generic Alignment class instead?
+#
+# - How best to handle unique/non unique record.id when writing.
+# For most file formats reading such files is fine; the stockholm
+# parser would fail.
+#
+# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
+# http://www.bioperl.org/wiki/MSF_multiple_alignment_format
+
+"""
+FAO BioPython Developers
+========================
+The way I envision this SeqIO system working is that for any sequence file
+format we have an iterator that returns SeqRecord objects.
+
+This also applies to interlaced file formats (like clustal - although that
+is now handled via Bio.AlignIO instead) where the file cannot be read record
+by record. You should still return an iterator, even if the implementation
+could just as easily return a list.
+
+These file format specific sequence iterators may be implemented as:
+* Classes which take a handle for __init__ and provide the __iter__ method
+* Functions that take a handle, and return an iterator object
+* Generator functions that take a handle, and yield SeqRecord objects
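+
+For example, a minimal generator function based iterator might look like
+this (just a sketch, for a hypothetical format with one raw sequence per
+line):
+
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+
+ def SimpleIterator(handle) :
+ for index, line in enumerate(handle) :
+ yield SeqRecord(Seq(line.strip()), id="record_%i" % (index+1))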
+
+It is then trivial to turn this iterator into a list of SeqRecord objects,
+an in memory dictionary, or a multiple sequence alignment object.
+
+For building the dictionary by default the id property of each SeqRecord is
+used as the key. You should always populate the id property, and it should
+be unique in most cases. For some file formats the accession number is a good
+choice. If the file itself contains ambiguous identifiers, don't try to
+disambiguate them - return them as is.
+
+When adding a new file format, please use the same lower case format name
+as BioPerl, or if they have not defined one, try the names used by EMBOSS.
+
+See also http://biopython.org/wiki/SeqIO_dev
+
+--Peter
+"""
+
+import os
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Align.Generic import Alignment
+from Bio.Alphabet import Alphabet, AlphabetEncoder, Gapped, _get_base_alphabet
+
+import AceIO
+import FastaIO
+import IgIO #IntelliGenetics or MASE format
+import InsdcIO #EMBL and GenBank
+import PhdIO
+import PirIO
+import SwissIO
+import TabIO
+import QualityIO #FastQ and qual files
+
+
+#Convention for format names is "mainname-subtype" in lower case.
+#Please use the same names as BioPerl where possible.
+#
+#Note that this simple system copes with defining
+#multiple possible iterators for a given format/extension
+#with the -subtype suffix
+#
+#Most alignment file formats will be handled via Bio.AlignIO
+
+_FormatToIterator ={"fasta" : FastaIO.FastaIterator,
+ "gb" : InsdcIO.GenBankIterator,
+ "genbank" : InsdcIO.GenBankIterator,
+ "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
+ "embl" : InsdcIO.EmblIterator,
+ "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
+ "ig" : IgIO.IgIterator,
+ "swiss" : SwissIO.SwissIterator,
+ "phd" : PhdIO.PhdIterator,
+ "ace" : AceIO.AceIterator,
+ "tab" : TabIO.TabIterator,
+ "pir" : PirIO.PirIterator,
+ "fastq" : QualityIO.FastqPhredIterator,
+ "fastq-solexa" : QualityIO.FastqSolexaIterator,
+ "qual" : QualityIO.QualPhredIterator,
+ }
+
+_FormatToWriter ={"fasta" : FastaIO.FastaWriter,
+ "gb" : InsdcIO.GenBankWriter,
+ "genbank" : InsdcIO.GenBankWriter,
+ "tab" : TabIO.TabWriter,
+ "fastq" : QualityIO.FastqPhredWriter,
+ "fastq-solexa" : QualityIO.FastqSolexaWriter,
+ "qual" : QualityIO.QualPhredWriter,
+ }
+
+def write(sequences, handle, format) :
+ """Write complete set of sequences to a file.
+
+ - sequences - A list (or iterator) of SeqRecord objects.
+ - handle - File handle object to write to.
+ - format - lower case string describing the file format to write.
+
+ You should close the handle after calling this function.
+
+ Returns the number of records written (as an integer).
+ """
+ from Bio import AlignIO
+
+ #Try and give helpful error messages:
+ if isinstance(handle, basestring) :
+ raise TypeError("Need a file handle, not a string (i.e. not a filename)")
+ if not isinstance(format, basestring) :
+ raise TypeError("Need a string for the file format (lower case)")
+ if not format :
+ raise ValueError("Format required (lower case string)")
+ if format != format.lower() :
+ raise ValueError("Format string '%s' should be lower case" % format)
+ if isinstance(sequences,SeqRecord):
+ raise ValueError("Use a SeqRecord list/iterator, not just a single SeqRecord")
+
+ #Map the file format to a writer class
+ if format in _FormatToWriter :
+ writer_class = _FormatToWriter[format]
+ count = writer_class(handle).write_file(sequences)
+ elif format in AlignIO._FormatToWriter :
+ #Try and turn all the records into a single alignment,
+ #and write that using Bio.AlignIO
+ alignment = to_alignment(sequences)
+ alignment_count = AlignIO.write([alignment], handle, format)
+ assert alignment_count == 1, "Internal error - the underlying writer " \
+ + " should have returned 1, not %s" % repr(alignment_count)
+ count = len(alignment.get_all_seqs())
+ del alignment_count, alignment
+ elif format in _FormatToIterator or format in AlignIO._FormatToIterator :
+ raise ValueError("Reading format '%s' is supported, but not writing" \
+ % format)
+ else :
+ raise ValueError("Unknown format '%s'" % format)
+
+ assert isinstance(count, int), "Internal error - the underlying writer " \
+ + " should have returned the record count, not %s" % repr(count)
+ return count
+
+def parse(handle, format, alphabet=None) :
+ r"""Turns a sequence file into an iterator returning SeqRecords.
+
+ - handle - handle to the file.
+ - format - lower case string describing the file format.
+ - alphabet - optional Alphabet object, useful when the sequence type
+ cannot be automatically inferred from the file itself
+ (e.g. format="fasta" or "tab")
+
+ Typical usage, opening a file to read in, and looping over the record(s):
+
+ >>> from Bio import SeqIO
+ >>> filename = "Nucleic/sweetpea.nu"
+ >>> for record in SeqIO.parse(open(filename,"rU"), "fasta") :
+ ... print "ID", record.id
+ ... print "Sequence length", len(record)
+ ... print "Sequence alphabet", record.seq.alphabet
+ ID gi|3176602|gb|U78617.1|LOU78617
+ Sequence length 309
+ Sequence alphabet SingleLetterAlphabet()
+
+ For file formats like FASTA where the alphabet cannot be determined, it
+ may be useful to specify the alphabet explicitly:
+
+ >>> from Bio import SeqIO
+ >>> from Bio.Alphabet import generic_dna
+ >>> filename = "Nucleic/sweetpea.nu"
+ >>> for record in SeqIO.parse(open(filename,"rU"), "fasta", generic_dna) :
+ ... print "ID", record.id
+ ... print "Sequence length", len(record)
+ ... print "Sequence alphabet", record.seq.alphabet
+ ID gi|3176602|gb|U78617.1|LOU78617
+ Sequence length 309
+ Sequence alphabet DNAAlphabet()
+
+ If you have a string 'data' containing the file contents, you must
+ first turn this into a handle in order to parse it:
+
+ >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
+ >>> from Bio import SeqIO
+ >>> from StringIO import StringIO
+ >>> for record in SeqIO.parse(StringIO(data), "fasta") :
+ ... print record.id, record.seq
+ Alpha ACCGGATGTA
+ Beta AGGCTCGGTTA
+
+ Use the Bio.SeqIO.read(handle, format) function when you expect a single
+ record only.
+ """
+ #NOTE - The above docstring has some raw \n characters needed
+ #for the StringIO example, hence the whole docstring is a raw
+ #string (see the leading r before the opening quote).
+ from Bio import AlignIO
+
+ #Try and give helpful error messages:
+ if isinstance(handle, basestring) :
+ raise TypeError("Need a file handle, not a string (i.e. not a filename)")
+ if not isinstance(format, basestring) :
+ raise TypeError("Need a string for the file format (lower case)")
+ if not format :
+ raise ValueError("Format required (lower case string)")
+ if format != format.lower() :
+ raise ValueError("Format string '%s' should be lower case" % format)
+ if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
+ isinstance(alphabet, AlphabetEncoder)) :
+ raise ValueError("Invalid alphabet, %s" % repr(alphabet))
+
+ #Map the file format to a sequence iterator:
+ if format in _FormatToIterator :
+ iterator_generator = _FormatToIterator[format]
+ if alphabet is None :
+ return iterator_generator(handle)
+ try :
+ return iterator_generator(handle, alphabet=alphabet)
+ except TypeError :
+ #This iterator does not take an alphabet argument, so
+ #instead over-ride the alphabet on each record afterwards:
+ return _force_alphabet(iterator_generator(handle), alphabet)
+ elif format in AlignIO._FormatToIterator :
+ #Use Bio.AlignIO to read in the alignments
+ #TODO - Once we drop support for Python 2.3, this helper function can be
+ #replaced with a generator expression.
+ return _iterate_via_AlignIO(handle, format, alphabet)
+ else :
+ raise ValueError("Unknown format '%s'" % format)
+
+#This is a generator function
+def _iterate_via_AlignIO(handle, format, alphabet) :
+ """Iterate over all records in several alignments (PRIVATE)."""
+ from Bio import AlignIO
+ for align in AlignIO.parse(handle, format, alphabet=alphabet) :
+ for record in align :
+ yield record
+
+def _force_alphabet(record_iterator, alphabet) :
+ """Iterate over records, over-riding the alphabet (PRIVATE)."""
+ #Assume the alphabet argument has been pre-validated
+ given_base_class = _get_base_alphabet(alphabet).__class__
+ for record in record_iterator :
+ if isinstance(_get_base_alphabet(record.seq.alphabet),
+ given_base_class) :
+ record.seq.alphabet = alphabet
+ yield record
+ else :
+ raise ValueError("Specified alphabet %s clashes with "\
+ "that determined from the file, %s" \
+ % (repr(alphabet), repr(record.seq.alphabet)))
+
+def read(handle, format, alphabet=None) :
+ """Turns a sequence file into a single SeqRecord.
+
+ - handle - handle to the file.
+ - format - string describing the file format.
+ - alphabet - optional Alphabet object, useful when the sequence type
+ cannot be automatically inferred from the file itself
+ (e.g. format="fasta" or "tab")
+
+ This function is for use parsing sequence files containing
+ exactly one record. For example, reading a GenBank file:
+
+ >>> from Bio import SeqIO
+ >>> record = SeqIO.read(open("GenBank/arab1.gb", "rU"), "genbank")
+ >>> print "ID", record.id
+ ID AC007323.5
+ >>> print "Sequence length", len(record)
+ Sequence length 86436
+ >>> print "Sequence alphabet", record.seq.alphabet
+ Sequence alphabet IUPACAmbiguousDNA()
+
+ If the handle contains no records, or more than one record,
+ an exception is raised. For example:
+
+ >>> from Bio import SeqIO
+ >>> record = SeqIO.read(open("GenBank/cor6_6.gb", "rU"), "genbank")
+ Traceback (most recent call last):
+ ...
+ ValueError: More than one record found in handle
+
+ However, if you want the first record from a file containing
+ multiple records, this function would raise an exception (as
+ shown in the example above). Instead use:
+
+ >>> from Bio import SeqIO
+ >>> record = SeqIO.parse(open("GenBank/cor6_6.gb", "rU"), "genbank").next()
+ >>> print "First record's ID", record.id
+ First record's ID X55053.1
+
+ Use the Bio.SeqIO.parse(handle, format) function if you want
+ to read multiple records from the handle.
+ """
+ iterator = parse(handle, format, alphabet)
+ try :
+ first = iterator.next()
+ except StopIteration :
+ first = None
+ if first is None :
+ raise ValueError("No records found in handle")
+ try :
+ second = iterator.next()
+ except StopIteration :
+ second = None
+ if second is not None :
+ raise ValueError("More than one record found in handle")
+ return first
+
+def to_dict(sequences, key_function=None) :
+ """Turns a sequence iterator or list into a dictionary.
+
+ - sequences - An iterator that returns SeqRecord objects,
+ or simply a list of SeqRecord objects.
+ - key_function - Optional function which when given a SeqRecord
+ returns a unique string for the dictionary key.
+
+ e.g. key_function = lambda rec : rec.name
+ or, key_function = lambda rec : rec.description.split()[0]
+
+ If key_function is omitted then record.id is used, on the
+ assumption that the records objects returned are SeqRecords
+ with a unique id field.
+
+ If there are duplicate keys, an error is raised.
+
+ Example usage, defaulting to using the record.id as key:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("GenBank/cor6_6.gb", "rU")
+ >>> format = "genbank"
+ >>> id_dict = SeqIO.to_dict(SeqIO.parse(handle, format))
+ >>> print id_dict.keys()
+ ['L31939.1', 'AJ237582.1', 'X62281.1', 'AF297471.1', 'X55053.1', 'M81224.1']
+ >>> print id_dict["L31939.1"].description
+ Brassica rapa (clone bif72) kin mRNA, complete cds.
+
+ A more complex example, using the key_function argument in order to use
+ a sequence checksum as the dictionary key:
+
+ >>> from Bio import SeqIO
+ >>> from Bio.SeqUtils.CheckSum import seguid
+ >>> handle = open("GenBank/cor6_6.gb", "rU")
+ >>> format = "genbank"
+ >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(handle, format),
+ ... key_function = lambda rec : seguid(rec.seq))
+ >>> for key, record in seguid_dict.iteritems() :
+ ... print key, record.id
+ SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
+ l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
+ /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
+ TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
+ uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
+ BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
+ """
+ if key_function is None :
+ key_function = lambda rec : rec.id
+
+ d = dict()
+ for record in sequences :
+ key = key_function(record)
+ if key in d :
+ raise ValueError("Duplicate key '%s'" % key)
+ d[key] = record
+ return d
+
+
+def to_alignment(sequences, alphabet=None, strict=True) :
+ """Returns a multiple sequence alignment (OBSOLETE).
+
+ - sequences - An iterator that returns SeqRecord objects,
+ or simply a list of SeqRecord objects. All
+ the record sequences must be the same length.
+ - alphabet - Optional alphabet. Strongly recommended.
+ - strict - Optional, defaults to True. Should error checking
+ be done?
+
+ Using this function is now discouraged. Rather than doing this:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Clustalw/protein.aln")
+ >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
+ >>> handle.close()
+
+ You are now encouraged to use Bio.AlignIO instead, e.g.
+
+ >>> from Bio import AlignIO
+ >>> handle = open("Clustalw/protein.aln")
+ >>> alignment = AlignIO.read(handle, "clustal")
+ >>> handle.close()
+ """
+ #TODO - Move this functionality into the Alignment class instead?
+ from Bio.Alphabet import generic_alphabet
+ from Bio.Alphabet import _consensus_alphabet
+ if alphabet is None :
+ sequences = list(sequences)
+ alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences \
+ if rec.seq is not None])
+
+ if not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)) :
+ raise ValueError("Invalid alphabet")
+
+ alignment_length = None
+ alignment = Alignment(alphabet)
+ for record in sequences :
+ if strict :
+ if alignment_length is None :
+ alignment_length = len(record.seq)
+ elif alignment_length != len(record.seq) :
+ raise ValueError("Sequences must all be the same length")
+
+ assert isinstance(record.seq.alphabet, Alphabet) \
+ or isinstance(record.seq.alphabet, AlphabetEncoder), \
+ "Sequence does not have a valid alphabet"
+
+ #TODO - Move this alphabet comparison code into the Alphabet module/class?
+ #TODO - Is a normal alphabet "ungapped" by default, or does it just mean
+ #undecided?
+ if isinstance(record.seq.alphabet, Alphabet) \
+ and isinstance(alphabet, Alphabet) :
+ #Comparing two non-gapped alphabets
+ if not isinstance(record.seq.alphabet, alphabet.__class__) :
+ raise ValueError("Incompatible sequence alphabet " \
+ + "%s for %s alignment" \
+ % (record.seq.alphabet, alphabet))
+ elif isinstance(record.seq.alphabet, AlphabetEncoder) \
+ and isinstance(alphabet, Alphabet) :
+ raise ValueError("Sequence has a gapped alphabet, alignment does not")
+ elif isinstance(record.seq.alphabet, Alphabet) \
+ and isinstance(alphabet, Gapped) :
+ #Sequence isn't gapped, alignment is.
+ if not isinstance(record.seq.alphabet, alphabet.alphabet.__class__) :
+ raise ValueError("Incompatible sequence alphabet " \
+ + "%s for %s alignment" \
+ % (record.seq.alphabet, alphabet))
+ else :
+ #Comparing two gapped alphabets
+ if not isinstance(record.seq.alphabet, alphabet.__class__) :
+ raise ValueError("Incompatible sequence alphabet " \
+ + "%s for %s alignment" \
+ % (record.seq.alphabet, alphabet))
+ if record.seq.alphabet.gap_char != alphabet.gap_char :
+ raise ValueError("Sequence gap characters != alignment gap char")
+ #ToDo, additional checks on the specified alignment...
+ #Should we look at the alphabet.contains() method?
+ if record.seq is None :
+ raise TypeError("SeqRecord (id=%s) has None for its sequence." % record.id)
+
+ #This is abusing the "private" records list,
+ #we should really have a method like add_sequence
+ #but which takes SeqRecord objects. See also Bug 1944
+ alignment._records.append(record)
+ return alignment
+
+def _test():
+ """Run the Bio.SeqIO module's doctests.
+
+ This will try and locate the unit tests directory, and run the doctests
+ from there in order that the relative paths used in the examples work.
+ """
+ import doctest
+ import os
+ if os.path.isdir(os.path.join("..","..","Tests")) :
+ print "Runing doctests..."
+ cur_dir = os.path.abspath(os.curdir)
+ os.chdir(os.path.join("..","..","Tests"))
+ doctest.testmod()
+ os.chdir(cur_dir)
+ del cur_dir
+ print "Done"
+
+if __name__ == "__main__":
+ #Run the doctests
+ _test()
--- /dev/null
+# Copyright 2000-2002 Andrew Dalke.
+# Copyright 2002-2004 Brad Chapman.
+# Copyright 2006-2009 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Represent a Sequence Record, a sequence with annotation."""
+__docformat__ = "epytext en" #Simple markup to show doctests nicely
+
+# NEEDS TO BE KEPT IN SYNC WITH THE REST OF BIOPYTHON AND BIOPERL
+# In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
+# need to be in sync (this is the BioSQL "Database SeqRecord", see
+# also BioSQL.BioSeq.DBSeq which is the "Database Seq" class)
+
+class _RestrictedDict(dict):
+ """Dict which only allows sequences of given length as values (PRIVATE).
+
+ This simple subclass of the python dictionary is used in the SeqRecord
+ object for holding per-letter-annotations. This class is intended to
+ prevent simple errors by only allowing python sequences (e.g. lists,
+ strings and tuples) to be stored, and only if their length matches that
+ expected (the length of the SeqRecord's seq object). It cannot however
+ prevent the entries being edited in situ (for example appending entries
+ to a list).
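+
+ As a quick sketch of the intended behaviour:
+
+ d = _RestrictedDict(length=3)
+ d["one"] = "abc" #Fine, the length matches
+ d["two"] = [1, 2] #Raises TypeError, wrong length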
+ """
+ def __init__(self, length) :
+ """Create an EMPTY restricted dictionary."""
+ dict.__init__(self)
+ self._length = int(length)
+ def __setitem__(self, key, value) :
+ if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
+ or len(value) != self._length :
+ raise TypeError("We only allow python sequences (lists, tuples or "
+ "strings) of length %i." % self._length)
+ dict.__setitem__(self, key, value)
+
+class SeqRecord(object):
+ """A SeqRecord object holds a sequence and information about it.
+
+ Main attributes:
+ - id - Identifier such as a locus tag (string)
+ - seq - The sequence itself (Seq object)
+
+ Additional attributes:
+ - name - Sequence name, e.g. gene name (string)
+ - description - Additional text (string)
+ - dbxrefs - List of database cross references (list of strings)
+ - features - Any (sub)features defined (list of SeqFeature objects)
+ - annotations - Further information about the whole sequence (dictionary)
+ Most entries are lists of strings.
+ - letter_annotations - Per letter/symbol annotation (restricted
+ dictionary). This holds python sequences (lists, strings
+ or tuples) whose length matches that of the sequence.
+ A typical use would be to hold a list of integers
+ representing sequencing quality scores, or a string
+ representing the secondary structure.
+
+ You will typically use Bio.SeqIO to read in sequences from files as
+ SeqRecord objects. However, you may want to create your own SeqRecord
+ objects directly (see the __init__ method for further details):
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Alphabet import IUPAC
+ >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+ ... IUPAC.protein),
+ ... id="YP_025292.1", name="HokC",
+ ... description="toxic membrane protein")
+ >>> print record
+ ID: YP_025292.1
+ Name: HokC
+ Description: toxic membrane protein
+ Number of features: 0
+ Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+ If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
+ for this. For the special case where you want the SeqRecord turned into
+ a string in a particular file format there is a format method which uses
+ Bio.SeqIO internally:
+
+ >>> print record.format("fasta")
+ >YP_025292.1 toxic membrane protein
+ MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+ <BLANKLINE>
+ """
+ def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
+ description = "<unknown description>", dbxrefs = None,
+ features = None):
+ """Create a SeqRecord.
+
+ Arguments:
+ - seq - Sequence, required (Seq or Mutable object)
+ - id - Sequence identifier, recommended (string)
+ - name - Sequence name, optional (string)
+ - description - Sequence description, optional (string)
+ - dbxrefs - Database cross references, optional (list of strings)
+ - features - Any (sub)features, optional (list of SeqFeature objects)
+
+ You will typically use Bio.SeqIO to read in sequences from files as
+ SeqRecord objects. However, you may want to create your own SeqRecord
+ objects directly.
+
+ Note that while an id is optional, we strongly recommend you supply a
+ unique id string for each record. This is especially important
+ if you wish to write your sequences to a file.
+
+ If you don't have the actual sequence, but you do know its length,
+ then using the UnknownSeq object from Bio.Seq is appropriate.
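+
+ For example, as a sketch (with a hypothetical identifier):
+
+ from Bio.Seq import UnknownSeq
+ from Bio.SeqRecord import SeqRecord
+ record = SeqRecord(UnknownSeq(10), id="demo_id")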
+
+ You can create a 'blank' SeqRecord object, and then populate the
+ attributes later. Note that currently the annotations and the
+ letter_annotations dictionaries cannot be specified when creating
+ the SeqRecord.
+ """
+ if id is not None and not isinstance(id, basestring) :
+ #Lots of existing code uses id=None... this may be a bad idea.
+ raise TypeError("id argument should be a string")
+ if not isinstance(name, basestring) :
+ raise TypeError("name argument should be a string")
+ if not isinstance(description, basestring) :
+ raise TypeError("description argument should be a string")
+ if dbxrefs is not None and not isinstance(dbxrefs, list) :
+ raise TypeError("dbxrefs argument should be a list (of strings)")
+ if features is not None and not isinstance(features, list) :
+ raise TypeError("features argument should be a list (of SeqFeature objects)")
+ self._seq = seq
+ self.id = id
+ self.name = name
+ self.description = description
+ if dbxrefs is None:
+ dbxrefs = []
+ self.dbxrefs = dbxrefs
+ # annotations about the whole sequence
+ self.annotations = {}
+
+ # annotations about each letter in the sequence
+ if seq is None :
+ #Should we allow this and use a normal unrestricted dict?
+ self._per_letter_annotations = _RestrictedDict(length=0)
+ else :
+ try :
+ self._per_letter_annotations = _RestrictedDict(length=len(seq))
+ except TypeError :
+ raise TypeError("seq argument should be a Seq or MutableSeq")
+
+ # annotations about parts of the sequence
+ if features is None:
+ features = []
+ self.features = features
+
+ #TODO - Just make this a read only property?
+ def _set_per_letter_annotations(self, value) :
+ if not isinstance(value, dict) :
+ raise TypeError("The per-letter-annotations should be a "
+ "(restricted) dictionary.")
+ #Turn this into a restricted-dictionary (and check the entries)
+ try :
+ self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+ except AttributeError :
+ #e.g. seq is None
+ self._per_letter_annotations = _RestrictedDict(length=0)
+ self._per_letter_annotations.update(value)
+ letter_annotations = property( \
+ fget=lambda self : self._per_letter_annotations,
+ fset=_set_per_letter_annotations,
+ doc="""Dictionary of per-letter-annotation for the sequence.
+
+ For example, this can hold quality scores used in FASTQ or QUAL files.
+ Consider this example using Bio.SeqIO to read in an example Solexa
+ variant FASTQ file as a SeqRecord:
+
+ >>> from Bio import SeqIO
+ >>> handle = open("Quality/solexa.fastq", "rU")
+ >>> record = SeqIO.read(handle, "fastq-solexa")
+ >>> handle.close()
+ >>> print record.id, record.seq
+ slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+ >>> print record.letter_annotations.keys()
+ ['solexa_quality']
+ >>> print record.letter_annotations["solexa_quality"]
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+ The per-letter-annotations get sliced automatically if you slice the
+ parent SeqRecord, for example taking the last ten bases:
+
+ >>> sub_record = record[-10:]
+ >>> print sub_record.id, sub_record.seq
+ slxa_0013_1_0001_24 CTTATACACC
+ >>> print sub_record.letter_annotations["solexa_quality"]
+ [-6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+ Any python sequence (i.e. list, tuple or string) can be recorded in
+ the SeqRecord's letter_annotations dictionary as long as the length
+ matches that of the SeqRecord's sequence. e.g.
+
+ >>> len(sub_record.letter_annotations)
+ 1
+ >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
+ >>> len(sub_record.letter_annotations)
+ 2
+
+ You can delete entries from the letter_annotations dictionary as usual:
+
+ >>> del sub_record.letter_annotations["solexa_quality"]
+ >>> sub_record.letter_annotations
+ {'dummy': 'abcdefghij'}
+
+ You can completely clear the dictionary easily as follows:
+
+ >>> sub_record.letter_annotations = {}
+ >>> sub_record.letter_annotations
+ {}
+ """)
+
+ def _set_seq(self, value) :
+ #TODO - Add a deprecation warning that the seq should be write only?
+ if self._per_letter_annotations :
+ #TODO - Make this a warning? Silently empty the dictionary?
+ raise ValueError("You must empty the letter annotations first!")
+ self._seq = value
+ try :
+ self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+ except AttributeError :
+ #e.g. seq is None
+ self._per_letter_annotations = _RestrictedDict(length=0)
+
+ seq = property(fget=lambda self : self._seq,
+ fset=_set_seq,
+ doc="The sequence itself, as a Seq or MutableSeq object.")
+
+ def __getitem__(self, index) :
+ """Returns a sub-sequence or an individual letter.
+
+ Splicing, e.g. my_record[5:10], returns a new SeqRecord for
+ that sub-sequence with appropriate annotation preserved. The
+ name, id and description are kept.
+
+ Any per-letter-annotations are sliced to match the requested
+ sub-sequence. Unless a stride is used, all those features
+ which fall fully within the subsequence are included (with
+ their locations adjusted accordingly).
+
+ However, the annotations dictionary and the dbxrefs list are
+ not used for the new SeqRecord, as in general they may not
+ apply to the subsequence. If you want to preserve them, you
+ must explicitly copy them to the new SeqRecord yourself.
+
+ Using an integer index, e.g. my_record[5] is shorthand for
+ extracting that letter from the sequence, my_record.seq[5].
+
+ For example, consider this short protein and its secondary
+ structure as encoded by the PDB (e.g. H for alpha helices),
+ plus a simple feature for its histidine self phosphorylation
+ site:
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+ >>> from Bio.Alphabet import IUPAC
+ >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
+ ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
+ ... IUPAC.protein),
+ ... id="1JOY", name="EnvZ",
+ ... description="Homodimeric domain of EnvZ from E. coli")
+ >>> rec.letter_annotations["secondary_structure"] = \
+ " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
+ >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
+ ... type = "Site"))
+
+ Now let's have a quick look at the full record,
+
+ >>> print rec
+ ID: 1JOY
+ Name: EnvZ
+ Description: Homodimeric domain of EnvZ from E. coli
+ Number of features: 1
+ Per letter annotation for: secondary_structure
+ Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
+ >>> print rec.letter_annotations["secondary_structure"]
+ S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
+ >>> print rec.features[0].location
+ [20:21]
+
+ Now let's take a sub sequence, here chosen as the first (fractured)
+ alpha helix which includes the histidine phosphorylation site:
+
+ >>> sub = rec[11:41]
+ >>> print sub
+ ID: 1JOY
+ Name: EnvZ
+ Description: Homodimeric domain of EnvZ from E. coli
+ Number of features: 1
+ Per letter annotation for: secondary_structure
+ Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
+ >>> print sub.letter_annotations["secondary_structure"]
+ HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
+ >>> print sub.features[0].location
+ [9:10]
+
+ You can also of course omit the start or end values, for
+ example to get the first ten letters only:
+
+ >>> print rec[:10]
+ ID: 1JOY
+ Name: EnvZ
+ Description: Homodimeric domain of EnvZ from E. coli
+ Number of features: 0
+ Per letter annotation for: secondary_structure
+ Seq('MAAGVKQLAD', IUPACProtein())
+
+ Or for the last ten letters:
+
+ >>> print rec[-10:]
+ ID: 1JOY
+ Name: EnvZ
+ Description: Homodimeric domain of EnvZ from E. coli
+ Number of features: 0
+ Per letter annotation for: secondary_structure
+ Seq('IIEQFIDYLR', IUPACProtein())
+
+ If you omit both, then you get a copy of the original record (although
+ lacking the annotations and dbxrefs):
+
+ >>> print rec[:]
+ ID: 1JOY
+ Name: EnvZ
+ Description: Homodimeric domain of EnvZ from E. coli
+ Number of features: 1
+ Per letter annotation for: secondary_structure
+ Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
+
+ Finally, indexing with a simple integer is shorthand for pulling out
+ that letter from the sequence directly:
+
+ >>> rec[5]
+ 'K'
+ >>> rec.seq[5]
+ 'K'
+ """
+ if isinstance(index, int) :
+ #NOTE - The sequence level annotation like the id, name, etc
+ #do not really apply to a single character. However, should
+ #we try and expose any per-letter-annotation here? If so how?
+ return self.seq[index]
+ elif isinstance(index, slice) :
+ if self.seq is None :
+ raise ValueError("If the sequence is None, we cannot slice it.")
+ parent_length = len(self)
+ answer = self.__class__(self.seq[index],
+ id=self.id,
+ name=self.name,
+ description=self.description)
+ #TODO - The description may no longer apply.
+ #It would be safer to change it to something
+ #generic like "edited" or the default value.
+
+ #Don't copy the annotation dict and dbxrefs list,
+ #they may not apply to a subsequence.
+ #answer.annotations = dict(self.annotations.iteritems())
+ #answer.dbxrefs = self.dbxrefs[:]
+
+ #TODO - Cope with strides by generating ambiguous locations?
+ if index.step is None or index.step == 1 :
+ #Select relevant features, add them with shifted locations
+ if index.start is None :
+ start = 0
+ else :
+ start = index.start
+ if index.stop is None :
+ stop = parent_length
+ else :
+ stop = index.stop
+ if (start < 0 or stop < 0) and parent_length == 0 :
+ raise ValueError("Cannot support negative indices "
+ "without the sequence length")
+ #Negative indices count back from the end of the sequence:
+ if start < 0 :
+ start = parent_length + start
+ if stop < 0 :
+ stop = parent_length + stop
+ #assert str(self.seq)[index] == str(self.seq)[start:stop]
+ for f in self.features :
+ if start <= f.location.start.position \
+ and f.location.end.position <= stop :
+ answer.features.append(f._shift(-start))
+
+ #Slice all the values to match the sliced sequence
+ #(this should also work with strides, even negative strides):
+ for key, value in self.letter_annotations.iteritems() :
+ answer._per_letter_annotations[key] = value[index]
+
+ return answer
+ raise ValueError, "Invalid index"
+
+ def __iter__(self) :
+ """Iterate over the letters in the sequence.
+
+ For example, using Bio.SeqIO to read in a protein FASTA file:
+
+ >>> from Bio import SeqIO
+ >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
+ >>> for amino in record :
+ ... print amino
+ ... if amino == "L" : break
+ X
+ A
+ G
+ L
+ >>> print record.seq[3]
+ L
+
+ This is just a shortcut for iterating over the sequence directly:
+
+ >>> for amino in record.seq :
+ ... print amino
+ ... if amino == "L" : break
+ X
+ A
+ G
+ L
+ >>> print record.seq[3]
+ L
+
+ Note that this does not facilitate iteration together with any
+ per-letter-annotation. However, you can achieve that using the
+ python zip function on the record (or its sequence) and the relevant
+ per-letter-annotation:
+
+ >>> from Bio import SeqIO
+ >>> rec = SeqIO.read(open("Quality/solexa.fastq", "rU"),
+ ... "fastq-solexa")
+ >>> print rec.id, rec.seq
+ slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+ >>> print rec.letter_annotations.keys()
+ ['solexa_quality']
+ >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
+ ... if qual < -10 :
+ ... print nuc, qual
+ C -19
+ C -27
+ C -18
+
+ You may agree that using zip(rec.seq, ...) is more explicit than using
+ zip(rec, ...) as shown above.
+ """
+ return iter(self.seq)
+
+ def __str__(self) :
+ """A human readable summary of the record and its annotation (string).
+
+ The python built in function str works by calling the object's __str__
+ method. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Alphabet import IUPAC
+ >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+ ... IUPAC.protein),
+ ... id="YP_025292.1", name="HokC",
+ ... description="toxic membrane protein, small")
+ >>> print str(record)
+ ID: YP_025292.1
+ Name: HokC
+ Description: toxic membrane protein, small
+ Number of features: 0
+ Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+ In this example you don't actually need to call str explicitly, as the
+ print command does this automatically:
+
+ >>> print record
+ ID: YP_025292.1
+ Name: HokC
+ Description: toxic membrane protein, small
+ Number of features: 0
+ Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+ Note that long sequences are shown truncated.
+ """
+ lines = []
+ if self.id : lines.append("ID: %s" % self.id)
+ if self.name : lines.append("Name: %s" % self.name)
+ if self.description : lines.append("Description: %s" % self.description)
+ if self.dbxrefs : lines.append("Database cross-references: " \
+ + ", ".join(self.dbxrefs))
+ lines.append("Number of features: %i" % len(self.features))
+ for a in self.annotations:
+ lines.append("/%s=%s" % (a, str(self.annotations[a])))
+ if self.letter_annotations :
+ lines.append("Per letter annotation for: " \
+ + ", ".join(self.letter_annotations.keys()))
+ #Don't want to include the entire sequence,
+ #and showing the alphabet is useful:
+ lines.append(repr(self.seq))
+ return "\n".join(lines)
+
+ def __repr__(self) :
+ """A concise summary of the record for debugging (string).
+
+ The python built in function repr works by calling the object's __repr__
+ method. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Alphabet import generic_protein
+ >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
+ ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
+ ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
+ ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
+ ... generic_protein),
+ ... id="NP_418483.1", name="b4059",
+ ... description="ssDNA-binding protein",
+ ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
+ >>> print repr(rec)
+ SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+ At the python prompt you can also use this shorthand:
+
+ >>> rec
+ SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+ Note that long sequences are shown truncated.
+ """
+ return self.__class__.__name__ \
+ + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
+ % tuple(map(repr, (self.seq, self.id, self.name,
+ self.description, self.dbxrefs)))
+
+ def format(self, format) :
+ r"""Returns the record as a string in the specified file format.
+
+ The format should be a lower case string supported as an output
+ format by Bio.SeqIO, which is used to turn the SeqRecord into a
+ string. e.g.
+
+ >>> from Bio.Seq import Seq
+ >>> from Bio.SeqRecord import SeqRecord
+ >>> from Bio.Alphabet import IUPAC
+ >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+ ... IUPAC.protein),
+ ... id="YP_025292.1", name="HokC",
+ ... description="toxic membrane protein")
+ >>> record.format("fasta")
+ '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
+ >>> print record.format("fasta")
+ >YP_025292.1 toxic membrane protein
+ MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+ <BLANKLINE>
+
+ The python print command automatically appends a new line, meaning
+ in this example a blank line is shown. If you look at the string
+ representation you can see there is a trailing new line (shown as
+ slash n) which is important when writing to a file or if
+ concatenating multiple sequence strings together.
+
+ Note that this method will NOT work on every possible file format
+ supported by Bio.SeqIO (e.g. some are for multiple sequences only).
+ """
+ #See also the __format__ added for Python 2.6 / 3.0, PEP 3101
+ #See also the Bio.Align.Generic.Alignment class and its format()
+ return self.__format__(format)
+
+ def __format__(self, format_spec) :
+ """Returns the record as a string in the specified file format.
+
+ This method supports the python format() function added in
+ Python 2.6/3.0. The format_spec should be a lower case
+ string supported by Bio.SeqIO as an output file format.
+ See also the SeqRecord's format() method.
+ """
+ if format_spec:
+ from StringIO import StringIO
+ from Bio import SeqIO
+ handle = StringIO()
+ SeqIO.write([self], handle, format_spec)
+ return handle.getvalue()
+ else :
+ #Follow python convention and default to using __str__
+ return str(self)
+
+ def __len__(self) :
+ """Returns the length of the sequence.
+
+ For example, using Bio.SeqIO to read in a FASTA nucleotide file:
+
+ >>> from Bio import SeqIO
+ >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta")
+ >>> len(record)
+ 309
+ >>> len(record.seq)
+ 309
+ """
+ return len(self.seq)
+
+ def __nonzero__(self) :
+ """Returns True regardless of the length of the sequence.
+
+ This behaviour is for backwards compatibility, since until the
+ __len__ method was added, a SeqRecord always evaluated as True.
+
+ Note that in comparison, a Seq object will evaluate to False if it
+ has a zero length sequence.
+
+ WARNING: The SeqRecord may in future evaluate to False when its
+ sequence is of zero length (in order to better match the Seq
+ object behaviour)!
+ """
+ return True
+
+def _test():
+ """Run the Bio.SeqRecord module's doctests (PRIVATE).
+
+ This will try and locate the unit tests directory, and run the doctests
+ from there in order that the relative paths used in the examples work.
+ """
+ import doctest
+ import os
+ if os.path.isdir(os.path.join("..","Tests")) :
+ print "Runing doctests..."
+ cur_dir = os.path.abspath(os.curdir)
+ os.chdir(os.path.join("..","Tests"))
+ doctest.testmod()
+ os.chdir(cur_dir)
+ del cur_dir
+ print "Done"
+
+if __name__ == "__main__":
+ _test()
--- /dev/null
+# Copyright 2002 by Yves Bastide and Brad Chapman.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Functions to calculate assorted sequence checksums."""
+
+# crc32, crc64, gcg, and seguid
+# crc64 is adapted from BioPerl
+
+from binascii import crc32 as _crc32
+
+def crc32(seq) :
+ """Returns the crc32 checksum for a sequence (string or Seq object)"""
+ try :
+ #Assume it's a Seq object
+ return _crc32(seq.tostring())
+ except AttributeError :
+ #Assume it's a string
+ return _crc32(seq)
+
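+#Build a 256 entry lookup table giving the high 32 bits of the CRC-64
+#update for each possible input byte, following the BioPerl code this
+#implementation is adapted from (the low 32 bits are handled by simple
+#shifts in crc64 below, so only the high half needs a table).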
+def _init_table_h():
+ _table_h = []
+ for i in range(256):
+ l = i
+ part_h = 0
+ for j in range(8):
+ rflag = l & 1
+ l >>= 1
+ if part_h & 1: l |= (1L << 31)
+ part_h >>= 1L
+ if rflag: part_h ^= 0xd8000000L
+ _table_h.append(part_h)
+ return _table_h
+
+# Initialisation
+_table_h = _init_table_h()
+
+def crc64(s):
+ """Returns the crc64 checksum for a sequence (string or Seq object)"""
+ crcl = 0
+ crch = 0
+ for c in s:
+ shr = (crch & 0xFF) << 24
+ temp1h = crch >> 8
+ temp1l = (crcl >> 8) | shr
+ idx = (crcl ^ ord(c)) & 0xFF
+ crch = temp1h ^ _table_h[idx]
+ crcl = temp1l
+
+ return "CRC-%08X%08X" % (crch, crcl)
+
+
+def gcg(seq):
+ """Returns the GCG checksum (int) for a sequence (string or Seq object)
+
+ Given a nucleotide or amino-acid secuence (or any string),
+ returns the GCG checksum (int). Checksum used by GCG program.
+ seq type = str.
+ Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
+ with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
+ All sequences are converted to uppercase """
+ index = checksum = 0
+ if type(seq)!=type("aa"):
+ seq=seq.tostring()
+ for char in seq:
+ index += 1
+ checksum += index * ord(char.upper())
+ if index == 57: index = 0
+ return checksum % 10000
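+
+# Worked example of the checksum arithmetic above: gcg("ACG") is
+# (1*ord('A') + 2*ord('C') + 3*ord('G')) % 10000 = (65 + 134 + 213) % 10000
+# = 412, i.e. each letter's ordinal is weighted by its 1-based position,
+# and the position counter restarts after every 57 letters.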
+
+def seguid(seq):
+ """Returns the SEGUID (string) for a sequence (string or Seq object)
+
+ Given a nucleotide or amino-acid sequence (or any string),
+ returns the SEGUID string (a SEquence Globally Unique IDentifier).
+ For more information about SEGUID, see:
+ http://bioinformatics.anl.gov/seguid/
+ DOI: 10.1002/pmic.200600032 """
+ try:
+ #Python 2.5 sha1 is in hashlib
+ import hashlib
+ m = hashlib.sha1()
+ except ImportError:
+ #For older versions
+ import sha
+ m = sha.new()
+ import base64
+ if type(seq)!=type("aa"):
+ seq=seq.tostring().upper()
+ else:
+ seq=seq.upper()
+ m.update(seq)
+ try:
+ #For Python 2.5
+ return base64.b64encode(m.digest()).rstrip("=")
+ except AttributeError:
+ #For older versions
+ import os
+ #Note: Using os.linesep doesn't work on Windows,
+ #where os.linesep= "\r\n" but the encoded string
+ #contains "\n" but not "\r\n"
+ return base64.encodestring(m.digest()).replace("\n","").rstrip("=")
+
+if __name__ == "__main__" :
+ print "Quick self test"
+
+ str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ + "YCSSYAGSSTLVFGGGTKLTVL"
+
+ str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ + "YCCSYAGSSTWVFGGGTKLTVL"
+
+ assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
+ assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)
+
+ assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
+ assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)
+
+ print "Done"
--- /dev/null
+"""Methods for codon usage calculations, including a CAI implementation."""
+
+import math
+from CodonUsageIndices import SharpEcoliIndex
+from Bio import SeqIO # To parse a FASTA file
+
+CodonsDict = {'TTT':0, 'TTC':0, 'TTA':0, 'TTG':0, 'CTT':0,
+'CTC':0, 'CTA':0, 'CTG':0, 'ATT':0, 'ATC':0,
+'ATA':0, 'ATG':0, 'GTT':0, 'GTC':0, 'GTA':0,
+'GTG':0, 'TAT':0, 'TAC':0, 'TAA':0, 'TAG':0,
+'CAT':0, 'CAC':0, 'CAA':0, 'CAG':0, 'AAT':0,
+'AAC':0, 'AAA':0, 'AAG':0, 'GAT':0, 'GAC':0,
+'GAA':0, 'GAG':0, 'TCT':0, 'TCC':0, 'TCA':0,
+'TCG':0, 'CCT':0, 'CCC':0, 'CCA':0, 'CCG':0,
+'ACT':0, 'ACC':0, 'ACA':0, 'ACG':0, 'GCT':0,
+'GCC':0, 'GCA':0, 'GCG':0, 'TGT':0, 'TGC':0,
+'TGA':0, 'TGG':0, 'CGT':0, 'CGC':0, 'CGA':0,
+'CGG':0, 'AGT':0, 'AGC':0, 'AGA':0, 'AGG':0,
+'GGT':0, 'GGC':0, 'GGA':0, 'GGG':0}
+
+
+# this dictionary is used to know which codons encode the same AA.
+SynonymousCodons = {'CYS': ['TGT', 'TGC'], 'ASP': ['GAT', 'GAC'],
+'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
+'GLN': ['CAA', 'CAG'], 'MET': ['ATG'], 'ASN': ['AAC', 'AAT'],
+'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 'LYS': ['AAG', 'AAA'],
+'STOP': ['TAG', 'TGA', 'TAA'], 'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
+'PHE': ['TTT', 'TTC'], 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
+'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 'ILE': ['ATC', 'ATA', 'ATT'],
+'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 'HIS': ['CAT', 'CAC'],
+'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 'TRP': ['TGG'],
+'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 'GLU': ['GAG', 'GAA'], 'TYR': ['TAT', 'TAC']}
+
+
+class CodonAdaptationIndex:
+ """A codon adaptaion index (CAI) implementation.
+
+ This class implements the codon adaptaion index (CAI) described by Sharp and
+ Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
+
+ methods:
+
+ set_cai_index(Index):
+
+ This method sets-up an index to be used when calculating CAI for a gene.
+ Just pass a dictionary similar to the SharpEcoliIndex in CodonUsageIndices
+ module.
+
+ generate_index(FastaFile):
+
+ This method takes a location of a FastaFile and generates an index. This
+ index can later be used to calculate CAI of a gene.
+
+ cai_for_gene(DNAsequence):
+
+ This method uses the Index (either the one you set or the one you generated)
+ and returns the CAI for the DNA sequence.
+
+ print_index():
+ This method prints out the index you used.
+
+ NOTE - This implementation does not currently cope with alternative genetic
+ codes, only the synonymous codons in the standard table are considered.
+ """
+ def __init__(self):
+ self.index = {}
+ self.codon_count={}
+
+ # use this method with predefined CAI index
+ def set_cai_index(self, Index):
+ self.index = Index
+
+ def generate_index(self, FastaFile):
+ """Generate a codon usage index from a FASTA file of CDS sequences.
+
+ This method takes a location of a Fasta file containing CDS sequences
+ (which must all have a whole number of codons) and generates a codon
+ usage index. This index can later be used to calculate CAI of a gene.
+ """
+ # first make sure we are not overwriting an existing index:
+ if self.index != {} or self.codon_count != {}:
+ raise ValueError("An index has already been set or a codon count has been done. Cannot overwrite either.")
+ # count codon occurances in the file.
+ self._count_codons(FastaFile)
+
+ # now to calculate the index we first need to sum the number of times
+ # synonymous codons were used all together.
+ for AA in SynonymousCodons.keys():
+ Sum=0.0
+ RCSU=[] # RSCU (Relative Synonymous Codon Usage) values are equal to CodonCount/((1/num of synonymous codons) * sum of all synonymous codons)
+
+ for codon in SynonymousCodons[AA]:
+ Sum += self.codon_count[codon]
+ # calculate the RSCU value for each of the codons
+ for codon in SynonymousCodons[AA]:
+ RCSU.append(self.codon_count[codon]/((1.0/len(SynonymousCodons[AA]))*Sum))
+ # now generate the index W=RCSUi/RCSUmax:
+ RCSUmax = max(RCSU)
+ for i in range(len(SynonymousCodons[AA])):
+ self.index[SynonymousCodons[AA][i]]= RCSU[i]/RCSUmax
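+ # Worked example of the two steps above: if CAA occurred twice and
+ # CAG six times, Sum = 8.0 and each RSCU denominator is (1.0/2)*8 = 4,
+ # giving RSCU(CAA) = 0.5 and RSCU(CAG) = 1.5; the index entries are
+ # then w(CAA) = 0.5/1.5 = 0.33 and w(CAG) = 1.5/1.5 = 1.0.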
+
+
+ def cai_for_gene(self, DNAsequence):
+ """Calculate the CAI (float) for the provided DNA sequence (string).
+
+ This method uses the Index (either the one you set or the one you generated)
+ and returns the CAI for the DNA sequence.
+ """
+ caiValue = 0
+ LengthForCai = 0
+ # if no index is set or generated, the default SharpEcoliIndex will be used.
+ if self.index=={}:
+ self.set_cai_index(SharpEcoliIndex)
+
+ if DNAsequence.islower():
+ DNAsequence = DNAsequence.upper()
+ for i in range (0,len(DNAsequence),3):
+ codon = DNAsequence[i:i+3]
+ if codon in self.index:
+ if codon!='ATG' and codon!= 'TGG': #ATG and TGG always score 1.0, so exclude them.
+ caiValue += math.log(self.index[codon])
+ LengthForCai += 1
+ elif codon not in ['TGA','TAA', 'TAG']: # some indices you will use may not include stop codons.
+ raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index))
+ return math.exp(caiValue*(1.0/(LengthForCai-1)))
+
+ def _count_codons(self, FastaFile):
+ handle = open(FastaFile, 'r')
+
+ # make the codon dictionary local
+ self.codon_count = CodonsDict.copy()
+
+ # iterate over sequence and count all the codons in the FastaFile.
+ for cur_record in SeqIO.parse(handle, "fasta") :
+ # make sure the sequence is lower case
+ if str(cur_record.seq).islower():
+ DNAsequence = str(cur_record.seq).upper()
+ else:
+ DNAsequence = str(cur_record.seq)
+ for i in range(0,len(DNAsequence),3):
+ codon = DNAsequence[i:i+3]
+ if codon in self.codon_count:
+ self.codon_count[codon] += 1
+ else:
+ raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
+ handle.close()
+
+ # this prints out the index used, one codon per line.
+ def print_index (self):
+ """This method prints out the index you used."""
+ X=self.index.keys()
+ X.sort()
+ for i in X:
+ print "%s\t%.3f" %(i, self.index[i])
+
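+# Example usage (a minimal sketch; "cds.fasta" is a hypothetical FASTA
+# file of coding sequences, each a whole number of codons long):
+#
+# >>> cai = CodonAdaptationIndex()
+# >>> cai.generate_index("cds.fasta")
+# >>> print cai.cai_for_gene("ATGAAACGCATT...")
+#
+# Or use the published E. coli index instead of generating one:
+#
+# >>> cai = CodonAdaptationIndex()
+# >>> cai.set_cai_index(SharpEcoliIndex)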
--- /dev/null
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+
+# sharp Ecoli index for codon adaption index.
+# from Sharp & Li, Nucleic Acids Res. 1987
+SharpEcoliIndex = {
+'GCA':0.586, 'GCC':0.122, 'GCG':0.424, 'GCT':1, 'AGA':0.004, 'AGG':0.002, 'CGA':0.004,
+'CGC':0.356, 'CGG':0.004, 'CGT':1, 'AAC':1, 'AAT':0.051, 'GAC':1, 'GAT':0.434, 'TGC':1,
+'TGT':0.5, 'CAA':0.124, 'CAG':1, 'GAA':1, 'GAG':0.259, 'GGA':0.01, 'GGC':0.724, 'GGG':0.019,
+'GGT':1, 'CAC':1, 'CAT':0.291, 'ATA':0.003, 'ATC':1, 'ATT':0.185, 'CTA':0.007, 'CTC':0.037,
+'CTG':1, 'CTT':0.042, 'TTA':0.02, 'TTG':0.02, 'AAA':1, 'AAG':0.253, 'ATG':1, 'TTC':1, 'TTT':0.296,
+'CCA':0.135, 'CCC':0.012, 'CCG':1, 'CCT':0.07, 'AGC':0.41, 'AGT':0.085, 'TCA':0.077, 'TCC':0.744,
+'TCG':0.017, 'TCT':1, 'ACA':0.076, 'ACC':1,'ACG':0.099, 'ACT':0.965, 'TGG':1, 'TAC':1, 'TAT':0.239,
+'GTA':0.495, 'GTC':0.066,'GTG':0.221, 'GTT':1}
--- /dev/null
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+
+"""Calculate isoelectric points of polypeptides using methods of Bjellqvist.
+
+pK values and the methods are taken from:
+
+* Bjellqvist, B.,Hughes, G.J., Pasquali, Ch., Paquet, N., Ravier, F., Sanchez,
+J.-Ch., Frutiger, S. & Hochstrasser, D.F.
+The focusing positions of polypeptides in immobilized pH gradients can be predicted
+from their amino acid sequences. Electrophoresis 1993, 14, 1023-1031.
+
+* Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points correlate
+with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+
+I designed the algorithm according to a note by David L. Tabb, available at:
+http://fields.scripps.edu/DTASelect/20010710-pI-Algorithm.pdf
+
+"""
+
+positive_pKs = { 'Nterm': 7.5, 'K': 10.0, 'R': 12.0, 'H':5.98 }
+negative_pKs = { 'Cterm': 3.55, 'D': 4.05, 'E': 4.45, 'C':9.0, 'Y':10.0 }
+pKcterminal= {'D':4.55, 'E':4.75}
+pKnterminal = {'A':7.59, 'M':7.0, 'S':6.93, 'P':8.36, 'T':6.82, 'V':7.44, 'E':7.7}
+charged_aas = ('K', 'R', 'H', 'D', 'E', 'C', 'Y')
+
+# access this module through ProtParam.ProteinAnalysis class.
+# first make a ProteinAnalysis object and then call its isoelectric_point method.
+class IsoelectricPoint:
+ def __init__(self, ProteinSequence, AminoAcidsContent):
+ self.sequence = ProteinSequence
+ self.charged_aas_content = self._select_charged(AminoAcidsContent)
+
+ # This function creates a dictionary with the contents of each charged aa,
+ # plus Cterm and Nterm.
+ def _select_charged(self, AminoAcidsContent):
+ charged = {}
+ for aa in charged_aas:
+ charged[aa] = float(AminoAcidsContent[aa])
+ charged['Nterm'] = 1.0
+ charged['Cterm'] = 1.0
+ return charged
+
+ #This function calculates the total charge of the protein at a given pH.
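+ # The fraction of each basic group carrying a positive charge at a given
+ # pH follows the Henderson-Hasselbalch relation used below:
+ # charge = 10**(pK-pH) / (10**(pK-pH) + 1)
+ # For the acidic groups the exponent is (pH-pK) instead.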
+ def _chargeR(self, pH, pos_pKs, neg_pKs):
+ PositiveCharge = 0.0
+ for aa, pK in pos_pKs.iteritems():
+ CR = 10**(pK-pH)
+ partial_charge = CR/(CR+1.0)
+ PositiveCharge += self.charged_aas_content[aa] * partial_charge
+
+ NegativeCharge = 0.0
+ for aa, pK in neg_pKs.iteritems():
+ CR = 10**(pH-pK)
+ partial_charge = CR/(CR+1.0)
+ NegativeCharge += self.charged_aas_content[aa] * partial_charge
+
+ return PositiveCharge - NegativeCharge
+
+ # This is the action function, it tries different pH until the charge of the protein is 0 (or close).
+ def pi(self):
+ pos_pKs = dict(positive_pKs)
+ neg_pKs = dict(negative_pKs)
+ nterm = self.sequence[0]
+ cterm = self.sequence[-1]
+ if nterm in pKnterminal.keys():
+ pos_pKs['Nterm'] = pKnterminal[nterm]
+ if cterm in pKcterminal.keys():
+ neg_pKs['Cterm'] = pKcterminal[cterm]
+
+ # Bracket between pH1 and pH2
+ pH = 7.0
+ Charge = self._chargeR(pH, pos_pKs, neg_pKs)
+ if Charge > 0.0:
+ pH1 = pH
+ Charge1 = Charge
+ while Charge1 > 0.0:
+ pH = pH1 + 1.0
+ Charge = self._chargeR(pH, pos_pKs, neg_pKs)
+ if Charge > 0.0:
+ pH1 = pH
+ Charge1 = Charge
+ else:
+ pH2 = pH
+ Charge2 = Charge
+ break
+ else:
+ pH2 = pH
+ Charge2 = Charge
+ while Charge2 < 0.0:
+ pH = pH2 - 1.0
+ Charge = self._chargeR(pH, pos_pKs, neg_pKs)
+ if Charge < 0.0:
+ pH2 = pH
+ Charge2 = Charge
+ else:
+ pH1 = pH
+ Charge1 = Charge
+ break
+
+ # Bisection
+ while pH2 - pH1 > 0.0001 and Charge!=0.0:
+ pH = (pH1 + pH2) / 2.0
+ Charge = self._chargeR(pH, pos_pKs, neg_pKs)
+ if Charge > 0.0:
+ pH1 = pH
+ Charge1 = Charge
+ else:
+ pH2 = pH
+ Charge2 = Charge
+
+ return pH
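+
+#Example usage (a minimal sketch; as the note above says, this class is
+#normally reached via the ProtParam.ProteinAnalysis class):
+#
+# >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+# >>> protein = ProteinAnalysis("PETERPAN")
+# >>> print protein.isoelectric_point()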
--- /dev/null
+# Copyright 2004-2008 by Sebastian Bassi.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Calculate the thermodynamic melting temperatures of nucleotide sequences."""
+
+import math
+def Tm_staluc(s,dnac=50,saltc=50,rna=0):
+ """Returns DNA/DNA tm using nearest neighbor thermodynamics.
+
+ dnac is DNA concentration [nM]
+ saltc is salt concentration [mM].
+ rna=0 is for DNA/DNA (default), for RNA, rna should be 1.
+
+ Sebastian Bassi <sbassi@genesdigitales.com>"""
+
+ #Credits:
+ #Main author: Sebastian Bassi <sbassi@genesdigitales.com>
+ #Overcount function: Greg Singer <singerg@tcd.ie>
+ #Based on the work of Nicolas Le Novere <lenov@ebi.ac.uk> Bioinformatics.
+ #17:1226-1227(2001)
+
+ #This function returns better results than EMBOSS DAN because it uses
+ #updated thermodynamic values and takes into account initialization
+ #parameters from the work of SantaLucia (1998).
+
+ #Things to do:
+ #+Detect complementary sequences. Change K according to result.
+ #+Add support for heteroduplex (see Sugimoto et al. 1995).
+ #+Correction for Mg2+. Now supports only monovalent ions.
+ #+Put the thermodynamic tables in an external file for users to change at will
+ #+Add support for dangling ends (see Le Novere, 2001) and mismatches.
+
+ dh = 0 #DeltaH. Enthalpy
+ ds = 0 #deltaS Entropy
+
+ def tercorr(stri):
+ deltah = 0
+ deltas = 0
+ if rna==0:
+ #DNA/DNA
+ #Allawi and SantaLucia (1997). Biochemistry 36 : 10581-10594
+ if stri.startswith('G') or stri.startswith('C'):
+ deltah -= 0.1
+ deltas += 2.8
+ elif stri.startswith('A') or stri.startswith('T'):
+ deltah -= 2.3
+ deltas -= 4.1
+ if stri.endswith('G') or stri.endswith('C'):
+ deltah -= 0.1
+ deltas += 2.8
+ elif stri.endswith('A') or stri.endswith('T'):
+ deltah -= 2.3
+ deltas -= 4.1
+ dhL = dh + deltah
+ dsL = ds + deltas
+ return dsL,dhL
+ elif rna==1:
+ #RNA
+ if stri.startswith('G') or stri.startswith('C'):
+ deltah -= 3.61
+ deltas -= 1.5
+ elif stri.startswith('A') or stri.startswith('T') or \
+ stri.startswith('U'):
+ deltah -= 3.72
+ deltas += 10.5
+ if stri.endswith('G') or stri.endswith('C'):
+ deltah -= 3.61
+ deltas -= 1.5
+ elif stri.endswith('A') or stri.endswith('T') or \
+ stri.endswith('U'):
+ deltah -= 3.72
+ deltas += 10.5
+ dhL = dh + deltah
+ dsL = ds + deltas
+ # print "delta h=",dhL
+ return dsL,dhL
+
+ def overcount(st,p):
+ """Returns how many p are on st, works even for overlapping"""
+ ocu = 0
+ x = 0
+ while 1:
+ try:
+ i = st.index(p,x)
+ except ValueError:
+ break
+ ocu += 1
+ x = i + 1
+ return ocu
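+ # e.g. overcount("AAA", "AA") returns 2, counting the overlapping
+ # match that a plain str.count would miss.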
+
+ R = 1.987 # universal gas constant in cal/(K*mol)
+ sup = s.upper()
+ vsTC,vh = tercorr(sup)
+ vs = vsTC
+
+ k = (dnac/4.0)*1e-9
+ #With complementary check on, the 4.0 should be changed to a variable.
+
+ if rna==0:
+ #DNA/DNA
+ #Allawi and SantaLucia (1997). Biochemistry 36 : 10581-10594
+ vh = vh + (overcount(sup,"AA"))*7.9 + (overcount(sup,"TT"))*\
+ 7.9 + (overcount(sup,"AT"))*7.2 + (overcount(sup,"TA"))*7.2 \
+ + (overcount(sup,"CA"))*8.5 + (overcount(sup,"TG"))*8.5 + \
+ (overcount(sup,"GT"))*8.4 + (overcount(sup,"AC"))*8.4
+ vh = vh + (overcount(sup,"CT"))*7.8+(overcount(sup,"AG"))*\
+ 7.8 + (overcount(sup,"GA"))*8.2 + (overcount(sup,"TC"))*8.2
+ vh = vh + (overcount(sup,"CG"))*10.6+(overcount(sup,"GC"))*\
+ 9.8 + (overcount(sup,"GG"))*8 + (overcount(sup,"CC"))*8
+ vs = vs + (overcount(sup,"AA"))*22.2+(overcount(sup,"TT"))*\
+ 22.2 + (overcount(sup,"AT"))*20.4 + (overcount(sup,"TA"))*21.3
+ vs = vs + (overcount(sup,"CA"))*22.7+(overcount(sup,"TG"))*\
+ 22.7 + (overcount(sup,"GT"))*22.4 + (overcount(sup,"AC"))*22.4
+ vs = vs + (overcount(sup,"CT"))*21.0+(overcount(sup,"AG"))*\
+ 21.0 + (overcount(sup,"GA"))*22.2 + (overcount(sup,"TC"))*22.2
+ vs = vs + (overcount(sup,"CG"))*27.2+(overcount(sup,"GC"))*\
+ 24.4 + (overcount(sup,"GG"))*19.9 + (overcount(sup,"CC"))*19.9
+ ds = vs
+ dh = vh
+
+ else:
+ #RNA/RNA hybridisation of Xia et al (1998)
+ #Biochemistry 37: 14719-14735
+ vh = vh+(overcount(sup,"AA"))*6.82+(overcount(sup,"TT"))*6.6+\
+ (overcount(sup,"AT"))*9.38 + (overcount(sup,"TA"))*7.69+\
+ (overcount(sup,"CA"))*10.44 + (overcount(sup,"TG"))*10.5+\
+ (overcount(sup,"GT"))*11.4 + (overcount(sup,"AC"))*10.2
+ vh = vh + (overcount(sup,"CT"))*10.48 + (overcount(sup,"AG"))\
+ *7.6+(overcount(sup,"GA"))*12.44+(overcount(sup,"TC"))*13.3
+ vh = vh + (overcount(sup,"CG"))*10.64 + (overcount(sup,"GC"))\
+ *14.88+(overcount(sup,"GG"))*13.39+(overcount(sup,"CC"))*12.2
+ vs = vs + (overcount(sup,"AA"))*19.0 + (overcount(sup,"TT"))*\
+ 18.4+(overcount(sup,"AT"))*26.7+(overcount(sup,"TA"))*20.5
+ vs = vs + (overcount(sup,"CA"))*26.9 + (overcount(sup,"TG"))*\
+ 27.8 + (overcount(sup,"GT"))*29.5 + (overcount(sup,"AC"))*26.2
+ vs = vs + (overcount(sup,"CT"))*27.1 + (overcount(sup,"AG"))*\
+ 19.2 + (overcount(sup,"GA"))*32.5 + (overcount(sup,"TC"))*35.5
+ vs = vs + (overcount(sup,"CG"))*26.7 + (overcount(sup,"GC"))\
+ *36.9 + (overcount(sup,"GG"))*32.7 + (overcount(sup,"CC"))*29.7
+ ds = vs
+ dh = vh
+
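+ #Salt correction (SantaLucia, 1998) is applied to the entropy, then the
+ #melting temperature follows as Tm = 1000*deltaH/(deltaS + R*ln(k)) - 273.15,
+ #where dh and ds hold the magnitudes of deltaH [kcal/mol] and deltaS
+ #[cal/(K*mol)] (hence the sign flips below) and k is the annealing
+ #oligo concentration.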
+ ds = ds-0.368*(len(s)-1)*math.log(saltc/1e3)
+ tm = ((1000* (-dh))/(-ds+(R * (math.log(k)))))-273.15
+ # print "ds="+str(ds)
+ # print "dh="+str(dh)
+ return tm
+
+if __name__ == "__main__" :
+ print "Quick self test"
+ assert Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA') == 59.865612727457972
+ assert Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA',rna=1) == 68.141611264576682
+ print "Done"
--- /dev/null
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+
+import sys
+import ProtParamData, IsoelectricPoint
+from ProtParamData import kd # Added by Iddo to enable the gravy method
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData
+
+class ProteinAnalysis:
+ """
+ This class contains methods for protein analysis. The class init method
+ takes only one argument, the protein sequence as a string, and builds a
+ sequence object using the Bio.Seq module. This is done just to make sure
+ the sequence is a protein sequence and not anything else.
+
+ methods:
+
+ count_amino_acids:
+
+ Simply counts the number of times an amino acid is repeated in the protein
+ sequence. Returns a dictionary {AminoAcid:Number} and also stores the
+ dictionary in self.amino_acids_content.
+
+ get_amino_acids_percent:
+
+ The same as count_amino_acids, but returns the content of each amino acid
+ as a fraction of the entire sequence length. Returns a dictionary and
+ stores the dictionary in self.amino_acids_percent.
+
+ molecular_weight:
+ Calculates the molecular weight of a protein.
+
+ aromaticity:
+
+ Calculates the aromaticity value of a protein according to Lobry, 1994. It is
+ simply the relative frequency of Phe+Trp+Tyr.
+
+
+ instability_index:
+
+ Implementation of the method of Guruprasad et al. (Protein Engineering
+ 4:155-161,1990). This method tests a protein for stability. Any value above 40
+ means the protein is unstable (i.e. has a short half-life).
+
+ flexibility:
+ Implementation of the flexibility method of Vihinen et al. (Proteins. 1994 Jun;19(2):141-9).
+
+ isoelectric_point:
+ This method uses the module IsoelectricPoint to calculate the pI of a protein.
+
+ secondary_structure_fraction:
+ This method returns the fraction of amino acids which tend to be in Helix, Turn or Sheet.
+ Amino acids in helix: V, I, Y, F, W, L.
+ Amino acids in Turn: N, P, G, S.
+ Amino acids in sheet: E, M, A, L.
+ The method returns a tuple of three values: (Helix, Turn, Sheet).
+
+
+ protein_scale(Scale, WindowSize, Edge):
+
+ An amino acid scale is defined by a numerical value assigned to each type of
+ amino acid. The most frequently used scales are the hydrophobicity or
+ hydrophilicity scales and the secondary structure conformational parameters
+ scales, but many other scales exist which are based on different chemical and
+ physical properties of the amino acids. You can set several parameters that
+ control the computation of a scale profile, such as the window size and the
+ window edge relative weight value. WindowSize: The window size is the length
+ of the interval to use for the profile computation. For a window size n, we
+ use the i- ( n-1)/2 neighboring residues on each side of residue it compute
+ the score for residue i. The score for residue is the sum of the scale values
+ for these amino acids, optionally weighted according to their position in the
+ window. Edge: The central amino acid of the window always has a weight of 1.
+ By default, the amino acids at the remaining window positions have the same
+ weight, but you can make the residue at the center of the window have a
+ larger weight than the others by setting the edge value for the residues at
+ the beginning and end of the interval to a value between 0 and 1. For
+ instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
+ 1.0, 0.7, 0.4. The method returns a list of values which can be plotted to
+ view the change along a protein sequence. Many scales exist. Just add your
+ favorites to the ProtParamData module.
+ """
+ def __init__(self, ProtSequence):
+ if ProtSequence.islower():
+ self.sequence = Seq(ProtSequence.upper(), IUPAC.protein)
+ else:
+ self.sequence = Seq(ProtSequence, IUPAC.protein)
+ self.amino_acids_content = None
+ self.amino_acids_percent = None
+ self.length = len(self.sequence)
+
+ def count_amino_acids(self):
+ ProtDic = dict([ (k, 0) for k in IUPACData.protein_letters])
+ for i in ProtDic.keys():
+ ProtDic[i]=self.sequence.count(i)
+ self.amino_acids_content = ProtDic
+ return ProtDic
+
+ """Calculate the amino acid content in percents.
+ input is the dictionary from CountAA.
+ output is a dictionary with AA as keys."""
+ def get_amino_acids_percent(self):
+ if not self.amino_acids_content:
+ self.count_amino_acids()
+
+ PercentAA = {}
+ for i in self.amino_acids_content.keys():
+ if self.amino_acids_content[i] > 0:
+ PercentAA[i]=self.amino_acids_content[i]/float(self.length)
+ else:
+ PercentAA[i] = 0
+ self.amino_acids_percent = PercentAA
+ return PercentAA
+
+ # Calculate MW from Protein sequence
+ def molecular_weight (self):
+ # make local dictionary for speed
+ MwDict = {}
+ # remove a molecule of water from the amino acid weight.
+ for i in IUPACData.protein_weights.keys():
+ MwDict[i] = IUPACData.protein_weights[i] - 18.02
+ MW = 18.02 # add just one water molecule for the whole sequence.
+ for i in self.sequence:
+ MW += MwDict[i]
+ return MW
+
+ # calculate the aromaticity according to Lobry, 1994.
+ # Arom=sum of relative frequency of Phe+Trp+Tyr
+ def aromaticity(self):
+ if not self.amino_acids_percent:
+ self.get_amino_acids_percent()
+
+ Arom= self.amino_acids_percent['Y']+self.amino_acids_percent['W']+self.amino_acids_percent['F']
+ return Arom
+
+ # a function to calculate the instability index according to:
+ # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
+ def instability_index(self):
+ #make the dictionary local for speed.
+ DIWV=ProtParamData.DIWV.copy()
+ score=0.0
+ for i in range(self.length - 1):
+ DiPeptide=DIWV[self.sequence[i]][self.sequence[i+1]]
+ score += DiPeptide
+ return (10.0/self.length) * score
+
+ # Calculate the flexibility according to Vihinen, 1994.
+ # No argument to change window size because parameters are specific for a window=9.
+ # the parameters used are optimized for determining the flexibility.
+ def flexibility(self):
+ Flex = ProtParamData.Flex.copy()
+ Window=9
+ Weights=[0.25,0.4375,0.625,0.8125,1]
+ List=[]
+ for i in range(self.length - Window):
+ SubSeq=self.sequence[i:i+Window]
+ score = 0.0
+ for j in range(Window/2):
+ score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j]
+ score += Flex[SubSeq[Window/2]] #the central residue of the window
+ List.append(score/5.25)
+ return List
+
+ # calculate the gravy according to kyte and doolittle.
+ def gravy(self):
+ ProtGravy=0.0
+ for i in self.sequence:
+ ProtGravy += kd[i]
+
+ return ProtGravy/self.length
+
+ # this method is used to make a list of relative weight of the
+ # window edges compared to the window center. The weights are linear.
+ # it actually generates half a list. For a window of size 9 and edge 0.4
+ # you get a list of [0.4, 0.55, 0.7, 0.85].
+ def _weight_list(self, window, edge):
+ unit = ((1.0-edge)/(window-1))*2
+ list = [0.0]*(window/2)
+ for i in range(window/2):
+ list[i] = edge + unit * i
+ return list
+
+ # this method allows you to compute and represent the profile produced
+ # by any amino acid scale on a selected protein.
+ # Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
+ # The weight list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
+ # what you actually get from _weight_list is [0.4,0.7]. The correct calculation is done
+ # in the loop.
+ def protein_scale(self, ParamDict, Window, Edge=1.0):
+ # generate the weights
+ weight = self._weight_list(Window,Edge)
+ list = []
+ # the score in each Window is divided by the sum of weights
+ sum_of_weights = 0.0
+ for i in weight: sum_of_weights += i
+ # since the weight list is one sided:
+ sum_of_weights = sum_of_weights*2+1
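+ # e.g. for Window=9 and Edge=0.4 the one-sided weight list is
+ # [0.4, 0.55, 0.7, 0.85], so sum_of_weights = 2*2.5 + 1 = 6.0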
+
+ for i in range(self.length-Window+1):
+ subsequence = self.sequence[i:i+Window]
+ score = 0.0
+ for j in range(Window/2):
+ # walk from the outside of the Window towards the middle.
+ # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
+ try:
+ score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]]
+ except KeyError:
+ sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
+ (subsequence[j],subsequence[Window-j-1]))
+
+ # Now add the middle value, which always has a weight of 1.
+ if subsequence[Window/2] in ParamDict:
+ score += ParamDict[subsequence[Window/2]]
+ else:
+ sys.stderr.write('warning: %s is not a standard amino acid.\n' % (subsequence[Window/2]))
+
+ list.append(score/sum_of_weights)
+ return list
+
+ # calculate the isoelectric point.
+ def isoelectric_point(self):
+ if not self.amino_acids_content:
+ self.count_amino_acids()
+ X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content)
+ return X.pi()
+
+ # calculate fraction of helix, turn and sheet
+ def secondary_structure_fraction (self):
+ if not self.amino_acids_percent:
+ self.get_amino_acids_percent()
+ Helix = self.amino_acids_percent['V'] + self.amino_acids_percent['I'] + self.amino_acids_percent['Y'] + self.amino_acids_percent['F'] + self.amino_acids_percent['W'] + self.amino_acids_percent['L']
+ Turn = self.amino_acids_percent['N'] + self.amino_acids_percent['P'] + self.amino_acids_percent['G'] + self.amino_acids_percent['S']
+ Sheet = self.amino_acids_percent['E'] + self.amino_acids_percent['M'] + self.amino_acids_percent['A'] + self.amino_acids_percent['L']
+ return Helix, Turn, Sheet
+
+#---------------------------------------------------------#
+"""
+X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV")
+print X.count_amino_acids()
+print X.get_amino_acids_percent()
+print X.molecular_weight()
+print X.aromaticity()
+print X.instability_index()
+print X.flexibility()
+print X.isoelectric_point()
+print X.secondary_structure_fraction()
+print X.protein_scale(ProtParamData.kd, 9, 0.4)
+"""
--- /dev/null
+# This module contains indices to be used with ProtParam
+
+# Kyte & Doolittle index of hydrophobicity
+kd = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5,
+ 'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5,
+ 'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6,
+ 'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 }
+
+# Flexibility
+# Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
+Flex= {'A': 0.984, 'C': 0.906, 'E': 1.094, 'D': 1.068,
+ 'G': 1.031, 'F': 0.915, 'I': 0.927, 'H': 0.950,
+ 'K': 1.102, 'M': 0.952, 'L': 0.935, 'N': 1.048,
+ 'Q': 1.037, 'P': 1.049, 'S': 1.046, 'R': 1.008,
+ 'T': 0.997, 'W': 0.904, 'V': 0.931, 'Y': 0.929}
+
+# Hydrophilicity
+# 1 Hopp & Wood
+# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981).
+hw = { 'A':-0.5,'R':3.0, 'N':0.2, 'D':3.0, 'C':-1.0,
+ 'Q':0.2, 'E':3.0, 'G':0.0, 'H':-0.5,'I':-1.8,
+ 'L':-1.8,'K':3.0, 'M':-1.3,'F':-2.5,'P':0.0,
+ 'S':0.3, 'T':-0.4,'W':-3.4,'Y':-2.3,'V':-1.5 }
+
+# Surface accessibility
+# 1 Emini Surface fractional probability
+em = { 'A':0.815,'R':1.475,'N':1.296,'D':1.283,'C':0.394,
+ 'Q':1.348,'E':1.445,'G':0.714,'H':1.180,'I':0.603,
+ 'L':0.603,'K':1.545,'M':0.714,'F':0.695,'P':1.236,
+ 'S':1.115,'T':1.184,'W':0.808,'Y':1.089,'V':0.606 }
+
+# 2 Janin Interior to surface transfer energy scale
+ja = { 'A': 0.28,'R':-1.14,'N':-0.55,'D':-0.52,'C': 0.97,
+ 'Q':-0.69,'E':-1.01,'G': 0.43,'H':-0.31,'I': 0.60,
+ 'L': 0.60,'K':-1.62,'M': 0.43,'F': 0.46,'P':-0.42,
+ 'S':-0.19,'T':-0.32,'W': 0.29,'Y':-0.15,'V': 0.60 }
+
+
+# A two-dimensional dictionary for calculating the instability index.
+# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
+# It is based on dipeptide values, therefore the value for the dipeptide DG is DIWV['D']['G'].
+# I know this looks ugly but I can't think of a better way to display it.
+DIWV = {
+'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
+'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60, 'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0, 'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},
+'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26, 'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0, 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
+'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54, 'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
+'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0, 'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49},
+'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601},
+'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34, 'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0, 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
+'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0, 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68, 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94},
+'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0, 'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0, 'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60, 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
+'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28, 'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0, 'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54, 'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68},
+'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26, 'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0},
+'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0, 'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0, 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0},
+'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26, 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54},
+'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54, 'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54, 'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0},
+'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},
+'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28, 'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54},
+'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0},
+'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68, 'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0},
+'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54},
+'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34, 'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91, 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34}}
\ No newline at end of file
--- /dev/null
+#!/usr/bin/env python
+# Created: Wed May 29 08:07:18 2002
+# thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
+# Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Miscellaneous functions for dealing with sequences."""
+
+import re, time
+from Bio import SeqIO
+from Bio import Translate
+from Bio.Seq import Seq
+from Bio import Alphabet
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData, CodonTable
+
+
+######################################
+# DNA
+######################
+# {{{
+
+def reverse(seq):
+ """Reverse the sequence. Works on string sequences.
+
+ e.g.
+ >>> reverse("ACGGT")
+ 'TGGCA'
+
+ """
+ r = list(seq)
+ r.reverse()
+ return ''.join(r)
+
+def GC(seq):
+ """Calculates G+C content, returns the percentage (float between 0 and 100).
+
+ Copes with mixed case sequences, and with the ambiguous nucleotide S (G or C)
+ when counting the G and C content. The percentage is calculated against
+ the full length, e.g.:
+
+ >>> from Bio.SeqUtils import GC
+ >>> GC("ACTGN")
+ 40.0
+
+ Note that this will return zero for an empty sequence.
+ """
+ try :
+ gc = sum(map(seq.count,['G','C','g','c','S','s']))
+ return gc*100.0/len(seq)
+ except ZeroDivisionError :
+ return 0.0
+
+
+def GC123(seq):
+ """Calculates total G+C content plus first, second and third positions.
+
+ Returns a tuple of four floats (percentages between 0 and 100) for the
+ entire sequence, and the three codon positions. e.g.
+
+ >>> from Bio.SeqUtils import GC123
+ >>> GC123("ACTGTN")
+ (40.0, 50.0, 50.0, 0.0)
+
+ Copes with mixed case sequences, but does NOT deal with ambiguous
+ nucleotides.
+ """
+ d= {}
+ for nt in ['A','T','G','C']:
+ d[nt] = [0,0,0]
+
+ for i in range(0,len(seq),3):
+ codon = seq[i:i+3]
+ if len(codon) <3: codon += ' '
+ for pos in range(0,3):
+ for nt in ['A','T','G','C']:
+ if codon[pos] == nt or codon[pos] == nt.lower():
+ d[nt][pos] += 1
+ gc = {}
+ gcall = 0
+ nall = 0
+ for i in range(0,3):
+ try:
+ n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
+ gc[i] = (d['G'][i] + d['C'][i])*100.0/n
+ except ZeroDivisionError:
+ gc[i] = 0
+
+ gcall = gcall + d['G'][i] + d['C'][i]
+ nall = nall + n
+
+ gcall = 100.0*gcall/nall
+ return gcall, gc[0], gc[1], gc[2]
+
+def GC_skew(seq, window = 100):
+ """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
+
+ Returns a list of ratios (floats), controlled by the length of the sequence
+ and the size of the window.
+
+ Does NOT look at any ambiguous nucleotides.
+ """
+ # 8/19/03: Iddo: added lowercase
+ values = []
+ for i in range(0, len(seq), window):
+ s = seq[i: i + window]
+ g = s.count('G') + s.count('g')
+ c = s.count('C') + s.count('c')
+ skew = (g-c)/float(g+c)
+ values.append(skew)
+ return values
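+
+#A worked example of the window arithmetic above: GC_skew("GGGGC"*20, 50)
+#gives [0.6, 0.6], since each 50 nt window holds 40 G and 10 C, and
+#(G-C)/(G+C) = (40-10)/(40+10) = 0.6.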
+
+from math import pi, sin, cos, log
+def xGC_skew(seq, window = 1000, zoom = 100,
+ r = 300, px = 100, py = 100):
+ """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
+ from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
+ VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
+ yscroll = Scrollbar(orient = VERTICAL)
+ xscroll = Scrollbar(orient = HORIZONTAL)
+ canvas = Canvas(yscrollcommand = yscroll.set,
+ xscrollcommand = xscroll.set, background = 'white')
+ win = canvas.winfo_toplevel()
+ win.geometry('700x700')
+
+ yscroll.config(command = canvas.yview)
+ xscroll.config(command = canvas.xview)
+ yscroll.pack(side = RIGHT, fill = Y)
+ xscroll.pack(side = BOTTOM, fill = X)
+ canvas.pack(fill=BOTH, side = LEFT, expand = 1)
+ canvas.update()
+
+ X0, Y0 = r + px, r + py
+ x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
+
+ ty = Y0
+ canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
+ ty +=20
+ canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
+ ty +=20
+ canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
+ ty +=20
+ canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
+ ty +=20
+ canvas.create_oval(x1,y1, x2, y2)
+
+ acc = 0
+ start = 0
+ for gc in GC_skew(seq, window):
+ r1 = r
+ acc+=gc
+ # GC skew
+ alpha = pi - (2*pi*start)/len(seq)
+ r2 = r1 - gc*zoom
+ x1 = X0 + r1 * sin(alpha)
+ y1 = Y0 + r1 * cos(alpha)
+ x2 = X0 + r2 * sin(alpha)
+ y2 = Y0 + r2 * cos(alpha)
+ canvas.create_line(x1,y1,x2,y2, fill = 'blue')
+ # accumulated GC skew
+ r1 = r - 50
+ r2 = r1 - acc
+ x1 = X0 + r1 * sin(alpha)
+ y1 = Y0 + r1 * cos(alpha)
+ x2 = X0 + r2 * sin(alpha)
+ y2 = Y0 + r2 * cos(alpha)
+ canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
+
+ canvas.update()
+ start += window
+
+ canvas.configure(scrollregion = canvas.bbox(ALL))
+
+def molecular_weight(seq):
+ """Calculate the molecular weight of a DNA sequence."""
+ if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
+ weight_table = IUPACData.unambiguous_dna_weights
+ #TODO, use a generator expression once we drop Python 2.3?
+ #e.g. return sum(weight_table[x] for x in seq)
+ total = 0
+ for x in seq:
+ total += weight_table[x]
+ return total
+
+def nt_search(seq, subseq):
+ """Search for a DNA subseq in sequence.
+
+ use ambiguous values (like N = A or T or C or G, R = A or G etc.)
+ searches only on forward strand
+ """
+ pattern = ''
+ for nt in subseq:
+ value = IUPACData.ambiguous_dna_values[nt]
+ if len(value) == 1:
+ pattern += value
+ else:
+ pattern += '[%s]' % value
+
+ pos = -1
+ result = [pattern]
+ l = len(seq)
+ while True:
+ pos+=1
+ s = seq[pos:]
+ m = re.search(pattern, s)
+ if not m: break
+ pos += int(m.start(0))
+ result.append(pos)
+ return result
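+
+#A sketch of the behaviour for illustration: nt_search("ACTGCTTAG", "GC")
+#returns ['GC', 3], i.e. the expanded regular expression pattern followed
+#by each zero-based match position. With an ambiguous query such as "N"
+#every position matches, since N expands to a character class covering
+#all four bases.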
+
+# }}}
+
+######################################
+# Protein
+######################
+# {{{
+
+# temporary hack for exception free translation of "dirty" DNA
+# should be moved to ???
+
+class ProteinX(Alphabet.ProteinAlphabet):
+ letters = IUPACData.extended_protein_letters + "X"
+
+proteinX = ProteinX()
+
+class MissingTable:
+ def __init__(self, table):
+ self._table = table
+ def get(self, codon, stop_symbol):
+ try:
+ return self._table.get(codon, stop_symbol)
+ except CodonTable.TranslationError:
+ return 'X'
+
+def makeTableX(table):
+ assert table.protein_alphabet == IUPAC.extended_protein
+ return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX,
+ MissingTable(table.forward_table),
+ table.back_table, table.start_codons,
+ table.stop_codons)
+
+# end of hacks
+
+def seq3(seq):
+ """Turn a one letter code protein sequence into one with three letter codes.
+
+ The single input argument 'seq' should be a protein sequence using single
+ letter codes, either as a python string or as a Seq or MutableSeq object.
+
+ This function returns the amino acid sequence as a string using the three
+ letter amino acid codes. Output follows the IUPAC standard (including
+ ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
+ for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
+ character (including possible gap characters), is changed into 'Xaa'.
+
+ e.g.
+ >>> from Bio.SeqUtils import seq3
+ >>> seq3("MAIVMGRWKGAR*")
+ 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
+
+ This function was inspired by BioPerl's seq3.
+ """
+ threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
+ 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
+ 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
+ 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
+ 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
+ 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
+ 'U':'Sec', 'O':'Pyl', 'J':'Xle',
+ }
+ #We use a default of 'Xaa' for undefined letters
+ #Note this will map '-' to 'Xaa' which may be undesirable!
+ return ''.join([threecode.get(aa,'Xaa') for aa in seq])
+
+
+# }}}
+
+######################################
+# Mixed ???
+######################
+# {{{
+
+def translate(seq, frame = 1, genetic_code = 1, translator = None):
+ """Translation of DNA in one of the six different reading frames (DEPRECATED).
+
+ Use the Bio.Seq.Translate function, or the Seq object's translate method
+ instead:
+
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
+ >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
+ >>> for frame in [0,1,2] :
+ ... print my_seq[frame:].translate()
+ ...
+ MAIVMGR*KGAR*
+ WPL*WAAERVPDS
+ GHCNGPLKGCPIV
+ >>> for frame in [0,1,2] :
+ ... print my_seq.reverse_complement()[frame:].translate()
+ ...
+ YYRAPFQRPITMA
+ TIGHPFSGPLQWP
+ LSGTLSAAHYNGH
+ """
+ import warnings
+ warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
+ +" to remove it in a future release of Biopython. Please use"\
+ +" the method or function in Bio.Seq instead, as described in"\
+ +" the Tutorial.", DeprecationWarning)
+
+ if frame not in [1,2,3,-1,-2,-3]:
+ raise ValueError('invalid frame')
+
+ if not translator:
+ table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
+ translator = Translate.Translator(table)
+
+ #Does this frame calculation do something sensible? No RC taken!
+ return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
+
+def GC_Frame(seq, genetic_code = 1):
+ """Just an alias for six_frame_translations (OBSOLETE).
+
+ Use six_frame_translations directly, as this function may be deprecated
+ in a future release."""
+ return six_frame_translations(seq, genetic_code)
+
+def six_frame_translations(seq, genetic_code = 1):
+ """Formatted string showing the 6 frame translations and GC content.
+
+ nice looking 6 frame translation with GC content - code from xbbtools
+ similar to DNA Strider's six-frame translation
+
+ e.g.
+ from Bio.SeqUtils import six_frame_translations
+ print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
+ """
+ from Bio.Seq import reverse_complement, translate
+ anti = reverse_complement(seq)
+ comp = anti[::-1]
+ length = len(seq)
+ frames = {}
+ for i in range(0,3):
+ frames[i+1] = translate(seq[i:], genetic_code)
+ frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
+
+ # create header
+ if length > 20:
+ short = '%s ... %s' % (seq[:10], seq[-10:])
+ else:
+ short = seq
+ #TODO? Remove the date as this would spoil any unit test...
+ date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
+ header = 'GC_Frame: %s, ' % date
+ for nt in ['a','t','g','c']:
+ header += '%s:%d ' % (nt, seq.count(nt.upper()))
+
+ header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
+ res = header
+
+ for i in range(0,length,60):
+ subseq = seq[i:i+60]
+ csubseq = comp[i:i+60]
+ p = i/3
+ res = res + '%d/%d\n' % (i+1, i/3+1)
+ res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
+ res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
+ res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
+ # seq
+ res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
+ res = res + csubseq.lower() + '\n'
+ # - frames
+ res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
+ res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
+ res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
+ return res
+
+# }}}
+
+######################################
+# FASTA file utilities
+######################
+# {{{
+
+def fasta_uniqids(file):
+ """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
+
+ file - a FASTA format filename to read in.
+
+ No return value, the output is written to screen.
+ """
+ dict = {}
+ txt = open(file).read()
+ for entry in txt.split('>')[1:]:
+ name, seq= entry.split('\n',1)
+ name = name.split()[0].split(',')[0]
+
+ if name in dict:
+ n = 1
+ while 1:
+ n = n + 1
+ _name = name + str(n)
+ if _name not in dict:
+ name = _name
+ break
+
+ dict[name] = seq
+
+ for name, seq in dict.items():
+ print '>%s\n%s' % (name, seq)
+
+def quick_FASTA_reader(file):
+ """Simple FASTA reader, returning a list of string tuples.
+
+ The single argument 'file' should be the filename of a FASTA format file.
+ This function will open and read in the entire file, constructing a list
+ of all the records, each held as a tuple of strings (the sequence name or
+ title, and its sequence).
+
+ This function was originally intended for use on large files, where its
+ low overhead makes it very fast. However, because it returns the data as
+ a single in memory list, this can require a lot of RAM on large files.
+
+ You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
+ allows you to iterate over the records one by one (avoiding having all the
+ records in memory at once). Using Bio.SeqIO also makes it easy to switch
+ between different input file formats. However, please note that rather
+ than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
+ """
+ #Want to split on "\n>" not just ">" in case there are any extra ">"
+ #in the name/description. So, in order to make sure we also split on
+ #the first entry, prepend a "\n" to the start of the file.
+ handle = open(file)
+ txt = "\n" + handle.read()
+ handle.close()
+ entries = []
+ for entry in txt.split('\n>')[1:]:
+ name,seq= entry.split('\n',1)
+ seq = seq.replace('\n','').replace(' ','').upper()
+ entries.append((name, seq))
+ return entries
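+
+#Example usage (a sketch; "example.fasta" is a hypothetical filename):
+#
+# >>> for title, sequence in quick_FASTA_reader("example.fasta"):
+# ... print title, len(sequence)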
+
+def apply_on_multi_fasta(file, function, *args):
+ """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
+
+ file - filename of a FASTA format file
+ function - the function you wish to invoke on each record
+ *args - any extra arguments you want passed to the function
+
+ This function will iterate over each record in a FASTA file as SeqRecord
+ objects, calling your function with the record (and supplied args) as
+ arguments.
+
+ This function returns a list. For those records where your function
+ returns a value, this is taken as a sequence and used to construct a
+ FASTA format string. If your function never has a return value, this
+ means apply_on_multi_fasta will return an empty list.
+ """
+ try:
+ f = globals()[function]
+ except:
+ raise NotImplementedError("%s not implemented" % function)
+
+ handle = open(file, 'r')
+ records = SeqIO.parse(handle, "fasta")
+ results = []
+ for record in records:
+ arguments = [record.seq] #the SeqRecord's sequence as a Seq object
+ for arg in args: arguments.append(arg)
+ result = f(*arguments)
+ if result:
+ results.append('>%s\n%s' % (record.name, result))
+ handle.close()
+ return results
+
+def quicker_apply_on_multi_fasta(file, function, *args):
+ """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
+
+ file - filename of a FASTA format file
+ function - the function you wish to invoke on each record
+ *args - any extra arguments you want passed to the function
+
+ This function will use quick_FASTA_reader to load every record in the
+ FASTA file into memory as a list of tuples. For each record, it will
+ call your supplied function with the record as a tuple of the name and
+ sequence as strings (plus any supplied args).
+
+ This function returns a list. For those records where your function
+ returns a value, this is taken as a sequence and used to construct a
+ FASTA format string. If your function never has a return value, this
+ means quicker_apply_on_multi_fasta will return an empty list.
+ """
+ try:
+ f = globals()[function]
+ except KeyError:
+ raise NotImplementedError("%s not implemented" % function)
+
+ entries = quick_FASTA_reader(file)
+ results = []
+ for name, seq in entries:
+ arguments = [seq]
+ arguments.extend(args)
+ result = f(*arguments)
+ if result:
+ results.append('>%s\n%s' % (name, result))
+ return results
+
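+# A minimal sketch of how the two helpers above are meant to be called
+# (editorial illustration, not from the original module; GC is assumed
+# to be the GC function defined earlier in this file, and
+# "example.fasta" is a hypothetical input). Note the function is passed
+# by name, since it is looked up via globals():
+#
+#     for block in apply_on_multi_fasta("example.fasta", "GC"):
+#         print block
+#
+#     for block in quicker_apply_on_multi_fasta("example.fasta", "GC"):
+#         print block
+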
+# }}}
+
+######################################
+# Main
+#####################
+# {{{
+
+if __name__ == '__main__':
+ import sys, getopt
+ # crude command line options to use most functions directly on a FASTA file
+ options = {'apply_on_multi_fasta':0,
+ 'quick':0,
+ 'uniq_ids':0,
+ }
+
+ optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
+ 'help', 'quick', 'uniq_ids', 'search='])
+ for arg in optlist:
+ if arg[0] in ['-h', '--help']:
+ pass
+ elif arg[0] in ['--describe']:
+ # get all new functions from this file
+ mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
+ mol_funcs.sort()
+ print 'available functions:'
+ for f in mol_funcs: print '\t--%s' % f
+ print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
+
+ sys.exit(0)
+ elif arg[0] in ['--apply_on_multi_fasta']:
+ options['apply_on_multi_fasta'] = arg[1]
+ elif arg[0] in ['--search']:
+ options['search'] = arg[1]
+ else:
+ key = re.search('-*(.+)', arg[0]).group(1)
+ options[key] = 1
+
+
+ if options.get('apply_on_multi_fasta'):
+ file = args[0]
+ function = options['apply_on_multi_fasta']
+ arguments = []
+ if options.get('search'):
+ arguments = [options['search']]
+ if function == 'xGC_skew':
+ arguments = [1000]
+ if options.get('quick'):
+ results = quicker_apply_on_multi_fasta(file, function, *arguments)
+ else:
+ results = apply_on_multi_fasta(file, function, *arguments)
+ for result in results: print result
+
+ elif options.get('uniq_ids'):
+ file = args[0]
+ fasta_uniqids(file)
+
+# }}}
+
--- /dev/null
+# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
+# All rights reserved. This code is part of the Biopython
+# distribution and governed by its license.
+# Please see the LICENSE file that should have been included as part
+# of this package.
+
+import math
+
+def lcc_mult(seq,wsize):
+ """Local Composition Complexity (LCC) values over sliding window.
+
+ Returns a list of floats, the LCC values for a sliding window over
+ the sequence.
+
+ seq - an unambiguous DNA sequence (a string or Seq object)
+ wsize - window size, integer
+
+ The result is the same as applying lcc_simp multiple times, but this
+ version is optimized for speed. The optimization works by using the
+ value of the previous window as a base to compute the next one."""
+ l2=math.log(2)
+ tamseq=len(seq)
+ try:
+ #Assume it's a string
+ upper = seq.upper()
+ except AttributeError:
+ #Should be a Seq object then
+ upper = seq.tostring().upper()
+ compone=[0]
+ lccsal=[]
+ for i in range(wsize):
+ compone.append(((i+1)/float(wsize))*
+ ((math.log((i+1)/float(wsize)))/l2))
+ window=upper[0:wsize]
+ cant_a=window.count('A')
+ cant_c=window.count('C')
+ cant_t=window.count('T')
+ cant_g=window.count('G')
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ tail=upper[0]
+ for x in range (tamseq-wsize):
+ window=upper[x+1:wsize+x+1]
+ if tail==window[-1]:
+ lccsal.append(lccsal[-1])
+ elif tail=='A':
+ cant_a=cant_a-1
+ if window.endswith('C'):
+ cant_c=cant_c+1
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_a=compone[cant_a]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_a=compone[cant_a]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='C':
+ cant_c=cant_c-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_c=compone[cant_c]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='T':
+ cant_t=cant_t-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('C'):
+ cant_c=cant_c+1
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='G':
+ cant_g=cant_g-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('C'):
+ cant_c=cant_c+1
+ term_c=compone[cant_c]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ tail=window[0]
+ return lccsal
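+
+# A quick consistency check (editorial illustration, not from the
+# original module): lcc_mult should give one value per window, matching
+# lcc_simp (defined below) applied to each window slice in turn:
+#
+#     seq = "ACGATAGCATTACGATCGATCGAT"
+#     wsize = 10
+#     fast = lcc_mult(seq, wsize)
+#     slow = [lcc_simp(seq[i:i+wsize]) for i in range(len(seq)-wsize+1)]
+#     assert len(fast) == len(slow)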
+
+def lcc_simp(seq):
+ """Local Composition Complexity (LCC) for a sequence.
+
+ seq - an unambiguous DNA sequence (a string or Seq object)
+
+ Returns the Local Composition Complexity (LCC) value for the entire
+ sequence (as a float).
+
+ Reference:
+ Andrzej K Konopka (2005) Sequence Complexity and Composition
+ DOI: 10.1038/npg.els.0005260
+ """
+ wsize=len(seq)
+ try:
+ #Assume it's a string
+ upper = seq.upper()
+ except AttributeError:
+ #Should be a Seq object then
+ upper = seq.tostring().upper()
+ l2=math.log(2)
+ # Check each base is present before taking a log, to avoid log(0);
+ # test against 'upper' so lowercase input is handled consistently.
+ if 'A' not in upper:
+ term_a=0
+ else:
+ term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
+ /float(wsize)))/l2)
+ if 'C' not in upper:
+ term_c=0
+ else:
+ term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
+ /float(wsize)))/l2)
+ if 'T' not in upper:
+ term_t=0
+ else:
+ term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
+ /float(wsize)))/l2)
+ if 'G' not in upper:
+ term_g=0
+ else:
+ term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
+ /float(wsize)))/l2)
+ lccsal=-(term_a+term_c+term_t+term_g)
+ return lccsal
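+
+# A small worked example (editorial illustration, not from the original
+# module): in "ACGT" each base occurs once, so each term is
+# (1/4)*log2(1/4) = -0.5 and the LCC value is 2.0, the maximum entropy
+# for a four-letter alphabet; a homopolymer such as "AAAA" contributes
+# 1*log2(1) = 0, the minimum:
+#
+#     assert abs(lcc_simp("ACGT") - 2.0) < 1e-9
+#     assert lcc_simp("AAAA") == 0.0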
--- /dev/null
+# This is a Python module.
+"""This module is DEPRECATED.
+
+Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules
+and associated ones like Bio.Std are now deprecated. They are no longer
+used in any of the current Biopython parsers, and are likely to be removed
+in a future release.
+"""
+
+import warnings
+warnings.warn("Martel and those parts of Biopython depending on it" \
+ +" directly (such as Bio.Mindy and Bio.Std) are now" \
+ +" deprecated, and will be removed in a future release of"\
+ +" Biopython. If you want to continue to use this code,"\
+ +" please get in contact with the Biopython developers via"\
+ +" the mailing lists to avoid its permanent removal from"\
+ +" Biopython.", \
+ DeprecationWarning)
+# Standard Bioformats definitions
+
+import Martel
+Group = Martel.Group
+
+namespace = "bioformat"
+NS = namespace + ":"
+XMLNS = "http://biopython.org/bioformat"
+
+def _set_if_given(attrs, field, d, valid = None, convert = None):
+ value = attrs.get(field)
+ if value is not None:
+ if valid is not None:
+ if value not in valid:
+ raise TypeError("%s (%r) must be one of %s" % \
+ (field, value, valid))
+ if convert is None:
+ d[field] = value
+ else:
+ d[field] = convert(value)
+
+def _complain_if_given(attrs, name):
+ if attrs.has_key(name) and attrs[name] is not None:
+ raise NotImplementedError("Don't yet handle %r" % (name,))
+
+def _must_have(expr, f):
+ tag = f.tag
+ if tag not in expr.group_names():
+ raise TypeError(
+ "group %r not present in the expression but is required" % \
+ (tag,))
+
+def _must_have_set(expr, sets):
+ names = expr.group_names()
+ for set in sets:
+ for f in set:
+ tag = f.tag
+ if tag not in names:
+ break
+ else:
+ return
+ if len(sets) == 1:
+ raise TypeError("missing required tags (need %s) in expression" %
+ [f.tag for f in sets[0]])
+ lines = ["missing required tags in expression; must have one set from:"]
+ for set in sets:
+ lines.append( str( [f.tag for f in set] ) )
+ s = "\n".join(lines)
+ raise TypeError(s)
+
+def _must_not_have(expr, f):
+ tag = f.tag
+ if tag in expr.group_names():
+ raise TypeError(
+ "group %r present in the expression but is not allowed" % \
+ (tag,))
+
+
+# Pre-Python 2.2 functions didn't allow attributes
+def _f():
+ pass
+try:
+ _f.x = 1
+ _use_hack = 0
+except AttributeError:
+ _use_hack = 1
+del _f
+
+def _check_name(f, text):
+ if text == "record": # XXX FIXME
+ return
+ assert NS + f.func_name == text, (NS + f.func_name, text)
+
+def _check_attrs(attrs, names):
+ for name in attrs.keys():
+ if name not in names:
+ raise TypeError("attr %r is not allowed here (valid terms: %s)" % \
+ (name, names))
+ d = attrs.copy()
+ for name in names:
+ if not d.has_key(name):
+ d[name] = None
+ return d
+
+if not _use_hack:
+ def _settag(f, tag):
+ _check_name(f, tag)
+ f.tag = tag
+else:
+ # Convert the functions into callable objects
+ class StdTerm:
+ def __init__(self, func):
+ self._func = func
+ def __call__(self, *args, **kwargs):
+ return self._func( *args, **kwargs)
+
+ def _settag(f, tag):
+ _check_name(f, tag)
+ x = globals()[f.func_name] = StdTerm(f)
+ x.tag = tag
+
+################ identifier, description, and cross-references
+def record(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("format",))
+ d = {"xmlns:bioformat": XMLNS}
+ _set_if_given(attrs, "format", d)
+ return Group("record", expr, d) # XXX FIXME
+_settag(record, "record") # XXX AND FIXME
+
+
+def dbid(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("type", "style", "dbname"))
+ d = {}
+ _set_if_given(attrs, "type", d, ("primary", "accession", "secondary"))
+ _set_if_given(attrs, "dbname", d)
+ return Group(NS + "dbid", expr, d)
+_settag(dbid, NS + "dbid")
+
+def description_block(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("join",))
+ _must_have(expr, description)
+ d = {}
+ _set_if_given(attrs, "join", d, ("english", "concat", "space", "newline"))
+ return Group(NS + "description_block", expr, d)
+_settag(description_block, NS + "description_block")
+
+def description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "description", expr)
+_settag(description, NS + "description")
+
+def description_line(expr, attrs = {}):
+ return description_block(description(expr, attrs))
+
+def fast_dbxref(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("style",))
+ d = {}
+ _set_if_given(attrs, "style", d, ("sp-general", "sp-prosite", "sp-embl"))
+ return Group(NS + "fast_dbxref", expr, d)
+
+def dbxref(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("style",))
+ _must_have(expr, dbxref_dbid)
+ d = {}
+ _complain_if_given(attrs, "style")
+ return Group(NS + "dbxref", expr, d)
+_settag(dbxref, NS + "dbxref")
+
+def dbxref_dbname(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("style",))
+ d = {}
+ _set_if_given(attrs, "style", d)
+ return Group(NS + "dbxref_dbname", expr, d)
+_settag(dbxref_dbname, NS + "dbxref_dbname")
+
+def dbxref_dbid(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("dbname", "type", "style", "negate"))
+ d = {}
+ _set_if_given(attrs, "dbname", d)
+ _set_if_given(attrs, "type", d, ("primary", "accession", "secondary"))
+ _complain_if_given(attrs, "style")
+ _set_if_given(attrs, "negate", d, (0, 1), str)
+
+ return Group(NS + "dbxref_dbid", expr, d)
+_settag(dbxref_dbid, NS + "dbxref_dbid")
+
+def dbxref_negate(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "dbxref_negate", expr)
+_settag(dbxref_negate, NS + "dbxref_negate")
+
+##################### sequences
+
+def _check_gapchar(s):
+ if not ( ord(" ") <= ord(s) <= 126 ):
+ raise TypeError("%r not allowed as a gap character" % (s,))
+ return s
+
+# What about three letter codes?
+def sequence_block(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("alphabet", "gapchar", "remove_spaces"))
+ _must_have(expr, sequence)
+ d = {}
+ _set_if_given(attrs, "alphabet", d,
+ ("iupac-protein", "iupac-dna", "iupac-rna",
+ "iupac-ambiguous-protein",
+ "iupac-ambiguous-dna",
+ "iupac-ambiguous-rna",
+ "protein", "dna", "rna", "unknown"))
+ _set_if_given(attrs, "gapchar", d, convert = _check_gapchar)
+ _set_if_given(attrs, "remove_spaces", d, (0, 1), str)
+ return Group(NS + "sequence_block", expr, d)
+_settag(sequence_block, NS + "sequence_block")
+
+def sequence(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "sequence", expr)
+_settag(sequence, NS + "sequence")
+
+def alphabet(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("alphabet",))
+ d = {}
+ _set_if_given(attrs, "alphabet", d,
+ ("iupac-protein", "iupac-dna", "iupac-rna",
+ "iupac-ambiguous-protein",
+ "iupac-ambiguous-dna",
+ "iupac-ambiguous-rna",
+ "protein", "dna", "rna", "nucleotide", "unknown"))
+ return Group(NS + "alphabet", expr, d)
+_settag(alphabet, NS + "alphabet")
+
+
+
+############################## features
+
+# In PIR
+
+# FEATURE
+# 1-25 #domain signal sequence #status predicted #label SIG\
+# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status
+# predicted #label MAT\
+# 63,209,297,365,522,
+# 725 #binding_site carbohydrate (Asn) (covalent) #status
+# predicted
+
+# The whole thing is a 'feature_block'
+
+# One 'feature' is
+# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status
+# predicted #label MAT\
+
+# One 'feature_name' is "binding_site".
+
+# An example of the feature_location_block and feature_location, which I
+# will abbreviate as 'flb' and 'fl', is:
+# <flb> <fl>63,209,297,365,522,</fl>
+# <fl>725</fl> #binding_site carbohydrate ...
+
+# PIR doesn't have a 'feature_description'
+
+# Let:
+# fq = feature_qualifier
+# fqb = feature_qualifier_block
+# fqn = feature_qualifier_name
+# fqd = feature_qualifier_description
+# then the text
+#
+# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status
+# predicted #label MAT\
+#
+# can be represented as (the rather tedious)
+#
+# 26-737 <fqb><fq>#<fqn>product</fqn> <fqd>procollagen-\
+# lysine 5-dioxygenase 2</fqd></fq> #<fq><fqn>status</fqn>
+# <fqd>predicted</fqd> #<fq><fqn>label\
+# </fqn> <fqd>MAT</fqd></fq>\</fqb>
+#
+
+# 'style' determines the namespace for the feature name
+def feature_block(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("style", "location-style"))
+ d = {}
+ _set_if_given(attrs, "style", d)
+ _set_if_given(attrs, "location-style", d)
+ _must_have(expr, feature)
+ return Group(NS + "feature_block", expr, d)
+_settag(feature_block, NS + "feature_block")
+
+def feature(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("location-style",))
+ d = {}
+ _set_if_given(attrs, "location-style", d)
+ _must_have(expr, feature_name)
+ _must_have_set(expr, [[feature_location],
+ [feature_location_start, feature_location_end]])
+ return Group(NS + "feature", expr, d)
+_settag(feature, NS + "feature")
+
+def feature_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_name", expr)
+_settag(feature_name, NS + "feature_name")
+
+def feature_location(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_location", expr)
+_settag(feature_location, NS + "feature_location")
+
+def feature_location_start(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_location_start", expr)
+_settag(feature_location_start, NS + "feature_location_start")
+
+def feature_location_end(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_location_end", expr)
+_settag(feature_location_end, NS + "feature_location_end")
+
+def feature_description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_description", expr)
+_settag(feature_description, NS + "feature_description")
+
+
+##def feature_qualifier_block(expr, attrs = {}):
+## attrs = _check_attrs(attrs, ())
+## _must_have(expr, feature_qualifier)
+## return Group(NS + "feature_qualifier_block", expr)
+##_settag(feature_qualifier_block, NS + "feature_qualifier_block")
+
+def feature_qualifier(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ _must_have(expr, feature_qualifier_name)
+ return Group(NS + "feature_qualifier", expr)
+_settag(feature_qualifier, NS + "feature_qualifier")
+
+def feature_qualifier_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_qualifier_name", expr)
+_settag(feature_qualifier_name, NS + "feature_qualifier_name")
+
+def feature_qualifier_description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group(NS + "feature_qualifier_description", expr)
+_settag(feature_qualifier_description, NS + "feature_qualifier_description")
+
+
+############ For homology searches
+
+# "BLASTN", "BLASTP"
+def application_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("app",))
+ return Group("bioformat:application_name", expr, attrs)
+
+# "2.0.11", "2.0a19MP-WashU"
+def application_version(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:application_version", expr, attrs)
+
+def search_header(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:search_header", expr, attrs)
+
+def search_table(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:search_table", expr, attrs)
+
+def search_table_description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("bioformat:decode",))
+ d = {"bioformat:decode": "strip"}
+ _set_if_given(attrs, "bioformat:decode", d)
+ return Group("bioformat:search_table_description", expr, d)
+
+def search_table_value(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("name", "bioformat:decode"))
+ return Group("bioformat:search_table_value", expr, attrs)
+
+def search_table_entry(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:search_table_entry", expr, attrs)
+
+def query_description_block(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("join-query",))
+ d = {"join-query": "join|fixspaces"}
+ _set_if_given(attrs, "join-query", d)
+ return Group("bioformat:query_description_block", expr, d)
+
+def query_description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("bioformat:decode",))
+ d = {}
+ _set_if_given(attrs, "bioformat:decode", d)
+ return Group("bioformat:query_description", expr, d)
+
+def query_size(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:query_size", expr)
+
+def database_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:database_name", expr, attrs)
+
+def database_num_sequences(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("bioformat:decode",))
+ return Group("bioformat:database_num_sequences", expr, attrs)
+
+def database_num_letters(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("bioformat:decode",))
+ return Group("bioformat:database_num_letters", expr, attrs)
+
+def hit(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("join-description",))
+ d = {"join-description": "join|fixspaces"}
+ _set_if_given(attrs, "join-description", d)
+ return Group("bioformat:hit", expr, d)
+
+def hit_length(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hit_length", expr, attrs)
+
+def hit_description(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("bioformat:decode",))
+ d = {}
+ _set_if_given(attrs, "bioformat:decode", d)
+ return Group("bioformat:hit_description", expr, d)
+
+def hsp(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp", expr, attrs)
+
+def hsp_value(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("name", "bioformat:decode"))
+ return Group("bioformat:hsp_value", expr, attrs)
+
+def hsp_frame(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("which",))
+ d = {}
+ _set_if_given(attrs, "which", d, valid = ("query", "homology", "subject"))
+ return Group("bioformat:hsp_frame", expr, d)
+
+def hsp_strand(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("strand", "which"))
+ d = {}
+ _set_if_given(attrs, "which", d, valid = ("query", "homology", "subject"))
+ _set_if_given(attrs, "strand", d, valid = ("+1", "0", "-1", ""))
+ return Group("bioformat:hsp_strand", expr, d)
+
+def hsp_seqalign_query_seq(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_query_seq", expr, attrs)
+
+def hsp_seqalign_homology_seq(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_homology_seq", expr, attrs)
+
+def hsp_seqalign_subject_seq(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_subject_seq", expr, attrs)
+
+def hsp_seqalign_query_leader(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_query_leader", expr, attrs)
+
+
+def hsp_seqalign_query_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_query_name", expr, attrs)
+
+def hsp_seqalign_subject_name(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_subject_name", expr, attrs)
+
+def hsp_seqalign(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign", expr, attrs)
+
+def hsp_seqalign_query_start(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_query_start", expr, attrs)
+
+def hsp_seqalign_query_end(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_query_end", expr, attrs)
+
+def hsp_seqalign_subject_start(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_subject_start", expr, attrs)
+
+def hsp_seqalign_subject_end(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ())
+ return Group("bioformat:hsp_seqalign_subject_end", expr, attrs)
+
+def search_parameter(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("name", "bioformat:decode"))
+ d = {}
+ _set_if_given(attrs, "name", d)
+ _set_if_given(attrs, "bioformat:decode", d)
+ return Group("bioformat:search_parameter", expr, d)
+
+def search_statistic(expr, attrs = {}):
+ attrs = _check_attrs(attrs, ("name", "bioformat:decode"))
+ d = {}
+ _set_if_given(attrs, "name", d)
+ _set_if_given(attrs, "bioformat:decode", d)
+ return Group("bioformat:search_statistic", expr, d)
+
--- /dev/null
+# Standard Content and Dispatch handlers for the Bioformat IO system
+# This is a Python module.
+"""This module is DEPRECATED.
+
+Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules
+and associated ones like Bio.StdHandler are now deprecated. They are no longer
+used in any of the current Biopython parsers, and are likely to be removed
+in a future release.
+"""
+
+import warnings
+warnings.warn("Martel and those parts of Biopython depending on it" \
+ +" directly (such as Bio.Mindy and Bio.StdHandler) are now" \
+ +" deprecated, and will be removed in a future release of"\
+ +" Biopython. If you want to continue to use this code,"\
+ +" please get in contact with the Biopython developers via"\
+ +" the mailing lists to avoid its permanent removal from"\
+ +" Biopython.", \
+ DeprecationWarning)
+
+from xml.sax import handler
+from Martel import Parser, Dispatch
+from Bio import Std, Decode
+
+###################################
+
+# Helper functions to make functions
+
+def add_int_handler(klass, tag, attrname):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ self.save_characters()
+ def end(self, tag):
+ self.%s = int(self.get_characters())
+""" % attrname
+ d = {}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
+def add_text_handler(klass, tag, attrname):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ self.save_characters()
+ def end(self, tag):
+ self.%s = self.get_characters()
+""" % attrname
+ d = {}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
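+# For illustration (editorial note, not in the original source): a call
+# such as add_text_handler(SomeHandler, "dbid", "dbid_text"), where
+# SomeHandler and both names are hypothetical, is equivalent to defining
+# these two methods on SomeHandler by hand:
+#
+#     def start_dbid(self, tag, attrs):
+#         self.save_characters()
+#     def end_dbid(self, tag):
+#         self.dbid_text = self.get_characters()
+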
+def add_text_dict_handler(klass, tag, attrname, key):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ self.save_characters()
+ def end(self, tag):
+ self.%s["%s"] = self.get_characters()
+""" % (attrname, key)
+ d = {}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
+def add_text_decode_handler(klass, tag, attrname):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ self.save_characters()
+ self._decode_%s = attrs.get("bioformat:decode", None)
+ def end(self, tag):
+ s = self.get_characters()
+ if self._decode_%s is not None:
+ s = Decode.make_decoder(self._decode_%s)(s)
+ self.%s = s
+""" % (tag, tag, tag, attrname)
+ d = {"Decode": Decode}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
+def add_first_text_handler(klass, tag, attrname):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ if self.%s is None:
+ self.save_characters()
+ def end(self, tag):
+ if self.%s is None:
+ self.%s = self.get_characters()
+""" % (attrname, attrname, attrname)
+ d = {}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
+def add_text_block_handler(klass, tag, joinattr, defaultjoin, attrname):
+ assert not hasattr(klass, "start_" + tag), "existing method exists"
+ assert not hasattr(klass, "end_" + tag), "existing method exists"
+ assert not hasattr(klass, "start_"+tag+"_block"), "existing method exists"
+ assert not hasattr(klass, "end_" +tag+"_block"), "existing method exists"
+ s = """if 1:
+ def start_block(self, tag, attrs):
+ self._%(tag)s_join_func = Decode.make_decoder(attrs.get(%(joinattr)r, %(defaultjoin)r))
+ self._%(tag)s_lines = []
+ def end_block(self, tag):
+ self.%(attrname)s = self._%(tag)s_join_func(self._%(tag)s_lines)
+ def start(self, tag, attrs):
+ self.save_characters()
+ def end(self, tag):
+ self._%(tag)s_lines.append(self.get_characters())
+""" % locals()
+ d = {"Decode": Decode}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+ setattr(klass, "start_" + tag + "_block", d["start_block"])
+ setattr(klass, "end_" + tag + "_block", d["end_block"])
+
+def add_value_handler(klass, tag, attrname):
+ assert not hasattr(klass, "start_" +tag), "existing method exists"
+ assert not hasattr(klass, "end_" +tag), "existing method exists"
+ s = """if 1:
+ def start(self, tag, attrs):
+ self._%(tag)s_name = attrs["name"]
+ self._%(tag)s_decode = attrs.get("bioformat:decode", None)
+ self.save_characters()
+ def end(self, tag):
+ s = self.get_characters()
+ if self._%(tag)s_decode is not None:
+ s = Decode.make_decoder(self._%(tag)s_decode)(s)
+ self.%(attrname)s[self._%(tag)s_name] = s
+""" % locals()
+ d = {"Decode": Decode}
+ exec s in d
+ setattr(klass, "start_" + tag, d["start"])
+ setattr(klass, "end_" + tag, d["end"])
+
+
+#################################
+
+class ConvertHandler(handler.ContentHandler):
+ """Used to read records and produce output"""
+ def __init__(self, record_builder, writer, record_tag = "record"):
+ handler.ContentHandler.__init__(self)
+ self.record_builder = record_builder
+ self.writer = writer
+ self.record_tag = record_tag
+
+ def startDocument(self):
+ self.inside_record = 0
+ self.characters = self.ignore_characters
+
+ def startElement(self, tag, attrs):
+ if self.inside_record:
+ self.record_builder.startElement(tag, attrs)
+ elif tag == self.record_tag:
+ self.record_builder.startDocument()
+ self.inside_record = 1
+ self.characters = self.record_builder.characters
+ self.record_builder.startElement(tag, attrs)
+
+ def endElement(self, tag):
+ if self.inside_record:
+ self.record_builder.endElement(tag)
+ if tag == self.record_tag:
+ self.record_builder.endDocument()
+ self.writer.write(self.record_builder.document)
+ self.inside_record = 0
+ self.characters = self.ignore_characters
+
+ def ignore_characters(self, s):
+ pass
+
+class ConvertDispatchHandler(Dispatch.Dispatcher):
+ """Used to read records and produce output through a Dispatcher"""
+ def __init__(self, record_builder, writer, record_tag = "record"):
+ setattr(self, "end_" + record_tag, self.write_record)
+ Dispatch.Dispatcher.__init__(self,
+ remap = {record_tag: "bioformat:"}
+ )
+ self.acquire(record_builder)
+ self.record_builder = record_builder
+ self.writer = writer
+ self.record_tag = record_tag
+ def write_record(self, tag):
+ self.writer.write(self.record_builder.document)
+
+
+
+class RecognizeHandler(handler.ContentHandler, handler.ErrorHandler):
+ def __init__(self):
+ self.recognized = 1
+ self.exc = None
+
+ def fatalError(self, exc):
+ if isinstance(exc, Parser.ParserIncompleteException):
+ pass
+ else:
+ self.recognized = 0
+ self.exc = exc
+ raise exc
+
+ error = fatalError
+
+ def endElement(self, tag):
+ if tag == "record":
+ raise Parser.ParserException("we finished a record!")
+
+
+
+class Handle_dbid(Dispatch.Callback):
+ def start_dbid(self, tag, attrs):
+ self.attrs = attrs
+ self.save_characters()
+
+ def end_dbid(self, tag):
+ text = self.get_characters()
+ self.callback(text, self.attrs)
+
+
+class Handle_description(Dispatch.Callback):
+ def start_description_block(self, tag, attrs):
+ j = attrs.get("join", None)
+ if j is None:
+ self.join_fctn = Decode.join_fixspaces
+ else:
+ self.join_fctn = Decode.make_typechecked_decoder(j, list, str)
+ self.descriptions = []
+ def start_description(self, tag, attrs):
+ self.save_characters()
+ def end_description(self, tag):
+ x = self.get_characters()
+ self.descriptions.append(x)
+ def end_description_block(self, tag):
+ self.callback(self.join_fctn(self.descriptions))
+
+#### There can be multiple dbxref_dbids in a dbxref
+# DR EMBL; X64411; CAA45756.1; -.
+# <dbxref><..dbname style="swiss">EMBL</..dbname>
+# <dbid type="primary">X64411</dbid>
+# <dbid type="accession">CAA45756.1</dbid>
+# </dbxref>
+###
+# DR P35156, YPUI_BACSU, F;
+# <dbxref><dbid type="primary" dbname="sprot">P35156</dbid>
+# <dbid type="accession" dbname="sprot">YPUI_BACSU</dbid>
+# <negate/>
+# </dbxref>
+
+def _fixup_sp_pattern(exp):
+ import re
+ import Martel
+ exp = Martel.select_names(exp, (Std.dbxref_dbname.tag,Std.dbxref_dbid.tag))
+
+ e = exp._find_groups(Std.dbxref_dbname.tag)
+ assert len(e) == 1
+ e = e[0]
+ e.name = "dbname"
+ dbstyle = e.attrs["style"]
+ e.attrs = {}
+ e = exp._find_groups(Std.dbxref_dbid.tag)
+ assert len(e) == 2
+ e[0].name = "primary_dbid"
+ primary_type = e[0].attrs["type"]
+ e[0].attrs = {}
+ e[1].name = "secondary_dbid"
+ secondary_type = e[1].attrs["type"]
+ e[1].attrs = {}
+ pattern = str(exp) + "$"
+ pat = re.compile(pattern)
+ return pat, dbstyle, primary_type, secondary_type
+
+# Turns out these 'fast' versions speed up the dbxref code by about
+# a factor of 2.
+
+# DR PIR; S08427; S08427.
+_fast_dbxref_sp_general_data = None
+def _fast_dbxref_sp_general(s):
+ global _fast_dbxref_sp_general_data
+ if _fast_dbxref_sp_general_data is None:
+ from Bio.expressions.swissprot import sprot38
+ _fast_dbxref_sp_general_data = _fixup_sp_pattern(
+ sprot38.real_DR_general)
+
+ pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_general_data
+
+ m = pat.match(s)
+ assert m is not None, "Ill-formatted sp-general dbxref: %r" % s
+ return (
+ (dbstyle, m.group("dbname"), primary_type,
+ m.group("primary_dbid"), 0),
+ (dbstyle, m.group("dbname"), secondary_type,
+ m.group("secondary_dbid"), 0)
+ )
+
+# DR PFAM; PF01018; GTP1_OBG; 1.
+# DR PROSITE; PS00905; GTP1_OBG; 1.
+
+_fast_dbxref_sp_prosite_data = None
+def _fast_dbxref_sp_prosite(s):
+ global _fast_dbxref_sp_prosite_data
+
+ if _fast_dbxref_sp_prosite_data is None:
+ from Bio.expressions.swissprot import sprot38
+ _fast_dbxref_sp_prosite_data = _fixup_sp_pattern(
+ sprot38.real_DR_prosite)
+
+ pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_prosite_data
+ m = pat.match(s)
+ assert m is not None, "Ill-formatted sp-prosite dbxref: %r" % s
+ return (
+ (dbstyle, m.group("dbname"), primary_type,
+ m.group("primary_dbid"), 0),
+ (dbstyle, m.group("dbname"), secondary_type,
+ m.group("secondary_dbid"), 0)
+ )
+
+
+# DR EMBL; M36407; AAA33110.1; -.
+_fast_dbxref_sp_embl_data = None
+def _fast_dbxref_sp_embl(s):
+ global _fast_dbxref_sp_embl_data
+
+ if _fast_dbxref_sp_embl_data is None:
+ from Bio.expressions.swissprot import sprot38
+ _fast_dbxref_sp_embl_data = _fixup_sp_pattern(
+ sprot38.real_DR_embl)
+
+ pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_embl_data
+ m = pat.match(s)
+ assert m is not None, "Ill-formatted sp-embl dbxref: %r" % s
+ return (
+ (dbstyle, m.group("dbname"), primary_type,
+ m.group("primary_dbid"), 0),
+ (dbstyle, m.group("dbname"), secondary_type,
+ m.group("secondary_dbid"), 0)
+ )
+
+_fast_dbxref_parser_table = {
+ "sp-general": _fast_dbxref_sp_general,
+ "sp-prosite": _fast_dbxref_sp_prosite,
+ "sp-embl": _fast_dbxref_sp_embl,
+}
+
+class Handle_dbxref(Dispatch.Callback):
+ def __init__(self, callback):
+ Dispatch.Callback.__init__(self, callback)
+ self.supported_features.append("fast-sp-dbxref")
+ self.slow_callback = self.callback
+ def start_dbxref(self, tag, attrs):
+ self.negate = 0
+ self.dbname = None
+ self.dbids = []
+ self.info = []
+
+ def start_dbxref_dbname(self, tag, attrs):
+ assert self.dbname is None, "cannot set the dbname twice"
+ self.dbname_style = attrs.get("style", "unknown")
+ self.save_characters()
+ def end_dbxref_dbname(self, tag):
+ self.dbname = self.get_characters()
+
+ def start_dbxref_dbid(self, tag, attrs):
+ d = attrs.get("dbname", None)
+ if d is None:
+ assert self.dbname is not None, "must set the dbname"
+ self.info.append( (self.dbname_style, self.dbname,
+ attrs.get("type", "primary")) )
+ else:
+ self.info.append( ("bioformat", d,
+ attrs.get("type", "primary")) )
+ self.save_characters()
+
+ def end_dbxref_dbid(self, tag):
+ self.dbids.append( self.get_characters())
+
+ def start_dbxref_negate(self, tag, attrs):
+ self.negate = 1
+
+ def end_dbxref(self, tag):
+ cb = self.slow_callback
+ if cb is None:
+ return
+ negate = self.negate
+ for ( (dbname_style, dbname, idtype), dbid) in zip(self.info,
+ self.dbids):
+ self.slow_callback(dbname_style, dbname, idtype, dbid, negate)
+
+ def start_fast_dbxref(self, tag, attrs):
+ style = attrs["style"]
+ self._fast_parser = _fast_dbxref_parser_table[style]
+ self.save_characters()
+ self.slow_callback = None
+ def end_fast_dbxref(self, tag):
+ for info in self._fast_parser(self.get_characters()):
+ self.callback(*info)
+ self.slow_callback = self.callback
+
+##################
+class Handle_sequence(Dispatch.Callback):
+ global_alphabet = None
+ def start_(self, tag, attrs):
+ self.global_alphabet = None
+
+ def start_sequence_block(self, tag, attrs):
+ self.local_alphabet = attrs.get("alphabet", None)
+ self.gapchar = attrs.get("gapchar", None)
+ self.stopchar = attrs.get("stopchar", None)
+ j = attrs.get("join", None)
+ if j is not None:
+ self.join_func = Decode.make_typechecked_decoder(j, list, str)
+ else:
+ self.join_func = None
+ self.sequences = []
+
+ def end_sequence_block(self, tag):
+ f = self.join_func
+ if f is not None:
+ seq = f(self.sequences)
+ else:
+ seq = "".join(self.sequences).replace(" ", "")
+ alphabet = self.local_alphabet or self.global_alphabet or "unknown"
+ self.callback( (alphabet, seq, self.gapchar, self.stopchar) )
+
+ def start_alphabet(self, tag, attrs):
+ self.global_alphabet = attrs["alphabet"]
+
+ def start_sequence(self, tag, attrs):
+ self.save_characters()
+ def end_sequence(self, tag):
+ self.sequences.append(self.get_characters())
+
+class Feature:
+ def __init__(self, name, description, location, qualifiers):
+ self.name = name
+ self.description = description
+ self.location = location
+ self.qualifiers = qualifiers
+ def __str__(self):
+ return "Feature %r %r %s num_qualifiers = %d" % \
+ (self.name, self.description, self.location,
+ len(self.qualifiers))
+
+
+class Handle_feature_location(Dispatch.Callback):
+ def __init__(self, callback, settings = {}):
+ Dispatch.Callback.__init__(self, callback)
+ self.settings = settings
+
+ def start_feature(self, tag, attrs):
+ self.location_style = attrs.get("location-style",
+ self.settings["location-style"])
+ j = attrs.get("join-feature", None)
+ if j is None:
+ self.text_join_func = "".join
+ else:
+ self.text_join_func = Decode.make_typechecked_decoder(j, list, str)
+
+ self.location_start = None
+ self.location_end = None
+ self.text_lines = []
+
+ def end_feature(self, tag):
+ if self.location_start or self.location_end:
+ if self.text_lines:
+ raise TypeError("Cannot have both location text and start/end")
+ self.callback(self.location_style,
+ (self.location_start, self.location_end))
+ else:
+ self.callback(self.location_style,
+ (self.text_join_func(self.text_lines), None))
+
+ def start_feature_location(self, tag, attrs):
+ self.save_characters()
+ def end_feature_location(self, tag):
+ self.text_lines.append(self.get_characters())
+
+add_text_handler(Handle_feature_location, "feature_location_start",
+ "location_start")
+add_text_handler(Handle_feature_location, "feature_location_end",
+ "location_end")
+
+##################################
+
+class Handle_feature_qualifier(Dispatch.Callback):
+ def __init__(self, callback, settings):
+ self.settings = settings
+ Dispatch.Callback.__init__(self, callback)
+
+ def start_feature_qualifier(self, tag, attrs):
+ self.name = None
+ self.description = []
+ qj = attrs.get("join-qualifier", None)
+ if qj is None:
+ self.join = self.settings["qualifier_join_func"]
+ else:
+ self.join = Decode.make_typechecked_decoder(qj, list, str)
+
+ def end_feature_qualifier(self, tag):
+ self.callback(self.name, self.join(self.description))
+
+ def start_feature_qualifier_description(self, tag, attrs):
+ self.save_characters()
+ def end_feature_qualifier_description(self, tag):
+ self.description.append(self.get_characters())
+
+add_text_handler(Handle_feature_qualifier, "feature_qualifier_name", "name")
+
+####################
+
+class Handle_features(Dispatch.Callback):
+ def __init__(self, callback):
+ Dispatch.Callback.__init__(self, callback)
+ self.settings = {}
+
+ self.acquire(Handle_feature_location(self.add_location, self.settings))
+
+ self.acquire(Handle_feature_qualifier(self.add_feature_qualifier,
+ self.settings))
+
+ def start_feature_block(self, tag, attrs):
+ jf = attrs.get("join-description", None)
+ if jf is None:
+ self.join_feature_description = Decode.join_fixspaces
+ else:
+ self.join_feature_description = Decode.make_typechecked_decoder(
+ jf, list, str)
+
+ self.settings["location-style"] = attrs.get("location-style", None)
+
+ jq = attrs.get("join-qualifier", None)
+ if jq is None:
+ self.settings["qualifier_join_func"] = Decode.join_fixspaces
+ else:
+ self.settings["qualifier_join_func"] = \
+ Decode.make_typechecked_decoder(jq, list, str)
+ self.features = []
+
+ def end_feature_block(self, tag):
+ self.callback(self.features)
+ self.features = None
+
+ def start_feature(self, tag, attrs):
+ self.name = None
+ self.description = []
+ self.location = None
+ self.qualifiers = []
+
+ def start_feature_description(self, tag, attrs):
+ self.save_characters()
+ def end_feature_description(self, tag):
+ self.description.append(self.get_characters())
+
+ def end_feature(self, tag):
+ self.features.append(Feature(
+ self.name,
+ self.join_feature_description(self.description),
+ self.location,
+ self.qualifiers))
+
+ def add_feature_qualifier(self, name, description):
+ self.qualifiers.append((name, description))
+
+ def add_location(self, style, location_info):
+ self.location = (style, location_info)
+
+add_text_handler(Handle_features, "feature_name", "name")
+
+
+############## Search handlers
+
+class Handle_hsp_seqalign(Dispatch.Callback):
+ def start_hsp(self, tag, attrs):
+ self.query_name = None # "Query"
+ self.subject_name = None # "Sbjct"
+
+ self.query_seq = "" # the actual text of the sequence
+ self.homology_seq = ""
+ self.subject_seq = ""
+
+ self.query_start_loc = None
+ self.query_end_loc = None
+
+ self.subject_start_loc = None
+ self.subject_end_loc = None
+
+ def end_hsp(self, tag):
+ self.callback(self)
+
+ def start_hsp_seqalign(self, tag, attrs):
+ self.sub_leader = None
+
+ def start_hsp_seqalign_query_seq(self, tag, attrs):
+ self.save_characters()
+ def end_hsp_seqalign_query_seq(self, tag):
+ s = self.get_characters()
+ self.query_seq += s
+ self.sub_query_seq_len = len(s)
+
+ def start_hsp_seqalign_homology_seq(self, tag, attrs):
+ self.save_characters()
+ def end_hsp_seqalign_homology_seq(self, tag):
+ query_leader = self.leader_size
+ query_seq_len = self.sub_query_seq_len
+ line = self.get_characters()
+ s = line[query_leader:query_leader+query_seq_len]
+ assert len(s) == query_seq_len, (len(s), query_seq_len, line)
+ self.homology_seq += s
+
+ def start_hsp_seqalign_subject_seq(self, tag, attrs):
+ self.save_characters()
+ def end_hsp_seqalign_subject_seq(self, tag):
+ self.subject_seq += self.get_characters()
+
+ def start_hsp_seqalign_query_leader(self, tag, attrs):
+ self.save_characters()
+ def end_hsp_seqalign_query_leader(self, tag):
+ self.leader_size = len(self.get_characters())
+
+add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_name",
+ "query_name")
+
+add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_name",
+ "subject_name")
+
+add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_start",
+ "query_start_loc")
+add_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_end",
+ "query_end_loc")
+
+add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_start",
+ "subject_start_loc")
+add_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_end",
+ "subject_end_loc")
+
+
+
+
+#############################
+
+class Handle_hsp(Dispatch.Callback):
+ def __init__(self, callback):
+ Dispatch.Callback.__init__(self, callback)
+ self.acquire(Handle_hsp_seqalign(self.add_hsp_seqs))
+
+ def start_hsp(self, tag, attrs):
+ self.hsp_values = {} # expect, p, identities, ...
+ self.strands = {}
+ self.frames = {}
+
+ def end_hsp(self, tag):
+ self.callback(self.hsp_values,
+ self.hsp_info,
+ self.strands, self.frames,
+ )
+
+ def start_hsp_strand(self, tag, attrs):
+ self.strands[attrs["which"]] = attrs["strand"]
+
+ def start_hsp_frame(self, tag, attrs):
+ self.getting_frame = attrs["which"]
+ self.save_characters()
+
+ def end_hsp_frame(self, tag):
+ self.frames[self.getting_frame] = self.get_characters()
+ self.getting_frame = None
+
+ def add_hsp_seqs(self, hsp_info):
+ self.hsp_info = hsp_info
+
+ def start_hsp_value(self, tag, attrs):
+ self.value_convert = attrs.get("bioformat:decode", None)
+ self.value_name = attrs["name"]
+ self.save_characters()
+
+ def end_hsp_value(self, tag):
+ s = self.get_characters()
+ if self.value_convert is not None:
+ if self.value_convert == "float":
+ s = float(s)
+ else:
+ s = Decode.make_decoder(self.value_convert)(s)
+ self.hsp_values[self.value_name] = s
+
+#############################
+
+
+class Handle_search_table(Dispatch.Callback):
+ def start_search_table_value(self, tag, attrs):
+ self.value_name = attrs["name"]
+ self.value_decode = attrs.get("bioformat:decode", None)
+ self.save_characters()
+ def end_search_table_value(self, tag):
+ s = self.get_characters()
+ if self.value_decode is not None:
+ x = self.value_decode
+ if x == "int":
+ s = int(s)
+ elif x == "float":
+ s = float(s)
+ else:
+ s = Decode.make_decoder(x)(s)
+ self.values[self.value_name] = s
+
+ def start_search_table(self, tag, attrs):
+ self.data = []
+ def end_search_table(self, tag):
+ self.callback(self.data)
+ self.data = None
+
+ def start_search_table_entry(self, tag, attrs):
+ self.description = None
+ self.values = {}
+
+ def end_search_table_entry(self, tag):
+ self.data.append( (self.description, self.values) )
+ self.description = self.values = None
+
+add_text_handler(Handle_search_table, "search_table_description",
+ "description")
+
+#############################
+
+class Handle_search_header(Dispatch.Callback):
+ def start_(self, tag, attrs):
+ self.dict = {}
+ self.query_description = None
+
+ def end_search_header(self, tag):
+ d = self.dict
+ d["query_description"] = self.query_description
+ self.callback(d)
+
+add_text_block_handler(Handle_search_header, "query_description",
+ "join-query", "join|fixspaces", "query_description")
+
+add_text_dict_handler(Handle_search_header, "application_name",
+ "dict", "appname")
+add_text_dict_handler(Handle_search_header, "application_version",
+ "dict", "appversion")
+add_text_dict_handler(Handle_search_header, "database_name",
+ "dict", "dbname")
+add_text_dict_handler(Handle_search_header, "database_num_sequences",
+ "dict", "db_num_sequences")
+add_text_dict_handler(Handle_search_header, "database_num_letters",
+ "dict", "db_num_letters")
+add_text_dict_handler(Handle_search_header, "query_size",
+ "dict", "query_size")
+
+
+#############################
+
+class Handle_search_info(Dispatch.Callback):
+ def start_(self, tag, attrs):
+ self.parameters = {}
+ self.statistics = {}
+
+ def end_(self, tag):
+ self.callback(self.parameters, self.statistics)
+
+add_value_handler(Handle_search_info, "search_parameter", "parameters")
+add_value_handler(Handle_search_info, "search_statistic", "statistics")
--- /dev/null
+"""Code to transcribe DNA into RNA or back (OBSOLETE).
+
+You are now encouraged to use the Seq object methods or the functions
+in Bio.Seq instead.
+
+This module is now considered to be obsolete, and is likely to be deprecated
+in a future release of Biopython, and later removed.
+"""
+
+from Bio import Alphabet, Seq
+from Bio.Alphabet import IUPAC
+
+class Transcribe:
+ def __init__(self, dna_alphabet, rna_alphabet):
+ self.dna_alphabet = dna_alphabet
+ self.rna_alphabet = rna_alphabet
+
+ def transcribe(self, dna):
+ assert dna.alphabet == self.dna_alphabet, \
+ "transcribe has the wrong DNA alphabet"
+ s = dna.data
+ return Seq.Seq(s.replace("T", "U"), self.rna_alphabet)
+ def back_transcribe(self, rna):
+ assert rna.alphabet == self.rna_alphabet, \
+ "back transcribe has the wrong RNA alphabet"
+ s = rna.data
+ return Seq.Seq(s.replace("U", "T"), self.dna_alphabet)
+
+generic_transcriber = Transcribe(Alphabet.generic_dna,
+ Alphabet.generic_rna)
+ambiguous_transcriber = Transcribe(IUPAC.ambiguous_dna,
+ IUPAC.ambiguous_rna)
+unambiguous_transcriber = Transcribe(IUPAC.unambiguous_dna,
+ IUPAC.unambiguous_rna)
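+
+# A minimal usage sketch (editorial illustration, not part of the
+# original module; assumes this file is importable as Bio.Transcribe):
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     from Bio.Transcribe import unambiguous_transcriber
+#
+#     dna = Seq("ATGGCCATT", IUPAC.unambiguous_dna)
+#     rna = unambiguous_transcriber.transcribe(dna)    # AUGGCCAUU
+#     dna2 = unambiguous_transcriber.back_transcribe(rna)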
--- /dev/null
+"""Code to translate DNA or RNA into proteins (OBSOLETE).
+
+Instead of Bio.Translate, for translation you are now encouraged to use the
+Seq object's translate method, or the translate function in the Bio.Seq
+module. Translate-to-stop functionality is via an optional argument.
+
+Bio.Seq does not offer any back-translation function like the one here. It
+was concluded that since a simple back-translation giving a Seq or python
+string could only capture some of the possible back translations, there were
+no practical uses for such a method/function.
+
+This module is now considered to be obsolete, and is likely to be deprecated
+in a future release of Biopython, and later removed.
+"""
+from Bio import Alphabet, Seq
+from Bio.Data import CodonTable
+
+class Translator:
+ def __init__(self, table):
+ self.table = table
+ self._encoded = {}
+
+ def __str__(self) :
+ return "Translator object\n" + str(self.table)
+
+ def translate(self, seq, stop_symbol = "*"):
+ #Allow different instances of the same class to be used:
+ assert seq.alphabet.__class__ == \
+ self.table.nucleotide_alphabet.__class__, \
+ "cannot translate from given alphabet (have %s, need %s)" %\
+ (seq.alphabet, self.table.nucleotide_alphabet)
+ s = seq.data
+ letters = []
+ append = letters.append
+ table = self.table
+ get = table.forward_table.get
+ n = len(seq)
+ for i in range(0, n-n%3, 3):
+ append(get(s[i:i+3], stop_symbol))
+
+ # return with the correct alphabet encoding (cache the encoding)
+ try:
+ alphabet = self._encoded[stop_symbol]
+ except KeyError:
+ alphabet = Alphabet.HasStopCodon(table.protein_alphabet,
+ stop_symbol)
+ self._encoded[stop_symbol] = alphabet
+
+ return Seq.Seq("".join(letters), alphabet)
+
+ def translate_to_stop(self, seq):
+ # This doesn't have a stop encoding
+
+ #Allow different instances of the same class to be used:
+ assert seq.alphabet.__class__ == \
+ self.table.nucleotide_alphabet.__class__, \
+ "cannot translate from given alphabet (have %s, need %s)" %\
+ (seq.alphabet, self.table.nucleotide_alphabet)
+ s = seq.data
+ letters = []
+ append = letters.append
+ table = self.table.forward_table
+ n = len(seq)
+ try:
+ for i in range(0, n-n%3, 3):
+ append(table[s[i:i+3]])
+ except KeyError:
+ # Stop at the first codon failure
+ pass
+ return Seq.Seq("".join(letters), self.table.protein_alphabet)
+
+ def back_translate(self, seq):
+ # includes the stop codon
+ if not isinstance(seq.alphabet, Alphabet.HasStopCodon):
+ return self._back_translate_no_stop(seq)
+ assert seq.alphabet.alphabet == self.table.protein_alphabet, \
+ "cannot back translate from the given alphabet (%s)" % \
+ seq.alphabet.alphabet
+ letter = seq.alphabet.stop_symbol
+ letters = []
+ append = letters.append
+ table = self.table.back_table
+ for c in seq.data:
+ if c == letter:
+ append(table[None])
+ else:
+ append(table[c])
+ return Seq.Seq("".join(letters),
+ self.table.nucleotide_alphabet)
+
+ def _back_translate_no_stop(self, seq):
+ # does not allow a stop codon
+ assert seq.alphabet == self.table.protein_alphabet, \
+ "cannot back translate from the given alphabet (%s)" % \
+ seq.alphabet
+ letters = []
+ append = letters.append
+ table = self.table.back_table
+ for c in seq.data:
+ append(table[c])
+ return Seq.Seq("".join(letters),
+ self.table.nucleotide_alphabet)
+
+unambiguous_dna_by_name = {}
+for key, value in CodonTable.unambiguous_dna_by_name.items():
+ unambiguous_dna_by_name[key] = Translator(value)
+unambiguous_dna_by_id = {}
+for key, value in CodonTable.unambiguous_dna_by_id.items():
+ unambiguous_dna_by_id[key] = Translator(value)
+
+unambiguous_rna_by_name = {}
+for key, value in CodonTable.unambiguous_rna_by_name.items():
+ unambiguous_rna_by_name[key] = Translator(value)
+unambiguous_rna_by_id = {}
+for key, value in CodonTable.unambiguous_rna_by_id.items():
+ unambiguous_rna_by_id[key] = Translator(value)
+
+# XXX Ambiguous - can be done the same except for stop codons!
+ambiguous_dna_by_name = {}
+for key, value in CodonTable.ambiguous_dna_by_name.items():
+ ambiguous_dna_by_name[key] = Translator(value)
+ambiguous_dna_by_id = {}
+for key, value in CodonTable.ambiguous_dna_by_id.items():
+ ambiguous_dna_by_id[key] = Translator(value)
+
+ambiguous_rna_by_name = {}
+for key, value in CodonTable.ambiguous_rna_by_name.items():
+ ambiguous_rna_by_name[key] = Translator(value)
+ambiguous_rna_by_id = {}
+for key, value in CodonTable.ambiguous_rna_by_id.items():
+ ambiguous_rna_by_id[key] = Translator(value)
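+
+# A minimal usage sketch (editorial illustration, not part of the
+# original module; assumes this file is importable as Bio.Translate).
+# Table 1 is the standard genetic code:
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     from Bio.Translate import unambiguous_dna_by_id
+#
+#     standard_translator = unambiguous_dna_by_id[1]
+#     dna = Seq("ATGGCCATTTAA", IUPAC.unambiguous_dna)
+#     protein = standard_translator.translate(dna)          # "MAI*"
+#     peptide = standard_translator.translate_to_stop(dna)  # "MAI"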
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+
+import warnings
+warnings.warn("Bio.Writer and Bio.writer.* are deprecated. If you do use"\
+ +" these modules, please get in touch via the mailing list or"\
+ +" bugzilla to avoid their permanent removal from Biopython.", \
+ DeprecationWarning)
+
+class Writer:
+ def __init__(self, outfile):
+ self.outfile = outfile
+ def writeHeader(self):
+ pass
+ def write(self, record):
+ pass
+ def writeFooter(self):
+ pass
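+
+# For illustration (editorial note, not in the original source): a
+# concrete writer would subclass Writer and override the hooks, e.g.
+# assuming records with 'id' and 'seq' attributes as SeqRecord provides:
+#
+#     class FastaTitleWriter(Writer):
+#         def write(self, record):
+#             self.outfile.write(">%s\n%s\n" % (record.id, record.seq))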
--- /dev/null
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Collection of modules for dealing with biological data in Python.
+
+The Biopython Project is an international association of developers
+of freely available Python tools for computational molecular biology.
+
+http://biopython.org
+"""
+
+__version__ = "1.50"
+
+class MissingExternalDependencyError(Exception):
+ pass
--- /dev/null
+/* cMarkovModelmodule.c
+ * jchang
+ * Created: 1/13/01
+ * Last modified: 11/26/02
+ *
+ * This optimizes some of the functions in MarkovModel.py.
+ */
+
+#include "Python.h"
+#include "csupport.h"
+
+
+/* Functions in this module. */
+
+static char cMarkovModel__logadd__doc__[] =
+"_logadd(logx, logy) -> log(x+y)\n";
+
+static PyObject *cMarkovModel__logadd(PyObject *self, PyObject *args)
+{
+ PyObject *py_logx, *py_logy;
+ double logx, logy, minxy;
+ double sum;
+
+ if(!PyArg_ParseTuple(args, "OO", &py_logx, &py_logy))
+ return NULL;
+ logx = PyNumber_AsDouble(py_logx);
+ logy = PyNumber_AsDouble(py_logy);
+ if(PyErr_Occurred())
+ return NULL;
+
+ if(logy-logx > 100.0) {
+ Py_INCREF(py_logy);
+ return py_logy;
+ } else if (logx-logy > 100.0) {
+ Py_INCREF(py_logx);
+ return py_logx;
+ }
+ minxy = (logx < logy) ? logx : logy;
+ sum = minxy + log(exp(logx-minxy) + exp(logy-minxy));
+ return PyFloat_FromDouble(sum);
+}
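+
+/* Editorial note (not from the original source): factoring out
+ * min(logx, logy) keeps both exp() arguments at most 100 (given the
+ * shortcut above for |logx - logy| > 100), so neither exp() can
+ * overflow a double. A rough Python equivalent of the formula:
+ *
+ *     import math
+ *     def logadd(logx, logy):
+ *         m = min(logx, logy)
+ *         return m + math.log(math.exp(logx - m) + math.exp(logy - m))
+ */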
+
+
+/* Module definition stuff */
+
+static PyMethodDef CMarkovModelMethods[] = {
+ {"_logadd", cMarkovModel__logadd, METH_VARARGS, cMarkovModel__logadd__doc__},
+ {NULL, NULL}
+};
+
+static char cMarkovModel__doc__[] =
+"This module provides optimized replacement functions for MarkovModel.\n\
+";
+
+void initcMarkovModel(void)
+{
+ Py_InitModule3("cMarkovModel", CMarkovModelMethods, cMarkovModel__doc__);
+}
+
+
+
--- /dev/null
+/* Copyright 2000 by Jeffrey Chang. All rights reserved.
+ * This code is part of the Biopython distribution and governed by its
+ * license. Please see the LICENSE file that should have been included
+ * as part of this package.
+ *
+ * clistfnsmodule.c
+ * Created 3 Jun 2000
+ */
+
+#include "Python.h"
+#include <math.h>
+
+
+
+
+/************************************** Exported Functions ***********/
+
+static char clistfns_count__doc__[] =
+"count(items) -> dict of counts of each item\n\
+\n\
+Count the number of times each item appears in a list of data.\n\
+\n\
+";
+
+static PyObject *clistfns_count(PyObject *self, PyObject *args)
+{
+ int i;
+ PyObject *items, *counts;
+ PyObject *item, *count, *newcount;
+ long int current;
+
+ if(!PyArg_ParseTuple(args, "O", &items))
+ return NULL;
+ if(!PySequence_Check(items)) {
+ PyErr_SetString(PyExc_TypeError, "expected sequence type");
+ return NULL;
+ }
+
+ if(!(counts = PyDict_New()))
+ return NULL;
+
+ /* Go through the loop, counting how often each item appears. */
+ i = 0;
+ while(1) {
+ if(!(item = PySequence_GetItem(items, i))) {
+ PyErr_Clear(); /* clear the exception set by PySequence_GetItem */
+ break; /* no more items */
+ }
+
+ if(!(count = PyDict_GetItem(counts, item))) {
+ newcount = PyInt_FromLong(1); /* New item, set count to 1 */
+ }
+ else {
+ current = PyInt_AsLong(count);
+ newcount = PyInt_FromLong(current+1);
+ }
+
+ PyDict_SetItem(counts, item, newcount);
+ Py_DECREF(newcount);
+ Py_DECREF(item);
+ if(PyErr_Occurred())
+ return NULL;
+
+ i++;
+ }
+
+ return counts;
+}
+
+
+static char clistfns_contents__doc__[] =
+"contents(items) -> dict of item -> percentage\n\
+\n\
+Summarize the contents of the list in terms of the percentages of each\n\
+item. For example, if an item appears 3 times in a list with 10 items,\n\
+it makes up 0.3 of the list.\n\
+\n\
+";
+
+static PyObject *clistfns_contents(PyObject *self, PyObject *args)
+{
+ int i;
+ PyObject *items, *counts, *percentages;
+ PyObject *countitems, *countitem;
+ PyObject *key, *count, *perc;
+ long c;
+ double total;
+
+ if(!PyArg_ParseTuple(args, "O", &items))
+ return NULL;
+ if(!PySequence_Check(items)) {
+ PyErr_SetString(PyExc_TypeError, "expected sequence type");
+ return NULL;
+ }
+ if((total = PySequence_Length(items)) == -1) {
+ PyErr_SetString(PyExc_ValueError, "I couldn't get length of item.");
+ return NULL;
+ }
+
+ counts = clistfns_count(self, args);
+ if(!counts || PyErr_Occurred())
+ return NULL;
+
+ if(!(percentages = PyDict_New())) {
+ Py_DECREF(counts);
+ return NULL;
+ }
+
+ /* Loop through every element in counts, calculating the probabilities. */
+ if(!(countitems = PyMapping_Items(counts))) {
+ Py_DECREF(counts);
+ Py_DECREF(percentages);
+ return NULL;
+ }
+
+ /* Go through the loop, converting each count to a percentage. */
+ i = 0;
+ while(1) {
+ if(!(countitem = PyList_GetItem(countitems, i))) {
+ PyErr_Clear(); /* clear the exception set by PyList_GetItem */
+ break; /* no more items */
+ }
+ key = PyTuple_GetItem(countitem, 0);
+ count = PyTuple_GetItem(countitem, 1);
+ c = PyInt_AsLong(count);
+ perc = PyFloat_FromDouble((double)c / total);
+ PyDict_SetItem(percentages, key, perc);
+ Py_DECREF(perc);
+ if(PyErr_Occurred()) /* PyDict_SetItem failed */
+ break;
+ i++;
+ }
+ if(PyErr_Occurred()) {
+ Py_DECREF(percentages);
+ percentages = NULL;
+ }
+ Py_DECREF(countitems);
+ Py_DECREF(counts);
+
+ return percentages;
+}
+
+
+/************************************** Module definition stuff ******/
+
+static PyMethodDef clistfnsMethods[] = {
+ {"count", clistfns_count, METH_VARARGS, clistfns_count__doc__},
+ {"contents", clistfns_contents, METH_VARARGS, clistfns_contents__doc__},
+ {NULL, NULL}
+};
+
+static char clistfns__doc__[] =
+"This provides helper functions for the listfns module.\n\
+You should never import this module on its own.\n\
+\n\
+";
+
+void initclistfns(void)
+{
+ (void) Py_InitModule3("clistfns", clistfnsMethods, clistfns__doc__);
+}
--- /dev/null
+/* Copyright 2000 by Jeffrey Chang. All rights reserved.
+ * This code is part of the Biopython distribution and governed by its
+ * license. Please see the LICENSE file that should have been included
+ * as part of this package.
+ *
+ * cmathfnsmodule.c
+ * Created 3 Jun 2000
+ */
+
+#include "Python.h"
+#include <math.h>
+
+#include "csupport.h"
+
+
+
+/************************************** Exported Functions ***********/
+
+static char cmathfns_intd__doc__[] =
+"intd(x[, digits_after_decimal]) -> int x, rounded\n\
+\n\
+Represent a floating point number with some digits after the\n\
+decimal point as an integer. This is useful when floating point\n\
+comparisons are failing due to precision problems. e.g.\n\
+intd(5.35, 1) -> 54.\n\
+\n\
+";
+
+static PyObject *cmathfns_intd(
+ PyObject *self, PyObject *args, PyObject *keywds)
+{
+ PyObject *digits_after_decimal = Py_None;
+ double x, digits;
+ double precision;
+
+ static char *kwlist[] = {"x", "digits_after_decimal", NULL};
+ if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|O", kwlist,
+ &x, &digits_after_decimal))
+ return NULL;
+
+ if(digits_after_decimal == Py_None)
+ digits = 0;
+ else {
+ digits = PyNumber_AsDouble(digits_after_decimal);
+ if(PyErr_Occurred()) {
+ return NULL;
+ }
+ }
+ precision = pow(10, digits);
+ if(x >= 0)
+ x = (int)(x * precision + 0.5);
+ else
+ x = (int)(x * precision - 0.5);
+ return PyFloat_FromDouble(x);
+}
+
+
+
+
+static char cmathfns_fcmp__doc__[] =
+"fcmp(x, y, precision) -> -1, 0, or 1";
+
+static PyObject *cmathfns_fcmp(
+ PyObject *self, PyObject *args, PyObject *keywds)
+{
+ double x, y, precision;
+ int result;
+
+ static char *kwlist[] = {"x", "y", "precision", NULL};
+ if(!PyArg_ParseTupleAndKeywords(args, keywds, "ddd", kwlist,
+ &x, &y, &precision))
+ return NULL;
+
+ if(fabs(x-y) < precision)
+ result = 0;
+ else if(x < y)
+ result = -1;
+ else result = 1;
+ return PyInt_FromLong(result);
+}
+
+
+
+static char cmathfns_safe_log__doc__[] =
+"safe_log(n, zero=None, neg=None) -> log(n)\n\
+\n\
+Calculate the log of n. If n is 0, returns the value of zero. If n is\n\
+negative, returns the value of neg.\n\
+\n\
+";
+
+static PyObject *cmathfns_safe_log(
+ PyObject *self, PyObject *args, PyObject *keywds)
+{
+ PyObject *zero = Py_None,
+ *neg = Py_None;
+ double n;
+
+ static char *kwlist[] = {"n", "zero", "neg", NULL};
+
+ if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|OO", kwlist,
+ &n, &zero, &neg))
+ return NULL;
+
+ if(n < 0) {
+ Py_INCREF(neg);
+ return neg;
+ } else if(n < 1E-100) {
+ Py_INCREF(zero);
+ return zero;
+ }
+
+ return PyFloat_FromDouble(log(n));
+}
+
+
+
+
+/************************************** Module definition stuff ******/
+
+static PyMethodDef cmathfnsMethods[] = {
+ {"fcmp", (PyCFunction)cmathfns_fcmp, METH_VARARGS|METH_KEYWORDS,
+ cmathfns_fcmp__doc__},
+ {"intd", (PyCFunction)cmathfns_intd, METH_VARARGS|METH_KEYWORDS,
+ cmathfns_intd__doc__},
+ {"safe_log", (PyCFunction)cmathfns_safe_log, METH_VARARGS|METH_KEYWORDS,
+ cmathfns_safe_log__doc__},
+ {NULL, NULL}
+};
+
+static char cmathfns__doc__[] =
+"This provides helper functions for the mathfns module.\n\
+You should never import this module on its own.\n\
+\n\
+";
+
+void initcmathfns(void)
+{
+ (void) Py_InitModule3("cmathfns", cmathfnsMethods, cmathfns__doc__);
+}
--- /dev/null
+/* Copyright 2000 by Jeffrey Chang. All rights reserved.
+ * This code is part of the Biopython distribution and governed by its
+ * license. Please see the LICENSE file that should have been included
+ * as part of this package.
+ *
+ * cstringfnsmodule.c
+ * Created 7 Jun 2000
+ */
+
+#include "Python.h"
+#include <string.h> /* memset */
+
+
+/* Functions in this module. */
+
+static char cstringfns_splitany__doc__[] =
+"splitany(str [,sep [,maxsplit [,negate]]]) -> list of strings\n\
+\n\
+Split a string. Similar to string.split, except that this considers\n\
+any one of the characters in sep to be a delimiter. If negate is\n\
+true, then everything but sep will be a separator.\n\
+\n\
+";
+
+static PyObject *cstringfns_splitany(
+ PyObject *self, PyObject *args, PyObject *keywds)
+{
+ int i, prev;
+ int nsplit, maxsplit=0;
+ /*int negate=0;*/
+ PyObject *py_negate=NULL;
+ PyObject *strlist, *newstr;
+ unsigned char *str,
+ *sep=" \011\012\013\014\015"; /* whitespace */
+ char tosplit[256];
+ static char *kwlist[] = {"str", "sep", "maxsplit", "negate", NULL};
+
+ if(!PyArg_ParseTupleAndKeywords(args, keywds, "s|siO", kwlist,
+ &str, &sep, &maxsplit, &py_negate))
+ return NULL;
+ if(maxsplit < 0)
+ maxsplit = 1;
+ /* negate = (py_negate && PyObject_IsTrue(py_negate));*/
+ /* XXX NO MORE NEGATE */
+
+ /* Set the tosplit array to 1 for characters to split on. */
+ memset(tosplit, 0, 256);
+ while(*sep) {
+ tosplit[(unsigned char)*sep++] = 1;
+ }
+ if(py_negate && PyObject_IsTrue(py_negate)) {
+ for(i=0; i<256; i++)
+ tosplit[i] = !tosplit[i];
+ }
+
+ /* Create a new list to store the variables. */
+ if(!(strlist = PyList_New(0))) {
+ PyErr_SetString(PyExc_SystemError, "I could not create a new list");
+ return NULL;
+ }
+
+ prev = 0;
+ nsplit = 0;
+ for(i=0; str[i] && (maxsplit == 0 || nsplit < maxsplit); i++) {
+ /*if(!(tosplit[(int)str[i]] == !negate))
+ continue; */
+ if(!tosplit[(int)str[i]])
+ continue;
+
+ /* Split the string here. */
+ if(!(newstr = PyString_FromStringAndSize(&str[prev], i-prev))) {
+ PyErr_SetString(PyExc_SystemError,
+ "I could not create a new string");
+ break;
+ }
+ if(PyList_Append(strlist, newstr) == -1) {
+ Py_DECREF(newstr);
+ break;
+ }
+ Py_DECREF(newstr);
+ prev = i+1;
+ nsplit++;
+ }
+ if(!PyErr_Occurred()) {
+ i = strlen(str);
+ /* Add the last one. */
+ if(!(newstr = PyString_FromStringAndSize(&str[prev], i-prev))) {
+ PyErr_SetString(PyExc_SystemError,
+ "I could not create a new string");
+ } else {
+ PyList_Append(strlist, newstr);
+ Py_DECREF(newstr);
+ }
+ } else {
+ Py_DECREF(strlist);
+ return NULL;
+ }
+
+
+ return strlist;
+}
+
+
+
+/* Module definition stuff */
+
+static PyMethodDef cstringfnsMethods[] = {
+ {"splitany", (PyCFunction)cstringfns_splitany, METH_VARARGS|METH_KEYWORDS,
+ cstringfns_splitany__doc__},
+ {NULL, NULL}
+};
+
+static char cstringfns__doc__[] =
+"This provides helper functions for the stringfns module.\n\
+You should never import this module on its own.\n\
+\n\
+";
+
+void initcstringfns(void)
+{
+ (void) Py_InitModule3("cstringfns", cstringfnsMethods, cstringfns__doc__);
+}
--- /dev/null
+/* Copyright 2002 by Jeffrey Chang. All rights reserved.
+ * This code is part of the Biopython distribution and governed by its
+ * license. Please see the LICENSE file that should have been included
+ * as part of this package.
+ *
+ * csupport.c
+ * Created 27 January 2002
+ *
+ * Miscellaneous useful C functions not to be exported as a python
+ * module.
+ *
+ */
+
+#include "Python.h"
+
+
+/* Return a PyNumber as a double.
+ * Raises a TypeError if I can't do it.
+ */
+double PyNumber_AsDouble(PyObject *py_num)
+{
+ double val;
+ PyObject *floatobj;
+
+ if((floatobj = PyNumber_Float(py_num)) == NULL)
+ return(0.0);
+ val = PyFloat_AsDouble(floatobj);
+ Py_DECREF(floatobj);
+ return val;
+}
--- /dev/null
+
+double PyNumber_AsDouble(PyObject *py_num);
--- /dev/null
+"""
+This module provides code for various distance measures.
+
+Functions:
+euclidean Euclidean distance between two points
+euclidean_py Pure Python implementation of euclidean.
+
+"""
+# XXX cosine distance
+
+import warnings
+warnings.warn("Bio.distance is deprecated. If you use this module, please notify the Biopython developers at biopython-dev@biopython.org", DeprecationWarning)
+
+from numpy import *
+
+def euclidean(x, y):
+ """euclidean(x, y) -> euclidean distance between x and y"""
+ if len(x) != len(y):
+ raise ValueError("vectors must be same length")
+ #return sqrt(sum((x-y)**2))
+ # Optimization by John Corradi (JCorradi@msn.com)
+ d = x-y
+ return sqrt(dot(d, d))
+
+def euclidean_py(x, y):
+ """euclidean_py(x, y) -> euclidean distance between x and y"""
+ # lightly modified from implementation by Thomas Sicheritz-Ponten.
+ # This works faster than the Numeric implementation on shorter
+ # vectors.
+ if len(x) != len(y):
+ raise ValueError("vectors must be same length")
+ sum = 0
+ for i in range(len(x)):
+ sum += (x[i]-y[i])**2
+ return sqrt(sum)
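+
+if __name__ == "__main__":
+ # Minimal sanity check (illustrative only, not part of the original
+ # module): both implementations should agree on a 3-4-5 triangle.
+ print euclidean(array([0.0, 0.0]), array([3.0, 4.0])) # 5.0
+ print euclidean_py((0.0, 0.0), (3.0, 4.0)) # 5.0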
--- /dev/null
+#!/usr/bin/env python
+
+"""
+This module provides code for doing k-nearest-neighbors classification.
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based on the classes in its surrounding neighborhood.
+
+Glossary:
+distance The distance between two points in the feature space.
+weight The importance given to each point for classification.
+
+
+Classes:
+kNN Holds information for a nearest neighbors classifier.
+
+
+Functions:
+train Train a new kNN classifier.
+calculate Calculate the probabilities of each class, given an observation.
+classify Classify an observation into a class.
+
+ Weighting Functions:
+equal_weight Every example is given a weight of 1.
+
+"""
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+import numpy
+
+class kNN:
+ """Holds information necessary to do nearest neighbors classification.
+
+ Members:
+ classes Set of the possible classes.
+ xs List of the neighbors.
+ ys List of the classes that the neighbors belong to.
+ k Number of neighbors to look at.
+
+ """
+ def __init__(self):
+ """kNN()"""
+ self.classes = set()
+ self.xs = []
+ self.ys = []
+ self.k = None
+
+def equal_weight(x, y):
+ """equal_weight(x, y) -> 1"""
+ # everything gets 1 vote
+ return 1
+
+def train(xs, ys, k, typecode=None):
+ """train(xs, ys, k) -> kNN
+
+ Train a k nearest neighbors classifier on a training set. xs is a
+ list of observations and ys is a list of the class assignments.
+ Thus, xs and ys should contain the same number of elements. k is
+ the number of neighbors that should be examined when doing the
+ classification.
+
+ """
+ knn = kNN()
+ knn.classes = set(ys)
+ knn.xs = numpy.asarray(xs, typecode)
+ knn.ys = ys
+ knn.k = k
+ return knn
+
+def calculate(knn, x, weight_fn=equal_weight, distance_fn=None):
+ """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
+
+ Calculate the probability for each class. knn is a kNN object. x
+ is the observed data. weight_fn is an optional function that
+ takes x and a training example, and returns a weight. distance_fn
+ is an optional function that takes two points and returns the
+ distance between them. If distance_fn is None (the default), the
+ Euclidean distance is used. Returns a dictionary of the class to
+ the weight given to the class.
+
+ """
+ x = numpy.asarray(x)
+
+ order = [] # list of (distance, index)
+ if distance_fn:
+ for i in range(len(knn.xs)):
+ dist = distance_fn(x, knn.xs[i])
+ order.append((dist, i))
+ else:
+ # Default: Use a fast implementation of the Euclidean distance
+ temp = numpy.zeros(len(x))
+ # Predefining temp allows reuse of this array, making this
+ # function about twice as fast.
+ for i in range(len(knn.xs)):
+ temp[:] = x - knn.xs[i]
+ dist = numpy.sqrt(numpy.dot(temp,temp))
+ order.append((dist, i))
+ order.sort()
+
+ # first 'k' are the ones I want.
+ weights = {} # class -> number of votes
+ for k in knn.classes:
+ weights[k] = 0.0
+ for dist, i in order[:knn.k]:
+ klass = knn.ys[i]
+ weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
+
+ return weights
+
+def classify(knn, x, weight_fn=equal_weight, distance_fn=None):
+ """classify(knn, x[, weight_fn][, distance_fn]) -> class
+
+ Classify an observation into a class. If not specified, weight_fn will
+ give all neighbors equal weight. distance_fn is an optional function
+ that takes two points and returns the distance between them. If
+ distance_fn is None (the default), the Euclidean distance is used.
+ """
+ weights = calculate(
+ knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
+
+ most_class = None
+ most_weight = None
+ for klass, weight in weights.items():
+ if most_class is None or weight > most_weight:
+ most_class = klass
+ most_weight = weight
+ return most_class
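+
+if __name__ == "__main__":
+ # Tiny demonstration of the API described above (toy data, purely
+ # illustrative; not part of the original module).
+ xs = [[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]
+ ys = [0, 0, 1, 1]
+ model = train(xs, ys, 3)
+ print calculate(model, [1.0, 0.8]) # class -> summed vote weight
+ print classify(model, [1.0, 0.8]) # 1: two of the three nearest
+ # neighbours belong to class 1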
--- /dev/null
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""This provides useful general functions for working with lists (OBSOLETE).
+
+This module and its C code equivalent are considered to be obsolete, and
+are likely to be deprecated in a future release of Biopython, before being
+removed. Please get in touch via the mailing list if this will affect you.
+Many of these functions can be avoided using the python set object.
+
+Functions:
+asdict Make the list into a dictionary (for fast testing of membership).
+items Get one of each item in a list.
+count Count the number of times each item appears.
+contents Calculate percentage each item appears in a list.
+itemindex Make an index of the items in the list.
+intersection Get the items in common between 2 lists.
+difference Get the items in 1 list, but not the other.
+indexesof Get a list of the indexes of some items in a list.
+take Take some items from a list.
+
+"""
+
+def asdict(l):
+ """asdict(l) -> dictionary
+
+ Return a dictionary where the keys are the items in the list, with
+ arbitrary values. This is useful for quick testing of membership.
+
+ """
+ return count(l)
+
+def items(l):
+ """items(l) -> list of items
+
+ Generate a list of one of each item in l. The items are returned
+ in arbitrary order.
+
+ """
+ try:
+ return asdict(l).keys()
+ except TypeError, x:
+ if str(x).find("unhashable") == -1:
+ raise
+ # asdict failed because l is unhashable. Back up to a naive
+ # implementation.
+ l = l[:]
+ l.sort()
+ i = 0
+ while i < len(l)-1:
+ if l[i] == l[i+1]:
+ del l[i]
+ else:
+ i += 1
+ return l
+
+def count(items):
+ """count(items) -> dict of counts of each item
+
+ Count the number of times each item appears in a list of data.
+
+ """
+ c = {}
+ for i in items:
+ c[i] = c.get(i, 0) + 1
+ return c
+
+def contents(items):
+ """contents(items) -> dict of item:percentage
+
+ Summarize the contents of the list in terms of the percentages of each
+ item. For example, if an item appears 3 times in a list with 10 items,
+ it is in 0.3 of the list.
+
+ """
+ counts = count(items)
+ l = float(len(items))
+ contents = {}
+ for i, c in counts.items():
+ contents[i] = c / l
+ return contents
+
+def intersection(l1, l2):
+ """intersection(l1, l2) -> list of common items
+
+ Return a list of the items in both l1 and l2. The list is in
+ arbitrary order.
+
+ """
+ inter = []
+ words1 = count(l1)
+ for w in l2:
+ if words1.has_key(w):
+ inter.append(w)
+ del words1[w] # don't add the same word twice
+ return inter
+
+def difference(l1, l2):
+ """difference(l1, l2) -> list of items in l1, but not l2
+
+ Return a list of the items in l1, but not l2. The list is in
+ arbitrary order.
+
+ """
+ diff = []
+ words2 = count(l2)
+ for w in l1:
+ if not words2.has_key(w):
+ diff.append(w)
+ words2[w] = 1 # don't add the same word twice
+ return diff
+
+def itemindex(l):
+ """itemindex(l) -> dict of item : index of item
+
+ Make an index of the items in the list. The dictionary contains
+ the items in the list as the keys, and the index of the first
+ occurrence of the item as the value.
+
+ """
+ dict = {}
+ for i in range(len(l)):
+ if not dict.has_key(l[i]):
+ dict[l[i]] = i
+ return dict
+
+def indexesof(l, fn, opposite=0):
+ """indexesof(l, fn) -> list of indexes
+
+ Return a list of indexes i where fn(l[i]) is true.
+
+ """
+ indexes = []
+ for i in range(len(l)):
+ f = fn(l[i])
+ if (not opposite and f) or (opposite and not f):
+ indexes.append(i)
+ return indexes
+
+def take(l, indexes):
+ """take(l, indexes) -> list of just the indexes from l"""
+ items = []
+ for i in indexes:
+ items.append(l[i])
+ return items
+
+def take_byfn(l, fn, opposite=0):
+ indexes = indexesof(l, fn, opposite=opposite)
+ return take(l, indexes)
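+
+# A few illustrative calls (results assume the pure Python code above):
+# count(['a', 'b', 'a']) -> {'a': 2, 'b': 1}
+# contents(['a', 'b', 'a', 'a']) -> {'a': 0.75, 'b': 0.25}
+# itemindex(['x', 'y', 'x']) -> {'x': 0, 'y': 1}
+# indexesof([1, 5, 2, 8], lambda x: x > 2) -> [1, 3]
+# take(['a', 'b', 'c', 'd'], [1, 3]) -> ['b', 'd']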
+
+# Try and load C implementations of functions. If I can't,
+# then just ignore and use the pure python implementations.
+try:
+ from clistfns import *
+except ImportError:
+ pass
--- /dev/null
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""This provides useful general math tools (DEPRECATED).
+
+This module and its C code equivalent are considered to be deprecated, and
+are likely to be removed in a future release of Biopython. Please get in
+touch via the mailing list if this will affect you.
+
+Functions:
+fcmp Compare two floating point numbers, up to a specified precision.
+intd Represent a floating point number as an integer.
+safe_log log, but returns an arbitrarily small number for log(0).
+safe_exp exp, but returns a large or small number instead of overflows.
+
+"""
+import warnings
+warnings.warn("Bio.mathfns and its C code equivalent Bio.cmathfns are" \
+ +" deprecated, and will be removed in a future release of"\
+ +" Biopython. If you want to continue to use this code,"\
+ +" please get in contact with the Biopython developers via"\
+ +" the mailing lists to avoid its permanent removal from"\
+ +" Biopython.", \
+ DeprecationWarning)
+
+import math
+
+def fcmp(x, y, precision):
+ """fcmp(x, y, precision) -> -1, 0, or 1"""
+ if math.fabs(x-y) < precision:
+ return 0
+ elif x < y:
+ return -1
+ return 1
+
+def intd(x, digits_after_decimal=0):
+ """intd(x[, digits_after_decimal]) -> int x, rounded
+
+ Represent a floating point number with some digits after the
+ decimal point as an integer. This is useful when floating point
+ comparisons are failing due to precision problems. e.g.
+ intd(5.35, 1) -> 54.
+
+ """
+ precision = 10.**digits_after_decimal
+ if x >= 0:
+ x = int(x * precision + 0.5)
+ else:
+ x = int(x * precision - 0.5)
+ return x
+
+def safe_log(n, zero=None, neg=None):
+ """safe_log(n, zero=None, neg=None) -> log(n)
+
+ Calculate the log of n. If n is 0, returns the value of zero. If n is
+ negative, returns the value of neg.
+
+ """
+ if n < 0:
+ return neg
+ elif n < 1E-100:
+ return zero
+ return math.log(n)
+
+LOG2 = math.log(2)
+def safe_log2(n, zero=None, neg=None):
+ """safe_log2(n, zero=None, neg=None) -> log(n)
+
+ Calculate the log base 2 of n. If n is 0, returns the value of
+ zero. If n is negative, returns the value of neg.
+
+ """
+ l = safe_log(n, zero=zero, neg=neg)
+ if l is None:
+ return l
+ return l/LOG2
+
+def safe_exp(n, under=None, over=None):
+ """safe_exp(n, under=None, over=None) -> e**n
+
+ Guaranteed not to overflow. Instead of overflowing, it returns
+ the values of 'under' for underflows or 'over' for overflows.
+
+ """
+ try:
+ return math.exp(n)
+ except OverflowError:
+ if n < 0:
+ return under
+ return over
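+# Illustrative behaviour of the helpers above (not part of the original
+# module):
+# fcmp(0.1 + 0.2, 0.3, 1e-9) -> 0 (equal up to the given precision)
+# intd(5.35, 1) -> 54
+# safe_log(0, zero=-1e300) -> -1e300
+# safe_log2(8) -> 3.0
+# safe_exp(1e6, over=1e300) -> 1e300 (instead of an OverflowError)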
+
+# Try and load C implementations of functions. If I can't,
+# then just ignore and use the pure python implementations.
+try:
+ from cmathfns import *
+except ImportError:
+ pass
--- /dev/null
+# Copyright 2000 by Jeffrey Chang. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""This provides useful general functions for working with strings (DEPRECATED).
+
+This module and its C code equivalent are considered to be deprecated, and
+are likely to be removed in a future release of Biopython. Please get in
+touch via the mailing list if this will affect you.
+
+Functions:
+splitany Split a string using many delimiters.
+find_anychar Find one of a list of characters in a string.
+rfind_anychar Find one of a list of characters in a string, from end to start.
+
+"""
+import warnings
+warnings.warn("Bio.stringfns and its C code equivalent Bio.cstringfns are" \
+ +" deprecated, and will be removed in a future release of"\
+ +" Biopython. If you want to continue to use this code,"\
+ +" please get in contact with the Biopython developers via"\
+ +" the mailing lists to avoid its permanent removal from"\
+ +" Biopython.", \
+ DeprecationWarning)
+
+def splitany(s, sep=" \011\012\013\014\015", maxsplit=None, negate=0):
+ """splitany(s [,sep [,maxsplit [,negate]]]) -> list of strings
+
+ Split a string. Similar to string.split, except that this considers
+ any one of the characters in sep to be a delimiter. If negate is
+ true, then everything but sep will be a separator.
+
+ """
+ strlist = []
+ prev = 0
+ for i in range(len(s)):
+ if maxsplit is not None and len(strlist) >= maxsplit:
+ break
+ if (s[i] in sep) == (not negate):
+ strlist.append(s[prev:i])
+ prev = i+1
+ strlist.append(s[prev:])
+ return strlist
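+
+# For example (illustrative): splitany("a,b;c", ",;") -> ['a', 'b', 'c'],
+# while splitany("a,b;c", ",;", negate=1) treats everything except ','
+# and ';' as a separator and gives ['', ',', ';', ''].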
+
+def find_anychar(string, chars, index=None, negate=0):
+ """find_anychar(string, chars[, index]) -> index of a character or -1
+
+ Find a character in string. chars is a list of characters to look
+ for. Return the index of the first occurrence of any of the
+ characters, or -1 if not found. index is the index where the
+ search should start. By default, I search from the beginning of
+ the string.
+
+ """
+ if index is None:
+ index = 0
+ while index < len(string) and \
+ ((not negate and string[index] not in chars) or
+ (negate and string[index] in chars)):
+ index += 1
+ if index == len(string):
+ return -1
+ return index
+
+def rfind_anychar(string, chars, index=None, negate=0):
+ """rfind_anychar(string, chars[, index]) -> index of a character or -1
+
+ Find a character in string, looking from the end to the start.
+ chars is a list of characters to look for. Return the index of
+ the first occurrence of any of the characters, or -1 if not found.
+ index is the index where the search should start. By default, I
+ search from the end of the string.
+
+ """
+ if index is None:
+ index = len(string)-1
+ while index >= 0 and \
+ ((not negate and string[index] not in chars) or
+ (negate and string[index] in chars)):
+ index -= 1
+ # If not found, index will already be -1.
+ return index
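+
+# For example (illustrative):
+# find_anychar("abcde", "dc") -> 2 (the 'c')
+# rfind_anychar("abcde", "dc") -> 3 (the 'd')
+# find_anychar("abcde", "xyz") -> -1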
+
+# Try and load C implementations of functions. If I can't,
+# then just ignore and use the pure python implementations.
+try:
+ from cstringfns import *
+except ImportError:
+ pass
--- /dev/null
+#include <stdio.h> /* printf */
+#include <stdlib.h> /* malloc */
+#include <string.h> /* strcmp, strlen */
+
+#include "trie.h"
+
+/* The following is necessary to make sure that trie.pyd won't link
+ * to msvcrt.dll in addition to msvcr71.dll on Windows.
+ * See Bug #1767 on Bugzilla.
+ */
+#ifdef __MINGW32__
+# define strdup _strdup
+#endif
+
+struct _Transition; /* Forward declaration, needed in _Trie. */
+
+
+/* _Trie is a recursive data structure. A _Trie contains zero or more
+ * _Transitions that lead to more _Tries. The transitions are stored
+ * in alphabetical order of the suffix member of the data structure.
+ * _Trie also contains a pointer called value where the user can store
+ * arbitrary data. If value is NULL, then no data is stored here.
+ */
+struct _Trie {
+ struct _Transition *transitions;
+ unsigned char num_transitions;
+ void *value; /* specified by user, never freed or allocated by me! */
+};
+
+/* _Transition holds information about the transitions leading from
+ * one _Trie to another. The trie structure here is different from
+ * typical ones, because the transitions between nodes can contain
+ * strings of arbitrary length, not just single characters. Suffix is
+ * the string that is matched from one node to the next.
+ */
+typedef struct _Transition {
+ unsigned char *suffix;
+ Trie next;
+} *Transition;
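+
+/* For example (illustrative): inserting "hello" and then "hell" into an
+ * empty trie leaves the root with a single transition whose suffix is
+ * "hell". It leads to a node holding the value for "hell", which in
+ * turn has one transition "o" to the node holding the value for
+ * "hello". Trie_set below performs exactly this kind of split whenever
+ * a new key shares leading characters with an existing suffix.
+ */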
+
+
+#define MAX_KEY_LENGTH 1000
+static unsigned char KEY[MAX_KEY_LENGTH];
+
+
+Trie Trie_new(void) {
+ Trie trie;
+
+ if(!(trie = (Trie)malloc(sizeof(struct _Trie))))
+ return NULL;
+ trie->transitions = NULL;
+ trie->num_transitions = 0;
+ trie->value = NULL;
+ return trie;
+}
+
+int Trie_set(Trie trie, const unsigned char *key, const void *value) {
+ int i;
+ Transition transition=NULL;
+ unsigned char *suffix=NULL;
+ int retval = 0;
+ int first, last, mid;
+
+ if(!key[0]) {
+ trie->value = (void *)value;
+ return 0;
+ }
+
+ /* Insert the key in alphabetical order. Do a binary search to
+ find the proper place. */
+ first = 0;
+ last = trie->num_transitions-1;
+ i = -1;
+ while(first <= last) {
+ mid = (first+last)/2;
+ transition = &trie->transitions[mid];
+ suffix = transition->suffix;
+ if(key[0] < suffix[0])
+ last = mid-1;
+ else if(key[0] > suffix[0])
+ first = mid+1;
+ else {
+ i = mid;
+ break;
+ }
+ }
+
+ /* If no place was found for it, then the indexes will be in the
+ order last,first. Place it at index first. */
+ if(i == -1)
+ i = first;
+
+ /* If nothing matches, then insert a new trie here. */
+ if((i >= trie->num_transitions) || (key[0] != suffix[0])) {
+ unsigned char *new_suffix=NULL;
+ Trie newtrie=NULL;
+ Transition new_transitions=NULL;
+
+ /* Create some variables for the new transition. I'm going to
+ allocate these first so that I can detect memory errors
+ before I mess up the data structure of the transitions.
+ */
+ if(!(new_suffix = (unsigned char *)strdup(key)))
+ goto insert_memerror;
+ if(!(newtrie = Trie_new()))
+ goto insert_memerror;
+
+ /* Create some space for the next transition. Allocate some
+ memory and shift the old transitions over to make room for
+ this one.
+ */
+ if(!(new_transitions = malloc(sizeof(struct _Transition) *
+ (trie->num_transitions+1))))
+ goto insert_memerror;
+ memcpy(new_transitions, trie->transitions,
+ sizeof(struct _Transition)*i);
+ memcpy(&new_transitions[i+1], &trie->transitions[i],
+ sizeof(struct _Transition)*(trie->num_transitions-i));
+ free(trie->transitions);
+ trie->transitions = new_transitions;
+ new_transitions = NULL;
+ trie->num_transitions += 1;
+
+ /* Initialize the new transition. */
+ transition = &trie->transitions[i];
+ transition->suffix = new_suffix;
+ transition->next = newtrie;
+ transition->next->value = (void *)value;
+
+ if(0) {
+ insert_memerror:
+ if(new_transitions) free(new_transitions);
+ if(newtrie) free(newtrie);
+ if(new_suffix) free(new_suffix);
+ return 1;
+ }
+ }
+ /* There are three cases where the key and suffix share some
+ letters.
+ 1. suffix is a proper prefix of key.
+ 2. key is a proper prefix of suffix.
+ 3. neither is a prefix of the other (they share only some
+ leading characters).
+
+ For cases 2 and 3, I need to first split up the transition
+ based on the number of characters shared. Then, I can insert
+ the rest of the key into the next trie.
+ */
+ else {
+ /* Count the number of characters shared between key
+ and suffix. */
+ int chars_shared = 0;
+ while(key[chars_shared] && key[chars_shared] == suffix[chars_shared])
+ chars_shared++;
+
+ /* Case 2 or 3, split this sucker! */
+ if(chars_shared < strlen(suffix)) {
+ Trie newtrie=NULL;
+ unsigned char *new_suffix1=NULL, *new_suffix2=NULL;
+
+ if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1)))
+ goto split_memerror;
+ strncpy(new_suffix1, key, chars_shared);
+ new_suffix1[chars_shared] = 0;
+ if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared)))
+ goto split_memerror;
+ if(!(newtrie = Trie_new()))
+ goto split_memerror;
+ if(!(newtrie->transitions =
+ (Transition)malloc(sizeof(struct _Transition))))
+ goto split_memerror;
+ newtrie->num_transitions = 1;
+ newtrie->transitions[0].next = transition->next;
+ newtrie->transitions[0].suffix = new_suffix2;
+
+ free(transition->suffix);
+ transition->suffix = new_suffix1;
+ transition->next = newtrie;
+
+ if(0) {
+ split_memerror:
+ if(newtrie && newtrie->transitions) free(newtrie->transitions);
+ if(newtrie) free(newtrie);
+ if(new_suffix2) free(new_suffix2);
+ if(new_suffix1) free(new_suffix1);
+ return 1;
+ }
+ }
+ retval = Trie_set(transition->next, key+chars_shared, value);
+ }
+
+ return retval;
+}
+
+void Trie_del(Trie trie) {
+ int i;
+ if(!trie)
+ return;
+ for(i=0; i<trie->num_transitions; i++) {
+ Transition transition = &trie->transitions[i];
+ if(transition->suffix)
+ free(transition->suffix);
+ Trie_del(transition->next);
+ }
+ free(trie);
+}
+
+void *Trie_get(const Trie trie, const unsigned char *key) {
+ int first, last, mid;
+
+ if(!key[0]) {
+ return trie->value;
+ }
+
+ /* The transitions are stored in alphabetical order. Do a binary
+ * search to find the proper one.
+ */
+ first = 0;
+ last = trie->num_transitions-1;
+ while(first <= last) {
+ Transition transition;
+ unsigned char *suffix;
+ int c;
+ mid = (first+last)/2;
+ transition = &trie->transitions[mid];
+ suffix = transition->suffix;
+ /* If suffix is a prefix of key, then get the value from
+ the next trie.
+ */
+ c = strncmp(key, suffix, strlen(suffix));
+ if(c < 0)
+ last = mid-1;
+ else if(c > 0)
+ first = mid+1;
+ else
+ return Trie_get(transition->next, key+strlen(suffix));
+ }
+ return NULL;
+}
+
+
+/* Mutually recursive, so need to make a forward declaration. */
+void
+_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ const int mismatches,
+ void *data),
+ void *data,
+ const int mismatches,
+ unsigned char *current_key, const int max_key
+ );
+
+void
+_get_approximate_transition(const unsigned char *key,
+ const int k,
+ const Transition transition,
+ const unsigned char *suffix,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ const int mismatches,
+ void *data),
+ void *data,
+ const int mismatches,
+ unsigned char *current_key, const int max_key
+ )
+{
+ int i;
+ int prev_keylen = strlen(current_key);
+
+ /* Short circuit optimization. If there are too many characters to
+ possibly be a match, then don't even try to match things. */
+ if((int)(strlen(suffix) - strlen(key)) > k)
+ return;
+
+ /* Match as many characters as possible. */
+ i = 0;
+ while(suffix[i] && (key[i] == suffix[i])) {
+ i++;
+ }
+ /* Check to make sure the key is not too long. BUG: If it is,
+ fails silently. */
+ if((prev_keylen+i) >= max_key)
+ return;
+ strncat(current_key, suffix, i);
+
+ /* If all the letters in the suffix matched, then move to the
+ next trie. */
+ if(!suffix[i]) {
+ _get_approximate_trie(transition->next, &key[i], k, callback, data,
+ mismatches, current_key, max_key);
+ }
+ /* Otherwise, try out different kinds of mismatches. */
+ else if(k) {
+ int new_keylen = prev_keylen+i;
+
+ /* Letter replacement, skip the next letter in both the key and
+ suffix. */
+ if((new_keylen+1 < max_key) && key[i] && suffix[i]) {
+ current_key[new_keylen] = suffix[i];
+ current_key[new_keylen+1] = 0;
+ _get_approximate_transition(&key[i+1], k-1,
+ transition, &suffix[i+1],
+ callback, data,
+ mismatches+1, current_key, max_key);
+ current_key[new_keylen] = 0;
+ }
+
+ /* Insertion in key, skip the next letter in the key. */
+ if(key[i]) {
+ _get_approximate_transition(&key[i+1], k-1,
+ transition, &suffix[i],
+ callback, data,
+ mismatches+1, current_key, max_key);
+ }
+
+ /* Deletion from key, skip the next letter in the suffix. */
+ if((new_keylen+1 < max_key) && suffix[i]) {
+ current_key[new_keylen] = suffix[i];
+ current_key[new_keylen+1] = 0;
+ _get_approximate_transition(&key[i], k-1,
+ transition, &suffix[i+1],
+ callback, data,
+ mismatches+1, current_key, max_key);
+ current_key[new_keylen] = 0;
+ }
+ }
+ current_key[prev_keylen] = 0;
+}
+
+void
+_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ const int mismatches,
+ void *data),
+ void *data,
+ const int mismatches,
+ unsigned char *current_key, const int max_key
+ )
+{
+ int i;
+
+ /* If there's no more key to match, then I'm done. */
+ if(!key[0]) {
+ if(trie->value)
+ (*callback)(current_key, trie->value, mismatches, data);
+ }
+ /* If there are no more mismatches allowed, then fall back to the
+ faster Trie_get. */
+ else if(!k) {
+ void *value = Trie_get(trie, key);
+ if(value) {
+ int l = strlen(current_key);
+ /* Make sure I have enough space for the full key. */
+ if(l + strlen(key) < max_key) {
+ strcat(current_key, key);
+ (*callback)(current_key, value, mismatches, data);
+ current_key[l] = 0;
+ }
+ /* BUG: Ran out of space for the key. This fails
+ silently, but should signal an error. */
+ }
+ }
+ /* If there are no more transitions, then all the characters left
+ in the key are mismatches. */
+ else if(!trie->num_transitions) {
+ if(trie->value && (strlen(key) <= k)) {
+ (*callback)(current_key, trie->value,
+ mismatches+strlen(key), data);
+ }
+ }
+ /* Otherwise, try to match each of the transitions. */
+ else {
+ for(i=0; i<trie->num_transitions; i++) {
+ Transition transition = &trie->transitions[i];
+ unsigned char *suffix = transition->suffix;
+ _get_approximate_transition(key, k, transition, suffix,
+ callback, data,
+ mismatches, current_key, max_key);
+ }
+ }
+
+}
+
+
+void
+Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ const int mismatches,
+ void *data),
+ void *data
+ )
+{
+ KEY[0] = 0;
+ _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH);
+}
+
+int Trie_len(const Trie trie)
+{
+ int i;
+ int length = 0;
+
+ if(!trie)
+ return 0;
+ if(trie->value)
+ length += 1;
+ for(i=0; i<trie->num_transitions; i++) {
+ length += Trie_len(trie->transitions[i].next);
+ }
+ return length;
+}
+
+int Trie_has_key(const Trie trie, const unsigned char *key)
+{
+ return Trie_get(trie, key) != NULL;
+}
+
+int Trie_has_prefix(const Trie trie, const unsigned char *prefix)
+{
+ int first, last, mid;
+
+ if(!prefix[0]) {
+ return 1;
+ }
+
+ /* The transitions are stored in alphabetical order. Do a binary
+ * search to find the proper one.
+ */
+ first = 0;
+ last = trie->num_transitions-1;
+ while(first <= last) {
+ Transition transition;
+ unsigned char *suffix;
+ int suffixlen, prefixlen, minlen;
+ int c;
+ mid = (first+last)/2;
+ transition = &trie->transitions[mid];
+ suffix = transition->suffix;
+ suffixlen = strlen(suffix);
+ prefixlen = strlen(prefix);
+ minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
+ c = strncmp(prefix, suffix, minlen);
+ if(c < 0)
+ last = mid-1;
+ else if(c > 0)
+ first = mid+1;
+ else
+ return Trie_has_prefix(transition->next, prefix+minlen);
+ }
+ return 0;
+}
+
+static void
+_iterate_helper(const Trie trie,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data,
+ unsigned char *current_key, const int max_key)
+{
+ int i;
+ if(trie->value)
+ (*callback)(current_key, trie->value, data);
+ for(i=0; i<trie->num_transitions; i++) {
+ Transition transition = &trie->transitions[i];
+ unsigned char *suffix = transition->suffix;
+ int keylen = strlen(current_key);
+
+ if(keylen + strlen(suffix) >= max_key) {
+ /* BUG: This will fail silently. It should raise some
+ sort of error. */
+ continue;
+ }
+ strcat(current_key, suffix);
+ _iterate_helper(transition->next, callback, data,
+ current_key, max_key);
+ current_key[keylen] = 0;
+ }
+}
+
+void
+Trie_iterate(const Trie trie,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data)
+{
+ KEY[0] = 0;
+ _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH);
+}
+
+static void
+_with_prefix_helper(const Trie trie, const unsigned char *prefix,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data,
+ unsigned char *current_key, const int max_key)
+{
+ int first, last, mid;
+
+ if(!prefix[0]) {
+ _iterate_helper(trie, callback, data, current_key, max_key);
+ return;
+ }
+
+ /* The transitions are stored in alphabetical order. Do a binary
+ * search to find the proper one.
+ */
+ first = 0;
+ last = trie->num_transitions-1;
+ while(first <= last) {
+ Transition transition;
+ unsigned char *suffix;
+ int suffixlen, prefixlen, minlen;
+ int c;
+ mid = (first+last)/2;
+ transition = &trie->transitions[mid];
+ suffix = transition->suffix;
+ suffixlen = strlen(suffix);
+ prefixlen = strlen(prefix);
+ minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
+ c = strncmp(prefix, suffix, minlen);
+ if(c < 0)
+ last = mid-1;
+ else if(c > 0)
+ first = mid+1;
+ else {
+ int keylen = strlen(current_key);
+ if(keylen + minlen >= max_key) {
+ /* BUG: This will fail silently. It should raise some
+ sort of error. */
+ break;
+ }
+ strncat(current_key, suffix, minlen);
+ _with_prefix_helper(transition->next, prefix+minlen,
+ callback, data, current_key, max_key);
+ current_key[keylen] = 0;
+ break;
+ }
+ }
+}
+
+void
+Trie_with_prefix(const Trie trie, const unsigned char *prefix,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data
+ )
+{
+ KEY[0] = 0;
+ _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH);
+}
+
+
+
+/* Need to declare _serialize_transition here so it can be called from
+ _serialize_trie. */
+int _serialize_transition(const Transition transition,
+ int (*write)(const void *towrite, const int length,
+ void *data),
+ int (*write_value)(const void *value, void *data),
+ void *data);
+
+/* This library also provides code for flattening tries so that they
+ * can be saved and read back in later. The format of a serialized
+ * trie is:
+ * TYPE NBYTES DESCRIPTION
+ * byte 1 Whether or not there is a value
+ * variable variable If there is a value, let the client store it.
+ * byte 1 Number of transitions for this Trie.
+ * transition variable
+ * int 4 Number of characters in the suffix.
+ * suffix variable The suffix for this transition.
+ * byte 1 Whether or not there is a trie
+ * trie variable Recursively points to another trie.
+ *
+ * The number of bytes and the endianness may vary from platform to
+ * platform.
+ */
+
+int _serialize_trie(const Trie trie,
+ int (*write)(const void *towrite, const int length,
+ void *data),
+ int (*write_value)(const void *value, void *data),
+ void *data)
+{
+ int i;
+ unsigned char has_value;
+
+ has_value = (trie->value != NULL);
+ if(!(*write)(&has_value, sizeof(has_value), data))
+ return 0;
+ if(has_value) {
+ if(!(*write_value)(trie->value, data))
+ return 0;
+ }
+
+ if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+ return 0;
+ for(i=0; i<trie->num_transitions; i++) {
+ if(!_serialize_transition(&trie->transitions[i],
+ write, write_value, data))
+ return 0;
+ }
+
+ return 1;
+}
+
+int _serialize_transition(const Transition transition,
+ int (*write)(const void *towrite, const int length,
+ void *data),
+ int (*write_value)(const void *value, void *data),
+ void *data)
+{
+ int suffixlen;
+ unsigned char has_trie;
+
+ suffixlen = strlen(transition->suffix);
+ if(!(*write)(&suffixlen, sizeof(suffixlen), data))
+ return 0;
+ if(!(*write)(transition->suffix, suffixlen, data))
+ return 0;
+
+ has_trie = (transition->next != NULL);
+ if(!(*write)(&has_trie, sizeof(has_trie), data))
+ return 0;
+ if(has_trie) {
+ if(!_serialize_trie(transition->next, write, write_value, data))
+ return 0;
+ }
+ return 1;
+}
+
+int Trie_serialize(const Trie trie,
+ int (*write)(const void *towrite, const int length,
+ void *data),
+ int (*write_value)(const void *value, void *data),
+ void *data)
+{
+ int success = _serialize_trie(trie, write, write_value, data);
+ (*write)(NULL, 0, data);
+ return success;
+}
+
+int _deserialize_transition(Transition transition,
+ int (*read)(void *wasread, const int length,
+ void *data),
+ void *(*read_value)(void *data),
+ void *data);
+
+int _deserialize_trie(Trie trie,
+ int (*read)(void *wasread, const int length, void *data),
+ void *(*read_value)(void *data),
+ void *data)
+{
+ int i;
+ unsigned char has_value;
+
+ if(!(*read)(&has_value, sizeof(has_value), data))
+ goto _deserialize_trie_error;
+ if(has_value != 0 && has_value != 1)
+ goto _deserialize_trie_error;
+ if(has_value) {
+ if(!(trie->value = (*read_value)(data)))
+ goto _deserialize_trie_error;
+ }
+ if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+ goto _deserialize_trie_error;
+ if(!(trie->transitions =
+ malloc(trie->num_transitions*sizeof(struct _Transition))))
+ goto _deserialize_trie_error;
+ for(i=0; i<trie->num_transitions; i++) {
+ if(!_deserialize_transition(&trie->transitions[i],
+ read, read_value, data))
+ goto _deserialize_trie_error;
+ }
+ return 1;
+
+ _deserialize_trie_error:
+ trie->num_transitions = 0;
+ if(trie->transitions) {
+ free(trie->transitions);
+ trie->transitions = NULL;
+ }
+ trie->value = NULL;
+ return 0;
+}
+
+int _deserialize_transition(Transition transition,
+ int (*read)(void *wasread, const int length,
+ void *data),
+ void *(*read_value)(void *data),
+ void *data)
+{
+ int suffixlen;
+ unsigned char has_trie;
+
+ if(!(*read)(&suffixlen, sizeof(suffixlen), data))
+ goto _deserialize_transition_error;
+ if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH)
+ goto _deserialize_transition_error;
+ if(!(*read)(KEY, suffixlen, data))
+ goto _deserialize_transition_error;
+ KEY[suffixlen] = 0;
+ if(!(transition->suffix = (unsigned char *)strdup(KEY)))
+ goto _deserialize_transition_error;
+ if(!(*read)(&has_trie, sizeof(has_trie), data))
+ goto _deserialize_transition_error;
+ if(has_trie != 0 && has_trie != 1)
+ goto _deserialize_transition_error;
+ if(has_trie) {
+ transition->next = Trie_new();
+ if(!_deserialize_trie(transition->next, read, read_value, data))
+ goto _deserialize_transition_error;
+ }
+ return 1;
+
+ _deserialize_transition_error:
+ if(transition->suffix) {
+ free(transition->suffix);
+ transition->suffix = NULL;
+ }
+ if(transition->next) {
+ Trie_del(transition->next);
+ transition->next = NULL;
+ }
+ return 0;
+}
+
+Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data),
+ void *(*read_value)(void *data),
+ void *data)
+{
+ Trie trie = Trie_new();
+ if(!_deserialize_trie(trie, read, read_value, data)) {
+ Trie_del(trie);
+ return NULL;
+ }
+ return trie;
+}
+
+void test(void) {
+ Trie trie;
+
+ printf("Hello world!\n");
+
+ trie = Trie_new();
+ printf("New trie %p\n", trie);
+ Trie_set(trie, "hello world", "s1");
+ Trie_set(trie, "bye", "s2");
+ Trie_set(trie, "hell sucks", "s3");
+ Trie_set(trie, "hebee", "s4");
+
+ printf("%s\n", (char *)Trie_get(trie, "hello world"));
+ printf("%s\n", (char *)Trie_get(trie, "bye"));
+ printf("%s\n", (char *)Trie_get(trie, "hell sucks"));
+ printf("%s\n", (char *)Trie_get(trie, "hebee"));
+
+ Trie_set(trie, "blah", "s5");
+ printf("%s\n", (char *)Trie_get(trie, "blah"));
+
+ printf("%p\n", Trie_get(trie, "foobar"));
+ printf("%d\n", Trie_len(trie));
+
+ Trie_set(trie, "blah", "snew");
+ printf("%s\n", (char *)Trie_get(trie, "blah"));
+
+ Trie_del(trie);
+}
+
+#if 0
+int main() {
+ test();
+}
+#endif
--- /dev/null
+typedef struct _Trie *Trie;
+
+
+
+/* Trie_new
+ * --------
+ * Create a new trie. Return a Trie structure, which is an abstract
+ * data structure. The client should not have to know about the
+ * details of this structure. When finished, each Trie should be
+ * freed with Trie_del.
+ */
+Trie Trie_new(void);
+
+
+/* Trie_del
+ * --------
+ * Free a Trie data structure.
+ */
+void Trie_del(Trie trie);
+
+
+/* Trie_set
+ * --------
+ * Set a string in the Trie to some value. Returns a 0 if the
+ * function succeeded.
+ */
+int Trie_set(Trie trie, const unsigned char *key, const void *value);
+
+/* Trie_get
+ * --------
+ * Lookup whether a key exists in the Trie. Returns the value that
+ * was previously set in the Trie, or NULL if it doesn't exist.
+ */
+void *Trie_get(const Trie trie, const unsigned char *key);
+
+
+/* Trie_get_approximate
+ * --------------------
+ * Lookup whether a key exists in the Trie, allowing for mismatches to
+ * the dictionary. Passes back values using a callback function.
+ */
+void
+Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ const int mismatches,
+ void *data),
+ void *data
+ );
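+
+/* A minimal callback sketch for Trie_get_approximate (illustrative,
+ * not part of this header): count how many keys lie within one
+ * mismatch of "hello".
+ *
+ * static void count_hits(const unsigned char *key, const void *value,
+ * const int mismatches, void *data)
+ * {
+ * (*(int *)data)++;
+ * }
+ *
+ * int nhits = 0;
+ * Trie_get_approximate(trie, (const unsigned char *)"hello", 1,
+ * count_hits, &nhits);
+ */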
+
+/* Trie_len
+ * --------
+ * Return the number of strings in the trie.
+ */
+int Trie_len(const Trie trie);
+
+
+/* Trie_has_key
+ * ------------
+ * Return whether a key exists in the trie.
+ */
+int Trie_has_key(const Trie trie, const unsigned char *key);
+
+
+/* Trie_has_prefix
+ * ---------------
+ * Return whether a string is a prefix of a key in the trie.
+ */
+int Trie_has_prefix(const Trie trie, const unsigned char *prefix);
+
+
+/* Trie_with_prefix
+ * ----------------
+ * Iterate over all the keys in the trie that start with a prefix.
+ */
+void Trie_with_prefix(const Trie trie, const unsigned char *prefix,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data
+ );
+
+
+/* Trie_iterate
+ * ------------
+ * Iterate through everything stored in the trie. callback is a
+ * function that gets called for each thing in the trie. It is called
+ * in arbitrary order. data is a pointer to some arbitrary data and
+ * gets passed unchanged to the callback.
+ */
+void Trie_iterate(const Trie trie,
+ void (*callback)(const unsigned char *key,
+ const void *value,
+ void *data),
+ void *data
+ );
+
+/* Trie_serialize
+ * --------------
+ * Serialize a tree into a stream of bytes. This function takes a
+ * callback 'write' that should take a pointer to data and the length
+ * of the data in bytes. This will be called repeatedly until the
+ * whole Trie is serialized. When it is done, this function will call
+ * 'write' with a length of 0. Since the values are handled by the
+ * client, this function also takes a callback function 'write_value'
+ * so that the client can serialize their own values.
+ *
+ * This function is platform-dependent, so byte streams created on one
+ * machine may not necessarily port to another.
+ */
+int Trie_serialize(const Trie trie,
+ int (*write)(const void *towrite, const int length,
+ void *data),
+ int (*write_value)(const void *value, void *data),
+ void *data);
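+
+/* Sketch of a matching pair of 'write' callbacks targeting a FILE*
+ * (purely illustrative; it assumes the stored values are
+ * NUL-terminated strings, which this API does not require):
+ *
+ * static int write_cb(const void *towrite, const int length, void *data)
+ * {
+ * if(length == 0)
+ * return 1;
+ * return fwrite(towrite, length, 1, (FILE *)data) == 1;
+ * }
+ *
+ * static int write_value_cb(const void *value, void *data)
+ * {
+ * int len = strlen((const char *)value);
+ * return write_cb(&len, sizeof(len), data)
+ * && write_cb(value, len, data);
+ * }
+ *
+ * Trie_serialize(trie, write_cb, write_value_cb, (void *)fp);
+ */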
+
+
+
+/* Trie_deserialize
+ * ----------------
+ * Deserialize a tree that was previously serialized with
+ * Trie_serialize. This function takes a callback 'read' that should
+ * read 'length' bytes and save it to 'wasread'. 'read_value' should
+ * read a value and return a pointer to it. 'data' is a pointer that
+ * will be passed unchanged to 'read' and 'read_value'.
+ */
+Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data),
+ void *(*read_value)(void *data),
+ void *data);
--- /dev/null
+"""
+Given a trie, find all occurrences of a word in the trie in a string.
+
+Like searching a string for a substring, except that the substring is
+any word in a trie.
+
+Functions:
+match Find longest key in a trie matching the beginning of the string.
+match_all Find all keys in a trie matching the beginning of the string.
+find Find keys in a trie matching anywhere in a string.
+find_words Find keys in a trie matching whole words in a string.
+
+"""
+import string
+import re
+
+def match(string, trie):
+ """match(string, trie) -> longest key or None
+
+ Find the longest key in the trie that matches the beginning of the
+ string.
+
+ """
+ longest = None
+ for i in range(len(string)):
+ substr = string[:i+1]
+ if not trie.has_prefix(substr):
+ break
+ if trie.has_key(substr):
+ longest = substr
+ return longest
+
+def match_all(string, trie):
+ """match_all(string, trie) -> list of keys
+
+ Find all the keys in the trie that match the beginning of the
+ string.
+
+ """
+ matches = []
+ for i in range(len(string)):
+ substr = string[:i+1]
+ if not trie.has_prefix(substr):
+ break
+ if trie.has_key(substr):
+ matches.append(substr)
+ return matches
+
+def find(string, trie):
+ """find(string, trie) -> list of tuples (key, start, end)
+
+ Find all the keys in the trie that match anywhere in the string.
+
+ """
+ results = []
+ start = 0 # index to start the search
+ while start < len(string):
+ # Look for a match.
+ keys = match_all(string[start:], trie)
+ for key in keys:
+ results.append((key, start, start+len(key)))
+ start += 1
+ return results
+
+DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
+
+def find_words(string, trie):
+ """find_words(string, trie) -> list of tuples (key, start, end)
+
+ Find all the keys in the trie that match full words in the string.
+ Word boundaries are defined as any punctuation or whitespace.
+
+ """
+ _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
+
+ results = []
+ start = 0 # index of word boundary
+ while start < len(string):
+ # Look for a match.
+ keys = match_all(string[start:], trie)
+ for key in keys:
+ l = len(key)
+ # Make sure it ends at a boundary.
+ if start+l == len(string) or \
+ _boundary_re.match(string[start+l]):
+ results.append((key, start, start+l))
+ # Move forward to the next boundary.
+ m = _boundary_re.search(string, start)
+ if m is None:
+ break
+ start = m.end()
+ return results
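+
+# Illustrative usage, assuming a trie built with Bio.trie (any mapping
+# offering has_key and has_prefix will do):
+#
+# from Bio import trie
+# t = trie.trie()
+# t["hell"] = 1
+# t["hello"] = 2
+# match("hello world", t) -> "hello"
+# match_all("hello world", t) -> ["hell", "hello"]
+# find("say hello", t) -> [("hell", 4, 8), ("hello", 4, 9)]
+# find_words("say hello", t) -> [("hello", 4, 9)]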
--- /dev/null
+#include <Python.h>
+#include <marshal.h>
+#include "trie.h"
+
+#if PY_VERSION_HEX < 0x02050000
+#define Py_ssize_t int
+#endif
+
+
+
+staticforward PyTypeObject Trie_Type;
+
+typedef struct {
+ PyObject_HEAD
+ Trie trie;
+} trieobject;
+
+static PyObject*
+trie_trie(PyObject* self, PyObject* args)
+{
+ trieobject* trieobj;
+ Trie trie;
+
+ if (!PyArg_ParseTuple(args,":trie"))
+ return NULL;
+ if(!(trie = Trie_new()))
+ return PyErr_NoMemory();
+ if(!(trieobj = PyObject_New(trieobject, &Trie_Type)))
+ return NULL;
+ trieobj->trie = trie;
+ return (PyObject*)trieobj;
+}
+
+static void
+_decref_objects(const unsigned char *key, const void *value, void *data)
+{
+ Py_DECREF((PyObject *)value);
+}
+
+static void
+trie_dealloc(PyObject* self)
+{
+ trieobject *mp = (trieobject *)self;
+ Trie_iterate(mp->trie, _decref_objects, NULL);
+ Trie_del(mp->trie);
+ PyObject_Del(self);
+}
+
+static Py_ssize_t
+trie_length(trieobject *mp)
+{
+ return Trie_len(mp->trie);
+}
+
+static PyObject *
+trie_subscript(trieobject *mp, PyObject *py_key)
+{
+ unsigned char *key;
+ PyObject *py_value;
+
+ /* Make sure key is a string. */
+ if(!PyString_Check(py_key)) {
+ PyErr_SetString(PyExc_TypeError, "key must be a string");
+ return NULL;
+ }
+ key = (unsigned char *)PyString_AS_STRING(py_key);
+ py_value = (PyObject *)Trie_get(mp->trie, key);
+ if(py_value == NULL)
+ PyErr_SetString(PyExc_KeyError, (char *)key);
+ else
+ Py_INCREF(py_value);
+ return py_value;
+}
+
+static int
+trie_ass_sub(trieobject *mp, PyObject *py_key, PyObject *py_value)
+{
+ unsigned char *key;
+ PyObject *py_prev;
+
+ /* Make sure key is a string. */
+ if(!PyString_Check(py_key)) {
+ PyErr_SetString(PyExc_TypeError, "key must be a string");
+ return -1;
+ }
+ key = (unsigned char *)PyString_AS_STRING(py_key);
+
+ /* Check to see whether something already exists at that key. If
+ there's already an object there, then I will have to remove it.
+ */
+ py_prev = (PyObject *)Trie_get(mp->trie, key);
+ if(py_prev) {
+ Py_DECREF(py_prev);
+ }
+
+ /* The client wants to delete a key from a dictionary. The Trie
+ API doesn't support this, so I will just overwrite it with
+ NULL. */
+ if(!py_value) {
+ /* If the key doesn't exist, raise a KeyError. */
+ if(!py_prev) {
+ PyErr_SetString(PyExc_KeyError, (char *)key);
+ return -1;
+ }
+ Trie_set(mp->trie, key, NULL);
+ }
+ /* The client wants to set a key in the dictionary. */
+ else {
+ Py_INCREF(py_value);
+ if(Trie_set(mp->trie, key, py_value)) {
+ PyErr_SetString(PyExc_AssertionError, "error setting trie");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static char has_key__doc__[] =
+"D.has_key(k) -> 1 if D has a key k, else 0";
+
+static PyObject *
+trie_has_key(trieobject *mp, PyObject *py_key)
+{
+ unsigned char *key;
+ int has_key;
+
+ /* Make sure key is a string. */
+ if(!PyString_Check(py_key)) {
+ PyErr_SetString(PyExc_TypeError, "key must be a string");
+ return NULL;
+ }
+ key = (unsigned char *)PyString_AS_STRING(py_key);
+ has_key = Trie_has_key(mp->trie, key);
+ return PyInt_FromLong((long)has_key);
+}
+
+static PyObject *
+trie_has_key_onearg(trieobject *mp, PyObject *py_args)
+{
+ PyObject *py_arg;
+ if(!PyArg_ParseTuple(py_args, "O", &py_arg))
+ return NULL;
+ return trie_has_key(mp, py_arg);
+}
+
+
+
+static char has_prefix__doc__[] =
+"D.has_prefix(k) -> 1 if D has a prefix k, else 0";
+
+static PyObject *
+trie_has_prefix(trieobject *mp, PyObject *py_prefix)
+{
+ unsigned char *prefix;
+ int has_prefix;
+
+ /* Make sure prefix is a string. */
+ if(!PyString_Check(py_prefix)) {
+ PyErr_SetString(PyExc_TypeError, "k must be a string");
+ return NULL;
+ }
+ prefix = (unsigned char *)PyString_AS_STRING(py_prefix);
+ has_prefix = Trie_has_prefix(mp->trie, prefix);
+ return PyInt_FromLong((long)has_prefix);
+}
+
+static PyObject *
+trie_has_prefix_onearg(trieobject *mp, PyObject *py_args)
+{
+ PyObject *py_arg;
+ if(!PyArg_ParseTuple(py_args, "O", &py_arg))
+ return NULL;
+ return trie_has_prefix(mp, py_arg);
+}
+
+static char with_prefix__doc__[] =
+"D.with_prefix(prefix) -> list of D's keys that begin with prefix";
+
+static void
+_trie_with_prefix_helper(const unsigned char *key, const void *value,
+ void *data)
+{
+ PyObject *py_list = (PyObject *)data;
+ PyObject *py_key;
+
+ if(PyErr_Occurred())
+ return;
+
+ if(!(py_key = PyString_FromString((const char *)key)))
+ return;
+ PyList_Append(py_list, py_key);
+ Py_DECREF(py_key);
+}
+
+static PyObject *
+trie_with_prefix(trieobject *mp, PyObject *py_prefix)
+{
+ unsigned char *prefix;
+ PyObject *py_list;
+
+ /* Make sure prefix is a string. */
+ if(!PyString_Check(py_prefix)) {
+ PyErr_SetString(PyExc_TypeError, "k must be a string");
+ return NULL;
+ }
+ prefix = (unsigned char *)PyString_AS_STRING(py_prefix);
+
+ if(!(py_list = PyList_New(0)))
+ return NULL;
+ Trie_with_prefix(mp->trie, prefix,
+ _trie_with_prefix_helper, (void *)py_list);
+ if(PyErr_Occurred()) {
+ Py_DECREF(py_list);
+ return NULL;
+ }
+ return py_list;
+}
+
+static PyObject *
+trie_with_prefix_onearg(trieobject *mp, PyObject *py_args)
+{
+ PyObject *py_arg;
+ if(!PyArg_ParseTuple(py_args, "O", &py_arg))
+ return NULL;
+ return trie_with_prefix(mp, py_arg);
+}
+
+
+static char keys__doc__[] =
+"D.keys() -> list of D's keys";
+
+static void
+_trie_keys_helper(const unsigned char *key, const void *value, void *data)
+{
+ PyObject *py_list = (PyObject *)data;
+ PyObject *py_key;
+
+ if(PyErr_Occurred())
+ return;
+
+ if(!(py_key = PyString_FromString((char *)key)))
+ return;
+ PyList_Append(py_list, py_key);
+ Py_DECREF(py_key);
+}
+
+static PyObject *
+trie_keys(trieobject *mp)
+{
+ PyObject *py_list;
+
+ if(!(py_list = PyList_New(0)))
+ return NULL;
+ Trie_iterate(mp->trie, _trie_keys_helper, (void *)py_list);
+ if(PyErr_Occurred()) {
+ Py_DECREF(py_list);
+ return NULL;
+ }
+ return py_list;
+}
+
+static PyObject *
+trie_keys_noargs(trieobject *mp, PyObject *py_args)
+{
+ if(PyTuple_Size(py_args) != 0) {
+ PyErr_SetString(PyExc_ValueError, "no args expected");
+ return NULL;
+ }
+ return trie_keys(mp);
+}
+
+static char values__doc__[] =
+"D.values() -> list of D's values";
+
+static void
+_trie_values_helper(const unsigned char *key, const void *value, void *data)
+{
+ PyObject *py_list = (PyObject *)data;
+ if(PyErr_Occurred())
+ return;
+ PyList_Append(py_list, (PyObject *)value);
+}
+
+static PyObject *
+trie_values(trieobject *mp)
+{
+ PyObject *py_list;
+
+ if(!(py_list = PyList_New(0)))
+ return NULL;
+ Trie_iterate(mp->trie, _trie_values_helper, (void *)py_list);
+ if(PyErr_Occurred()) {
+ Py_DECREF(py_list);
+ return NULL;
+ }
+ return py_list;
+}
+
+static PyObject *
+trie_values_noargs(trieobject *mp, PyObject *py_args)
+{
+ if(PyTuple_Size(py_args) != 0) {
+ PyErr_SetString(PyExc_ValueError, "no args expected");
+ return NULL;
+ }
+ return trie_values(mp);
+}
+
+static char get__doc__[] =
+"D.get(k[,d]) -> D[k] if D.has_key(k), else d. d defaults to None.";
+
+static PyObject *
+trie_get(trieobject *mp, PyObject *args)
+{
+ unsigned char *key;
+ PyObject *py_value;
+ PyObject *py_failobj = Py_None;
+
+ if (!PyArg_ParseTuple(args, "s|O:get", &key, &py_failobj))
+ return NULL;
+ py_value = (PyObject *)Trie_get(mp->trie, key);
+ if(!py_value)
+ py_value = py_failobj;
+ Py_INCREF(py_value);
+ return py_value;
+}
+
+static char get_approximate__doc__[] =
+"D.get_approximate(key, k) -> List of (key, value, mismatches) in D, allowing up to k mismatches in key.";
+
+static void
+_trie_get_approximate_helper(const unsigned char *key, const void *value,
+ const int mismatches, void *data)
+{
+ /* Append a tuple of (key, value) to data, which is a PyList. */
+ PyObject *py_list = (PyObject *)data,
+ *py_value = (PyObject *)value,
+ *py_key,
+ *py_tuple,
+ *py_mismatches;
+
+ if(PyErr_Occurred())
+ return;
+
+ if(!(py_key = PyString_FromString((const char *)key)))
+ return;
+ if(!(py_mismatches = PyInt_FromLong(mismatches))) {
+ Py_DECREF(py_key);
+ return;
+ }
+ Py_INCREF(py_value);
+
+ if(!(py_tuple = PyTuple_New(3))) {
+ Py_DECREF(py_key);
+ Py_DECREF(py_mismatches);
+ Py_DECREF(py_value);
+ return;
+ }
+ PyTuple_SetItem(py_tuple, 0, py_key);
+ PyTuple_SetItem(py_tuple, 1, py_value);
+ PyTuple_SetItem(py_tuple, 2, py_mismatches);
+ PyList_Append(py_list, py_tuple);
+ Py_DECREF(py_tuple);
+}
+
+static PyObject *
+trie_get_approximate(trieobject *mp, PyObject *args)
+{
+ unsigned char *key;
+ int k;
+ PyObject *py_list;
+
+ if (!PyArg_ParseTuple(args, "si:get_approximate", &key, &k))
+ return NULL;
+
+ if(!(py_list = PyList_New(0)))
+ return NULL;
+ Trie_get_approximate(mp->trie, key, k,
+ _trie_get_approximate_helper, (void *)py_list);
+ if(PyErr_Occurred()) {
+ Py_DECREF(py_list);
+ return NULL;
+ }
+ return py_list;
+}
+
+static long
+trie_nohash(PyObject *self)
+{
+ PyErr_SetString(PyExc_TypeError, "trie objects are unhashable");
+ return -1;
+}
+
+static PyMappingMethods trie_as_mapping = {
+/* The first member of PyMappingMethods was redefined in Python 2.5. */
+#if PY_VERSION_HEX < 0x02050000
+ (inquiry)trie_length, /*mp_length*/
+#else
+ (lenfunc)trie_length, /*mp_length*/
+#endif
+ (binaryfunc)trie_subscript, /*mp_subscript*/
+ (objobjargproc)trie_ass_sub /*mp_ass_subscript*/
+};
+
+static PyMethodDef trieobj_methods[] = {
+ /* METH_O and METH_NOARGS require Python 2.2.
+ {"has_key", (PyCFunction)trie_has_key, METH_O,
+ has_key__doc__},
+ {"has_prefix", (PyCFunction)trie_has_prefix, METH_O,
+ has_prefix__doc__},
+ {"with_prefix", (PyCFunction)trie_with_prefix, METH_O,
+ with_prefix__doc__},
+ {"keys", (PyCFunction)trie_keys, METH_NOARGS,
+ keys__doc__},
+ {"values", (PyCFunction)trie_values, METH_NOARGS,
+ values__doc__},
+ */
+
+ {"has_key", (PyCFunction)trie_has_key_onearg, METH_VARARGS,
+ has_key__doc__},
+ {"has_prefix", (PyCFunction)trie_has_prefix_onearg, METH_VARARGS,
+ has_prefix__doc__},
+ {"with_prefix", (PyCFunction)trie_with_prefix_onearg, METH_VARARGS,
+ with_prefix__doc__},
+ {"keys", (PyCFunction)trie_keys_noargs, METH_VARARGS,
+ keys__doc__},
+ {"values", (PyCFunction)trie_values_noargs, METH_VARARGS,
+ values__doc__},
+
+ {"get", (PyCFunction)trie_get, METH_VARARGS,
+ get__doc__},
+ {"get_approximate", (PyCFunction)trie_get_approximate, METH_VARARGS,
+ get_approximate__doc__},
+ {NULL, NULL} /* sentinel */
+};
+
+static PyObject *trie_getattr(PyObject *obj, char *name)
+{
+ return Py_FindMethod(trieobj_methods, (PyObject *)obj, name);
+}
+
+static PyTypeObject Trie_Type = {
+ PyObject_HEAD_INIT(NULL)
+ 0,
+ "trie",
+ sizeof(trieobject),
+ 0,
+ trie_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ trie_getattr, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ &trie_as_mapping, /*tp_as_mapping*/
+ trie_nohash, /*tp_hash */
+};
+
+static int
+_write_to_handle(const void *towrite, const int length, void *handle)
+{
+ PyObject *py_handle = (PyObject *)handle,
+ *py_retval = NULL;
+ int success = 0;
+
+ if(!length)
+ return 1;
+
+ if(!(py_retval = PyObject_CallMethod(py_handle, "write", "s#",
+ towrite, length)))
+ goto _write_to_handle_cleanup;
+ success = 1;
+
+ _write_to_handle_cleanup:
+ if(py_retval) {
+ Py_DECREF(py_retval);
+ }
+ return success;
+}
+
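+/* On-disk format for each value (mirrored by _read_value_from_handle
+ below): the marshalled byte count is written first, as the raw bytes of
+ a Py_ssize_t, followed by the marshalled bytes themselves. */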
+static int _write_value_to_handle(const void *value, void *handle)
+{
+ PyObject *py_value = (PyObject *)value,
+ *py_marshalled = NULL;
+ char *marshalled;
+ Py_ssize_t length;
+ int success = 0;
+
+#ifdef Py_MARSHAL_VERSION
+ if(!(py_marshalled =
+ PyMarshal_WriteObjectToString(py_value, Py_MARSHAL_VERSION)))
+ goto _write_value_to_handle_cleanup;
+#else
+ if(!(py_marshalled = PyMarshal_WriteObjectToString(py_value)))
+ goto _write_value_to_handle_cleanup;
+#endif
+ if(PyString_AsStringAndSize(py_marshalled, &marshalled, &length) == -1)
+ goto _write_value_to_handle_cleanup;
+ if (length != (int)length) /* must fit in an int before anything is written */
+ goto _write_value_to_handle_cleanup;
+ if(!_write_to_handle(&length, sizeof(length), handle))
+ goto _write_value_to_handle_cleanup;
+ if(!_write_to_handle(marshalled, (int)length, handle))
+ goto _write_value_to_handle_cleanup;
+ success = 1;
+
+ _write_value_to_handle_cleanup:
+ if(py_marshalled) {
+ Py_DECREF(py_marshalled);
+ }
+
+ return success;
+}
+
+static PyObject *
+trie_save(PyObject *self, PyObject *args)
+{
+ PyObject *py_handle,
+ *py_trie;
+ trieobject *mp;
+
+ if(!PyArg_ParseTuple(args, "OO:save", &py_handle, &py_trie))
+ return NULL;
+ mp = (trieobject *)py_trie;
+ if(!Trie_serialize(mp->trie, _write_to_handle, _write_value_to_handle,
+ (void *)py_handle)) {
+ if(!PyErr_Occurred())
+ PyErr_SetString(PyExc_RuntimeError,
+ "saving failed for some reason");
+ return NULL;
+ }
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static int
+_read_from_handle(void *wasread, const int length, void *handle)
+{
+ PyObject *py_handle = (PyObject *)handle,
+ *py_retval = NULL;
+ void *retval;
+ int success = 0;
+ PyBufferProcs *buffer;
+ int segment;
+ int bytes_read, bytes_left;
+
+ if(!length)
+ return 1;
+
+ if(!(py_retval = PyObject_CallMethod(py_handle, "read", "i", length)))
+ goto _read_from_handle_cleanup;
+ if(!py_retval->ob_type->tp_as_buffer) {
+ PyErr_SetString(PyExc_ValueError, "read method should return buffer");
+ goto _read_from_handle_cleanup;
+ }
+ if(!(py_retval->ob_type->tp_flags & Py_TPFLAGS_HAVE_GETCHARBUFFER)) {
+ PyErr_SetString(PyExc_ValueError, "no bf_getcharbuffer slot");
+ goto _read_from_handle_cleanup;
+ }
+ buffer = py_retval->ob_type->tp_as_buffer;
+ if(!buffer->bf_getreadbuffer) {
+ PyErr_SetString(PyExc_ValueError, "no bf_getreadbuffer");
+ goto _read_from_handle_cleanup;
+ }
+
+ bytes_left = length;
+ segment = 0;
+ while(bytes_left > 0) {
+ if((bytes_read = buffer->bf_getreadbuffer(py_retval,
+ segment, &retval)) == -1)
+ goto _read_from_handle_cleanup;
+ memcpy(wasread, retval, bytes_read);
+ wasread = (void *)((char *)wasread + bytes_read);
+ bytes_left -= bytes_read;
+ segment += 1;
+ }
+
+ success = 1;
+
+ _read_from_handle_cleanup:
+ if(py_retval) {
+ Py_DECREF(py_retval);
+ }
+ return success;
+}
+
+#define MAX_KEY_LENGTH 2000
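+/* Note: each marshalled value is read into a fixed-size stack buffer, so
+ loading fails (returns NULL) for any value whose marshalled form is
+ MAX_KEY_LENGTH bytes or longer. */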
+static void *
+_read_value_from_handle(void *handle)
+{
+ Py_ssize_t length;
+ char KEY[MAX_KEY_LENGTH];
+
+ if(!_read_from_handle((void *)&length, sizeof(length), (void *)handle))
+ return NULL;
+ if(length < 0 || length >= MAX_KEY_LENGTH)
+ return NULL;
+ if(!_read_from_handle((void *)KEY, length, (void *)handle))
+ return NULL;
+ return PyMarshal_ReadObjectFromString(KEY, length);
+}
+
+
+static PyObject *
+trie_load(PyObject *self, PyObject *args)
+{
+ PyObject *py_handle;
+ Trie trie;
+ trieobject *trieobj;
+
+ if(!PyArg_ParseTuple(args, "O:load", &py_handle))
+ return NULL;
+
+ if(!(trie = Trie_deserialize(_read_from_handle, _read_value_from_handle,
+ py_handle))) {
+ if(!PyErr_Occurred())
+ PyErr_SetString(PyExc_RuntimeError,
+ "loading failed for some reason");
+ return NULL;
+ }
+
+ if(!(trieobj = PyObject_New(trieobject, &Trie_Type))) {
+ Trie_del(trie);
+ return NULL;
+ }
+ trieobj->trie = trie;
+ return (PyObject *)trieobj;
+}
+
+static PyMethodDef trie_methods[] = {
+ {"trie", trie_trie, METH_VARARGS,
+ "trie() -> new Trie object."},
+ {"load", trie_load, METH_VARARGS,
+ "load(handle) -> trie object"},
+ {"save", trie_save, METH_VARARGS,
+ "save(handle, trie), save a trie object to a handle"},
+ {NULL, NULL, 0, NULL}
+};
+
+static char trie__doc__[] =
+"\
+This module implements a trie data structure. This allows an O(M)\n\
+lookup of a string in a dictionary, where M is the length of the\n\
+string. It also supports approximate matches.\n\
+\n\
+Functions:\n\
+trie Create a new trie object.\n\
+save Save a trie to a handle.\n\
+load Load a trie from a handle.\n\
+\n\
+";
+
+DL_EXPORT(void)
+inittrie(void)
+{
+ Trie_Type.ob_type = &PyType_Type;
+
+ (void) Py_InitModule3("trie", trie_methods, trie__doc__);
+}
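+
+/*
+ * A minimal usage sketch from Python 2 (assumes the extension is built and
+ * importable as Bio.trie, as in the Biopython distribution):
+ *
+ *     from Bio import trie
+ *     t = trie.trie()
+ *     t["hello"] = 5                  # keys must be strings
+ *     t.has_key("hello")              # 1
+ *     t.with_prefix("he")             # ["hello"]
+ *     t.get_approximate("hallo", 1)   # [("hello", 5, 1)]
+ *
+ *     trie.save(open("my.trie", "wb"), t)    # values go through marshal
+ *     t2 = trie.load(open("my.trie", "rb"))
+ */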
--- /dev/null
+# Copyright 2000 by Andrew Dalke.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Miscellaneous functions for dealing with sequences (obsolete?)."""
+
+import Seq
+import Alphabet
+
+from PropertyManager import default_manager
+
+def translate(seq, id = None):
+ """Translate a sequence (DEPRECATED)."""
+ import warnings
+ warnings.warn("Bio.utils.translate() has been deprecated, and we" \
+ +" intend to remove it in a future release of Biopython."\
+ +" Please use the translate method or function in Bio.Seq"\
+ +" instead, as described in the Tutorial.",
+ DeprecationWarning)
+ if id is None:
+ s = "translator"
+ else:
+ s = "translator.id.%d" % id
+ translator = default_manager.resolve(seq.alphabet, s)
+ return translator.translate(seq)
+
+def translate_to_stop(seq, id = None):
+ """Translate a sequence up to the first in frame stop codon (DEPRECATED)."""
+ import warnings
+ warnings.warn("Bio.utils.translate_to_stop() has been deprecated, and we" \
+ +" intend to remove it in a future release of Biopython."\
+ +" Please use the translate method or function in Bio.Seq"\
+ +" instead, as described in the Tutorial.",
+ DeprecationWarning)
+ if id is None:
+ s = "translator"
+ else:
+ s = "translator.id.%d" % id
+ translator = default_manager.resolve(seq.alphabet, s)
+ return translator.translate_to_stop(seq)
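+
+# A minimal sketch of the replacement recommended in the warnings above,
+# using the Seq object's own translate method (to_stop=True gives the
+# translate_to_stop behaviour):
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     my_seq = Seq("ATGGCCATTTAA", IUPAC.unambiguous_dna)
+#     my_seq.translate()              # Seq("MAI*", ...), includes stop symbol
+#     my_seq.translate(to_stop=True)  # Seq("MAI", ...), stops at first stop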
+
+def back_translate(seq, id = None):
+ """Back-translate a sequence (DEPRECATED)."""
+ import warnings
+ warnings.warn("Bio.utils.back_translate() has been deprecated, and we" \
+ +" intend to remove it in a future release of Biopython."\
+ +" If you use it, please tell us on the mailing list.",
+ DeprecationWarning)
+ if id is None:
+ s = "translator"
+ else:
+ s = "translator.id.%d" % id
+ translator = default_manager.resolve(seq.alphabet, s)
+ return translator.back_translate(seq)
+
+
+def transcribe(seq):
+ """Transcribe a sequence (DEPRECATED)."""
+ import warnings
+ warnings.warn("Bio.utils.transcribe() has been deprecated, and we" \
+ +" intend to remove it in a future release of Biopython."\
+ +" Please use the transcribe method or function in"\
+ +" Bio.Seq instead, as described in the Tutorial.",
+ DeprecationWarning)
+ transcriber = default_manager.resolve(seq.alphabet, "transcriber")
+ return transcriber.transcribe(seq)
+
+def back_transcribe(seq):
+ """Back-transcribe a sequence (DEPRECATED)."""
+ import warnings
+ warnings.warn("Bio.utils.back_transcribe() has been deprecated, and we" \
+ +" intend to remove it in a future release of Biopython."\
+ +" Please use the back_transcribe method or function in"\
+ +" Bio.Seq instead, as described in the Tutorial.",
+ DeprecationWarning)
+ transcriber = default_manager.resolve(seq.alphabet, "transcriber")
+ return transcriber.back_transcribe(seq)
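+
+# Likewise for transcription, a sketch of the recommended replacement:
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     dna = Seq("ACGT", IUPAC.unambiguous_dna)
+#     rna = dna.transcribe()      # Seq("ACGU", IUPACUnambiguousRNA())
+#     rna.back_transcribe()       # back to the original DNA sequence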
+
+def ungap(seq):
+ """given a sequence with gap encoding, return the ungapped sequence"""
+ #TODO - Fix this? It currently assumes the outmost AlphabetEncoder
+ #is for the gap. Consider HasStopCodon(Gapped(Protein())) as a test case.
+ gap = seq.alphabet.gap_char # the gap character is stored on the alphabet, not the Seq
+ letters = []
+ for c in seq.data:
+ if c != gap:
+ letters.append(c)
+ return Seq.Seq("".join(letters), seq.alphabet.alphabet)
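+
+# Sketch (assumes a Gapped alphabet supplying gap_char, as the TODO above
+# describes):
+#
+#     from Bio.Alphabet import Gapped, IUPAC
+#     s = Seq.Seq("AC-GT-", Gapped(IUPAC.unambiguous_dna, "-"))
+#     ungap(s)   # Seq("ACGT", IUPACUnambiguousDNA())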
+
+def verify_alphabet(seq):
+ """Check that every letter in the sequence is in its alphabet (returns 0 or 1)."""
+ letters = {}
+ for c in seq.alphabet.letters:
+ letters[c] = 1
+ try:
+ for c in seq.data:
+ letters[c]
+ except KeyError:
+ return 0
+ return 1
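+
+# Example (sketch): letters outside the declared alphabet fail the check:
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     verify_alphabet(Seq("MAIVMGR", IUPAC.protein))   # 1
+#     verify_alphabet(Seq("MAIVMGR*", IUPAC.protein))  # 0, '*' is not a letter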
+
+def count_monomers(seq):
+ """Count how many times each alphabet letter occurs in seq (returns a dict)."""
+ dict = {}
+# bugfix: string.count(s,c) raises an AttributeError. Iddo Friedberg 16 Mar. 04
+# s = buffer(seq.data) # works for strings and array.arrays
+ for c in seq.alphabet.letters:
+ dict[c] = seq.data.count(c)
+ return dict
+
+def percent_monomers(seq):
+ """Give each alphabet letter's share of seq as a percentage (returns a dict)."""
+ dict2 = {}
+ seq_len = len(seq)
+ dict = count_monomers(seq)
+ for m in dict:
+ dict2[m] = dict[m] * 100. / seq_len
+ return dict2
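+
+# Example (sketch):
+#
+#     from Bio.Seq import Seq
+#     from Bio.Alphabet import IUPAC
+#     s = Seq("ACCGGGT", IUPAC.unambiguous_dna)
+#     count_monomers(s)    # {'A': 1, 'C': 2, 'T': 1, 'G': 3}
+#     percent_monomers(s)  # same keys, values scaled by 100.0 / len(s)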
+
+def sum(seq, table, zero = 0.0):
+ """Sum table[c] over the characters of seq (a Seq or a plain string)."""
+ total = zero
+ for c in getattr(seq, "data", seq):
+ total = total + table[c]
+ return total
+
+# For ranged addition
+def sum_2ple(seq, table, zero = (0.0, 0.0)):
+ """Like sum, but table maps each letter to a pair; pairs are summed component-wise."""
+ x, y = zero
+ data = getattr(seq, "data", seq)
+ for c in data:
+ x2, y2 = table[c]
+ x = x + x2
+ y = y + y2
+ return (x, y)
+
+def total_weight(seq, weight_table = None):
+ if weight_table is None:
+ weight_table = default_manager.resolve(seq.alphabet, "weight_table")
+ return sum(seq, weight_table)
+
+def total_weight_range(seq, weight_table = None):
+ if weight_table is None:
+ weight_table = default_manager.resolve(seq.alphabet, "weight_range_table")
+ return sum_2ple(seq, weight_table)
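+
+# Example (sketch, with a toy table; total_weight works the same way but
+# resolves a real weight table from the sequence's alphabet):
+#
+#     sum("ACCA", {"A": 1.0, "C": 2.0})                   # 6.0
+#     sum_2ple("AC", {"A": (1.0, 2.0), "C": (3.0, 4.0)})  # (4.0, 6.0)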
+
+def reduce_sequence(seq, reduction_table, new_alphabet=None):
+ """Given an amino-acid sequence, return it in reduced alphabet form.
+
+ Uses the letter-translation table passed; some "standard" tables are in
+ Alphabet.Reduced.
+
+ seq: a Seq.Seq type sequence
+ reduction_table: a dictionary whose keys are the "from" alphabet, and
+ values are the "to" alphabet
+ """
+ if new_alphabet is None:
+ new_alphabet = Alphabet.single_letter_alphabet
+ new_alphabet.letters = ''
+ for letter in reduction_table:
+ new_alphabet.letters += letter
+ new_alphabet.size = len(new_alphabet.letters)
+ new_seq = Seq.Seq('',new_alphabet)
+ for letter in seq:
+ new_seq += reduction_table[letter]
+ return new_seq
+
+
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+# This is a Python module.
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+# Not clear on the distinction, if any, between 'embl' and 'embl/65'. This
+# code might apply to either or both.
+
+# See 'http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html' for a
+# definition of this file format.
+
+# This code only makes a best effort--the output may not be strictly valid.
+# So, for example, the EMBL ID is supposed to be alphanumeric, starting with a
+# letter, but we don't check for this, etc.
+
+
+# Example:
+# ID AA03518 standard; DNA; FUN; 237 BP.
+# XX
+# AC U03518;
+# XX
+# DE Aspergillus awamori internal transcribed spacer 1 (ITS1) and 18S
+# DE rRNA and 5.8S rRNA genes, partial sequence.
+# XX
+# SQ Sequence 237 BP; 41 A; 77 C; 67 G; 52 T; 0 other;
+# aacctgcgga aggatcatta ccgagtgcgg gtcctttggg cccaacctcc catccgtgtc 60
+# tattgtaccc tgttgcttcg gcgggcccgc cgcttgtcgg ccgccggggg ggcgcctctg 120
+# ccccccgggc ccgtgcccgc cggagacccc aacacgaaca ctgtctgaaa gcgtgcagtc 180
+# tgagttgatt gaatgcaatc agttaaaact ttcaacaatg gatctcttgg ttccggc 237
+# //
+
+
+import textwrap
+
+from Bio import Alphabet
+from Bio import Writer
+
+class WriteEmbl(Writer.Writer):
+ def __init__(self, outfile):
+ Writer.Writer.__init__(self, outfile)
+
+ def write(self, record):
+ seq = record.seq
+ assert seq.alphabet.size == 1, "cannot handle alphabet of size %d" % \
+ seq.alphabet.size
+ data = seq.data
+ upperdata = data.upper()
+
+ # It'd be nice if the alphabet was usefully set, but for many interesting
+ # cases (e.g., reading from FASTA files), it's not.
+
+ if isinstance(seq.alphabet, Alphabet.RNAAlphabet):
+ molecule = 'mRNA'
+ letters = ['A', 'C', 'G', 'U']
+ else:
+ molecule = 'DNA'
+ letters = ['A', 'C', 'G', 'T']
+
+ division = 'UNC' # unknown
+
+ self.outfile.write("ID %s standard; %s; %s; %d BP.\n"
+ % (record.id, molecule, division, len(data)))
+
+ desclist = textwrap.wrap(record.description, 74)
+ for l in desclist:
+ self.outfile.write("DE %s\n" % l)
+
+ counts = [ upperdata.count(l) for l in letters ]
+ othercount = len(upperdata) - sum(counts)
+
+ countstring = ''.join([ " %d %s;" % p for p in zip(counts, letters) ])
+
+ self.outfile.write("SQ Sequence %s BP;%s %d other;\n"
+ % (len(data), countstring, othercount))
+
+ rowlength = 60
+ blocklength = 10
+ for i in xrange(0, len(data), rowlength):
+ self.outfile.write(" " * 5)
+ row = data[i:i+rowlength]
+ for b in xrange(0, rowlength, blocklength):
+ block = row[b:b+blocklength]
+ self.outfile.write("%-*s" % (blocklength+1, block))
+ self.outfile.write("%9d\n" % min(i+rowlength, len(data)))
+
+ self.outfile.write("//\n")
+
+
+make_writer = WriteEmbl
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+from Bio import Writer
+
+class WriteEmpty(Writer.Writer):
+ pass
+
+make_writer = WriteEmpty
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+from Bio import Writer
+
+class WriteFasta(Writer.Writer):
+ def __init__(self, outfile, seqwidth = 72):
+ Writer.Writer.__init__(self, outfile)
+ assert seqwidth > 0, seqwidth
+ self.seqwidth = seqwidth
+
+ def write(self, record):
+ self.outfile.write(">%s %s\n" % (record.id, record.description))
+ seq = record.seq
+ assert seq.alphabet.size == 1, "cannot handle alphabet of size %d" % \
+ seq.alphabet.size
+ seq = seq.data
+ seqwidth = self.seqwidth
+ for i in range(0, len(seq), seqwidth):
+ self.outfile.write(seq[i:i+seqwidth])
+ self.outfile.write("\n")
+
+make_writer = WriteFasta
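+
+# Usage sketch (inferred from the interface above; the framework itself is
+# deprecated and otherwise undocumented):
+#
+#     import sys
+#     w = WriteFasta(sys.stdout, seqwidth=60)
+#     w.write(record)   # record must provide .id, .description and .seq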
--- /dev/null
+"""Part of an old unused and undocumented sequence writing framework (DEPRECATED)."""
+# This is a Python module.
+# (there are more files underneath this directory)
--- /dev/null
+CONTRIBUTORS
+============
+
+This is a list of people who have made contributions to Biopython.
+This is certainly not comprehensive, and if you've been overlooked
+(sorry!), please mention it on the development mailing list.
+
+Cecilia Alsmark <Cecilia.Alsmark at domain ebc.uu.se>
+Tiago Antao <tiagoantao at gmail.com>
+Sebastian Bassi <sbassi at domain asalup.org>
+Bill Barnard <bill at domain barnard-engineering.com>
+Yves Bastide <ybastide at domain irisa.fr>
+Yair Benita <Y.Benita at domain pharm.uu.nl>
+Peter Bienstman <Peter.Bienstman at domain rug.ac.be>
+Bob Bussell <rgb2003 at domain med.cornell.edu>
+Diego Brouard <diego at domain conysis.com>
+James Casbon <j.a.casbon at domain qmul.ac.uk>
+Hye-Shik Chang <perky at domain fallin.lv>
+Jeffrey Chang <jchang at domain smi.stanford.edu>
+Brad Chapman <chapmanb at domain arches.uga.edu>
+Peter Cock <p.j.a.cock at googlemail dot com>
+Marc Colosimo <mcolosimo at domain mitre.org>
+Cymon J Cox <cymon at domain duke.edu>
+Gavin E Crooks <gec at domain compbio.berkeley.edu>
+Andrew Dalke <dalke at domain acm.org>
+Michiel de Hoon <mdehoon at domain c2b2.columbia.edu>
+Sjoerd de Vries <sjoerd at domain nmr.chem.uu.nl>
+Iddo Friedberg <idoerg at domain burnham.org>
+Bertrand Frottier <bertrand.frottier at domain free.fr>
+Jason A. Hackney <jhackney at domain stanford.edu>
+Thomas Hamelryck <thamelry at domain binf.ku.dk>
+Michael Hoffman <hoffman+biopython at domain ebi.ac.uk>
+Yu Huang <krocea at domain yahoo.com.cn>
+Frank Kauff <fkauff at domain duke.edu>
+Andreas Kuntzagk <andreas.kuntzagk at domain mdc-berlin.de>
+Michal Kurowski <michal at domain genesilico.pl>
+Chris Lasher <chris.lasher at gmail.com>
+Gaetan Lehman <gaetan.lehmann at domain jouy.inra.fr>
+Katharine Lindner <katel at domain worldpath.net>
+Tarjei Mikkelsen <tarjei at domain genome.wi.mit.edu>
+Cheng Soon Ong <chengsoon.ong at tuebingen.mpg.de>
+Mike Poidinger <Michael.Poidinger at domain eBioinformatics.com>
+Leighton Pritchard <lpritc at domain scri.sari.ac.uk>
+Wolfgang Schueler <wolfgang at domain proceryon.at>
+Peter Slickers <piet at domain clondiag.com>
+Thomas Sicheritz-Ponten <thomas at domain cbs.dtu.dk>
+Frederic Sohm <fsms at domain users.sourceforge.net>
+Thomas Rosleff Soerensen <rosleff at domain mpiz-koeln.mpg.de>
+Johann Visagie <wjv at domain cityip.co.za>
+Dan Vogel <dmv at domain andrew.cmu.edu>
+David Weisman <david.weisman at domain acm.org>
+Bartek Wilczynski <bartek at domain rezolwenta.eu.org>
+Harry Zuzan <iliketobicycle at domain yahoo.ca>
--- /dev/null
+ Biopython License Agreement
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null
+Metadata-Version: 1.0
+Name: biopython
+Version: 1.50
+Summary: Freely available tools for computational molecular biology.
+Home-page: http://www.biopython.org/
+Author: The Biopython Consortium
+Author-email: biopython@biopython.org
+License: UNKNOWN
+Download-URL: http://biopython.org/DIST/
+Description: UNKNOWN
+Platform: UNKNOWN
--- /dev/null
+#### THIS IS A REDUCED DISTRIBUTION OF BIOPYTHON ####
+
+**Biopython README file**
+
+ "The Biopython Project":http://www.biopython.org/ is an
+international association of developers of freely available Python
+tools for computational molecular biology.
+
+biopython.org provides an online resource for modules, scripts, and
+web links for developers of Python-based software for life science
+research. Thanks to bioperl, we can also provide web, FTP and CVS
+space for individuals and organizations wishing to distribute or
+otherwise make available standalone scripts & code.
+
+This Biopython package is made available under generous terms. Please
+see the LICENSE file for further details.
+
+
+**For the impatient**
+
+To build and install Biopython, download and unzip the source code,
+go to this directory at the command line, and type:
+
+python setup.py build
+python setup.py test
+python setup.py install
+
+**System Requirements**
+
+o "Python 2.3, 2.4, 2.5 or 2.6":http://www.python.org/
+ Note that Biopython 1.50 is expected to be our final release supporting
+ Python 2.3. Given that Python 2.6 is still very new, it would be prudent
+ to opt for Python 2.5 or 2.4 at this time.
+
+o "NumPy":http://numpy.scipy.org/ (optional, but strongly recommended)
+ This package is only used in the computationally-oriented modules.
+ It is required for Bio.Cluster, Bio.PDB and a few other modules. If you
+ think you might need these modules, then please install NumPy first BEFORE
+ installing Biopython. The older Numeric library is no longer supported in
+ Biopython.
+
+o "ReportLab":http://www.reportlab.org/downloads.html (optional)
+ This package is only used in Bio.Graphics, so if you do not need this
+ functionality, you will not need to install this package. You can install
+ it later if needed.
+
+o "psycopg":http://initd.org/software/psycopg/ (optional) or
+ "pgdb":http://www.druid.net/pygresql/ (optional)
+ These packages are used by BioSQL to access a PostgreSQL database.
+
+o "MySQLdb":http://sourceforge.net/projects/mysql-python (optional)
+ This package is used by BioSQL or Bio.GFF to access a MySQL database.
+
+o "mxTextTools":http://www.egenix.com/files/python/mxTextTools.html (unlikely)
+ You probably won't need mxTextTools. This was used in some of Biopython's
+ older parsers, and Martel/Mindy, but these have all been deprecated now.
+
+In addition there are a number of useful third party tools you may wish to
+install such as standalone NCBI BLAST or ClustalW.
+
+
+**Installation**
+
+*** Make sure that Python is installed correctly ***
+
+Installation should be as simple as going to the biopython source code
+directory, and typing:
+
+ 'python setup.py build'
+ 'python setup.py test'
+ 'sudo python setup.py install'
+
+If you need to do additional configuration, e.g. changing the base
+directory, please type 'python setup.py', or see the documentation for
+Distutils.
+
+
+**Testing**
+
+Biopython includes a suite of regression tests to check if everything is
+running correctly. To do the tests, go to the biopython source code directory
+and type:
+
+ 'python setup.py test'
+
+Do not panic if you see messages warning of skipped tests:
+ test_DocSQL ... skipping. Install MySQLdb if you want to use Bio.DocSQL.
+
+This most likely means that a package is not installed. You can
+ignore this if it occurs in the tests for a module that you were not
+planning on using. If you did want to use that module, please install
+the required dependency and re-run the tests.
+
+
+**Bugs**
+
+While we try to ship a robust package, bugs inevitably pop up. If you
+are having problems that might be caused by a bug in Biopython, it is
+possible that it has already been identified. Search the
+"bug database":http://bugzilla.open-bio.org/ and mailing lists
+to see if it has already been reported (and hopefully fixed).
+
+If you suspect the problem lies within a parser, it is likely that the
+data format has changed and broken the parsing code. (The BLAST and
+GenBank formats seem to be particularly fragile.) Thus, the parsing
+code in Biopython is sometimes updated faster than we can build Biopython
+releases. You can get the most recent parser by pulling the relevant
+files (e.g. the ones in Bio.SeqIO or Bio.Blast) out of
+"anonymous cvs":http://cvs.biopython.org/ .
+However, be careful when doing this, because the code in CVS is not as
+well-tested as released code, and may contain new dependencies.
+
+Finally, you can send a bug report to the bug database or
+biopython@biopython.org. In the bug report, please let us know 1)
+which operating system and hardware you are using, 2) Python version,
+3) Biopython version (or CVS version/date), 4) traceback that occurs,
+5) offending code, and 6) data file that causes the problem.
+
+
+
+**Contributing, Bug Reports**
+
+Biopython is run by volunteers from all over the world, with many
+types of backgrounds. We are always looking for people interested in
+helping with code development, web-site management, documentation
+writing, technical administration, and whatever else comes up.
+
+If you wish to contribute, please visit the
+"web site":http://www.biopython.org
+and join our "mailing list":http://biopython.org/wiki/Mailing_lists
+
+
+
+**Distribution Structure**
+
+README -- This file.
+
+NEWS -- Release notes and news
+
+LICENSE -- What you can do with the code.
+
+CONTRIB -- An (incomplete) list of people who helped Biopython in
+ one way or another.
+
+DEPRECATED -- Contains information about modules in Biopython that are
+ removed or no longer recommended for use, and how to update
+ code that uses those modules.
+
+MANIFEST.in -- Tells distutils what files to distribute
+
+setup.py -- Installation file.
+
+Bio/ -- The main code base.
+
+Martel/ -- Code for the Martel parsing system, once used in many
+ Biopython parsers but now deprecated.
+
+BioSQL/ -- Code for using Biopython with BioSQL databases.
+
+Doc/ -- Documentation.
+
+Scripts/ -- Miscellaneous, possibly useful, standalone scripts
+
+Tests/ -- Regression testing code