From: pvtroshin Date: Tue, 8 Feb 2011 17:27:02 +0000 (+0000) Subject: Copying Bio-python to globplot to satisfy the dependency X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=119df1cedad3d4760e6fd458713da2488eff79cc;p=jabaws.git Copying Bio-python to globplot to satisfy the dependency git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@3719 e3abac25-378b-4346-85de-24260fe3988d --- diff --git a/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.py b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.py new file mode 100644 index 0000000..bf5a1e7 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.py @@ -0,0 +1,127 @@ +# Copyright 2000-2001 by Andrew Dalke. +# Revisions copyright 2008 by Peter Cock. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Standard nucleotide and protein alphabets defined by IUPAC.""" + +from Bio import Alphabet +from Bio.Data import IUPACData + +##################### Protein + +# From the IUPAC definition at: +# http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21 + +assert IUPACData.extended_protein_letters == IUPACData.extended_protein_letters.upper() +class ExtendedIUPACProtein(Alphabet.ProteinAlphabet): + """Extended uppercase IUPAC protein single letter alphabet including X etc. + + In addition to the standard 20 single letter protein codes, this includes: + + B = "Asx"; Aspartic acid (R) or Asparagine (N) + X = "Xxx"; Unknown or 'other' amino acid + Z = "Glx"; Glutamic acid (E) or Glutamine (Q) + J = "Xle"; Leucine (L) or Isoleucine (I), used in mass-spec (NMR) + U = "Sec"; Selenocysteine + O = "Pyl"; Pyrrolysine + + This alphabet is not intended to be used with X for Selenocysteine + (an ad-hoc standard prior to the IUPAC adoption of U instead). + """ + letters = IUPACData.extended_protein_letters + +extended_protein = ExtendedIUPACProtein() + +assert IUPACData.protein_letters == IUPACData.protein_letters.upper() +class IUPACProtein(ExtendedIUPACProtein): + """Uppercase IUPAC protein single letter alphabet of the 20 standard amino acids.""" + letters = IUPACData.protein_letters + +protein = IUPACProtein() + +##################### DNA + +# The next two are the IUPAC definitions, from: +# http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html +class IUPACAmbiguousDNA(Alphabet.DNAAlphabet): + """Uppercase IUPAC ambiguous DNA.""" + letters = IUPACData.ambiguous_dna_letters + +ambiguous_dna = IUPACAmbiguousDNA() + +class IUPACUnambiguousDNA(IUPACAmbiguousDNA): + """Uppercase IUPAC unambiguous DNA (letters GATC only).""" + letters = IUPACData.unambiguous_dna_letters + +unambiguous_dna = IUPACUnambiguousDNA() + + +# Also from the URL, but not part of the standard +class ExtendedIUPACDNA(Alphabet.DNAAlphabet): + """Extended IUPAC DNA alphabet. 
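+
+    (In practice this class simply sets letters =
+    IUPACData.extended_dna_letters, i.e. "GATCBDSW".)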
+
+    In addition to the standard letter codes GATC, this includes:
+
+    B = 5-bromouridine
+    D = 5,6-dihydrouridine
+    S = thiouridine
+    W = wyosine
+    """
+    letters = IUPACData.extended_dna_letters
+
+extended_dna = ExtendedIUPACDNA()
+
+##################### RNA
+
+class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
+    """Uppercase IUPAC ambiguous RNA."""
+    letters = IUPACData.ambiguous_rna_letters
+
+ambiguous_rna = IUPACAmbiguousRNA()
+
+class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
+    """Uppercase IUPAC unambiguous RNA (letters GAUC only)."""
+    letters = IUPACData.unambiguous_rna_letters
+
+unambiguous_rna = IUPACUnambiguousRNA()
+
+# are there extended forms?
+#class ExtendedIUPACRNA(Alphabet.RNAAlphabet):
+#    letters = extended_rna_letters
+#    # B == 5-bromouridine
+#    # D == 5,6-dihydrouridine
+#    # S == thiouridine
+#    # W == wyosine
+
+
+# We need to load the property resolution information, but we need to
+# wait until after the systems have been loaded. (There's a nasty loop
+# where, e.g., translation objects need an alphabet, which needs to be
+# associated with translators.)
+
+from Bio.PropertyManager import default_manager
+
+def _bootstrap(manager, klass, property):
+    assert manager is default_manager
+    del default_manager.class_resolver[IUPACProtein]
+    del default_manager.class_resolver[ExtendedIUPACProtein]
+    del default_manager.class_resolver[IUPACAmbiguousDNA]
+    del default_manager.class_resolver[IUPACUnambiguousDNA]
+    del default_manager.class_resolver[ExtendedIUPACDNA]
+    del default_manager.class_resolver[IUPACAmbiguousRNA]
+    del default_manager.class_resolver[IUPACUnambiguousRNA]
+
+    from Bio.Encodings import IUPACEncoding
+
+    return manager.resolve_class(klass, property)
+
+default_manager.class_resolver[IUPACProtein] = _bootstrap
+default_manager.class_resolver[ExtendedIUPACProtein] = _bootstrap
+default_manager.class_resolver[IUPACAmbiguousDNA] = _bootstrap
+default_manager.class_resolver[IUPACUnambiguousDNA] = _bootstrap
+default_manager.class_resolver[ExtendedIUPACDNA] = _bootstrap
+default_manager.class_resolver[IUPACAmbiguousRNA] = _bootstrap
+default_manager.class_resolver[IUPACUnambiguousRNA] = _bootstrap
diff --git a/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.pyc b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.pyc
new file mode 100644
index 0000000..74989a7
Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/IUPAC.pyc differ
diff --git a/binaries/src/globplot/biopython-1.50/Bio/Alphabet/Reduced.py b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/Reduced.py
new file mode 100644
index 0000000..dd90682
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/Reduced.py
@@ -0,0 +1,181 @@
+# Copyright 2004 by Iddo Friedberg.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Reduced alphabets which lump together several amino-acids into one letter.
+
+Reduced (redundant or simplified) alphabets are used to represent protein sequences using an
+alternative alphabet which lumps together several amino-acids into one letter, based
+on physico-chemical traits. For example, all the aliphatics (I,L,V) are usually
+quite interchangeable, so many sequence studies group them into one letter.
+
+Examples of reduced alphabets are available in:
+
+http://viscose.ifg.uni-muenster.de/html/alphabets.html
+
+Bio.utils.reduce_sequence is used to take a Protein alphabet, and reduce it using one of
+the tables here, or a user-defined table.
+"""
+
+from Bio import Alphabet
+
+# The Murphy tables are from here:
+# Murphy L.R., Wallqvist A, Levy RM. (2000) Simplified amino acid alphabets for protein
+# fold recognition and implications for folding. Protein Eng. 13(3):149-152
+
+murphy_15_tab = {"L": "L",
+                 "V": "L",
+                 "I": "L",
+                 "M": "L",
+                 "C": "C",
+                 "A": "A",
+                 "G": "G",
+                 "S": "S",
+                 "T": "T",
+                 "P": "P",
+                 "F": "F",
+                 "Y": "F",
+                 "W": "W",
+                 "E": "E",
+                 "D": "D",
+                 "N": "N",
+                 "Q": "Q",
+                 "K": "K",
+                 "R": "K",
+                 "H": "H"}
+
+class Murphy15(Alphabet.ProteinAlphabet):
+    letters = "LCAGSTPFWEDNQKH"
+    size = 15
+murphy_15 = Murphy15()
+
+murphy_10_tab = {"L": "L",
+                 "V": "L",
+                 "I": "L",
+                 "M": "L",
+                 "C": "C",
+                 "A": "A",
+                 "G": "G",
+                 "S": "S",
+                 "T": "S",
+                 "P": "P",
+                 "F": "F",
+                 "Y": "F",
+                 "W": "F",
+                 "E": "E",
+                 "D": "E",
+                 "N": "E",
+                 "Q": "E",
+                 "K": "K",
+                 "R": "K",
+                 "H": "H"}
+
+class Murphy10(Alphabet.ProteinAlphabet):
+    letters = "LCAGSPFEKH"
+    size = 10
+murphy_10 = Murphy10()
+
+murphy_8_tab = {"L": "L",
+                "V": "L",
+                "I": "L",
+                "M": "L",
+                "C": "L",
+                "A": "A",
+                "G": "A",
+                "S": "S",
+                "T": "S",
+                "P": "P",
+                "F": "F",
+                "Y": "F",
+                "W": "F",
+                "E": "E",
+                "D": "E",
+                "N": "E",
+                "Q": "E",
+                "K": "K",
+                "R": "K",
+                "H": "H"}
+
+class Murphy8(Alphabet.ProteinAlphabet):
+    letters = "LASPFEKH"
+    size = 8
+murphy_8 = Murphy8()
+
+murphy_4_tab = {"L": "L",
+                "V": "L",
+                "I": "L",
+                "M": "L",
+                "C": "L",
+                "A": "A",
+                "G": "A",
+                "S": "A",
+                "T": "A",
+                "P": "A",
+                "F": "F",
+                "Y": "F",
+                "W": "F",
+                "E": "E",
+                "D": "E",
+                "N": "E",
+                "Q": "E",
+                "K": "E",
+                "R": "E",
+                "H": "E"}
+
+class Murphy4(Alphabet.ProteinAlphabet):
+    letters = "LAFE"
+    size = 4
+murphy_4 = Murphy4()
+
+hp_model_tab = {"A": "P",  # Hydrophilic
+                "G": "P",
+                "T": "P",
+                "S": "P",
+                "N": "P",
+                "Q": "P",
+                "D": "P",
+                "E": "P",
+                "H": "P",
+                "R": "P",
+                "K": "P",
+                "P": "P",
+                "C": "H",  # Hydrophobic
+                "M": "H",
+                "F": "H",
+                "I": "H",
+                "L": "H",
+                "V": "H",
+                "W": "H",
+                "Y": "H"}
+
+class HPModel(Alphabet.ProteinAlphabet):
+    letters = "HP"
+    size = 2
+hp_model = HPModel()
+
+pc_5_table = {"I": "A",  # Aliphatic
+              "V": "A",
+              "L": "A",
+              "F": "R",  # Aromatic
+              "Y": "R",
+              "W": "R",
+              "H": "R",
+              "K": "C",  # Charged
+              "R": "C",
+              "D": "C",
+              "E": "C",
+              "G": "T",  # Tiny
+              "A": "T",
+              "C": "T",
+              "S": "T",
+              "T": "D",  # Diverse
+              "M": "D",
+              "Q": "D",
+              "N": "D",
+              "P": "D"}
+
+class PC5(Alphabet.ProteinAlphabet):
+    letters = "ARCTD"
+    size = 5
+pc5 = PC5()
diff --git a/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.py
new file mode 100644
index 0000000..0f3aca9
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.py
@@ -0,0 +1,255 @@
+# Copyright 2000-2002 by Andrew Dalke.
+# Revisions copyright 2007-2008 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Alphabets used in Seq objects etc to declare sequence type and letters.
+
+This is used by sequences which contain a finite number of similar words.
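+
+For example, the Gapped decorator defined below composes with the IUPAC
+alphabets (an illustrative doctest):
+
+>>> from Bio.Alphabet import IUPAC, Gapped
+>>> Gapped(IUPAC.unambiguous_dna, "-").letters
+'GATC-'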
+""" + +class Alphabet: + size = None # no fixed size for words + letters = None # no fixed alphabet; implement as a list-like + # interface, + def __repr__(self): + return self.__class__.__name__ + "()" + + def contains(self, other): + """Does this alphabet 'contain' the other (OBSOLETE?). + + Returns a boolean. This relies on the Alphabet subclassing + hierarchy only, and does not check the letters property. + This isn't ideal, and doesn't seem to work as intended + with the AlphabetEncoder classes.""" + return isinstance(other, self.__class__) + +generic_alphabet = Alphabet() + +class SingleLetterAlphabet(Alphabet): + size = 1 + letters = None # string of all letters in the alphabet + +single_letter_alphabet = SingleLetterAlphabet() + +########### Protein + +class ProteinAlphabet(SingleLetterAlphabet): + pass + +generic_protein = ProteinAlphabet() + +########### DNA +class NucleotideAlphabet(SingleLetterAlphabet): + pass + +generic_nucleotide = NucleotideAlphabet() + +class DNAAlphabet(NucleotideAlphabet): + pass + +generic_dna = DNAAlphabet() + + +########### RNA + +class RNAAlphabet(NucleotideAlphabet): + pass + +generic_rna = RNAAlphabet() + + + +########### Other per-sequence encodings + +class SecondaryStructure(SingleLetterAlphabet): + letters = "HSTC" + +class ThreeLetterProtein(Alphabet): + size = 3 + letters = [ + "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", + "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", + "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx", + ] + +###### Non per-sequence modifications + +# (These are Decorator classes) + +class AlphabetEncoder: + def __init__(self, alphabet, new_letters): + self.alphabet = alphabet + self.new_letters = new_letters + if alphabet.letters is not None: + self.letters = alphabet.letters + new_letters + else: + self.letters = None + def __getattr__(self, key): + if key[:2] == "__" and key[-2:] == "__": + raise AttributeError(key) + return getattr(self.alphabet, key) + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet, + self.new_letters) + + def contains(self, other): + """Does this alphabet 'contain' the other (OBSOLETE?). + + This is isn't implemented for the base AlphabetEncoder, + which will always return 0 (False).""" + return 0 + +class Gapped(AlphabetEncoder): + def __init__(self, alphabet, gap_char = "-"): + AlphabetEncoder.__init__(self, alphabet, gap_char) + self.gap_char = gap_char + + def contains(self, other): + """Does this alphabet 'contain' the other (OBSOLETE?). + + Returns a boolean. This relies on the Alphabet subclassing + hierarchy, and attempts to check the gap character. This fails + if the other alphabet does not have a gap character! + """ + return other.gap_char == self.gap_char and \ + self.alphabet.contains(other.alphabet) + +class HasStopCodon(AlphabetEncoder): + def __init__(self, alphabet, stop_symbol = "*"): + AlphabetEncoder.__init__(self, alphabet, stop_symbol) + self.stop_symbol = stop_symbol + + def __cmp__(self, other): + x = cmp(self.alphabet, other.alphabet) + if x == 0: + return cmp(self.stop_symbol, other.stop_symbol) + return x + + def contains(self, other): + """Does this alphabet 'contain' the other (OBSOLETE?). + + Returns a boolean. This relies on the Alphabet subclassing + hierarchy, and attempts to check the stop symbol. This fails + if the other alphabet does not have a stop symbol! 
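+
+        (For example, another HasStopCodon wrapper with the same stop symbol
+        and a contained inner alphabet gives True, while a plain Alphabet
+        raises AttributeError because it has no stop_symbol attribute.)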
+ """ + return other.stop_symbol == self.stop_symbol and \ + self.alphabet.contains(other.alphabet) + +def _get_base_alphabet(alphabet) : + """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).""" + a = alphabet + while isinstance(a, AlphabetEncoder) : + a = a.alphabet + assert isinstance(a, Alphabet), \ + "Invalid alphabet found, %s" % repr(a) + return a + +def _consensus_base_alphabet(alphabets) : + """Returns a common but often generic base alphabet object (PRIVATE). + + This throws away any AlphabetEncoder information, e.g. Gapped alphabets. + + Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single + letter. These DO NOT raise an exception!""" + common = None + for alpha in alphabets : + a = _get_base_alphabet(alpha) + if common is None : + common = a + elif common == a : + pass + elif isinstance(a, common.__class__) : + pass + elif isinstance(common, a.__class__) : + common = a + elif isinstance(a, NucleotideAlphabet) \ + and isinstance(common, NucleotideAlphabet) : + #e.g. Give a mix of RNA and DNA alphabets + common = generic_nucleotide + elif isinstance(a, SingleLetterAlphabet) \ + and isinstance(common, SingleLetterAlphabet) : + #This is a pretty big mis-match! + common = single_letter_alphabet + else : + #We have a major mis-match... take the easy way out! + return generic_alphabet + if common is None : + #Given NO alphabets! + return generic_alphabet + return common + +def _consensus_alphabet(alphabets) : + """Returns a common but often generic alphabet object (PRIVATE). + + Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single + letter. These DO NOT raise an exception! + + This is aware of Gapped and HasStopCodon and new letters added by + other AlphabetEncoders. This WILL raise an exception if more than + one gap character or stop symbol is present.""" + base = _consensus_base_alphabet(alphabets) + gap = None + stop = None + new_letters = "" + for alpha in alphabets : + #Gaps... + if not hasattr(alpha, "gap_char") : + pass + elif gap is None : + gap = alpha.gap_char + elif gap == alpha.gap_char : + pass + else : + raise ValueError("More than one gap character present") + #Stops... + if not hasattr(alpha, "stop_symbol") : + pass + elif stop is None : + stop = alpha.stop_symbol + elif stop == alpha.stop_symbol : + pass + else : + raise ValueError("More than one stop symbol present") + #New letters... + if hasattr(alpha, "new_letters") : + for letter in alpha.new_letters : + if letter not in new_letters \ + and letter != gap and letter != stop : + new_letters += letter + + alpha = base + if new_letters : + alpha = AlphabetEncoder(alpha, new_letters) + if gap : + alpha = Gapped(alpha, gap_char=gap) + if stop : + alpha = HasStopCodon(alpha, stop_symbol=stop) + return alpha + +def _check_type_compatible(alphabets) : + """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE). + + This relies on the Alphabet subclassing hierarchy. 
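+    For example, [generic_dna, generic_nucleotide] is compatible (True),
+    while [generic_dna, generic_rna] and [generic_nucleotide,
+    generic_protein] are not (False).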
It does not + check things like gap characters or stop symbols.""" + dna, rna, nucl, protein = False, False, False, False + for alpha in alphabets : + a = _get_base_alphabet(alpha) + if isinstance(a, DNAAlphabet) : + dna = True + nucl = True + if rna or protein : return False + elif isinstance(a, RNAAlphabet) : + rna = True + nucl = True + if dna or protein : return False + elif isinstance(a, NucleotideAlphabet) : + nucl = True + if protein : return False + elif isinstance(a, ProteinAlphabet) : + protein = True + if nucl : return False + return True diff --git a/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.pyc b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.pyc new file mode 100644 index 0000000..d43e071 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Alphabet/__init__.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/Application/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Application/__init__.py new file mode 100644 index 0000000..93c2149 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Application/__init__.py @@ -0,0 +1,250 @@ +"""General mechanisms to access applications in biopython. +""" +import os, sys +import StringIO + +from Bio import File + +def generic_run(commandline): + """Run an application with the given commandline. + + This expects a pre-built commandline that derives from + AbstractCommandline, and returns a ApplicationResult object + to get results from a program, along with handles of the + standard output and standard error. + + WARNING - This will read in the full program output into memory! + This may be in issue when the program write a large amount of + data to standard output. + """ + # print str(commandline) + + #Try and use subprocess (available in python 2.4+) + try : + import subprocess, sys + #We don't need to supply any piped input, but we setup the + #standard input pipe anyway as a work around for a python + #bug if this is called from a Windows GUI program. For + #details, see http://bugs.python.org/issue1124861 + child = subprocess.Popen(str(commandline), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=(sys.platform!="win32")) + child.stdin.close() + r = child.stdout + e = child.stderr + + r_out = r.read() + e_out = e.read() + r.close() + e.close() + + # capture error code + error_code = child.wait() + + except ImportError : + #For python 2.3 can't use subprocess, using popen2 instead + #(deprecated in python 2.6) + import popen2 + if sys.platform[:3]=='win': + # Windows does not have popen2.Popen3 + r, w, e = popen2.popen3(str(commandline)) + + r_out = r.read() + e_out = e.read() + w.close() + r.close() + e.close() + + # No way to get the error code; setting it to a dummy variable + error_code = 0 + + else: + child = popen2.Popen3(str(commandline), 1) + # get information and close the files, so if we call this function + # repeatedly we won't end up with too many open files + + # here are the file descriptors + r = child.fromchild + w = child.tochild + e = child.childerr + + r_out = r.read() + e_out = e.read() + w.close() + r.close() + e.close() + + # capture error code + error_code = os.WEXITSTATUS(child.wait()) + + return ApplicationResult(commandline, error_code), \ + File.UndoHandle(StringIO.StringIO(r_out)), \ + File.UndoHandle(StringIO.StringIO(e_out)) + +class ApplicationResult: + """Make results of a program available through a standard interface. 
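+
+    A typical round trip (an illustrative sketch; here cline stands for any
+    pre-built AbstractCommandline instance):
+
+        result, stdout, stderr = generic_run(cline)
+        if result.return_code == 0:
+            print result.available_results()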
+ + This tries to pick up output information available from the program + and make it available programmatically. + """ + def __init__(self, application_cl, return_code): + """Intialize with the commandline from the program. + """ + self._cl = application_cl + + # provide the return code of the application + self.return_code = return_code + + # get the application dependent results we can provide + # right now the only results we handle are output files + self._results = {} + + for parameter in self._cl.parameters: + if "file" in parameter.param_types and \ + "output" in parameter.param_types: + if parameter.is_set: + self._results[parameter.names[-1]] = parameter.value + + def get_result(self, output_name): + """Retrieve result information for the given output. + """ + return self._results[output_name] + + def available_results(self): + """Retrieve a list of all available results. + """ + result_names = self._results.keys() + result_names.sort() + return result_names + +class AbstractCommandline: + """Generic interface for running applications from biopython. + + This class shouldn't be called directly; it should be subclassed to + provide an implementation for a specific application. + """ + def __init__(self): + self.program_name = "" + self.parameters = [] + + def __str__(self): + """Make the commandline with the currently set options. + """ + commandline = "%s " % self.program_name + for parameter in self.parameters: + if parameter.is_required and not(parameter.is_set): + raise ValueError("Parameter %s is not set." % parameter.names) + if parameter.is_set: + commandline += str(parameter) + + return commandline + + def set_parameter(self, name, value = None): + """Set a commandline option for a program. + """ + set_option = 0 + for parameter in self.parameters: + if name in parameter.names: + if value is not None: + self._check_value(value, name, parameter.checker_function) + parameter.value = value + parameter.is_set = 1 + set_option = 1 + + if set_option == 0: + raise ValueError("Option name %s was not found." % name) + + def _check_value(self, value, name, check_function): + """Check whether the given value is valid. + + This uses the passed function 'check_function', which can either + return a [0, 1] (bad, good) value or raise an error. Either way + this function will raise an error if the value is not valid, or + finish silently otherwise. + """ + if check_function is not None: + is_good = check_function(value) + if is_good in [0, 1]: # if we are dealing with a good/bad check + if not(is_good): + raise ValueError( + "Invalid parameter value %r for parameter %s" % + (value, name)) + +class _AbstractParameter: + """A class to hold information about a parameter for a commandline. + + Do not use this directly, instead use one of the subclasses. + + Attributes: + + o names -- a list of string names by which the parameter can be + referenced (ie. ["-a", "--append", "append"]). The first name in + the list is considered to be the one that goes on the commandline, + for those parameters that print the option. The last name in the list + is assumed to be a "human readable" name describing the option in one + word. + + o param_type -- a list of string describing the type of parameter, + which can help let programs know how to use it. Example descriptions + include 'input', 'output', 'file' + + o checker_function -- a reference to a function that will determine + if a given value is valid for this parameter. 
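+    (For instance, lambda x: isinstance(x, int) and x > 0 could serve as
+    the checker for a positive-integer option.)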
This function can either + raise an error when given a bad value, or return a [0, 1] decision on + whether the value is correct. + + o description -- a description of the option. + + o is_required -- a flag to indicate if the parameter must be set for + the program to be run. + + o is_set -- if the parameter has been set + + o value -- the value of a parameter + """ + def __init__(self, names = [], types = [], checker_function = None, + is_required = 0, description = ""): + self.names = names + self.param_types = types + self.checker_function = checker_function + self.description = description + self.is_required = is_required + + self.is_set = 0 + self.value = None + +class _Option(_AbstractParameter): + """Represent an option that can be set for a program. + + This holds UNIXish options like --append=yes and -a yes + """ + def __str__(self): + """Return the value of this option for the commandline. + """ + # first deal with long options + if self.names[0].find("--") >= 0: + output = "%s" % self.names[0] + if self.value is not None: + output += "=%s " % self.value + else: + output += " " + # now short options + elif self.names[0].find("-") >= 0: + output = "%s " % self.names[0] + if self.value is not None: + output += "%s " % self.value + else: + raise ValueError("Unrecognized option type: %s" % self.names[0]) + + return output + +class _Argument(_AbstractParameter): + """Represent an argument on a commandline. + """ + def __str__(self): + if self.value is not None: + return "%s " % self.value + else: + return " " diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.py b/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.py new file mode 100644 index 0000000..16aaccf --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.py @@ -0,0 +1,802 @@ +#TODO - Remove this work around once we drop python 2.3 support +try: + set = set +except NameError: + from sets import Set as set + +from Bio import Alphabet +from Bio.Alphabet import IUPAC +from Bio.Data import IUPACData + +unambiguous_dna_by_name = {} +unambiguous_dna_by_id = {} +unambiguous_rna_by_name = {} +unambiguous_rna_by_id = {} +generic_by_name = {} # unambiguous DNA or RNA +generic_by_id = {} # unambiguous DNA or RNA +ambiguous_generic_by_name = {} # ambiguous DNA or RNA +ambiguous_generic_by_id = {} # ambiguous DNA or RNA + +# standard IUPAC unambiguous codons +standard_dna_table = None +standard_rna_table = None + +# In the future, the back_table could return a statistically +# appropriate distribution of codons, so do not cache the results of +# back_table lookups! + +class TranslationError(Exception): + pass + +class CodonTable: + nucleotide_alphabet = Alphabet.generic_nucleotide + protein_alphabet = Alphabet.generic_protein + + forward_table = {} # only includes codons which actually code + back_table = {} # for back translations + start_codons = [] + stop_codons = [] + # Not always called from derived classes! + def __init__(self, nucleotide_alphabet = nucleotide_alphabet, + protein_alphabet = protein_alphabet, + forward_table = forward_table, back_table = back_table, + start_codons = start_codons, stop_codons = stop_codons): + self.nucleotide_alphabet = nucleotide_alphabet + self.protein_alphabet = protein_alphabet + self.forward_table = forward_table + self.back_table = back_table + self.start_codons = start_codons + self.stop_codons = stop_codons + + def __str__(self) : + """Returns a simple text representation of the codon table + + e.g. 
+ >>> import Bio.Data.CodonTable + >>> print Bio.Data.CodonTable.standard_dna_table + >>> print Bio.Data.CodonTable.generic_by_id[1]""" + + if self.id : + answer = "Table %i" % self.id + else : + answer = "Table ID unknown" + if self.names : + answer += " " + ", ".join(filter(None, self.names)) + + #Use the main four letters (and the conventional ordering) + #even for ambiguous tables + letters = self.nucleotide_alphabet.letters + if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \ + or (letters is not None and "T" in letters) : + letters = "TCAG" + else : + #Should be either RNA or generic nucleotides, + #e.g. Bio.Data.CodonTable.generic_by_id[1] + letters = "UCAG" + + #Build the table... + answer=answer + "\n\n |" + "|".join( \ + [" %s " % c2 for c2 in letters] \ + ) + "|" + answer=answer + "\n--+" \ + + "+".join(["---------" for c2 in letters]) + "+--" + for c1 in letters : + for c3 in letters : + line = c1 + " |" + for c2 in letters : + codon = c1+c2+c3 + line = line + " %s" % codon + if codon in self.stop_codons : + line = line + " Stop|" + else : + try : + amino = self.forward_table[codon] + except KeyError : + amino = "?" + except TranslationError : + amino = "?" + if codon in self.start_codons : + line = line + " %s(s)|" % amino + else : + line = line + " %s |" % amino + line = line + " " + c3 + answer = answer + "\n"+ line + answer=answer + "\n--+" \ + + "+".join(["---------" for c2 in letters]) + "+--" + return answer + +def make_back_table(table, default_stop_codon): + # ONLY RETURNS A SINGLE CODON + # Do the sort so changes in the hash implementation won't affect + # the result when one amino acid is coded by more than one codon. + back_table = {} + keys = table.keys() ; keys.sort() + for key in keys: + back_table[table[key]] = key + back_table[None] = default_stop_codon + return back_table + + +class NCBICodonTable(CodonTable): + nucleotide_alphabet = Alphabet.generic_nucleotide + protein_alphabet = IUPAC.protein + + def __init__(self, id, names, table, start_codons, stop_codons): + self.id = id + self.names = names + self.forward_table = table + self.back_table = make_back_table(table, stop_codons[0]) + self.start_codons = start_codons + self.stop_codons = stop_codons + + +class NCBICodonTableDNA(NCBICodonTable): + nucleotide_alphabet = IUPAC.unambiguous_dna + +class NCBICodonTableRNA(NCBICodonTable): + nucleotide_alphabet = IUPAC.unambiguous_rna + + + +def register_ncbi_table(name, alt_name, id, + table, start_codons, stop_codons): + names = name.split("; ") + + dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons, + stop_codons) + # replace all T's with U's for the RNA tables + rna_table = {} + generic_table = {} + for codon, val in table.items(): + generic_table[codon] = val + codon = codon.replace("T", "U") + generic_table[codon] = val + rna_table[codon] = val + rna_start_codons = [] + generic_start_codons = [] + for codon in start_codons: + generic_start_codons.append(codon) + codon = codon.replace("T", "U") + generic_start_codons.append(codon) + rna_start_codons.append(codon) + rna_stop_codons = [] + generic_stop_codons = [] + for codon in stop_codons: + generic_stop_codons.append(codon) + codon = codon.replace("T", "U") + generic_stop_codons.append(codon) + rna_stop_codons.append(codon) + + generic = NCBICodonTable(id, names + [alt_name], generic_table, + generic_start_codons, generic_stop_codons) + rna = NCBICodonTableRNA(id, names + [alt_name], rna_table, + rna_start_codons, rna_stop_codons) + + if id == 1: + global standard_dna_table, 
standard_rna_table + standard_dna_table = dna + standard_rna_table = rna + + unambiguous_dna_by_id[id] = dna + unambiguous_rna_by_id[id] = rna + generic_by_id[id] = generic + + if alt_name is not None: + names.append(alt_name) + + for name in names: + unambiguous_dna_by_name[name] = dna + unambiguous_rna_by_name[name] = rna + generic_by_name[name] = generic + +### These tables created from the data file +### ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt +### using the following: +##import re +##for line in open("gc.prt").readlines(): +## if line[:2] == " {": +## names = [] +## id = None +## aa = None +## start = None +## bases = [] +## elif line[:6] == " name": +## names.append(re.search('"([^"]*)"', line).group(1)) +## elif line[:8] == " name": +## names.append(re.search('"(.*)$', line).group(1)) +## elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n': +## names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma" +## elif line[:4] == " id": +## id = int(re.search('(\d+)', line).group(1)) +## elif line[:10] == " ncbieaa ": +## aa = line[12:12+64] +## elif line[:10] == " sncbieaa": +## start = line[12:12+64] +## elif line[:9] == " -- Base": +## bases.append(line[12:12+64]) +## elif line[:2] == " }": +## assert names != [] and id is not None and aa is not None +## assert start is not None and bases != [] +## if len(names) == 1: +## names.append(None) +## print "register_ncbi_table(name = %s," % repr(names[0]) +## print " alt_name = %s, id = %d", % \ +## (repr(names[1]), id) +## print " table = {" +## s = " " +## for i in range(64): +## if aa[i] != "*": +## t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i], +## bases[2][i], aa[i]) +## if len(s) + len(t) > 75: +## print s +## s = " " + t +## else: +## s = s + t +## print s, "}," + +## s = " stop_codons = [" +## for i in range(64): +## if aa[i] == "*": +## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i]) +## if len(s) + len(t) > 75: +## print s +## s = " " + t +## else: +## s = s + t +## print s, "]," + +## s = " start_codons = [" +## for i in range(64): +## if start[i] == "M": +## t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i]) +## if len(s) + len(t) > 75: +## print s +## s = " " + t +## else: +## s = s + t +## print s, "]" +## print " )" +## elif line[:2] == "--" or line == "\n" or line == "}\n" or \ +## line == 'Genetic-code-table ::= {\n': +## pass +## else: +## raise Exception("Unparsed: " + repr(line)) + +register_ncbi_table(name = 'Standard', + alt_name = 'SGC0', id = 1, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', + 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', + 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', + 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', + 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', + 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', + 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', + 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', + 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', + 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', + 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', 'TGA', ], + start_codons = [ 'TTG', 'CTG', 'ATG', ] + ) +register_ncbi_table(name = 'Vertebrate Mitochondrial', + alt_name = 'SGC1', id = 2, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 
'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V', + 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', + 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', + 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ], + start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ] + ) +register_ncbi_table(name = 'Yeast Mitochondrial', + alt_name = 'SGC2', id = 3, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T', + 'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', + 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma', + alt_name = 'SGC3', id = 4, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', + 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC', + 'ATA', 'ATG', 'GTG', ] + ) +register_ncbi_table(name = 'Invertebrate Mitochondrial', + alt_name = 'SGC4', id = 5, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S', + 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 
'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG', + 'GTG', ] + ) +register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear', + alt_name = 'SGC5', id = 6, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', + 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', + 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', + 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', + 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', + 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', + 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', + 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', + 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', + 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', + 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TGA', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Echinoderm Mitochondrial', + alt_name = 'SGC8', id = 9, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S', + 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Euplotid Nuclear', + alt_name = 'SGC9', id = 10, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', + 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Bacterial', + alt_name = None, id = 11, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', + 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', + 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', + 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', + 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', + 'ACA': 'T', 
'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', + 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', + 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', + 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', + 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', + 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', 'TGA', ], + start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA', + 'ATG', 'GTG', ] + ) +register_ncbi_table(name = 'Alternative Yeast Nuclear', + alt_name = None, id = 12, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', + 'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', + 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', + 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', + 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', + 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', + 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', + 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', + 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', + 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', + 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', 'TGA', ], + start_codons = [ 'CTG', 'ATG', ] + ) +register_ncbi_table(name = 'Ascidian Mitochondrial', + alt_name = None, id = 13, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G', + 'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TAG', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Flatworm Mitochondrial', + alt_name = None, id = 14, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', + 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', + 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', + 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', + 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', + 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', + 'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', + 'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', + 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', + 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', + 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAG', ], + start_codons = [ 'ATG', ] + ) +register_ncbi_table(name = 'Blepharisma Macronuclear', + alt_name = None, id = 15, + table = { + 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', + 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', + 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', + 'CTC': 'L', 'CTA': 'L', 
'CTG': 'L', 'CCT': 'P', 'CCC': 'P', + 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', + 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', + 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', + 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', + 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', + 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', + 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', + 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', + 'GGA': 'G', 'GGG': 'G', }, + stop_codons = [ 'TAA', 'TGA', ], + start_codons = [ 'ATG', ] + ) + +######### Deal with ambiguous forward translations + +class AmbiguousCodonTable(CodonTable): + def __init__(self, codon_table, + ambiguous_nucleotide_alphabet, + ambiguous_nucleotide_values, + ambiguous_protein_alphabet, + ambiguous_protein_values): + CodonTable.__init__(self, + ambiguous_nucleotide_alphabet, + ambiguous_protein_alphabet, + AmbiguousForwardTable(codon_table.forward_table, + ambiguous_nucleotide_values, + ambiguous_protein_values), + codon_table.back_table, + + # These two are WRONG! I need to get the + # list of ambiguous codons which code for + # the stop codons XXX + list_ambiguous_codons(codon_table.start_codons, ambiguous_nucleotide_values), + list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values) + ) + self._codon_table = codon_table + + # Be sneaky and forward attribute lookups to the original table. + # This lets us get the names, if the original table is an NCBI + # table. + def __getattr__(self, name): + return getattr(self._codon_table, name) + +def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values): + c1, c2, c3 = codon + x1 = ambiguous_nucleotide_values[c1] + x2 = ambiguous_nucleotide_values[c2] + x3 = ambiguous_nucleotide_values[c3] + possible = {} + stops = [] + for y1 in x1: + for y2 in x2: + for y3 in x3: + try: + possible[forward_table[y1+y2+y3]] = 1 + except KeyError: + # If tripping over a stop codon + stops.append(y1+y2+y3) + if stops: + if possible.keys(): + raise TranslationError("ambiguous codon '%s' codes " % codon \ + + "for both proteins and stop codons") + # This is a true stop codon - tell the caller about it + raise KeyError(codon) + return possible.keys() + +def list_ambiguous_codons(codons, ambiguous_nucleotide_values): + """Extends a codon list to include all possible ambigous codons. + + e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR'] + ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA'] + + Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'. + Thus only two more codons are added in the following: + + e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TRA', 'TAR'] + + Returns a new (longer) list of codon strings. 
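+
+    Doctest-style examples (consistent with the assertions that follow
+    this function):
+
+    >>> from Bio.Data import IUPACData
+    >>> list_ambiguous_codons(['TAG', 'TAA'], IUPACData.ambiguous_dna_values)
+    ['TAG', 'TAA', 'TAR']
+    >>> list_ambiguous_codons(['TAG', 'TGA'], IUPACData.ambiguous_dna_values)
+    ['TAG', 'TGA']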
+ """ + + #Note ambiguous_nucleotide_values['R'] = 'AG' (etc) + #This will generate things like 'TRR' from ['TAG', 'TGA'], which + #we don't want to include: + c1_list = [letter for (letter, meanings) \ + in ambiguous_nucleotide_values.iteritems() \ + if set([codon[0] for codon in codons]).issuperset(set(meanings))] + c2_list = [letter for (letter, meanings) \ + in ambiguous_nucleotide_values.iteritems() \ + if set([codon[1] for codon in codons]).issuperset(set(meanings))] + c3_list = [letter for (letter, meanings) \ + in ambiguous_nucleotide_values.iteritems() \ + if set([codon[2] for codon in codons]).issuperset(set(meanings))] + set2 = set([codon[1] for codon in codons]) + set3 = set([codon[2] for codon in codons]) + candidates = set([c1+c2+c3 for c1 in c1_list for c2 in c2_list for c3 in c3_list]) + candidates.difference_update(codons) + answer = codons[:] #copy + #print "Have %i new candidates" % len(candidates) + for ambig_codon in candidates : + wanted = True + #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG' + for codon in [c1+c2+c3 \ + for c1 in ambiguous_nucleotide_values[ambig_codon[0]] \ + for c2 in ambiguous_nucleotide_values[ambig_codon[1]] \ + for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]: + if codon not in codons : + #This ambiguous codon can code for a non-stop, exclude it! + wanted=False + #print "Rejecting %s" % ambig_codon + continue + if wanted : + answer.append(ambig_codon) + return answer +assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA'] +assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA'] +assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR'] +assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR'] +assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA'] + +# Forward translation is "onto", that is, any given codon always maps +# to the same protein, or it doesn't map at all. Thus, I can build +# off of an existing table to produce the ambiguous mappings. +# +# This handles the general case. Perhaps it's overkill? +# >>> t = CodonTable.ambiguous_dna_by_id[1] +# >>> t.forward_table["AAT"] +# 'N' +# >>> t.forward_table["GAT"] +# 'D' +# >>> t.forward_table["RAT"] +# 'B' +# >>> t.forward_table["YTA"] +# 'L' + +class AmbiguousForwardTable: + def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein): + self.forward_table = forward_table + + self.ambiguous_nucleotide = ambiguous_nucleotide + self.ambiguous_protein = ambiguous_protein + + inverted = {} + for name, val in ambiguous_protein.items(): + for c in val: + x = inverted.get(c, {}) + x[name] = 1 + inverted[c] = x + for name, val in inverted.items(): + inverted[name] = val.keys() + self._inverted = inverted + + self._cache = {} + + def get(self, codon, failobj = None): + try: + return self.__getitem__(codon) + except KeyError: + return failobj + + def __getitem__(self, codon): + try: + x = self._cache[codon] + except KeyError: + pass + else: + if x is TranslationError: + raise TranslationError(codon) # no unique translation + if x is KeyError: + raise KeyError(codon) # it's a stop codon + return x + try: + x = self.forward_table[codon] + self._cache[codon] = x + return x + except KeyError: + pass + + # XXX Need to make part of this into a method which returns + # a list of all possible encodings for a codon! 
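+        # (Cache discipline used below: self._cache maps a codon to its
+        # translation, or to the KeyError class for a stop codon, or to the
+        # TranslationError class when the codon has no consistent
+        # translation.)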
+ try: + possible = list_possible_proteins(codon, + self.forward_table, + self.ambiguous_nucleotide) + except KeyError: + self._cache[codon] = KeyError + raise KeyError(codon) # stop codon + except TranslationError: + self._cache[codon] = TranslationError + raise TranslationError(codon) # does not code + assert len(possible) > 0, "unambiguous codons must code" + + # Hah! Only one possible protein, so use it + if len(possible) == 1: + self._cache[codon] = possible[0] + return possible[0] + + # See if there's an ambiguous protein encoding for the multiples. + # Find residues which exist in every coding set. + ambiguous_possible = {} + for amino in possible: + for term in self._inverted[amino]: + ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1 + + n = len(possible) + possible = [] + for amino, val in ambiguous_possible.items(): + if val == n: + possible.append(amino) + + # No amino acid encoding for the results + if len(possible) == 0: + self._cache[codon] = TranslationError + raise TranslationError(codon) # no valid translation + + # All of these are valid, so choose one + # To be unique, sort by smallet ambiguity then alphabetically + # Can get this if "X" encodes for everything. + def _sort(x, y, table = self.ambiguous_protein): + a = cmp(len(table[x]), len(table[y])) + if a == 0: + return cmp(x, y) + return a + possible.sort(_sort) + + x = possible[0] + self._cache[codon] = x + return x + +#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA) +ambiguous_dna_by_name = {} +for key, val in unambiguous_dna_by_name.items(): + ambiguous_dna_by_name[key] = AmbiguousCodonTable(val, + IUPAC.ambiguous_dna, + IUPACData.ambiguous_dna_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) +ambiguous_dna_by_id = {} +for key, val in unambiguous_dna_by_id.items(): + ambiguous_dna_by_id[key] = AmbiguousCodonTable(val, + IUPAC.ambiguous_dna, + IUPACData.ambiguous_dna_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) + +ambiguous_rna_by_name = {} +for key, val in unambiguous_rna_by_name.items(): + ambiguous_rna_by_name[key] = AmbiguousCodonTable(val, + IUPAC.ambiguous_rna, + IUPACData.ambiguous_rna_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) +ambiguous_rna_by_id = {} +for key, val in unambiguous_rna_by_id.items(): + ambiguous_rna_by_id[key] = AmbiguousCodonTable(val, + IUPAC.ambiguous_rna, + IUPACData.ambiguous_rna_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) + +#The following isn't very elegant, but seems to work nicely. 
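+#(_merged_values below starts from the ambiguous RNA values and adds the
+# mapping "T" -> "U", so one lookup table serves codons written with either
+# T or U when building the generic ambiguous tables.)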
+_merged_values = dict(IUPACData.ambiguous_rna_values.iteritems()) +_merged_values["T"] = "U" + +for key, val in generic_by_name.items(): + ambiguous_generic_by_name[key] = AmbiguousCodonTable(val, + Alphabet.NucleotideAlphabet(), + _merged_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) + +for key, val in generic_by_id.items(): + ambiguous_generic_by_id[key] = AmbiguousCodonTable(val, + Alphabet.NucleotideAlphabet(), + _merged_values, + IUPAC.extended_protein, + IUPACData.extended_protein_values) +del _merged_values +del key, val + +#Basic sanity test, +for n in ambiguous_generic_by_id.keys() : + assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V" + assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V" + assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L + #R = A or G, so URR = UAA or UGA / TRA = TAA or TGA = stop codons + if "UAA" in unambiguous_rna_by_id[n].stop_codons \ + and "UGA" in unambiguous_rna_by_id[n].stop_codons : + try : + print ambiguous_dna_by_id[n].forward_table["TRA"] + assert False, "Should be a stop only" + except KeyError : + pass + assert "URA" in ambiguous_generic_by_id[n].stop_codons + assert "URA" in ambiguous_rna_by_id[n].stop_codons + assert "TRA" in ambiguous_generic_by_id[n].stop_codons + assert "TRA" in ambiguous_dna_by_id[n].stop_codons +del n +assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons +assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons +assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.pyc b/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.pyc new file mode 100644 index 0000000..0a5472e Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Data/CodonTable.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.py b/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.py new file mode 100644 index 0000000..ebd5a12 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.py @@ -0,0 +1,209 @@ +# Information about the IUPAC alphabets + +protein_letters = "ACDEFGHIKLMNPQRSTVWY" +extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" +# B = "Asx"; aspartic acid or asparagine (D or N) +# X = "Xxx"; unknown or 'other' amino acid +# Z = "Glx"; glutamic acid or glutamine (E or Q) +# J = "Xle"; leucine or isoleucine (L or I, used in mass-spec) +# U = "Sec"; selenocysteine +# O = "Pyl"; pyrrolysine +ambiguous_dna_letters = "GATCRYWSMKHBVDN" +unambiguous_dna_letters = "GATC" +ambiguous_rna_letters = "GAUCRYWSMKHBVDN" +unambiguous_rna_letters = "GAUC" + +# B == 5-bromouridine +# D == 5,6-dihydrouridine +# S == thiouridine +# W == wyosine +extended_dna_letters = "GATCBDSW" + +# are there extended forms? 
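+# (if defined, this would mirror extended_dna_letters with T replaced by U)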
+#extended_rna_letters = "GAUCBDSW" + +ambiguous_dna_values = { + "A": "A", + "C": "C", + "G": "G", + "T": "T", + "M": "AC", + "R": "AG", + "W": "AT", + "S": "CG", + "Y": "CT", + "K": "GT", + "V": "ACG", + "H": "ACT", + "D": "AGT", + "B": "CGT", + "X": "GATC", + "N": "GATC", + } +ambiguous_rna_values = { + "A": "A", + "C": "C", + "G": "G", + "U": "U", + "M": "AC", + "R": "AG", + "W": "AU", + "S": "CG", + "Y": "CU", + "K": "GU", + "V": "ACG", + "H": "ACU", + "D": "AGU", + "B": "CGU", + "X": "GAUC", + "N": "GAUC", + } + +ambiguous_dna_complement = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "X": "X", + "N": "N", + } + +ambiguous_rna_complement = { + "A": "U", + "C": "G", + "G": "C", + "U": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "X": "X", + "N": "N", + } + + +def _make_ranges(dict): + d = {} + for key, value in dict.items(): + d[key] = (value, value) + return d + +# From bioperl's SeqStats.pm +unambiguous_dna_weights = { + "A": 347., + "C": 323., + "G": 363., + "T": 322., + } +unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights) + +unambiguous_rna_weights = { + "A": unambiguous_dna_weights["A"] + 16., # 16 for the oxygen + "C": unambiguous_dna_weights["C"] + 16., + "G": unambiguous_dna_weights["G"] + 16., + "U": 340., +} +unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights) + +def _make_ambiguous_ranges(dict, weight_table): + range_d = {} + avg_d = {} + for letter, values in dict.items(): + #Following line is a quick hack to skip undefined weights for U and O + if len(values)==1 and values[0] not in weight_table : continue + weights = map(weight_table.get, values) + range_d[letter] = (min(weights), max(weights)) + total_w = 0.0 + for w in weights: + total_w = total_w + w + avg_d[letter] = total_w / len(weights) + return range_d, avg_d + +ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = \ + _make_ambiguous_ranges(ambiguous_dna_values, + unambiguous_dna_weights) + +ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = \ + _make_ambiguous_ranges(ambiguous_rna_values, + unambiguous_rna_weights) + +protein_weights = { + "A": 89.09, + "C": 121.16, + "D": 133.10, + "E": 147.13, + "F": 165.19, + "G": 75.07, + "H": 155.16, + "I": 131.18, + "K": 146.19, + "L": 131.18, + "M": 149.21, + "N": 132.12, + #"O": 0.0, # Needs to be recorded! + "P": 115.13, + "Q": 146.15, + "R": 174.20, + "S": 105.09, + "T": 119.12, + #"U": 168.05, # To be confirmed + "V": 117.15, + "W": 204.23, + "Y": 181.19 + } + +extended_protein_values = { + "A": "A", + "B": "ND", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "IL", + "K": "K", + "L": "L", + "M": "M", + "N": "N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "ACDEFGHIKLMNPQRSTVWY", + #TODO - Include U and O in the possible values of X? + #This could alter the extended_protein_weight_ranges ... 
+ "Y": "Y", + "Z": "QE", +} + +protein_weight_ranges = _make_ranges(protein_weights) + +extended_protein_weight_ranges, avg_extended_protein_weights = \ + _make_ambiguous_ranges(extended_protein_values, + protein_weights) + + + diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.pyc b/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.pyc new file mode 100644 index 0000000..a748909 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Data/IUPACData.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.py new file mode 100644 index 0000000..d3f49b3 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.py @@ -0,0 +1,3 @@ +# This is a Python module +"""Collections of various bits of useful biological data. +""" diff --git a/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.pyc b/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.pyc new file mode 100644 index 0000000..1d6a227 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Data/__init__.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/Decode.py b/binaries/src/globplot/biopython-1.50/Bio/Decode.py new file mode 100644 index 0000000..a4bb81f --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Decode.py @@ -0,0 +1,427 @@ +# Copyright 2002 by Andrew Dalke. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Decode elements from a Std/Martel parsed XML stream (OBSOLETE). + +Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules +(and therefore Bio.Decode) have been deprecated. They are no longer used in +any of the current Biopython parsers, and are likely to be removed in a +future release.""" + +import warnings +warnings.warn("Martel and those parts of Biopython depending on it" \ + +" directly (such as Bio.Mindy and Bio.Decode) are now" \ + +" deprecated, and will be removed in a future release of"\ + +" Biopython. 
If you want to continue to use this code,"\ + +" please get in contact with the Biopython developers via"\ + +" the mailing lists to avoid its permanent removal from"\ + +" Biopython.", \ + DeprecationWarning) + +import string +from Bio.Parsers.spark import GenericScanner, GenericParser + +def unescape_C(s): + result = [] + for i in range(len(s)): + if s[i] != "\\": + result.append(s[i]) + continue + c = s[i+1:i+2] + if c == "x": + x = s[i+2:i+4] + if len(x) != 2: + raise ValueError("invalid \\x escape") + i = int(x, 16) + result.append(chr(i)) + continue + if c in "01234567": + x = s[i+1:i+4] + # \octals don't do a length assertion check + i = int(x, 8) + result.append(chr(i)) + continue + result.append(c) + return "".join(result) + +def join_english(fields): + if not fields: + return "" + s = fields[0] + for field in fields[1:]: + if s[-1:] == "-" and s[-3:-2] == "-": + s = s + field + continue + if s.find(" ") == -1 and field.find(" ") == -1: + s = s + field + continue + s = s + " " + field + return (" ".join(s.split())).strip() + + + +def chomp(s, c): + if s[-1:] == c: + return s[:-1] + return s + +def lchomp(s, c): + if s[:1] == c: + return s[1:] + return s + +def chompchomp(s, c): + if s[:1] == c and s[-1:] == c: + return s[1:-1] + return s + +def fixspaces(s): + # s.split breaks down to a list of words + # " ".join puts them together + # strip removes leading and trailing spaces + return " ".join(s.split()).strip() + +def join_fixspaces(lines): + return " ".join((" ".join(lines)).split()).strip() + +def tr(s, frm, to): + table = string.maketrans(frm, to) + return s.translate(table) + +def safe_int(s): + """converts to int if the number is small, long if it's large""" + try: + return int(s) + except ValueError: + return long(s) + +decode_functions = { + "chomp": (chomp, str, str), + "chompchomp": (chompchomp, str, str), + "chop": (lambda s: s[:-1], str, str), + "chopchop": (lambda s: s[1:-1], str, str), + "fixspaces": (fixspaces, str, str), + "lchomp": (lchomp, str, str), + "lchop": (lambda s: s[1:], str, str), + "lower": (lambda s: s.lower(), str, str), + "lstrip": (lambda s: s.lstrip(), str, str), + "replace": (lambda s, old, new: s.replace(old, new), str, str), + "rstrip": (lambda s: s.rstrip(), str, str), + "str": (str, str, str), + "strip": (lambda s: s.strip(), str, str), + "tr": (tr, str, str), + "unescape.c": (unescape_C, str, str), + "unescape.doublequote": (lambda s: s.replace('""', '"'), str, str), + "unescape.singlequote": (lambda s: s.replace("''", "'"), str, str), + "upper": (lambda s: s.upper(), str, str), + + # List operations + "join": (lambda lst, s = " ": s.join(lst), list, str), + "join.english": (join_english, list, str), + + # Integer operations + "int": (safe_int, [float, str, int], int), + "int.comma": (lambda s: safe_int(s.replace(",", "")), + [float, str, int], int), + "hex": (hex, str, int), + "oct": (oct, str, int), + "add": ((lambda i, j: i+j), int, int), + + # Float operations + "float": (float, (float, str, int), float), + + } + +def _fixup_defs(): + # Normalize so the 2nd and 3rd terms are tuples + for k, v in decode_functions.items(): + f, in_types, out_types = v + if isinstance(in_types, type([])): + in_types = tuple(in_types) + elif not isinstance(in_types, type( () )): + in_types = (in_types,) + + if isinstance(out_types, type([])): + out_types = tuple(out_types) + elif not isinstance(out_types, type( () )): + out_types = (out_types,) + + decode_functions[k] = (f, in_types, out_types) +_fixup_defs() + +class Token: + def __init__(self, type): + 
self.type = type + def __cmp__(self, other): + return cmp(self.type, other) + def __repr__(self): + return "Token(%r)" % (self.type,) + +class ValueToken(Token): + def __init__(self, type, val): + Token.__init__(self, type) + self.val = val + def __cmp__(self, other): + return cmp(self.type, other) + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.val) + def __str__(self): + return str(self.val) + +class Integer(ValueToken): + def __init__(self, val): + ValueToken.__init__(self, "integer", val) + +class Float(ValueToken): + def __init__(self, val): + ValueToken.__init__(self, "float", val) + +class String(ValueToken): + def __init__(self, val): + ValueToken.__init__(self, "string", val) + +class FunctionName(ValueToken): + def __init__(self, val): + ValueToken.__init__(self, "functionname", val) + +class DecodeScanner(GenericScanner): + def __init__(self): + GenericScanner.__init__(self) + + def tokenize(self, input): + self.rv = [] + GenericScanner.tokenize(self, input) + return self.rv + + def t_functionname(self, input): + r" \w+(\.\w+)*" + self.rv.append(FunctionName(input)) + + def t_pipe(self, input): + r" \| " + self.rv.append(Token("pipe")) + + def t_open_paren(self, input): + r" \( " + self.rv.append(Token("open_paren")) + + def t_close_paren(self, input): + r" \) " + self.rv.append(Token("close_paren")) + + def t_comma(self, input): + r" , " + self.rv.append(Token("comma")) + + def t_whitespace(self, input): + r" \s+ " + pass + + def t_string(self, input): + r""" "([^"\\]+|\\.)*"|'([^'\\]+|\\.)*' """ + # "' # emacs cruft + s = input[1:-1] + s = unescape_C(s) + + self.rv.append(String(s)) + + def t_float(self, input): + r""" [+-]?((\d+(\.\d*)?)|\.\d+)([eE][+-]?[0-9]+)? """ + # See if this is an integer + try: + self.rv.append(Integer(safe_int(input))) + except ValueError: + self.rv.append(Float(float(input))) + +class Function: + def __init__(self, name, args = ()): + self.name = name + self.args = args + def __str__(self): + args = self.args + if not args: + s = "" + else: + s = str(args)[1:-1] + return "%s(x, %s)" % (self.name, s) + __repr__ = __str__ + +class DecodeParser(GenericParser): + def __init__(self, start = "expression"): + GenericParser.__init__(self, start) + self.begin_pos = 0 + + def p_expression(self, args): + """ + expression ::= term + expression ::= term pipe expression + """ + if len(args) == 1: + return [args[0]] + return [args[0]] + args[2] + + def p_term(self, args): + """ + term ::= functionname + term ::= functionname open_paren args close_paren + """ + if len(args) == 1: + return Function(args[0].val) + return Function(args[0].val, tuple([x.val for x in args[2]])) + + def p_args(self, args): + """ + args ::= arg + args ::= arg comma args + """ + if len(args) == 1: + return [args[0]] + return [args[0]] + args[2] + + def p_arg(self, args): + """ + arg ::= string + arg ::= integer + arg ::= float + """ + return args[0] + +def scan(input): + scanner = DecodeScanner() + return scanner.tokenize(input) + +def parse(tokens): + parser = DecodeParser() + return parser.parse(tokens) + +_decoder_cache = {} + +class FunctionCall: + def __init__(self, f, args): + self.f = f + self.args = args + def __call__(self, x): + return self.f(x, *self.args) + +class FunctionCallChain: + def __init__(self, inner_f, f, args): + self.inner_f = inner_f + self.f = f + self.args = args + def __call__(self, x): + return self.f(self.inner_f(x), *self.args) + +#### I don't think this is the right way to do things +##class CheckTypes: +## def __init__(self, f, 
call_types, return_types): +## self.f = f +## self.call_types = call_types +## self.return_types = return_types +## def __call__(self, x): +## if self.call_types is not None: +## for T in self.call_types: +## if isinstance(x, T): +## break +## else: +## raise TypeError( +## "Call value %s of type %s, expecting one of %s" % +## (x, type(x).__name__, +## [T.name for T in self.call_types])) +## y = self.f(x) + +## if not self.return_types: +## return y + +## for T in self.return_types: +## if isinstance(y, T): +## return y +## raise TypeError("Return value %s of type %s, expecting one of %s" % +## (y, type(y).__name__, +## [T.name for T in self.return_types])) + +def make_decoder(s): + try: + return _decoder_cache[s] + except KeyError: + pass + + functions = parse(scan(s)) + + f = functions[0] + fc = decode_functions[f.name][0] + args = f.args + if args: + fc = FunctionCall(fc, args) + for f in functions[1:]: + fc = FunctionCallChain(fc, decode_functions[f.name][0], f.args) + _decoder_cache[s] = fc + return fc + +def _verify_subtypes(subset, total, old_name, new_name): + for x in subset: + if x not in total: + raise TypeError("%s can produce a %r value not accepted by %s" % + (old_name, x.__name__, new_name)) + +_typechecked_decoder_cache = {} +def make_typechecked_decoder(s, input_types = None, output_types = None): + cache_lookup = (s, input_types, output_types) + try: + return _typechecked_decoder_cache[cache_lookup] + except KeyError: + pass + if input_types is not None and not isinstance(input_types, type( () )): + input_types = (input_types,) + if output_types is not None and not isinstance(output_types, type( () )): + output_types = (output_types,) + + functions = parse(scan(s)) + + # Make sure the input type(s) are allowed + f = functions[0] + fc, in_types, out_types = decode_functions[f.name] + if input_types is not None: + for x in input_types: + if x not in in_types: + raise TypeError( + "the input type includes %r which isn't supported by %s" % + (x.__name__, f.name)) + + # Do the composition + old_name = f.name + input_types = out_types + args = functions[0].args + if args: + fc = FunctionCall(fc, args) + + for f in functions[1:]: + transform_func, in_types, out_types = decode_functions[f.name] + _verify_subtypes(input_types, in_types, old_name, f.name) + old_name = f.name + input_types = out_types + fc = FunctionCallChain(fc, transform_func, f.args) + + if output_types is not None: + _verify_subtypes(input_types, output_types, old_name, "the output") + _typechecked_decoder_cache[cache_lookup] = fc + return fc + + +def test(): + assert make_decoder("chop")("Andrew") == "Andre" + assert make_decoder("int")("9") == 9 + assert make_decoder('join(" ")')(["Andrew", "Dalke"]) == \ + "Andrew Dalke" + assert make_decoder('chomp("|")')("|test|") == "|test" + assert make_decoder('chomp("|")')("|test") == "|test" + assert make_decoder('chomp("A")|chop')("BA") == "" + assert make_decoder('chomp("A")|chop')("AB") == "A" + assert make_decoder('chop|chomp("A")')("AB") == "" + assert make_decoder('chop|chomp("A")')("BA") == "B" + assert make_decoder('add(5)')(2) == 7 + assert make_decoder('add(-2)')(5) == 3 + +if __name__ == "__main__": + test() diff --git a/binaries/src/globplot/biopython-1.50/Bio/DocSQL.py b/binaries/src/globplot/biopython-1.50/Bio/DocSQL.py new file mode 100644 index 0000000..415559e --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/DocSQL.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +# +# Copyright 2002-2003 by Michael Hoffman. All rights reserved. 
+# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +""" +Bio.DocSQL: easy access to DB API databases. + +>>> import DocSQL, MySQLdb, os +>>> db=MySQLdb.connect(passwd='', db='test') +>>> class CreatePeople(DocSQL.Create): +... \""" +... CREATE TEMPORARY TABLE people +... (id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, +... last_name TINYTEXT, +... first_name TINYTEXT) +... \""" +... +>>> CreatePeople(connection=db) +CreatePeople(message=Success) +""" + +__version__ = "$Revision: 1.13 $" +# $Source: /home/repository/biopython/biopython/Bio/DocSQL.py,v $ + +import exceptions +import sys + +from Bio import MissingExternalDependencyError + +try: + import MySQLdb +except: + raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.DocSQL.") + +connection = None + +class NoInsertionError(exceptions.Exception): + pass + +def _check_is_public(name): + if name[:6] == "_names": + raise AttributeError + +class QueryRow(list): + def __init__(self, cursor): + try: + row = cursor.fetchone() + super(QueryRow, self).__init__(row) + except TypeError: + raise StopIteration + + object.__setattr__(self, "_names", [x[0] for x in cursor.description]) # FIXME: legacy + object.__setattr__(self, "_names_hash", {}) + + for i, name in enumerate(self._names): + self._names_hash[name] = i + + def __getattr__(self, name): + _check_is_public(name) + try: + return self[self._names_hash[name]] + except (KeyError, AttributeError) : + raise AttributeError("'%s' object has no attribute '%s'" \ + % (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + try: + self._names_hash + except AttributeError: + return object.__setattr__(self, name, value) + + _check_is_public(name) + try: + index = self._names_hash[name] + self[index] = value + except KeyError: + return object.__setattr__(self, name, value) + +class Query(object): + """ + SHOW TABLES + """ + MSG_FAILURE = "Failure" + MSG_SUCCESS = "Success" + message = "not executed" + error_message = "" + prefix = "" + suffix = "" + row_class = QueryRow + + def __init__(self, *args, **keywds): + try: + self.connection = keywds['connection'] + except KeyError: + self.connection = connection + try: + self.diagnostics = keywds['diagnostics'] + except KeyError: + self.diagnostics = 0 + + self.statement = self.prefix + self.__doc__ + self.suffix + self.params = args + + def __iter__(self): + return IterationCursor(self, self.connection) + + def __repr__(self): + return "%s(message=%s)" % (self.__class__.__name__, self.message) + + def cursor(self): + return iter(self).cursor + + def dump(self): + for item in self: + print item + +class QueryGeneric(Query): + def __init__(self, statement, *args, **keywds): + Query.__init__(self, *args, **keywds) + self.statement = statement, + +class IterationCursor(object): + def __init__(self, query, connection=connection): + if connection is None: + raise TypeError("database connection is None") + self.cursor = connection.cursor() + self.row_class = query.row_class + if query.diagnostics: + print >>sys.stderr, query.statement + print >>sys.stderr, query.params + self.cursor.execute(query.statement, query.params) + + def next(self): + return self.row_class(self.cursor) + +class QuerySingle(Query, QueryRow): + ignore_warnings = 0 + def __init__(self, *args, **keywds): + message = self.MSG_FAILURE + Query.__init__(self, *args, **keywds) + try: + self.single_cursor = Query.cursor(self) + except 
MySQLdb.Warning: + if not self.ignore_warnings: + raise + self.row_class.__init__(self, self.cursor()) + object.__setattr__(self, "message", self.MSG_SUCCESS) + + def cursor(self): + return self.single_cursor + +class QueryAll(list, Query): + def __init__(self, *args, **keywds): + Query.__init__(self, *args, **keywds) + list.__init__(self, map(self.process_row, self.cursor().fetchall())) + + def process_row(self, row): + return row + +class QueryAllFirstItem(QueryAll): + def process_row(self, row): + return row[0] + +class Create(QuerySingle): + def __init__(self, *args, **keywds): + try: + QuerySingle.__init__(self, *args, **keywds) + except StopIteration: + self.message = self.MSG_SUCCESS + +class Update(Create): + pass + +class Insert(Create): + MSG_INTEGRITY_ERROR = "Couldn't insert: %s. " + + def __init__(self, *args, **keywds): + try: + Create.__init__(self, *args, **keywds) + except MySQLdb.IntegrityError, error_data: + self.error_message += self.MSG_INTEGRITY_ERROR % error_data[1] + try: + self.total_count + except AttributeError: + self.total_count = 0 + + raise MySQLdb.IntegrityError(self.error_message) + + self.id = self.cursor().insert_id() + try: + self.total_count += self.cursor().rowcount + except AttributeError: + self.total_count = self.cursor().rowcount + + if self.cursor().rowcount == 0: + raise NoInsertionError + +def _test(*args, **keywds): + import doctest, sys + doctest.testmod(sys.modules[__name__], *args, **keywds) + +if __name__ == "__main__": + if __debug__: + _test() diff --git a/binaries/src/globplot/biopython-1.50/Bio/Encodings/IUPACEncoding.py b/binaries/src/globplot/biopython-1.50/Bio/Encodings/IUPACEncoding.py new file mode 100644 index 0000000..a4089a3 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Encodings/IUPACEncoding.py @@ -0,0 +1,126 @@ +# Set up the IUPAC alphabet properties + + +from Bio.PropertyManager import default_manager +from Bio import Alphabet +from Bio.Alphabet import IUPAC +from Bio.Data import IUPACData + +from Bio import Transcribe, Translate + +set_prop = default_manager.class_property + +# weight tables +set_prop[IUPAC.IUPACUnambiguousDNA]["weight_table"] = \ + IUPACData.unambiguous_dna_weights +set_prop[IUPAC.IUPACAmbiguousDNA]["weight_table"] = \ + IUPACData.avg_ambiguous_dna_weights +set_prop[IUPAC.IUPACUnambiguousRNA]["weight_table"] = \ + IUPACData.unambiguous_rna_weights +set_prop[IUPAC.IUPACAmbiguousRNA]["weight_table"] = \ + IUPACData.avg_ambiguous_rna_weights +set_prop[IUPAC.IUPACProtein]["weight_table"] = \ + IUPACData.protein_weights +set_prop[IUPAC.ExtendedIUPACProtein]["weight_table"] = \ + IUPACData.avg_extended_protein_weights + +set_prop[IUPAC.IUPACUnambiguousDNA]["weight_range_table"] = \ + IUPACData.unambiguous_dna_weight_ranges +set_prop[IUPAC.IUPACAmbiguousDNA]["weight_range_table"] = \ + IUPACData.ambiguous_dna_weight_ranges +set_prop[IUPAC.IUPACUnambiguousRNA]["weight_range_table"] = \ + IUPACData.unambiguous_rna_weight_ranges +set_prop[IUPAC.IUPACAmbiguousRNA]["weight_range_table"] = \ + IUPACData.ambiguous_rna_weight_ranges +set_prop[IUPAC.IUPACProtein]["weight_range_table"] = \ + IUPACData.protein_weight_ranges +set_prop[IUPAC.ExtendedIUPACProtein]["weight_range_table"] = \ + IUPACData.extended_protein_weight_ranges + + + +# transcriber objects + +set_prop[Alphabet.DNAAlphabet]["transcriber"] = \ + Transcribe.generic_transcriber + +set_prop[IUPAC.IUPACAmbiguousDNA]["transcriber"] = \ + Transcribe.ambiguous_transcriber + +set_prop[IUPAC.IUPACUnambiguousDNA]["transcriber"] = \ + 
Transcribe.unambiguous_transcriber + + +set_prop[Alphabet.RNAAlphabet]["transcriber"] = \ + Transcribe.generic_transcriber + +set_prop[IUPAC.IUPACAmbiguousRNA]["transcriber"] = \ + Transcribe.ambiguous_transcriber + +set_prop[IUPAC.IUPACUnambiguousRNA]["transcriber"] = \ + Transcribe.unambiguous_transcriber + + +# translator objects +for name, obj in Translate.unambiguous_dna_by_name.items(): + property = "translator.name." + name + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + set_prop[obj.table.protein_alphabet.__class__][property] = obj + +for name, obj in Translate.unambiguous_rna_by_name.items(): + property = "translator.name." + name + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "rna_translator.name." + name + set_prop[obj.table.protein_alphabet.__class__][property] = obj + + +for id, obj in Translate.unambiguous_dna_by_id.items(): + property = "translator.id.%d" % id + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + set_prop[obj.table.protein_alphabet.__class__][property] = obj + if id == 1: + set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj + set_prop[obj.table.protein_alphabet.__class__]["translator"] = obj + + +for id, obj in Translate.unambiguous_rna_by_id.items(): + property = "translator.id.%d" % id + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "rna_translator.id.%d" % id + set_prop[obj.table.protein_alphabet.__class__][property] = obj + if id == 1: + set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj + set_prop[obj.table.protein_alphabet.__class__]["rna_translator"] = obj + +# ambiguous translator objects +for name, obj in Translate.ambiguous_dna_by_name.items(): + property = "translator.name." + name + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "ambiguous_translator.name." + name + set_prop[obj.table.protein_alphabet.__class__][property] = obj + +for name, obj in Translate.ambiguous_rna_by_name.items(): + property = "translator.name." + name + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "ambiguous_rna_translator.name." + name + set_prop[obj.table.protein_alphabet.__class__][property] = obj + + +for id, obj in Translate.ambiguous_dna_by_id.items(): + property = "translator.id.%d" % id + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "ambiguous_translator.id.%d" % id + set_prop[obj.table.protein_alphabet.__class__][property] = obj + if id == 1: + set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj + set_prop[obj.table.protein_alphabet.__class__]["ambiguous_translator"] = obj + + +for id, obj in Translate.ambiguous_rna_by_id.items(): + property = "translator.id.%d" % id + set_prop[obj.table.nucleotide_alphabet.__class__][property] = obj + property = "ambiguous_rna_translator.id.%d" % id + set_prop[obj.table.protein_alphabet.__class__][property] = obj + if id == 1: + set_prop[obj.table.nucleotide_alphabet.__class__]["translator"] = obj + set_prop[obj.table.protein_alphabet.__class__]["ambiguous_rna_translator"] = obj diff --git a/binaries/src/globplot/biopython-1.50/Bio/Encodings/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Encodings/__init__.py new file mode 100644 index 0000000..a200bbe --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Encodings/__init__.py @@ -0,0 +1,3 @@ +# This is a Python module. +"""Properties for functionality such as transcription and translation. 
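+
+Importing a module from this package (e.g. Bio.Encodings.IUPACEncoding)
+registers its properties with Bio.PropertyManager.default_manager as a
+side effect.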
+""" diff --git a/binaries/src/globplot/biopython-1.50/Bio/Fasta/FastaAlign.py b/binaries/src/globplot/biopython-1.50/Bio/Fasta/FastaAlign.py new file mode 100644 index 0000000..735b502 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Fasta/FastaAlign.py @@ -0,0 +1,84 @@ +""" +Code to deal with alignments written in Fasta format (OBSOLETE). + +This module is considered obsolete and likely to be deprecated. Please use +Bio.AlignIO instead for reading and writing alignments in FASTA format. + +This mostly just uses the regular Fasta parsing stuff written by Jeff +to deal with all of the input and output formats. + +functions: +o parse_file() + +classes: +FastaAlignment""" +# standard library +import os + +# biopython +from Bio.Align.Generic import Alignment +from Bio import Alphabet +from Bio.Alphabet import IUPAC +from Bio import Fasta + +def parse_file(file_name, type = 'DNA'): + """Parse the given file into a FastaAlignment object. + + Arguments: + o file_name - The location of the file to parse. + o type - The type of information contained in the file. + """ + if type.upper() == 'DNA': + alphabet = IUPAC.ambiguous_dna + elif type.upper() == 'RNA': + alphabet = IUPAC.ambiguous_rna + elif type.upper() == 'PROTEIN': + alphabet = IUPAC.protein + else: + raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN" + % type) + + # create a new alignment object + fasta_align = FastaAlignment(Alphabet.Gapped(alphabet)) + + # now parse the file and fill up the alignment object + align_file = open(file_name, 'r') + + parser = Fasta.RecordParser() + iterator = Fasta.Iterator(align_file, parser) + + cur_align = iterator.next() + while cur_align: + fasta_align.add_sequence(cur_align.title, cur_align.sequence) + + cur_align = iterator.next() + + return fasta_align + +class FastaAlignment(Alignment): + """Work with the Fasta Alignment format. + + The fasta alignment format is basically the same as the regular ol' + Fasta format we know and love, except the sequences have gaps + (represented by -'s). + """ + def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)): + Alignment.__init__(self, alphabet) + + def __str__(self): + """Print out a fasta version of the alignment info.""" + return_string = '' + for item in self._records: + new_f_record = Fasta.Record() + new_f_record.title = item.description + new_f_record.sequence = item.seq.data + + return_string = return_string + str(new_f_record) + os.linesep + os.linesep + + # have a extra newline, so strip two off and add one before returning + return return_string.rstrip() + os.linesep + + + + + diff --git a/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.py new file mode 100644 index 0000000..c49b45b --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.py @@ -0,0 +1,198 @@ +"""Utilities for working with FASTA-formatted sequences (OBSOLETE). + +Classes: +Record Holds FASTA sequence data. +Iterator Iterates over sequence data in a FASTA file. +RecordParser Parses FASTA sequence data into a Record object. +SequenceParser Parses FASTA sequence data into a SeqRecord object. + +For a long time this module was the most commonly used and best documented +FASTA parser in Biopython. However, we now recommend using Bio.SeqIO instead. + +In view of this, while you can continue to use Bio.Fasta for the moment, it is +considered to be a legacy module and should not be used if you are writing new +code. 
At some point Bio.Fasta may be officially deprecated (with warning
+messages when used) before finally being removed.
+
+If you are already using Bio.Fasta with the SequenceParser to get SeqRecord
+objects, then you should be able to switch to the more recent Bio.SeqIO module
+very easily as that too uses SeqRecord objects. For example,
+
+from Bio import Fasta
+handle = open("example.fas")
+for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) :
+    print seq_record.description
+    print seq_record.seq
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+    print seq_record.description
+    print seq_record.seq
+handle.close()
+
+Converting existing code which uses the RecordParser is a little more
+complicated, as the Bio.Fasta.Record object differs from the SeqRecord.
+
+from Bio import Fasta
+handle = open("example.fas")
+for record in Fasta.Iterator(handle, Fasta.RecordParser()) :
+    #record is a Bio.Fasta.Record object
+    print record.title #The full title line as a string
+    print record.sequence #The sequence as a string
+handle.close()
+
+Using Bio.SeqIO instead this becomes:
+
+from Bio import SeqIO
+handle = open("example.fas")
+for seq_record in SeqIO.parse(handle, "fasta") :
+    print seq_record.description #The full title line as a string
+    print seq_record.seq.tostring() #The sequence as a string
+handle.close()
+
+
+
+"""
+from Bio import Seq
+from Bio import SeqRecord
+from Bio import Alphabet
+
+
+class Record:
+    """Holds information from a FASTA record.
+
+    Members:
+    title       Title line ('>' character not included).
+    sequence    The sequence.
+
+    """
+    def __init__(self, colwidth=60):
+        """__init__(self, colwidth=60)
+
+        Create a new Record. colwidth specifies the number of residues
+        to put on each line when generating FASTA format.
+
+        """
+        self.title = ''
+        self.sequence = ''
+        self._colwidth = colwidth
+
+    def __str__(self):
+        s = []
+        s.append('>%s' % self.title)
+        i = 0
+        while i < len(self.sequence):
+            s.append(self.sequence[i:i+self._colwidth])
+            i = i + self._colwidth
+        #Was having a problem getting the tests to pass on windows...
+        #return os.linesep.join(s)
+        return "\n".join(s)
+
+class Iterator:
+    """Returns one record at a time from a FASTA file.
+    """
+    def __init__(self, handle, parser = None, debug = 0):
+        """Initialize a new iterator.
+        """
+        self.handle = handle
+        self._parser = parser
+        self._debug = debug
+
+        #Skip any text before the first record (e.g. blank lines)
+        while True :
+            line = handle.readline()
+            if not line or line[0] == ">" :
+                break
+            if debug : print "Skipping: " + line
+        self._lookahead = line
+
+    def __iter__(self):
+        return iter(self.next, None)
+
+    def next(self):
+        """Return the next record in the file"""
+        line = self._lookahead
+        if not line:
+            return None
+        assert line[0]==">", line
+        lines = [line.rstrip()]
+        line = self.handle.readline()
+        while line:
+            if line[0] == ">": break
+            if line[0] == "#" :
+                if self._debug : print "Ignoring comment line"
+                pass
+            else :
+                lines.append(line.rstrip())
+            line = self.handle.readline()
+        self._lookahead = line
+        if self._debug : print "Debug: '%s' and '%s'" % (lines[0], "".join(lines[1:]))
+        if self._parser is None:
+            return "\n".join(lines)
+        else :
+            return self._parser.parse_string("\n".join(lines))
+
+class RecordParser:
+    """Parses FASTA sequence data into a Fasta.Record object.
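+
+    For example (a minimal usage sketch; the file name is illustrative):
+
+        parser = RecordParser()
+        record = parser.parse(open("example.fas"))
+        print record.title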
+ """ + def __init__(self, debug = 0): + pass + + def parse_string(self, text) : + text = text.replace("\r\n","\n") #Crude way of dealing with \r\n + assert text[0] == ">", text + text = text.split("\n>",1)[0] # Only do the first record if more than one + title, sequence = text.split("\n", 1) + title = title[1:] + rec = Record() + rec.title = title + rec.sequence = sequence.replace("\n","") + return rec + + def parse(self, handle): + return self.parse_string(handle.read()) + +class SequenceParser: + """Parses FASTA sequence data into a SeqRecord object. + """ + def __init__(self, alphabet = Alphabet.generic_alphabet, title2ids = None, + debug = 0): + """Initialize a Scanner and Sequence Consumer. + + Arguments: + o alphabet - The alphabet of the sequences to be parsed. If not + passed, this will be set as generic_alphabet. + o title2ids - A function that, when given the title of the FASTA + file (without the beginning >), will return the id, name and + description (in that order) for the record. If this is not given, + then the entire title line will be used as the description. + """ + self.alphabet = alphabet + self.title2ids = title2ids + + def parse_string(self, text) : + text = text.replace("\r\n","\n") #Crude way of dealing with \r\n + assert text[0] == ">", text + text = text.split("\n>",1)[0] # Only do the first record if more than one + title, sequence = text.split("\n", 1) + title = title[1:] + + seq = Seq.Seq(sequence.replace("\n",""), self.alphabet) + rec = SeqRecord.SeqRecord(seq) + + if self.title2ids: + seq_id, name, descr = self.title2ids(title) + rec.id = seq_id + rec.name = name + rec.description = descr + else: + rec.description = title + + return rec + + def parse(self, handle): + return self.parse_string(handle.read()) diff --git a/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.pyc b/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.pyc new file mode 100644 index 0000000..3634958 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Fasta/__init__.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/File.py b/binaries/src/globplot/biopython-1.50/Bio/File.py new file mode 100644 index 0000000..d616f42 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/File.py @@ -0,0 +1,180 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code for more fancy file handles. + + +Classes: +UndoHandle File object decorator with support for undo-like operations. +StringHandle Wraps a file object around a string. +SGMLHandle File object that automatically strips SGML tags from data. + +SGMLStripper Object that strips SGML. This is now considered OBSOLETE, and + is likely to be deprecated in a future release of Biopython, + and later removed. + +""" +import os +import StringIO +import sgmllib + +class UndoHandle: + """A Python handle that adds functionality for saving lines. + + Saves lines in a LIFO fashion. + + Added methods: + saveline Save a line to be returned next time. + peekline Peek at the next line without consuming it. 
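+
+    For example (a minimal sketch):
+
+        h = UndoHandle(open("example.txt"))
+        line = h.readline()
+        h.saveline(line)   # push the line back
+        assert h.readline() == line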
+
+    """
+    def __init__(self, handle):
+        self._handle = handle
+        self._saved = []
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        next = self.readline()
+        if not next:
+            raise StopIteration
+        return next
+
+    def readlines(self, *args, **keywds):
+        lines = self._saved + self._handle.readlines(*args,**keywds)
+        self._saved = []
+        return lines
+
+    def readline(self, *args, **keywds):
+        if self._saved:
+            line = self._saved.pop(0)
+        else:
+            line = self._handle.readline(*args,**keywds)
+        return line
+
+    def read(self, size=-1):
+        if size == -1:
+            saved = "".join(self._saved)
+            self._saved[:] = []
+        else:
+            saved = ''
+            while size > 0 and self._saved:
+                if len(self._saved[0]) <= size:
+                    size = size - len(self._saved[0])
+                    saved = saved + self._saved.pop(0)
+                else:
+                    saved = saved + self._saved[0][:size]
+                    self._saved[0] = self._saved[0][size:]
+                    size = 0
+        return saved + self._handle.read(size)
+
+    def saveline(self, line):
+        if line:
+            self._saved = [line] + self._saved
+
+    def peekline(self):
+        if self._saved:
+            line = self._saved[0]
+        else:
+            line = self._handle.readline()
+            self.saveline(line)
+        return line
+
+    def tell(self):
+        lengths = map(len, self._saved)
+        sum = reduce(lambda x, y: x+y, lengths, 0)
+        return self._handle.tell() - sum
+
+    def seek(self, *args):
+        self._saved = []
+        self._handle.seek(*args)
+
+    def __getattr__(self, attr):
+        return getattr(self._handle, attr)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._handle.close()
+
+
+# I could make this faster by using cStringIO.
+# However, cStringIO (in v1.52) does not implement the
+# readlines method.
+StringHandle = StringIO.StringIO
+
+
+
+class SGMLHandle:
+    """A Python handle that automatically strips SGML tags from data (OBSOLETE).
+
+    This module is now considered to be obsolete, and is likely to be
+    deprecated in a future release of Biopython, and later removed.
+    """
+    def __init__(self, handle):
+        """SGMLHandle(handle)
+
+        handle is a file handle to SGML-formatted data.
+
+        """
+        self._handle = handle
+        self._stripper = SGMLStripper()
+
+    def read(self, *args, **keywds):
+        data = self._handle.read(*args, **keywds)
+        return self._stripper.strip(data)
+
+    def readline(self, *args, **keywds):
+        line = self._handle.readline(*args, **keywds)
+        return self._stripper.strip(line)
+
+    def readlines(self, *args, **keywds):
+        lines = self._handle.readlines(*args, **keywds)
+        for i in range(len(lines)):
+            lines[i] = self._stripper.strip(lines[i])
+        return lines
+
+    def __getattr__(self, attr):
+        return getattr(self._handle, attr)
+
+
+class SGMLStripper:
+    class MyParser(sgmllib.SGMLParser):
+        def __init__(self):
+            sgmllib.SGMLParser.__init__(self)
+            self.data = ''
+        def handle_data(self, data):
+            self.data = self.data + data
+
+    def __init__(self):
+        self._parser = SGMLStripper.MyParser()
+
+    def strip(self, str):
+        """S.strip(str) -> string
+
+        Strip the SGML tags from str.
+
+        """
+        if not str: # empty string, don't do anything.
+            return ''
+        # I need to make sure that I don't return an empty string if
+        # the buffer is not empty. This can happen if there's a newline
+        # character embedded within a tag. Thus, I'll first check to
+        # see if the last character is a newline. If it is, and it's stripped
+        # away, I'll add it back.
+        is_newline = str[-1] in ['\n', '\r']
+
+        self._parser.data = '' # clear the parser's data (don't reset)
+        self._parser.feed(str)
+        if self._parser.data:
+            str = self._parser.data
+        elif is_newline:
+            str = '\n'
+        else:
+            str = ''
+        return str
+
diff --git a/binaries/src/globplot/biopython-1.50/Bio/File.pyc b/binaries/src/globplot/biopython-1.50/Bio/File.pyc
new file mode 100644
index 0000000..a68368c
Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/File.pyc differ
diff --git a/binaries/src/globplot/biopython-1.50/Bio/FilteredReader.py b/binaries/src/globplot/biopython-1.50/Bio/FilteredReader.py
new file mode 100644
index 0000000..ba2bb6f
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/FilteredReader.py
@@ -0,0 +1,152 @@
+# Copyright 2001 by Katharine Lindner. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Code for more fancy file handles (OBSOLETE).
+
+Classes:
+FilteredReader is a decorator for a file handle that allows the user to
+filter the output on a line by line basis.
+
+The FilteredReader module reads a file and applies a sequence of filters to
+the input. The constructor sets a default filter chain, but the user can
+select another filter chain by setting the filter_chain attribute:
+
+from Bio.FilteredReader import FilteredReader
+handle = open( "filename" )
+filtered_reader = FilteredReader( handle )
+filtered_reader.filter_chain = [ remove_asterisks, replace_dot_with_dash ]
+filtered_reader.read()
+
+All filters in the chain must provide the same interface, with a line of
+text as the single input parameter and the altered text as the return value.
+
+This module is now considered to be obsolete, and is likely to be deprecated
+in a future release of Biopython, and later removed.
+"""
+
+
+def dump_saved( name, text, j ):
+    """Used for debugging."""
+    dump_file = open( name + '%d' % j, "w" )
+    for i in range ( 0, len( text ), 80 ):
+        dump_file.write( '%s\n' % text[ i : i + 80 ] )
+    dump_file.close()
+
+def remove_leading_whitespace( line ):
+    return line.lstrip()
+
+
+def remove_empty_line( line ):
+    stripped_line = line.strip()
+    if( stripped_line ):
+        return line[ : ]
+    else:
+        return ''
+
+def remove_useless_dot( line ):
+    before = line
+    while( 1 ):
+        after = before.replace( "\t.\t", "\t\t" )
+        if( len( before ) == len( after ) ):
+            break
+        before = after
+    if( after.endswith( '.' ) ):
+        after = after[ :-1 ]
+    return after
+
+def fix_punctuation( line ):
+    line = line.replace( "'", '' )
+    line = line.replace( '"', '' )
+    line = line.replace( ';', '\t' )
+    line = line.replace( 'entryname', 'id' )
+#    line = line.lower( )
+    if( line ):
+        return line[ : ]
+    else:
+        return ''
+
+
+
+class FilteredReader:
+    def __init__(self, handle ):
+        self._handle = handle
+        self._start_line = ''
+        self._debug_count = 0
+        self.filter_chain = [ remove_empty_line, remove_useless_dot, fix_punctuation ]
+
+    def __getattr__(self, attr):
+        return getattr(self._handle, attr)
+
+
+
+    def close(self, *args, **keywds ):
+        return self._handle.close( *args, **keywds)
+
+    def read( self, *args, **keywds ):
+        line = ''
+        len_expected = self._get_len_expected( args, keywds )
+        if( len_expected ):
+            filtered_text = self.read_block( len_expected )
+        else:
+            filtered_text = self.read_to_end()
+        return filtered_text
+
+    def read_block( self, len_expected ):
+
+        len_filtered = 0
+        len_adjusted = len_expected - len( self._start_line )
+        filtered_text = ''
+        while( len_filtered < len_expected ):
+
+            text_read = self._handle.read( len_adjusted )
+            full_text = self._start_line + text_read
+            lines = full_text.splitlines( 1 )
+            if( text_read == '' ):
+                filtered_text = filtered_text + self.filter( lines )
+                break
+            else:
+                all_but_last_line = lines[ :-1 ]
+                self._start_line = lines[ -1 ]
+                filtered_text = filtered_text + self.filter( all_but_last_line )
+            # update progress so the loop can terminate
+            len_filtered = len( filtered_text )
+            len_adjusted = len_adjusted - len_filtered
+        return filtered_text[ : ]
+
+    def read_to_end( self ):
+        filtered_text = ''
+        text_read = self._handle.read()
+        full_text = self._start_line + text_read
+        lines = full_text.splitlines( 1 )
+        filtered_text += self.filter( lines[:] )
+        return filtered_text[ : ]
+
+    def _get_len_expected( self, args, keywds ):
+
+        if( len( args) > 0 ):
+            len_expected = args[ 0 ]
+            if( len_expected < 0 ):
+                len_expected = None
+        elif 'size' in keywds:
+            len_expected = keywds['size']
+        else:
+            len_expected = None
+        return len_expected
+
+    def filter( self, lines ):
+        filter_chain = self.filter_chain
+        filtered_text = ''
+        for line in lines:
+            for filter in filter_chain:
+                line = filter( *( line, ) )
+            filtered_text += line
+
+        return filtered_text
+
+def has_trailing_linefeed( line ):
+    if( line.endswith( chr( 13 ) ) or \
+        line.endswith( chr( 10 ) ) ):
+        return 1
+    else:
+        return 0
diff --git a/binaries/src/globplot/biopython-1.50/Bio/HotRand.py b/binaries/src/globplot/biopython-1.50/Bio/HotRand.py
new file mode 100644
index 0000000..d15a64f
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/HotRand.py
@@ -0,0 +1,77 @@
+# Copyright 2002 by Katharine Lindner. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Handles true random numbers supplied by the web server of fourmilab,
+based on atmospheric noise. The motivation is to support biosimulations
+that rely on random numbers.
+"""
+
+import urllib
+
+
+def hex_convert(text):
+    import warnings
+    warnings.warn("The function Bio.HotRand.hex_convert is deprecated.
Instead of Bio.HotRand.hex_convert(text), please use int(text, 16) instead", DeprecationWarning) + return int(text, 16) + +def byte_concat( text ): + val = 0 + numbytes = len( text ) + for i in range( 0, numbytes ): + val = val * 256 + val = val + ord( text[ i ] ) + + return val + +class HotCache: + + def __init__( self ): +# self.url = 'http://www.fourmilab.ch/cgi-bin/uncgi/Hotbits?num=5000&min=1&max=6&col=1' + self.url = 'http://www.random.org/cgi-bin/randbyte?' + self.query = { 'nbytes' : 128, 'fmt' : 'h' } + self.fill_hot_cache() + + def fill_hot_cache( self ): + url = self.url + urllib.urlencode( self.query ) + fh = urllib.urlopen( url ) + self.hot_cache = fh.read() + fh.close() + + def next_num( self, num_digits = 4 ): + cache = self.hot_cache + numbytes = num_digits / 2 + if( len( cache ) % numbytes != 0 ): + print 'len_cache is %d' % len( cache ) + raise ValueError + if( cache == '' ): + self.fill_hot_cache() + cache = self.hot_cache + hexdigits = cache[ :numbytes ] + self.hot_cache = cache[ numbytes: ] + return byte_concat( hexdigits ) + + + +class HotRandom: + + def __init__( self ): + self.hot_cache = HotCache( ) + + def hot_rand( self, high, low = 0 ): + span = high - low + val = self.hot_cache.next_num() + val = ( span * val ) >> 16 + val = val + low + return val + + +if( __name__ == '__main__' ): + hot_random = HotRandom() + for j in range ( 0, 130 ): + print hot_random.hot_rand( 25 ) + nums = [ '0000', 'abcd', '1234', '5555', '4321', 'aaaa', 'ffff' ] + for num in nums: + print hex_convert( num ) + + + diff --git a/binaries/src/globplot/biopython-1.50/Bio/Index.py b/binaries/src/globplot/biopython-1.50/Bio/Index.py new file mode 100644 index 0000000..4562f0d --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Index.py @@ -0,0 +1,142 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Index.py + +This module provides a way to create indexes to text files. + +Classes: +Index Dictionary-like class used to store index information. + +_ShelveIndex An Index class based on the shelve module. +_InMemoryIndex An in-memory Index class. + +""" +import os +import array +import cPickle +import shelve + +class _ShelveIndex(dict): + """An index file wrapped around shelve. + + """ + # Without a good dbm module installed, this is pretty slow and + # generates large files. When generating an index on a FASTA- + # formatted file with 82000 sequences (37Mb), the + # index 'dat' file is 42Mb and 'dir' file is 8Mb. + + __version = 2 + __version_key = '__version' + + def __init__(self, indexname, truncate=None): + dict.__init__(self) + try: + if truncate: + # In python 1.52 and before, dumbdbm (under shelve) + # doesn't clear the old database. + files = [indexname + '.dir', + indexname + '.dat', + indexname + '.bak' + ] + for file in files: + if os.path.exists(file): + os.unlink(file) + raise Exception("open a new shelf") + self.data = shelve.open(indexname, flag='r') + except: + # No database exists. + self.data = shelve.open(indexname, flag='n') + self.data[self.__version_key] = self.__version + else: + # Check to make sure the database is the correct version. 
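+            # (An unrecognized or mismatched version is reported as an
+            # IOError rather than silently reading an old format.)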
+ version = self.data.get(self.__version_key, None) + if version is None: + raise IOError("Unrecognized index format") + elif version != self.__version: + raise IOError("Version %s doesn't match my version %s" \ + % (version, self.__version)) + + def __del__(self): + if self.__dict__.has_key('data'): + self.data.close() + +class _InMemoryIndex(dict): + """This creates an in-memory index file. + + """ + # File Format: + # version + # key value + # [...] + + __version = 3 + __version_key = '__version' + + def __init__(self, indexname, truncate=None): + self._indexname = indexname + dict.__init__(self) + self.__changed = 0 # the index hasn't changed + + # Remove the database if truncate is true. + if truncate and os.path.exists(indexname): + os.unlink(indexname) + self.__changed = 1 + + # Load the database if it exists + if os.path.exists(indexname): + handle = open(indexname) + version = self._toobj(handle.readline().rstrip()) + if version != self.__version: + raise IOError("Version %s doesn't match my version %s" \ + % (version, self.__version)) + for line in handle: + key, value = line.split() + key, value = self._toobj(key), self._toobj(value) + self[key] = value + self.__changed = 0 + + def update(self, dict): + self.__changed = 1 + dict.update(self, dict) + def __setitem__(self, key, value): + self.__changed = 1 + dict.__setitem__(self, key, value) + def __delitem__(self, key): + self.__changed = 1 + dict.__delitem__(self, key) + def clear(self): + self.__changed = 1 + dict.clear(self) + + def __del__(self): + if self.__changed: + handle = open(self._indexname, 'w') + handle.write("%s\n" % self._tostr(self.__version)) + for key, value in self.items(): + handle.write("%s %s\n" % + (self._tostr(key), self._tostr(value))) + handle.close() + + def _tostr(self, obj): + # I need a representation of the object that's saveable to + # a file that uses whitespace as delimiters. Thus, I'm + # going to pickle the object, and then convert each character of + # the string to its ASCII integer value. Then, I'm going to convert + # the integers into strings and join them together with commas. + # It's not the most efficient way of storing things, but it's + # relatively fast. + s = cPickle.dumps(obj) + intlist = array.array('b', s) + strlist = map(str, intlist) + return ','.join(strlist) + + def _toobj(self, str): + intlist = map(int, str.split(',')) + intlist = array.array('b', intlist) + strlist = map(chr, intlist) + return cPickle.loads(''.join(strlist)) + +Index = _InMemoryIndex diff --git a/binaries/src/globplot/biopython-1.50/Bio/LogisticRegression.py b/binaries/src/globplot/biopython-1.50/Bio/LogisticRegression.py new file mode 100644 index 0000000..7bf13de --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/LogisticRegression.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python + +""" +This module provides code for doing logistic regressions. + + +Classes: +LogisticRegression Holds information for a LogisticRegression classifier. + + +Functions: +train Train a new classifier. +calculate Calculate the probabilities of each class, given an observation. +classify Classify an observation into a class. +""" + +#TODO - Remove this work around once we drop python 2.3 support +try: + set = set +except NameError: + from sets import Set as set + +#from numpy import * +#from numpy.linalg import * +import numpy +import numpy.linalg + +class LogisticRegression: + """Holds information necessary to do logistic regression + classification. + + Members: + beta List of the weights for each dimension. 
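+
+    The first weight is the constant (intercept) term; the remaining
+    weights correspond to the observation dimensions, in order.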
+ + """ + def __init__(self): + """LogisticRegression()""" + self.beta = [] + +def train(xs, ys, update_fn=None, typecode=None): + """train(xs, ys[, update_fn]) -> LogisticRegression + + Train a logistic regression classifier on a training set. xs is a + list of observations and ys is a list of the class assignments, + which should be 0 or 1. xs and ys should contain the same number + of elements. update_fn is an optional callback function that + takes as parameters that iteration number and log likelihood. + + """ + if len(xs) != len(ys): + raise ValueError("xs and ys should be the same length.") + classes = set(ys) + if classes != set([0, 1]): + raise ValueError("Classes should be 0's and 1's") + if typecode is None: + typecode = 'd' + + # Dimensionality of the data is the dimensionality of the + # observations plus a constant dimension. + N, ndims = len(xs), len(xs[0]) + 1 + if N==0 or ndims==1: + raise ValueError("No observations or observation of 0 dimension.") + + # Make an X array, with a constant first dimension. + X = numpy.ones((N, ndims), typecode) + X[:, 1:] = xs + Xt = numpy.transpose(X) + y = numpy.asarray(ys, typecode) + + # Initialize the beta parameter to 0. + beta = numpy.zeros(ndims, typecode) + + MAX_ITERATIONS = 500 + CONVERGE_THRESHOLD = 0.01 + stepsize = 1.0 + # Now iterate using Newton-Raphson until the log-likelihoods + # converge. + iter = 0 + old_beta = old_llik = None + while iter < MAX_ITERATIONS: + # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X)) + ebetaX = numpy.exp(numpy.dot(beta, Xt)) + p = ebetaX / (1+ebetaX) + + # Find the log likelihood score and see if I've converged. + logp = y*numpy.log(p) + (1-y)*numpy.log(1-p) + llik = sum(logp) + if update_fn is not None: + update_fn(iter, llik) + # Check to see if the likelihood decreased. If it did, then + # restore the old beta parameters and half the step size. + if llik < old_llik: + stepsize = stepsize / 2.0 + beta = old_beta + # If I've converged, then stop. + if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD: + break + old_llik, old_beta = llik, beta + iter += 1 + + W = numpy.identity(N) * p + Xtyp = numpy.dot(Xt, y-p) # Calculate the first derivative. + XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative. + #u, s, vt = singular_value_decomposition(XtWX) + #print "U", u + #print "S", s + delta = numpy.linalg.solve(XtWX, Xtyp) + if numpy.fabs(stepsize-1.0) > 0.001: + delta = delta * stepsize + beta = beta + delta # Update beta. + else: + raise RuntimeError("Didn't converge.") + + lr = LogisticRegression() + lr.beta = map(float, beta) # Convert back to regular array. + return lr + +def calculate(lr, x): + """calculate(lr, x) -> list of probabilities + + Calculate the probability for each class. lr is a + LogisticRegression object. x is the observed data. Returns a + list of the probability that it fits each class. + + """ + # Insert a constant term for x. + x = numpy.asarray([1.0] + x) + # Calculate the probability. p = e^(beta X) / (1+e^(beta X)) + ebetaX = numpy.exp(numpy.dot(lr.beta, x)) + p = ebetaX / (1+ebetaX) + return [1-p, p] + +def classify(lr, x): + """classify(lr, x) -> 1 or 0 + + Classify an observation into a class. 
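+
+    Returns 1 when the calculated probability of class 1 is at least 0.5,
+    and 0 otherwise.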
+ + """ + probs = calculate(lr, x) + if probs[0] > probs[1]: + return 0 + return 1 diff --git a/binaries/src/globplot/biopython-1.50/Bio/ParserSupport.py b/binaries/src/globplot/biopython-1.50/Bio/ParserSupport.py new file mode 100644 index 0000000..88e9a23 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/ParserSupport.py @@ -0,0 +1,426 @@ +# Copyright 1999 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Code to support writing parsers. + + + +Classes: +AbstractParser Base class for parsers. +AbstractConsumer Base class of all Consumers. +TaggingConsumer Consumer that tags output with its event. For debugging +SGMLStrippingConsumer Consumer that strips SGML tags from output. +EventGenerator Generate Biopython Events from Martel XML output + (note that Martel is now DEPRECATED) + +Functions: +safe_readline Read a line from a handle, with check for EOF. +safe_peekline Peek at next line, with check for EOF. +read_and_call Read a line from a handle and pass it to a method. +read_and_call_while Read many lines, as long as a condition is met. +read_and_call_until Read many lines, until a condition is met. +attempt_read_and_call Like read_and_call, but forgiving of errors. +is_blank_line Test whether a line is blank. + +""" + +import sys +import traceback +from types import * + +from Bio import File + +# XML from python 2.0 +try: + from xml.sax import handler + xml_support = 1 +except ImportError: + sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" + + "This causes problems with some ParserSupport modules\n") + xml_support = 0 + +class AbstractParser: + """Base class for other parsers. + + """ + def parse(self, handle): + raise NotImplementedError("Please implement in a derived class") + + def parse_str(self, string): + return self.parse(File.StringHandle(string)) + + def parse_file(self, filename): + h = open(filename) + try: + retval = self.parse(h) + finally: + h.close() + return retval + +class AbstractConsumer: + """Base class for other Consumers. + + Derive Consumers from this class and implement appropriate + methods for each event that you want to receive. + + """ + def _unhandled_section(self): + pass + def _unhandled(self, data): + pass + def __getattr__(self, attr): + if attr[:6] == 'start_' or attr[:4] == 'end_': + method = self._unhandled_section + else: + method = self._unhandled + return method + +class TaggingConsumer(AbstractConsumer): + """A Consumer that tags the data stream with the event and + prints it to a handle. Useful for debugging. + + """ + def __init__(self, handle=None, colwidth=15, maxwidth=80): + """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)""" + # I can't assign sys.stdout to handle in the argument list. + # If I do that, handle will be assigned the value of sys.stdout + # the first time this function is called. This will fail if + # the user has assigned sys.stdout to some other file, which may + # be closed or invalid at a later time. + if handle is None: + handle = sys.stdout + self._handle = handle + self._colwidth = colwidth + self._maxwidth = maxwidth + + def unhandled_section(self): + self._print_name('unhandled_section') + + def unhandled(self, data): + self._print_name('unhandled', data) + + def _print_name(self, name, data=None): + if data is None: + # Write the name of a section. 
+ self._handle.write("%s %s\n" % ("*"*self._colwidth, name)) + else: + # Write the tag and line. + self._handle.write("%-*s: %s\n" % ( + self._colwidth, name[:self._colwidth], + data[:self._maxwidth-self._colwidth-2].rstrip())) + + def __getattr__(self, attr): + if attr[:6] == 'start_' or attr[:4] == 'end_': + method = lambda a=attr, s=self: s._print_name(a) + else: + method = lambda x, a=attr, s=self: s._print_name(a, x) + return method + +class SGMLStrippingConsumer: + """A consumer that strips off SGML tags. + + This is meant to be used as a decorator for other consumers. + + """ + def __init__(self, consumer): + if type(consumer) is not InstanceType: + raise ValueError("consumer should be an instance") + self._consumer = consumer + self._prev_attr = None + self._stripper = File.SGMLStripper() + + def _apply_clean_data(self, data): + clean = self._stripper.strip(data) + self._prev_attr(clean) + + def __getattr__(self, name): + if name in ['_prev_attr', '_stripper']: + return getattr(self, name) + attr = getattr(self._consumer, name) + # If this is not a method, then return it as is. + if type(attr) is not MethodType: + return attr + # If it's a section method, then return it. + if name[:6] == 'start_' or name[:4] == 'end_': + return attr + # Otherwise, it's an info event, and return my method. + self._prev_attr = attr + return self._apply_clean_data + +# onle use the Event Generator if XML handling is okay +if xml_support: + class EventGenerator(handler.ContentHandler): + """Handler to generate events associated with a Martel parsed file. + + This acts like a normal SAX handler, and accepts XML generated by + Martel during parsing. These events are then converted into + 'Biopython events', which can then be caught by a standard + biopython consumer. + + Note that Martel is now DEPRECATED. + """ + def __init__(self, consumer, interest_tags, callback_finalizer = None, + exempt_tags = []): + """Initialize to begin catching and firing off events. + + Arguments: + o consumer - The consumer that we'll send Biopython events to. + + o interest_tags - A listing of all the tags we are interested in. + + o callback_finalizer - A function to deal with the collected + information before passing it on to the consumer. By default + the collected information is a list of all of the lines read + for a particular tag -- if there are multiple tags in a row + like: + + Spam + More Spam + + In this case the list of information would be: + + ['Spam', 'More Spam'] + + This list of lines will be passed to the callback finalizer if + it is present. Otherwise the consumer will be called with the + list of content information. + + o exempt_tags - A listing of particular tags that are exempt from + being processed by the callback_finalizer. This allows you to + use a finalizer to deal with most tags, but leave those you don't + want touched. + """ + self._consumer = consumer + self.interest_tags = interest_tags + self._finalizer = callback_finalizer + self._exempt_tags = exempt_tags + + # a dictionary of content for each tag of interest + # the information for each tag is held as a list of the lines. + # This allows us to collect information from multiple tags + # in a row, and return it all at once. + self.info = {} + for tag in self.interest_tags: + self.info[tag] = [] + + # the previous tag we were collecting information for. + # We set a delay in sending info to the consumer so that we can + # collect a bunch of tags in a row and append all of the info + # together. 
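+        # (The delayed flush happens in endElement, when a different tag
+        # arrives, and in endDocument for the final tag.)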
+ self._previous_tag = '' + + # the current character information for a tag + self._cur_content = [] + # whether we should be collecting information + self._collect_characters = 0 + + def startElement(self, name, attrs): + """Determine if we should collect characters from this tag. + """ + if name in self.interest_tags: + self._collect_characters = 1 + + def characters(self, content): + """Extract the information if we are interested in it. + """ + if self._collect_characters: + self._cur_content.append(content) + + def endElement(self, name): + """Send the information to the consumer. + + Once we've got the end element we've collected up all of the + character information we need, and we need to send this on to + the consumer to do something with it. + + We have a delay of one tag on doing this, so that we can collect + all of the info from multiple calls to the same element at once. + """ + # only deal with the tag if it is something we are + # interested in and potentially have information for + if self._collect_characters: + # add all of the information collected inside this tag + self.info[name].append("".join(self._cur_content)) + # reset our information and flags + self._cur_content = [] + self._collect_characters = 0 + + # if we are at a new tag, pass on the info from the last tag + if self._previous_tag and self._previous_tag != name: + self._make_callback(self._previous_tag) + + # set this tag as the next to be passed + self._previous_tag = name + + def _make_callback(self, name): + """Call the callback function with the info with the given name. + """ + # strip off whitespace and call the consumer + callback_function = getattr(self._consumer, name) + + # --- pass back the information + # if there is a finalizer, use that + if self._finalizer is not None and name not in self._exempt_tags: + info_to_pass = self._finalizer(self.info[name]) + # otherwise pass back the entire list of information + else: + info_to_pass = self.info[name] + + callback_function(info_to_pass) + + # reset the information for the tag + self.info[name] = [] + + def endDocument(self): + """Make sure all of our information has been passed. + + This just flushes out any stored tags that need to be passed. + """ + if self._previous_tag: + self._make_callback(self._previous_tag) + +def read_and_call(uhandle, method, **keywds): + """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re]) + + Read a line from uhandle, check it, and pass it to the method. + Raises a ValueError if the line does not pass the checks. + + start, end, contains, blank, and has_re specify optional conditions + that the line must pass. start and end specifies what the line must + begin or end with (not counting EOL characters). contains + specifies a substring that must be found in the line. If blank + is a true value, then the line must be blank. has_re should be + a regular expression object with a pattern that the line must match + somewhere. + + """ + line = safe_readline(uhandle) + errmsg = _fails_conditions(*(line,), **keywds) + if errmsg is not None: + raise ValueError(errmsg) + method(line) + +def read_and_call_while(uhandle, method, **keywds): + """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines + + Read a line from uhandle and pass it to the method as long as + some condition is true. Returns the number of lines that were read. + + See the docstring for read_and_call for a description of the parameters. 
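+    e.g. to consume a run of comment lines (an illustrative sketch;
+    assumes the consumer object provides a 'comment' method):
+
+        nlines = read_and_call_while(uhandle, consumer.comment, start='#')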
+ + """ + nlines = 0 + while 1: + line = safe_readline(uhandle) + # If I've failed the condition, then stop reading the line. + if _fails_conditions(*(line,), **keywds): + uhandle.saveline(line) + break + method(line) + nlines = nlines + 1 + return nlines + +def read_and_call_until(uhandle, method, **keywds): + """read_and_call_until(uhandle, method, + start=None, end=None, contains=None, blank=None) -> number of lines + + Read a line from uhandle and pass it to the method until + some condition is true. Returns the number of lines that were read. + + See the docstring for read_and_call for a description of the parameters. + + """ + nlines = 0 + while 1: + line = safe_readline(uhandle) + # If I've met the condition, then stop reading the line. + if not _fails_conditions(*(line,), **keywds): + uhandle.saveline(line) + break + method(line) + nlines = nlines + 1 + return nlines + +def attempt_read_and_call(uhandle, method, **keywds): + """attempt_read_and_call(uhandle, method, **keywds) -> boolean + + Similar to read_and_call, but returns a boolean specifying + whether the line has passed the checks. Does not raise + exceptions. + + See docs for read_and_call for a description of the function + arguments. + + """ + line = safe_readline(uhandle) + passed = not _fails_conditions(*(line,), **keywds) + if passed: + method(line) + else: + uhandle.saveline(line) + return passed + +def _fails_conditions(line, start=None, end=None, contains=None, blank=None, + has_re=None): + if start is not None: + if line[:len(start)] != start: + return "Line does not start with '%s':\n%s" % (start, line) + if end is not None: + if line.rstrip()[-len(end):] != end: + return "Line does not end with '%s':\n%s" % (end, line) + if contains is not None: + if line.find(contains) == -1: + return "Line does not contain '%s':\n%s" % (contains, line) + if blank is not None: + if blank: + if not is_blank_line(line): + return "Expected blank line, but got:\n%s" % line + else: + if is_blank_line(line): + return "Expected non-blank line, but got a blank one" + if has_re is not None: + if has_re.search(line) is None: + return "Line does not match regex '%s':\n%s" % ( + has_re.pattern, line) + return None + +def is_blank_line(line, allow_spaces=0): + """is_blank_line(line, allow_spaces=0) -> boolean + + Return whether a line is blank. allow_spaces specifies whether to + allow whitespaces in a blank line. A true value signifies that a + line containing whitespaces as well as end-of-line characters + should be considered blank. + + """ + if not line: + return 1 + if allow_spaces: + return line.rstrip() == '' + return line[0] == '\n' or line[0] == '\r' + +def safe_readline(handle): + """safe_readline(handle) -> line + + Read a line from an UndoHandle and return it. If there are no more + lines to read, I will raise a ValueError. + + """ + line = handle.readline() + if not line: + raise ValueError("Unexpected end of stream.") + return line + +def safe_peekline(handle): + """safe_peekline(handle) -> line + + Peek at the next line in an UndoHandle and return it. If there are no + more lines to peek, I will raise a ValueError. 
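+
+    e.g. (illustrative) choosing a code path from the upcoming line
+    without consuming it:
+
+        line = safe_peekline(uhandle)
+        is_header = line[:6] == 'BLASTN'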
+ + """ + line = handle.peekline() + if not line: + raise ValueError("Unexpected end of stream.") + return line diff --git a/binaries/src/globplot/biopython-1.50/Bio/Parsers/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/Parsers/__init__.py new file mode 100644 index 0000000..53f3d1b --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Parsers/__init__.py @@ -0,0 +1,2 @@ +"""Third party and other parsers useful internally to Biopython. +""" diff --git a/binaries/src/globplot/biopython-1.50/Bio/Parsers/spark.py b/binaries/src/globplot/biopython-1.50/Bio/Parsers/spark.py new file mode 100644 index 0000000..be547e7 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Parsers/spark.py @@ -0,0 +1,565 @@ +# Copyright (c) 1998-2000 John Aycock +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__version__ = 'SPARK-0.6.1' + +import re +import sys + +def _namelist(instance): + namelist, namedict, classlist = [], {}, [instance.__class__] + for c in classlist: + for b in c.__bases__: + classlist.append(b) + for name in dir(c): + if name not in namedict: + namelist.append(name) + namedict[name] = 1 + return namelist + +class GenericScanner: + def __init__(self): + pattern = self.reflect() + self.re = re.compile(pattern, re.VERBOSE) + + self.index2func = {} + for name, number in self.re.groupindex.items(): + self.index2func[number-1] = getattr(self, 't_' + name) + + def makeRE(self, name): + doc = getattr(self, name).__doc__ + rv = '(?P<%s>%s)' % (name[2:], doc) + return rv + + def reflect(self): + rv = [] + for name in _namelist(self): + if name[:2] == 't_' and name != 't_default': + rv.append(self.makeRE(name)) + + rv.append(self.makeRE('t_default')) + return '|'.join(rv) + + def error(self, s, pos): + print "Lexical error at position %s" % pos + raise SystemExit + + def tokenize(self, s): + pos = 0 + n = len(s) + while pos < n: + m = self.re.match(s, pos) + if m is None: + self.error(s, pos) + + groups = m.groups() + for i in range(len(groups)): + if groups[i] and i in self.index2func: + self.index2func[i](groups[i]) + pos = m.end() + + def t_default(self, s): + r'( . | \n )+' + pass + +class GenericParser: + def __init__(self, start): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + self.startRule = self.augment(start) + self.ruleschanged = 1 + + _START = 'START' + _EOF = 'EOF' + + # + # A hook for GenericASTBuilder and GenericASTMatcher. 
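+    # Subclasses override preprocess() to rewrite each (rule, func)
+    # pair before it is stored; e.g. GenericASTBuilder rebinds func
+    # so that every reduction constructs an AST node instead.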
+ # + def preprocess(self, rule, func): return rule, func + + def addRule(self, doc, func): + rules = doc.split() + + index = [] + for i in range(len(rules)): + if rules[i] == '::=': + index.append(i-1) + index.append(len(rules)) + + for i in range(len(index)-1): + lhs = rules[index[i]] + rhs = rules[index[i]+2:index[i+1]] + rule = (lhs, tuple(rhs)) + + rule, fn = self.preprocess(rule, func) + + if lhs in self.rules: + self.rules[lhs].append(rule) + else: + self.rules[lhs] = [ rule ] + self.rule2func[rule] = fn + self.rule2name[rule] = func.__name__[2:] + self.ruleschanged = 1 + + def collectRules(self): + for name in _namelist(self): + if name[:2] == 'p_': + func = getattr(self, name) + doc = func.__doc__ + self.addRule(doc, func) + + def augment(self, start): + # + # Tempting though it is, this isn't made into a call + # to self.addRule() because the start rule shouldn't + # be subject to preprocessing. + # + startRule = (self._START, ( start, self._EOF )) + self.rule2func[startRule] = lambda args: args[0] + self.rules[self._START] = [ startRule ] + self.rule2name[startRule] = '' + return startRule + + def makeFIRST(self): + union = {} + self.first = {} + + for rulelist in self.rules.values(): + for lhs, rhs in rulelist: + if lhs not in self.first: + self.first[lhs] = {} + + if len(rhs) == 0: + self.first[lhs][None] = 1 + continue + + sym = rhs[0] + if sym not in self.rules: + self.first[lhs][sym] = 1 + else: + union[(sym, lhs)] = 1 + changes = 1 + while changes: + changes = 0 + for src, dest in union.keys(): + destlen = len(self.first[dest]) + self.first[dest].update(self.first[src]) + if len(self.first[dest]) != destlen: + changes = 1 + + # + # An Earley parser, as per J. Earley, "An Efficient Context-Free + # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, + # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, + # Carnegie-Mellon University, August 1968, p. 27. + # + + def typestring(self, token): + return None + + def error(self, token): + print "Syntax error at or near `%s' token" % token + raise SystemExit + + def parse(self, tokens): + tree = {} + tokens.append(self._EOF) + states = { 0: [ (self.startRule, 0, 0) ] } + + if self.ruleschanged: + self.makeFIRST() + + for i in xrange(len(tokens)): + states[i+1] = [] + + if states[i] == []: + break + self.buildState(tokens[i], states, i, tree) + + #_dump(tokens, states) + + if i < len(tokens)-1 or states[i+1] != [(self.startRule, 2, 0)]: + del tokens[-1] + self.error(tokens[i-1]) + rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i+1)) + del tokens[-1] + return rv + + def buildState(self, token, states, i, tree): + needsCompletion = {} + state = states[i] + predicted = {} + + for item in state: + rule, pos, parent = item + lhs, rhs = rule + + # + # A -> a . (completer) + # + if pos == len(rhs): + if len(rhs) == 0: + needsCompletion[lhs] = (item, i) + + for pitem in states[parent]: + if pitem is item: + break + + prule, ppos, pparent = pitem + plhs, prhs = prule + + if prhs[ppos:ppos+1] == (lhs,): + new = (prule, + ppos+1, + pparent) + if new not in state: + state.append(new) + tree[(new, i)] = [(item, i)] + else: + tree[(new, i)].append((item, i)) + continue + + nextSym = rhs[pos] + + # + # A -> a . B (predictor) + # + if nextSym in self.rules: + # + # Work on completer step some more; for rules + # with empty RHS, the "parent state" is the + # current state we're adding Earley items to, + # so the Earley items the completer step needs + # may not all be present when it runs. 
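+                    # needsCompletion maps each such nullable nonterminal
+                    # to its completed item, so the predictor can finish
+                    # the completion below once the parent item appears.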
+ # + if nextSym in needsCompletion: + new = (rule, pos+1, parent) + olditem_i = needsCompletion[nextSym] + if new not in state: + state.append(new) + tree[(new, i)] = [olditem_i] + else: + tree[(new, i)].append(olditem_i) + + # + # Has this been predicted already? + # + if nextSym in predicted: + continue + predicted[nextSym] = 1 + + ttype = token is not self._EOF and \ + self.typestring(token) or \ + None + if ttype is not None: + # + # Even smarter predictor, when the + # token's type is known. The code is + # grungy, but runs pretty fast. Three + # cases are looked for: rules with + # empty RHS; first symbol on RHS is a + # terminal; first symbol on RHS is a + # nonterminal (and isn't nullable). + # + for prule in self.rules[nextSym]: + new = (prule, 0, i) + prhs = prule[1] + if len(prhs) == 0: + state.append(new) + continue + prhs0 = prhs[0] + if prhs0 not in self.rules: + if prhs0 != ttype: + continue + else: + state.append(new) + continue + first = self.first[prhs0] + if None not in first and \ + ttype not in first: + continue + state.append(new) + continue + + for prule in self.rules[nextSym]: + # + # Smarter predictor, as per Grune & + # Jacobs' _Parsing Techniques_. Not + # as good as FIRST sets though. + # + prhs = prule[1] + if len(prhs) > 0 and \ + prhs[0] not in self.rules and \ + token != prhs[0]: + continue + state.append((prule, 0, i)) + + # + # A -> a . c (scanner) + # + elif token == nextSym: + #assert new not in states[i+1] + states[i+1].append((rule, pos+1, parent)) + + def buildTree(self, tokens, tree, root): + stack = [] + self.buildTree_r(stack, tokens, -1, tree, root) + return stack[0] + + def buildTree_r(self, stack, tokens, tokpos, tree, root): + (rule, pos, parent), state = root + + while pos > 0: + want = ((rule, pos, parent), state) + if want not in tree: + # + # Since pos > 0, it didn't come from closure, + # and if it isn't in tree[], then there must + # be a terminal symbol to the left of the dot. + # (It must be from a "scanner" step.) + # + pos = pos - 1 + state = state - 1 + stack.insert(0, tokens[tokpos]) + tokpos = tokpos - 1 + else: + # + # There's a NT to the left of the dot. + # Follow the tree pointer recursively (>1 + # tree pointers from it indicates ambiguity). + # Since the item must have come about from a + # "completer" step, the state where the item + # came from must be the parent state of the + # item the tree pointer points to. + # + children = tree[want] + if len(children) > 1: + child = self.ambiguity(children) + else: + child = children[0] + + tokpos = self.buildTree_r(stack, + tokens, tokpos, + tree, child) + pos = pos - 1 + (crule, cpos, cparent), cstate = child + state = cparent + + lhs, rhs = rule + result = self.rule2func[rule](stack[:len(rhs)]) + stack[:len(rhs)] = [result] + return tokpos + + def ambiguity(self, children): + # + # XXX - problem here and in collectRules() if the same + # rule appears in >1 method. But in that case the + # user probably gets what they deserve :-) Also + # undefined results if rules causing the ambiguity + # appear in the same method. + # + sortlist = [] + name2index = {} + for i in range(len(children)): + ((rule, pos, parent), index) = children[i] + lhs, rhs = rule + name = self.rule2name[rule] + sortlist.append((len(rhs), name)) + name2index[name] = i + sortlist.sort() + list = map(lambda (a,b): b, sortlist) + return children[name2index[self.resolve(list)]] + + def resolve(self, list): + # + # Resolve ambiguity in favor of the shortest RHS. 
+ # Since we walk the tree from the top down, this + # should effectively resolve in favor of a "shift". + # + return list[0] + +# +# GenericASTBuilder automagically constructs a concrete/abstract syntax tree +# for a given input. The extra argument is a class (not an instance!) +# which supports the "__setslice__" and "__len__" methods. +# +# XXX - silently overrides any user code in methods. +# + +class GenericASTBuilder(GenericParser): + def __init__(self, AST, start): + GenericParser.__init__(self, start) + self.AST = AST + + def preprocess(self, rule, func): + rebind = lambda lhs, self=self: \ + lambda args, lhs=lhs, self=self: \ + self.buildASTNode(args, lhs) + lhs, rhs = rule + return rule, rebind(lhs) + + def buildASTNode(self, args, lhs): + children = [] + for arg in args: + if isinstance(arg, self.AST): + children.append(arg) + else: + children.append(self.terminal(arg)) + return self.nonterminal(lhs, children) + + def terminal(self, token): return token + + def nonterminal(self, type, args): + rv = self.AST(type) + rv[:len(args)] = args + return rv + +# +# GenericASTTraversal is a Visitor pattern according to Design Patterns. For +# each node it attempts to invoke the method n_, falling +# back onto the default() method if the n_* can't be found. The preorder +# traversal also looks for an exit hook named n__exit (no default +# routine is called if it's not found). To prematurely halt traversal +# of a subtree, call the prune() method -- this only makes sense for a +# preorder traversal. Node type is determined via the typestring() method. +# + +class GenericASTTraversalPruningException: + pass + +class GenericASTTraversal: + def __init__(self, ast): + self.ast = ast + + def typestring(self, node): + return node.type + + def prune(self): + raise GenericASTTraversalPruningException + + def preorder(self, node=None): + if node is None: + node = self.ast + + try: + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + except GenericASTTraversalPruningException: + return + + for kid in node: + self.preorder(kid) + + name = name + '_exit' + if hasattr(self, name): + func = getattr(self, name) + func(node) + + def postorder(self, node=None): + if node is None: + node = self.ast + + for kid in node: + self.postorder(kid) + + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + + + def default(self, node): + pass + +# +# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" +# implemented. +# +# XXX - makes assumptions about how GenericParser walks the parse tree. 
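+# (match() flattens the AST into a token stream of nodes and
+# parentheses via match_r(); the stream is built in reverse, which
+# is why preprocess() reverses each pattern's RHS before the same
+# Earley parser is reused on it.)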
+# + +class GenericASTMatcher(GenericParser): + def __init__(self, start, ast): + GenericParser.__init__(self, start) + self.ast = ast + + def preprocess(self, rule, func): + rebind = lambda func, self=self: \ + lambda args, func=func, self=self: \ + self.foundMatch(args, func) + lhs, rhs = rule + rhslist = list(rhs) + rhslist.reverse() + + return (lhs, tuple(rhslist)), rebind(func) + + def foundMatch(self, args, func): + func(args[-1]) + return args[-1] + + def match_r(self, node): + self.input.insert(0, node) + children = 0 + + for child in node: + if children == 0: + self.input.insert(0, '(') + children = children + 1 + self.match_r(child) + + if children > 0: + self.input.insert(0, ')') + + def match(self, ast=None): + if ast is None: + ast = self.ast + self.input = [] + + self.match_r(ast) + self.parse(self.input) + + def resolve(self, list): + # + # Resolve ambiguity in favor of the longest RHS. + # + return list[-1] + +def _dump(tokens, states): + for i in range(len(states)): + print 'state', i + for (lhs, rhs), pos, parent in states[i]: + print '\t', lhs, '::=', + print ' '.join(rhs[:pos]), + print '.', + print ' '.join(rhs[pos:]), + print ',', parent + if i < len(tokens): + print + print 'token', str(tokens[i]) + print diff --git a/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.py b/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.py new file mode 100644 index 0000000..05c27f7 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.py @@ -0,0 +1,83 @@ +# Stores properties associated with the class of an object. + + +# Would it be nice to have support for more than one resolver per +# class? In the meanwhile, they could collude using a dispatch +# object. + +# Do you need access to the actual resolver? + +# Resolvers get the sequence because they may do a per-object lookup. + +# Could cache search results for better performance. + + +# Dictionary which creates dictionary elements, so lookups never fail. +# The new elements are always dictionaries. +class CreateDict(dict): + def __getitem__(self, key): + return self.setdefault(key,{}) + +class PropertyManager: + def __init__(self): + self.class_property = CreateDict() + self.class_property_resolver = CreateDict() + self.class_resolver = {} + + def resolve(self, obj, property): + try: + klass = obj.__class__ + except AttributeError: + raise KeyError("built-in instance") + + return self.resolve_class(klass, property) + + def resolve_class(self, klass, property): + # Hopefully, we'll find the hit right away + try: + return self.class_property[klass][property] + except KeyError: + pass + + # Is there a property resolver? + try: + return self.class_property_resolver[klass][property]( + self, klass, property) + except KeyError: + pass + + # What about the class resolver? + try: + return self.class_resolver[klass](self, klass, property) + except KeyError: + pass + + # That failed, so we walk up the class tree, depth-first and + # left-to-right (same as Python). For each class, check if + # the property exists, then check if the property resolver + # exists, and finally, check for the class resolver. 
+ + bases = list(klass.__bases__) + while bases: + base = bases.pop() + try: + return self.class_property[base][property] + except KeyError: + pass + try: + return self.class_property_resolver[base][property]( + self, klass, property) + except KeyError: + pass + try: + return self.class_resolver[base](self, klass, property) + except KeyError: + pass + + # this is why the search is depth-first/right-left + bases[:0] = list(base.__bases__) + raise KeyError("cannot find property %s for class %s" \ + % (property, klass)) + + +default_manager = PropertyManager() diff --git a/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.pyc b/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.pyc new file mode 100644 index 0000000..2581ef5 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/PropertyManager.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/Search.py b/binaries/src/globplot/biopython-1.50/Bio/Search.py new file mode 100644 index 0000000..181f0ba --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Search.py @@ -0,0 +1,153 @@ +# BLASTN 2.0a19MP-WashU [05-Feb-1998] [Build decunix3.2 01:53:29 05-Feb-1998] +# BLASTP 2.0.4 [Feb-24-1998] +class Algorithm: + def __init__(self, name, version, description = ""): + self.name = name # 'blastx', 'blastn', etc. + self.version = version # '2.1.2' or '2.0a19MP-WashU' + self.description = description # '[05-Feb-1998] [Build dec ...1998]' + +# Query= YAL001C YAL001C, Chr I from 147596 to 147665, and 147756 to 151168, +# reverse complement +# (3483 letters) +class Query: + def __init__(self, name, accession, description, length): + self.name = name # 'YAL001C' + self.accession = accession # or None if missing + self.description = description # 'YAL001C, Chr I from 147596 to ... ' + self.length = length # 3483 + +# Database: ArabidopsisN +# 66,211 sequences; 69,074,155 total letters. +class Database: + def __init__(self, name, letters, entries): + self.name = name # ArabidopsisN + self.letters = letters # 69074155 + self.entries = entries # 66211 + +class TableInfo: + def __init__(self, full_description, info): + self.__dict__.update(info) + self.full_description = full_description + + +class Search: + def __init__(self, algorithm, query, database, table, hits, + parameters, statistics): + self.algorithm = algorithm + self.query = query + self.database = database + self.table = table + self.hits = hits + self.parameters = parameters + self.statistics = statistics + +class Hit: + def __init__(self, name, description, accession, length, + algorithm, hsps = None): + self.name = name + self.description = description + self.accession = accession + self.length = length + self.algorithm = algorithm + if hsps is None: + hsps = [] + self.hsps = hsps + + def __len__(self): + return self.length + + + +# >GB_PL:ATF18F4 AL021637 Arabidopsis thaliana DNA chromosome 4, BAC clone +# F18F4 (ESSAII project). 2/98 +# Length = 93,646 +# +# Minus Strand HSPs: +# +# Score = 226 (33.9 bits), Expect = 0.80, P = 0.55 +# Identities = 98/142 (69%), Positives = 98/142 (69%), Strand = Minus / Plus +# [...lines deleted...] 
+# Query: 2486 ATATCAAGCAATTTGATAAGATCTAG 2461 +# A AT A C ATT GA AAGATC AG +# Sbjct: 85387 AGATTTACCTATT-GAGAAGATCAAG 85411 + +# computed from the strings +class _SeqLength: + def __init__(self, length, identical, positives, gaps): + self.length = length + self.identical = identical + self.positives = positives + self.gaps = gaps + def __len__(self): + return self.length + def __getattr__(self, name): + if name == "frac_identical": + return float(self.identical) / self.length + elif name == "frac_positives": + return float(self.positives) / self.length + raise AttributeError(name) + + +class HomologySeq(_SeqLength): + def __init__(self, seq, identical, positives, gaps): + _SeqLength.__init__(self, len(seq), identical, positives, gaps) + self.seq = seq + +class HSPSeq(_SeqLength): + def __init__(self, name, seq, location, identical, positives, gaps): + _SeqLength.__init__(self, len(seq), identical, positives, gaps) + self.name = name + self.seq = seq + self.location = location + + +class HSP(_SeqLength): + def __init__(self, + query_seq, # ATATCAAGCAATTTGATAAGATCTAG + homology_seq, # A AT A C ATT GA AAGATC AG + subject_seq, # AGATTTACCTATT-GAGAAGATCAAG + + query_location, # (2486, 2461, negative strand) + subject_location, # (85387, 85411) + + query_name, # Query (or None) + subject_name, # Sbjct (or None) + + algorithm, # an Algorithm + info, # contains Key/value pairs + homology_gaps = None, # Is this needed? + ): + assert len(query_seq) == len(homology_seq) == len(subject_seq), \ + (query_seq, homology_seq, subject_seq) + self.algorithm = algorithm + + query_gaps = query_seq.count("-") + subject_gaps = subject_seq.count("-") + if homology_gaps is None: + homology_gaps = query_gaps + subject_gaps + self.info = info + + identical = info["identical"] + # bioperl calls this 'conserved' + positives = info.get("positives", identical) + + _SeqLength.__init__(self, len(query_seq), identical, + positives, homology_gaps) + + self.query = HSPSeq(name = query_name, + seq = query_seq, + location = query_location, + identical = identical, + positives = positives, + gaps = query_gaps) + + self.subject = HSPSeq(name = subject_name, + seq = subject_seq, + location = subject_location, + identical = identical, + positives = positives, + gaps = subject_gaps) + self.homology = HomologySeq(seq = homology_seq, + identical = identical, + positives = positives, + gaps = homology_gaps) diff --git a/binaries/src/globplot/biopython-1.50/Bio/Seq.py b/binaries/src/globplot/biopython-1.50/Bio/Seq.py new file mode 100644 index 0000000..d32d8e7 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Seq.py @@ -0,0 +1,1633 @@ +# Copyright 2000-2002 Brad Chapman. +# Copyright 2004-2005 by M de Hoon. +# Copyright 2007-2009 by Peter Cock. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Provides objects to represent biological sequences with alphabets. + +See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: + - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} + - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} +""" +__docformat__ ="epytext en" #Don't just use plain text in epydoc API pages! 
+ +import string #for maketrans only +import array +import sys + +#TODO - Remove this work around once we drop python 2.3 support +try: + set = set +except NameError: + from sets import Set as set + +import Alphabet +from Alphabet import IUPAC +from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement +from Bio.Data import CodonTable + +def _maketrans(complement_mapping) : + """Makes a python string translation table (PRIVATE). + + Arguments: + - complement_mapping - a dictionary such as ambiguous_dna_complement + and ambiguous_rna_complement from Data.IUPACData. + + Returns a translation table (a string of length 256) for use with the + python string's translate method to use in a (reverse) complement. + + Compatible with lower case and upper case sequences. + + For internal use only. + """ + before = ''.join(complement_mapping.keys()) + after = ''.join(complement_mapping.values()) + before = before + before.lower() + after = after + after.lower() + return string.maketrans(before, after) + +_dna_complement_table = _maketrans(ambiguous_dna_complement) +_rna_complement_table = _maketrans(ambiguous_rna_complement) + +class Seq(object): + """A read-only sequence object (essentially a string with an alphabet). + + Like normal python strings, our basic sequence object is immutable. + This prevents you from doing my_seq[5] = "A" for example, but does allow + Seq objects to be used as dictionary keys. + + The Seq object provides a number of string like methods (such as count, + find, split and strip), which are alphabet aware where appropriate. + + The Seq object also provides some biological methods, such as complement, + reverse_complement, transcribe, back_transcribe and translate (which are + not applicable to sequences with a protein alphabet). + """ + def __init__(self, data, alphabet = Alphabet.generic_alphabet): + """Create a Seq object. + + Arguments: + - seq - Sequence, required (string) + - alphabet - Optional argument, an Alphabet object from Bio.Alphabet + + You will typically use Bio.SeqIO to read in sequences from files as + SeqRecord objects, whose sequence will be exposed as a Seq object via + the seq property. + + However, will often want to create your own Seq objects directly: + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", + ... IUPAC.protein) + >>> my_seq + Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) + >>> print my_seq + MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF + """ + # Enforce string storage + assert (type(data) == type("") or # must use a string + type(data) == type(u"")) # but can be a unicode string + self._data = data + self.alphabet = alphabet # Seq API requirement + + # A data property is/was a Seq API requirement + def _set_data(self, value) : + #TODO - In the next release, actually raise an exception? + #The Seq object is like a python string, it should be read only! + import warnings + warnings.warn("Writing to the Seq object's .data propery is deprecated.", + DeprecationWarning) + self._data = value + data = property(fget= lambda self : str(self), + fset=_set_data, + doc="Sequence as a string (DEPRECATED)") + + def __repr__(self): + """Returns a (truncated) representation of the sequence for debugging.""" + if len(self) > 60 : + #Shows the last three letters as it is often useful to see if there + #is a stop codon at the end of a sequence. 
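+            #(e.g. a trailing 'TAA', 'TAG' or 'TGA' in a DNA sequence)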
+ #Note total length is 54+3+3=60 + return "%s('%s...%s', %s)" % (self.__class__.__name__, + str(self)[:54], str(self)[-3:], + repr(self.alphabet)) + else : + return "%s(%s, %s)" % (self.__class__.__name__, + repr(self.data), + repr(self.alphabet)) + def __str__(self): + """Returns the full sequence as a python string. + + Note that Biopython 1.44 and earlier would give a truncated + version of repr(my_seq) for str(my_seq). If you are writing code + which need to be backwards compatible with old Biopython, you + should continue to use my_seq.tostring() rather than str(my_seq). + """ + return self._data + + """ + TODO - Work out why this breaks test_Restriction.py + (Comparing Seq objects would be nice to have. May need to think about + hashes and the in operator for when have list/dictionary of Seq objects...) + def __cmp__(self, other): + if hasattr(other, "alphabet") : + #other should be a Seq or a MutableSeq + if not Alphabet._check_type_compatible([self.alphabet, + other.alphabet]) : + raise TypeError("Incompatable alphabets %s and %s" \ + % (repr(self.alphabet), repr(other.alphabet))) + #They should be the same sequence type (or one of them is generic) + return cmp(str(self), str(other)) + elif isinstance(other, basestring) : + return cmp(str(self), other) + else : + raise TypeError + """ + + def __len__(self): return len(self._data) # Seq API requirement + + def __getitem__(self, index) : # Seq API requirement + #Note since Python 2.0, __getslice__ is deprecated + #and __getitem__ is used instead. + #See http://docs.python.org/ref/sequence-methods.html + if isinstance(index, int) : + #Return a single letter as a string + return self._data[index] + else : + #Return the (sub)sequence as another Seq object + return Seq(self._data[index], self.alphabet) + + def __add__(self, other): + """Add another sequence or string to this sequence.""" + if hasattr(other, "alphabet") : + #other should be a Seq or a MutableSeq + if not Alphabet._check_type_compatible([self.alphabet, + other.alphabet]) : + raise TypeError("Incompatable alphabets %s and %s" \ + % (repr(self.alphabet), repr(other.alphabet))) + #They should be the same sequence type (or one of them is generic) + a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) + return self.__class__(str(self) + str(other), a) + elif isinstance(other, basestring) : + #other is a plain string - use the current alphabet + return self.__class__(str(self) + other, self.alphabet) + else : + raise TypeError + + def __radd__(self, other): + if hasattr(other, "alphabet") : + #other should be a Seq or a MutableSeq + if not Alphabet._check_type_compatible([self.alphabet, + other.alphabet]) : + raise TypeError("Incompatable alphabets %s and %s" \ + % (repr(self.alphabet), repr(other.alphabet))) + #They should be the same sequence type (or one of them is generic) + a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) + return self.__class__(str(other) + str(self), a) + elif isinstance(other, basestring) : + #other is a plain string - use the current alphabet + return self.__class__(other + str(self), self.alphabet) + else : + raise TypeError + + def tostring(self): # Seq API requirement + """Returns the full sequence as a python string. + + Although not formally deprecated, you are now encouraged to use + str(my_seq) instead of my_seq.tostring().""" + return str(self) + + def tomutable(self): # Needed? Or use a function? + """Returns the full sequence as a MutableSeq object. 
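+
+        Useful when the sequence needs editing in place, since Seq
+        objects are read-only while MutableSeq objects are not.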
+ + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", + ... IUPAC.protein) + >>> my_seq + Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) + >>> my_seq.tomutable() + MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) + + Note that the alphabet is preserved. + """ + return MutableSeq(str(self), self.alphabet) + + def _get_seq_str_and_check_alphabet(self, other_sequence) : + """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). + + For a string argument, returns the string. + + For a Seq or MutableSeq, it checks the alphabet is compatible + (raising an exception if it isn't), and then returns a string. + """ + try : + other_alpha = other_sequence.alphabet + except AttributeError : + #Assume other_sequence is a string + return other_sequence + + #Other should be a Seq or a MutableSeq + if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) : + raise TypeError("Incompatable alphabets %s and %s" \ + % (repr(self.alphabet), repr(other_alpha))) + #Return as a string + return str(other_sequence) + + def count(self, sub, start=0, end=sys.maxint): + """Non-overlapping count method, like that of a python string. + + This behaves like the python string method of the same name, + which does a non-overlapping count! + + Returns an integer, the number of occurrences of substring + argument sub in the (sub)sequence given by [start:end]. + Optional arguments start and end are interpreted as in slice + notation. + + Arguments: + - sub - a string or another Seq object to look for + - start - optional integer, slice start + - end - optional integer, slice end + + e.g. + + >>> from Bio.Seq import Seq + >>> my_seq = Seq("AAAATGA") + >>> print my_seq.count("A") + 5 + >>> print my_seq.count("ATG") + 1 + >>> print my_seq.count(Seq("AT")) + 1 + >>> print my_seq.count("AT", 2, -1) + 1 + + HOWEVER, please note because python strings and Seq objects (and + MutableSeq objects) do a non-overlapping search, this may not give + the answer you expect: + + >>> "AAAA".count("AA") + 2 + >>> print Seq("AAAA").count("AA") + 2 + + A non-overlapping search would give the answer as three! + """ + #If it has one, check the alphabet: + sub_str = self._get_seq_str_and_check_alphabet(sub) + return str(self).count(sub_str, start, end) + + def find(self, sub, start=0, end=sys.maxint): + """Find method, like that of a python string. + + This behaves like the python string method of the same name. + + Returns an integer, the index of the first occurrence of substring + argument sub in the (sub)sequence given by [start:end]. + + Arguments: + - sub - a string or another Seq object to look for + - start - optional integer, slice start + - end - optional integer, slice end + + Returns -1 if the subsequence is NOT found. + + e.g. Locating the first typical start codon, AUG, in an RNA sequence: + + >>> from Bio.Seq import Seq + >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") + >>> my_rna.find("AUG") + 3 + """ + #If it has one, check the alphabet: + sub_str = self._get_seq_str_and_check_alphabet(sub) + return str(self).find(sub_str, start, end) + + def rfind(self, sub, start=0, end=sys.maxint): + """Find from right method, like that of a python string. + + This behaves like the python string method of the same name. + + Returns an integer, the index of the last (right most) occurrence of + substring argument sub in the (sub)sequence given by [start:end]. 
+ + Arguments: + - sub - a string or another Seq object to look for + - start - optional integer, slice start + - end - optional integer, slice end + + Returns -1 if the subsequence is NOT found. + + e.g. Locating the last typical start codon, AUG, in an RNA sequence: + + >>> from Bio.Seq import Seq + >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") + >>> my_rna.rfind("AUG") + 15 + """ + #If it has one, check the alphabet: + sub_str = self._get_seq_str_and_check_alphabet(sub) + return str(self).rfind(sub_str, start, end) + + def startswith(self, prefix, start=0, end=sys.maxint) : + """Does the Seq start with the given prefix? Returns True/False. + + This behaves like the python string method of the same name. + + Return True if the sequence starts with the specified prefix + (a string or another Seq object), False otherwise. + With optional start, test sequence beginning at that position. + With optional end, stop comparing sequence at that position. + prefix can also be a tuple of strings to try. e.g. + + >>> from Bio.Seq import Seq + >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") + >>> my_rna.startswith("GUC") + True + >>> my_rna.startswith("AUG") + False + >>> my_rna.startswith("AUG", 3) + True + >>> my_rna.startswith(("UCC","UCA","UCG"),1) + True + """ + #If it has one, check the alphabet: + if isinstance(prefix, tuple) : + #TODO - Once we drop support for Python 2.4, instead of this + #loop offload to the string method (requires Python 2.5+). + #Check all the alphabets first... + prefix_strings = [self._get_seq_str_and_check_alphabet(p) \ + for p in prefix] + for prefix_str in prefix_strings : + if str(self).startswith(prefix_str, start, end) : + return True + return False + else : + prefix_str = self._get_seq_str_and_check_alphabet(prefix) + return str(self).startswith(prefix_str, start, end) + + def endswith(self, suffix, start=0, end=sys.maxint) : + """Does the Seq end with the given suffix? Returns True/False. + + This behaves like the python string method of the same name. + + Return True if the sequence ends with the specified suffix + (a string or another Seq object), False otherwise. + With optional start, test sequence beginning at that position. + With optional end, stop comparing sequence at that position. + suffix can also be a tuple of strings to try. e.g. + + >>> from Bio.Seq import Seq + >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") + >>> my_rna.endswith("UUG") + True + >>> my_rna.endswith("AUG") + False + >>> my_rna.endswith("AUG", 0, 18) + True + >>> my_rna.endswith(("UCC","UCA","UUG")) + True + """ + #If it has one, check the alphabet: + if isinstance(suffix, tuple) : + #TODO - Once we drop support for Python 2.4, instead of this + #loop offload to the string method (requires Python 2.5+). + #Check all the alphabets first... + suffix_strings = [self._get_seq_str_and_check_alphabet(p) \ + for p in suffix] + for suffix_str in suffix_strings : + if str(self).endswith(suffix_str, start, end) : + return True + return False + else : + suffix_str = self._get_seq_str_and_check_alphabet(suffix) + return str(self).endswith(suffix_str, start, end) + + + def split(self, sep=None, maxsplit=-1) : + """Split method, like that of a python string. + + This behaves like the python string method of the same name. + + Return a list of the 'words' in the string (as Seq objects), + using sep as the delimiter string. If maxsplit is given, at + most maxsplit splits are done. If maxsplit is ommited, all + splits are made. 
+
+        Following the python string method, sep will by default be any
+        white space (tabs, spaces, newlines) but this is unlikely to
+        apply to biological sequences.
+
+        e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
+        >>> my_aa = my_rna.translate()
+        >>> my_aa
+        Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
+        >>> my_aa.split("*")
+        [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+        >>> my_aa.split("*",1)
+        [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+
+        See also the rsplit method:
+
+        >>> my_aa.rsplit("*",1)
+        [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
+        """
+        #If it has one, check the alphabet:
+        sep_str = self._get_seq_str_and_check_alphabet(sep)
+        #TODO - If the sep is the defined stop symbol, or gap char,
+        #should we adjust the alphabet?
+        return [Seq(part, self.alphabet) \
+                for part in str(self).split(sep_str, maxsplit)]
+
+    def rsplit(self, sep=None, maxsplit=-1) :
+        """Right split method, like that of a python string.
+
+        This behaves like the python string method of the same name.
+
+        Return a list of the 'words' in the string (as Seq objects),
+        using sep as the delimiter string. If maxsplit is given, at
+        most maxsplit splits are done COUNTING FROM THE RIGHT.
+        If maxsplit is omitted, all splits are made.
+
+        Following the python string method, sep will by default be any
+        white space (tabs, spaces, newlines) but this is unlikely to
+        apply to biological sequences.
+
+        e.g. print my_seq.rsplit("*",1)
+
+        See also the split method.
+        """
+        #If it has one, check the alphabet:
+        sep_str = self._get_seq_str_and_check_alphabet(sep)
+        try :
+            return [Seq(part, self.alphabet) \
+                    for part in str(self).rsplit(sep_str, maxsplit)]
+        except AttributeError :
+            #Python 2.3 doesn't have a string rsplit method, which we can
+            #work around by reversing the sequence, using (left) split,
+            #and then reversing the answer. Not very efficient!
+            words = [Seq(word[::-1], self.alphabet) for word \
+                     in str(self)[::-1].split(sep_str[::-1], maxsplit)]
+            words.reverse()
+            return words
+
+    def strip(self, chars=None) :
+        """Returns a new Seq object with leading and trailing ends stripped.
+
+        This behaves like the python string method of the same name.
+
+        Optional argument chars defines which characters to remove. If
+        omitted or None (default) then as for the python string method,
+        this defaults to removing any white space.
+
+        e.g. print my_seq.strip("-")
+
+        See also the lstrip and rstrip methods.
+        """
+        #If it has one, check the alphabet:
+        strip_str = self._get_seq_str_and_check_alphabet(chars)
+        return Seq(str(self).strip(strip_str), self.alphabet)
+
+    def lstrip(self, chars=None) :
+        """Returns a new Seq object with leading (left) end stripped.
+
+        This behaves like the python string method of the same name.
+
+        Optional argument chars defines which characters to remove. If
+        omitted or None (default) then as for the python string method,
+        this defaults to removing any white space.
+
+        e.g. print my_seq.lstrip("-")
+
+        See also the strip and rstrip methods.
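+
+        e.g. (an illustrative doctest, using the default generic
+        alphabet):
+
+        >>> from Bio.Seq import Seq
+        >>> Seq("---ACGT").lstrip("-")
+        Seq('ACGT', Alphabet())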
+ """ + #If it has one, check the alphabet: + strip_str = self._get_seq_str_and_check_alphabet(chars) + return Seq(str(self).lstrip(strip_str), self.alphabet) + + def rstrip(self, chars=None) : + """Returns a new Seq object with trailing (right) end stripped. + + This behaves like the python string method of the same name. + + Optional argument chars defines which characters to remove. If + ommitted or None (default) then as for the python string method, + this defaults to removing any white space. + + e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): + + >>> from Bio.Alphabet import IUPAC + >>> from Bio.Seq import Seq + >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) + >>> my_seq + Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) + >>> my_seq.rstrip("A") + Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) + + See also the strip and lstrip methods. + """ + #If it has one, check the alphabet: + strip_str = self._get_seq_str_and_check_alphabet(chars) + return Seq(str(self).rstrip(strip_str), self.alphabet) + + def complement(self): + """Returns the complement sequence. New Seq object. + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) + >>> my_dna + Seq('CCCCCGATAG', IUPACUnambiguousDNA()) + >>> my_dna.complement() + Seq('GGGGGCTATC', IUPACUnambiguousDNA()) + + You can of course used mixed case sequences, + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import generic_dna + >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) + >>> my_dna + Seq('CCCCCgatA-GD', DNAAlphabet()) + >>> my_dna.complement() + Seq('GGGGGctaT-CH', DNAAlphabet()) + + Note in the above example, ambiguous character D denotes + G, A or T so its complement is H (for C, T or A). + + Trying to complement a protein sequence raises an exception. + + >>> my_protein = Seq("MAIVMGR", IUPAC.protein) + >>> my_protein.complement() + Traceback (most recent call last): + ... + ValueError: Proteins do not have complements! + """ + base = Alphabet._get_base_alphabet(self.alphabet) + if isinstance(base, Alphabet.ProteinAlphabet) : + raise ValueError("Proteins do not have complements!") + if isinstance(base, Alphabet.DNAAlphabet) : + ttable = _dna_complement_table + elif isinstance(base, Alphabet.RNAAlphabet) : + ttable = _rna_complement_table + elif ('U' in self._data or 'u' in self._data) \ + and ('T' in self._data or 't' in self._data): + #TODO - Handle this cleanly? + raise ValueError("Mixed RNA/DNA found") + elif 'U' in self._data or 'u' in self._data: + ttable = _rna_complement_table + else: + ttable = _dna_complement_table + #Much faster on really long sequences than the previous loop based one. + #thx to Michael Palmer, University of Waterloo + return Seq(str(self).translate(ttable), self.alphabet) + + def reverse_complement(self): + """Returns the reverse complement sequence. New Seq object. + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) + >>> my_dna + Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) + >>> my_dna.reverse_complement() + Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) + + Note in the above example, since R = G or A, its complement + is Y (which denotes C or T). 
+ + You can of course used mixed case sequences, + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import generic_dna + >>> my_dna = Seq("CCCCCgatA-G", generic_dna) + >>> my_dna + Seq('CCCCCgatA-G', DNAAlphabet()) + >>> my_dna.reverse_complement() + Seq('C-TatcGGGGG', DNAAlphabet()) + + Trying to complement a protein sequence raises an exception: + + >>> my_protein = Seq("MAIVMGR", IUPAC.protein) + >>> my_protein.reverse_complement() + Traceback (most recent call last): + ... + ValueError: Proteins do not have complements! + """ + #Use -1 stride/step to reverse the complement + return self.complement()[::-1] + + def transcribe(self): + """Returns the RNA sequence from a DNA sequence. New Seq object. + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", + ... IUPAC.unambiguous_dna) + >>> coding_dna + Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) + >>> coding_dna.transcribe() + Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) + + Trying to transcribe a protein or RNA sequence raises an exception: + + >>> my_protein = Seq("MAIVMGR", IUPAC.protein) + >>> my_protein.transcribe() + Traceback (most recent call last): + ... + ValueError: Proteins cannot be transcribed! + """ + base = Alphabet._get_base_alphabet(self.alphabet) + if isinstance(base, Alphabet.ProteinAlphabet) : + raise ValueError("Proteins cannot be transcribed!") + if isinstance(base, Alphabet.RNAAlphabet) : + raise ValueError("RNA cannot be transcribed!") + + if self.alphabet==IUPAC.unambiguous_dna: + alphabet = IUPAC.unambiguous_rna + elif self.alphabet==IUPAC.ambiguous_dna: + alphabet = IUPAC.ambiguous_rna + else: + alphabet = Alphabet.generic_rna + return Seq(str(self).replace('T','U').replace('t','u'), alphabet) + + def back_transcribe(self): + """Returns the DNA sequence from an RNA sequence. New Seq object. + + >>> from Bio.Seq import Seq + >>> from Bio.Alphabet import IUPAC + >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", + ... IUPAC.unambiguous_rna) + >>> messenger_rna + Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) + >>> messenger_rna.back_transcribe() + Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) + + Trying to back-transcribe a protein or DNA sequence raises an + exception: + + >>> my_protein = Seq("MAIVMGR", IUPAC.protein) + >>> my_protein.back_transcribe() + Traceback (most recent call last): + ... + ValueError: Proteins cannot be back transcribed! + """ + base = Alphabet._get_base_alphabet(self.alphabet) + if isinstance(base, Alphabet.ProteinAlphabet) : + raise ValueError("Proteins cannot be back transcribed!") + if isinstance(base, Alphabet.DNAAlphabet) : + raise ValueError("DNA cannot be back transcribed!") + + if self.alphabet==IUPAC.unambiguous_rna: + alphabet = IUPAC.unambiguous_dna + elif self.alphabet==IUPAC.ambiguous_rna: + alphabet = IUPAC.ambiguous_dna + else: + alphabet = Alphabet.generic_dna + return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet) + + def translate(self, table="Standard", stop_symbol="*", to_stop=False): + """Turns a nucleotide sequence into a protein sequence. New Seq object. + + This method will translate DNA or RNA sequences, and those with a + nucleotide or generic alphabet. Trying to translate a protein + sequence raises an exception. + + Arguments: + - table - Which codon table to use? This can be either a name + (string) or an NCBI identifier (integer). 
This defaults + to the "Standard" table. + - stop_symbol - Single character string, what to use for terminators. + This defaults to the asterisk, "*". + - to_stop - Boolean, defaults to False meaning do a full translation + continuing on past any stop codons (translated as the + specified stop_symbol). If True, translation is + terminated at the first in frame stop codon (and the + stop_symbol is not appended to the returned protein + sequence). + + e.g. Using the standard table: + + >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") + >>> coding_dna.translate() + Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) + >>> coding_dna.translate(stop_symbol="@") + Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) + >>> coding_dna.translate(to_stop=True) + Seq('VAIVMGR', ExtendedIUPACProtein()) + + Now using NCBI table 2, where TGA is not a stop codon: + + >>> coding_dna.translate(table=2) + Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) + >>> coding_dna.translate(table=2, to_stop=True) + Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) + + If the sequence has no in-frame stop codon, then the to_stop argument + has no effect: + + >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") + >>> coding_dna2.translate() + Seq('LAIVMGR', ExtendedIUPACProtein()) + >>> coding_dna2.translate(to_stop=True) + Seq('LAIVMGR', ExtendedIUPACProtein()) + + NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid + or a stop codon. These are translated as "X". Any invalid codon + (e.g. "TA?" or "T-A") will throw a TranslationError. + + NOTE - Does NOT support gapped sequences. + + NOTE - This does NOT behave like the python string's translate + method. For that use str(my_seq).translate(...) instead. + """ + try: + table_id = int(table) + except ValueError: + table_id = None + if isinstance(table, str) and len(table)==256 : + raise ValueError("The Seq object translate method DOES NOT take " \ + + "a 256 character string mapping table like " \ + + "the python string object's translate method. " \ + + "Use str(my_seq).translate(...) instead.") + if isinstance(Alphabet._get_base_alphabet(self.alphabet), + Alphabet.ProteinAlphabet) : + raise ValueError("Proteins cannot be translated!") + if self.alphabet==IUPAC.unambiguous_dna: + #Will use standard IUPAC protein alphabet, no need for X + if table_id is None: + codon_table = CodonTable.unambiguous_dna_by_name[table] + else: + codon_table = CodonTable.unambiguous_dna_by_id[table_id] + #elif self.alphabet==IUPAC.ambiguous_dna: + # if table_id is None: + # codon_table = CodonTable.ambiguous_dna_by_name[table] + # else: + # codon_table = CodonTable.ambiguous_dna_by_id[table_id] + elif self.alphabet==IUPAC.unambiguous_rna: + #Will use standard IUPAC protein alphabet, no need for X + if table_id is None: + codon_table = CodonTable.unambiguous_rna_by_name[table] + else: + codon_table = CodonTable.unambiguous_rna_by_id[table_id] + #elif self.alphabet==IUPAC.ambiguous_rna: + # if table_id is None: + # codon_table = CodonTable.ambiguous_rna_by_name[table] + # else: + # codon_table = CodonTable.ambiguous_rna_by_id[table_id] + else: + #This will use the extend IUPAC protein alphabet with X etc. + #The same table can be used for RNA or DNA (we use this for + #translating strings). 
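+            #e.g. table="Vertebrate Mitochondrial" and table=2 both
+            #resolve to the same codon table via the lookups below.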
+ if table_id is None: + codon_table = CodonTable.ambiguous_generic_by_name[table] + else: + codon_table = CodonTable.ambiguous_generic_by_id[table_id] + protein = _translate_str(str(self), codon_table, stop_symbol, to_stop) + if stop_symbol in protein : + alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, + stop_symbol = stop_symbol) + else : + alphabet = codon_table.protein_alphabet + return Seq(protein, alphabet) + +class UnknownSeq(Seq): + """A read-only sequence object of known length but unknown contents. + + If you have an unknown sequence, you can represent this with a normal + Seq object, for example: + + >>> my_seq = Seq("N"*5) + >>> my_seq + Seq('NNNNN', Alphabet()) + >>> len(my_seq) + 5 + >>> print my_seq + NNNNN + + However, this is rather wasteful of memory (especially for large + sequences), which is where this class is most usefull: + + >>> unk_five = UnknownSeq(5) + >>> unk_five + UnknownSeq(5, alphabet = Alphabet(), character = '?') + >>> len(unk_five) + 5 + >>> print(unk_five) + ????? + + You can add unknown sequence together, provided their alphabets and + characters are compatible, and get another memory saving UnknownSeq: + + >>> unk_four = UnknownSeq(4) + >>> unk_four + UnknownSeq(4, alphabet = Alphabet(), character = '?') + >>> unk_four + unk_five + UnknownSeq(9, alphabet = Alphabet(), character = '?') + + If the alphabet or characters don't match up, the addition gives an + ordinary Seq object: + + >>> unk_nnnn = UnknownSeq(4, character = "N") + >>> unk_nnnn + UnknownSeq(4, alphabet = Alphabet(), character = 'N') + >>> unk_nnnn + unk_four + Seq('NNNN????', Alphabet()) + + Combining with a real Seq gives a new Seq object: + + >>> known_seq = Seq("ACGT") + >>> unk_four + known_seq + Seq('????ACGT', Alphabet()) + >>> known_seq + unk_four + Seq('ACGT????', Alphabet()) + """ + def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) : + """Create a new UnknownSeq object. + + If character is ommited, it is determed from the alphabet, "N" for + nucleotides, "X" for proteins, and "?" otherwise. + """ + self._length = int(length) + if self._length < 0 : + #TODO - Block zero length UnknownSeq? You can just use a Seq! + raise ValueError("Length must not be negative.") + self.alphabet = alphabet + if character : + if len(character) != 1 : + raise ValueError("character argument should be a single letter string.") + self._character = character + else : + base = Alphabet._get_base_alphabet(alphabet) + #TODO? Check the case of the letters in the alphabet? + #We may have to use "n" instead of "N" etc. + if isinstance(base, Alphabet.NucleotideAlphabet) : + self._character = "N" + elif isinstance(base, Alphabet.ProteinAlphabet) : + self._character = "X" + else : + self._character = "?" + + def __len__(self) : + """Returns the stated length of the unknown sequence.""" + return self._length + + def __str__(self) : + """Returns the unknown sequence as full string of the given length.""" + return self._character * self._length + + def __repr__(self): + return "UnknownSeq(%i, alphabet = %s, character = %s)" \ + % (self._length, repr(self.alphabet), repr(self._character)) + + def __add__(self, other) : + if isinstance(other, UnknownSeq) \ + and other._character == self._character : + #TODO - Check the alphabets match + return UnknownSeq(len(self)+len(other), + self.alphabet, self._character) + #Offload to the base class... 
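+        #(This materializes the unknown sequence as a real string, so
+        #a mismatched addition yields an ordinary Seq object.)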
+ return Seq(str(self), self.alphabet) + other + + def __radd__(self, other) : + if isinstance(other, UnknownSeq) \ + and other._character == self._character : + #TODO - Check the alphabets match + return UnknownSeq(len(self)+len(other), + self.alphabet, self._character) + #Offload to the base class... + return other + Seq(str(self), self.alphabet) + + def __getitem__(self, index): + if isinstance(index, int) : + #TODO - Check the bounds without wasting memory + return str(self)[index] + else : + #TODO - Work out the length without wasting memory + return UnknownSeq(len(("#"*self._length)[index]), + self.alphabet, self._character) + + def count(self, sub, start=0, end=sys.maxint): + """Non-overlapping count method, like that of a python string. + + This behaves like the python string (and Seq object) method of the + same name, which does a non-overlapping count! + + Returns an integer, the number of occurrences of substring + argument sub in the (sub)sequence given by [start:end]. + Optional arguments start and end are interpreted as in slice + notation. + + Arguments: + - sub - a string or another Seq object to look for + - start - optional integer, slice start + - end - optional integer, slice end + + >>> "NNNN".count("N") + 4 + >>> Seq("NNNN").count("N") + 4 + >>> UnknownSeq(4, character="N").count("N") + 4 + >>> UnknownSeq(4, character="N").count("A") + 0 + >>> UnknownSeq(4, character="N").count("AA") + 0 + + HOWEVER, please note because that python strings and Seq objects (and + MutableSeq objects) do a non-overlapping search, this may not give + the answer you expect: + + >>> UnknownSeq(4, character="N").count("NN") + 2 + >>> UnknownSeq(4, character="N").count("NNN") + 1 + """ + sub_str = self._get_seq_str_and_check_alphabet(sub) + if len(sub_str) == 1 : + if str(sub_str) == self._character : + if start==0 and end >= self._length : + return self._length + else : + #This could be done more cleverly... + return str(self).count(sub_str, start, end) + else : + return 0 + else : + if set(sub_str) == set(self._character) : + if start==0 and end >= self._length : + return self._length // len(sub_str) + else : + #This could be done more cleverly... + return str(self).count(sub_str, start, end) + else : + return 0 + + def complement(self) : + """The complement of an unknown nucleotide equals itself. + + >>> my_nuc = UnknownSeq(8) + >>> my_nuc + UnknownSeq(8, alphabet = Alphabet(), character = '?') + >>> print my_nuc + ???????? + >>> my_nuc.complement() + UnknownSeq(8, alphabet = Alphabet(), character = '?') + >>> print my_nuc.complement() + ???????? + """ + if isinstance(Alphabet._get_base_alphabet(self.alphabet), + Alphabet.ProteinAlphabet) : + raise ValueError("Proteins do not have complements!") + return self + + def reverse_complement(self) : + """The reverse complement of an unknown nucleotide equals itself. + + >>> my_nuc = UnknownSeq(10) + >>> my_nuc + UnknownSeq(10, alphabet = Alphabet(), character = '?') + >>> print my_nuc + ?????????? + >>> my_nuc.reverse_complement() + UnknownSeq(10, alphabet = Alphabet(), character = '?') + >>> print my_nuc.reverse_complement() + ?????????? + """ + if isinstance(Alphabet._get_base_alphabet(self.alphabet), + Alphabet.ProteinAlphabet) : + raise ValueError("Proteins do not have complements!") + return self + + def transcribe(self) : + """Returns unknown RNA sequence from an unknown DNA sequence. 
+ + >>> my_dna = UnknownSeq(10, character="N") + >>> my_dna + UnknownSeq(10, alphabet = Alphabet(), character = 'N') + >>> print my_dna + NNNNNNNNNN + >>> my_rna = my_dna.transcribe() + >>> my_rna + UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') + >>> print my_rna + NNNNNNNNNN + """ + #Offload the alphabet stuff + s = Seq(self._character, self.alphabet).transcribe() + return UnknownSeq(self._length, s.alphabet, self._character) + + def back_transcribe(self) : + """Returns unknown DNA sequence from an unknown RNA sequence. + + >>> my_rna = UnknownSeq(20, character="N") + >>> my_rna + UnknownSeq(20, alphabet = Alphabet(), character = 'N') + >>> print my_rna + NNNNNNNNNNNNNNNNNNNN + >>> my_dna = my_rna.back_transcribe() + >>> my_dna + UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') + >>> print my_dna + NNNNNNNNNNNNNNNNNNNN + """ + #Offload the alphabet stuff + s = Seq(self._character, self.alphabet).back_transcribe() + return UnknownSeq(self._length, s.alphabet, self._character) + + def translate(self, **kwargs) : + """Translate an unknown nucleotide sequence into an unknown protein. + + e.g. + + >>> my_seq = UnknownSeq(11, character="N") + >>> print my_seq + NNNNNNNNNNN + >>> my_protein = my_seq.translate() + >>> my_protein + UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') + >>> print my_protein + XXX + + In comparison, using a normal Seq object: + + >>> my_seq = Seq("NNNNNNNNNNN") + >>> print my_seq + NNNNNNNNNNN + >>> my_protein = my_seq.translate() + >>> my_protein + Seq('XXX', ExtendedIUPACProtein()) + >>> print my_protein + XXX + + """ + if isinstance(Alphabet._get_base_alphabet(self.alphabet), + Alphabet.ProteinAlphabet) : + raise ValueError("Proteins cannot be translated!") + return UnknownSeq(self._length//3, Alphabet.generic_protein, "X") + + +class MutableSeq(object): + """An editable sequence object (with an alphabet). + + Unlike normal python strings and our basic sequence object (the Seq class) + which are immuatable, the MutableSeq lets you edit the sequence in place. + However, this means you cannot use a MutableSeq object as a dictionary key. + + >>> from Bio.Seq import MutableSeq + >>> from Bio.Alphabet import generic_dna + >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) + >>> my_seq + MutableSeq('ACTCGTCGTCG', DNAAlphabet()) + >>> my_seq[5] + 'T' + >>> my_seq[5] = "A" + >>> my_seq + MutableSeq('ACTCGACGTCG', DNAAlphabet()) + >>> my_seq[5] + 'A' + >>> my_seq[5:8] = "NNN" + >>> my_seq + MutableSeq('ACTCGNNNTCG', DNAAlphabet()) + >>> len(my_seq) + 11 + + Note that the MutableSeq object does not support as many string-like + or biological methods as the Seq object. + """ + def __init__(self, data, alphabet = Alphabet.generic_alphabet): + if type(data) == type(""): + self.data = array.array("c", data) + else: + self.data = data # assumes the input is an array + self.alphabet = alphabet + + def __repr__(self): + """Returns a (truncated) representation of the sequence for debugging.""" + if len(self) > 60 : + #Shows the last three letters as it is often useful to see if there + #is a stop codon at the end of a sequence. + #Note total length is 54+3+3=60 + return "%s('%s...%s', %s)" % (self.__class__.__name__, + str(self[:54]), str(self[-3:]), + repr(self.alphabet)) + else : + return "%s('%s', %s)" % (self.__class__.__name__, + str(self), + repr(self.alphabet)) + + def __str__(self): + """Returns the full sequence as a python string. + + Note that Biopython 1.44 and earlier would give a truncated + version of repr(my_seq) for str(my_seq). 
If you are writing code
+        which needs to be backwards compatible with old Biopython, you
+        should continue to use my_seq.tostring() rather than str(my_seq).
+        """
+        #See test_GAQueens.py for an historic usage of a non-string alphabet!
+        return "".join(self.data)
+
+    def __cmp__(self, other):
+        """Compare the sequence to another sequence or a string.
+
+        If compared to another sequence the alphabets must be compatible.
+        Comparing DNA to RNA, or Nucleotide to Protein will raise an
+        exception.
+
+        Otherwise only the sequence itself is compared, not the precise
+        alphabet.
+
+        This method indirectly supports ==, <, etc."""
+        if hasattr(other, "alphabet") :
+            #other should be a Seq or a MutableSeq
+            if not Alphabet._check_type_compatible([self.alphabet,
+                                                    other.alphabet]) :
+                raise TypeError("Incompatible alphabets %s and %s" \
+                                % (repr(self.alphabet), repr(other.alphabet)))
+            #They should be the same sequence type (or one of them is generic)
+            if isinstance(other, MutableSeq):
+                #See test_GAQueens.py for an historic usage of a non-string
+                #alphabet!  Comparing the arrays supports this.
+                return cmp(self.data, other.data)
+            else :
+                return cmp(str(self), str(other))
+        elif isinstance(other, basestring) :
+            return cmp(str(self), other)
+        else :
+            raise TypeError
+
+    def __len__(self): return len(self.data)
+
+    def __getitem__(self, index) :
+        #Note since Python 2.0, __getslice__ is deprecated
+        #and __getitem__ is used instead.
+        #See http://docs.python.org/ref/sequence-methods.html
+        if isinstance(index, int) :
+            #Return a single letter as a string
+            return self.data[index]
+        else :
+            #Return the (sub)sequence as another Seq object
+            return MutableSeq(self.data[index], self.alphabet)
+
+    def __setitem__(self, index, value):
+        #Note since Python 2.0, __setslice__ is deprecated
+        #and __setitem__ is used instead.
+        #See http://docs.python.org/ref/sequence-methods.html
+        if isinstance(index, int) :
+            #Replacing a single letter with a new string
+            self.data[index] = value
+        else :
+            #Replacing a sub-sequence
+            if isinstance(value, MutableSeq):
+                self.data[index] = value.data
+            elif isinstance(value, type(self.data)):
+                self.data[index] = value
+            else:
+                self.data[index] = array.array("c", str(value))
+
+    def __delitem__(self, index):
+        #Note since Python 2.0, __delslice__ is deprecated
+        #and __delitem__ is used instead.
+        #See http://docs.python.org/ref/sequence-methods.html
+
+        #Could be deleting a single letter, or a slice
+        del self.data[index]
+
+    def __add__(self, other):
+        """Add another sequence or string to this sequence.
+
+        Returns a new MutableSeq object."""
+        if hasattr(other, "alphabet") :
+            #other should be a Seq or a MutableSeq
+            if not Alphabet._check_type_compatible([self.alphabet,
+                                                    other.alphabet]) :
+                raise TypeError("Incompatible alphabets %s and %s" \
+                                % (repr(self.alphabet), repr(other.alphabet)))
+            #They should be the same sequence type (or one of them is generic)
+            a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+            if isinstance(other, MutableSeq):
+                #See test_GAQueens.py for an historic usage of a non-string
+                #alphabet!  Adding the arrays should support this.
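+                #(adding two array.array('c') objects concatenates them,
+                #mirroring string concatenation)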
+                return self.__class__(self.data + other.data, a)
+            else :
+                return self.__class__(str(self) + str(other), a)
+        elif isinstance(other, basestring) :
+            #other is a plain string - use the current alphabet
+            return self.__class__(str(self) + str(other), self.alphabet)
+        else :
+            raise TypeError
+
+    def __radd__(self, other):
+        if hasattr(other, "alphabet") :
+            #other should be a Seq or a MutableSeq
+            if not Alphabet._check_type_compatible([self.alphabet,
+                                                    other.alphabet]) :
+                raise TypeError("Incompatible alphabets %s and %s" \
+                                % (repr(self.alphabet), repr(other.alphabet)))
+            #They should be the same sequence type (or one of them is generic)
+            a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
+            if isinstance(other, MutableSeq):
+                #See test_GAQueens.py for an historic usage of a non-string
+                #alphabet!  Adding the arrays should support this.
+                return self.__class__(other.data + self.data, a)
+            else :
+                return self.__class__(str(other) + str(self), a)
+        elif isinstance(other, basestring) :
+            #other is a plain string - use the current alphabet
+            return self.__class__(str(other) + str(self), self.alphabet)
+        else :
+            raise TypeError
+
+    def append(self, c):
+        self.data.append(c)
+
+    def insert(self, i, c):
+        self.data.insert(i, c)
+
+    def pop(self, i = (-1)):
+        c = self.data[i]
+        del self.data[i]
+        return c
+
+    def remove(self, item):
+        for i in range(len(self.data)):
+            if self.data[i] == item:
+                del self.data[i]
+                return
+        raise ValueError("MutableSeq.remove(x): x not in list")
+
+    def count(self, sub, start=0, end=sys.maxint):
+        """Non-overlapping count method, like that of a python string.
+
+        This behaves like the python string method of the same name,
+        which does a non-overlapping count!
+
+        Returns an integer, the number of occurrences of substring
+        argument sub in the (sub)sequence given by [start:end].
+        Optional arguments start and end are interpreted as in slice
+        notation.
+
+        Arguments:
+         - sub - a string or another Seq object to look for
+         - start - optional integer, slice start
+         - end - optional integer, slice end
+
+        e.g.
+
+        >>> from Bio.Seq import MutableSeq
+        >>> my_mseq = MutableSeq("AAAATGA")
+        >>> print my_mseq.count("A")
+        5
+        >>> print my_mseq.count("ATG")
+        1
+        >>> print my_mseq.count(Seq("AT"))
+        1
+        >>> print my_mseq.count("AT", 2, -1)
+        1
+
+        HOWEVER, please note that because python strings, Seq objects and
+        MutableSeq objects do a non-overlapping search, this may not give
+        the answer you expect:
+
+        >>> "AAAA".count("AA")
+        2
+        >>> print MutableSeq("AAAA").count("AA")
+        2
+
+        An overlapping search would give the answer as three!
+        """
+        try :
+            #TODO - Should we check the alphabet?
+            search = sub.tostring()
+        except AttributeError :
+            search = sub
+
+        if not isinstance(search, basestring) :
+            raise TypeError("expected a string, Seq or MutableSeq")
+
+        if len(search) == 1 :
+            #Try and be efficient and work directly from the array.
+            count = 0
+            for c in self.data[start:end]:
+                if c == search: count += 1
+            return count
+        else :
+            #TODO - Can we do this more efficiently?
+            return self.tostring().count(search, start, end)
+
+    def index(self, item):
+        for i in range(len(self.data)):
+            if self.data[i] == item:
+                return i
+        raise ValueError("MutableSeq.index(x): x not in list")
+
+    def reverse(self):
+        """Modify the mutable sequence to reverse itself.
+
+        No return value.
+        """
+        self.data.reverse()
+
+    def complement(self):
+        """Modify the mutable sequence to take on its complement.
+
+        Trying to complement a protein sequence raises an exception.
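+
+        For example, complementing DNA in place (a minimal sketch):
+
+        >>> from Bio.Seq import MutableSeq
+        >>> my_dna = MutableSeq("ATGC")
+        >>> my_dna.complement()
+        >>> print my_dna
+        TACG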
+
+        No return value.
+        """
+        if isinstance(Alphabet._get_base_alphabet(self.alphabet),
+                      Alphabet.ProteinAlphabet) :
+            raise ValueError("Proteins do not have complements!")
+        if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
+            d = ambiguous_dna_complement
+        elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
+            d = ambiguous_rna_complement
+        elif 'U' in self.data and 'T' in self.data :
+            #TODO - Handle this cleanly?
+            raise ValueError("Mixed RNA/DNA found")
+        elif 'U' in self.data:
+            d = ambiguous_rna_complement
+        else:
+            d = ambiguous_dna_complement
+        c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
+        d.update(c)
+        self.data = map(lambda c: d[c], self.data)
+        self.data = array.array('c', self.data)
+
+    def reverse_complement(self):
+        """Modify the mutable sequence to take on its reverse complement.
+
+        Trying to reverse complement a protein sequence raises an exception.
+
+        No return value.
+        """
+        self.complement()
+        self.data.reverse()
+
+    ## Sorting a sequence makes no sense.
+    # def sort(self, *args): self.data.sort(*args)
+
+    def extend(self, other):
+        if isinstance(other, MutableSeq):
+            for c in other.data:
+                self.data.append(c)
+        else:
+            for c in other:
+                self.data.append(c)
+
+    def tostring(self):
+        """Returns the full sequence as a python string.
+
+        Although not formally deprecated, you are now encouraged to use
+        str(my_seq) instead of my_seq.tostring().
+
+        Because str(my_seq) will give you the full sequence as a python string,
+        there is often no need to make an explicit conversion.  For example,
+
+        print "ID={%s}, sequence={%s}" % (my_name, my_seq)
+
+        On Biopython 1.44 or older you would have to have done this:
+
+        print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
+        """
+        return "".join(self.data)
+
+    def toseq(self):
+        """Returns the full sequence as a new immutable Seq object.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.Alphabet import IUPAC
+        >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
+                                 IUPAC.protein)
+        >>> my_mseq
+        MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+        >>> my_mseq.toseq()
+        Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
+
+        Note that the alphabet is preserved.
+        """
+        return Seq("".join(self.data), self.alphabet)
+
+# The transcribe, back_transcribe, and translate functions are
+# user-friendly versions of the corresponding functions in Bio.Transcribe
+# and Bio.Translate.  The functions work both on Seq objects, and on strings.
+
+def transcribe(dna):
+    """Transcribes a DNA sequence into RNA.
+
+    If given a string, returns a new string object.
+
+    Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
+
+    Trying to transcribe a protein or RNA sequence raises an exception.
+
+    e.g.
+
+    >>> transcribe("ACTGN")
+    'ACUGN'
+    """
+    if isinstance(dna, Seq) :
+        return dna.transcribe()
+    elif isinstance(dna, MutableSeq):
+        return dna.toseq().transcribe()
+    else:
+        return dna.replace('T','U').replace('t','u')
+
+def back_transcribe(rna):
+    """Back-transcribes an RNA sequence into DNA.
+
+    If given a string, returns a new string object.
+
+    Given a Seq or MutableSeq, returns a new Seq object with a DNA alphabet.
+
+    Trying to back-transcribe a protein or DNA sequence raises an exception.
+
+    e.g.
+ + >>> back_transcribe("ACUGN") + 'ACTGN' + """ + if isinstance(rna, Seq) : + return rna.back_transcribe() + elif isinstance(rna, MutableSeq): + return rna.toseq().back_transcribe() + else: + return rna.replace('U','T').replace('u','t') + +def _translate_str(sequence, table, stop_symbol="*", + to_stop=False, pos_stop="X") : + """Helper function to translate a nucleotide string (PRIVATE). + + Arguments: + - sequence - a string + - table - a CodonTable object (NOT a table name or id number) + - stop_symbol - a single character string, what to use for terminators. + - to_stop - boolean, should translation terminate at the first + in frame stop codon? If there is no in-frame stop codon + then translation continues to the end. + - pos_stop - a single character string for a possible stop codon + (e.g. TAN or NNN) + + Returns a string. + + e.g. + + >>> from Bio.Data import CodonTable + >>> table = CodonTable.ambiguous_dna_by_id[1] + >>> _translate_str("AAA", table) + 'K' + >>> _translate_str("TAR", table) + '*' + >>> _translate_str("TAN", table) + 'X' + >>> _translate_str("TAN", table, pos_stop="@") + '@' + >>> _translate_str("TA?", table) + Traceback (most recent call last): + ... + TranslationError: Codon 'TA?' is invalid + """ + sequence = sequence.upper() + amino_acids = [] + forward_table = table.forward_table + stop_codons = table.stop_codons + if table.nucleotide_alphabet.letters is not None : + valid_letters = set(table.nucleotide_alphabet.letters.upper()) + else : + #Assume the worst case, ambiguous DNA or RNA: + valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ + IUPAC.ambiguous_rna.letters.upper()) + + n = len(sequence) + for i in xrange(0,n-n%3,3) : + codon = sequence[i:i+3] + try : + amino_acids.append(forward_table[codon]) + except (KeyError, CodonTable.TranslationError) : + #Todo? Treat "---" as a special case (gapped translation) + if codon in table.stop_codons : + if to_stop : break + amino_acids.append(stop_symbol) + elif valid_letters.issuperset(set(codon)) : + #Possible stop codon (e.g. NNN or TAN) + amino_acids.append(pos_stop) + else : + raise CodonTable.TranslationError(\ + "Codon '%s' is invalid" % codon) + return "".join(amino_acids) + +def translate(sequence, table="Standard", stop_symbol="*", to_stop=False): + """Translate a nucleotide sequence into amino acids. + + If given a string, returns a new string object. Given a Seq or + MutableSeq, returns a Seq object with a protein alphabet. + + Arguments: + - table - Which codon table to use? This can be either a name + (string) or an NCBI identifier (integer). Defaults + to the "Standard" table. + - stop_symbol - Single character string, what to use for any + terminators, defaults to the asterisk, "*". + - to_stop - Boolean, defaults to False meaning do a full + translation continuing on past any stop codons + (translated as the specified stop_symbol). If + True, translation is terminated at the first in + frame stop codon (and the stop_symbol is not + appended to the returned protein sequence). 
+ + A simple string example using the default (standard) genetic code: + + >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" + >>> translate(coding_dna) + 'VAIVMGR*KGAR*' + >>> translate(coding_dna, stop_symbol="@") + 'VAIVMGR@KGAR@' + >>> translate(coding_dna, to_stop=True) + 'VAIVMGR' + + Now using NCBI table 2, where TGA is not a stop codon: + + >>> translate(coding_dna, table=2) + 'VAIVMGRWKGAR*' + >>> translate(coding_dna, table=2, to_stop=True) + 'VAIVMGRWKGAR' + + Note that if the sequence has no in-frame stop codon, then the to_stop + argument has no effect: + + >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" + >>> translate(coding_dna2) + 'VAIVMGR' + >>> translate(coding_dna2, to_stop=True) + 'VAIVMGR' + + NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid + or a stop codon. These are translated as "X". Any invalid codon + (e.g. "TA?" or "T-A") will throw a TranslationError. + + NOTE - Does NOT support gapped sequences. + + It will however translate either DNA or RNA. + """ + if isinstance(sequence, Seq) : + return sequence.translate(table, stop_symbol, to_stop) + elif isinstance(sequence, MutableSeq): + #Return a Seq object + return sequence.toseq().translate(table, stop_symbol, to_stop) + else: + #Assume its a string, return a string + try : + codon_table = CodonTable.ambiguous_generic_by_id[int(table)] + except ValueError : + codon_table = CodonTable.ambiguous_generic_by_name[table] + return _translate_str(sequence, codon_table, stop_symbol, to_stop) + +def reverse_complement(sequence): + """Returns the reverse complement sequence of a nucleotide string. + + If given a string, returns a new string object. + Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. + + Supports unambiguous and ambiguous nucleotide sequences. + + e.g. + + >>> reverse_complement("ACTG-NH") + 'DN-CAGT' + """ + if isinstance(sequence, Seq) : + #Return a Seq + return sequence.reverse_complement() + elif isinstance(sequence, MutableSeq) : + #Return a Seq + #Don't use the MutableSeq reverse_complement method as it is 'in place'. + return sequence.toseq().reverse_complement() + + #Assume its a string. + #In order to avoid some code duplication, the old code would turn the string + #into a Seq, use the reverse_complement method, and convert back to a string. + #This worked, but is over five times slower on short sequences! + if ('U' in sequence or 'u' in sequence) \ + and ('T' in sequence or 't' in sequence): + raise ValueError("Mixed RNA/DNA found") + elif 'U' in sequence or 'u' in sequence: + ttable = _rna_complement_table + else: + ttable = _dna_complement_table + return sequence.translate(ttable)[::-1] + +def _test(): + """Run the Bio.Seq module's doctests.""" + print "Runing doctests..." + import doctest + doctest.testmod() + print "Done" + +if __name__ == "__main__": + _test() diff --git a/binaries/src/globplot/biopython-1.50/Bio/Seq.pyc b/binaries/src/globplot/biopython-1.50/Bio/Seq.pyc new file mode 100644 index 0000000..b16921e Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/Seq.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqFeature.py b/binaries/src/globplot/biopython-1.50/Bio/SeqFeature.py new file mode 100644 index 0000000..aa44e69 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqFeature.py @@ -0,0 +1,490 @@ +"""Represent a Sequence Feature holding info about a part of a sequence. 
+
+This is heavily modeled after the Biocorba SeqFeature objects, and
+may be pretty biased towards GenBank stuff since I'm writing it
+for the GenBank parser output...
+
+What's here:
+
+Base class to hold a Feature.
+----------------------------
+classes:
+o SeqFeature
+
+Hold information about a Reference.
+----------------------------------
+
+This is an attempt to create a General class to hold Reference type
+information.
+
+classes:
+o Reference
+
+Specify locations of a feature on a Sequence.
+---------------------------------------------
+
+This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
+much the same way as Biocorba. This has the advantages of allowing us
+to handle fuzzy stuff in case anyone needs it, and also be compatible
+with Biocorba.
+
+classes:
+o FeatureLocation - Specify the start and end location of a feature.
+
+o ExactPosition - Specify the position as being exact.
+o WithinPosition - Specify a position occurring within some range.
+o BetweenPosition - Specify a position occurring between a range.
+o BeforePosition - Specify the position as being found before some base.
+o AfterPosition - Specify the position as being found after some base.
+"""
+
+class SeqFeature:
+    """Represent a Sequence Feature on an object.
+
+    Attributes:
+    o location - the location of the feature on the sequence
+    o type - the specified type of the feature (ie. CDS, exon, repeat...)
+    o location_operator - a string specifying how this SeqFeature may
+    be related to others. For example, in the example GenBank feature
+    shown below, the location_operator would be "join"
+    o strand - A value specifying on which strand (of a DNA sequence, for
+    instance) the feature deals with. 1 indicates the plus strand, -1
+    indicates the minus strand, 0 indicates both strands, and None indicates
+    that strand doesn't apply (ie. for proteins) or is not known.
+    o id - A string identifier for the feature.
+    o ref - A reference to another sequence. This could be an accession
+    number for some different sequence.
+    o ref_db - A different database for the reference accession number.
+    o qualifiers - A dictionary of qualifiers on the feature. These are
+    analogous to the qualifiers from a GenBank feature table. The keys of
+    the dictionary are qualifier names, the values are the qualifier
+    values.
+    o sub_features - Additional SeqFeatures which fall under this 'parent'
+    feature. For instance, if we have something like:
+
+    CDS    join(1..10,30..40,50..60)
+
+    The top level feature would be a CDS from 1 to 60, and the sub
+    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
+    40 and 50 to 60, respectively.
+    """
+    def __init__(self, location = None, type = '', location_operator = '',
+                 strand = None, id = "",
+                 qualifiers = {}, sub_features = [],
+                 ref = None, ref_db = None):
+        """Initialize a SeqFeature on a Sequence.
+        """
+        self.location = location
+
+        self.type = type
+        self.location_operator = location_operator
+        self.strand = strand
+        self.id = id
+        # XXX right now sub_features and qualifiers cannot be set
+        # from the initializer because this causes all kinds
+        # of recursive import problems.
I can't understand why this is + # at all :-< + self.qualifiers = {} + self.sub_features = [] + self.ref = ref + self.ref_db = ref_db + + def __repr__(self): + """A string representation of the record for debugging.""" + answer = "%s(%s" % (self.__class__, repr(self.location)) + if self.type : + answer += ", type=%s" % repr(self.type) + if self.location_operator : + answer += ", location_operator=%s" % repr(self.location_operator) + if self.strand : + answer += ", strand=%s" % repr(self.strand) + if self.id and self.id != "" : + answer += ", id=%s" % repr(self.id) + if self.ref : + answer += ", ref=%s" % repr(self.ref) + if self.ref_db : + answer += ", ref_db=%s" % repr(self.ref_db) + answer += ")" + return answer + + def __str__(self): + """A readable summary of the feature intended to be printed to screen. + """ + out = "type: %s\n" % self.type + out += "location: %s\n" % self.location + out += "ref: %s:%s\n" % (self.ref, self.ref_db) + out += "strand: %s\n" % self.strand + out += "qualifiers: \n" + qualifier_keys = self.qualifiers.keys() + qualifier_keys.sort() + for qual_key in qualifier_keys: + out += "\tKey: %s, Value: %s\n" % (qual_key, + self.qualifiers[qual_key]) + if len(self.sub_features) != 0: + out += "Sub-Features\n" + for sub_feature in self.sub_features: + out +="%s\n" % sub_feature + + return out + + def _shift(self, offset) : + """Returns a copy of the feature with its location shifted (PRIVATE). + + The annotation qaulifiers are copied.""" + answer = SeqFeature(location = self.location._shift(offset), + type = self.type, + location_operator = self.location_operator, + strand = self.strand, + id = self.id, + #qualifiers = dict(self.qualifiers.iteritems()), + #sub_features = [f._shift(offset) for f in self.sub_features], + ref = self.ref, + ref_db = self.ref_db) + #TODO - Sort out the use of sub_feature and qualifiers in __init___ + answer.sub_features = [f._shift(offset) for f in self.sub_features] + answer.qualifiers = dict(self.qualifiers.iteritems()) + return answer + +# --- References + +# TODO -- Will this hold PubMed and Medline information decently? +class Reference: + """Represent a Generic Reference object. + + Attributes: + o location - A list of Location objects specifying regions of + the sequence that the references correspond to. If no locations are + specified, the entire sequence is assumed. + o authors - A big old string, or a list split by author, of authors + for the reference. + o title - The title of the reference. + o journal - Journal the reference was published in. + o medline_id - A medline reference for the article. + o pubmed_id - A pubmed reference for the article. + o comment - A place to stick any comments about the reference. + """ + def __init__(self): + self.location = [] + self.authors = '' + self.consrtm = '' + self.title = '' + self.journal = '' + self.medline_id = '' + self.pubmed_id = '' + self.comment = '' + + def __str__(self): + """Output an informative string for debugging. + """ + out = "" + for single_location in self.location: + out += "location: %s\n" % single_location + out += "authors: %s\n" % self.authors + if self.consrtm: + out += "consrtm: %s\n" % self.consrtm + out += "title: %s\n" % self.title + out += "journal: %s\n" % self.journal + out += "medline id: %s\n" % self.medline_id + out += "pubmed id: %s\n" % self.pubmed_id + out += "comment: %s\n" % self.comment + + return out + +# --- Handling feature locations + +class FeatureLocation: + """Specify the location of a feature along a sequence. 
+
+    This attempts to deal with fuzziness of position ends, but also
+    to make it easy to get the start and end in the 'normal' case (no
+    fuzziness).
+
+    You should access the start and end attributes with
+    your_location.start and your_location.end. If the start and
+    end are exact, this will return the positions, if not, we'll return
+    the appropriate Fuzzy class with info about the position and fuzziness.
+
+    Note that the start and end location numbering follow Python's scheme,
+    thus a GenBank entry of 123..150 (one based counting) becomes a location
+    of [122:150] (zero based counting).
+    """
+    def __init__(self, start, end):
+        """Specify the start and end of a sequence feature.
+
+        start and end arguments specify the values where the feature begins
+        and ends. These can either be any of the *Position objects that
+        inherit from AbstractPosition, or can just be integers specifying the
+        position. In the case of integers, the values are assumed to be
+        exact and are converted into ExactPosition arguments. This is meant
+        to make it easy to deal with non-fuzzy ends.
+        """
+        if isinstance(start, AbstractPosition):
+            self._start = start
+        else:
+            self._start = ExactPosition(start)
+
+        if isinstance(end, AbstractPosition):
+            self._end = end
+        else:
+            self._end = ExactPosition(end)
+
+    def __str__(self):
+        """Returns a representation of the location (with python counting).
+
+        For the simple case this uses the python slicing syntax, [122:150]
+        (zero based counting) which GenBank would call 123..150 (one based
+        counting).
+        """
+        return "[%s:%s]" % (self._start, self._end)
+
+    def __repr__(self):
+        """A string representation of the location for debugging."""
+        return "%s(%s,%s)" \
+               % (self.__class__, repr(self.start), repr(self.end))
+
+    def _shift(self, offset) :
+        """Returns a copy of the location shifted by the offset (PRIVATE)."""
+        return FeatureLocation(start = self._start._shift(offset),
+                               end = self._end._shift(offset))
+
+    def __getattr__(self, attr):
+        """Make it easy to get non-fuzzy starts and ends.
+
+        We override __getattr__ here so that in non-fuzzy cases we
+        can just return the start and end position without any hassle.
+
+        To get fuzzy start and ends, just ask for item.start and
+        item.end. To get non-fuzzy attributes (ie. the position only)
+        ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
+        the largest range of the fuzzy position. So something like:
+        (10.20)..(30.40) should return 10 for start, and 40 for end.
+
+        The special tricky case is when we have a single between position
+        argument like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
+        to give a reasonable approximation of what this really means, which
+        is an empty region -- so the same position for both. Doing a special
+        case here sucks, but there is really not a general rule you can apply
+        to this.
+        """
+        #TODO - these are not currently implemented as properties, this means
+        #they do not show up via dir(...)
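+        #In the simple cases the logic below amounts to:
+        #  nofuzzy_start = min(position, position + extension)
+        #  nofuzzy_end   = max(position, position + extension)
+        #with a special case for a single BetweenPosition.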
+ if attr == 'start': + return self._start + elif attr == 'end': + return self._end + elif attr == 'nofuzzy_start': + if ((self._start == self._end) and isinstance(self._start, + BetweenPosition)): + return self._start.position + else: + return min(self._start.position, + self._start.position + self._start.extension) + elif attr == 'nofuzzy_end': + if ((self._start == self._end) and isinstance(self._start, + BetweenPosition)): + return self._end.position + else: + return max(self._end.position, + self._end.position + self._end.extension) + else: + raise AttributeError("Cannot evaluate attribute %s." % attr) + +class AbstractPosition: + """Abstract base class representing a position. + """ + def __init__(self, position, extension): + self.position = position + self.extension = extension + + def __repr__(self) : + """String representation of the location for debugging.""" + return "%s(%s,%s)" \ + % (self.__class__, repr(self.position), repr(self.extension)) + + def __cmp__(self, other): + """A simple comparison function for positions. + + This is very simple-minded and just compares the position attribute + of the features; extensions are not considered at all. This could + potentially be expanded to try to take advantage of extensions. + """ + assert isinstance(other, AbstractPosition), \ + "We can only do comparisons between Biopython Position objects." + + return cmp(self.position, other.position) + + def _shift(self, offset) : + #We want this to maintain the subclass when called from a subclass + return self.__class__(self.position + offset, self.extension) + +class ExactPosition(AbstractPosition): + """Specify the specific position of a boundary. + + o position - The position of the boundary. + o extension - An optional argument which must be zero since we don't + have an extension. The argument is provided so that the same number of + arguments can be passed to all position types. + + In this case, there is no fuzziness associated with the position. + """ + def __init__(self, position, extension = 0): + if extension != 0: + raise AttributeError("Non-zero extension %s for exact position." + % extension) + AbstractPosition.__init__(self, position, 0) + + def __repr__(self) : + """String representation of the ExactPosition location for debugging.""" + assert self.extension == 0 + return "%s(%s)" % (self.__class__, repr(self.position)) + + def __str__(self): + return str(self.position) + +class WithinPosition(AbstractPosition): + """Specify the position of a boundary within some coordinates. + + Arguments: + o position - The start position of the boundary + o extension - The range to which the boundary can extend. + + This allows dealing with a position like ((1.4)..100). This + indicates that the start of the sequence is somewhere between 1 + and 4. To represent that with this class we would set position as + 1 and extension as 3. + """ + def __init__(self, position, extension = 0): + AbstractPosition.__init__(self, position, extension) + + def __str__(self): + return "(%s.%s)" % (self.position, self.position + self.extension) + +class BetweenPosition(AbstractPosition): + """Specify the position of a boundary between two coordinates. + + Arguments: + o position - The start position of the boundary. + o extension - The range to the other position of a boundary. + + This specifies a coordinate which is found between the two positions. + So this allows us to deal with a position like ((1^2)..100). To + represent that with this class we set position as 1 and the + extension as 1. 
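+
+    For example, the 1^2 case would print as (a sketch):
+
+    >>> print BetweenPosition(1, 1)
+    (1^2)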
+    """
+    def __init__(self, position, extension = 0):
+        AbstractPosition.__init__(self, position, extension)
+
+    def __str__(self):
+        return "(%s^%s)" % (self.position, self.position + self.extension)
+
+class BeforePosition(AbstractPosition):
+    """Specify a position where the actual location occurs before it.
+
+    Arguments:
+    o position - The upper boundary of where the location can occur.
+    o extension - An optional argument which must be zero since we don't
+    have an extension. The argument is provided so that the same number of
+    arguments can be passed to all position types.
+
+    This is used to specify positions like (<10..100) where the location
+    occurs somewhere before position 10.
+    """
+    def __init__(self, position, extension = 0):
+        if extension != 0:
+            raise AttributeError("Non-zero extension %s for exact position."
+                                 % extension)
+        AbstractPosition.__init__(self, position, 0)
+
+    def __repr__(self) :
+        """A string representation of the location for debugging."""
+        assert self.extension == 0
+        return "%s(%s)" % (self.__class__, repr(self.position))
+
+    def __str__(self):
+        return "<%s" % self.position
+
+class AfterPosition(AbstractPosition):
+    """Specify a position where the actual location is found after it.
+
+    Arguments:
+    o position - The lower boundary of where the location can occur.
+    o extension - An optional argument which must be zero since we don't
+    have an extension. The argument is provided so that the same number of
+    arguments can be passed to all position types.
+
+    This is used to specify positions like (>10..100) where the location
+    occurs somewhere after position 10.
+    """
+    def __init__(self, position, extension = 0):
+        if extension != 0:
+            raise AttributeError("Non-zero extension %s for exact position."
+                                 % extension)
+        AbstractPosition.__init__(self, position, 0)
+
+    def __repr__(self) :
+        """A string representation of the location for debugging."""
+        assert self.extension == 0
+        return "%s(%s)" % (self.__class__, repr(self.position))
+
+    def __str__(self):
+        return ">%s" % self.position
+
+class OneOfPosition(AbstractPosition):
+    """Specify a position where the location can be multiple positions.
+
+    This models the GenBank 'one-of(1888,1901)' function, and tries
+    to make this fit within the Biopython Position models. In our case
+    the position of the "one-of" is set as the lowest choice, and the
+    extension is the range to the highest choice.
+    """
+    def __init__(self, position_list):
+        """Initialize with a set of possible positions.
+
+        position_list is a list of AbstractPosition derived objects,
+        specifying possible locations.
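+
+        For example (a sketch):
+
+        >>> p = OneOfPosition([ExactPosition(1888), ExactPosition(1901)])
+        >>> print p
+        one-of(1888,1901)
+        >>> p.position, p.extension
+        (1888, 13)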
+        """
+        # unique attribute for this type of positions
+        self.position_choices = position_list
+        # find the smallest and largest position in the choices
+        smallest = None
+        largest = None
+        for position_choice in self.position_choices:
+            assert isinstance(position_choice, AbstractPosition), \
+              "Expected position objects, got %r" % position_choice
+            if smallest is None and largest is None:
+                smallest = position_choice.position
+                largest = position_choice.position
+            elif position_choice.position > largest:
+                largest = position_choice.position
+            elif position_choice.position < smallest:
+                smallest = position_choice.position
+        # initialize with our definition of position and extension
+        AbstractPosition.__init__(self, smallest, largest - smallest)
+
+    def __repr__(self) :
+        """String representation of the OneOfPosition location for debugging."""
+        return "%s(%s)" % (self.__class__, repr(self.position_choices))
+
+    def __str__(self):
+        out = "one-of("
+        for position in self.position_choices:
+            out += "%s," % position
+        # replace the last comma with the closing parenthesis
+        out = out[:-1] + ")"
+        return out
+
+class PositionGap:
+    """Simple class to hold information about a gap between positions.
+    """
+    def __init__(self, gap_size):
+        """Initialize with a position object containing the gap information.
+        """
+        self.gap_size = gap_size
+
+    def __repr__(self) :
+        """A string representation of the position gap for debugging."""
+        return "%s(%s)" % (self.__class__, repr(self.gap_size))
+
+    def __str__(self):
+        out = "gap(%s)" % self.gap_size
+        return out
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/AceIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/AceIO.py
new file mode 100644
index 0000000..1670fb3
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/AceIO.py
@@ -0,0 +1,63 @@
+# Copyright 2008 by Peter Cock.  All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "ace" file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.Sequencing.Ace module which offers more than just accessing
+the contig consensus sequences in an ACE file as SeqRecord objects."""
+
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
+from Bio.Sequencing import Ace
+
+#This is a generator function!
+def AceIterator(handle) :
+    """Returns SeqRecord objects from an ACE file.
+
+    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
+    by iterating over the file in a single pass, we are forced to ignore any
+    WA, CT, RT or WR footer tags."""
+
+    for ace_contig in Ace.parse(handle) :
+        #Convert the ACE contig record into a SeqRecord...
+        consensus_seq_str = ace_contig.sequence
+        #Assume it's DNA unless there is a U in it,
+        if "U" in consensus_seq_str :
+            if "T" in consensus_seq_str :
+                #Very odd! Error?
+                alpha = generic_nucleotide
+            else :
+                alpha = generic_rna
+        else :
+            alpha = generic_dna
+
+        if "*" in consensus_seq_str :
+            #For consistency with most other file formats, map
+            #any * gaps into - gaps.
+            assert "-" not in consensus_seq_str
+            consensus_seq = Seq(consensus_seq_str.replace("*","-"),
+                                Gapped(alpha, gap_char="-"))
+        else :
+            consensus_seq = Seq(consensus_seq_str, alpha)
+
+        #TODO - Consensus base quality (BQ lines).
Note that any gaps + #(* character) in the consensus does not get a quality entry. + #This really needs Biopython support for per-letter-annotation. + + #TODO? - Base segments (BS lines) which indicates which read + #phrap has chosen to be the consensus at a particular position. + #Perhaps as SeqFeature objects? + + #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines) + #Perhaps as SeqFeature objects? + + seq_record = SeqRecord(consensus_seq, + id = ace_contig.name, + name = ace_contig.name) + yield seq_record + #All done diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/FastaIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/FastaIO.py new file mode 100644 index 0000000..38437ba --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/FastaIO.py @@ -0,0 +1,208 @@ +# Copyright 2006-2009 by Peter Cock. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# +# This module is for reading and writing FASTA format files as SeqRecord +# objects. The code is partly inspired by earlier Biopython modules, +# Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA + +"""Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. + +You are expected to use this module via the Bio.SeqIO functions.""" + +from Bio.Alphabet import single_letter_alphabet +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Interfaces import SequentialSequenceWriter + +#This is a generator function! +def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) : + """Generator function to iterate over Fasta records (as SeqRecord objects). + + handle - input file + alphabet - optional alphabet + title2ids - A function that, when given the title of the FASTA + file (without the beginning >), will return the id, name and + description (in that order) for the record as a tuple of strings. + + If this is not given, then the entire title line will be used + as the description, and the first word as the id and name. + + Note that use of title2ids matches that of Bio.Fasta.SequenceParser + but the defaults are slightly different. + """ + #Skip any text before the first record (e.g. blank lines, comments) + while True : + line = handle.readline() + if line == "" : return #Premature end of file, or just empty? + if line[0] == ">" : + break + + while True : + if line[0]!=">" : + raise ValueError("Records in Fasta files should start with '>' character") + if title2ids : + id, name, descr = title2ids(line[1:].rstrip()) + else : + descr = line[1:].rstrip() + id = descr.split()[0] + name = id + + lines = [] + line = handle.readline() + while True: + if not line : break + if line[0] == ">": break + #Remove trailing whitespace, and any internal spaces + #(and any embedded \r which are possible in mangled files + #when not opened in universal read lines mode) + lines.append(line.rstrip().replace(" ","").replace("\r","")) + line = handle.readline() + + #Return the record and then continue... + yield SeqRecord(Seq("".join(lines), alphabet), + id = id, name = name, description = descr) + + if not line : return #StopIteration + assert False, "Should not reach this line" + +class FastaWriter(SequentialSequenceWriter): + """Class to write Fasta format files.""" + def __init__(self, handle, wrap=60, record2title=None): + """Create a Fasta writer. + + handle - Handle to an output file, e.g. 
as returned + by open(filename, "w") + wrap - Optional line length used to wrap sequence lines. + Defaults to wrapping the sequence at 60 characters + Use zero (or None) for no wrapping, giving a single + long line for the sequence. + record2title - Optional function to return the text to be + used for the title line of each record. By default the + a combination of the record.id and record.description + is used. If the record.description starts with the + record.id, then just the record.description is used. + + You can either use: + + myWriter = FastaWriter(open(filename,"w")) + writer.write_file(myRecords) + + Or, follow the sequential file writer system, for example: + + myWriter = FastaWriter(open(filename,"w")) + writer.write_header() # does nothing for Fasta files + ... + Multiple calls to writer.write_record() and/or writer.write_records() + ... + writer.write_footer() # does nothing for Fasta files + writer.close() + """ + SequentialSequenceWriter.__init__(self, handle) + #self.handle = handle + self.wrap = None + if wrap : + if wrap < 1 : + raise ValueError + self.wrap = wrap + self.record2title = record2title + + def write_record(self, record): + """Write a single Fasta record to the file.""" + assert self._header_written + assert not self._footer_written + self._record_written = True + + if self.record2title : + title=self.clean(self.record2title(record)) + else : + id = self.clean(record.id) + description = self.clean(record.description) + + #if description[:len(id)]==id : + if description and description.split(None,1)[0]==id : + #The description includes the id at the start + title = description + else : + title = "%s %s" % (id, description) + + assert "\n" not in title + assert "\r" not in title + self.handle.write(">%s\n" % title) + + data = self._get_seq_string(record) #Catches sequence being None + + assert "\n" not in data + assert "\r" not in data + + if self.wrap : + for i in range(0, len(data), self.wrap): + self.handle.write(data[i:i+self.wrap] + "\n") + else : + self.handle.write(data + "\n") + +if __name__ == "__main__" : + print "Running quick self test" + + import os + from Bio.Alphabet import generic_protein, generic_nucleotide + + #Download the files from here: + #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans + fna_filename = "NC_005213.fna" + faa_filename = "NC_005213.faa" + + def genbank_name_function(text) : + text, descr = text.split(None,1) + id = text.split("|")[3] + name = id.split(".",1)[0] + return id, name, descr + + def print_record(record) : + #See also bug 2057 + #http://bugzilla.open-bio.org/show_bug.cgi?id=2057 + print "ID:" + record.id + print "Name:" + record.name + print "Descr:" + record.description + print record.seq + for feature in record.annotations : + print '/%s=%s' % (feature, record.annotations[feature]) + if record.dbxrefs : + print "Database cross references:" + for x in record.dbxrefs : print " - %s" % x + + if os.path.isfile(fna_filename) : + print "--------" + print "FastaIterator (single sequence)" + iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function) + count=0 + for record in iterator : + count=count+1 + print_record(record) + assert count == 1 + print str(record.__class__) + + if os.path.isfile(faa_filename) : + print "--------" + print "FastaIterator (multiple sequences)" + iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function) + count=0 + for record in iterator : + count=count+1 + 
print_record(record) + break + assert count>0 + print str(record.__class__) + + from cStringIO import StringIO + print "--------" + print "FastaIterator (empty input file)" + #Just to make sure no errors happen + iterator = FastaIterator(StringIO("")) + count = 0 + for record in iterator : + count = count+1 + assert count==0 + + print "Done" diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/IgIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/IgIO.py new file mode 100644 index 0000000..57b7aa1 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/IgIO.py @@ -0,0 +1,97 @@ +# Copyright 2008 by Peter Cock. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# +# This module is for reading and writing IntelliGenetics format files as +# SeqRecord objects. This file format appears to be the same as the MASE +# multiple sequence alignment format. + +"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. + +You are expected to use this module via the Bio.SeqIO functions.""" + +from Bio.Alphabet import single_letter_alphabet +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +#This is a generator function! +def IgIterator(handle, alphabet = single_letter_alphabet) : + """Iterate over IntelliGenetics records (as SeqRecord objects). + + handle - input file + alphabet - optional alphabet + + The optional free format file header lines (which start with two + semi-colons) are ignored. + + The free format commentary lines at the start of each record (which + start with a semi-colon) are recorded as a single string with embedded + new line characters in the SeqRecord's annotations dictionary under the + key 'comment'. + """ + #Skip any file header text before the first record (;; lines) + while True : + line = handle.readline() + if not line : break #Premature end of file, or just empty? + if not line.startswith(";;") : break + + while line : + #Now iterate over the records + if line[0]!=";" : + raise ValueError( \ + "Records should start with ';' and not:\n%s" % repr(line)) + + #Try and agree with SeqRecord convention from the GenBank parser, + #(and followed in the SwissProt parser) which stores the comments + #as a long string with newlines under annotations key 'comment'. + + #Note some examples use "; ..." and others ";..." + comment_lines = [] + while line.startswith(";") : + #TODO - Extract identifier from lines like "LOCUS\tB_SF2"? + comment_lines.append(line[1:].strip()) + line = handle.readline() + title = line.rstrip() + + seq_lines = [] + while True: + line = handle.readline() + if not line : break + if line[0] == ";": break + #Remove trailing whitespace, and any internal spaces + seq_lines.append(line.rstrip().replace(" ","")) + seq_str = "".join(seq_lines) + if seq_str.endswith("1") : + #Remove the optional terminator (digit one) + seq_str = seq_str[:-1] + if "1" in seq_str : + raise ValueError("Potential terminator digit one found within sequence.") + + #Return the record and then continue... 
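+        #(the cleaned title line is reused for both the record id and name)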
+ record= SeqRecord(Seq(seq_str, alphabet), + id = title, name = title) + record.annotations['comment'] = "\n".join(comment_lines) + yield record + + #We should be at the end of the file now + assert not line + +if __name__ == "__main__" : + print "Running quick self test" + + import os + path = "../../Tests/IntelliGenetics/" + if os.path.isdir(path) : + for filename in os.listdir(path) : + if os.path.splitext(filename)[-1] == ".txt" : + print + print filename + print "-"*len(filename) + handle = open(os.path.join(path, filename)) + for record in IgIterator(handle) : + print record.id, len(record) + handle.close() + print "Done" + else : + print "Could not find input files" diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/InsdcIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/InsdcIO.py new file mode 100644 index 0000000..f2215ed --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/InsdcIO.py @@ -0,0 +1,377 @@ +# Copyright 2007-2009 by Peter Cock. All rights reserved. +# +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package.. + +"""Bio.SeqIO support for the "genbank" and "embl" file formats. + +You are expected to use this module via the Bio.SeqIO functions. +Note that internally this module calls Bio.GenBank to do the actual +parsing of both GenBank and EMBL files. + +See also: + +International Nucleotide Sequence Database Collaboration +http://www.insdc.org/ + +GenBank +http://www.ncbi.nlm.nih.gov/Genbank/ + +EMBL Nucleotide Sequence Database +http://www.ebi.ac.uk/embl/ + +DDBJ (DNA Data Bank of Japan) +http://www.ddbj.nig.ac.jp/ +""" + +from Bio.Seq import UnknownSeq +from Bio.GenBank.Scanner import GenBankScanner, EmblScanner +from Bio import Alphabet +from Interfaces import SequentialSequenceWriter + +# NOTE +# ==== +# The "brains" for parsing GenBank and EMBL files (and any +# other flat file variants from the INSDC in future) is in +# Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) + +def GenBankIterator(handle) : + """Breaks up a Genbank file into SeqRecord objects. + + Every section from the LOCUS line to the terminating // becomes + a single SeqRecord with associated annotation and features. + + Note that for genomes or chromosomes, there is typically only + one record.""" + #This calls a generator function: + return GenBankScanner(debug=0).parse_records(handle) + +def EmblIterator(handle) : + """Breaks up an EMBL file into SeqRecord objects. + + Every section from the LOCUS line to the terminating // becomes + a single SeqRecord with associated annotation and features. + + Note that for genomes or chromosomes, there is typically only + one record.""" + #This calls a generator function: + return EmblScanner(debug=0).parse_records(handle) + +def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) : + """Breaks up a Genbank file into SeqRecord objects for each CDS feature. + + Every section from the LOCUS line to the terminating // can contain + many CDS features. These are returned as with the stated amino acid + translation sequence (if given). + """ + #This calls a generator function: + return GenBankScanner(debug=0).parse_cds_features(handle, alphabet) + +def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein) : + """Breaks up a EMBL file into SeqRecord objects for each CDS feature. + + Every section from the LOCUS line to the terminating // can contain + many CDS features. 
These are returned with the stated amino acid
+    translation sequence (if given).
+    """
+    #This calls a generator function:
+    return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
+
+class GenBankWriter(SequentialSequenceWriter) :
+    HEADER_WIDTH = 12
+    MAX_WIDTH = 80
+
+    def _write_single_line(self, tag, text) :
+        """Used in the 'header' of each GenBank record."""
+        assert len(tag) < self.HEADER_WIDTH
+        assert len(text) < self.MAX_WIDTH - self.HEADER_WIDTH, \
+               "Annotation %s too long for %s line" % (repr(text), tag)
+        self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH),
+                                      text.replace("\n"," ")))
+
+    def _write_multi_line(self, tag, text) :
+        """Used in the 'header' of each GenBank record."""
+        #TODO - Do the line splitting while preserving white space?
+        max_len = self.MAX_WIDTH - self.HEADER_WIDTH
+        assert len(tag) < self.HEADER_WIDTH
+        text = text.strip()
+        if len(text) < max_len :
+            self._write_single_line(tag, text)
+            return
+
+        words = text.split()
+        assert max([len(w) for w in words]) < max_len, \
+               "Your description cannot be broken into nice lines!"
+        text = ""
+        while words and len(text) + 1 + len(words[0]) < max_len :
+            text += " " + words.pop(0)
+        text = text.strip()
+        assert len(text) < max_len
+        self._write_single_line(tag, text)
+        while words :
+            text = ""
+            while words and len(text) + 1 + len(words[0]) < max_len :
+                text += " " + words.pop(0)
+            text = text.strip()
+            assert len(text) < max_len
+            self._write_single_line("", text)
+        assert not words
+
+    def _write_the_first_line(self, record) :
+        """Write the LOCUS line."""
+
+        locus = record.name
+        if not locus or locus == "" :
+            locus = record.id
+        if not locus or locus == "" :
+            locus = self._get_annotation_str(record, "accession", just_first=True)
+        if len(locus) > 16 :
+            raise ValueError("Locus identifier %s is too long" % repr(locus))
+
+        if len(record) > 99999999999 :
+            #Currently GenBank only officially supports up to 350000, but
+            #the length field can take eleven digits
+            raise ValueError("Sequence too long!")
+
+        #Get the base alphabet (underneath any Gapped or StopCodon encoding)
+        a = Alphabet._get_base_alphabet(record.seq.alphabet)
+        if not isinstance(a, Alphabet.Alphabet) :
+            raise TypeError("Invalid alphabet")
+        elif isinstance(a, Alphabet.ProteinAlphabet) :
+            units = "aa"
+        elif isinstance(a, Alphabet.NucleotideAlphabet) :
+            units = "bp"
+        else :
+            #Must be something like NucleotideAlphabet or
+            #just the generic Alphabet (default for fasta files)
+            raise ValueError("Need a Nucleotide or Protein alphabet")
+
+        #Get the molecule type
+        #TODO - record this explicitly in the parser?
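+        #Until then, infer the molecule type from the alphabet class: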
+ if isinstance(a, Alphabet.ProteinAlphabet) : + mol_type = "" + elif isinstance(a, Alphabet.DNAAlphabet) : + mol_type = "DNA" + elif isinstance(a, Alphabet.RNAAlphabet) : + mol_type = "RNA" + else : + #Must be something like NucleotideAlphabet or + #just the generic Alphabet (default for fasta files) + raise ValueError("Need a DNA, RNA or Protein alphabet") + + try : + division = record.annotations["data_file_division"] + except KeyError : + division = "UNK" + if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT", + "VRL","PHG","SYN","UNA","EST","PAT","STS", + "GSS","HTG","HTC","ENV"] : + division = "UNK" + + assert len(units) == 2 + assert len(division) == 3 + #TODO - date + #TODO - mol_type + line = "LOCUS %s %s %s %s %s 01-JAN-1980\n" \ + % (locus.ljust(16), + str(len(record)).rjust(11), + units, + mol_type.ljust(6), + division) + assert len(line) == 79+1, repr(line) #plus one for new line + + assert line[12:28].rstrip() == locus, \ + 'LOCUS line does not contain the locus at the expected position:\n' + line + assert line[28:29] == " " + assert line[29:40].lstrip() == str(len(record)), \ + 'LOCUS line does not contain the length at the expected position:\n' + line + + #Tests copied from Bio.GenBank.Scanner + assert line[40:44] in [' bp ', ' aa '] , \ + 'LOCUS line does not contain size units at expected position:\n' + line + assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ + 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line + assert line[47:54].strip() == "" \ + or line[47:54].strip().find('DNA') != -1 \ + or line[47:54].strip().find('RNA') != -1, \ + 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line + assert line[54:55] == ' ', \ + 'LOCUS line does not contain space at position 55:\n' + line + assert line[55:63].strip() in ['','linear','circular'], \ + 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line + assert line[63:64] == ' ', \ + 'LOCUS line does not contain space at position 64:\n' + line + assert line[67:68] == ' ', \ + 'LOCUS line does not contain space at position 68:\n' + line + assert line[70:71] == '-', \ + 'LOCUS line does not contain - at position 71 in date:\n' + line + assert line[74:75] == '-', \ + 'LOCUS line does not contain - at position 75 in date:\n' + line + + self.handle.write(line) + + def _get_annotation_str(self, record, key, default=".", just_first=False) : + """Get an annotation dictionary entry (as a string). + + Some entries are lists, in which case if just_first=True the first entry + is returned. If just_first=False (default) this verifies there is only + one entry before returning it.""" + try : + answer = record.annotations[key] + except KeyError : + return default + if isinstance(answer, list) : + if not just_first : assert len(answer) == 1 + return str(answer[0]) + else : + return str(answer) + + def _write_sequence(self, record): + #Loosely based on code from Howard Salis + #TODO - Force lower case? + LETTERS_PER_LINE = 60 + SEQUENCE_INDENT = 9 + + if isinstance(record.seq, UnknownSeq) : + #We have already recorded the length, and there is no need + #to record a long sequence of NNNNNNN...NNN or whatever. 
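+            #For orientation, the loop below (reached when there is a real
+            #sequence) emits the traditional ORIGIN block layout: a base count
+            #right justified to nine characters, then up to six space separated
+            #blocks of ten letters, e.g. (illustrative output only):
+            #
+            #        1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
+            #       61 ccgacatgag ac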
+ return + + data = self._get_seq_string(record) #Catches sequence being None + seq_len = len(data) + for line_number in range(0,seq_len,LETTERS_PER_LINE): + self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT)) + for words in range(line_number,min(line_number+LETTERS_PER_LINE,seq_len),10): + self.handle.write(" %s" % data[words:words+10]) + self.handle.write("\n") + + def write_record(self, record): + """Write a single record to the output file.""" + handle = self.handle + self._write_the_first_line(record) + + accession = self._get_annotation_str(record, "accession", + record.id.split(".",1)[0], + just_first=True) + acc_with_version = accession + if record.id.startswith(accession+".") : + try : + acc_with_version = "%s.%i" \ + % (accession, int(record.id.split(".",1)[1])) + except ValueError : + pass + gi = self._get_annotation_str(record, "gi", just_first=True) + + descr = record.description + if descr == "" : descr = "." + self._write_multi_line("DEFINITION", descr) + + self._write_single_line("ACCESSION", accession) + if gi != "." : + self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi)) + else : + self._write_single_line("VERSION", "%s" % (acc_with_version)) + + try : + #List of strings + keywords = "; ".join(record.annotations["keywords"]) + except KeyError : + keywords = "." + self._write_multi_line("KEYWORDS", keywords) + + self._write_multi_line("SOURCE", \ + self._get_annotation_str(record, "source")) + #The ORGANISM line MUST be a single line, as any continuation is the taxonomy + org = self._get_annotation_str(record, "organism") + if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH : + org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..." + self._write_single_line(" ORGANISM", org) + try : + #List of strings + taxonomy = "; ".join(record.annotations["taxonomy"]) + except KeyError : + taxonomy = "." + self._write_multi_line("", taxonomy) + + #TODO - References... + handle.write("FEATURES Location/Qualifiers\n") + for feature in record.features : + self._write_feature(feature) + handle.write("ORIGIN\n") + self._write_sequence(record) + handle.write("//\n") + + def _write_feature(self, feature): + """Write a single SeqFeature object to features table. + + Not implemented yet, but this stub exists in the short term to + facilitate working on writing GenBank files with a sub-class.""" + #TODO - Features... + pass + +if __name__ == "__main__" : + print "Quick self test" + import os + from StringIO import StringIO + + def check_genbank_writer(records) : + handle = StringIO() + GenBankWriter(handle).write_file(records) + handle.seek(0) + + records2 = list(GenBankIterator(handle)) + + assert len(records) == len(records2) + for r1, r2 in zip(records, records2) : + #The SwissProt parser may leave \n in the description... 
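+        #(The GenBank writer folds newlines to spaces when it writes the
+        #DEFINITION line, so normalise the original description in the same
+        #way before comparing the two records.)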
+ assert r1.description.replace("\n", " ") == r2.description + assert r1.id == r2.id + assert r1.name == r2.name + assert str(r1.seq) == str(r2.seq) + for key in ["gi", "keywords", "source", "taxonomy"] : + if key in r1.annotations : + assert r1.annotations[key] == r2.annotations[key], key + for key in ["organism"] : + if key in r1.annotations : + v1 = r1.annotations[key] + v2 = r2.annotations[key] + assert isinstance(v1, str) and isinstance(v2, str) + #SwissProt organism can be too long to record in GenBank format + assert v1 == v2 or \ + (v2.endswith("...") and v1.startswith(v2[:-3])), key + + for filename in os.listdir("../../Tests/GenBank") : + if not filename.endswith(".gbk") and not filename.endswith(".gb") : + continue + print filename + + handle = open("../../Tests/GenBank/%s" % filename) + records = list(GenBankIterator(handle)) + handle.close() + + check_genbank_writer(records) + + for filename in os.listdir("../../Tests/EMBL") : + if not filename.endswith(".embl") : + continue + print filename + + handle = open("../../Tests/EMBL/%s" % filename) + records = list(EmblIterator(handle)) + handle.close() + + check_genbank_writer(records) + + from Bio import SeqIO + for filename in os.listdir("../../Tests/SwissProt") : + if not filename.startswith("sp") : + continue + print filename + + handle = open("../../Tests/SwissProt/%s" % filename) + records = list(SeqIO.parse(handle,"swiss")) + handle.close() + + check_genbank_writer(records) + diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/Interfaces.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/Interfaces.py new file mode 100644 index 0000000..3d28298 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/Interfaces.py @@ -0,0 +1,275 @@ +# Copyright 2006-2008 by Peter Cock. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +""" +Bio.SeqIO support module (not for general use). + +Unless you are writing a new parser or writer for Bio.SeqIO, you should not +use this module. It provides base classes to try and simplify things. +""" + +from Bio.Alphabet import generic_alphabet + +class SequenceIterator : + """Base class for building SeqRecord iterators. + + You should write a next() method to return SeqRecord + objects. You may wish to redefine the __init__ + method as well. + """ + def __init__(self, handle, alphabet=generic_alphabet) : + """Create a SequenceIterator object. + + handle - input file + alphabet - optional, e.g. Bio.Alphabet.generic_protein + + Note when subclassing: + - there should be a single non-optional argument, + the handle. + - you do not have to require an alphabet. + - you can add additional optional arguments.""" + self.handle = handle + self.alphabet = alphabet + ##################################################### + # You may want to subclass this, for example # + # to read through the file to find the first record,# + # or if additional arguments are required. # + ##################################################### + + def next(self) : + """Return the next record in the file. + + This method should be replaced by any derived class to do something useful.""" + raise NotImplementedError("This object should be subclassed") + ##################################################### + # You SHOULD subclass this, to split the file up # + # into your individual records, and convert these # + # into useful objects, e.g. 
return SeqRecord object # + ##################################################### + + def __iter__(self): + """Iterate over the entries as a SeqRecord objects. + + Example usage for Fasta files: + + myFile = open("example.fasta","r") + myFastaReader = FastaIterator(myFile) + for record in myFastaReader : + print record.id + print record.seq + myFile.close()""" + return iter(self.next, None) + +class InterlacedSequenceIterator(SequenceIterator) : + """Base class for any iterator of a non-sequential file type. + + This object is not intended for use directly. + + When writing a parser for any interlaced sequence file where the whole + file must be read in order to extract any single record, then you should + subclass this object. + + All you need to do is to define your own: + (1) __init__ method to parse the file and call self.move_start() + (2) __len__ method to return the number of records + (3) __getitem__ to return any requested record. + + This class will then provide the iterator methods including next(), but relies + on knowing the total number of records and tracking the pending record index in + as self._n + + It is up to the subclassed object to decide if it wants to generate a cache of + SeqRecords when initialised, or simply use its own lists and dicts and create + SeqRecords on request. + """ + + def __init__(self) : + """Create the object. + + This method should be replaced by any derived class to do something useful.""" + #We assume that your implementation of __init__ will ensure self._n=0 + self.move_start() + raise NotImplementedError("This object method should be subclassed") + ##################################################### + # You SHOULD subclass this # + ##################################################### + + def __len__(self) : + """Return the number of records. + + This method should be replaced by any derived class to do something useful.""" + raise NotImplementedError("This object method should be subclassed") + ##################################################### + # You SHOULD subclass this # + ##################################################### + + def __getitem__(self, i) : + """Return the requested record. + + This method should be replaced by any derived class to do something + useful. + + It should NOT touch the value of self._n""" + raise NotImplementedError("This object method should be subclassed") + ##################################################### + # You SHOULD subclass this # + ##################################################### + + def move_start(self) : + self._n = 0 + + def next(self) : + next_record = self._n + if next_record < len(self) : + self._n = next_record+1 + return self[next_record] + else : + #StopIteration + return None + + def __iter__(self): + return iter(self.next, None) + +class SequenceWriter: + """This class should be subclassed. + + Interlaced file formats (e.g. Clustal) should subclass directly. + + Sequential file formats (e.g. Fasta, GenBank) should subclass + the SequentialSequenceWriter class instead. + """ + def __init__(self, handle): + """Creates the writer object. + + Use the method write_file() to actually record your sequence records.""" + self.handle = handle + + def _get_seq_string(self, record): + """Use this to catch errors like the sequence being None.""" + try : + #The tostring() method is part of the Seq API, we could instead + #use str(record.seq) but that would give a string "None" if the + #sequence was None, and unpredicatable output if an unexpected + #object was present. 
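+            #For example (illustrative): Seq("ACGT").tostring() gives "ACGT",
+            #whereas str(None) would silently give the misleading string "None".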
+ return record.seq.tostring() + except AttributeError : + if record.seq is None : + #We could silently treat this as an empty sequence, Seq(""), + #but that would be an implict assumption we should avoid. + raise TypeError("SeqRecord (id=%s) has None for its sequence." \ + % record.id) + else : + raise TypeError("SeqRecord (id=%s) has an invalid sequence." \ + % record.id) + + def clean(self, text) : + """Use this to avoid getting newlines in the output.""" + answer = text + for x in ["\n", "\r"] : + answer = answer.replace(x, " ") + return answer.replace(" ", " ") + + def write_file(self, records) : + """Use this to write an entire file containing the given records. + + records - A list or iterator returning SeqRecord objects + + Should return the number of records (as an integer). + + This method can only be called once.""" + #Note when implementing this, you should close the file at the end. + raise NotImplementedError("This object should be subclassed") + ##################################################### + # You SHOULD subclass this # + ##################################################### + +class SequentialSequenceWriter(SequenceWriter): + """This class should be subclassed. + + It is intended for sequential file formats with an (optional) + header, repeated records, and an (optional) footer. + + In this case (as with interlaced file formats), the user may + simply call the write_file() method and be done. + + However, they may also call the write_header(), followed + by multiple calls to write_record() and/or write_records() + followed finally by write_footer(). + + Users must call write_header() and write_footer() even when + the file format concerned doesn't have a header or footer. + This is to try and make life as easy as possible when + switching the output format. + + Note that write_header() cannot require any assumptions about + the number of records. + """ + def __init__(self, handle): + self.handle = handle + self._header_written = False + self._record_written = False + self._footer_written = False + + def write_header(self) : + assert not self._header_written, "You have aleady called write_header()" + assert not self._record_written, "You have aleady called write_record() or write_records()" + assert not self._footer_written, "You have aleady called write_footer()" + self._header_written = True + + def write_footer(self) : + assert self._header_written, "You must call write_header() first" + assert self._record_written, "You have not called write_record() or write_records() yet" + assert not self._footer_written, "You have aleady called write_footer()" + self._footer_written = True + + def write_record(self, record): + """Write a single record to the output file. + + record - a SeqRecord object + + Once you have called write_header() you can call write_record() + and/or write_records() as many times as needed. Then call + write_footer() and close().""" + assert self._header_written, "You must call write_header() first" + assert not self._footer_written, "You have already called write_footer()" + self._record_written = True + raise NotImplementedError("This object should be subclassed") + ##################################################### + # You SHOULD subclass this # + ##################################################### + + def write_records(self, records): + """Write multiple record to the output file. 
+
+        records - A list or iterator returning SeqRecord objects
+
+        Once you have called write_header() you can call write_record()
+        and/or write_records() as many times as needed. Then call
+        write_footer() and close().
+
+        Returns the number of records written.
+        """
+        #Default implementation:
+        assert self._header_written, "You must call write_header() first"
+        assert not self._footer_written, "You have already called write_footer()"
+        count = 0
+        for record in records :
+            self.write_record(record)
+            count += 1
+        #Mark as true, even if there were no records
+        self._record_written = True
+        return count
+
+    def write_file(self, records) :
+        """Use this to write an entire file containing the given records.
+
+        records - A list or iterator returning SeqRecord objects
+
+        This method can only be called once. Returns the number of records
+        written.
+        """
+        self.write_header()
+        count = self.write_records(records)
+        self.write_footer()
+        return count
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PhdIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PhdIO.py
new file mode 100644
index 0000000..29bfe14
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PhdIO.py
@@ -0,0 +1,43 @@
+# Copyright 2008 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "phd" file format.
+
+PHD files are output by PHRED and used by PHRAP and CONSED.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the underlying Bio.Sequencing.Phd module."""
+
+from Bio.SeqRecord import SeqRecord
+from Bio.Sequencing import Phd
+
+#This is a generator function!
+def PhdIterator(handle) :
+    """Returns SeqRecord objects from a PHD file.
+
+    This uses the Bio.Sequencing.Phd module to do the hard work.
+    """
+
+    phd_records = Phd.parse(handle)
+    for phd_record in phd_records:
+        #Convert the PHD record into a SeqRecord...
+        seq_record = SeqRecord(phd_record.seq,
+                               id = phd_record.file_name,
+                               name = phd_record.file_name)
+        #Just re-use the comments dictionary as the SeqRecord's annotations
+        seq_record.annotations = phd_record.comments
+        yield seq_record
+    #All done
+
+if __name__ == "__main__" :
+    print "Quick self test"
+    handle = open("../../Tests/Phd/Phd1")
+    for record in PhdIterator(handle) :
+        print record
+    handle.close()
+    print "Done"
+
+
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PirIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PirIO.py
new file mode 100644
index 0000000..953076c
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/PirIO.py
@@ -0,0 +1,182 @@
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+# This module is for reading and writing PIR or NBRF format files as
+# SeqRecord objects. The code is based on Bio.SeqIO.FastaIO
+
+"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
+
+You are expected to use this module via the Bio.SeqIO functions, or if
+the file contains a sequence alignment, optionally via Bio.AlignIO instead.
+
+This format was introduced for the Protein Information Resource (PIR), a
+project of the National Biomedical Research Foundation (NBRF). The PIR
+database itself is now part of UniProt.
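+
+(As an aside, a minimal usage sketch for the PhdIO module above - the file
+name here is hypothetical, and this assumes "phd" and "fasta" are registered
+Bio.SeqIO format names, as their docstrings suggest:
+
+    from Bio import SeqIO
+    handle = open("my_reads.phd.1")            #hypothetical PHD input
+    out_handle = open("my_reads.fasta", "w")   #hypothetical FASTA output
+    count = SeqIO.write(SeqIO.parse(handle, "phd"), out_handle, "fasta")
+    out_handle.close()
+    handle.close()
+    print "Converted %i PHRED reads to FASTA" % count
+
+The count returned by Bio.SeqIO.write() lets you confirm how many records
+were converted.)
+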
+ +The file format is described online at: +http://www.ebi.ac.uk/help/pir_frame.html +http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) + +An example file in this format would be: + +>P1;CRAB_ANAPL +ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). + MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR + SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH + GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ + SDVPERSIPI TREEKPAIAG AQRK* + +>P1;CRAB_BOVIN +ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). + MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR + PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV + HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK + QASGPERTIP ITREEKPAVT AAPKK* + +Or, an example of a multiple sequence alignment: + +>P1;S27231 +rhodopsin - northern leopard frog +MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY +VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG +GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP +EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES +ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI +YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* + +>P1;I51200 +rhodopsin - African clawed frog +MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF +VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG +GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP +EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES +LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI +YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* + +>P1;JN0120 +rhodopsin - Japanese lamprey +MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF +VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG +GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP +EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES +ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL +YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* + + +As with the FASTA format, each record starts with a line begining with ">" +character. There is then a two letter sequence type (P1, F1, DL, DC, RL, +RC, or XX), a semi colon, and the identification code. The second like is +free text description. The remaining lines contain the sequence itself, +terminating in an asterisk. Space separated blocks of ten letters as shown +above are typical. + +Sequence codes and their meanings: + +P1 - Protein (complete) +F1 - Protein (fragment) +D1 - DNA (e.g. EMBOSS seqret output) +DL - DNA (linear) +DC - DNA (circular) +RL - RNA (linear) +RC - RNA (circular) +N3 - tRNA +N1 - Other functional RNA +XX - Unknown +""" + +from Bio.Alphabet import single_letter_alphabet, generic_protein, generic_dna, generic_rna +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +_pir_alphabets = {"P1" : generic_protein, + "F1" : generic_protein, + "D1" : generic_dna, + "DL" : generic_dna, + "DC" : generic_dna, + "RL" : generic_rna, + "RC" : generic_rna, + "N3" : generic_rna, + "XX" : single_letter_alphabet, + } + +#This is a generator function! +def PirIterator(handle) : + """Generator function to iterate over Fasta records (as SeqRecord objects). 
+ + handle - input file + alphabet - optional alphabet + title2ids - A function that, when given the title of the FASTA + file (without the beginning >), will return the id, name and + description (in that order) for the record as a tuple of strings. + + If this is not given, then the entire title line will be used + as the description, and the first word as the id and name. + + Note that use of title2ids matches that of Bio.Fasta.SequenceParser + but the defaults are slightly different. + """ + #Skip any text before the first record (e.g. blank lines, comments) + while True : + line = handle.readline() + if line == "" : return #Premature end of file, or just empty? + if line[0] == ">" : + break + + while True : + if line[0]!=">" : + raise ValueError("Records in PIR files should start with '>' character") + pir_type = line[1:3] + if pir_type not in _pir_alphabets or line[3] != ";" : + raise ValueError("Records should start with '>XX;' where XX is a valid sequence type") + identifier = line[4:].strip() + description = handle.readline().strip() + + + lines = [] + line = handle.readline() + while True: + if not line : break + if line[0] == ">": break + #Remove trailing whitespace, and any internal spaces + lines.append(line.rstrip().replace(" ","")) + line = handle.readline() + seq = "".join(lines) + if seq[-1] != "*" : + #Note the * terminator is present on nucleotide sequences too, + #it is not a stop codon! + raise ValueError("Sequences in PIR files should include a * terminator!") + + #Return the record and then continue... + record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]), + id = identifier, name = identifier, + description = description) + record.annotations["PIR-type"] = pir_type + yield record + + if not line : return #StopIteration + assert False, "Should not reach this line" + +if __name__ == "__main__" : + print "Running quick self test" + + from StringIO import StringIO + import os + + for name in ["clustalw", "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"] : + print name + filename = "../../Tests/NBRF/%s.pir" % name + if not os.path.isfile(filename) : + print "Missing %s" % filename + continue + + records = list(PirIterator(open(filename))) + count = 0 + for record in records : + count += 1 + parts = record.description.split() + if "bases," in parts : + assert len(record) == int(parts[parts.index("bases,")-1]) + print "Could read %s (%i records)" % (name, count) + diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/QualityIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/QualityIO.py new file mode 100644 index 0000000..e80d5f2 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/QualityIO.py @@ -0,0 +1,1113 @@ +# Copyright 2009 by Peter Cock. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +# +# This module is for reading and writing FASTQ and QUAL format files as +# SeqRecord objects, and is expected to be used via the Bio.SeqIO API. + +"""Bio.SeqIO support for the "fastq" and "qual" file formats. + +Note that you are expected to use this code via the Bio.SeqIO interface, as +shown below. + +The FASTQ file format is used frequently at the Wellcome Trust Sanger Institute +to bundle a FASTA sequence and its PHRED quality data (integers between 0 and +90). Rather than using a single FASTQ file, often paired FASTA and QUAL files +are used containing the sequence and the quality information separately. 
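+
+(Rounding off the PirIO module above, a minimal reading sketch - the file
+name is hypothetical, and the "pir" format name is assumed to be registered
+with Bio.SeqIO as that module's docstring states:
+
+    from Bio import SeqIO
+    handle = open("example_database.pir")      #hypothetical PIR/NBRF input
+    for record in SeqIO.parse(handle, "pir") :
+        #PirIterator keeps the two letter sequence type code (e.g. P1)
+        #in the record's annotations dictionary
+        print record.id, record.annotations["PIR-type"], len(record)
+    handle.close()
+)
+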
+ +The PHRED software reads DNA sequencing trace files, calls bases, and +assigns a quality value between 0 and 90 to each called base using a logged +transformation of the error probability, Q = -10 log10( Pe ), for example:: + + Pe = 0.0, Q = 0 + Pe = 0.1, Q = 10 + Pe = 0.01, Q = 20 + ... + Pe = 0.00000001, Q = 80 + Pe = 0.000000001, Q = 90 + +In the QUAL format these quality values are held as space separated text in +a FASTA like file format. In the FASTQ format, each quality values is encoded +with a single ASCI character using chr(Q+33), meaning zero maps to the +character "!" and for example 80 maps to "q". The sequences and quality are +then stored in pairs in a FASTA like format. + +Unfortunately there is no official document describing the FASTQ file format, +and worse, several related but different variants exist. Reasonable +documentation exists at: http://maq.sourceforge.net/fastq.shtml + +Solexa/Illumina quality scores use Q = - 10 log10 ( Pe / (1-Pe) ), which can +be negative or easily exceed 90. PHRED scores and Solexa scores are NOT +interchangeable (but a reasonable mapping can be achieved between them). +Confusingly Solexa produces a FASTQ like file but using their own score +mapping instead. + +Also note that Roche 454 sequencers can output files in the QUAL format, and +thankfully they use PHREP style scores like Sanger. To extract QUAL files from +a Roche 454 SFF binary file, use the Roche off instrument command line tool +"sffinfo" with the -q or -qual argument. You can extract a matching FASTA file +using the -s or -seq argument instead. + +You are expected to use this module via the Bio.SeqIO functions, with the +following format names: + - "fastq" means Sanger style FASTQ files using PHRED scores. + - "fastq-solexa" means Solexa/Illumina style FASTQ files. + - "qual" means simple quality files using PHRED scores. + +For example, consider the following short FASTQ file (extracted from a real +NCBI dataset):: + + @EAS54_6_R1_2_1_413_324 + CCCTTCTTGTCTTCAGCGTTTCTCC + + + ;;3;;;;;;;;;;;;7;;;;;;;88 + @EAS54_6_R1_2_1_540_792 + TTGGCAGGCCAAGGCCGATGGATCA + + + ;;;;;;;;;;;7;;;;;-;;;3;83 + @EAS54_6_R1_2_1_443_348 + GTTGCTTCTGGCGTGGGTGGGGGGG + + + ;;;;;;;;;;;9;7;;.7;393333 + +This contains three reads of length 25. From the read length these were +probably originally from an early Solexa/Illumina sequencer but NCBI have +followed the Sanger FASTQ convention and this actually uses PHRED style +qualities. This means we can parse this file using Bio.SeqIO using "fastq" +as the format name: + + >>> from Bio import SeqIO + >>> for record in SeqIO.parse(open("Quality/example.fastq"), "fastq") : + ... 
print record.id, record.seq + EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC + EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA + EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG + +The qualities are held as a list of integers in each record's annotation: + + >>> print record + ID: EAS54_6_R1_2_1_443_348 + Name: EAS54_6_R1_2_1_443_348 + Description: EAS54_6_R1_2_1_443_348 + Number of features: 0 + Per letter annotation for: phred_quality + Seq('GTTGCTTCTGGCGTGGGTGGGGGGG', SingleLetterAlphabet()) + >>> print record.letter_annotations["phred_quality"] + [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18] + +You can use the SeqRecord format method you can show this in the QUAL format: + + >>> print record.format("qual") + >EAS54_6_R1_2_1_443_348 + 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18 + 24 18 18 18 18 + + +Or go back to the FASTQ format, + + >>> print record.format("fastq") + @EAS54_6_R1_2_1_443_348 + GTTGCTTCTGGCGTGGGTGGGGGGG + + + ;;;;;;;;;;;9;7;;.7;393333 + + +You can also get Biopython to convert the scores and show a Solexa style +FASTQ file: + + >>> print record.format("fastq-solexa") + @EAS54_6_R1_2_1_443_348 + GTTGCTTCTGGCGTGGGTGGGGGGG + + + ZZZZZZZZZZZXZVZZMVZRXRRRR + + +If you wanted to trim your sequences (perhaps to remove low quality regions, +or to remove a primer sequence), try slicing the SeqRecord objects. e.g. + + >>> sub_rec = record[5:15] + >>> print sub_rec + ID: EAS54_6_R1_2_1_443_348 + Name: EAS54_6_R1_2_1_443_348 + Description: EAS54_6_R1_2_1_443_348 + Number of features: 0 + Per letter annotation for: phred_quality + Seq('TTCTGGCGTG', SingleLetterAlphabet()) + >>> print sub_rec.letter_annotations["phred_quality"] + [26, 26, 26, 26, 26, 26, 24, 26, 22, 26] + >>> print sub_rec.format("fastq") + @EAS54_6_R1_2_1_443_348 + TTCTGGCGTG + + + ;;;;;;9;7; + + +If you wanted to, you could read in this FASTQ file, and save it as a QUAL file: + + >>> from Bio import SeqIO + >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq") + >>> out_handle = open("Quality/temp.qual", "w") + >>> SeqIO.write(record_iterator, out_handle, "qual") + 3 + >>> out_handle.close() + +You can of course read in a QUAL file, such as the one we just created: + + >>> from Bio import SeqIO + >>> for record in SeqIO.parse(open("Quality/temp.qual"), "qual") : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 ????????????????????????? + EAS54_6_R1_2_1_540_792 ????????????????????????? + EAS54_6_R1_2_1_443_348 ????????????????????????? + +Notice that QUAL files don't have a proper sequence present! But the quality +information is there: + + >>> print record + ID: EAS54_6_R1_2_1_443_348 + Name: EAS54_6_R1_2_1_443_348 + Description: EAS54_6_R1_2_1_443_348 + Number of features: 0 + Per letter annotation for: phred_quality + UnknownSeq(25, alphabet = SingleLetterAlphabet(), character = '?') + >>> print record.letter_annotations["phred_quality"] + [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18] + +Just to keep things tidy, if you are following this example yourself, you can +delete this temporary file now: + + >>> import os + >>> os.remove("Quality/temp.qual") + +Sometimes you won't have a FASTQ file, but rather just a pair of FASTA and QUAL +files. Because the Bio.SeqIO system is designed for reading single files, you +would have to read the two in separately and then combine the data. 
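+
+Done by hand, that combining step might look like this (a sketch which
+assumes the two files list the same records in the same order):
+
+    from Bio import SeqIO
+    seq_records = list(SeqIO.parse(open("Quality/example.fasta"), "fasta"))
+    qual_records = list(SeqIO.parse(open("Quality/example.qual"), "qual"))
+    for seq_rec, qual_rec in zip(seq_records, qual_records) :
+        assert seq_rec.id == qual_rec.id       #assumed matching order
+        seq_rec.letter_annotations["phred_quality"] = \
+            qual_rec.letter_annotations["phred_quality"]
+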
However, +since this is such a common thing to want to do, there is a helper iterator +defined in this module that does this for you - PairedFastaQualIterator. + +Alternatively, if you have enough RAM to hold all the records in memory at once, +then a simple dictionary approach would work: + + >>> from Bio import SeqIO + >>> reads = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fasta"), "fasta")) + >>> for rec in SeqIO.parse(open("Quality/example.qual"), "qual") : + ... reads[rec.id].letter_annotations["phred_quality"]=rec.letter_annotations["phred_quality"] + +You can then access any record by its key, and get both the sequence and the +quality scores. + + >>> print reads["EAS54_6_R1_2_1_540_792"].format("fastq") + @EAS54_6_R1_2_1_540_792 + TTGGCAGGCCAAGGCCGATGGATCA + + + ;;;;;;;;;;;7;;;;;-;;;3;83 + + +It is important that you explicitly tell Bio.SeqIO which FASTQ variant you are +using ("fastq" for the Sanger standard using PHRED values, or "fastq-solexa" +for the Solexa/Illumina variant), as this cannot be detected reliably +automatically. +""" +__docformat__ = "epytext en" #Don't just use plain text in epydoc API pages! + +#See also http://blog.malde.org/index.php/2008/09/09/the-fastq-file-format-for-sequences/ + +from Bio.Alphabet import single_letter_alphabet +from Bio.Seq import Seq, UnknownSeq +from Bio.SeqRecord import SeqRecord +from Interfaces import SequentialSequenceWriter +from math import log + +# define score offsets. See discussion for differences between Sanger and +# Solexa offsets. +SANGER_SCORE_OFFSET = 33 +SOLEXA_SCORE_OFFSET = 64 + +def solexa_quality_from_phred(phred_quality) : + """Covert a PHRED quality (range 0 to about 90) to a Solexa quality. + + This will return a floating point number, it is up to you to round this to + the nearest integer if appropriate. e.g. + + >>> print "%0.2f" % round(solexa_quality_from_phred(80),2) + 80.00 + >>> print "%0.2f" % round(solexa_quality_from_phred(50),2) + 50.00 + >>> print "%0.2f" % round(solexa_quality_from_phred(20),2) + 19.96 + >>> print "%0.2f" % round(solexa_quality_from_phred(10),2) + 9.54 + >>> print "%0.2f" % round(solexa_quality_from_phred(1),2) + -5.87 + """ + return 10*log(10**(phred_quality/10.0) - 1, 10) + +def phred_quality_from_solexa(solexa_quality) : + """Convert a Solexa quality (which can be negative) to a PHRED quality. + + This will return a floating point number, it is up to you to round this to + the nearest integer if appropriate. e.g. + + >>> print "%0.2f" % round(phred_quality_from_solexa(80),2) + 80.00 + >>> print "%0.2f" % round(phred_quality_from_solexa(20),2) + 20.04 + >>> print "%0.2f" % round(phred_quality_from_solexa(10),2) + 10.41 + >>> print "%0.2f" % round(phred_quality_from_solexa(0),2) + 3.01 + >>> print "%0.2f" % round(phred_quality_from_solexa(-10),2) + 0.41 + """ + return 10*log(10**(solexa_quality/10.0) + 1, 10) + +def _get_phred_quality(record) : + """Extract PHRED qualities from a SeqRecord's letter_annotations (PRIVATE). + + If there are no PHRED qualities, but there are Solexa qualities, those are + used instead after conversion. + """ + try : + return record.letter_annotations["phred_quality"] + except KeyError : + pass + try : + return [phred_quality_from_solexa(q) for \ + q in record.letter_annotations["solexa_quality"]] + except KeyError : + raise ValueError("No suitable quality scores found in letter_annotations " + "of SeqRecord (id=%s)." % record.id) + +def _get_solexa_quality(record) : + """Extract Solexa qualities from a SeqRecord's letter_annotations (PRIVATE). 
+ + If there are no Solexa qualities, but there are PHRED qualities, those are + used instead after conversion. + """ + try : + return record.letter_annotations["solexa_quality"] + except KeyError : + pass + try : + return [solexa_quality_from_phred(q) for \ + q in record.letter_annotations["phred_quality"]] + except KeyError : + raise ValueError("No suitable quality scores found in letter_annotation " + "of SeqRecord (id=%s)." % record.id) + + +#TODO - Default to nucleotide or even DNA? +def FastqGeneralIterator(handle) : + """Iterate over Fastq records as string tuples (not as SeqRecord objects). + + This code does not try to interpret the quality string numerically. It + just returns tuples of the title, sequence and quality as strings. For + the sequence and quality, any whitespace (such as new lines) is removed. + + Our SeqRecord based FASTQ iterators call this function internally, and then + turn the strings into a SeqRecord objects, mapping the quality string into + a list of numerical scores. If you want to do a custom quality mapping, + then you might consider calling this function directly. + + For parsing FASTQ files, the title string from the "@" line at the start + of each record can optionally be omitted on the "+" lines. If it is + repeated, it must be identical. + + The sequence string and the quality string can optionally be split over + multiple lines, although several sources discourage this. In comparison, + for the FASTA file format line breaks between 60 and 80 characters are + the norm. + + WARNING - Because the "@" character can appear in the quality string, + this can cause problems as this is also the marker for the start of + a new sequence. In fact, the "+" sign can also appear as well. Some + sources recommended having no line breaks in the quality to avoid this, + but even that is not enough, consider this example:: + + @071113_EAS56_0053:1:1:998:236 + TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA + +071113_EAS56_0053:1:1:998:236 + IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III + @071113_EAS56_0053:1:1:182:712 + ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG + + + @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+ + @071113_EAS56_0053:1:1:153:10 + TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT + + + IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6 + @071113_EAS56_0053:1:3:990:501 + TGGGAGGTTTTATGTGGA + AAGCAGCAATGTACAAGA + + + IIIIIII.IIIIII1@44 + @-7.%<&+/$/%4(++(% + + This is four PHRED encoded FASTQ entries originally from an NCBI source + (given the read length of 36, these are probably Solexa Illumna reads where + the quality has been mapped onto the PHRED values). + + This example has been edited to illustrate some of the nasty things allowed + in the FASTQ format. Firstly, on the "+" lines most but not all of the + (redundant) identifiers are ommited. In real files it is likely that all or + none of these extra identifiers will be present. + + Secondly, while the first three sequences have been shown without line + breaks, the last has been split over multiple lines. In real files any line + breaks are likely to be consistent. + + Thirdly, some of the quality string lines start with an "@" character. For + the second record this is unavoidable. However for the fourth sequence this + only happens because its quality string is split over two lines. A naive + parser could wrongly treat any line starting with an "@" as the beginning of + a new sequence! This code copes with this possible ambiguity by keeping track + of the length of the sequence which gives the expected length of the quality + string. 
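+
+    As a sketch of the custom quality mapping mentioned above (here simply
+    applying the Sanger PHRED offset of 33 by hand):
+
+        from Bio.SeqIO.QualityIO import FastqGeneralIterator
+        handle = open("Quality/example.fastq")
+        for title, seq, qual in FastqGeneralIterator(handle) :
+            scores = [ord(letter) - 33 for letter in qual]
+            print title, min(scores), max(scores)
+        handle.close()
+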
+ + Using this tricky example file as input, this short bit of code demonstrates + what this parsing function would return: + + >>> handle = open("Quality/tricky.fastq", "rU") + >>> for (title, sequence, quality) in FastqGeneralIterator(handle) : + ... print title + ... print sequence, quality + 071113_EAS56_0053:1:1:998:236 + TTTCTTGCCCCCATAGACTGAGACCTTCCCTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIICII+III + 071113_EAS56_0053:1:1:182:712 + ACCCAGCTAATTTTTGTATTTTTGTTAGAGACAGTG @IIIIIIIIIIIIIIICDIIIII<%<6&-*).(*%+ + 071113_EAS56_0053:1:1:153:10 + TGTTCTGAAGGAAGGTGTGCGTGCGTGTGTGTGTGT IIIIIIIIIIIICIIGIIIII>IAIIIE65I=II:6 + 071113_EAS56_0053:1:3:990:501 + TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA IIIIIII.IIIIII1@44@-7.%<&+/$/%4(++(% + >>> handle.close() + + Finally we note that some sources state that the quality string should + start with "!" (which using the PHRED mapping means the first letter always + has a quality score of zero). This rather restrictive rule is not widely + observed, so is therefore ignored here. One plus point about this "!" rule + is that (provided there are no line breaks in the quality sequence) it + would prevent the above problem with the "@" character. + """ + #Skip any text before the first record (e.g. blank lines, comments?) + while True : + line = handle.readline() + if line == "" : return #Premature end of file, or just empty? + if line[0] == "@" : + break + + while True : + if line[0]!="@" : + raise ValueError("Records in Fastq files should start with '@' character") + title_line = line[1:].rstrip() + + seq_lines = [] + line = handle.readline() + while True: + if not line : + raise ValueError("End of file without quality information.") + if line[0] == "+": + #The title here is optional, but if present must match! + if line[1:].rstrip() and line[1:].rstrip() != title_line : + raise ValueError("Sequence and quality captions differ.") + break + seq_lines.extend(line.split()) #removes any whitespace + line = handle.readline() + + seq_string = "".join(seq_lines) + del seq_lines + + quality_lines = [] + line = handle.readline() + while True: + if not line : break + if line[0] == "@": + #This COULD be the start of a new sequence. However, it MAY just + #be a line of quality data which starts with a "@" character. We + #should be able to check this by looking at the sequence length + #and the amount of quality data found so far. + if len("".join(quality_lines)) >= len(seq_string) : + #We expect it to be equal if this is the start of a new record. + #If the quality data is longer, we'll raise an error below. + break + #Continue - its just some (more) sequence data. + + quality_lines.extend(line.split()) #removes any whitespace + line = handle.readline() + + quality_string = "".join(quality_lines) + del quality_lines + + if len(seq_string) != len(quality_string) : + raise ValueError("Lengths of sequence and quality values differs " + " for %s (%i and %i)." \ + % (title_line, len(seq_string), len(quality_string))) + + #Return the record and then continue... + yield (title_line, seq_string, quality_string) + if not line : return #StopIteration at end of file + assert False, "Should not reach this line" + +#This is a generator function! +def FastqPhredIterator(handle, alphabet = single_letter_alphabet, title2ids = None) : + """Generator function to iterate over FASTQ records (as SeqRecord objects). 
+ + - handle - input file + - alphabet - optional alphabet + - title2ids - A function that, when given the title line from the FASTQ + file (without the beginning >), will return the id, name and + description (in that order) for the record as a tuple of + strings. If this is not given, then the entire title line + will be used as the description, and the first word as the + id and name. + + Note that use of title2ids matches that of Bio.SeqIO.FastaIO. + + For each sequence in a (Sanger style) FASTQ file there is a matching string + encoding the PHRED qualities (integers between 0 and about 90) using ASCII + values with an offset of 33. + + For example, consider a file containing three short reads:: + + @EAS54_6_R1_2_1_413_324 + CCCTTCTTGTCTTCAGCGTTTCTCC + + + ;;3;;;;;;;;;;;;7;;;;;;;88 + @EAS54_6_R1_2_1_540_792 + TTGGCAGGCCAAGGCCGATGGATCA + + + ;;;;;;;;;;;7;;;;;-;;;3;83 + @EAS54_6_R1_2_1_443_348 + GTTGCTTCTGGCGTGGGTGGGGGGG + + + ;;;;;;;;;;;9;7;;.7;393333 + + For each sequence (e.g. "CCCTTCTTGTCTTCAGCGTTTCTCC") there is a matching + string encoding the PHRED qualities using a ASCI values with an offset of + 33 (e.g. ";;3;;;;;;;;;;;;7;;;;;;;88"). + + Using this module directly you might run: + + >>> handle = open("Quality/example.fastq", "rU") + >>> for record in FastqPhredIterator(handle) : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC + EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA + EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG + >>> handle.close() + + Typically however, you would call this via Bio.SeqIO instead with "fastq" as + the format: + + >>> from Bio import SeqIO + >>> handle = open("Quality/example.fastq", "rU") + >>> for record in SeqIO.parse(handle, "fastq") : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC + EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA + EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG + >>> handle.close() + + If you want to look at the qualities, they are record in each record's + per-letter-annotation dictionary as a simple list of integers: + + >>> print record.letter_annotations["phred_quality"] + [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18] + """ + for title_line, seq_string, quality_string in FastqGeneralIterator(handle) : + if title2ids : + id, name, descr = title2ids(title_line) + else : + descr = title_line + id = descr.split()[0] + name = id + record = SeqRecord(Seq(seq_string, alphabet), + id=id, name=name, description=descr) + + assert SANGER_SCORE_OFFSET == ord("!") + #According to BioPerl documentation at least, the first character should + #be an "!" (and therefore quality zero). This seems crazy - what if the + #sequence has been trimmed to remove any poor quality sequence? In any + #case real examples from the NCBI don't follow this practice, so we + #won't enforce it here. + #e.g. ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/SRA000271/fastq/200x36x36-071113_EAS56_0053-s_1_1.fastq.gz + # + #if quality_string[0] != "!" : + # raise ValueError("The quality string should always start with a ! 
character.") + qualities = [ord(letter)-SANGER_SCORE_OFFSET for letter in quality_string] + if qualities : + if min(qualities) < 0 or max(qualities) > 90 : + raise ValueError("Quality score outside 0 to 90 found - these are perhaps " + "in a Solexa/Illumina format, not the Sanger FASTQ format " + "which uses PHRED scores.") + record.letter_annotations["phred_quality"] = qualities + yield record + +#This is a generator function! +def FastqSolexaIterator(handle, alphabet = single_letter_alphabet, title2ids = None) : + """Parsing the Solexa/Illumina FASTQ like files (which differ in the quality mapping). + + The optional arguments are the same as those for the FastqPhredIterator. + + For each sequence in Solexa/Illumina FASTQ files there is a matching string + encoding the Solexa integer qualities using ASCII values with an offset + of 64. Solexa scores are scaled differently to PHRED scores, and Biopython + will NOT perform any automatic conversion when loading. + + For example, consider a file containing these five records:: + + @SLXA-B3_649_FC8437_R1_1_1_610_79 + GATGTGCAATACCTTTGTAGAGGAA + +SLXA-B3_649_FC8437_R1_1_1_610_79 + YYYYYYYYYYYYYYYYYYWYWYYSU + @SLXA-B3_649_FC8437_R1_1_1_397_389 + GGTTTGAGAAAGAGAAATGAGATAA + +SLXA-B3_649_FC8437_R1_1_1_397_389 + YYYYYYYYYWYYYYWWYYYWYWYWW + @SLXA-B3_649_FC8437_R1_1_1_850_123 + GAGGGTGTTGATCATGATGATGGCG + +SLXA-B3_649_FC8437_R1_1_1_850_123 + YYYYYYYYYYYYYWYYWYYSYYYSY + @SLXA-B3_649_FC8437_R1_1_1_362_549 + GGAAACAAAGTTTTTCTCAACATAG + +SLXA-B3_649_FC8437_R1_1_1_362_549 + YYYYYYYYYYYYYYYYYYWWWWYWY + @SLXA-B3_649_FC8437_R1_1_1_183_714 + GTATTATTTAATGGCATACACTCAA + +SLXA-B3_649_FC8437_R1_1_1_183_714 + YYYYYYYYYYWYYYYWYWWUWWWQQ + + Using this module directly you might run: + + >>> handle = open("Quality/solexa_example.fastq", "rU") + >>> for record in FastqSolexaIterator(handle) : + ... print record.id, record.seq + SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA + SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA + SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG + SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG + SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA + >>> handle.close() + + Typically however, you would call this via Bio.SeqIO instead with "fastq" as + the format: + + >>> from Bio import SeqIO + >>> handle = open("Quality/solexa_example.fastq", "rU") + >>> for record in SeqIO.parse(handle, "fastq-solexa") : + ... 
print record.id, record.seq
+    SLXA-B3_649_FC8437_R1_1_1_610_79 GATGTGCAATACCTTTGTAGAGGAA
+    SLXA-B3_649_FC8437_R1_1_1_397_389 GGTTTGAGAAAGAGAAATGAGATAA
+    SLXA-B3_649_FC8437_R1_1_1_850_123 GAGGGTGTTGATCATGATGATGGCG
+    SLXA-B3_649_FC8437_R1_1_1_362_549 GGAAACAAAGTTTTTCTCAACATAG
+    SLXA-B3_649_FC8437_R1_1_1_183_714 GTATTATTTAATGGCATACACTCAA
+    >>> handle.close()
+
+    If you want to look at the qualities, they are recorded in each record's
+    per-letter-annotation dictionary as a simple list of integers:
+
+    >>> print record.letter_annotations["solexa_quality"]
+    [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 23, 25, 25, 25, 25, 23, 25, 23, 23, 21, 23, 23, 23, 17, 17]
+
+    These scores aren't very good, but they are high enough that they map
+    almost exactly onto PHRED scores:
+
+    >>> print "%0.2f" % phred_quality_from_solexa(25)
+    25.01
+
+    Let's look at another example read which is even worse, where there are
+    more noticeable differences between the Solexa and PHRED scores::
+
+     @slxa_0013_1_0001_24
+     ACAAAAATCACAAGCATTCTTATACACC
+     +slxa_0013_1_0001_24
+     ??????????????????:??<?<-6%.
+
+    This read is in the file Quality/solexa.fastq, which we can again load
+    with Bio.SeqIO (using the Bio.SeqIO.read() function since there is just
+    one record):
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Quality/solexa.fastq", "rU")
+    >>> record = SeqIO.read(handle, "fastq-solexa")
+    >>> handle.close()
+    >>> print record.id, record.seq
+    slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+    >>> print record.letter_annotations["solexa_quality"]
+    [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+    These quality scores are so low that when converted from the Solexa scheme
+    into PHRED scores they look quite different:
+
+    >>> print "%0.2f" % phred_quality_from_solexa(-1)
+    2.54
+
+    Note you can use the Bio.SeqIO.write() function or the SeqRecord's format
+    method to output the record(s):
+
+    >>> print record.format("fastq-solexa")
+    @slxa_0013_1_0001_24
+    ACAAAAATCACAAGCATTCTTATACACC
+    +
+    ??????????????????:??<?<-6%.
+    <BLANKLINE>
+
+    Note this output is slightly different from the input file as Biopython
+    has left out the optional repetition of the sequence identifier on the "+"
+    line. If you want to use PHRED scores, use "fastq" or "qual" as the
+    output format instead, and Biopython will do the conversion for you:
+
+    >>> print record.format("fastq")
+    @slxa_0013_1_0001_24
+    ACAAAAATCACAAGCATTCTTATACACC
+    +
+    $$$$$$$$$$$$$$$$$$"$$"$"!!!!
+    <BLANKLINE>
+
+    >>> print record.format("qual")
+    >slxa_0013_1_0001_24
+    3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 3 1 0 0 0 0
+    <BLANKLINE>
+    """
+    for title_line, seq_string, quality_string in FastqGeneralIterator(handle) :
+        if title2ids :
+            id, name, descr = title2ids(title_line)
+        else :
+            descr = title_line
+            id = descr.split()[0]
+            name = id
+        record = SeqRecord(Seq(seq_string, alphabet),
+                           id=id, name=name, description=descr)
+        qualities = [ord(letter)-SOLEXA_SCORE_OFFSET for letter in quality_string]
+        #DO NOT convert these into PHRED qualities automatically!
+        record.letter_annotations["solexa_quality"] = qualities
+        yield record
+
+def QualPhredIterator(handle, alphabet = single_letter_alphabet, title2ids = None) :
+    """For QUAL files which include PHRED quality scores, but no sequence.
+ + For example, consider this short QUAL file:: + + >EAS54_6_R1_2_1_413_324 + 26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 + 26 26 26 23 23 + >EAS54_6_R1_2_1_540_792 + 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26 + 26 18 26 23 18 + >EAS54_6_R1_2_1_443_348 + 26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18 + 24 18 18 18 18 + + Using this module directly you might run: + + >>> handle = open("Quality/example.qual", "rU") + >>> for record in QualPhredIterator(handle) : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 ????????????????????????? + EAS54_6_R1_2_1_540_792 ????????????????????????? + EAS54_6_R1_2_1_443_348 ????????????????????????? + >>> handle.close() + + Typically however, you would call this via Bio.SeqIO instead with "qual" + as the format: + + >>> from Bio import SeqIO + >>> handle = open("Quality/example.qual", "rU") + >>> for record in SeqIO.parse(handle, "qual") : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 ????????????????????????? + EAS54_6_R1_2_1_540_792 ????????????????????????? + EAS54_6_R1_2_1_443_348 ????????????????????????? + >>> handle.close() + + Becase QUAL files don't contain the sequence string itself, the seq + property is set to an UnknownSeq object. As no alphabet was given, this + has defaulted to a generic single letter alphabet and the character "?" + used. + + By specifying a nucleotide alphabet, "N" is used instead: + + >>> from Bio import SeqIO + >>> from Bio.Alphabet import generic_dna + >>> handle = open("Quality/example.qual", "rU") + >>> for record in SeqIO.parse(handle, "qual", alphabet=generic_dna) : + ... print record.id, record.seq + EAS54_6_R1_2_1_413_324 NNNNNNNNNNNNNNNNNNNNNNNNN + EAS54_6_R1_2_1_540_792 NNNNNNNNNNNNNNNNNNNNNNNNN + EAS54_6_R1_2_1_443_348 NNNNNNNNNNNNNNNNNNNNNNNNN + >>> handle.close() + + However, the quality scores themselves are available as a list of integers + in each record's per-letter-annotation: + + >>> print record.letter_annotations["phred_quality"] + [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18] + + You can still slice one of these SeqRecord objects with an UnknownSeq: + + >>> sub_record = record[5:10] + >>> print sub_record.id, sub_record.letter_annotations["phred_quality"] + EAS54_6_R1_2_1_443_348 [26, 26, 26, 26, 26] + """ + #Skip any text before the first record (e.g. blank lines, comments) + while True : + line = handle.readline() + if line == "" : return #Premature end of file, or just empty? + if line[0] == ">" : + break + + while True : + if line[0]!=">" : + raise ValueError("Records in Fasta files should start with '>' character") + if title2ids : + id, name, descr = title2ids(line[1:].rstrip()) + else : + descr = line[1:].rstrip() + id = descr.split()[0] + name = id + + qualities = [] + line = handle.readline() + while True: + if not line : break + if line[0] == ">": break + qualities.extend([int(word) for word in line.split()]) + line = handle.readline() + + if qualities : + if min(qualities) < 0 or max(qualities) > 90 : + raise ValueError(("Quality score range for %s is %i to %i, outside the " \ + +"expected 0 to 90. Perhaps these are Solexa/Illumina " \ + +"scores, and not PHRED scores?") \ + % (id, min(qualities), max(qualities))) + + #Return the record and then continue... 
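+        #An UnknownSeq is used here because a QUAL file gives no sequence
+        #letters, just their qualities - it stores only the length, and
+        #displays as "?" (or "N" with a nucleotide alphabet) when printed.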
+ record = SeqRecord(UnknownSeq(len(qualities), alphabet), + id = id, name = name, description = descr) + record.letter_annotations["phred_quality"] = qualities + yield record + + if not line : return #StopIteration + assert False, "Should not reach this line" + +class FastqPhredWriter(SequentialSequenceWriter): + """Class to write FASTQ format files (using PHRED quality scores). + + Although you can use this class directly, you are strongly encouraged + to use the Bio.SeqIO.write() function instead. For example, this code + reads in a FASTQ (PHRED) file and re-saves it as another FASTQ (PHRED) + file: + + >>> from Bio import SeqIO + >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq") + >>> out_handle = open("Quality/temp.fastq", "w") + >>> SeqIO.write(record_iterator, out_handle, "fastq") + 3 + >>> out_handle.close() + + You might want to do this if the original file included extra line breaks, + which while valid may not be supported by all tools. The output file from + Biopython will have each sequence on a single line, and each quality + string on a single line (which is considered desirable for maximum + compatibility). + + In this next example, a Solexa FASTQ file is converted into a standard + Sanger style FASTQ file using PHRED qualities: + + >>> from Bio import SeqIO + >>> record_iterator = SeqIO.parse(open("Quality/solexa.fastq"), "fastq-solexa") + >>> out_handle = open("Quality/temp.fastq", "w") + >>> SeqIO.write(record_iterator, out_handle, "fastq") + 1 + >>> out_handle.close() + + This code is also called if you use the .format("fastq") method of a + SeqRecord. + + P.S. To avoid cluttering up your working directory, you can delete this + temporary file now: + + >>> import os + >>> os.remove("Quality/temp.fastq") + + """ + def write_record(self, record): + """Write a single FASTQ record to the file.""" + assert self._header_written + assert not self._footer_written + self._record_written = True + + #TODO - Is an empty sequence allowed in FASTQ format? + assert SANGER_SCORE_OFFSET == ord("!") + #This rounds to the nearest integer: + qualities = "".join([chr(int(round(q+SANGER_SCORE_OFFSET,0))) for q \ + in _get_phred_quality(record)]) + if record.seq is None: + raise ValueError("No sequence for record %s" % record.id) + if len(qualities) != len(record) : + raise ValueError("Record %s has sequence length %i but %i quality scores" \ + % (record.id, len(record), len(qualities))) + + title = self.clean(record.id) #TODO - add the description too? cf Fasta output + self.handle.write("@%s\n%s\n+\n%s\n" % (title, record.seq, qualities)) + +class QualPhredWriter(SequentialSequenceWriter): + """Class to write QUAL format files (using PHRED quality scores). + + Although you can use this class directly, you are strongly encouraged + to use the Bio.SeqIO.write() function instead. For example, this code + reads in a FASTQ file and saves the quality scores into a QUAL file: + + >>> from Bio import SeqIO + >>> record_iterator = SeqIO.parse(open("Quality/example.fastq"), "fastq") + >>> out_handle = open("Quality/temp.qual", "w") + >>> SeqIO.write(record_iterator, out_handle, "qual") + 3 + >>> out_handle.close() + + This code is also called if you use the .format("qual") method of a + SeqRecord. + + P.S. Don't forget to clean up the temp file if you don't need it anymore: + + >>> import os + >>> os.remove("Quality/temp.qual") + """ + def __init__(self, handle, wrap=60, record2title=None): + """Create a QUAL writer. + + Arguments: + - handle - Handle to an output file, e.g. 
+           by open(filename, "w")
+         - wrap - Optional line length used to wrap sequence lines.
+           Defaults to wrapping the sequence at 60 characters.
+           Use zero (or None) for no wrapping, giving a single
+           long line for the sequence.
+         - record2title - Optional function to return the text to be
+           used for the title line of each record. By default
+           a combination of the record.id and record.description
+           is used. If the record.description starts with the
+           record.id, then just the record.description is used.
+
+        The record2title argument is present for consistency with the
+        Bio.SeqIO.FastaIO writer class.
+        """
+        SequentialSequenceWriter.__init__(self, handle)
+        #self.handle = handle
+        self.wrap = None
+        if wrap :
+            if wrap < 1 :
+                raise ValueError("wrap should be None, zero, or a positive line length")
+            self.wrap = wrap
+        self.record2title = record2title
+
+    def write_record(self, record):
+        """Write a single QUAL record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        if self.record2title :
+            title = self.clean(self.record2title(record))
+        else :
+            id = self.clean(record.id)
+            description = self.clean(record.description)
+
+            #if description[:len(id)]==id :
+            if description and description.split(None,1)[0]==id :
+                #The description includes the id at the start
+                title = description
+            else :
+                title = "%s %s" % (id, description)
+
+        assert "\n" not in title
+        assert "\r" not in title
+        self.handle.write(">%s\n" % title)
+
+        #This rounds to the nearest integer.
+        #TODO - can we put a float in a qual file?
+        qualities = [("%i" % round(q,0)) for q in _get_phred_quality(record)]
+
+        if self.wrap :
+            while qualities :
+                line = qualities.pop(0)
+                while qualities \
+                and len(line) + 1 + len(qualities[0]) < self.wrap :
+                    line += " " + qualities.pop(0)
+                self.handle.write(line + "\n")
+        else :
+            data = " ".join(qualities)
+            self.handle.write(data + "\n")
+
+class FastqSolexaWriter(SequentialSequenceWriter):
+    """Class to write FASTQ format files (using Solexa quality scores).
+
+    Although you can use this class directly, you are strongly encouraged
+    to use the Bio.SeqIO.write() function instead. For example, this code
+    reads in a FASTQ file and re-saves it as another FASTQ file:
+
+    >>> from Bio import SeqIO
+    >>> record_iterator = SeqIO.parse(open("Quality/solexa.fastq"), "fastq-solexa")
+    >>> out_handle = open("Quality/temp.fastq", "w")
+    >>> SeqIO.write(record_iterator, out_handle, "fastq-solexa")
+    1
+    >>> out_handle.close()
+
+    You might want to do this if the original file included extra line
+    breaks, which (while valid) may not be supported by all tools. The
+    output file from Biopython will have each sequence on a single line, and
+    each quality string on a single line (which is considered desirable for
+    maximum compatibility).
+
+    This code is also called if you use the .format("fastq-solexa") method of
+    a SeqRecord.
+
+    P.S. Don't forget to delete the temp file if you don't need it anymore:
+
+    >>> import os
+    >>> os.remove("Quality/temp.fastq")
+    """
+    def write_record(self, record):
+        """Write a single FASTQ record to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        #TODO - Is an empty sequence allowed in FASTQ format?
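+        #Solexa-style FASTQ encodes each score as one ASCII character using
+        #an offset of 64 (the Sanger standard uses 33), so for example a
+        #Solexa quality of 0 is written as chr(64), i.e. "@".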
+        qualities = "".join([chr(int(round(q+SOLEXA_SCORE_OFFSET,0))) for q \
+                             in _get_solexa_quality(record)])
+        if record.seq is None:
+            raise ValueError("No sequence for record %s" % record.id)
+        if len(qualities) != len(record) :
+            raise ValueError("Record %s has sequence length %i but %i quality scores" \
+                             % (record.id, len(record), len(qualities)))
+
+        title = self.clean(record.id) #TODO - add the description too? cf Fasta output
+        self.handle.write("@%s\n%s\n+\n%s\n" % (title, record.seq, qualities))
+
+def PairedFastaQualIterator(fasta_handle, qual_handle, alphabet = single_letter_alphabet, title2ids = None) :
+    """Iterate over matched FASTA and QUAL files as SeqRecord objects.
+
+    For example, consider this short QUAL file::
+
+        >EAS54_6_R1_2_1_413_324
+        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
+        26 26 26 23 23
+        >EAS54_6_R1_2_1_540_792
+        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
+        26 18 26 23 18
+        >EAS54_6_R1_2_1_443_348
+        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
+        24 18 18 18 18
+
+    And a matching FASTA file::
+
+        >EAS54_6_R1_2_1_413_324
+        CCCTTCTTGTCTTCAGCGTTTCTCC
+        >EAS54_6_R1_2_1_540_792
+        TTGGCAGGCCAAGGCCGATGGATCA
+        >EAS54_6_R1_2_1_443_348
+        GTTGCTTCTGGCGTGGGTGGGGGGG
+
+    You can parse these separately using Bio.SeqIO with the "qual" and
+    "fasta" formats, but then you'll get a group of SeqRecord objects with
+    no sequence, and a matching group with the sequence but not the
+    qualities. Because it only deals with one input file handle, Bio.SeqIO
+    can't be used to read the two files together - but this function can!
+    For example,
+
+    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
+    ...                                    open("Quality/example.qual", "rU"))
+    >>> for record in rec_iter :
+    ...     print record.id, record.seq
+    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
+    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
+    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG
+
+    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
+    they are in each record's per-letter-annotation dictionary as a simple
+    list of integers:
+
+    >>> print record.letter_annotations["phred_quality"]
+    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]
+
+    If you have access to data as a FASTQ format file, using that directly
+    would be simpler and more straightforward. Note that you can easily use
+    this function to convert paired FASTA and QUAL files into FASTQ files:
+
+    >>> from Bio import SeqIO
+    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
+    ...                                    open("Quality/example.qual", "rU"))
+    >>> out_handle = open("Quality/temp.fastq", "w")
+    >>> SeqIO.write(rec_iter, out_handle, "fastq")
+    3
+    >>> out_handle.close()
+
+    And don't forget to clean up the temp file if you don't need it anymore:
+
+    >>> import os
+    >>> os.remove("Quality/temp.fastq")
+    """
+    from Bio.SeqIO.FastaIO import FastaIterator
+    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet, \
+                               title2ids=title2ids)
+    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet, \
+                                  title2ids=title2ids)
+
+    #Using zip(...) would create a list loading everything into memory!
+    #It would also not catch any extra records found in only one file.
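+    #Instead, advance both iterators in lockstep by hand; this keeps memory
+    #use flat and lets us report exactly which file ran out of records first.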
+    while True :
+        try :
+            f_rec = fasta_iter.next()
+        except StopIteration :
+            f_rec = None
+        try :
+            q_rec = qual_iter.next()
+        except StopIteration :
+            q_rec = None
+        if f_rec is None and q_rec is None :
+            #End of both files
+            break
+        if f_rec is None :
+            raise ValueError("FASTA file has more entries than the QUAL file.")
+        if q_rec is None :
+            raise ValueError("QUAL file has more entries than the FASTA file.")
+        if f_rec.id != q_rec.id :
+            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)." \
+                             % (f_rec.id, q_rec.id))
+        if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]) :
+            raise ValueError("Sequence length and number of quality scores disagree for %s" \
+                             % f_rec.id)
+        #Merge the data....
+        f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations["phred_quality"]
+        yield f_rec
+    #Done
+
+
+def _test():
+    """Run the Bio.SeqIO module's doctests.
+
+    This will try and locate the unit tests directory, and run the doctests
+    from there in order that the relative paths used in the examples work.
+    """
+    import doctest
+    import os
+    if os.path.isdir(os.path.join("..","..","Tests")) :
+        print "Running doctests..."
+        cur_dir = os.path.abspath(os.curdir)
+        os.chdir(os.path.join("..","..","Tests"))
+        assert os.path.isfile("Quality/example.fastq")
+        assert os.path.isfile("Quality/example.fasta")
+        assert os.path.isfile("Quality/example.qual")
+        assert os.path.isfile("Quality/tricky.fastq")
+        assert os.path.isfile("Quality/solexa.fastq")
+        doctest.testmod()
+        os.chdir(cur_dir)
+        del cur_dir
+        print "Done"
+
+if __name__ == "__main__" :
+    _test()
+
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/SwissIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/SwissIO.py
new file mode 100644
index 0000000..a55c0fe
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/SwissIO.py
@@ -0,0 +1,65 @@
+# Copyright 2006 by Peter Cock. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+See also the Bio.SwissProt module which offers more than just accessing
+the sequences as SeqRecord objects."""
+
+from Bio.SwissProt import SProt
+import cStringIO
+
+#This is a generator function!
+def SwissIterator(handle) :
+    """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
+
+    Every section from the ID line to the terminating // becomes
+    a single SeqRecord with associated annotation and features.
+
+    This parser is for the flat file "swiss" format as used by:
+     * Swiss-Prot aka SwissProt
+     * TrEMBL
+     * UniProtKB aka UniProt Knowledgebase
+
+    It does NOT read their new XML file format.
+    http://www.expasy.org/sprot/
+
+    For consistency with BioPerl and EMBOSS we call this the "swiss"
+    format.
+    """
+    parser = SProt.SequenceParser()
+    lines = []
+    for line in handle:
+        lines.append(line)
+        if line[:2]=='//':
+            handle = cStringIO.StringIO("".join(lines))
+            record = parser.parse(handle)
+            lines = []
+            yield record
+    #If there are more lines, it could only be a partial record.
+    #Should we try and parse them anyway?
+
+
+if __name__ == "__main__" :
+    print "Quick self test..."
+
+    example_filename = "../../Tests/SwissProt/sp008"
+
+    import os
+    if not os.path.isfile(example_filename):
+        print "Missing test file %s" % example_filename
+    else :
+        #Try parsing it!
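+        #Walk through the sample file and print a few fields from each
+        #record as a quick sanity check.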
+        handle = open(example_filename)
+        records = SwissIterator(handle)
+        for record in records:
+            print record.name
+            print record.id
+            print record.annotations['keywords']
+            print repr(record.annotations['organism'])
+            print record.seq.tostring()[:20] + "..."
+        handle.close()
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/TabIO.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/TabIO.py
new file mode 100644
index 0000000..3613863
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/TabIO.py
@@ -0,0 +1,109 @@
+# Copyright 2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SeqIO support for the "tab" (simple tab separated) file format.
+
+You are expected to use this module via the Bio.SeqIO functions.
+
+The "tab" format is an ad-hoc plain text file format where each sequence is
+on one (long) line. Each line contains the identifier/description, followed
+by a tab, followed by the sequence. For example, consider the following
+short FASTA format file::
+
+    >ID123456 possible binding site?
+    CATCNAGATGACACTACGACTACGACTCAGACTAC
+    >ID123457 random sequence
+    ACACTACGACTACGACTCAGACTACAAN
+
+Apart from the descriptions, this can be represented in the simple two column
+tab separated format as follows::
+
+    ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
+    ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
+
+When reading this file, "ID123456" or "ID123457" will be taken as the record's
+.id and .name property. There is no other information to record.
+
+Similarly, when writing to this format, Biopython will ONLY record the record's
+.id and .seq (and not the description or any other information) as in the
+example above.
+"""
+
+from Bio.Alphabet import single_letter_alphabet
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Interfaces import SequentialSequenceWriter
+
+#This is a generator function!
+def TabIterator(handle, alphabet = single_letter_alphabet) :
+    """Iterates over tab separated lines (as SeqRecord objects).
+
+    Each line of the file should contain one tab only, dividing the line
+    into an identifier and the full sequence.
+
+     - handle - input file
+     - alphabet - optional alphabet
+
+    The first field is taken as the record's .id and .name (regardless of
+    any spaces within the text) and the second field is the sequence.
+
+    Any blank lines are ignored.
+    """
+    for line in handle :
+        try :
+            title, seq = line.split("\t") #will fail if more than one tab!
+        except ValueError :
+            if line.strip() == "" :
+                #It's a blank line, ignore it
+                continue
+            raise ValueError("Each line should have one tab separating the" + \
+                             " title and sequence, this line has %i tabs: %s" \
+                             % (line.count("\t"), repr(line)))
+        title = title.strip()
+        seq = seq.strip() #removes the trailing new line
+        yield SeqRecord(Seq(seq, alphabet), id = title, name = title)
+
+class TabWriter(SequentialSequenceWriter):
+    """Class to write simple tab separated format files.
+
+    Each line consists of "id(tab)sequence" only.
+
+    Any description, name or other annotation is not recorded.
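+
+    For example, a minimal sketch writing two records to an in-memory
+    handle, giving one "id(tab)sequence" line per record::
+
+        from StringIO import StringIO
+        from Bio.Seq import Seq
+        from Bio.SeqRecord import SeqRecord
+        handle = StringIO()
+        records = [SeqRecord(Seq("ACGT"), id="Alpha"),
+                   SeqRecord(Seq("GGGG"), id="Beta")]
+        TabWriter(handle).write_file(records)
+        print handle.getvalue()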
+    """
+    def write_record(self, record):
+        """Write a single tab line to the file."""
+        assert self._header_written
+        assert not self._footer_written
+        self._record_written = True
+
+        title = self.clean(record.id)
+        seq = self._get_seq_string(record) #Catches sequence being None
+        assert "\t" not in title
+        assert "\n" not in title
+        assert "\r" not in title
+        assert "\t" not in seq
+        assert "\n" not in seq
+        assert "\r" not in seq
+        self.handle.write("%s\t%s\n" % (title, seq))
+
+
+if __name__ == "__main__" :
+    print "Running quick self test"
+    from StringIO import StringIO
+
+    #This example has a trailing blank line which should be ignored
+    handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n")
+    records = list(TabIterator(handle))
+    assert len(records) == 2
+
+    handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n")
+    try :
+        records = list(TabIterator(handle))
+        assert False, "Should have rejected this invalid example!"
+    except ValueError :
+        #Good!
+        pass
+
+    print "Done"
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqIO/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/__init__.py
new file mode 100644
index 0000000..e2be3ec
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqIO/__init__.py
@@ -0,0 +1,650 @@
+# Copyright 2006-2008 by Peter Cock. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+#
+#Nice link:
+# http://www.ebi.ac.uk/help/formats_frame.html
+
+"""Sequence input/output as SeqRecord objects.
+
+Bio.SeqIO is also documented at U{http://biopython.org/wiki/SeqIO} and by
+a whole chapter in our tutorial:
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
+ - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
+
+Input
+=====
+The main function is Bio.SeqIO.parse(...) which takes an input file handle,
+and format string. This returns an iterator giving SeqRecord objects:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Fasta/f002", "rU")
+    >>> for record in SeqIO.parse(handle, "fasta") :
+    ...     print record.id, len(record)
+    gi|1348912|gb|G26680|G26680 633
+    gi|1348917|gb|G26685|G26685 413
+    gi|1592936|gb|G29385|G29385 471
+    >>> handle.close()
+
+Note that the parse() function will invoke the relevant parser for the
+format with its default settings. You may want more control, in which case
+you need to create a format specific sequence iterator directly.
+
+For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
+using a sequence iterator can save you a lot of memory (RAM). There is
+less benefit for interlaced file formats (e.g. most multiple alignment file
+formats). However, an iterator only lets you access the records one by one.
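+
+For example, a memory-friendly scan that only ever holds one record at a
+time (a sketch; "large.fasta" is a hypothetical input file)::
+
+    from Bio import SeqIO
+    handle = open("large.fasta", "rU")
+    for record in SeqIO.parse(handle, "fasta") :
+        if len(record) > 1000 :
+            print record.id, len(record)
+    handle.close()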
+
+If you want random access to the records by number, turn this into a list:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Fasta/f002", "rU")
+    >>> records = list(SeqIO.parse(handle, "fasta"))
+    >>> handle.close()
+    >>> print records[1].id
+    gi|1348917|gb|G26685|G26685
+
+If you want random access to the records by a key such as the record id,
+turn the iterator into a dictionary:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Fasta/f002", "rU")
+    >>> record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
+    >>> handle.close()
+    >>> print len(record_dict["gi|1348917|gb|G26685|G26685"])
+    413
+
+If you expect your file to contain one-and-only-one record, then we provide
+the following 'helper' function which will return a single SeqRecord, or
+raise an exception if there are no records or more than one record:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Fasta/f001", "rU")
+    >>> record = SeqIO.read(handle, "fasta")
+    >>> handle.close()
+    >>> print record.id, len(record)
+    gi|3318709|pdb|1A91| 79
+
+This style is useful when you expect a single record only (and would
+consider multiple records an error). For example, when dealing with GenBank
+files for bacterial genomes or chromosomes, there is normally only a single
+record. Alternatively, use this with a handle when downloading a single
+record from the internet.
+
+However, if you just want the first record from a file containing multiple
+records, use the iterator's next() method:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Fasta/f002", "rU")
+    >>> record = SeqIO.parse(handle, "fasta").next()
+    >>> handle.close()
+    >>> print record.id, len(record)
+    gi|1348912|gb|G26680|G26680 633
+
+The above code will work as long as the file contains at least one record.
+Note that if there is more than one record, the remaining records will be
+silently ignored.
+
+Input - Alignments
+==================
+You can read in alignment files as Alignment objects using Bio.AlignIO.
+Alternatively, reading in an alignment file format via Bio.SeqIO will give
+you a SeqRecord for each row of each alignment:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Clustalw/hedgehog.aln", "rU")
+    >>> for record in SeqIO.parse(handle, "clustal") :
+    ...     print record.id, len(record)
+    gi|167877390|gb|EDS40773.1| 447
+    gi|167234445|ref|NP_001107837. 447
+    gi|74100009|gb|AAZ99217.1| 447
+    gi|13990994|dbj|BAA33523.2| 447
+    gi|56122354|gb|AAV74328.1| 447
+    >>> handle.close()
+
+Output
+======
+Use the function Bio.SeqIO.write(...), which takes a complete set of
+SeqRecord objects (either as a list, or an iterator), an output file handle
+and of course the file format::
+
+    from Bio import SeqIO
+    records = ...
+    handle = open("example.faa", "w")
+    SeqIO.write(records, handle, "fasta")
+    handle.close()
+
+In general, you are expected to call this function once (with all your
+records) and then close the file handle.
+
+Output - Advanced
+=================
+The effect of calling write() multiple times on a single file will vary
+depending on the file format, and is best avoided unless you have a strong
+reason to do so.
+
+Trying this for certain alignment formats (e.g. phylip, clustal, stockholm)
+would have the effect of concatenating several multiple sequence alignments
+together. Such files are created by the PHYLIP suite of programs for
+bootstrap analysis.
+
+For sequential file formats (e.g. fasta, genbank) each "record block" holds
+a single sequence. For these files it would probably be safe to call
+write() multiple times.
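+
+For example, converting a FASTA file into the simple two column tab format
+with a single write() call (a sketch; the filenames are hypothetical)::
+
+    from Bio import SeqIO
+    in_handle = open("example.fasta", "rU")
+    out_handle = open("example.tab", "w")
+    count = SeqIO.write(SeqIO.parse(in_handle, "fasta"), out_handle, "tab")
+    in_handle.close()
+    out_handle.close()
+    print "Converted %i records" % count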
+
+File Formats
+============
+When specifying the file format, use lowercase strings. The same format
+names are also used in Bio.AlignIO and include the following:
+
+ - ace     - Reads the contig sequences from an ACE assembly file.
+ - embl    - The EMBL flat file format. Uses Bio.GenBank internally.
+ - fasta   - The generic sequence file format where each record starts with
+             an identifier line starting with a ">" character, followed by
+             lines of sequence.
+ - fastq   - A "FASTA like" format used by Sanger which also stores PHRED
+             sequence quality values.
+ - fastq-solexa - The Solexa/Illumina variant of the Sanger FASTQ format which
+             encodes Solexa quality scores (not PHRED quality scores).
+ - genbank - The GenBank or GenPept flat file format.
+ - gb      - An alias for "genbank", for consistency with NCBI Entrez Utilities
+ - ig      - The IntelliGenetics file format, apparently the same as the
+             MASE alignment format.
+ - phd     - Output from PHRED, used by PHRAP and CONSED for input.
+ - pir     - A "FASTA like" format introduced by the National Biomedical
+             Research Foundation (NBRF) for the Protein Information Resource
+             (PIR) database, now part of UniProt.
+ - swiss   - Plain text Swiss-Prot aka UniProt format.
+ - tab     - Simple two column tab separated sequence files, where each
+             line holds a record's identifier and sequence. For example,
+             this is used by Agilent's eArray software when saving
+             microarray probes in a minimal tab delimited text file.
+ - qual    - A "FASTA like" format holding PHRED quality values from
+             sequencing DNA, but no actual sequences (usually provided
+             in separate FASTA files).
+
+Note that while Bio.SeqIO can read all the above file formats, it cannot
+write to all of them.
+
+You can also use any file format supported by Bio.AlignIO, such as "nexus",
+"phylip" and "stockholm", which gives you access to the individual sequences
+making up each alignment as SeqRecords.
+"""
+__docformat__ = "epytext en" #not just plaintext
+
+#TODO
+# - define policy on reading aligned sequences with gaps in
+#   (e.g. - and . characters) including how the alphabet interacts
+#
+# - Can we build the to_alignment(...) functionality
+#   into the generic Alignment class instead?
+#
+# - How best to handle unique/non unique record.id when writing.
+#   For most file formats reading such files is fine; the stockholm
+#   parser would fail.
+#
+# - MSF multiple alignment format, aka GCG, aka PileUp format (*.msf)
+#   http://www.bioperl.org/wiki/MSF_multiple_alignment_format
+
+"""
+FAO BioPython Developers
+========================
+The way I envision this SeqIO system working is that for any sequence file
+format we have an iterator that returns SeqRecord objects.
+
+This also applies to interlaced file formats (like clustal - although that
+is now handled via Bio.AlignIO instead) where the file cannot be read record
+by record. You should still return an iterator, even if the implementation
+could just as easily return a list.
+
+These file format specific sequence iterators may be implemented as:
+* Classes which take a handle for __init__ and provide the __iter__ method
+* Functions that take a handle, and return an iterator object
+* Generator functions that take a handle, and yield SeqRecord objects
+
+It is then trivial to turn this iterator into a list of SeqRecord objects,
+an in memory dictionary, or a multiple sequence alignment object.
+
+For building the dictionary by default the id property of each SeqRecord is
+used as the key.
+You should always populate the id property, and it should be unique in
+most cases. For some file formats the accession number is a good choice.
+If the file itself contains ambiguous identifiers, don't try and
+dis-ambiguate them - return them as is.
+
+When adding a new file format, please use the same lower case format name
+as BioPerl, or if they have not defined one, try the names used by EMBOSS.
+
+See also http://biopython.org/wiki/SeqIO_dev
+
+--Peter
+"""
+
+import os
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Align.Generic import Alignment
+from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
+
+import AceIO
+import FastaIO
+import IgIO #IntelliGenetics or MASE format
+import InsdcIO #EMBL and GenBank
+import PhdIO
+import PirIO
+import SwissIO
+import TabIO
+import QualityIO #FastQ and qual files
+
+
+#Convention for format names is "mainname-subtype" in lower case.
+#Please use the same names as BioPerl where possible.
+#
+#Note that this simple system copes with defining
+#multiple possible iterators for a given format/extension
+#with the -subtype suffix
+#
+#Most alignment file formats will be handled via Bio.AlignIO
+
+_FormatToIterator = {"fasta" : FastaIO.FastaIterator,
+                     "gb" : InsdcIO.GenBankIterator,
+                     "genbank" : InsdcIO.GenBankIterator,
+                     "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
+                     "embl" : InsdcIO.EmblIterator,
+                     "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
+                     "ig" : IgIO.IgIterator,
+                     "swiss" : SwissIO.SwissIterator,
+                     "phd" : PhdIO.PhdIterator,
+                     "ace" : AceIO.AceIterator,
+                     "tab" : TabIO.TabIterator,
+                     "pir" : PirIO.PirIterator,
+                     "fastq" : QualityIO.FastqPhredIterator,
+                     "fastq-solexa" : QualityIO.FastqSolexaIterator,
+                     "qual" : QualityIO.QualPhredIterator,
+                     }
+
+_FormatToWriter = {"fasta" : FastaIO.FastaWriter,
+                   "gb" : InsdcIO.GenBankWriter,
+                   "genbank" : InsdcIO.GenBankWriter,
+                   "tab" : TabIO.TabWriter,
+                   "fastq" : QualityIO.FastqPhredWriter,
+                   "fastq-solexa" : QualityIO.FastqSolexaWriter,
+                   "qual" : QualityIO.QualPhredWriter,
+                   }
+
+def write(sequences, handle, format) :
+    """Write complete set of sequences to a file.
+
+     - sequences - A list (or iterator) of SeqRecord objects.
+     - handle    - File handle object to write to.
+     - format    - lower case string describing the file format to write.
+
+    You should close the handle after calling this function.
+
+    Returns the number of records written (as an integer).
+    """
+    from Bio import AlignIO
+
+    #Try and give helpful error messages:
+    if isinstance(handle, basestring) :
+        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
+    if not isinstance(format, basestring) :
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format :
+        raise ValueError("Format required (lower case string)")
+    if format != format.lower() :
+        raise ValueError("Format string '%s' should be lower case" % format)
+    if isinstance(sequences, SeqRecord):
+        raise ValueError("Use a SeqRecord list/iterator, not just a single SeqRecord")
+
+    #Map the file format to a writer class
+    if format in _FormatToWriter :
+        writer_class = _FormatToWriter[format]
+        count = writer_class(handle).write_file(sequences)
+    elif format in AlignIO._FormatToWriter :
+        #Try and turn all the records into a single alignment,
+        #and write that using Bio.AlignIO
+        alignment = to_alignment(sequences)
+        alignment_count = AlignIO.write([alignment], handle, format)
+        assert alignment_count == 1, "Internal error - the underlying writer " \
+               + "should have returned 1, not %s" % repr(alignment_count)
+        count = len(alignment.get_all_seqs())
+        del alignment_count, alignment
+    elif format in _FormatToIterator or format in AlignIO._FormatToIterator :
+        raise ValueError("Reading format '%s' is supported, but not writing" \
+                         % format)
+    else :
+        raise ValueError("Unknown format '%s'" % format)
+
+    assert isinstance(count, int), "Internal error - the underlying writer " \
+           + "should have returned the record count, not %s" % repr(count)
+    return count
+
+def parse(handle, format, alphabet=None) :
+    r"""Turns a sequence file into an iterator returning SeqRecords.
+
+     - handle   - handle to the file.
+     - format   - lower case string describing the file format.
+     - alphabet - optional Alphabet object, useful when the sequence type
+                  cannot be automatically inferred from the file itself
+                  (e.g. format="fasta" or "tab")
+
+    Typical usage, opening a file to read in, and looping over the record(s):
+
+    >>> from Bio import SeqIO
+    >>> filename = "Nucleic/sweetpea.nu"
+    >>> for record in SeqIO.parse(open(filename,"rU"), "fasta") :
+    ...    print "ID", record.id
+    ...    print "Sequence length", len(record)
+    ...    print "Sequence alphabet", record.seq.alphabet
+    ID gi|3176602|gb|U78617.1|LOU78617
+    Sequence length 309
+    Sequence alphabet SingleLetterAlphabet()
+
+    For file formats like FASTA where the alphabet cannot be determined, it
+    may be useful to specify the alphabet explicitly:
+
+    >>> from Bio import SeqIO
+    >>> from Bio.Alphabet import generic_dna
+    >>> filename = "Nucleic/sweetpea.nu"
+    >>> for record in SeqIO.parse(open(filename,"rU"), "fasta", generic_dna) :
+    ...    print "ID", record.id
+    ...    print "Sequence length", len(record)
+    ...    print "Sequence alphabet", record.seq.alphabet
+    ID gi|3176602|gb|U78617.1|LOU78617
+    Sequence length 309
+    Sequence alphabet DNAAlphabet()
+
+    If you have a string 'data' containing the file contents, you must
+    first turn this into a handle in order to parse it:
+
+    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
+    >>> from Bio import SeqIO
+    >>> from StringIO import StringIO
+    >>> for record in SeqIO.parse(StringIO(data), "fasta") :
+    ...     print record.id, record.seq
+    Alpha ACCGGATGTA
+    Beta AGGCTCGGTTA
+
+    Use the Bio.SeqIO.read(handle, format) function when you expect a single
+    record only.
+    """
+    #NOTE - The above docstring has some raw \n characters needed
+    #for the StringIO example, hence the whole docstring is in raw
+    #string mode (see the leading r before the opening quote).
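+    #Dispatch on the format name: sequence file formats are handled here
+    #via _FormatToIterator, while alignment-only formats fall through to
+    #Bio.AlignIO below.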
+
+    from Bio import AlignIO
+
+    #Try and give helpful error messages:
+    if isinstance(handle, basestring) :
+        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
+    if not isinstance(format, basestring) :
+        raise TypeError("Need a string for the file format (lower case)")
+    if not format :
+        raise ValueError("Format required (lower case string)")
+    if format != format.lower() :
+        raise ValueError("Format string '%s' should be lower case" % format)
+    if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
+                                     isinstance(alphabet, AlphabetEncoder)) :
+        raise ValueError("Invalid alphabet, %s" % repr(alphabet))
+
+    #Map the file format to a sequence iterator:
+    if format in _FormatToIterator :
+        iterator_generator = _FormatToIterator[format]
+        if alphabet is None :
+            return iterator_generator(handle)
+        try :
+            return iterator_generator(handle, alphabet=alphabet)
+        except TypeError :
+            #This iterator doesn't take an alphabet argument,
+            #so apply the alphabet to each record afterwards
+            return _force_alphabet(iterator_generator(handle), alphabet)
+    elif format in AlignIO._FormatToIterator :
+        #Use Bio.AlignIO to read in the alignments
+        #TODO - Once we drop support for Python 2.3, this helper function can be
+        #replaced with a generator expression.
+        return _iterate_via_AlignIO(handle, format, alphabet)
+    else :
+        raise ValueError("Unknown format '%s'" % format)
+
+#This is a generator function
+def _iterate_via_AlignIO(handle, format, alphabet) :
+    """Iterate over all records in several alignments (PRIVATE)."""
+    from Bio import AlignIO
+    for align in AlignIO.parse(handle, format, alphabet=alphabet) :
+        for record in align :
+            yield record
+
+def _force_alphabet(record_iterator, alphabet) :
+    """Iterate over records, over-riding the alphabet (PRIVATE)."""
+    #Assume the alphabet argument has been pre-validated
+    given_base_class = _get_base_alphabet(alphabet).__class__
+    for record in record_iterator :
+        if isinstance(_get_base_alphabet(record.seq.alphabet),
+                      given_base_class) :
+            record.seq.alphabet = alphabet
+            yield record
+        else :
+            raise ValueError("Specified alphabet %s clashes with "\
+                             "that determined from the file, %s" \
+                             % (repr(alphabet), repr(record.seq.alphabet)))
+
+def read(handle, format, alphabet=None) :
+    """Turns a sequence file into a single SeqRecord.
+
+     - handle   - handle to the file.
+     - format   - string describing the file format.
+     - alphabet - optional Alphabet object, useful when the sequence type
+                  cannot be automatically inferred from the file itself
+                  (e.g. format="fasta" or "tab")
+
+    This function is for use parsing sequence files containing
+    exactly one record. For example, reading a GenBank file:
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read(open("GenBank/arab1.gb", "rU"), "genbank")
+    >>> print "ID", record.id
+    ID AC007323.5
+    >>> print "Sequence length", len(record)
+    Sequence length 86436
+    >>> print "Sequence alphabet", record.seq.alphabet
+    Sequence alphabet IUPACAmbiguousDNA()
+
+    If the handle contains no records, or more than one record,
+    an exception is raised. For example:
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.read(open("GenBank/cor6_6.gb", "rU"), "genbank")
+    Traceback (most recent call last):
+        ...
+    ValueError: More than one record found in handle
+
+    If, however, you want the first record from a file containing multiple
+    records, this function would raise an exception (as shown in the example
+    above).
+    Instead use:
+
+    >>> from Bio import SeqIO
+    >>> record = SeqIO.parse(open("GenBank/cor6_6.gb", "rU"), "genbank").next()
+    >>> print "First record's ID", record.id
+    First record's ID X55053.1
+
+    Use the Bio.SeqIO.parse(handle, format) function if you want
+    to read multiple records from the handle.
+    """
+    iterator = parse(handle, format, alphabet)
+    try :
+        first = iterator.next()
+    except StopIteration :
+        first = None
+    if first is None :
+        raise ValueError("No records found in handle")
+    try :
+        second = iterator.next()
+    except StopIteration :
+        second = None
+    if second is not None :
+        raise ValueError("More than one record found in handle")
+    return first
+
+def to_dict(sequences, key_function=None) :
+    """Turns a sequence iterator or list into a dictionary.
+
+     - sequences    - An iterator that returns SeqRecord objects,
+                      or simply a list of SeqRecord objects.
+     - key_function - Optional function which when given a SeqRecord
+                      returns a unique string for the dictionary key.
+
+    e.g. key_function = lambda rec : rec.name
+    or,  key_function = lambda rec : rec.description.split()[0]
+
+    If key_function is omitted then record.id is used, on the
+    assumption that the records objects returned are SeqRecords
+    with a unique id field.
+
+    If there are duplicate keys, an error is raised.
+
+    Example usage, defaulting to using the record.id as key:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("GenBank/cor6_6.gb", "rU")
+    >>> format = "genbank"
+    >>> id_dict = SeqIO.to_dict(SeqIO.parse(handle, format))
+    >>> print id_dict.keys()
+    ['L31939.1', 'AJ237582.1', 'X62281.1', 'AF297471.1', 'X55053.1', 'M81224.1']
+    >>> print id_dict["L31939.1"].description
+    Brassica rapa (clone bif72) kin mRNA, complete cds.
+
+    A more complex example, using the key_function argument in order to use
+    a sequence checksum as the dictionary key:
+
+    >>> from Bio import SeqIO
+    >>> from Bio.SeqUtils.CheckSum import seguid
+    >>> handle = open("GenBank/cor6_6.gb", "rU")
+    >>> format = "genbank"
+    >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(handle, format),
+    ...                             key_function = lambda rec : seguid(rec.seq))
+    >>> for key, record in seguid_dict.iteritems() :
+    ...     print key, record.id
+    SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
+    l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
+    /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
+    TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
+    uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1
+    BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
+    """
+    if key_function is None :
+        key_function = lambda rec : rec.id
+
+    d = dict()
+    for record in sequences :
+        key = key_function(record)
+        if key in d :
+            raise ValueError("Duplicate key '%s'" % key)
+        d[key] = record
+    return d
+
+
+def to_alignment(sequences, alphabet=None, strict=True) :
+    """Returns a multiple sequence alignment (OBSOLETE).
+
+     - sequences - An iterator that returns SeqRecord objects,
+                   or simply a list of SeqRecord objects. All
+                   the record sequences must be the same length.
+     - alphabet  - Optional alphabet. Strongly recommended.
+     - strict    - Optional, defaults to True. Should error checking
+                   be done?
+
+    Using this function is now discouraged. Rather than doing this:
+
+    >>> from Bio import SeqIO
+    >>> handle = open("Clustalw/protein.aln")
+    >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
+    >>> handle.close()
+
+    You are now encouraged to use Bio.AlignIO instead, e.g.
+
+    >>> from Bio import AlignIO
+    >>> handle = open("Clustalw/protein.aln")
+    >>> alignment = AlignIO.read(handle, "clustal")
+    >>> handle.close()
+    """
+    #TODO - Move this functionality into the Alignment class instead?
+    from Bio.Alphabet import generic_alphabet
+    from Bio.Alphabet import _consensus_alphabet
+    from Bio.Alphabet import Gapped #Needed for the alphabet checks below
+    if alphabet is None :
+        sequences = list(sequences)
+        alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences \
+                                        if rec.seq is not None])
+
+    if not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)) :
+        raise ValueError("Invalid alphabet")
+
+    alignment_length = None
+    alignment = Alignment(alphabet)
+    for record in sequences :
+        if strict :
+            if alignment_length is None :
+                alignment_length = len(record.seq)
+            elif alignment_length != len(record.seq) :
+                raise ValueError("Sequences must all be the same length")
+
+            assert isinstance(record.seq.alphabet, Alphabet) \
+                   or isinstance(record.seq.alphabet, AlphabetEncoder), \
+                   "Sequence does not have a valid alphabet"
+
+            #TODO - Move this alphabet comparison code into the Alphabet module/class?
+            #TODO - Is a normal alphabet "ungapped" by default, or does it just mean
+            #undecided?
+            if isinstance(record.seq.alphabet, Alphabet) \
+            and isinstance(alphabet, Alphabet) :
+                #Comparing two non-gapped alphabets
+                if not isinstance(record.seq.alphabet, alphabet.__class__) :
+                    raise ValueError("Incompatible sequence alphabet " \
+                                     + "%s for %s alignment" \
+                                     % (record.seq.alphabet, alphabet))
+            elif isinstance(record.seq.alphabet, AlphabetEncoder) \
+            and isinstance(alphabet, Alphabet) :
+                raise ValueError("Sequence has a gapped alphabet, alignment does not")
+            elif isinstance(record.seq.alphabet, Alphabet) \
+            and isinstance(alphabet, Gapped) :
+                #Sequence isn't gapped, alignment is.
+                if not isinstance(record.seq.alphabet, alphabet.alphabet.__class__) :
+                    raise ValueError("Incompatible sequence alphabet " \
+                                     + "%s for %s alignment" \
+                                     % (record.seq.alphabet, alphabet))
+            else :
+                #Comparing two gapped alphabets
+                if not isinstance(record.seq.alphabet, alphabet.__class__) :
+                    raise ValueError("Incompatible sequence alphabet " \
+                                     + "%s for %s alignment" \
+                                     % (record.seq.alphabet, alphabet))
+                if record.seq.alphabet.gap_char != alphabet.gap_char :
+                    raise ValueError("Sequence gap characters != alignment gap char")
+        #ToDo, additional checks on the specified alignment...
+        #Should we look at the alphabet.contains() method?
+        if record.seq is None :
+            raise TypeError("SeqRecord (id=%s) has None for its sequence." % record.id)
+
+        #This is abusing the "private" records list,
+        #we should really have a method like add_sequence
+        #but which takes SeqRecord objects. See also Bug 1944
+        alignment._records.append(record)
+    return alignment
+
+def _test():
+    """Run the Bio.SeqIO module's doctests.
+
+    This will try and locate the unit tests directory, and run the doctests
+    from there in order that the relative paths used in the examples work.
+    """
+    import doctest
+    import os
+    if os.path.isdir(os.path.join("..","..","Tests")) :
+        print "Running doctests..."
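+        #Change into the Tests directory so that the relative paths used
+        #in the doctest examples (e.g. "Fasta/f002") resolve correctly.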
+        cur_dir = os.path.abspath(os.curdir)
+        os.chdir(os.path.join("..","..","Tests"))
+        doctest.testmod()
+        os.chdir(cur_dir)
+        del cur_dir
+        print "Done"
+
+if __name__ == "__main__":
+    #Run the doctests
+    _test()
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.py b/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.py
new file mode 100644
index 0000000..46a426c
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.py
@@ -0,0 +1,628 @@
+# Copyright 2000-2002 Andrew Dalke.
+# Copyright 2002-2004 Brad Chapman.
+# Copyright 2006-2009 by Peter Cock.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Represent a Sequence Record, a sequence with annotation."""
+__docformat__ = "epytext en" #Simple markup to show doctests nicely
+
+# NEEDS TO BE KEPT IN SYNC WITH THE REST OF BIOPYTHON AND BIOPERL
+# In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes
+# need to be in sync (this is the BioSQL "Database SeqRecord", see
+# also BioSQL.BioSeq.DBSeq which is the "Database Seq" class)
+
+class _RestrictedDict(dict):
+    """Dict which only allows sequences of given length as values (PRIVATE).
+
+    This simple subclass of the python dictionary is used in the SeqRecord
+    object for holding per-letter-annotations. This class is intended to
+    prevent simple errors by only allowing python sequences (e.g. lists,
+    strings and tuples) to be stored, and only if their length matches that
+    expected (the length of the SeqRecord's seq object). It cannot however
+    prevent the entries being edited in situ (for example appending entries
+    to a list).
+    """
+    def __init__(self, length) :
+        """Create an EMPTY restricted dictionary."""
+        dict.__init__(self)
+        self._length = int(length)
+    def __setitem__(self, key, value) :
+        if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
+        or len(value) != self._length :
+            raise TypeError("We only allow python sequences (lists, tuples or "
+                            "strings) of length %i." % self._length)
+        dict.__setitem__(self, key, value)
+
+class SeqRecord(object):
+    """A SeqRecord object holds a sequence and information about it.
+
+    Main attributes:
+     - id          - Identifier such as a locus tag (string)
+     - seq         - The sequence itself (Seq object)
+
+    Additional attributes:
+     - name        - Sequence name, e.g. gene name (string)
+     - description - Additional text (string)
+     - dbxrefs     - List of database cross references (list of strings)
+     - features    - Any (sub)features defined (list of SeqFeature objects)
+     - annotations - Further information about the whole sequence (dictionary)
+                     Most entries are lists of strings.
+     - letter_annotations - Per letter/symbol annotation (restricted
+                     dictionary). This holds python sequences (lists, strings
+                     or tuples) whose length matches that of the sequence.
+                     A typical use would be to hold a list of integers
+                     representing sequencing quality scores, or a string
+                     representing the secondary structure.
+
+    You will typically use Bio.SeqIO to read in sequences from files as
+    SeqRecord objects. However, you may want to create your own SeqRecord
+    objects directly (see the __init__ method for further details):
+
+    >>> from Bio.Seq import Seq
+    >>> from Bio.SeqRecord import SeqRecord
+    >>> from Bio.Alphabet import IUPAC
+    >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+    ...                        IUPAC.protein),
+    ...                    id="YP_025292.1", name="HokC",
+    ...                    description="toxic membrane protein")
+    >>> print record
+    ID: YP_025292.1
+    Name: HokC
+    Description: toxic membrane protein
+    Number of features: 0
+    Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+    If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
+    for this. For the special case where you want the SeqRecord turned into
+    a string in a particular file format there is a format method which uses
+    Bio.SeqIO internally:
+
+    >>> print record.format("fasta")
+    >YP_025292.1 toxic membrane protein
+    MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+    <BLANKLINE>
+
+    """
+    def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
+                 description = "<unknown description>", dbxrefs = None,
+                 features = None):
+        """Create a SeqRecord.
+
+        Arguments:
+         - seq         - Sequence, required (Seq or MutableSeq object)
+         - id          - Sequence identifier, recommended (string)
+         - name        - Sequence name, optional (string)
+         - description - Sequence description, optional (string)
+         - dbxrefs     - Database cross references, optional (list of strings)
+         - features    - Any (sub)features, optional (list of SeqFeature objects)
+
+        You will typically use Bio.SeqIO to read in sequences from files as
+        SeqRecord objects. However, you may want to create your own SeqRecord
+        objects directly.
+
+        Note that while an id is optional, we strongly recommend you supply a
+        unique id string for each record. This is especially important
+        if you wish to write your sequences to a file.
+
+        If you don't have the actual sequence, but you do know its length,
+        then using the UnknownSeq object from Bio.Seq is appropriate.
+
+        You can create a 'blank' SeqRecord object, and then populate the
+        attributes later. Note that currently the annotations and the
+        letter_annotations dictionaries cannot be specified when creating
+        the SeqRecord.
+        """
+        if id is not None and not isinstance(id, basestring) :
+            #Lots of existing code uses id=None... this may be a bad idea.
+            raise TypeError("id argument should be a string")
+        if not isinstance(name, basestring) :
+            raise TypeError("name argument should be a string")
+        if not isinstance(description, basestring) :
+            raise TypeError("description argument should be a string")
+        if dbxrefs is not None and not isinstance(dbxrefs, list) :
+            raise TypeError("dbxrefs argument should be a list (of strings)")
+        if features is not None and not isinstance(features, list) :
+            raise TypeError("features argument should be a list (of SeqFeature objects)")
+        self._seq = seq
+        self.id = id
+        self.name = name
+        self.description = description
+        if dbxrefs is None:
+            dbxrefs = []
+        self.dbxrefs = dbxrefs
+        # annotations about the whole sequence
+        self.annotations = {}
+
+        # annotations about each letter in the sequence
+        if seq is None :
+            #Should we allow this and use a normal unrestricted dict?
+            self._per_letter_annotations = _RestrictedDict(length=0)
+        else :
+            try :
+                self._per_letter_annotations = _RestrictedDict(length=len(seq))
+            except TypeError :
+                raise TypeError("seq argument should be a Seq or MutableSeq")
+
+        # annotations about parts of the sequence
+        if features is None:
+            features = []
+        self.features = features
+
+    #TODO - Just make this a read only property?
+    def _set_per_letter_annotations(self, value) :
+        if not isinstance(value, dict) :
+            raise TypeError("The per-letter-annotations should be a "
+                            "(restricted) dictionary.")
+        #Turn this into a restricted-dictionary (and check the entries)
+        try :
+            self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+        except AttributeError :
+            #e.g. seq is None
+            self._per_letter_annotations = _RestrictedDict(length=0)
+        self._per_letter_annotations.update(value)
+    letter_annotations = property( \
+        fget=lambda self : self._per_letter_annotations,
+        fset=_set_per_letter_annotations,
+        doc="""Dictionary of per-letter-annotation for the sequence.
+
+        For example, this can hold quality scores used in FASTQ or QUAL files.
+        Consider this example using Bio.SeqIO to read in an example Solexa
+        variant FASTQ file as a SeqRecord:
+
+        >>> from Bio import SeqIO
+        >>> handle = open("Quality/solexa.fastq", "rU")
+        >>> record = SeqIO.read(handle, "fastq-solexa")
+        >>> handle.close()
+        >>> print record.id, record.seq
+        slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+        >>> print record.letter_annotations.keys()
+        ['solexa_quality']
+        >>> print record.letter_annotations["solexa_quality"]
+        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+        The per-letter-annotations get sliced automatically if you slice the
+        parent SeqRecord, for example taking the last ten bases:
+
+        >>> sub_record = record[-10:]
+        >>> print sub_record.id, sub_record.seq
+        slxa_0013_1_0001_24 CTTATACACC
+        >>> print sub_record.letter_annotations["solexa_quality"]
+        [-6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
+
+        Any python sequence (i.e. list, tuple or string) can be recorded in
+        the SeqRecord's letter_annotations dictionary as long as the length
+        matches that of the SeqRecord's sequence. e.g.
+
+        >>> len(sub_record.letter_annotations)
+        1
+        >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
+        >>> len(sub_record.letter_annotations)
+        2
+
+        You can delete entries from the letter_annotations dictionary as usual:
+
+        >>> del sub_record.letter_annotations["solexa_quality"]
+        >>> sub_record.letter_annotations
+        {'dummy': 'abcdefghij'}
+
+        You can completely clear the dictionary easily as follows:
+
+        >>> sub_record.letter_annotations = {}
+        >>> sub_record.letter_annotations
+        {}
+        """)
+
+    def _set_seq(self, value) :
+        #TODO - Add a deprecation warning that the seq should be write only?
+        if self._per_letter_annotations :
+            #TODO - Make this a warning? Silently empty the dictionary?
+            raise ValueError("You must empty the letter annotations first!")
+        self._seq = value
+        try :
+            self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
+        except AttributeError :
+            #e.g. seq is None
+            self._per_letter_annotations = _RestrictedDict(length=0)
+
+    seq = property(fget=lambda self : self._seq,
+                   fset=_set_seq,
+                   doc="The sequence itself, as a Seq or MutableSeq object.")
+
+    def __getitem__(self, index) :
+        """Returns a sub-sequence or an individual letter.
+
+        Slicing, e.g. my_record[5:10], returns a new SeqRecord for
+        that sub-sequence with appropriate annotation preserved. The
+        name, id and description are kept.
+
+        Any per-letter-annotations are sliced to match the requested
+        sub-sequence. Unless a stride is used, all those features
+        which fall fully within the subsequence are included (with
+        their locations adjusted accordingly).
+
+        However, the annotations dictionary and the dbxrefs list are
+        not used for the new SeqRecord, as in general they may not
+        apply to the subsequence. If you want to preserve them, you
+        must explicitly copy them to the new SeqRecord yourself.
+
+        Using an integer index, e.g. my_record[5] is shorthand for
+        extracting that letter from the sequence, my_record.seq[5].
+
+        For example, consider this short protein and its secondary
+        structure as encoded by the PDB (e.g. H for alpha helices),
+        plus a simple feature for its histidine self phosphorylation
+        site:
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
+        >>> from Bio.Alphabet import IUPAC
+        >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
+        ...                     "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
+        ...                     IUPAC.protein),
+        ...                 id="1JOY", name="EnvZ",
+        ...                 description="Homodimeric domain of EnvZ from E. coli")
+        >>> rec.letter_annotations["secondary_structure"] = \
+            "  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  "
+        >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
+        ...                     type = "Site"))
+
+        Now let's have a quick look at the full record,
+
+        >>> print rec
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
+        >>> print rec.letter_annotations["secondary_structure"]
+          S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
+        >>> print rec.features[0].location
+        [20:21]
+
+        Now let's take a sub sequence, here chosen as the first (fractured)
+        alpha helix which includes the histidine phosphorylation site:
+
+        >>> sub = rec[11:41]
+        >>> print sub
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
+        >>> print sub.letter_annotations["secondary_structure"]
+        HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
+        >>> print sub.features[0].location
+        [9:10]
+
+        You can also of course omit the start or end values, for
+        example to get the first ten letters only:
+
+        >>> print rec[:10]
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 0
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLAD', IUPACProtein())
+
+        Or for the last ten letters:
+
+        >>> print rec[-10:]
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 0
+        Per letter annotation for: secondary_structure
+        Seq('IIEQFIDYLR', IUPACProtein())
+
+        If you omit both, then you get a copy of the original record (although
+        lacking the annotations and dbxrefs):
+
+        >>> print rec[:]
+        ID: 1JOY
+        Name: EnvZ
+        Description: Homodimeric domain of EnvZ from E. coli
+        Number of features: 1
+        Per letter annotation for: secondary_structure
+        Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
+
+        Finally, indexing with a simple integer is shorthand for pulling out
+        that letter from the sequence directly:
+
+        >>> rec[5]
+        'K'
+        >>> rec.seq[5]
+        'K'
+        """
+        if isinstance(index, int) :
+            #NOTE - The sequence level annotation like the id, name, etc
+            #do not really apply to a single character. However, should
+            #we try and expose any per-letter-annotation here? If so how?
+            return self.seq[index]
+        elif isinstance(index, slice) :
+            if self.seq is None :
+                raise ValueError("If the sequence is None, we cannot slice it.")
+            parent_length = len(self)
+            answer = self.__class__(self.seq[index],
+                                    id=self.id,
+                                    name=self.name,
+                                    description=self.description)
+            #TODO - The description may no longer apply.
+            #It would be safer to change it to something
+            #generic like "edited" or the default value.
+
+            #Don't copy the annotation dict and dbxrefs list,
+            #they may not apply to a subsequence.
+            #answer.annotations = dict(self.annotations.iteritems())
+            #answer.dbxrefs = self.dbxrefs[:]
+
+            #TODO - Cope with strides by generating ambiguous locations?
+            if index.step is None or index.step == 1 :
+                #Select relevant features, add them with shifted locations
+                if index.start is None :
+                    start = 0
+                else :
+                    start = index.start
+                if index.stop is None :
+                    stop = -1
+                else :
+                    stop = index.stop
+                if (start < 0 or stop < 0) and parent_length == 0 :
+                    raise ValueError, \
+                          "Cannot support negative indices without the sequence length"
+                if start < 0 :
+                    #Negative indices count back from the end of the sequence
+                    start = max(0, parent_length + start)
+                if stop < 0 :
+                    stop = parent_length + stop + 1
+                #assert str(self.seq)[index] == str(self.seq)[start:stop]
+                for f in self.features :
+                    if start <= f.location.start.position \
+                    and f.location.end.position < stop :
+                        answer.features.append(f._shift(-start))
+
+            #Slice all the values to match the sliced sequence
+            #(this should also work with strides, even negative strides):
+            for key, value in self.letter_annotations.iteritems() :
+                answer._per_letter_annotations[key] = value[index]
+
+            return answer
+        raise ValueError, "Invalid index"
+
+    def __iter__(self) :
+        """Iterate over the letters in the sequence.
+
+        For example, using Bio.SeqIO to read in a protein FASTA file:
+
+        >>> from Bio import SeqIO
+        >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
+        >>> for amino in record :
+        ...     print amino
+        ...     if amino == "L" : break
+        X
+        A
+        G
+        L
+        >>> print record.seq[3]
+        L
+
+        This is just a shortcut for iterating over the sequence directly:
+
+        >>> for amino in record.seq :
+        ...     print amino
+        ...     if amino == "L" : break
+        X
+        A
+        G
+        L
+        >>> print record.seq[3]
+        L
+
+        Note that this does not facilitate iteration together with any
+        per-letter-annotation. However, you can achieve that using the
+        python zip function on the record (or its sequence) and the relevant
+        per-letter-annotation:
+
+        >>> from Bio import SeqIO
+        >>> rec = SeqIO.read(open("Quality/solexa.fastq", "rU"),
+        ...                  "fastq-solexa")
+        >>> print rec.id, rec.seq
+        slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
+        >>> print rec.letter_annotations.keys()
+        ['solexa_quality']
+        >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
+        ...     if qual < -10 :
+        ...         print nuc, qual
+        C -19
+        C -27
+        C -18
+
+        You may agree that using zip(rec.seq, ...) is more explicit than using
+        zip(rec, ...) as shown above.
+        """
+        return iter(self.seq)
+
+    def __str__(self) :
+        """A human readable summary of the record and its annotation (string).
+
+        The python built in function str works by calling the object's
+        __str__ method. e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Alphabet import IUPAC
+        >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+        ...                        IUPAC.protein),
+        ...                    id="YP_025292.1", name="HokC",
+        ...                    description="toxic membrane protein, small")
+        >>> print str(record)
+        ID: YP_025292.1
+        Name: HokC
+        Description: toxic membrane protein, small
+        Number of features: 0
+        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+        In this example you don't actually need to call str explicitly, as the
+        print command does this automatically:
+
+        >>> print record
+        ID: YP_025292.1
+        Name: HokC
+        Description: toxic membrane protein, small
+        Number of features: 0
+        Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
+
+        Note that long sequences are shown truncated.
+        """
+        lines = []
+        if self.id : lines.append("ID: %s" % self.id)
+        if self.name : lines.append("Name: %s" % self.name)
+        if self.description : lines.append("Description: %s" % self.description)
+        if self.dbxrefs : lines.append("Database cross-references: " \
+                                       + ", ".join(self.dbxrefs))
+        lines.append("Number of features: %i" % len(self.features))
+        for a in self.annotations:
+            lines.append("/%s=%s" % (a, str(self.annotations[a])))
+        if self.letter_annotations :
+            lines.append("Per letter annotation for: " \
+                         + ", ".join(self.letter_annotations.keys()))
+        #Don't want to include the entire sequence,
+        #and showing the alphabet is useful:
+        lines.append(repr(self.seq))
+        return "\n".join(lines)
+
+    def __repr__(self) :
+        """A concise summary of the record for debugging (string).
+
+        The python built in function repr works by calling the object's
+        __repr__ method. e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Alphabet import generic_protein
+        >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
+        ...                     +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
+        ...                     +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
+        ...                     +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
+        ...                     generic_protein),
+        ...                 id="NP_418483.1", name="b4059",
+        ...                 description="ssDNA-binding protein",
+        ...                 dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
+        >>> print repr(rec)
+        SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+        At the python prompt you can also use this shorthand:
+
+        >>> rec
+        SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
+
+        Note that long sequences are shown truncated.
+        """
+        return self.__class__.__name__ \
+               + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
+               % tuple(map(repr, (self.seq, self.id, self.name,
+                                  self.description, self.dbxrefs)))
+
+    def format(self, format) :
+        r"""Returns the record as a string in the specified file format.
+
+        The format should be a lower case string supported as an output
+        format by Bio.SeqIO, which is used to turn the SeqRecord into a
+        string. e.g.
+
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> from Bio.Alphabet import IUPAC
+        >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
+        ...                        IUPAC.protein),
+        ...                    id="YP_025292.1", name="HokC",
+        ...                    description="toxic membrane protein")
+        >>> record.format("fasta")
+        '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
+        >>> print record.format("fasta")
+        >YP_025292.1 toxic membrane protein
+        MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
+        <BLANKLINE>
+
+        The python print command automatically appends a new line, meaning
+        in this example a blank line is shown. If you look at the string
+        representation you can see there is a trailing new line (shown as
+        slash n) which is important when writing to a file or if
+        concatenating multiple sequence strings together.
+
+        Note that this method will NOT work on every possible file format
+        supported by Bio.SeqIO (e.g. some are for multiple sequences only).
+ """ + #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 + #See also the Bio.Align.Generic.Alignment class and its format() + return self.__format__(format) + + def __format__(self, format_spec) : + """Returns the record as a string in the specified file format. + + This method supports the python format() function added in + Python 2.6/3.0. The format_spec should be a lower case + string supported by Bio.SeqIO as an output file format. + See also the SeqRecord's format() method. + """ + if format_spec: + from StringIO import StringIO + from Bio import SeqIO + handle = StringIO() + SeqIO.write([self], handle, format_spec) + return handle.getvalue() + else : + #Follow python convention and default to using __str__ + return str(self) + + def __len__(self) : + """Returns the length of the sequence. + + For example, using Bio.SeqIO to read in a FASTA nucleotide file: + + >>> from Bio import SeqIO + >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta") + >>> len(record) + 309 + >>> len(record.seq) + 309 + """ + return len(self.seq) + + def __nonzero__(self) : + """Returns True regardless of the length of the sequence. + + This behaviour is for backwards compatibility, since until the + __len__ method was added, a SeqRecord always evaluated as True. + + Note that in comparison, a Seq object will evaluate to False if it + has a zero length sequence. + + WARNING: The SeqRecord may in future evaluate to False when its + sequence is of zero length (in order to better match the Seq + object behaviour)! + """ + return True + +def _test(): + """Run the Bio.SeqRecord module's doctests (PRIVATE). + + This will try and locate the unit tests directory, and run the doctests + from there in order that the relative paths used in the examples work. + """ + import doctest + import os + if os.path.isdir(os.path.join("..","Tests")) : + print "Runing doctests..." + cur_dir = os.path.abspath(os.curdir) + os.chdir(os.path.join("..","Tests")) + doctest.testmod() + os.chdir(cur_dir) + del cur_dir + print "Done" + +if __name__ == "__main__": + _test() diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.pyc b/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.pyc new file mode 100644 index 0000000..714ba11 Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/SeqRecord.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CheckSum.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CheckSum.py new file mode 100644 index 0000000..ae1a9d6 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CheckSum.py @@ -0,0 +1,124 @@ +# Copyright 2002 by Yves Bastide and Brad Chapman. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. 
+ +"""Functions to calculate assorted sequence checksums.""" + +# crc32, crc64, gcg, and seguid +# crc64 is adapted from BioPerl + +from binascii import crc32 as _crc32 + +def crc32(seq) : + """Returns the crc32 checksum for a sequence (string or Seq object)""" + try : + #Assume its a Seq object + return _crc32(seq.tostring()) + except AttributeError : + #Assume its a string + return _crc32(seq) + +def _init_table_h(): + _table_h = [] + for i in range(256): + l = i + part_h = 0 + for j in range(8): + rflag = l & 1 + l >>= 1 + if part_h & 1: l |= (1L << 31) + part_h >>= 1L + if rflag: part_h ^= 0xd8000000L + _table_h.append(part_h) + return _table_h + +# Initialisation +_table_h = _init_table_h() + +def crc64(s): + """Returns the crc64 checksum for a sequence (string or Seq object)""" + crcl = 0 + crch = 0 + for c in s: + shr = (crch & 0xFF) << 24 + temp1h = crch >> 8 + temp1l = (crcl >> 8) | shr + idx = (crcl ^ ord(c)) & 0xFF + crch = temp1h ^ _table_h[idx] + crcl = temp1l + + return "CRC-%08X%08X" % (crch, crcl) + + +def gcg(seq): + """Returns the GCG checksum (int) for a sequence (string or Seq object) + + Given a nucleotide or amino-acid secuence (or any string), + returns the GCG checksum (int). Checksum used by GCG program. + seq type = str. + Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi + with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina. + All sequences are converted to uppercase """ + index = checksum = 0 + if type(seq)!=type("aa"): + seq=seq.tostring() + for char in seq: + index += 1 + checksum += index * ord(char.upper()) + if index == 57: index = 0 + return checksum % 10000 + +def seguid(seq): + """Returns the SEGUID (string) for a sequence (string or Seq object) + + Given a nucleotide or amino-acid secuence (or any string), + returns the SEGUID string (A SEquence Globally Unique IDentifier). + seq type = str. 
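+    The result is always 27 characters long: a base64 encoding of the
+    20 byte SHA-1 digest with the single trailing "=" padding character
+    removed (compare the self-test values below).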
+    For more information about SEGUID, see:
+    http://bioinformatics.anl.gov/seguid/
+    DOI: 10.1002/pmic.200600032 """
+    try:
+        #Python 2.5 sha1 is in hashlib
+        import hashlib
+        m = hashlib.sha1()
+    except:
+        #For older versions
+        import sha
+        m = sha.new()
+    import base64
+    if type(seq)!=type("aa"):
+        seq=seq.tostring().upper()
+    else:
+        seq=seq.upper()
+    m.update(seq)
+    try:
+        #For Python 2.5
+        return base64.b64encode(m.digest()).rstrip("=")
+    except:
+        #For older versions
+        import os
+        #Note: base64.encodestring uses "\n" line breaks on every
+        #platform (not os.linesep, which is "\r\n" on Windows), so
+        #we strip "\n" rather than os.linesep here.
+        return base64.encodestring(m.digest()).replace("\n","").rstrip("=")
+
+if __name__ == "__main__" :
+    print "Quick self test"
+
+    str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+                        + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+                        + "YCSSYAGSSTLVFGGGTKLTVL"
+
+    str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+                        + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+                        + "YCCSYAGSSTWVFGGGTKLTVL"
+
+    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
+    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)
+
+    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
+    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)
+
+    print "Done"
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsage.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsage.py
new file mode 100644
index 0000000..84e213a
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsage.py
@@ -0,0 +1,153 @@
+import math
+from CodonUsageIndices import SharpEcoliIndex
+from Bio import SeqIO # To parse a FASTA file
+
+CodonsDict = {'TTT':0, 'TTC':0, 'TTA':0, 'TTG':0, 'CTT':0,
+'CTC':0, 'CTA':0, 'CTG':0, 'ATT':0, 'ATC':0,
+'ATA':0, 'ATG':0, 'GTT':0, 'GTC':0, 'GTA':0,
+'GTG':0, 'TAT':0, 'TAC':0, 'TAA':0, 'TAG':0,
+'CAT':0, 'CAC':0, 'CAA':0, 'CAG':0, 'AAT':0,
+'AAC':0, 'AAA':0, 'AAG':0, 'GAT':0, 'GAC':0,
+'GAA':0, 'GAG':0, 'TCT':0, 'TCC':0, 'TCA':0,
+'TCG':0, 'CCT':0, 'CCC':0, 'CCA':0, 'CCG':0,
+'ACT':0, 'ACC':0, 'ACA':0, 'ACG':0, 'GCT':0,
+'GCC':0, 'GCA':0, 'GCG':0, 'TGT':0, 'TGC':0,
+'TGA':0, 'TGG':0, 'CGT':0, 'CGC':0, 'CGA':0,
+'CGG':0, 'AGT':0, 'AGC':0, 'AGA':0, 'AGG':0,
+'GGT':0, 'GGC':0, 'GGA':0, 'GGG':0}
+
+
+# this dictionary is used to know which codons encode the same AA.
+SynonymousCodons = {'CYS': ['TGT', 'TGC'], 'ASP': ['GAT', 'GAC'],
+'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
+'GLN': ['CAA', 'CAG'], 'MET': ['ATG'], 'ASN': ['AAC', 'AAT'],
+'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 'LYS': ['AAG', 'AAA'],
+'STOP': ['TAG', 'TGA', 'TAA'], 'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
+'PHE': ['TTT', 'TTC'], 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
+'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 'ILE': ['ATC', 'ATA', 'ATT'],
+'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 'HIS': ['CAT', 'CAC'],
+'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 'TRP': ['TGG'],
+'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 'GLU': ['GAG', 'GAA'], 'TYR': ['TAT', 'TAC']}
+
+
+class CodonAdaptationIndex:
+    """A codon adaptation index (CAI) implementation.
+
+    This class implements the codon adaptation index (CAI) described by Sharp and
+    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
+
+    methods:
+
+    set_cai_index(Index):
+
+    This method sets up an index to be used when calculating CAI for a gene.
+    Just pass a dictionary similar to the SharpEcoliIndex in the CodonUsageIndices
+    module.
+
+    generate_index(FastaFile):
+
+    This method takes the location of a FastaFile and generates an index. This
+    index can later be used to calculate the CAI of a gene.
+
+    cai_for_gene(DNAsequence):
+
+    This method uses the Index (either the one you set or the one you generated)
+    and returns the CAI for the DNA sequence.
+
+    print_index():
+    This method prints out the index you used.
+
+    NOTE - This implementation does not currently cope with alternative genetic
+    codes; only the synonymous codons in the standard table are considered.
+    """
+    def __init__(self):
+        self.index = {}
+        self.codon_count={}
+
+    # use this method with a predefined CAI index
+    def set_cai_index(self, Index):
+        self.index = Index
+
+    def generate_index(self, FastaFile):
+        """Generate a codon usage index from a FASTA file of CDS sequences.
+
+        This method takes the location of a Fasta file containing CDS sequences
+        (which must all have a whole number of codons) and generates a codon
+        usage index. This index can later be used to calculate the CAI of a gene.
+        """
+        # first make sure we are not overwriting an existing index:
+        if self.index != {} or self.codon_count!={}:
+            raise ValueError("an index has already been set or a codon count has been done. cannot overwrite either.")
+        # count codon occurrences in the file.
+        self._count_codons(FastaFile)
+
+        # now to calculate the index we first need to sum the number of times
+        # synonymous codons were used all together.
+        for AA in SynonymousCodons.keys():
+            Sum=0.0
+            RSCU=[] # RSCU values are CodonCount/((1/num of synonymous codons) * sum of all synonymous codons)
+
+            for codon in SynonymousCodons[AA]:
+                Sum += self.codon_count[codon]
+            # calculate the RSCU value for each of the codons
+            for codon in SynonymousCodons[AA]:
+                RSCU.append(self.codon_count[codon]/((1.0/len(SynonymousCodons[AA]))*Sum))
+            # now generate the index W=RSCUi/RSCUmax:
+            RSCUmax = max(RSCU)
+            for i in range(len(SynonymousCodons[AA])):
+                self.index[SynonymousCodons[AA][i]]= RSCU[i]/RSCUmax
+
+
+    def cai_for_gene(self, DNAsequence):
+        """Calculate the CAI (float) for the provided DNA sequence (string).
+
+        This method uses the Index (either the one you set or the one you generated)
+        and returns the CAI for the DNA sequence.
+        """
+        caiValue = 0
+        LengthForCai = 0
+        # if no index is set or generated, the default SharpEcoliIndex will be used.
+        if self.index=={}:
+            self.set_cai_index(SharpEcoliIndex)
+
+        if DNAsequence.islower():
+            DNAsequence = DNAsequence.upper()
+        for i in range (0,len(DNAsequence),3):
+            codon = DNAsequence[i:i+3]
+            if codon in self.index:
+                # ATG and TGG are the only codons for their amino acids, so
+                # their index is always one; exclude them from the average.
+                if codon!='ATG' and codon!= 'TGG':
+                    caiValue += math.log(self.index[codon])
+                    LengthForCai += 1
+            elif codon not in ['TGA','TAA', 'TAG']: # some indices you will use may not include stop codons.
+                raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index))
+        return math.exp(caiValue*(1.0/(LengthForCai-1)))
+
+    def _count_codons(self, FastaFile):
+        handle = open(FastaFile, 'r')
+
+        # make the codon dictionary local
+        self.codon_count = CodonsDict.copy()
+
+        # iterate over sequence and count all the codons in the FastaFile.
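+        # Note: each record is assumed to be an in-frame CDS with a length
+        # that is a whole multiple of three; a trailing partial codon or an
+        # ambiguous base will be rejected as an illegal codon below.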
+        for cur_record in SeqIO.parse(handle, "fasta") :
+            # make sure the sequence is upper case
+            if str(cur_record.seq).islower():
+                DNAsequence = str(cur_record.seq).upper()
+            else:
+                DNAsequence = str(cur_record.seq)
+            for i in range(0,len(DNAsequence),3):
+                codon = DNAsequence[i:i+3]
+                if codon in self.codon_count:
+                    self.codon_count[codon] += 1
+                else:
+                    raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
+        handle.close()
+
+    # print out the index in use as tab separated codon/value pairs.
+    def print_index (self):
+        """This method prints out the index you used."""
+        X=self.index.keys()
+        X.sort()
+        for i in X:
+            print "%s\t%.3f" %(i, self.index[i])
+
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsageIndices.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsageIndices.py
new file mode 100644
index 0000000..29c9756
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/CodonUsageIndices.py
@@ -0,0 +1,14 @@
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+
+# Sharp E. coli index for the codon adaptation index,
+# from Sharp & Li, Nucleic Acids Res. 1987
+SharpEcoliIndex = {
+'GCA':0.586, 'GCC':0.122, 'GCG':0.424, 'GCT':1, 'AGA':0.004, 'AGG':0.002, 'CGA':0.004,
+'CGC':0.356, 'CGG':0.004, 'CGT':1, 'AAC':1, 'AAT':0.051, 'GAC':1, 'GAT':0.434, 'TGC':1,
+'TGT':0.5, 'CAA':0.124, 'CAG':1, 'GAA':1, 'GAG':0.259, 'GGA':0.01, 'GGC':0.724, 'GGG':0.019,
+'GGT':1, 'CAC':1, 'CAT':0.291, 'ATA':0.003, 'ATC':1, 'ATT':0.185, 'CTA':0.007, 'CTC':0.037,
+'CTG':1, 'CTT':0.042, 'TTA':0.02, 'TTG':0.02, 'AAA':1, 'AAG':0.253, 'ATG':1, 'TTC':1, 'TTT':0.296,
+'CCA':0.135, 'CCC':0.012, 'CCG':1, 'CCT':0.07, 'AGC':0.41, 'AGT':0.085, 'TCA':0.077, 'TCC':0.744,
+'TCG':0.017, 'TCT':1, 'ACA':0.076, 'ACC':1,'ACG':0.099, 'ACT':0.965, 'TGG':1, 'TAC':1, 'TAT':0.239,
+'GTA':0.495, 'GTC':0.066,'GTG':0.221, 'GTT':1}
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/IsoelectricPoint.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/IsoelectricPoint.py
new file mode 100644
index 0000000..d53ee46
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/IsoelectricPoint.py
@@ -0,0 +1,114 @@
+# Copyright Yair Benita Y.Benita@pharm.uu.nl
+# Biopython (http://biopython.org) license applies
+
+"""Calculate isoelectric points of polypeptides using methods of Bjellqvist.
+
+pK values and the methods are taken from:
+
+* Bjellqvist, B., Hughes, G.J., Pasquali, Ch., Paquet, N., Ravier, F., Sanchez,
+J.-Ch., Frutiger, S. & Hochstrasser, D.F.
+The focusing positions of polypeptides in immobilized pH gradients can be predicted
+from their amino acid sequences. Electrophoresis 1993, 14, 1023-1031.
+
+* Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points correlate
+with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+
+I designed the algorithm according to a note by David L. Tabb, available at:
+http://fields.scripps.edu/DTASelect/20010710-pI-Algorithm.pdf
+
+"""
+
+positive_pKs = { 'Nterm': 7.5, 'K': 10.0, 'R': 12.0, 'H':5.98 }
+negative_pKs = { 'Cterm': 3.55, 'D': 4.05, 'E': 4.45, 'C':9.0, 'Y':10.0 }
+pKcterminal= {'D':4.55, 'E':4.75}
+pKnterminal = {'A':7.59, 'M':7.0, 'S':6.93, 'P':8.36, 'T':6.82, 'V':7.44, 'E':7.7}
+charged_aas = ('K', 'R', 'H', 'D', 'E', 'C', 'Y')
+
+# access this module through the ProtParam.ProteinAnalysis class.
+# first make a ProteinAnalysis object and then call its isoelectric_point method. +class IsoelectricPoint: + def __init__(self, ProteinSequence, AminoAcidsContent): + self.sequence = ProteinSequence + self.charged_aas_content = self._select_charged(AminoAcidsContent) + + # This function creates a dictionary with the contents of each charged aa, + # plus Cterm and Nterm. + def _select_charged(self, AminoAcidsContent): + charged = {} + for aa in charged_aas: + charged[aa] = float(AminoAcidsContent[aa]) + charged['Nterm'] = 1.0 + charged['Cterm'] = 1.0 + return charged + + #This function calculates the total charge of the protein at a given pH. + def _chargeR(self, pH, pos_pKs, neg_pKs): + PositiveCharge = 0.0 + for aa, pK in pos_pKs.iteritems(): + CR = 10**(pK-pH) + partial_charge = CR/(CR+1.0) + PositiveCharge += self.charged_aas_content[aa] * partial_charge + + NegativeCharge = 0.0 + for aa, pK in neg_pKs.iteritems(): + CR = 10**(pH-pK) + partial_charge = CR/(CR+1.0) + NegativeCharge += self.charged_aas_content[aa] * partial_charge + + return PositiveCharge - NegativeCharge + + # This is the action function, it tries different pH until the charge of the protein is 0 (or close). + def pi(self): + pos_pKs = dict(positive_pKs) + neg_pKs = dict(negative_pKs) + nterm = self.sequence[0] + cterm = self.sequence[-1] + if nterm in pKnterminal.keys(): + pos_pKs['Nterm'] = pKnterminal[nterm] + if cterm in pKcterminal.keys(): + neg_pKs['Cterm'] = pKcterminal[cterm] + + # Bracket between pH1 and pH2 + pH = 7.0 + Charge = self._chargeR(pH, pos_pKs, neg_pKs) + if Charge > 0.0: + pH1 = pH + Charge1 = Charge + while Charge1 > 0.0: + pH = pH1 + 1.0 + Charge = self._chargeR(pH, pos_pKs, neg_pKs) + if Charge > 0.0: + pH1 = pH + Charge1 = Charge + else: + pH2 = pH + Charge2 = Charge + break + else: + pH2 = pH + Charge2 = Charge + while Charge2 < 0.0: + pH = pH2 - 1.0 + Charge = self._chargeR(pH, pos_pKs, neg_pKs) + if Charge < 0.0: + pH2 = pH + Charge2 = Charge + else: + pH1 = pH + Charge1 = Charge + break + + # Bisection + while pH2 - pH1 > 0.0001 and Charge!=0.0: + pH = (pH1 + pH2) / 2.0 + Charge = self._chargeR(pH, pos_pKs, neg_pKs) + if Charge > 0.0: + pH1 = pH + Charge1 = Charge + else: + pH2 = pH + Charge2 = Charge + + return pH diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/MeltingTemp.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/MeltingTemp.py new file mode 100644 index 0000000..d734ac2 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/MeltingTemp.py @@ -0,0 +1,156 @@ +# Copyright 2004-2008 by Sebastian Bassi. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Calculate the thermodynamic melting temperatures of nucleotide sequences.""" + +import math +def Tm_staluc(s,dnac=50,saltc=50,rna=0): + """Returns DNA/DNA tm using nearest neighbor thermodynamics. + + dnac is DNA concentration [nM] + saltc is salt concentration [mM]. + rna=0 is for DNA/DNA (default), for RNA, rna should be 1. + + Sebastian Bassi """ + + #Credits: + #Main author: Sebastian Bassi + #Overcount function: Greg Singer + #Based on the work of Nicolas Le Novere Bioinformatics. + #17:1226-1227(2001) + + #This function returns better results than EMBOSS DAN because it uses + #updated thermodynamics values and takes into account inicialization + #parameters from the work of SantaLucia (1998). + + #Things to do: + #+Detect complementary sequences. 
Change K according to result.
+    #+Add support for heteroduplex (see Sugimoto et al. 1995).
+    #+Correction for Mg2+. Now supports only monovalent ions.
+    #+Put the thermodynamics table in an external file for users to change at will
+    #+Add support for dangling ends (see Le Novere 2001) and mismatches.
+
+    dh = 0 #DeltaH. Enthalpy
+    ds = 0 #deltaS Entropy
+
+    def tercorr(stri):
+        deltah = 0
+        deltas = 0
+        if rna==0:
+            #DNA/DNA
+            #Allawi and SantaLucia (1997). Biochemistry 36 : 10581-10594
+            if stri.startswith('G') or stri.startswith('C'):
+                deltah -= 0.1
+                deltas += 2.8
+            elif stri.startswith('A') or stri.startswith('T'):
+                deltah -= 2.3
+                deltas -= 4.1
+            if stri.endswith('G') or stri.endswith('C'):
+                deltah -= 0.1
+                deltas += 2.8
+            elif stri.endswith('A') or stri.endswith('T'):
+                deltah -= 2.3
+                deltas -= 4.1
+            dhL = dh + deltah
+            dsL = ds + deltas
+            return dsL,dhL
+        elif rna==1:
+            #RNA
+            if stri.startswith('G') or stri.startswith('C'):
+                deltah -= 3.61
+                deltas -= 1.5
+            elif stri.startswith('A') or stri.startswith('T') or \
+                 stri.startswith('U'):
+                deltah -= 3.72
+                deltas += 10.5
+            if stri.endswith('G') or stri.endswith('C'):
+                deltah -= 3.61
+                deltas -= 1.5
+            elif stri.endswith('A') or stri.endswith('T') or \
+                 stri.endswith('U'):
+                deltah -= 3.72
+                deltas += 10.5
+            dhL = dh + deltah
+            dsL = ds + deltas
+            # print "delta h=",dhL
+            return dsL,dhL
+
+    def overcount(st,p):
+        """Returns how many times p occurs in st, counting overlapping matches."""
+        ocu = 0
+        x = 0
+        while 1:
+            try:
+                i = st.index(p,x)
+            except ValueError:
+                break
+            ocu += 1
+            x = i + 1
+        return ocu
+
+    R = 1.987 # universal gas constant in Cal/degrees C*Mol
+    sup = s.upper()
+    vsTC,vh = tercorr(sup)
+    vs = vsTC
+
+    k = (dnac/4.0)*1e-9
+    #With complementary check on, the 4.0 should be changed to a variable.
+
+    if rna==0:
+        #DNA/DNA
+        #Allawi and SantaLucia (1997).
Biochemistry 36 : 10581-10594 + vh = vh + (overcount(sup,"AA"))*7.9 + (overcount(sup,"TT"))*\ + 7.9 + (overcount(sup,"AT"))*7.2 + (overcount(sup,"TA"))*7.2 \ + + (overcount(sup,"CA"))*8.5 + (overcount(sup,"TG"))*8.5 + \ + (overcount(sup,"GT"))*8.4 + (overcount(sup,"AC"))*8.4 + vh = vh + (overcount(sup,"CT"))*7.8+(overcount(sup,"AG"))*\ + 7.8 + (overcount(sup,"GA"))*8.2 + (overcount(sup,"TC"))*8.2 + vh = vh + (overcount(sup,"CG"))*10.6+(overcount(sup,"GC"))*\ + 9.8 + (overcount(sup,"GG"))*8 + (overcount(sup,"CC"))*8 + vs = vs + (overcount(sup,"AA"))*22.2+(overcount(sup,"TT"))*\ + 22.2 + (overcount(sup,"AT"))*20.4 + (overcount(sup,"TA"))*21.3 + vs = vs + (overcount(sup,"CA"))*22.7+(overcount(sup,"TG"))*\ + 22.7 + (overcount(sup,"GT"))*22.4 + (overcount(sup,"AC"))*22.4 + vs = vs + (overcount(sup,"CT"))*21.0+(overcount(sup,"AG"))*\ + 21.0 + (overcount(sup,"GA"))*22.2 + (overcount(sup,"TC"))*22.2 + vs = vs + (overcount(sup,"CG"))*27.2+(overcount(sup,"GC"))*\ + 24.4 + (overcount(sup,"GG"))*19.9 + (overcount(sup,"CC"))*19.9 + ds = vs + dh = vh + + else: + #RNA/RNA hybridisation of Xia et al (1998) + #Biochemistry 37: 14719-14735 + vh = vh+(overcount(sup,"AA"))*6.82+(overcount(sup,"TT"))*6.6+\ + (overcount(sup,"AT"))*9.38 + (overcount(sup,"TA"))*7.69+\ + (overcount(sup,"CA"))*10.44 + (overcount(sup,"TG"))*10.5+\ + (overcount(sup,"GT"))*11.4 + (overcount(sup,"AC"))*10.2 + vh = vh + (overcount(sup,"CT"))*10.48 + (overcount(sup,"AG"))\ + *7.6+(overcount(sup,"GA"))*12.44+(overcount(sup,"TC"))*13.3 + vh = vh + (overcount(sup,"CG"))*10.64 + (overcount(sup,"GC"))\ + *14.88+(overcount(sup,"GG"))*13.39+(overcount(sup,"CC"))*12.2 + vs = vs + (overcount(sup,"AA"))*19.0 + (overcount(sup,"TT"))*\ + 18.4+(overcount(sup,"AT"))*26.7+(overcount(sup,"TA"))*20.5 + vs = vs + (overcount(sup,"CA"))*26.9 + (overcount(sup,"TG"))*\ + 27.8 + (overcount(sup,"GT"))*29.5 + (overcount(sup,"AC"))*26.2 + vs = vs + (overcount(sup,"CT"))*27.1 + (overcount(sup,"AG"))*\ + 19.2 + (overcount(sup,"GA"))*32.5 + (overcount(sup,"TC"))*35.5 + vs = vs + (overcount(sup,"CG"))*26.7 + (overcount(sup,"GC"))\ + *36.9 + (overcount(sup,"GG"))*32.7 + (overcount(sup,"CC"))*29.7 + ds = vs + dh = vh + + ds = ds-0.368*(len(s)-1)*math.log(saltc/1e3) + tm = ((1000* (-dh))/(-ds+(R * (math.log(k)))))-273.15 + # print "ds="+str(ds) + # print "dh="+str(dh) + return tm + +if __name__ == "__main__" : + print "Quick self test" + assert Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA') == 59.865612727457972 + assert Tm_staluc('CAGTCAGTACGTACGTGTACTGCCGTA',rna=1) == 68.141611264576682 + print "Done" diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParam.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParam.py new file mode 100644 index 0000000..d0b9ae1 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParam.py @@ -0,0 +1,251 @@ +# Copyright Yair Benita Y.Benita@pharm.uu.nl +# Biopython (http://biopython.org) license applies + +import sys +import ProtParamData, IsoelectricPoint +from ProtParamData import kd # Added by Iddo to enable the gravy method +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from Bio.Data import IUPACData +#from BioModule import + +class ProteinAnalysis: + """ + This class contains methods for protein analysis. The class init method takes + only one argument, the protein sequence as a string and build a sequence + object using the Bio.Seq module. This is done just to make sure the sequence + is a protein sequence and not anything else. 
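+
+    A minimal interactive sketch (the fragment below is an arbitrary example
+    sequence chosen for illustration, not a real protein):
+
+    >>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
+    >>> X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGG")
+    >>> X.length
+    35
+    >>> X.count_amino_acids()['E']
+    3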
+
+    methods:
+
+    count_amino_acids:
+
+    Simply counts the number of times an amino acid is repeated in the protein
+    sequence. Returns a dictionary {AminoAcid:Number} and also stores the
+    dictionary in self.amino_acids_content.
+
+    get_amino_acids_percent:
+
+    The same as count_amino_acids, only it returns each count as a percentage of the
+    entire sequence. Returns a dictionary and stores the dictionary in
+    self.amino_acids_content_percent.
+
+    molecular_weight:
+    Calculates the molecular weight of a protein.
+
+    aromaticity:
+
+    Calculates the aromaticity value of a protein according to Lobry, 1994. It is
+    simply the relative frequency of Phe+Trp+Tyr.
+
+
+    instability_index:
+
+    Implementation of the method of Guruprasad et al. (Protein Engineering
+    4:155-161,1990). This method tests a protein for stability. Any value above 40
+    means the protein is unstable (=has a short half life).
+
+    flexibility:
+    Implementation of the flexibility method of Vihinen et al. (Proteins. 1994 Jun;19(2):141-9).
+
+    isoelectric_point:
+    This method uses the module IsoelectricPoint to calculate the pI of a protein.
+
+    secondary_structure_fraction:
+    This method returns a list of the fraction of amino acids which tend to be in Helix, Turn or Sheet.
+    Amino acids in helix: V, I, Y, F, W, L.
+    Amino acids in Turn: N, P, G, S.
+    Amino acids in sheet: E, M, A, L.
+    The list contains 3 values: [Helix, Turn, Sheet].
+
+
+    protein_scale(Scale, WindowSize, Edge):
+
+    An amino acid scale is defined by a numerical value assigned to each type of
+    amino acid. The most frequently used scales are the hydrophobicity or
+    hydrophilicity scales and the secondary structure conformational parameters
+    scales, but many other scales exist which are based on different chemical and
+    physical properties of the amino acids. You can set several parameters that
+    control the computation of a scale profile, such as the window size and the
+    window edge relative weight value. WindowSize: The window size is the length
+    of the interval to use for the profile computation. For a window size n, we
+    use the (n-1)/2 neighboring residues on each side of residue i to compute
+    the score for residue i. The score for residue i is the sum of the scale values
+    for these amino acids, optionally weighted according to their position in the
+    window. Edge: The central amino acid of the window always has a weight of 1.
+    By default, the amino acids at the remaining window positions have the same
+    weight, but you can make the residue at the center of the window have a
+    larger weight than the others by setting the edge value for the residues at
+    the beginning and end of the interval to a value between 0 and 1. For
+    instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
+    1.0, 0.7, 0.4. The method returns a list of values which can be plotted to
+    view the change along a protein sequence. Many scales exist. Just add your
+    favorites to the ProtParamData modules.
+    """
+    def __init__(self, ProtSequence):
+        if ProtSequence.islower():
+            self.sequence = Seq(ProtSequence.upper(), IUPAC.protein)
+        else:
+            self.sequence = Seq(ProtSequence, IUPAC.protein)
+        self.amino_acids_content = None
+        self.amino_acids_percent = None
+        self.length = len(self.sequence)
+
+    def count_amino_acids(self):
+        ProtDic = dict([ (k, 0) for k in IUPACData.protein_letters])
+        for i in ProtDic.keys():
+            ProtDic[i]=self.sequence.count(i)
+        self.amino_acids_content = ProtDic
+        return ProtDic
+
+    """Calculate the amino acid content in percents.
+ input is the dictionary from CountAA. + output is a dictionary with AA as keys.""" + def get_amino_acids_percent(self): + if not self.amino_acids_content: + self.count_amino_acids() + + PercentAA = {} + for i in self.amino_acids_content.keys(): + if self.amino_acids_content[i] > 0: + PercentAA[i]=self.amino_acids_content[i]/float(self.length) + else: + PercentAA[i] = 0 + self.amino_acids_percent = PercentAA + return PercentAA + + # Calculate MW from Protein sequence + # Calculate MW from Protein sequence + def molecular_weight (self): + # make local dictionary for speed + MwDict = {} + # remove a molecule of water from the amino acid weight. + for i in IUPACData.protein_weights.keys(): + MwDict[i] = IUPACData.protein_weights[i] - 18.02 + MW = 18.02 # add just one water molecule for the whole sequence. + for i in self.sequence: + MW += MwDict[i] + return MW + + # calculate the aromaticity according to Lobry, 1994. + # Arom=sum of relative frequency of Phe+Trp+Tyr + def aromaticity(self): + if not self.amino_acids_percent: + self.get_amino_acids_percent() + + Arom= self.amino_acids_percent['Y']+self.amino_acids_percent['W']+self.amino_acids_percent['F'] + return Arom + + # a function to calculate the instability index according to: + # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990). + def instability_index(self): + #make the dictionary local for speed. + DIWV=ProtParamData.DIWV.copy() + score=0.0 + for i in range(self.length - 1): + DiPeptide=DIWV[self.sequence[i]][self.sequence[i+1]] + score += DiPeptide + return (10.0/self.length) * score + + # Calculate the flexibility according to Vihinen, 1994. + # No argument to change window size because parameters are specific for a window=9. + # the parameters used are optimized for determining the flexibility. + def flexibility(self): + Flex = ProtParamData.Flex.copy() + Window=9 + Weights=[0.25,0.4375,0.625,0.8125,1] + List=[] + for i in range(self.length - Window): + SubSeq=self.sequence[i:i+Window] + score = 0.0 + for j in range(Window/2): + score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j] + score += Flex[SubSeq[Window/2+1]] + List.append(score/5.25) + return List + + # calculate the gravy according to kyte and doolittle. + def gravy(self): + ProtGravy=0.0 + for i in self.sequence: + ProtGravy += kd[i] + + return ProtGravy/self.length + + # this method is used to make a list of relative weight of the + # window edges compared to the window center. The weights are linear. + # it actually generates half a list. For a window of size 9 and edge 0.4 + # you get a list of [0.4, 0.55, 0.7, 0.85]. + def _weight_list(self, window, edge): + unit = ((1.0-edge)/(window-1))*2 + list = [0.0]*(window/2) + for i in range(window/2): + list[i] = edge + unit * i + return list + + # this method allows you to compute and represent the profile produced + # by any amino acid scale on a selected protein. + # Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl + # The weight list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] + # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done + # in the loop. 
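+    # Worked example: for Window=9 and Edge=0.4 the one-sided list is
+    # [0.4, 0.55, 0.7, 0.85], so sum_of_weights = 2*(0.4+0.55+0.7+0.85)+1 = 6.0
+    # and each reported score is the weighted window sum divided by 6.0.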
+ def protein_scale(self, ParamDict, Window, Edge=1.0): + # generate the weights + weight = self._weight_list(Window,Edge) + list = [] + # the score in each Window is divided by the sum of weights + sum_of_weights = 0.0 + for i in weight: sum_of_weights += i + # since the weight list is one sided: + sum_of_weights = sum_of_weights*2+1 + + for i in range(self.length-Window+1): + subsequence = self.sequence[i:i+Window] + score = 0.0 + for j in range(Window/2): + # walk from the outside of the Window towards the middle. + # Iddo: try/except clauses added to avoid raising an exception on a non-standad amino acid + try: + score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]] + except KeyError: + sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % + (subsequence[j],subsequence[Window-j-1])) + + # Now add the middle value, which always has a weight of 1. + if subsequence[Window/2] in ParamDict: + score += ParamDict[subsequence[Window/2]] + else: + sys.stderr.write('warning: %s is not a standard amino acid.\n' % (subsequence[Window/2])) + + list.append(score/sum_of_weights) + return list + + # calculate the isoelectric point. + def isoelectric_point(self): + if not self.amino_acids_content: + self.count_amino_acids() + X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content) + return X.pi() + + # calculate fraction of helix, turn and sheet + def secondary_structure_fraction (self): + if not self.amino_acids_percent: + self.get_amino_acids_percent() + Helix = self.amino_acids_percent['V'] + self.amino_acids_percent['I'] + self.amino_acids_percent['Y'] + self.amino_acids_percent['F'] + self.amino_acids_percent['W'] + self.amino_acids_percent['L'] + Turn = self.amino_acids_percent['N'] + self.amino_acids_percent['P'] + self.amino_acids_percent['G'] + self.amino_acids_percent['S'] + Sheet = self.amino_acids_percent['E'] + self.amino_acids_percent['M'] + self.amino_acids_percent['A'] + self.amino_acids_percent['L'] + return Helix, Turn, Sheet + +#---------------------------------------------------------# +""" +X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") +print X.count_amino_acids() +print X.get_amino_acids_percent() +print X.molecular_weight() +print X.aromaticity() +print X.instability_index() +print X.flexibility() +print X.pi() +print X.secondary_structure_fraction() +print X.protein_scale(ProtParamData.kd, 9, 0.4) +""" diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParamData.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParamData.py new file mode 100644 index 0000000..6a52e8f --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/ProtParamData.py @@ -0,0 +1,43 @@ +# This module contains indices to be used with ProtParam + +# Kyte & Doolittle index of hydrophobicity +kd = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5, + 'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5, + 'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6, + 'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 } + +# Flexibility +# Normalized flexibility parameters (B-values), average (Vihinen et al., 1994) +Flex= {'A': 0.984, 'C': 0.906, 'E': 1.094, 'D': 1.068, + 'G': 1.031, 'F': 0.915, 'I': 0.927, 'H': 0.950, + 'K': 1.102, 'M': 0.952, 'L': 0.935, 'N': 1.048, + 'Q': 1.037, 'P': 1.049, 'S': 1.046, 'R': 1.008, + 'T': 0.997, 'W': 0.904, 'V': 0.931, 'Y': 0.929} + +# Hydrophilicity +# 1 Hopp & 
Wood +# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981). +hw = { 'A':-0.5,'R':3.0, 'N':0.2, 'D':3.0, 'C':-1.0, + 'Q':0.2, 'E':3.0, 'G':0.0, 'H':-0.5,'I':-1.8, + 'L':-1.8,'K':3.0, 'M':-1.3,'F':-2.5,'P':0.0, + 'S':0.3, 'T':-0.4,'W':-3.4,'Y':-2.3,'V':-1.5 } + +# Surface accessibility +# 1 Emini Surface fractional probability +em = { 'A':0.815,'R':1.475,'N':1.296,'D':1.283,'C':0.394, + 'Q':1.348,'E':1.445,'G':0.714,'H':1.180,'I':0.603, + 'L':0.603,'K':1.545,'M':0.714,'F':0.695,'P':1.236, + 'S':1.115,'T':1.184,'W':0.808,'Y':1.089,'V':0.606 } + +# 2 Janin Interior to surface transfer energy scale +ja = { 'A': 0.28,'R':-1.14,'N':-0.55,'D':-0.52,'C': 0.97, + 'Q':-0.69,'E':-1.01,'G': 0.43,'H':-0.31,'I': 0.60, + 'L': 0.60,'K':-1.62,'M': 0.43,'F': 0.46,'P':-0.42, + 'S':-0.19,'T':-0.32,'W': 0.29,'Y':-0.15,'V': 0.60 } + + +# A two dimentional dictionary for calculating the instability index. +# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990). +# It is based on dipeptide values therefore the vale for the dipeptide DG is DIWV['D']['G']. +# I know this looks ugly but i can't think of a better way to display it. +DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0},'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60, 'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0, 'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0},'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26, 'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0, 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, 'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54, 'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, 'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0, 'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49}, 'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601}, 'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34, 'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0, 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, 'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0, 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68, 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94}, 'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0, 'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0, 'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60, 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, 'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28, 'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0, 'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54, 'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68}, 'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26, 'T': 1.0, 
'W': 24.68, 'V': 1.0, 'Y': 1.0}, 'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0, 'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0, 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0}, 'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26, 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0, 'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54}, 'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54, 'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54, 'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0}, 'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26, 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, 'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28, 'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54}, 'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0, 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, 'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68, 'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, 'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, 'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0, 'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, 'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54}, 'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34, 'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91, 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34}} \ No newline at end of file diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/__init__.py new file mode 100644 index 0000000..d606ed7 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/__init__.py @@ -0,0 +1,570 @@ +#!/usr/bin/env python +# Created: Wed May 29 08:07:18 2002 +# thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se +# Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Miscellaneous functions for dealing with sequences.""" + +import re, time +from Bio import SeqIO +from Bio import Translate +from Bio.Seq import Seq +from Bio import Alphabet +from Bio.Alphabet import IUPAC +from Bio.Data import IUPACData, CodonTable + + +###################################### +# DNA +###################### +# {{{ + +def reverse(seq): + """Reverse the sequence. Works on string sequences. + + e.g. + >>> reverse("ACGGT") + 'TGGCA' + + """ + r = list(seq) + r.reverse() + return ''.join(r) + +def GC(seq): + """Calculates G+C content, returns the percentage (float between 0 and 100). + + Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) + when counting the G and C content. 
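+    (S counts fully towards G+C; all other ambiguity codes, including N,
+    are treated as non-GC but still contribute to the length.)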
The percentage is calculated against + the full length, e.g.: + + >>> from Bio.SeqUtils import GC + >>> GC("ACTGN") + 40.0 + + Note that this will return zero for an empty sequence. + """ + try : + gc = sum(map(seq.count,['G','C','g','c','S','s'])) + return gc*100.0/len(seq) + except ZeroDivisionError : + return 0.0 + + +def GC123(seq): + """Calculates total G+C content plus first, second and third positions. + + Returns a tuple of four floats (percentages between 0 and 100) for the + entire sequence, and the three codon positions. e.g. + + >>> from Bio.SeqUtils import GC123 + >>> GC123("ACTGTN") + (40.0, 50.0, 50.0, 0.0) + + Copes with mixed case sequences, but does NOT deal with ambiguous + nucleotides. + """ + d= {} + for nt in ['A','T','G','C']: + d[nt] = [0,0,0] + + for i in range(0,len(seq),3): + codon = seq[i:i+3] + if len(codon) <3: codon += ' ' + for pos in range(0,3): + for nt in ['A','T','G','C']: + if codon[pos] == nt or codon[pos] == nt.lower(): + d[nt][pos] += 1 + gc = {} + gcall = 0 + nall = 0 + for i in range(0,3): + try: + n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] + gc[i] = (d['G'][i] + d['C'][i])*100.0/n + except: + gc[i] = 0 + + gcall = gcall + d['G'][i] + d['C'][i] + nall = nall + n + + gcall = 100.0*gcall/nall + return gcall, gc[0], gc[1], gc[2] + +def GC_skew(seq, window = 100): + """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. + + Returns a list of ratios (floats), controlled by the length of the sequence + and the size of the window. + + Does NOT look at any ambiguous nucleotides. + """ + # 8/19/03: Iddo: added lowercase + values = [] + for i in range(0, len(seq), window): + s = seq[i: i + window] + g = s.count('G') + s.count('g') + c = s.count('C') + s.count('c') + skew = (g-c)/float(g+c) + values.append(skew) + return values + +from math import pi, sin, cos, log +def xGC_skew(seq, window = 1000, zoom = 100, + r = 300, px = 100, py = 100): + """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" + from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ + VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y + yscroll = Scrollbar(orient = VERTICAL) + xscroll = Scrollbar(orient = HORIZONTAL) + canvas = Canvas(yscrollcommand = yscroll.set, + xscrollcommand = xscroll.set, background = 'white') + win = canvas.winfo_toplevel() + win.geometry('700x700') + + yscroll.config(command = canvas.yview) + xscroll.config(command = canvas.xview) + yscroll.pack(side = RIGHT, fill = Y) + xscroll.pack(side = BOTTOM, fill = X) + canvas.pack(fill=BOTH, side = LEFT, expand = 1) + canvas.update() + + X0, Y0 = r + px, r + py + x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r + + ty = Y0 + canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) + ty +=20 + canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) + ty +=20 + canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') + ty +=20 + canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') + ty +=20 + canvas.create_oval(x1,y1, x2, y2) + + acc = 0 + start = 0 + for gc in GC_skew(seq, window): + r1 = r + acc+=gc + # GC skew + alpha = pi - (2*pi*start)/len(seq) + r2 = r1 - gc*zoom + x1 = X0 + r1 * sin(alpha) + y1 = Y0 + r1 * cos(alpha) + x2 = X0 + r2 * sin(alpha) + y2 = Y0 + r2 * cos(alpha) + canvas.create_line(x1,y1,x2,y2, fill = 'blue') + # accumulated GC skew + r1 = r - 50 + r2 = r1 - acc + x1 = X0 + r1 * sin(alpha) + y1 = Y0 + r1 * cos(alpha) + x2 = X0 + r2 * sin(alpha) + y2 = Y0 + r2 * cos(alpha) + canvas.create_line(x1,y1,x2,y2, fill = 
'magenta') + + canvas.update() + start += window + + canvas.configure(scrollregion = canvas.bbox(ALL)) + +def molecular_weight(seq): + """Calculate the molecular weight of a DNA sequence.""" + if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) + weight_table = IUPACData.unambiguous_dna_weights + #TODO, use a generator expession once we drop Python 2.3? + #e.g. return sum(weight_table[x] for x in seq) + total = 0 + for x in seq: + total += weight_table[x] + return total + +def nt_search(seq, subseq): + """Search for a DNA subseq in sequence. + + use ambiguous values (like N = A or T or C or G, R = A or G etc.) + searches only on forward strand + """ + pattern = '' + for nt in subseq: + value = IUPACData.ambiguous_dna_values[nt] + if len(value) == 1: + pattern += value + else: + pattern += '[%s]' % value + + pos = -1 + result = [pattern] + l = len(seq) + while True: + pos+=1 + s = seq[pos:] + m = re.search(pattern, s) + if not m: break + pos += int(m.start(0)) + result.append(pos) + return result + +# }}} + +###################################### +# Protein +###################### +# {{{ + +# temporary hack for exception free translation of "dirty" DNA +# should be moved to ??? + +class ProteinX(Alphabet.ProteinAlphabet): + letters = IUPACData.extended_protein_letters + "X" + +proteinX = ProteinX() + +class MissingTable: + def __init__(self, table): + self._table = table + def get(self, codon, stop_symbol): + try: + return self._table.get(codon, stop_symbol) + except CodonTable.TranslationError: + return 'X' + +def makeTableX(table): + assert table.protein_alphabet == IUPAC.extended_protein + return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, + MissingTable(table.forward_table), + table.back_table, table.start_codons, + table.stop_codons) + +# end of hacks + +def seq3(seq): + """Turn a one letter code protein sequence into one with three letter codes. + + The single input argument 'seq' should be a protein sequence using single + letter codes, either as a python string or as a Seq or MutableSeq object. + + This function returns the amino acid sequence as a string using the three + letter amino acid codes. Output follows the IUPAC standard (including + ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U + for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown + character (including possible gap characters), is changed into 'Xaa'. + + e.g. + >>> from Bio.SeqUtils import seq3 + >>> seq3("MAIVMGRWKGAR*") + 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' + + This function was inspired by BioPerl's seq3. + """ + threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', + 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', + 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', + 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', + 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', + 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', + 'U':'Sel', 'O':'Pyl', 'J':'Xle', + } + #We use a default of 'Xaa' for undefined letters + #Note this will map '-' to 'Xaa' which may be undesirable! + return ''.join([threecode.get(aa,'Xaa') for aa in seq]) + + +# }}} + +###################################### +# Mixed ??? +###################### +# {{{ + +def translate(seq, frame = 1, genetic_code = 1, translator = None): + """Translation of DNA in one of the six different reading frames (DEPRECATED). 
+ + Use the Bio.Seq.Translate function, or the Seq object's translate method + instead: + + >>> from Bio.Seq import Seq + >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") + >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA") + >>> for frame in [0,1,2] : + ... print my_seq[frame:].translate() + ... + MAIVMGR*KGAR* + WPL*WAAERVPDS + GHCNGPLKGCPIV + >>> for frame in [0,1,2] : + ... print my_seq.reverse_complement()[frame:].translate() + ... + YYRAPFQRPITMA + TIGHPFSGPLQWP + LSGTLSAAHYNGH + """ + import warnings + warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \ + +" to remove it in a future release of Biopython. Please use"\ + +" the method or function in Bio.Seq instead, as described in"\ + +" the Tutorial.", DeprecationWarning) + + if frame not in [1,2,3,-1,-2,-3]: + raise ValueError('invalid frame') + + if not translator: + table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) + translator = Translate.Translator(table) + + #Does this frame calculation do something sensible? No RC taken! + return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data + +def GC_Frame(seq, genetic_code = 1): + """Just an alias for six_frame_translations (OBSOLETE). + + Use six_frame_translation directly, as this function may be deprecated + in a future release.""" + return six_frame_translations(seq, genetic_code) + +def six_frame_translations(seq, genetic_code = 1): + """Formatted string showing the 6 frame translations and GC content. + + nice looking 6 frame translation with GC content - code from xbbtools + similar to DNA Striders six-frame translation + + e.g. + from Bio.SeqUtils import six_frame_translations + print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") + """ + from Bio.Seq import reverse_complement, translate + anti = reverse_complement(seq) + comp = anti[::-1] + length = len(seq) + frames = {} + for i in range(0,3): + frames[i+1] = translate(seq[i:], genetic_code) + frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) + + # create header + if length > 20: + short = '%s ... %s' % (seq[:10], seq[-10:]) + else: + short = seq + #TODO? Remove the date as this would spoil any unit test... + date = time.strftime('%y %b %d, %X', time.localtime(time.time())) + header = 'GC_Frame: %s, ' % date + for nt in ['a','t','g','c']: + header += '%s:%d ' % (nt, seq.count(nt.upper())) + + header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) + res = header + + for i in range(0,length,60): + subseq = seq[i:i+60] + csubseq = comp[i:i+60] + p = i/3 + res = res + '%d/%d\n' % (i+1, i/3+1) + res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' + res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' + res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' + # seq + res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) + res = res + csubseq.lower() + '\n' + # - frames + res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' + res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' + res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' + return res + +# }}} + +###################################### +# FASTA file utilities +###################### +# {{{ + +def fasta_uniqids(file): + """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). + + file - a FASTA format filename to read in. + + No return value, the output is written to screen. 
+ """ + dict = {} + txt = open(file).read() + entries = [] + for entry in txt.split('>')[1:]: + name, seq= entry.split('\n',1) + name = name.split()[0].split(',')[0] + + if name in dict: + n = 1 + while 1: + n = n + 1 + _name = name + str(n) + if _name not in dict: + name = _name + break + + dict[name] = seq + + for name, seq in dict.items(): + print '>%s\n%s' % (name, seq) + +def quick_FASTA_reader(file): + """Simple FASTA reader, returning a list of string tuples. + + The single argument 'file' should be the filename of a FASTA format file. + This function will open and read in the entire file, constructing a list + of all the records, each held as a tuple of strings (the sequence name or + title, and its sequence). + + This function was originally intended for use on large files, where its + low overhead makes it very fast. However, because it returns the data as + a single in memory list, this can require a lot of RAM on large files. + + You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which + allows you to iterate over the records one by one (avoiding having all the + records in memory at once). Using Bio.SeqIO also makes it easy to switch + between different input file formats. However, please note that rather + than simple strings, Bio.SeqIO uses SeqRecord objects for each record. + """ + #Want to split on "\n>" not just ">" in case there are any extra ">" + #in the name/description. So, in order to make sure we also split on + #the first entry, prepend a "\n" to the start of the file. + handle = open(file) + txt = "\n" + handle.read() + handle.close() + entries = [] + for entry in txt.split('\n>')[1:]: + name,seq= entry.split('\n',1) + seq = seq.replace('\n','').replace(' ','').upper() + entries.append((name, seq)) + return entries + +def apply_on_multi_fasta(file, function, *args): + """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). + + file - filename of a FASTA format file + function - the function you wish to invoke on each record + *args - any extra arguments you want passed to the function + + This function will iterate over each record in a FASTA file as SeqRecord + objects, calling your function with the record (and supplied args) as + arguments. + + This function returns a list. For those records where your function + returns a value, this is taken as a sequence and used to construct a + FASTA format string. If your function never has a return value, this + means apply_on_multi_fasta will return an empty list. + """ + try: + f = globals()[function] + except: + raise NotImplementedError("%s not implemented" % function) + + handle = open(file, 'r') + records = SeqIO.parse(handle, "fasta") + results = [] + for record in records: + arguments = [record.sequence] + for arg in args: arguments.append(arg) + result = f(*arguments) + if result: + results.append('>%s\n%s' % (record.name, result)) + handle.close() + return results + +def quicker_apply_on_multi_fasta(file, function, *args): + """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). + + file - filename of a FASTA format file + function - the function you wish to invoke on each record + *args - any extra arguments you want passed to the function + + This function will use quick_FASTA_reader to load every record in the + FASTA file into memory as a list of tuples. For each record, it will + call your supplied function with the record as a tuple of the name and + sequence as strings (plus any supplied args). + + This function returns a list. 
For those records where your function
+    returns a value, this is taken as a sequence and used to construct a
+    FASTA format string. If your function never has a return value, this
+    means quicker_apply_on_multi_fasta will return an empty list.
+    """
+    try:
+        f = globals()[function]
+    except:
+        raise NotImplementedError("%s not implemented" % function)
+
+    # quick_FASTA_reader opens and closes the file itself, so there is no
+    # handle to manage here.
+    entries = quick_FASTA_reader(file)
+    results = []
+    for name, seq in entries:
+        arguments = [seq]
+        for arg in args: arguments.append(arg)
+        result = f(*arguments)
+        if result:
+            results.append('>%s\n%s' % (name, result))
+    return results
+
+# }}}
+
+######################################
+# Main
+#####################
+# {{{
+
+if __name__ == '__main__':
+    import sys, getopt
+    # crude command line options to use most functions directly on a FASTA file
+    options = {'apply_on_multi_fasta':0,
+               'quick':0,
+               'uniq_ids':0,
+              }
+
+    optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
+        'help', 'quick', 'uniq_ids', 'search='])
+    for arg in optlist:
+        if arg[0] in ['-h', '--help']:
+            pass
+        elif arg[0] in ['--describe']:
+            # get all new functions from this file
+            mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
+            mol_funcs.sort()
+            print 'available functions:'
+            for f in mol_funcs: print '\t--%s' % f
+            print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
+
+            sys.exit(0)
+        elif arg[0] in ['--apply_on_multi_fasta']:
+            options['apply_on_multi_fasta'] = arg[1]
+        elif arg[0] in ['--search']:
+            options['search'] = arg[1]
+        else:
+            key = re.search('-*(.+)', arg[0]).group(1)
+            options[key] = 1
+
+
+    if options.get('apply_on_multi_fasta'):
+        file = args[0]
+        function = options['apply_on_multi_fasta']
+        arguments = []
+        if options.get('search'):
+            arguments = options['search']
+        if function == 'xGC_skew':
+            arguments = 1000
+        if options.get('quick'):
+            results = quicker_apply_on_multi_fasta(file, function, arguments)
+        else:
+            results = apply_on_multi_fasta(file, function, arguments)
+        for result in results: print result
+
+    elif options.get('uniq_ids'):
+        file = args[0]
+        fasta_uniqids(file)
+
+# }}}
+
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/lcc.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/lcc.py
new file mode 100644
index 0000000..1fb71a2
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/lcc.py
@@ -0,0 +1,162 @@
+# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
+# All rights reserved. This code is part of the Biopython
+# distribution and governed by its license.
+# Please see the LICENSE file that should have been included as part
+# of this package.
+
+import math
+
+def lcc_mult(seq,wsize):
+    """Local Composition Complexity (LCC) values over a sliding window.
+
+    Returns a list of floats, the LCC values for a sliding window over
+    the sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+    wsize - window size, integer
+
+    The result is the same as applying lcc_simp multiple times, but this
+    version is optimized for speed.
+    value of the previous window as a base to compute the next one."""
+    l2 = math.log(2)
+    tamseq = len(seq)
+    try:
+        # Assume it's a string
+        upper = seq.upper()
+    except AttributeError:
+        # Should be a Seq object then
+        upper = seq.tostring().upper()
+    compone = [0]
+    lccsal = [0]
+    for i in range(wsize):
+        compone.append(((i + 1) / float(wsize)) *
+                       ((math.log((i + 1) / float(wsize))) / l2))
+    window = upper[0:wsize]
+    cant_a = window.count('A')
+    cant_c = window.count('C')
+    cant_t = window.count('T')
+    cant_g = window.count('G')
+    term_a = compone[cant_a]
+    term_c = compone[cant_c]
+    term_t = compone[cant_t]
+    term_g = compone[cant_g]
+    lccsal.append(-(term_a + term_c + term_t + term_g))
+    tail = upper[0]
+    for x in range(tamseq - wsize):
+        window = upper[x + 1:wsize + x + 1]
+        if tail == window[-1]:
+            # The dropped and added bases match, so the composition
+            # (and hence the LCC value) is unchanged.
+            lccsal.append(lccsal[-1])
+        elif tail == 'A':
+            cant_a = cant_a - 1
+            if window.endswith('C'):
+                cant_c = cant_c + 1
+                term_a = compone[cant_a]
+                term_c = compone[cant_c]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('T'):
+                cant_t = cant_t + 1
+                term_a = compone[cant_a]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('G'):
+                cant_g = cant_g + 1
+                term_a = compone[cant_a]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == 'C':
+            cant_c = cant_c - 1
+            if window.endswith('A'):
+                cant_a = cant_a + 1
+                term_a = compone[cant_a]
+                term_c = compone[cant_c]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('T'):
+                cant_t = cant_t + 1
+                term_c = compone[cant_c]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('G'):
+                cant_g = cant_g + 1
+                term_c = compone[cant_c]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == 'T':
+            cant_t = cant_t - 1
+            if window.endswith('A'):
+                cant_a = cant_a + 1
+                term_a = compone[cant_a]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('C'):
+                cant_c = cant_c + 1
+                term_c = compone[cant_c]
+                term_t = compone[cant_t]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('G'):
+                cant_g = cant_g + 1
+                term_t = compone[cant_t]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        elif tail == 'G':
+            cant_g = cant_g - 1
+            if window.endswith('A'):
+                cant_a = cant_a + 1
+                term_a = compone[cant_a]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('C'):
+                cant_c = cant_c + 1
+                term_c = compone[cant_c]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+            elif window.endswith('T'):
+                cant_t = cant_t + 1
+                term_t = compone[cant_t]
+                term_g = compone[cant_g]
+                lccsal.append(-(term_a + term_c + term_t + term_g))
+        tail = window[0]
+    return lccsal
+
+def lcc_simp(seq):
+    """Local Composition Complexity (LCC) for a sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+
+    Returns the Local Composition Complexity (LCC) value for the entire
+    sequence (as a float).
+
+    Reference:
+    Andrzej K Konopka (2005) Sequence Complexity and Composition.
+    DOI: 10.1038/npg.els.0005260
+    """
+    wsize = len(seq)
+    try:
+        # Assume it's a string
+        upper = seq.upper()
+    except AttributeError:
+        # Should be a Seq object then
+        upper = seq.tostring().upper()
+    l2 = math.log(2)
+    # Check each base separately to avoid calculating the log of 0.
+    if 'A' not in upper:
+        term_a = 0
+    else:
+        term_a = ((upper.count('A')) / float(wsize)) * \
+                 ((math.log((upper.count('A')) / float(wsize))) / l2)
+    if 'C' not in upper:
+        term_c = 0
+    else:
+        term_c = ((upper.count('C')) / float(wsize)) * \
+                 ((math.log((upper.count('C')) / float(wsize))) / l2)
+    if 'T' not in upper:
+        term_t = 0
+    else:
+        term_t = ((upper.count('T')) / float(wsize)) * \
+                 ((math.log((upper.count('T')) / float(wsize))) / l2)
+    if 'G' not in upper:
+        term_g = 0
+    else:
+        term_g = ((upper.count('G')) / float(wsize)) * \
+                 ((math.log((upper.count('G')) / float(wsize))) / l2)
+    lccsal = -(term_a + term_c + term_t + term_g)
+    return lccsal
diff --git a/binaries/src/globplot/biopython-1.50/Bio/Std.py b/binaries/src/globplot/biopython-1.50/Bio/Std.py
new file mode 100644
index 0000000..ba44af1
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/Std.py
@@ -0,0 +1,503 @@
+# This is a Python module.
+"""This module is DEPRECATED.
+
+Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules
+and associated ones like Bio.Std are now deprecated.  They are no longer
+used in any of the current Biopython parsers, and are likely to be removed
+in a future release.
+"""
+
+import warnings
+warnings.warn("Martel and those parts of Biopython depending on it"
+              + " directly (such as Bio.Mindy and Bio.Std) are now"
+              + " deprecated, and will be removed in a future release of"
+              + " Biopython.  If you want to continue to use this code,"
+              + " please get in contact with the Biopython developers via"
+              + " the mailing lists to avoid its permanent removal from"
+              + " Biopython.",
+              DeprecationWarning)
+
+# Standard Bioformats definitions
+
+import Martel
+Group = Martel.Group
+
+namespace = "bioformat"
+NS = namespace + ":"
+XMLNS = "http://biopython.org/bioformat"
+
+def _set_if_given(attrs, field, d, valid = None, convert = None):
+    value = attrs.get(field)
+    if value is not None:
+        if valid is not None:
+            if value not in valid:
+                raise TypeError("%s (%r) must be one of %s" % \
+                                (field, value, valid))
+        if convert is None:
+            d[field] = value
+        else:
+            d[field] = convert(value)
+
+def _complain_if_given(attrs, name):
+    if attrs.has_key(name) and attrs[name] is not None:
+        raise NotImplementedError("Don't yet handle %r" % (name,))
+
+def _must_have(expr, f):
+    tag = f.tag
+    if tag not in expr.group_names():
+        raise TypeError(
+            "group %r not present in the expression but is required" % \
+            (tag,))
+
+def _must_have_set(expr, sets):
+    names = expr.group_names()
+    for set in sets:
+        for f in set:
+            tag = f.tag
+            if tag not in names:
+                break
+        else:
+            return
+    if len(sets) == 1:
+        raise TypeError("missing required tags (need %s) in expression" %
+                        [f.tag for f in sets[0]])
+    lines = ["missing required tags in expression; must have one set from:"]
+    for set in sets:
+        lines.append(str([f.tag for f in set]))
+    s = "\n".join(lines)
+    raise TypeError(s)
+
+def _must_not_have(expr, f):
+    tag = f.tag
+    if tag in expr.group_names():
+        raise TypeError(
+            "group %r present in the expression but is not allowed" % \
+            (tag,))
+
+
+# Pre-Python 2.2 functions didn't allow attributes
+def _f():
+    pass
+try:
+    _f.x = 1
+    _use_hack = 0
+except AttributeError:
+    _use_hack = 1
+del _f
+
+def _check_name(f, text):
+    if text == "record":   # XXX FIXME
+        return
+    assert NS + f.func_name == text, (NS + f.func_name, text)
+
+def _check_attrs(attrs, names):
+    for name in attrs.keys():
+        if name not in names:
+            raise TypeError("attr %r is not allowed here (valid terms: %s)" % \
+                            (name, names))
+    d = attrs.copy()
+    for name in names:
+        if not d.has_key(name):
+            d[name] = None
+    return d
+
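+# A rough sketch of how the wrappers below are meant to be used (the
+# grammar here is hypothetical; only Std.dbid and _settag are real):
+#
+#     import Martel, Std
+#     word = Martel.Group("word", Martel.Re(r"\w+"))
+#     expr = Std.dbid(word, {"type": "primary"})
+#
+# Each wrapper returns a Martel Group named after its own .tag (set by
+# _settag below), which is what _must_have()/_must_not_have() look for
+# in expr.group_names().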
+if not _use_hack: + def _settag(f, tag): + _check_name(f, tag) + f.tag = tag +else: + # Convert the functions into callable objects + class StdTerm: + def __init__(self, func): + self._func = func + def __call__(self, *args, **kwargs): + return self._func( *args, **kwargs) + + def _settag(f, tag): + _check_name(f, tag) + x = globals()[f.func_name] = StdTerm(f) + x.tag = tag + +################ identifier, description, and cross-references +def record(expr, attrs = {}): + attrs = _check_attrs(attrs, ("format",)) + d = {"xmlns:bioformat": XMLNS} + _set_if_given(attrs, "format", d) + return Group("record", expr, d) # XXX FIXME +_settag(record, "record") # XXX AND FIXME + + +def dbid(expr, attrs = {}): + attrs = _check_attrs(attrs, ("type", "style", "dbname")) + d = {} + _set_if_given(attrs, "type", d, ("primary", "accession", "secondary")) + _set_if_given(attrs, "dbname", d) + return Group(NS + "dbid", expr, d) +_settag(dbid, NS + "dbid") + +def description_block(expr, attrs = {}): + attrs = _check_attrs(attrs, ("join",)) + _must_have(expr, description) + d = {} + _set_if_given(attrs, "join", d, ("english", "concat", "space", "newline")) + return Group(NS + "description_block", expr, d) +_settag(description_block, NS + "description_block") + +def description(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "description", expr) +_settag(description, NS + "description") + +def description_line(expr, attrs = {}): + return description_block(description(expr, attrs)) + +def fast_dbxref(expr, attrs = {}): + attrs = _check_attrs(attrs, ("style",)) + d = {} + _set_if_given(attrs, "style", d, ("sp-general", "sp-prosite", "sp-embl")) + return Group(NS + "fast_dbxref", expr, d) + +def dbxref(expr, attrs = {}): + attrs = _check_attrs(attrs, ("style",)) + _must_have(expr, dbxref_dbid) + d = {} + _complain_if_given(attrs, "style") + return Group(NS + "dbxref", expr, d) +_settag(dbxref, NS + "dbxref") + +def dbxref_dbname(expr, attrs = {}): + attrs = _check_attrs(attrs, ("style",)) + d = {} + _set_if_given(attrs, "style", d) + return Group(NS + "dbxref_dbname", expr, d) +_settag(dbxref_dbname, NS + "dbxref_dbname") + +def dbxref_dbid(expr, attrs = {}): + attrs = _check_attrs(attrs, ("dbname", "type", "style", "negate")) + d = {} + _set_if_given(attrs, "dbname", d) + _set_if_given(attrs, "type", d, ("primary", "accession", "secondary")) + _complain_if_given(attrs, "style") + _set_if_given(attrs, "negate", d, (0, 1), str) + + return Group(NS + "dbxref_dbid", expr, d) +_settag(dbxref_dbid, NS + "dbxref_dbid") + +def dbxref_negate(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "dbxref_negate", expr) +_settag(dbxref_negate, NS + "dbxref_negate") + +##################### sequences + +def _check_gapchar(s): + if not ( ord(" ") <= ord(s) <= 126 ): + raise TypeError("%r not allowed as a gap character" % (s,)) + return s + +# What about three letter codes? 
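+# A minimal sketch of the intended nesting (the sub-expressions are
+# hypothetical; only the Std wrappers are real):
+#
+#     seq_lines = Martel.Rep1(sequence(Martel.Re(r"[A-Z]+")) + newline)
+#     block = sequence_block(seq_lines, {"alphabet": "iupac-protein"})
+#
+# i.e. one or more sequence groups nest inside a single sequence_block,
+# which carries the block-wide attributes such as the alphabet and the
+# gap character.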
+def sequence_block(expr, attrs = {}): + attrs = _check_attrs(attrs, ("alphabet", "gapchar", "remove_spaces")) + _must_have(expr, sequence) + d = {} + _set_if_given(attrs, "alphabet", d, + ("iupac-protein", "iupac-dna", "iupac-rna", + "iupac-ambiguous-protein", + "iupac-ambiguous-dna", + "iupac-ambiguous-rna", + "protein", "dna", "rna", "unknown")) + _set_if_given(attrs, "gapchar", d, convert = _check_gapchar) + _set_if_given(attrs, "remove_spaces", d, (0, 1), str) + return Group(NS + "sequence_block", expr, d) +_settag(sequence_block, NS + "sequence_block") + +def sequence(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "sequence", expr) +_settag(sequence, NS + "sequence") + +def alphabet(expr, attrs = {}): + attrs = _check_attrs(attrs, ("alphabet",)) + d = {} + _set_if_given(attrs, "alphabet", d, + ("iupac-protein", "iupac-dna", "iupac-rna", + "iupac-ambiguous-protein", + "iupac-ambiguous-dna", + "iupac-ambiguous-rna", + "protein", "dna", "rna", "nucleotide", "unknown")) + return Group(NS + "alphabet", expr, d) +_settag(alphabet, NS + "alphabet") + + + +############################## features + +# In PIR + +# FEATURE +# 1-25 #domain signal sequence #status predicted #label SIG\ +# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status +# predicted #label MAT\ +# 63,209,297,365,522, +# 725 #binding_site carbohydrate (Asn) (covalent) #status +# predicted + +# The whole thing is a 'feature_block' + +# One 'feature' is +# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status +# predicted #label MAT\ + +# One 'feature_name' is "binding_site". + +# An example of the feature_location_block and feature_block, which I +# will abbreviate as 'flb' and 'fl', is: +# 63,209,297,365,522, +# 725 #binding_site carbohydrate ... + +# PIR doesn't have a 'feature_description' + +# Let: +# fq = feature_qualifier +# fqb = feature_qualifier +# fqn = feature_qualifier_name +# fqd = feature_qualifier_description +# then the text +# +# 26-737 #product procollagen-lysine 5-dioxygenase 2 #status +# predicted #label MAT\ +# +# can be represented as (the rather tedious) +# +# 26-737 #product procollagen-\ +# lysine 5-dioxygenase 2 #status +# predicted #label\ +# MAT\ +# + +# 'style' determines the namespace for the feature name +def feature_block(expr, attrs = {}): + attrs = _check_attrs(attrs, ("style", "location-style")) + d = {} + _set_if_given(attrs, "style", d) + _set_if_given(attrs, "location-style", d) + _must_have(expr, feature) + return Group(NS + "feature_block", expr, d) +_settag(feature_block, NS + "feature_block") + +def feature(expr, attrs = {}): + attrs = _check_attrs(attrs, ("location-style",)) + d = {} + _set_if_given(attrs, "location-style", d) + _must_have(expr, feature_name) + _must_have_set(expr, [[feature_location], + [feature_location_start, feature_location_end]]) + return Group(NS + "feature", expr, d) +_settag(feature, NS + "feature") + +def feature_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_name", expr) +_settag(feature_name, NS + "feature_name") + +def feature_location(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_location", expr) +_settag(feature_location, NS + "feature_location") + +def feature_location_start(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_location_start", expr) +_settag(feature_location_start, NS + "feature_location_start") + +def feature_location_end(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + 
"feature_location_end", expr) +_settag(feature_location_end, NS + "feature_location_end") + +def feature_description(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_description", expr) +_settag(feature_description, NS + "feature_description") + + +##def feature_qualifier_block(expr, attrs = {}): +## attrs = _check_attrs(attrs, ()) +## _must_have(expr, feature_qualifier) +## return Group(NS + "feature_qualifier_block", expr) +##_settag(feature_qualifier_block, NS + "feature_qualifier_block") + +def feature_qualifier(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + _must_have(expr, feature_qualifier_name) + return Group(NS + "feature_qualifier", expr) +_settag(feature_qualifier, NS + "feature_qualifier") + +def feature_qualifier_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_qualifier_name", expr) +_settag(feature_qualifier_name, NS + "feature_qualifier_name") + +def feature_qualifier_description(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group(NS + "feature_qualifier_description", expr) +_settag(feature_qualifier_description, NS + "feature_qualifier_description") + + +############ For homology searches + +# "BLASTN", "BLASTP" +def application_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ("app",)) + return Group("bioformat:application_name", expr, attrs) + +# "2.0.11", "2.0a19MP-WashU" +def application_version(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:application_version", expr, attrs) + +def search_header(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:search_header", expr, attrs) + +def search_table(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:search_table", expr, attrs) + +def search_table_description(expr, attrs = {}): + attrs = _check_attrs(attrs, ("bioformat:decode",)) + d = {"bioformat:decode": "strip"} + _set_if_given(attrs, "bioformat:decode", d) + return Group("bioformat:search_table_description", expr, d) + +def search_table_value(expr, attrs = {}): + attrs = _check_attrs(attrs, ("name", "bioformat:decode")) + return Group("bioformat:search_table_value", expr, attrs) + +def search_table_entry(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:search_table_entry", expr, attrs) + +def query_description_block(expr, attrs = {}): + attrs = _check_attrs(attrs, ("join-query",)) + d = {"join-query": "join|fixspaces"} + _set_if_given(attrs, "join-query", d) + return Group("bioformat:query_description_block", expr, d) + +def query_description(expr, attrs = {}): + attrs = _check_attrs(attrs, ("bioformat:decode")) + d = {} + _set_if_given(attrs, "bioformat:decode", d) + return Group("bioformat:query_description", expr, d) + +def query_size(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:query_size", expr) + +def database_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:database_name", expr, attrs) + +def database_num_sequences(expr, attrs = {}): + attrs = _check_attrs(attrs, ("bioformat:decode",)) + return Group("bioformat:database_num_sequences", expr, attrs) + +def database_num_letters(expr, attrs = {}): + attrs = _check_attrs(attrs, ("bioformat:decode",)) + return Group("bioformat:database_num_letters", expr, attrs) + +def hit(expr, attrs = {}): + attrs = _check_attrs(attrs, ("join-description",)) + d = {"join-description": "join|fixspaces"} + _set_if_given(attrs, "join-description", d) + 
return Group("bioformat:hit", expr, d) + +def hit_length(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hit_length", expr, attrs) + +def hit_description(expr, attrs = {}): + attrs = _check_attrs(attrs, ("bioformat:decode")) + d = {} + _set_if_given(attrs, "bioformat:decode", d) + return Group("bioformat:hit_description", expr, d) + +def hsp(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp", expr, attrs) + +def hsp_value(expr, attrs = {}): + attrs = _check_attrs(attrs, ("name", "bioformat:decode")) + return Group("bioformat:hsp_value", expr, attrs) + +def hsp_frame(expr, attrs = {}): + attrs = _check_attrs(attrs, ("which",)) + d = {} + _set_if_given(attrs, "which", d, valid = ("query", "homology", "subject")) + return Group("bioformat:hsp_frame", expr, d) + +def hsp_strand(expr, attrs = {}): + attrs = _check_attrs(attrs, ("strand", "which")) + d = {} + _set_if_given(attrs, "which", d, valid = ("query", "homology", "subject")) + _set_if_given(attrs, "strand", d, valid = ("+1", "0", "-1", "")) + return Group("bioformat:hsp_strand", expr, d) + +def hsp_seqalign_query_seq(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_query_seq", expr, attrs) + +def hsp_seqalign_homology_seq(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_homology_seq", expr, attrs) + +def hsp_seqalign_subject_seq(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_subject_seq", expr, attrs) + +def hsp_seqalign_query_leader(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_query_leader", expr, attrs) + + +def hsp_seqalign_query_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_query_name", expr, attrs) + +def hsp_seqalign_subject_name(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_subject_name", expr, attrs) + +def hsp_seqalign(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign", expr, attrs) + +def hsp_seqalign_query_start(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_query_start", expr, attrs) + +def hsp_seqalign_query_end(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_query_end", expr, attrs) + +def hsp_seqalign_subject_start(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_subject_start", expr, attrs) + +def hsp_seqalign_subject_end(expr, attrs = {}): + attrs = _check_attrs(attrs, ()) + return Group("bioformat:hsp_seqalign_subject_end", expr, attrs) + +def search_parameter(expr, attrs = {}): + attrs = _check_attrs(attrs, ("name", "bioformat:decode")) + d = {} + _set_if_given(attrs, "name", d) + _set_if_given(attrs, "bioformat:decode", d) + return Group("bioformat:search_parameter", expr, d) + +def search_statistic(expr, attrs = {}): + attrs = _check_attrs(attrs, ("name", "bioformat:decode")) + d = {} + _set_if_given(attrs, "name", d) + _set_if_given(attrs, "bioformat:decode", d) + return Group("bioformat:search_statistic", expr, d) + diff --git a/binaries/src/globplot/biopython-1.50/Bio/StdHandler.py b/binaries/src/globplot/biopython-1.50/Bio/StdHandler.py new file mode 100644 index 0000000..5807021 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/StdHandler.py @@ -0,0 +1,770 @@ +# Standard Content and Dispatch 
handlers for the Bioformat IO system +# This is a Python module. +"""This module is DEPRECATED. + +Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules +and associate ones like Bio.StdHandler are now deprecated. They are no longer +used in any of the current Biopython parsers, and are likely to be removed +in a future release. +""" + +import warnings +warnings.warn("Martel and those parts of Biopython depending on it" \ + +" directly (such as Bio.Mindy and Bio.StdHandler) are now" \ + +" deprecated, and will be removed in a future release of"\ + +" Biopython. If you want to continue to use this code,"\ + +" please get in contact with the Biopython developers via"\ + +" the mailing lists to avoid its permanent removal from"\ + +" Biopython.", \ + DeprecationWarning) + +from xml.sax import handler +from Martel import Parser, Dispatch +from Bio import Std, Decode + +################################### + +# Helper functions to make functions + +def add_int_handler(klass, tag, attrname): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + self.save_characters() + def end(self, tag): + self.%s = int(self.get_characters()) +""" % attrname + d = {} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + +def add_text_handler(klass, tag, attrname): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + self.save_characters() + def end(self, tag): + self.%s = self.get_characters() +""" % attrname + d = {} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + +def add_text_dict_handler(klass, tag, attrname, key): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + self.save_characters() + def end(self, tag): + self.%s["%s"] = self.get_characters() +""" % (attrname, key) + d = {} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + +def add_text_decode_handler(klass, tag, attrname): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + self.save_characters() + self._decode_%s = attrs.get("bioformat:decode", None) + def end(self, tag): + if self._decode_%s is not None: + s = Decode.make_decoder(self._decode_%s)(s) + self.%s = self.get_characters() +""" % (tag, tag, tag, attrname) + d = {"Decode": Decode} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + +def add_first_text_handler(klass, tag, attrname): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + if self.%s is None: + self.save_characters() + def end(self, tag): + if self.%s is None: + self.%s = self.get_characters() +""" % (attrname, attrname, attrname) + d = {} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + +def add_text_block_handler(klass, tag, joinattr, defaultjoin, attrname): + assert not hasattr(klass, "start_" + tag), 
"existing method exists" + assert not hasattr(klass, "end_" + tag), "existing method exists" + assert not hasattr(klass, "start_"+tag+"_block"), "existing method exists" + assert not hasattr(klass, "end_" +tag+"_block"), "existing method exists" + s = """if 1: + def start_block(self, tag, attrs): + self._%(tag)s_join_func = Decode.make_decoder(attrs.get(%(joinattr)r, %(defaultjoin)r)) + self._%(tag)s_lines = [] + def end_block(self, tag): + self.%(attrname)s = self._%(tag)s_join_func(self._%(tag)s_lines) + def start(self, tag, attrs): + self.save_characters() + def end(self, tag): + self._%(tag)s_lines.append(self.get_characters()) +""" % locals() + d = {"Decode": Decode} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + setattr(klass, "start_" + tag + "_block", d["start_block"]) + setattr(klass, "end_" + tag + "_block", d["end_block"]) + +def add_value_handler(klass, tag, attrname): + assert not hasattr(klass, "start_" +tag), "existing method exists" + assert not hasattr(klass, "end_" +tag), "existing method exists" + s = """if 1: + def start(self, tag, attrs): + self._%(tag)s_name = attrs["name"] + self._%(tag)s_decode = attrs.get("bioformat:decode", None) + self.save_characters() + def end(self, tag): + s = self.get_characters() + if self._%(tag)s_decode is not None: + s = Decode.make_decoder(self._%(tag)s_decode)(s) + self.%(attrname)s[self._%(tag)s_name] = s +""" % locals() + d = {"Decode": Decode} + exec s in d + setattr(klass, "start_" + tag, d["start"]) + setattr(klass, "end_" + tag, d["end"]) + + +################################# + +class ConvertHandler(handler.ContentHandler): + """Used to read records and produce output""" + def __init__(self, record_builder, writer, record_tag = "record"): + handler.ContentHandler.__init__(self) + self.record_builder = record_builder + self.writer = writer + self.record_tag = record_tag + + def startDocument(self): + self.inside_record = 0 + self.characters = self.ignore_characters + + def startElement(self, tag, attrs): + if self.inside_record: + self.record_builder.startElement(tag, attrs) + elif tag == self.record_tag: + self.record_builder.startDocument() + self.inside_record = 1 + self.characters = self.record_builder.characters + self.record_builder.startElement(tag, attrs) + + def endElement(self, tag): + if self.inside_record: + self.record_builder.endElement(tag) + if tag == self.record_tag: + self.record_builder.endDocument() + self.writer.write(self.record_builder.document) + self.inside_record = 0 + self.characters = self.ignore_characters + + def ignore_characters(self, s): + pass + +class ConvertDispatchHandler(Dispatch.Dispatcher): + """Used to read records and produce output through a Dispatcher""" + def __init__(self, record_builder, writer, record_tag = "record"): + setattr(self, "end_" + record_tag, self.write_record) + Dispatch.Dispatcher.__init__(self, + remap = {record_tag: "bioformat:"} + ) + self.acquire(record_builder) + self.record_builder = record_builder + self.writer = writer + self.record_tag = record_tag + def write_record(self, tag): + self.writer.write(self.record_builder.document) + + + +class RecognizeHandler(handler.ContentHandler, handler.ErrorHandler): + def __init__(self): + self.recognized = 1 + self.exc = None + + def fatalError(self, exc): + if isinstance(exc, Parser.ParserIncompleteException): + pass + else: + self.recognized = 0 + self.exc = exc + raise exc + + error = fatalError + + def endElement(self, tag): + if tag == "record": + raise 
Parser.ParserException("we finished a record!") + + + +class Handle_dbid(Dispatch.Callback): + def start_dbid(self, tag, attrs): + self.attrs = attrs + self.save_characters() + + def end_dbid(self, tag): + text = self.get_characters() + self.callback(text, self.attrs) + + +class Handle_description(Dispatch.Callback): + def start_description_block(self, tag, attrs): + j = attrs.get("join", None) + if j is None: + self.join_fctn = Decode.join_fixspaces + else: + self.join_fctn = Decode.make_typechecked_decoder(j, list, str) + self.descriptions = [] + def start_description(self, tag, attrs): + self.save_characters() + def end_description(self, tag): + x = self.get_characters() + self.descriptions.append(x) + def end_description_block(self, tag): + self.callback(self.join_fctn(self.descriptions)) + +#### There can be multiple dbxref_dbids in a dbxref +# DR EMBL; X64411; CAA45756.1; -. +# <..dbname style="swiss">EMBL +# X64411 +# CAA45756.1 +# +### +# DR P35156, YPUI_BACSU, F; +# P35156 +# YPUI_BACSU +# +# + +def _fixup_sp_pattern(exp): + import re + import Martel + exp = Martel.select_names(exp, (Std.dbxref_dbname.tag,Std.dbxref_dbid.tag)) + + e = exp._find_groups(Std.dbxref_dbname.tag) + assert len(e) == 1 + e = e[0] + e.name = "dbname" + dbstyle = e.attrs["style"] + e.attrs = {} + e = exp._find_groups(Std.dbxref_dbid.tag) + assert len(e) == 2 + e[0].name = "primary_dbid" + primary_type = e[0].attrs["type"] + e[0].attrs = {} + e[1].name = "secondary_dbid" + secondary_type = e[1].attrs["type"] + e[1].attrs = {} + pattern = str(exp) + "$" + pat = re.compile(pattern) + return pat, dbstyle, primary_type, secondary_type + +# Turns out these 'fast' versions speed up the dbxref code by about +# a factor of 2. + +# DR PIR; S08427; S08427. +_fast_dbxref_sp_general_data = None +def _fast_dbxref_sp_general(s): + global _fast_dbxref_sp_general_data + if _fast_dbxref_sp_general_data is None: + from Bio.expressions.swissprot import sprot38 + _fast_dbxref_sp_general_data = _fixup_sp_pattern( + sprot38.real_DR_general) + + pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_general_data + + m = pat.match(s) + assert m is not None, "Ill-formated sp-general dxbref: %r" % s + return ( + (dbstyle, m.group("dbname"), primary_type, + m.group("primary_dbid"), 0), + (dbstyle, m.group("dbname"), secondary_type, + m.group("secondary_dbid"), 0) + ) + +# DR PFAM; PF01018; GTP1_OBG; 1. +# DR PROSITE; PS00905; GTP1_OBG; 1. + +_fast_dbxref_sp_prosite_data = None +def _fast_dbxref_sp_prosite(s): + global _fast_dbxref_sp_prosite_data + + if _fast_dbxref_sp_prosite_data is None: + from Bio.expressions.swissprot import sprot38 + _fast_dbxref_sp_prosite_data = _fixup_sp_pattern( + sprot38.real_DR_prosite) + + pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_prosite_data + m = pat.match(s) + assert m is not None, "Ill-formated sp-prosite dxbref: %r" % s + return ( + (dbstyle, m.group("dbname"), primary_type, + m.group("primary_dbid"), 0), + (dbstyle, m.group("dbname"), secondary_type, + m.group("secondary_dbid"), 0) + ) + + +# DR EMBL; M36407; AAA33110.1; -. 
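+# Each _fast_dbxref_sp_* helper returns two 5-tuples of the form
+# (dbname_style, dbname, id_type, dbid, negate); for the DR line above
+# that would be, illustratively, something like
+#     ("swiss", "EMBL", "primary", "M36407", 0)
+#     ("swiss", "EMBL", "accession", "AAA33110.1", 0)
+# (the exact style and type strings come from the sprot38 grammar).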
+_fast_dbxref_sp_embl_data = None +def _fast_dbxref_sp_embl(s): + global _fast_dbxref_sp_embl_data + + if _fast_dbxref_sp_embl_data is None: + from Bio.expressions.swissprot import sprot38 + _fast_dbxref_sp_embl_data = _fixup_sp_pattern( + sprot38.real_DR_embl) + + pat, dbstyle, primary_type, secondary_type = _fast_dbxref_sp_embl_data + m = pat.match(s) + assert m is not None, "Ill-formated sp-embl dxbref: %r" % s + return ( + (dbstyle, m.group("dbname"), primary_type, + m.group("primary_dbid"), 0), + (dbstyle, m.group("dbname"), secondary_type, + m.group("secondary_dbid"), 0) + ) + +_fast_dbxref_parser_table = { + "sp-general": _fast_dbxref_sp_general, + "sp-prosite": _fast_dbxref_sp_prosite, + "sp-embl": _fast_dbxref_sp_embl, +} + +class Handle_dbxref(Dispatch.Callback): + def __init__(self, callback): + Dispatch.Callback.__init__(self, callback) + self.supported_features.append("fast-sp-dbxref") + self.slow_callback = self.callback + def start_dbxref(self, tag, attrs): + self.negate = 0 + self.dbname = None + self.dbids = [] + self.info = [] + + def start_dbxref_dbname(self, tag, attrs): + assert self.dbname is None, "cannot set the dbname twice" + self.dbname_style = attrs.get("style", "unknown") + self.save_characters() + def end_dbxref_dbname(self, tag): + self.dbname = self.get_characters() + + def start_dbxref_dbid(self, tag, attrs): + d = attrs.get("dbname", None) + if d is None: + assert self.dbname is not None, "must set the dbname" + self.info.append( (self.dbname_style, self.dbname, + attrs.get("type", "primary")) ) + else: + self.info.append( ("bioformat", d, + attrs.get("type", "primary")) ) + self.save_characters() + + def end_dbxref_dbid(self, tag): + self.dbids.append( self.get_characters()) + + def start_dbxref_negate(self, tag, attrs): + self.negate = 1 + + def end_dbxref(self, tag): + cb = self.slow_callback + if cb is None: + return + negate = self.negate + for ( (dbname_style, dbname, idtype), dbid) in zip(self.info, + self.dbids): + self.slow_callback(dbname_style, dbname, idtype, dbid, negate) + + def start_fast_dbxref(self, tag, attrs): + style = attrs["style"] + self._fast_parser = _fast_dbxref_parser_table[style] + self.save_characters() + self.slow_callback = None + def end_fast_dbxref(self, tag): + for info in self._fast_parser(self.get_characters()): + self.callback(*info) + self.slow_callback = self.callback + +################## +class Handle_sequence(Dispatch.Callback): + global_alphabet = None + def start_(self, tag, attrs): + self.global_alphabet = None + + def start_sequence_block(self, tag, attrs): + self.local_alphabet = attrs.get("alphabet", None) + self.gapchar = attrs.get("gapchar", None) + self.stopchar = attrs.get("stopchar", None) + j = attrs.get("join", None) + if j is not None: + self.join_func = Decode.make_typechecked_decoder(j, list, str) + else: + self.join_func = None + self.sequences = [] + + def end_sequence_block(self, tag): + f = self.join_func + if f is not None: + seq = self.f(self.sequences) + else: + seq = "".join(self.sequences).replace(" ", "") + alphabet = self.local_alphabet or self.global_alphabet or "unknown" + self.callback( (alphabet, seq, self.gapchar, self.stopchar) ) + + def start_alphabet(self, tag, attrs): + self.global_alphabet = attrs["alphabet"] + + def start_sequence(self, tag, attrs): + self.save_characters() + def end_sequence(self, tag): + self.sequences.append(self.get_characters()) + +class Feature: + def __init__(self, name, description, location, qualifiers): + self.name = name + self.description = 
description + self.location = location + self.qualifiers = qualifiers + def __str__(self): + return "Feature %r %r %s num_qualifiers = %d" % \ + (self.name, self.description, self.location, + len(self.qualifiers)) + + +class Handle_feature_location(Dispatch.Callback): + def __init__(self, callback, settings = {}): + Dispatch.Callback.__init__(self, callback) + self.settings = settings + + def start_feature(self, tag, attrs): + self.location_style = attrs.get("location-style", + self.settings["location-style"]) + j = attrs.get("join-feature", None) + if j is None: + self.text_join_func = "".join + else: + self.text_join_func = Decode.make_typechecked_decoder(j, list, str) + + self.location_start = None + self.location_end = None + self.text_lines = [] + + def end_feature(self, tag): + if self.location_start or self.location_end: + if self.text_lines: + raise TypeError("Cannot have both location text and start/end") + self.callback(self.location_style, + (self.location_start, self.location_end)) + else: + self.callback(self.location_style, + (self.text_join_func(self.text_lines), None)) + + def start_feature_location(self, tag, attrs): + self.save_characters() + def end_feature_location(self, tag): + self.text_lines.append(self.get_characters()) + +add_text_handler(Handle_feature_location, "feature_location_start", + "location_start") +add_text_handler(Handle_feature_location, "feature_location_end", + "location_end") + +################################## + +class Handle_feature_qualifier(Dispatch.Callback): + def __init__(self, callback, settings): + self.settings = settings + Dispatch.Callback.__init__(self, callback) + + def start_feature_qualifier(self, tag, attrs): + self.name = None + self.description = [] + qj = attrs.get("join-qualifier", None) + if qj is None: + self.join = self.settings["qualifier_join_func"] + else: + self.join = Decode.make_typechecked_decoder(qj, list, str) + + def end_feature_qualifier(self, tag): + self.callback(self.name, self.join(self.description)) + + def start_feature_qualifier_description(self, tag, attrs): + self.save_characters() + def end_feature_qualifier_description(self, tag): + self.description.append(self.get_characters()) + +add_text_handler(Handle_feature_qualifier, "feature_qualifier_name", "name") + +#################### + +class Handle_features(Dispatch.Callback): + def __init__(self, callback): + Dispatch.Callback.__init__(self, callback) + self.settings = {} + + self.acquire(Handle_feature_location(self.add_location, self.settings)) + + self.acquire(Handle_feature_qualifier(self.add_feature_qualifier, + self.settings)) + + def start_feature_block(self, tag, attrs): + jf = attrs.get("join-description", None) + if jf is None: + self.join_feature_description = Decode.join_fixspaces + else: + self.join_feature_description = Decode.make_typechecked_decoder( + jf, list, str) + + self.settings["location-style"] = attrs.get("location-style", None) + + jq = attrs.get("join-qualifier", None) + if jq is None: + self.settings["qualifier_join_func"] = Decode.join_fixspaces + else: + self.settings["qualifier_join_func"] = \ + Decode.make_typechecked_decoder(jq, list, str) + self.features = [] + + def end_feature_block(self, tag): + self.callback(self.features) + self.features = None + + def start_feature(self, tag, attrs): + self.name = None + self.description = [] + self.location = None + self.qualifiers = [] + + def start_feature_description(self, tag, attrs): + self.save_characters() + def end_feature_description(self, tag): + 
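+        # Each description line is accumulated here; end_feature() joins
+        # them with self.join_feature_description (Decode.join_fixspaces
+        # unless the feature_block set join-description).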
self.description.append(self.get_characters()) + + def end_feature(self, tag): + self.features.append(Feature( + self.name, + self.join_feature_description(self.description), + self.location, + self.qualifiers)) + + def add_feature_qualifier(self, name, description): + self.qualifiers.append((name, description)) + + def add_location(self, style, location_info): + self.location = (style, location_info) + +add_text_handler(Handle_features, "feature_name", "name") + + +############## Search handlers + +class Handle_hsp_seqalign(Dispatch.Callback): + def start_hsp(self, tag, attrs): + self.query_name = None # "Query" + self.subject_name = None # "Sbjct" + + self.query_seq = "" # the actual text of the sequence + self.homology_seq = "" + self.subject_seq = "" + + self.query_start_loc = None + self.query_end_loc = None + + self.subject_start_loc = None + self.subject_end_loc = None + + def end_hsp(self, tag): + self.callback(self) + + def start_hsp_seqalign(self, tag, attrs): + self.sub_leader = None + + def start_hsp_seqalign_query_seq(self, tag, attrs): + self.save_characters() + def end_hsp_seqalign_query_seq(self, tag): + s = self.get_characters() + self.query_seq += s + self.sub_query_seq_len = len(s) + + def start_hsp_seqalign_homology_seq(self, tag, attrs): + self.save_characters() + def end_hsp_seqalign_homology_seq(self, tag): + query_leader = self.leader_size + query_seq_len = self.sub_query_seq_len + line = self.get_characters() + s = line[query_leader:query_leader+query_seq_len] + assert len(s) == query_seq_len, (len(s), query_seq_len, line) + self.homology_seq += s + + def start_hsp_seqalign_subject_seq(self, tag, attrs): + self.save_characters() + def end_hsp_seqalign_subject_seq(self, tag): + self.subject_seq += self.get_characters() + + def start_hsp_seqalign_query_leader(self, tag, attrs): + self.save_characters() + def end_hsp_seqalign_query_leader(self, tag): + self.leader_size = len(self.get_characters()) + +add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_name", + "query_name") + +add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_name", + "subject_name") + +add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_start", + "query_start_loc") +add_text_handler(Handle_hsp_seqalign, "hsp_seqalign_query_end", + "query_end_loc") + +add_first_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_start", + "subject_start_loc") +add_text_handler(Handle_hsp_seqalign, "hsp_seqalign_subject_end", + "subject_end_loc") + + + + +############################# + +class Handle_hsp(Dispatch.Callback): + def __init__(self, callback): + Dispatch.Callback.__init__(self, callback) + self.acquire(Handle_hsp_seqalign(self.add_hsp_seqs)) + + def start_hsp(self, tag, attrs): + self.hsp_values = {} # expect, p, identities, ... 
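+        # strands/frames are keyed by the "which" attribute of the
+        # hsp_strand/hsp_frame groups: "query", "homology" or "subject".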
+ self.strands = {} + self.frames = {} + + def end_hsp(self, tag): + self.callback(self.hsp_values, + self.hsp_info, + self.strands, self.frames, + ) + + def start_hsp_strand(self, tag, attrs): + self.strands[attrs["which"]] = attrs["strand"] + + def start_hsp_frame(self, tag, attrs): + self.getting_frame = attrs["which"] + self.save_characters() + + def end_hsp_frame(self, tag): + self.frames[self.getting_frame] = self.get_characters() + self.getting_frame = None + + def add_hsp_seqs(self, hsp_info): + self.hsp_info = hsp_info + + def start_hsp_value(self, tag, attrs): + self.value_convert = attrs.get("bioformat:decode", None) + self.value_name = attrs["name"] + self.save_characters() + + def end_hsp_value(self, tag): + s = self.get_characters() + if self.value_name is not None: + if self.value_name == "float": + s = float(s) + else: + s = Decode.make_decoder(self.value_convert)(s) + self.hsp_values[self.value_name] = s + +############################# + + +class Handle_search_table(Dispatch.Callback): + def start_search_table_value(self, tag, attrs): + self.value_name = attrs["name"] + self.value_decode = attrs.get("bioformat:decode", None) + self.save_characters() + def end_search_table_value(self, tag): + s = self.get_characters() + if self.value_decode is not None: + x = self.value_decode + if x == "int": + s = int(s) + elif x == "float": + s = float(s) + else: + s = Decode.make_decoder(x)(s) + self.values[self.value_name] = s + + def start_search_table(self, tag, attrs): + self.data = [] + def end_search_table(self, tag): + self.callback(self.data) + self.data = None + + def start_search_table_entry(self, tag, attrs): + self.description = None + self.values = {} + + def end_search_table_entry(self, tag): + self.data.append( (self.description, self.values) ) + self.description = self.values = None + +add_text_handler(Handle_search_table, "search_table_description", + "description") + +############################# + +class Handle_search_header(Dispatch.Callback): + def start_(self, tag, attrs): + self.dict = {} + self.query_description = None + + def end_search_header(self, tag): + d = self.dict + d["query_description"] = self.query_description + self.callback(d) + +add_text_block_handler(Handle_search_header, "query_description", + "join-query", "join|fixspaces", "query_description") + +add_text_dict_handler(Handle_search_header, "application_name", + "dict", "appname") +add_text_dict_handler(Handle_search_header, "application_version", + "dict", "appversion") +add_text_dict_handler(Handle_search_header, "database_name", + "dict", "dbname") +add_text_dict_handler(Handle_search_header, "database_num_sequences", + "dict", "db_num_sequences") +add_text_dict_handler(Handle_search_header, "database_num_letters", + "dict", "db_num_letters") +add_text_dict_handler(Handle_search_header, "query_size", + "dict", "query_size") + + +############################# + +class Handle_search_info(Dispatch.Callback): + def start_(self, tag, attrs): + self.parameters = {} + self.statistics = {} + + def end_(self, tag): + self.callback(self.parameters, self.statistics) + +add_value_handler(Handle_search_info, "search_parameter", "parameters") +add_value_handler(Handle_search_info, "search_statistic", "statistics") diff --git a/binaries/src/globplot/biopython-1.50/Bio/Transcribe.py b/binaries/src/globplot/biopython-1.50/Bio/Transcribe.py new file mode 100644 index 0000000..9cc0b48 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Transcribe.py @@ -0,0 +1,34 @@ +"""Code to transcribe DNA into RNA 
or back (OBSOLETE). + +You are now encouraged to use the Seq object methods or the functions +in Bio.Seq instead. + +This module is now considered to be obsolete, and is likely to be deprecated +in a future release of Biopython, and later removed. +""" + +from Bio import Alphabet, Seq +from Bio.Alphabet import IUPAC + +class Transcribe: + def __init__(self, dna_alphabet, rna_alphabet): + self.dna_alphabet = dna_alphabet + self.rna_alphabet = rna_alphabet + + def transcribe(self, dna): + assert dna.alphabet == self.dna_alphabet, \ + "transcribe has the wrong DNA alphabet" + s = dna.data + return Seq.Seq(s.replace("T", "U"), self.rna_alphabet) + def back_transcribe(self, rna): + assert rna.alphabet == self.rna_alphabet, \ + "back transcribe has the wrong RNA alphabet" + s = rna.data + return Seq.Seq(s.replace("U", "T"), self.dna_alphabet) + +generic_transcriber = Transcribe(Alphabet.generic_dna, + Alphabet.generic_rna) +ambiguous_transcriber = Transcribe(IUPAC.ambiguous_dna, + IUPAC.ambiguous_rna) +unambiguous_transcriber = Transcribe(IUPAC.unambiguous_dna, + IUPAC.unambiguous_rna) diff --git a/binaries/src/globplot/biopython-1.50/Bio/Translate.py b/binaries/src/globplot/biopython-1.50/Bio/Translate.py new file mode 100644 index 0000000..05da460 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Translate.py @@ -0,0 +1,133 @@ +"""Code to translate DNA or RNA into proteins (OBSOLETE). + +Instead of Bio.Translate, for translation you are now encouraged to use the +Seq object's translate method, or the translate function in the Bio.Seq +module. Translate-to-stop functionality is via an optional argument. + +Bio.Seq does not offer any back-translation function like the one here. It +was concluded that a since a simple back-translation giving a Seq or python +string could only capture some of the possible back translations, there were +no practical uses for such a method/function. + +This module is now considered to be obsolete, and is likely to be deprecated +in a future release of Biopython, and later removed. 
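+
+A minimal sketch of the replacement approach (assumes Biopython 1.49 or
+later; the standard genetic code is used):
+
+    >>> from Bio.Seq import translate
+    >>> translate("ATGGCCTAA")
+    'MA*'
+    >>> translate("ATGGCCTAA", to_stop=True)
+    'MA'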
+""" +from Bio import Alphabet, Seq +from Bio.Data import CodonTable + +class Translator: + def __init__(self, table): + self.table = table + self._encoded = {} + + def __str__(self) : + return "Translator object\n" + str(self.table) + + def translate(self, seq, stop_symbol = "*"): + #Allow different instances of the same class to be used: + assert seq.alphabet.__class__ == \ + self.table.nucleotide_alphabet.__class__, \ + "cannot translate from given alphabet (have %s, need %s)" %\ + (seq.alphabet, self.table.nucleotide_alphabet) + s = seq.data + letters = [] + append = letters.append + table = self.table + get = table.forward_table.get + n = len(seq) + for i in range(0, n-n%3, 3): + append(get(s[i:i+3], stop_symbol)) + + # return with the correct alphabet encoding (cache the encoding) + try: + alphabet = self._encoded[stop_symbol] + except KeyError: + alphabet = Alphabet.HasStopCodon(table.protein_alphabet, + stop_symbol) + self._encoded[stop_symbol] = alphabet + + return Seq.Seq("".join(letters), alphabet) + + def translate_to_stop(self, seq): + # This doesn't have a stop encoding + + #Allow different instances of the same class to be used: + assert seq.alphabet.__class__ == \ + self.table.nucleotide_alphabet.__class__, \ + "cannot translate from given alphabet (have %s, need %s)" %\ + (seq.alphabet, self.table.nucleotide_alphabet) + s = seq.data + letters = [] + append = letters.append + table = self.table.forward_table + n = len(seq) + try: + for i in range(0, n-n%3, 3): + append(table[s[i:i+3]]) + except KeyError: + # Stop at the first codon failure + pass + return Seq.Seq("".join(letters), self.table.protein_alphabet) + + def back_translate(self, seq): + # includes the stop codon + if not isinstance(seq.alphabet, Alphabet.HasStopCodon): + return self._back_translate_no_stop(seq) + assert seq.alphabet.alphabet == self.table.protein_alphabet, \ + "cannot back translate from the given alphabet (%s)" % \ + seq.alphabet.alphabet + s = seq.data + letter = seq.alphabet.stop_symbol + letters = [] + append = letters.append + table = self.table.back_table + for c in seq.data: + if c == letter: + append(table[None]) + else: + append(table[c]) + return Seq.Seq("".join(letters), + self.table.nucleotide_alphabet) + + def _back_translate_no_stop(self, seq): + # does not allow a stop codon + assert seq.alphabet == self.table.protein_alphabet, \ + "cannot back translate from the given alphabet (%s)" % \ + seq.alphabet + s = seq.data + letters = [] + append = letters.append + table = self.table.back_table + for c in seq.data: + append(table[c]) + return Seq.Seq("".join(letters), + self.table.nucleotide_alphabet) + +unambiguous_dna_by_name = {} +for key, value in CodonTable.unambiguous_dna_by_name.items(): + unambiguous_dna_by_name[key] = Translator(value) +unambiguous_dna_by_id = {} +for key, value in CodonTable.unambiguous_dna_by_id.items(): + unambiguous_dna_by_id[key] = Translator(value) + +unambiguous_rna_by_name = {} +for key, value in CodonTable.unambiguous_rna_by_name.items(): + unambiguous_rna_by_name[key] = Translator(value) +unambiguous_rna_by_id = {} +for key, value in CodonTable.unambiguous_rna_by_id.items(): + unambiguous_rna_by_id[key] = Translator(value) + +# XXX Ambiguous - can be done the same except for stop codons! 
+ambiguous_dna_by_name = {} +for key, value in CodonTable.ambiguous_dna_by_name.items(): + ambiguous_dna_by_name[key] = Translator(value) +ambiguous_dna_by_id = {} +for key, value in CodonTable.ambiguous_dna_by_id.items(): + ambiguous_dna_by_id[key] = Translator(value) + +ambiguous_rna_by_name = {} +for key, value in CodonTable.ambiguous_rna_by_name.items(): + ambiguous_rna_by_name[key] = Translator(value) +ambiguous_rna_by_id = {} +for key, value in CodonTable.ambiguous_rna_by_id.items(): + ambiguous_rna_by_id[key] = Translator(value) diff --git a/binaries/src/globplot/biopython-1.50/Bio/Writer.py b/binaries/src/globplot/biopython-1.50/Bio/Writer.py new file mode 100644 index 0000000..9247de2 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/Writer.py @@ -0,0 +1,17 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" + +import warnings +warnings.warn("Bio.Writer and Bio.writer.* are deprecated. If you do use"\ + +" these modules, please get in touch via the mailing list or"\ + +" bugzilla to avoid their permanent removal from Biopython.", \ + DeprecationWarning) + +class Writer: + def __init__(self, outfile): + self.outfile = outfile + def writeHeader(self): + pass + def write(self, record): + pass + def writeFooter(self): + pass diff --git a/binaries/src/globplot/biopython-1.50/Bio/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/__init__.py new file mode 100644 index 0000000..dd350ee --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2000 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. +"""Collection of modules for dealing with biological data in Python. + +The Biopython Project is an international association of developers +of freely available Python tools for computational molecular biology. + +http://biopython.org +""" + +__version__ = "1.50" + +class MissingExternalDependencyError(Exception): + pass diff --git a/binaries/src/globplot/biopython-1.50/Bio/__init__.pyc b/binaries/src/globplot/biopython-1.50/Bio/__init__.pyc new file mode 100644 index 0000000..b1249ba Binary files /dev/null and b/binaries/src/globplot/biopython-1.50/Bio/__init__.pyc differ diff --git a/binaries/src/globplot/biopython-1.50/Bio/cMarkovModelmodule.c b/binaries/src/globplot/biopython-1.50/Bio/cMarkovModelmodule.c new file mode 100644 index 0000000..8683113 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/cMarkovModelmodule.c @@ -0,0 +1,61 @@ +/* cMarkovModelmodule.c + * jchang + * Created: 1/13/01 + * Last modified: 11/26/02 + * + * This optimizes some of the functions in MarkovModel.py. + */ + +#include "Python.h" +#include "csupport.h" + + +/* Functions in this module. */ + +static char cMarkovModel__logadd__doc__[] = +"_logadd(logx, logy) -> log(x+y)\n"; + +static PyObject *cMarkovModel__logadd(PyObject *self, PyObject *args) +{ + PyObject *py_logx, *py_logy; + double logx, logy, minxy; + double sum; + + if(!PyArg_ParseTuple(args, "OO", &py_logx, &py_logy)) + return NULL; + logx = PyNumber_AsDouble(py_logx); + logy = PyNumber_AsDouble(py_logy); + if(PyErr_Occurred()) + return NULL; + + if(logy-logx > 100.0) { + Py_INCREF(py_logy); + return py_logy; + } else if (logx-logy > 100.0) { + Py_INCREF(py_logx); + return py_logx; + } + minxy = (logx < logy) ? 
logx : logy; + sum = minxy + log(exp(logx-minxy) + exp(logy-minxy)); + return PyFloat_FromDouble(sum); +} + + +/* Module definition stuff */ + +static PyMethodDef CMarkovModelMethods[] = { + {"_logadd", cMarkovModel__logadd, METH_VARARGS, cMarkovModel__logadd__doc__}, + {NULL, NULL} +}; + +static char cMarkovModel__doc__[] = +"This module provides optimized replacement functions for MarkovModel.\n\ +"; + +void initcMarkovModel(void) +{ + Py_InitModule3("cMarkovModel", CMarkovModelMethods, cMarkovModel__doc__); +} + + + diff --git a/binaries/src/globplot/biopython-1.50/Bio/clistfnsmodule.c b/binaries/src/globplot/biopython-1.50/Bio/clistfnsmodule.c new file mode 100644 index 0000000..8d86b44 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/clistfnsmodule.c @@ -0,0 +1,161 @@ +/* Copyright 2000 by Jeffrey Chang. All rights reserved. + * This code is part of the Biopython distribution and governed by its + * license. Please see the LICENSE file that should have been included + * as part of this package. + * + * clistfnsmodule.c + * Created 3 Jun 2000 + */ + +#include "Python.h" +#include + + + + +/************************************** Exported Functions ***********/ + +static char clistfns_count__doc__[] = +"count(items) -> dict of counts of each item\n\ +\n\ +Count the number of times each item appears in a list of data.\n\ +\n\ +"; + +static PyObject *clistfns_count(PyObject *self, PyObject *args) +{ + int i; + PyObject *items, *counts; + PyObject *item, *count, *newcount; + long int current; + + if(!PyArg_ParseTuple(args, "O", &items)) + return NULL; + if(!PySequence_Check(items)) { + PyErr_SetString(PyExc_TypeError, "expected sequence type"); + return NULL; + } + + if(!(counts = PyDict_New())) + return NULL; + + /* Go through the loop, counting how often each item appears. */ + i = 0; + while(1) { + if(!(item = PySequence_GetItem(items, i))) { + PyErr_Clear(); /* clear the exception set by PySequence_GetItem */ + break; /* no more numbers */ + } + + if(!(count = PyDict_GetItem(counts, item))) { + newcount = PyInt_FromLong(1); /* New item, set count to 1 */ + } + else { + current = PyInt_AsLong(count); + newcount = PyInt_FromLong(current+1); + } + + PyDict_SetItem(counts, item, newcount); + Py_DECREF(newcount); + Py_DECREF(item); + if(PyErr_Occurred()) + return NULL; + + i++; + } + + return counts; +} + + +static char clistfns_contents__doc__[] = +"contents(items) -> dict of item -> percentage\n\ +\n\ +Summarize the contents of the list in terms of the percentages of each\n\ +item. For example, if an item appears 3 times in a list with 10 items,\n\ +it is in 0.3 of the list\n\ +\n\ +"; + +static PyObject *clistfns_contents(PyObject *self, PyObject *args) +{ + int i; + PyObject *items, *counts, *percentages; + PyObject *countitems, *countitem; + PyObject *key, *count, *perc; + long c; + double total; + + if(!PyArg_ParseTuple(args, "O", &items)) + return NULL; + if(!PySequence_Check(items)) { + PyErr_SetString(PyExc_TypeError, "expected mapping type"); + return NULL; + } + if((total = PySequence_Length(items)) == -1) { + PyErr_SetString(PyExc_ValueError, "I couldn't get length of item."); + return NULL; + } + + counts = clistfns_count(self, args); + if(!counts || PyErr_Occurred()) + return NULL; + + if(!(percentages = PyDict_New())) { + Py_DECREF(counts); + return NULL; + } + + /* Loop through every element in counts, calculating the probabilities. 
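       (Each value is simply count / total, so the returned percentages
       sum to 1.0.)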
*/ + if(!(countitems = PyMapping_Items(counts))) { + Py_DECREF(counts); + Py_DECREF(percentages); + return NULL; + } + + /* Go through the loop, counting how often each item appears. */ + i = 0; + while(1) { + if(!(countitem = PyList_GetItem(countitems, i))) { + PyErr_Clear(); /* clear the exception set by PyList_GetItem */ + break; /* no more numbers */ + } + key = PyTuple_GetItem(countitem, 0); + count = PyTuple_GetItem(countitem, 1); + c = PyInt_AsLong(count); + perc = PyFloat_FromDouble((double)c / total); + PyDict_SetItem(percentages, key, perc); + Py_DECREF(perc); + if(PyErr_Occurred()) /* PyDict_SetItem failed */ + break; + i++; + } + if(PyErr_Occurred()) { + Py_DECREF(percentages); + percentages = NULL; + } + Py_DECREF(countitems); + Py_DECREF(counts); + + return percentages; +} + + +/************************************** Module definition stuff ******/ + +static PyMethodDef clistfnsMethods[] = { + {"count", clistfns_count, METH_VARARGS, clistfns_count__doc__}, + {"contents", clistfns_contents, METH_VARARGS, clistfns_contents__doc__}, + {NULL, NULL} +}; + +static char clistfns__doc__[] = +"This provides helper functions for the listfns module.\n\ +You should never import this module on its own.\n\ +\n\ +"; + +void initclistfns(void) +{ + (void) Py_InitModule3("clistfns", clistfnsMethods, clistfns__doc__); +} diff --git a/binaries/src/globplot/biopython-1.50/Bio/cmathfnsmodule.c b/binaries/src/globplot/biopython-1.50/Bio/cmathfnsmodule.c new file mode 100644 index 0000000..d3549e2 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/cmathfnsmodule.c @@ -0,0 +1,140 @@ +/* Copyright 2000 by Jeffrey Chang. All rights reserved. + * This code is part of the Biopython distribution and governed by its + * license. Please see the LICENSE file that should have been included + * as part of this package. + * + * cmathfnsmodule.c + * Created 3 Jun 2000 + */ + +#include "Python.h" +#include + +#include "csupport.h" + + + +/************************************** Exported Functions ***********/ + +static char cmathfns_intd__doc__[] = +"intd(x[, digits_after_decimal]) -> int x, rounded\n\ +\n\ +Represent a floating point number with some digits after the\n\ +decimal point as an integer. This is useful when floating point\n\ +comparisons are failing due to precision problems. 
+intd(5.35, 1) -> 54.\n\
+\n\
+";
+
+static PyObject *cmathfns_intd(
+    PyObject *self, PyObject *args, PyObject *keywds)
+{
+    PyObject *digits_after_decimal = Py_None;
+    double x, digits;
+    double precision;
+
+    static char *kwlist[] = {"x", "digits_after_decimal", NULL};
+    if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|O", kwlist,
+                                    &x, &digits_after_decimal))
+        return NULL;
+
+    if(digits_after_decimal == Py_None)
+        digits = 0;
+    else {
+        digits = PyNumber_AsDouble(digits_after_decimal);
+        if(PyErr_Occurred()) {
+            return NULL;
+        }
+    }
+    precision = pow(10, digits);
+    if(x >= 0)
+        x = (int)(x * precision + 0.5);
+    else
+        x = (int)(x * precision - 0.5);
+    return PyFloat_FromDouble(x);
+}
+
+
+
+
+static char cmathfns_fcmp__doc__[] =
+"fcmp(x, y, precision) -> -1, 0, or 1";
+
+static PyObject *cmathfns_fcmp(
+    PyObject *self, PyObject *args, PyObject *keywds)
+{
+    double x, y, precision;
+    int result;
+
+    static char *kwlist[] = {"x", "y", "precision", NULL};
+    if(!PyArg_ParseTupleAndKeywords(args, keywds, "ddd", kwlist,
+                                    &x, &y, &precision))
+        return NULL;
+
+    if(fabs(x-y) < precision)
+        result = 0;
+    else if(x < y)
+        result = -1;
+    else result = 1;
+    return PyInt_FromLong(result);
+}
+
+
+
+static char cmathfns_safe_log__doc__[] =
+"safe_log(n, zero=None, neg=None) -> log(n)\n\
+\n\
+Calculate the log of n. If n is 0, returns the value of zero. If n is\n\
+negative, returns the value of neg.\n\
+\n\
+";
+
+static PyObject *cmathfns_safe_log(
+    PyObject *self, PyObject *args, PyObject *keywds)
+{
+    PyObject *zero = Py_None,
+        *neg = Py_None;
+    double n;
+
+    static char *kwlist[] = {"n", "zero", "neg", NULL};
+
+    if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|OO", kwlist,
+                                    &n, &zero, &neg))
+        return NULL;
+
+    if(n < 0) {
+        Py_INCREF(neg);
+        return neg;
+    } else if(n < 1E-100) {
+        Py_INCREF(zero);
+        return zero;
+    }
+
+    return PyFloat_FromDouble(log(n));
+}
+
+
+
+
+/************************************** Module definition stuff ******/
+
+static PyMethodDef cmathfnsMethods[] = {
+    {"fcmp", (PyCFunction)cmathfns_fcmp, METH_VARARGS|METH_KEYWORDS,
+     cmathfns_fcmp__doc__},
+    {"intd", (PyCFunction)cmathfns_intd, METH_VARARGS|METH_KEYWORDS,
+     cmathfns_intd__doc__},
+    {"safe_log", (PyCFunction)cmathfns_safe_log, METH_VARARGS|METH_KEYWORDS,
+     cmathfns_safe_log__doc__},
+    {NULL, NULL}
+};
+
+static char cmathfns__doc__[] =
+"This provides helper functions for the mathfns module.\n\
+You should never import this module on its own.\n\
+\n\
+";
+
+void initcmathfns(void)
+{
+    (void) Py_InitModule3("cmathfns", cmathfnsMethods, cmathfns__doc__);
}
diff --git a/binaries/src/globplot/biopython-1.50/Bio/cstringfnsmodule.c b/binaries/src/globplot/biopython-1.50/Bio/cstringfnsmodule.c
new file mode 100644
index 0000000..af098fd
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/cstringfnsmodule.c
@@ -0,0 +1,122 @@
+/* Copyright 2000 by Jeffrey Chang. All rights reserved.
+ * This code is part of the Biopython distribution and governed by its
+ * license. Please see the LICENSE file that should have been included
+ * as part of this package.
+ *
+ * cstringfnsmodule.c
+ * Created 7 Jun 2000
+ */
+
+#include "Python.h"
+#include <string.h> /* memset */
+
+
+/* Functions in this module. */
+
+static char cstringfns_splitany__doc__[] =
+"splitany(str [,sep [,maxsplit [,negate]]]) -> list of strings\n\
+\n\
+Split a string. Similar to string.split, except that this considers\n\
+any one of the characters in sep to be a delimiter.\n\
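+(Editorial example, matching the pure-Python Bio.stringfns.splitany:\n\
+splitany('a,b;c', ',;') returns ['a', 'b', 'c'].)\n\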
If negate is\n\ +true, then everything but sep will be a separator.\n\ +\n\ +"; + +static PyObject *cstringfns_splitany( + PyObject *self, PyObject *args, PyObject *keywds) +{ + int i, prev; + int nsplit, maxsplit=0; + /*int negate=0;*/ + PyObject *py_negate=NULL; + PyObject *strlist, *newstr; + unsigned char *str, + *sep=" \011\012\013\014\015"; /* whitespace */ + char tosplit[256]; + static char *kwlist[] = {"str", "sep", "maxsplit", "negate", NULL}; + + if(!PyArg_ParseTupleAndKeywords(args, keywds, "s|siO", kwlist, + &str, &sep, &maxsplit, &py_negate)) + return NULL; + if(maxsplit < 0) + maxsplit = 1; + /* negate = (py_negate && PyObject_IsTrue(py_negate));*/ + /* XXX NO MORE NEGATE */ + + /* Set the tosplit array to 1 for characters to split on. */ + memset(tosplit, 0, 256); + while(*sep) { + tosplit[(unsigned char)*sep++] = 1; + } + if(py_negate && PyObject_IsTrue(py_negate)) { + for(i=0; i<256; i++) + tosplit[i] = !tosplit[i]; + } + + /* Create a new list to store the variables. */ + if(!(strlist = PyList_New(0))) { + PyErr_SetString(PyExc_SystemError, "I could not create a new list"); + return NULL; + } + + prev = 0; + nsplit = 0; + for(i=0; str[i] && (maxsplit == 0 || nsplit < maxsplit); i++) { + /*if(!(tosplit[(int)str[i]] == !negate)) + continue; */ + if(!tosplit[(int)str[i]]) + continue; + + /* Split the string here. */ + if(!(newstr = PyString_FromStringAndSize(&str[prev], i-prev))) { + PyErr_SetString(PyExc_SystemError, + "I could not create a new string"); + break; + } + if(PyList_Append(strlist, newstr) == -1) { + Py_DECREF(newstr); + break; + } + Py_DECREF(newstr); + prev = i+1; + nsplit++; + } + if(!PyErr_Occurred()) { + i = strlen(str); + /* Add the last one. */ + if(!(newstr = PyString_FromStringAndSize(&str[prev], i-prev))) { + PyErr_SetString(PyExc_SystemError, + "I could not create a new string"); + } else { + PyList_Append(strlist, newstr); + Py_DECREF(newstr); + } + } else { + Py_DECREF(strlist); + return NULL; + } + + + return strlist; +} + + + +/* Module definition stuff */ + +static PyMethodDef cstringfnsMethods[] = { + {"splitany", (PyCFunction)cstringfns_splitany, METH_VARARGS|METH_KEYWORDS, + cstringfns_splitany__doc__}, + {NULL, NULL} +}; + +static char cstringfns__doc__[] = +"This provides helper functions for the stringfns module.\n\ +You should never import this module on its own.\n\ +\n\ +"; + +void initcstringfns(void) +{ + (void) Py_InitModule3("cstringfns", cstringfnsMethods, cstringfns__doc__); +} diff --git a/binaries/src/globplot/biopython-1.50/Bio/csupport.c b/binaries/src/globplot/biopython-1.50/Bio/csupport.c new file mode 100644 index 0000000..981ba63 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/csupport.c @@ -0,0 +1,30 @@ +/* Copyright 2002 by Jeffrey Chang. All rights reserved. + * This code is part of the Biopython distribution and governed by its + * license. Please see the LICENSE file that should have been included + * as part of this package. + * + * csupport.c + * Created 27 January 2002 + * + * Miscellaneous useful C functions not to be exported as a python + * module. + * + */ + +#include "Python.h" + + +/* Return a PyNumber as a double. + * Raises a TypeError if I can't do it. 
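+ * (Editorial note: on failure the Python error set by PyNumber_Float
+ * is left in place and 0.0 is returned, so callers should check
+ * PyErr_Occurred() rather than trusting the return value alone.)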
+ */
+double PyNumber_AsDouble(PyObject *py_num)
+{
+    double val;
+    PyObject *floatobj;
+
+    if((floatobj = PyNumber_Float(py_num)) == NULL)
+        return(0.0);
+    val = PyFloat_AsDouble(floatobj);
+    Py_DECREF(floatobj);
+    return val;
+}
diff --git a/binaries/src/globplot/biopython-1.50/Bio/csupport.h b/binaries/src/globplot/biopython-1.50/Bio/csupport.h
new file mode 100644
index 0000000..9b8eefb
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/csupport.h
@@ -0,0 +1,2 @@
+
+double PyNumber_AsDouble(PyObject *py_num);
diff --git a/binaries/src/globplot/biopython-1.50/Bio/distance.py b/binaries/src/globplot/biopython-1.50/Bio/distance.py
new file mode 100644
index 0000000..4bec5cb
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/distance.py
@@ -0,0 +1,35 @@
+"""
+This module provides code for various distance measures.
+
+Functions:
+euclidean Euclidean distance between two points
+euclidean_py Pure Python implementation of euclidean.
+
+"""
+# XXX cosine distance
+
+import warnings
+warnings.warn("Bio.distance is deprecated. If you use this module, please notify the Biopython developers at biopython-dev@biopython.org", DeprecationWarning)
+
+from numpy import *
+
+def euclidean(x, y):
+    """euclidean(x, y) -> euclidean distance between x and y"""
+    if len(x) != len(y):
+        raise ValueError("vectors must be same length")
+    #return sqrt(sum((x-y)**2))
+    # Optimization by John Corradi (JCorradi@msn.com)
+    d = x-y
+    return sqrt(dot(d, d))
+
+def euclidean_py(x, y):
+    """euclidean_py(x, y) -> euclidean distance between x and y"""
+    # lightly modified from implementation by Thomas Sicheritz-Ponten.
+    # This works faster than the Numeric implementation on shorter
+    # vectors.
+    if len(x) != len(y):
+        raise ValueError("vectors must be same length")
+    sum = 0
+    for i in range(len(x)):
+        sum += (x[i]-y[i])**2
+    return sqrt(sum)
diff --git a/binaries/src/globplot/biopython-1.50/Bio/kNN.py b/binaries/src/globplot/biopython-1.50/Bio/kNN.py
new file mode 100644
index 0000000..4bd2d8d
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/kNN.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+"""
+This module provides code for doing k-nearest-neighbors classification.
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based on the classes in its surrounding neighborhood.
+
+Glossary:
+distance The distance between two points in the feature space.
+weight The importance given to each point for classification.
+
+
+Classes:
+kNN Holds information for a nearest neighbors classifier.
+
+
+Functions:
+train Train a new kNN classifier.
+calculate Calculate the probabilities of each class, given an observation.
+classify Classify an observation into a class.
+
+ Weighting Functions:
+equal_weight Every example is given a weight of 1.
+
+"""
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+    set = set
+except NameError:
+    from sets import Set as set
+
+import numpy
+
+class kNN:
+    """Holds information necessary to do nearest neighbors classification.
+
+    Members:
+    classes Set of the possible classes.
+    xs List of the neighbors.
+    ys List of the classes that the neighbors belong to.
+    k Number of neighbors to look at.
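+
+    Editorial usage sketch (hypothetical numbers, not part of the
+    original module):
+
+        >>> xs = [[0.0, 0.0], [1.0, 1.0], [4.0, 4.0]]
+        >>> ys = [0, 0, 1]
+        >>> knn = train(xs, ys, 2)
+        >>> classify(knn, [0.5, 0.5])
+        0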
+ + """ + def __init__(self): + """kNN()""" + self.classes = set() + self.xs = [] + self.ys = [] + self.k = None + +def equal_weight(x, y): + """equal_weight(x, y) -> 1""" + # everything gets 1 vote + return 1 + +def train(xs, ys, k, typecode=None): + """train(xs, ys, k) -> kNN + + Train a k nearest neighbors classifier on a training set. xs is a + list of observations and ys is a list of the class assignments. + Thus, xs and ys should contain the same number of elements. k is + the number of neighbors that should be examined when doing the + classification. + + """ + knn = kNN() + knn.classes = set(ys) + knn.xs = numpy.asarray(xs, typecode) + knn.ys = ys + knn.k = k + return knn + +def calculate(knn, x, weight_fn=equal_weight, distance_fn=None): + """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict + + Calculate the probability for each class. knn is a kNN object. x + is the observed data. weight_fn is an optional function that + takes x and a training example, and returns a weight. distance_fn + is an optional function that takes two points and returns the + distance between them. If distance_fn is None (the default), the + Euclidean distance is used. Returns a dictionary of the class to + the weight given to the class. + + """ + x = numpy.asarray(x) + + order = [] # list of (distance, index) + if distance_fn: + for i in range(len(knn.xs)): + dist = distance_fn(x, knn.xs[i]) + order.append((dist, i)) + else: + # Default: Use a fast implementation of the Euclidean distance + temp = numpy.zeros(len(x)) + # Predefining temp allows reuse of this array, making this + # function about twice as fast. + for i in range(len(knn.xs)): + temp[:] = x - knn.xs[i] + dist = numpy.sqrt(numpy.dot(temp,temp)) + order.append((dist, i)) + order.sort() + + # first 'k' are the ones I want. + weights = {} # class -> number of votes + for k in knn.classes: + weights[k] = 0.0 + for dist, i in order[:knn.k]: + klass = knn.ys[i] + weights[klass] = weights[klass] + weight_fn(x, knn.xs[i]) + + return weights + +def classify(knn, x, weight_fn=equal_weight, distance_fn=None): + """classify(knn, x[, weight_fn][, distance_fn]) -> class + + Classify an observation into a class. If not specified, weight_fn will + give all neighbors equal weight. distance_fn is an optional function + that takes two points and returns the distance between them. If + distance_fn is None (the default), the Euclidean distance is used. + """ + weights = calculate( + knn, x, weight_fn=weight_fn, distance_fn=distance_fn) + + most_class = None + most_weight = None + for klass, weight in weights.items(): + if most_class is None or weight > most_weight: + most_class = klass + most_weight = weight + return most_class diff --git a/binaries/src/globplot/biopython-1.50/Bio/listfns.py b/binaries/src/globplot/biopython-1.50/Bio/listfns.py new file mode 100644 index 0000000..ff7db6f --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/listfns.py @@ -0,0 +1,158 @@ +# Copyright 2000 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""This provides useful general functions for working with lists (OBSOLETE). + +This module and its C code equivalent are considered to be obsolete, and +are likely to be deprecated in a future release of Biopython, before being +removed. Please get in touch via the mailing list if this will affect you. 
+Many of these functions can be avoided using the python set object. + +Functions: +asdict Make the list into a dictionary (for fast testing of membership). +items Get one of each item in a list. +count Count the number of times each item appears. +contents Calculate percentage each item appears in a list. +itemindex Make an index of the items in the list. +intersection Get the items in common between 2 lists. +difference Get the items in 1 list, but not the other. +indexesof Get a list of the indexes of some items in a list. +take Take some items from a list. + +""" + +def asdict(l): + """asdict(l) -> dictionary + + Return a dictionary where the keys are the items in the list, with + arbitrary values. This is useful for quick testing of membership. + + """ + return count(l) + +def items(l): + """items(l) -> list of items + + Generate a list of one of each item in l. The items are returned + in arbitrary order. + + """ + try: + return asdict(l).keys() + except TypeError, x: + if str(x).find("unhashable") == -1: + raise + # asdict failed because l is unhashable. Back up to a naive + # implementation. + l = l[:] + l.sort() + i = 0 + while i < len(l)-1: + if l[i] == l[i+1]: + del l[i] + else: + i += 1 + return l + +def count(items): + """count(items) -> dict of counts of each item + + Count the number of times each item appears in a list of data. + + """ + c = {} + for i in items: + c[i] = c.get(i, 0) + 1 + return c + +def contents(items): + """contents(items) -> dict of item:percentage + + Summarize the contents of the list in terms of the percentages of each + item. For example, if an item appears 3 times in a list with 10 items, + it is in 0.3 of the list. + + """ + counts = count(items) + l = float(len(items)) + contents = {} + for i, c in counts.items(): + contents[i] = c / l + return contents + +def intersection(l1, l2): + """intersection(l1, l2) -> list of common items + + Return a list of the items in both l1 and l2. The list is in + arbitrary order. + + """ + inter = [] + words1 = count(l1) + for w in l2: + if words1.has_key(w): + inter.append(w) + del words1[w] # don't add the same word twice + return inter + +def difference(l1, l2): + """difference(l1, l2) -> list of items in l1, but not l2 + + Return a list of the items in l1, but not l2. The list is in + arbitrary order. + + """ + diff = [] + words2 = count(l2) + for w in l1: + if not words2.has_key(w): + diff.append(w) + words2[w] = 1 # don't add the same word twice + return diff + +def itemindex(l): + """itemindex(l) -> dict of item : index of item + + Make an index of the items in the list. The dictionary contains + the items in the list as the keys, and the index of the first + occurrence of the item as the value. + + """ + dict = {} + for i in range(len(l)): + if not dict.has_key(l[i]): + dict[l[i]] = i + return dict + +def indexesof(l, fn, opposite=0): + """indexesof(l, fn) -> list of indexes + + Return a list of indexes i where fn(l[i]) is true. + + """ + indexes = [] + for i in range(len(l)): + f = fn(l[i]) + if (not opposite and f) or (opposite and not f): + indexes.append(i) + return indexes + +def take(l, indexes): + """take(l, indexes) -> list of just the indexes from l""" + items = [] + for i in indexes: + items.append(l[i]) + return items + +def take_byfn(l, fn, opposite=0): + indexes = indexesof(l, fn, opposite=opposite) + return take(l, indexes) + +# Try and load C implementations of functions. If I can't, +# then just ignore and use the pure python implementations. 
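+# (Editorial sketch, not part of the original module: most of these
+# helpers now have set-based equivalents, e.g.
+#     intersection(l1, l2)  ~  [x for x in l2 if x in set(l1)]
+#     difference(l1, l2)    ~  [x for x in l1 if x not in set(l2)]
+# up to ordering and duplicate handling.)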
+try: + from clistfns import * +except ImportError: + pass diff --git a/binaries/src/globplot/biopython-1.50/Bio/mathfns.py b/binaries/src/globplot/biopython-1.50/Bio/mathfns.py new file mode 100644 index 0000000..4474232 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/mathfns.py @@ -0,0 +1,100 @@ +# Copyright 2000 by Jeffrey Chang. All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""This provides useful general math tools (DEPRECATED). + +This module and its C code equivalent are considered to be deprecated, and +are likely to be removed in a future release of Biopython. Please get in +touch via the mailing list if this will affect you. + +Functions: +fcmp Compare two floating point numbers, up to a specified precision. +intd Represent a floating point number as an integer. +safe_log log, but returns an arbitrarily small number for log(0). +safe_exp exp, but returns a large or small number instead of overflows. + +""" +import warnings +warnings.warn("Bio.mathfns and its C code equivalent Bio.cmathfns are" \ + +" deprecated, and will be removed in a future release of"\ + +" Biopython. If you want to continue to use this code,"\ + +" please get in contact with the Biopython developers via"\ + +" the mailing lists to avoid its permanent removal from"\ + +" Biopython.", \ + DeprecationWarning) + +import math + +def fcmp(x, y, precision): + """fcmp(x, y, precision) -> -1, 0, or 1""" + if math.fabs(x-y) < precision: + return 0 + elif x < y: + return -1 + return 1 + +def intd(x, digits_after_decimal=0): + """intd(x[, digits_after_decimal]) -> int x, rounded + + Represent a floating point number with some digits after the + decimal point as an integer. This is useful when floating point + comparisons are failing due to precision problems. e.g. + intd(5.35, 1) -> 54. + + """ + precision = 10.**digits_after_decimal + if x >= 0: + x = int(x * precision + 0.5) + else: + x = int(x * precision - 0.5) + return x + +def safe_log(n, zero=None, neg=None): + """safe_log(n, zero=None, neg=None) -> log(n) + + Calculate the log of n. If n is 0, returns the value of zero. If n is + negative, returns the value of neg. + + """ + if n < 0: + return neg + elif n < 1E-100: + return zero + return math.log(n) + +LOG2 = math.log(2) +def safe_log2(n, zero=None, neg=None): + """safe_log2(n, zero=None, neg=None) -> log(n) + + Calculate the log base 2 of n. If n is 0, returns the value of + zero. If n is negative, returns the value of neg. + + """ + l = safe_log(n, zero=zero, neg=neg) + if l is None: + return l + return l/LOG2 + +def safe_exp(n, under=None, over=None): + """safe_exp(n, under=None, over=None) -> e**n + + Guaranteed not to overflow. Instead of overflowing, it returns + the values of 'under' for underflows or 'over' for overflows. + + """ + try: + return math.exp(n) + except OverflowError: + if n < 0: + return under + return over + raise "How did I get here?" + +# Try and load C implementations of functions. If I can't, +# then just ignore and use the pure python implementations. +try: + from cmathfns import * +except ImportError: + pass diff --git a/binaries/src/globplot/biopython-1.50/Bio/stringfns.py b/binaries/src/globplot/biopython-1.50/Bio/stringfns.py new file mode 100644 index 0000000..24e461f --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/stringfns.py @@ -0,0 +1,90 @@ +# Copyright 2000 by Jeffrey Chang. All rights reserved. 
+# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""This provides useful general functions for working with strings (DEPRECATED). + +This module and its C code equivalent are considered to be deprecated, and +are likely to be removed in a future release of Biopython. Please get in +touch via the mailing list if this will affect you. + +Functions: +splitany Split a string using many delimiters. +find_anychar Find one of a list of characters in a string. +rfind_anychar Find one of a list of characters in a string, from end to start. + +""" +import warnings +warnings.warn("Bio.stringfns and its C code equivalent Bio.cstringfns are" \ + +" deprecated, and will be removed in a future release of"\ + +" Biopython. If you want to continue to use this code,"\ + +" please get in contact with the Biopython developers via"\ + +" the mailing lists to avoid its permanent removal from"\ + +" Biopython.", \ + DeprecationWarning) + +def splitany(s, sep=" \011\012\013\014\015", maxsplit=None, negate=0): + """splitany(s [,sep [,maxsplit [,negate]]]) -> list of strings + + Split a string. Similar to string.split, except that this considers + any one of the characters in sep to be a delimiter. If negate is + true, then everything but sep will be a separator. + + """ + strlist = [] + prev = 0 + for i in range(len(s)): + if maxsplit is not None and len(strlist) >= maxsplit: + break + if (s[i] in sep) == (not negate): + strlist.append(s[prev:i]) + prev = i+1 + strlist.append(s[prev:]) + return strlist + +def find_anychar(string, chars, index=None, negate=0): + """find_anychar(string, chars[, index]) -> index of a character or -1 + + Find a character in string. chars is a list of characters to look + for. Return the index of the first occurrence of any of the + characters, or -1 if not found. index is the index where the + search should start. By default, I search from the beginning of + the string. + + """ + if index is None: + index = 0 + while index < len(string) and \ + ((not negate and string[index] not in chars) or + (negate and string[index] in chars)): + index += 1 + if index == len(string): + return -1 + return index + +def rfind_anychar(string, chars, index=None, negate=0): + """rfind_anychar(string, chars[, index]) -> index of a character or -1 + + Find a character in string, looking from the end to the start. + chars is a list of characters to look for. Return the index of + the first occurrence of any of the characters, or -1 if not found. + index is the index where the search should start. By default, I + search from the end of the string. + + """ + if index is None: + index = len(string)-1 + while index >= 0 and \ + ((not negate and string[index] not in chars) or + (negate and string[index] in chars)): + index -= 1 + # If not found, index will already be -1. + return index + +# Try and load C implementations of functions. If I can't, +# then just ignore and use the pure python implementations. 
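+# (Editorial example, not part of the original module:
+#     >>> splitany("a,b;c", ",;")
+#     ['a', 'b', 'c']
+#     >>> splitany("a,b;c", ",;", negate=1)
+#     ['', ',', ';', '']
+# with negate true, every character *not* in sep splits.)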
+try:
+    from cstringfns import *
+except ImportError:
+    pass
diff --git a/binaries/src/globplot/biopython-1.50/Bio/trie.c b/binaries/src/globplot/biopython-1.50/Bio/trie.c
new file mode 100644
index 0000000..d73d784
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/trie.c
@@ -0,0 +1,778 @@
+#include <stdio.h>  /* printf */
+#include <stdlib.h> /* malloc */
+#include <string.h> /* strcmp, strlen */
+
+#include "trie.h"
+
+/* The following is necessary to make sure that trie.pyd won't link
+ * to msvcrt.dll in addition to msvcr71.dll on Windows.
+ * See Bug #1767 on Bugzilla.
+ */
+#ifdef __MINGW32__
+# define strdup _strdup
+#endif
+
+struct _Transition; /* Forward declaration, needed in _Trie. */
+
+
+/* _Trie is a recursive data structure. A _Trie contains zero or more
+ * _Transitions that lead to more _Tries. The transitions are stored
+ * in alphabetical order of the suffix member of the data structure.
+ * _Trie also contains a pointer called value where the user can store
+ * arbitrary data. If value is NULL, then no data is stored here.
+ */
+struct _Trie {
+    struct _Transition *transitions;
+    unsigned char num_transitions;
+    void *value; /* specified by user, never freed or allocated by me! */
+};
+
+/* _Transition holds information about the transitions leading from
+ * one _Trie to another. The trie structure here is different from
+ * typical ones, because the transitions between nodes can contain
+ * strings of arbitrary length, not just single characters. Suffix is
+ * the string that is matched from one node to the next.
+ */
+typedef struct _Transition {
+    unsigned char *suffix;
+    Trie next;
+} *Transition;
+
+
+#define MAX_KEY_LENGTH 1000
+static unsigned char KEY[MAX_KEY_LENGTH];
+
+
+Trie Trie_new(void) {
+    Trie trie;
+
+    if(!(trie = (Trie)malloc(sizeof(struct _Trie))))
+        return NULL;
+    trie->transitions = NULL;
+    trie->num_transitions = 0;
+    trie->value = NULL;
+    return trie;
+}
+
+int Trie_set(Trie trie, const unsigned char *key, const void *value) {
+    int i;
+    Transition transition=NULL;
+    unsigned char *suffix=NULL;
+    int retval = 0;
+    int first, last, mid;
+
+    if(!key[0]) {
+        trie->value = (void *)value;
+        return 0;
+    }
+
+    /* Insert the key in alphabetical order. Do a binary search to
+       find the proper place. */
+    first = 0;
+    last = trie->num_transitions-1;
+    i = -1;
+    while(first <= last) {
+        mid = (first+last)/2;
+        transition = &trie->transitions[mid];
+        suffix = transition->suffix;
+        if(key[0] < suffix[0])
+            last = mid-1;
+        else if(key[0] > suffix[0])
+            first = mid+1;
+        else {
+            i = mid;
+            break;
+        }
+    }
+
+    /* If no place was found for it, then the indexes will be in the
+       order last,first. Place it at index first. */
+    if(i == -1)
+        i = first;
+
+    /* If nothing matches, then insert a new trie here. */
+    if((i >= trie->num_transitions) || (key[0] != suffix[0])) {
+        unsigned char *new_suffix=NULL;
+        Trie newtrie=NULL;
+        Transition new_transitions=NULL;
+
+        /* Create some variables for the new transition. I'm going to
+           allocate these first so that if I can detect memory errors
+           before I mess up the data structure of the transitions.
+        */
+        if(!(new_suffix = (unsigned char *)strdup(key)))
+            goto insert_memerror;
+        if(!(newtrie = Trie_new()))
+            goto insert_memerror;
+
+        /* Create some space for the next transition. Allocate some
+           memory and shift the old transitions over to make room for
+           this one.
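+           (Editorial illustration: with num_transitions == 3 and
+           insertion index i == 1, the two memcpy calls below copy old
+           slots [0,1) and [1,3) into new slots [0,1) and [2,4),
+           leaving new slot 1 free for this transition.)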
+        */
+        if(!(new_transitions = malloc(sizeof(struct _Transition) *
+                                      (trie->num_transitions+1))))
+            goto insert_memerror;
+        memcpy(new_transitions, trie->transitions,
+               sizeof(struct _Transition)*i);
+        memcpy(&new_transitions[i+1], &trie->transitions[i],
+               sizeof(struct _Transition)*(trie->num_transitions-i));
+        free(trie->transitions);
+        trie->transitions = new_transitions;
+        new_transitions = NULL;
+        trie->num_transitions += 1;
+
+        /* Initialize the new transition. */
+        transition = &trie->transitions[i];
+        transition->suffix = new_suffix;
+        transition->next = newtrie;
+        transition->next->value = (void *)value;
+
+        if(0) {
+        insert_memerror:
+            if(new_transitions) free(new_transitions);
+            if(newtrie) free(newtrie);
+            if(new_suffix) free(new_suffix);
+            return 1;
+        }
+    }
+    /* There are three cases where the key and suffix share some
+       letters.
+       1. suffix is proper substring of key.
+       2. key is proper substring of suffix.
+       3. neither is proper substring of other.
+
+       For cases 2 and 3, I need to first split up the transition
+       based on the number of characters shared. Then, I can insert
+       the rest of the key into the next trie.
+    */
+    else {
+        /* Count the number of characters shared between key
+           and suffix. */
+        int chars_shared = 0;
+        while(key[chars_shared] && key[chars_shared] == suffix[chars_shared])
+            chars_shared++;
+
+        /* Case 2 or 3, split this sucker! */
+        if(chars_shared < strlen(suffix)) {
+            Trie newtrie=NULL;
+            unsigned char *new_suffix1=NULL, *new_suffix2=NULL;
+
+            if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1)))
+                goto split_memerror;
+            strncpy(new_suffix1, key, chars_shared);
+            new_suffix1[chars_shared] = 0;
+            if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared)))
+                goto split_memerror;
+            if(!(newtrie = Trie_new()))
+                goto split_memerror;
+            if(!(newtrie->transitions =
+                 (Transition)malloc(sizeof(struct _Transition))))
+                goto split_memerror;
+            newtrie->num_transitions = 1;
+            newtrie->transitions[0].next = transition->next;
+            newtrie->transitions[0].suffix = new_suffix2;
+
+            free(transition->suffix);
+            transition->suffix = new_suffix1;
+            transition->next = newtrie;
+
+            if(0) {
+            split_memerror:
+                if(newtrie && newtrie->transitions) free(newtrie->transitions);
+                if(newtrie) free(newtrie);
+                if(new_suffix2) free(new_suffix2);
+                if(new_suffix1) free(new_suffix1);
+                return 1;
+            }
+        }
+        retval = Trie_set(transition->next, key+chars_shared, value);
+    }
+
+    return retval;
+}
+
+void Trie_del(Trie trie) {
+    int i;
+    if(!trie)
+        return;
+    for(i=0; i<trie->num_transitions; i++) {
+        Transition transition = &trie->transitions[i];
+        if(transition->suffix)
+            free(transition->suffix);
+        Trie_del(transition->next);
+    }
+    free(trie);
+}
+
+void *Trie_get(const Trie trie, const unsigned char *key) {
+    int first, last, mid;
+
+    if(!key[0]) {
+        return trie->value;
+    }
+
+    /* The transitions are stored in alphabetical order. Do a binary
+     * search to find the proper one.
+     */
+    first = 0;
+    last = trie->num_transitions-1;
+    while(first <= last) {
+        Transition transition;
+        unsigned char *suffix;
+        int c;
+        mid = (first+last)/2;
+        transition = &trie->transitions[mid];
+        suffix = transition->suffix;
+        /* If suffix is a substring of key, then get the value from
+           the next trie.
+        */
+        c = strncmp(key, suffix, strlen(suffix));
+        if(c < 0)
+            last = mid-1;
+        else if(c > 0)
+            first = mid+1;
+        else
+            return Trie_get(transition->next, key+strlen(suffix));
+    }
+    return NULL;
+}
+
+
+/* Mutually recursive, so need to make a forward declaration.
*/ +void +_get_approximate_trie(const Trie trie, const unsigned char *key, const int k, + void (*callback)(const unsigned char *key, + const void *value, + const int mismatches, + void *data), + void *data, + const int mismatches, + unsigned char *current_key, const int max_key + ); + +void +_get_approximate_transition(const unsigned char *key, + const int k, + const Transition transition, + const unsigned char *suffix, + void (*callback)(const unsigned char *key, + const void *value, + const int mismatches, + void *data), + void *data, + const int mismatches, + unsigned char *current_key, const int max_key + ) +{ + int i; + int prev_keylen = strlen(current_key); + + /* Short circuit optimization. If there's too many characters to + possibly be a match, then don't even try to match things. */ + if((int)(strlen(suffix) - strlen(key)) > k) + return; + + /* Match as many characters as possible. */ + i = 0; + while(suffix[i] && (key[i] == suffix[i])) { + i++; + } + /* Check to make sure the key is not too long. BUG: If it is, + fails silently. */ + if((prev_keylen+i) >= max_key) + return; + strncat(current_key, suffix, i); + + /* If all the letters in the suffix matched, then move to the + next trie. */ + if(!suffix[i]) { + _get_approximate_trie(transition->next, &key[i], k, callback, data, + mismatches, current_key, max_key); + } + /* Otherwise, try out different kinds of mismatches. */ + else if(k) { + int new_keylen = prev_keylen+i; + + /* Letter replacement, skip the next letter in both the key and + suffix. */ + if((new_keylen+1 < max_key) && key[i] && suffix[i]) { + current_key[new_keylen] = suffix[i]; + current_key[new_keylen+1] = 0; + _get_approximate_transition(&key[i+1], k-1, + transition, &suffix[i+1], + callback, data, + mismatches+1, current_key, max_key); + current_key[new_keylen] = 0; + } + + /* Insertion in key, skip the next letter in the key. */ + if(key[i]) { + _get_approximate_transition(&key[i+1], k-1, + transition, &suffix[i], + callback, data, + mismatches+1, current_key, max_key); + } + + /* Deletion from key, skip the next letter in the suffix. */ + if((new_keylen+1 < max_key) && suffix[i]) { + current_key[new_keylen] = suffix[i]; + current_key[new_keylen+1] = 0; + _get_approximate_transition(&key[i], k-1, + transition, &suffix[i+1], + callback, data, + mismatches+1, current_key, max_key); + current_key[new_keylen] = 0; + } + } + current_key[prev_keylen] = 0; +} + +void +_get_approximate_trie(const Trie trie, const unsigned char *key, const int k, + void (*callback)(const unsigned char *key, + const void *value, + const int mismatches, + void *data), + void *data, + const int mismatches, + unsigned char *current_key, const int max_key + ) +{ + int i; + + /* If there's no more key to match, then I'm done. */ + if(!key[0]) { + if(trie->value) + (*callback)(current_key, trie->value, mismatches, data); + } + /* If there are no more mismatches allowed, then fall back to the + faster Trie_get. */ + else if(!k) { + void *value = Trie_get(trie, key); + if(value) { + int l = strlen(current_key); + /* Make sure I have enough space for the full key. */ + if(l + strlen(key) < max_key) { + strcat(current_key, key); + (*callback)(current_key, value, mismatches, data); + current_key[l] = 0; + } + /* BUG: Ran out of space for the key. This fails + silently, but should signal an error. */ + } + } + /* If there are no more transitions, then all the characters left + in the key are mismatches. 
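+       (Editorial example: with only the key "hell" stored, looking up
+       "hello" with k == 1 lands here at the leaf for "hell" with "o"
+       left over, and reports ("hell", value, mismatches+1).)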
+    */
+    else if(!trie->num_transitions) {
+        if(trie->value && (strlen(key) <= k)) {
+            (*callback)(current_key, trie->value,
+                        mismatches+strlen(key), data);
+        }
+    }
+    /* Otherwise, try to match each of the transitions. */
+    else {
+        for(i=0; i<trie->num_transitions; i++) {
+            Transition transition = &trie->transitions[i];
+            unsigned char *suffix = transition->suffix;
+            _get_approximate_transition(key, k, transition, suffix,
+                                        callback, data,
+                                        mismatches, current_key, max_key);
+        }
+    }
+
+}
+
+
+void
+Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
+                     void (*callback)(const unsigned char *key,
+                                      const void *value,
+                                      const int mismatches,
+                                      void *data),
+                     void *data
+                     )
+{
+    KEY[0] = 0;
+    _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH);
+}
+
+int Trie_len(const Trie trie)
+{
+    int i;
+    int length = 0;
+
+    if(!trie)
+        return 0;
+    if(trie->value)
+        length += 1;
+    for(i=0; i<trie->num_transitions; i++) {
+        length += Trie_len(trie->transitions[i].next);
+    }
+    return length;
+}
+
+int Trie_has_key(const Trie trie, const unsigned char *key)
+{
+    return Trie_get(trie, key) != NULL;
+}
+
+int Trie_has_prefix(const Trie trie, const unsigned char *prefix)
+{
+    int first, last, mid;
+
+    if(!prefix[0]) {
+        return 1;
+    }
+
+    /* The transitions are stored in alphabetical order. Do a binary
+     * search to find the proper one.
+     */
+    first = 0;
+    last = trie->num_transitions-1;
+    while(first <= last) {
+        Transition transition;
+        unsigned char *suffix;
+        int suffixlen, prefixlen, minlen;
+        int c;
+        mid = (first+last)/2;
+        transition = &trie->transitions[mid];
+        suffix = transition->suffix;
+        suffixlen = strlen(suffix);
+        prefixlen = strlen(prefix);
+        minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
+        c = strncmp(prefix, suffix, minlen);
+        if(c < 0)
+            last = mid-1;
+        else if(c > 0)
+            first = mid+1;
+        else
+            return Trie_has_prefix(transition->next, prefix+minlen);
+    }
+    return 0;
+}
+
+static void
+_iterate_helper(const Trie trie,
+                void (*callback)(const unsigned char *key,
+                                 const void *value,
+                                 void *data),
+                void *data,
+                unsigned char *current_key, const int max_key)
+{
+    int i;
+    if(trie->value)
+        (*callback)(current_key, trie->value, data);
+    for(i=0; i<trie->num_transitions; i++) {
+        Transition transition = &trie->transitions[i];
+        unsigned char *suffix = transition->suffix;
+        int keylen = strlen(current_key);
+
+        if(keylen + strlen(suffix) >= max_key) {
+            /* BUG: This will fail silently. It should raise some
+               sort of error. */
+            continue;
+        }
+        strcat(current_key, suffix);
+        _iterate_helper(transition->next, callback, data,
+                        current_key, max_key);
+        current_key[keylen] = 0;
+    }
+}
+
+void
+Trie_iterate(const Trie trie,
+             void (*callback)(const unsigned char *key,
+                              const void *value,
+                              void *data),
+             void *data)
+{
+    KEY[0] = 0;
+    _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH);
+}
+
+static void
+_with_prefix_helper(const Trie trie, const unsigned char *prefix,
+                    void (*callback)(const unsigned char *key,
+                                     const void *value,
+                                     void *data),
+                    void *data,
+                    unsigned char *current_key, const int max_key)
+{
+    int first, last, mid;
+
+    if(!prefix[0]) {
+        _iterate_helper(trie, callback, data, current_key, max_key);
+        return;
+    }
+
+    /* The transitions are stored in alphabetical order. Do a binary
+     * search to find the proper one.
+ */ + first = 0; + last = trie->num_transitions-1; + while(first <= last) { + Transition transition; + unsigned char *suffix; + int suffixlen, prefixlen, minlen; + int c; + mid = (first+last)/2; + transition = &trie->transitions[mid]; + suffix = transition->suffix; + suffixlen = strlen(suffix); + prefixlen = strlen(prefix); + minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen; + c = strncmp(prefix, suffix, minlen); + if(c < 0) + last = mid-1; + else if(c > 0) + first = mid+1; + else { + int keylen = strlen(current_key); + if(keylen + minlen >= max_key) { + /* BUG: This will fail silently. It should raise some + sort of error. */ + break; + } + strncat(current_key, suffix, minlen); + _with_prefix_helper(transition->next, prefix+minlen, + callback, data, current_key, max_key); + current_key[keylen] = 0; + break; + } + } +} + +void +Trie_with_prefix(const Trie trie, const unsigned char *prefix, + void (*callback)(const unsigned char *key, + const void *value, + void *data), + void *data + ) +{ + KEY[0] = 0; + _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH); +} + + + +/* Need to declare _serialize_transition here so it can be called from + _serialize_trie. */ +int _serialize_transition(const Transition transition, + int (*write)(const void *towrite, const int length, + void *data), + int (*write_value)(const void *value, void *data), + void *data); + +/* This library also provides code for flattening tries so that they + * can be saved and read back in later. The format of a serialized + * trie is: + * TYPE NBYTES DESCRIPTION + * byte 1 Whether or not there is a value + * variable variable If there is a value, let the client store it. + * byte 1 Number of transitions for this Trie. + * transition variable + * int 4 Number of characters in the suffix. + * suffix variable the suffix for this transition + * byte 1 Whether or not there is a trie + * trie variable Recursively points to another trie. + * + * The number of bytes and the endian may vary from platform to + * platform. 
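+ *
+ * (Editorial illustration, not part of the original file: a trie
+ * holding just the key "hi" would serialize roughly as
+ *     root:  0x00                   no value
+ *            0x01                   one transition
+ *            0x02 0x00 0x00 0x00    suffix length 2 (assuming a
+ *                                   little-endian, 4-byte int)
+ *            'h' 'i'                the suffix
+ *            0x01                   a child trie follows
+ *     child: 0x01                   has a value
+ *            ...                    the value, as written by write_value
+ *            0x00                   no transitions
+ * matching the layout table above.)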
+ */
+
+int _serialize_trie(const Trie trie,
+                    int (*write)(const void *towrite, const int length,
+                                 void *data),
+                    int (*write_value)(const void *value, void *data),
+                    void *data)
+{
+    int i;
+    unsigned char has_value;
+
+    has_value = (trie->value != NULL);
+    if(!(*write)(&has_value, sizeof(has_value), data))
+        return 0;
+    if(has_value) {
+        if(!(*write_value)(trie->value, data))
+            return 0;
+    }
+
+    if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+        return 0;
+    for(i=0; i<trie->num_transitions; i++) {
+        if(!_serialize_transition(&trie->transitions[i],
+                                  write, write_value, data))
+            return 0;
+    }
+
+    return 1;
+}
+
+int _serialize_transition(const Transition transition,
+                          int (*write)(const void *towrite, const int length,
+                                       void *data),
+                          int (*write_value)(const void *value, void *data),
+                          void *data)
+{
+    int suffixlen;
+    unsigned char has_trie;
+
+    suffixlen = strlen(transition->suffix);
+    if(!(*write)(&suffixlen, sizeof(suffixlen), data))
+        return 0;
+    if(!(*write)(transition->suffix, suffixlen, data))
+        return 0;
+
+    has_trie = (transition->next != NULL);
+    if(!(*write)(&has_trie, sizeof(has_trie), data))
+        return 0;
+    if(has_trie) {
+        if(!_serialize_trie(transition->next, write, write_value, data))
+            return 0;
+    }
+    return 1;
+}
+
+int Trie_serialize(const Trie trie,
+                   int (*write)(const void *towrite, const int length,
+                                void *data),
+                   int (*write_value)(const void *value, void *data),
+                   void *data)
+{
+    int success = _serialize_trie(trie, write, write_value, data);
+    (*write)(NULL, 0, data);
+    return success;
+}
+
+int _deserialize_transition(Transition transition,
+                            int (*read)(void *wasread, const int length,
+                                        void *data),
+                            void *(*read_value)(void *data),
+                            void *data);
+
+int _deserialize_trie(Trie trie,
+                      int (*read)(void *wasread, const int length, void *data),
+                      void *(*read_value)(void *data),
+                      void *data)
+{
+    int i;
+    unsigned char has_value;
+
+    if(!(*read)(&has_value, sizeof(has_value), data))
+        goto _deserialize_trie_error;
+    if(has_value != 0 && has_value != 1)
+        goto _deserialize_trie_error;
+    if(has_value) {
+        if(!(trie->value = (*read_value)(data)))
+            goto _deserialize_trie_error;
+    }
+    if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+        goto _deserialize_trie_error;
+    if(!(trie->transitions =
+         malloc(trie->num_transitions*sizeof(struct _Transition))))
+        goto _deserialize_trie_error;
+    for(i=0; i<trie->num_transitions; i++) {
+        if(!_deserialize_transition(&trie->transitions[i],
+                                    read, read_value, data))
+            goto _deserialize_trie_error;
+    }
+    return 1;
+
+ _deserialize_trie_error:
+    trie->num_transitions = 0;
+    if(trie->transitions) {
+        free(trie->transitions);
+        trie->transitions = NULL;
+    }
+    trie->value = NULL;
+    return 0;
+}
+
+int _deserialize_transition(Transition transition,
+                            int (*read)(void *wasread, const int length,
+                                        void *data),
+                            void *(*read_value)(void *data),
+                            void *data)
+{
+    int suffixlen;
+    unsigned char has_trie;
+
+    if(!(*read)(&suffixlen, sizeof(suffixlen), data))
+        goto _deserialize_transition_error;
+    if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH)
+        goto _deserialize_transition_error;
+    if(!(*read)(KEY, suffixlen, data))
+        goto _deserialize_transition_error;
+    KEY[suffixlen] = 0;
+    if(!(transition->suffix = (unsigned char *)strdup(KEY)))
+        goto _deserialize_transition_error;
+    if(!(*read)(&has_trie, sizeof(has_trie), data))
+        goto _deserialize_transition_error;
+    if(has_trie != 0 && has_trie != 1)
+        goto _deserialize_transition_error;
+    if(has_trie) {
+        transition->next = Trie_new();
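+        /* (Editorial note: a NULL return from Trie_new() is not checked
+           here, so a failed allocation would be dereferenced by the
+           recursive call below.) */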
if(!_deserialize_trie(transition->next, read, read_value, data)) + goto _deserialize_transition_error; + } + return 1; + + _deserialize_transition_error: + if(transition->suffix) { + free(transition->suffix); + transition->suffix = NULL; + } + if(transition->next) { + Trie_del(transition->next); + transition->next = NULL; + } + return 0; +} + +Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data), + void *(*read_value)(void *data), + void *data) +{ + Trie trie = Trie_new(); + if(!_deserialize_trie(trie, read, read_value, data)) { + Trie_del(trie); + return NULL; + } + return trie; +} + +void test(void) { + Trie trie; + + printf("Hello world!\n"); + + trie = Trie_new(); + printf("New trie %p\n", trie); + Trie_set(trie, "hello world", "s1"); + Trie_set(trie, "bye", "s2"); + Trie_set(trie, "hell sucks", "s3"); + Trie_set(trie, "hebee", "s4"); + + printf("%s\n", (char *)Trie_get(trie, "hello world")); + printf("%s\n", (char *)Trie_get(trie, "bye")); + printf("%s\n", (char *)Trie_get(trie, "hell sucks")); + printf("%s\n", (char *)Trie_get(trie, "hebee")); + + Trie_set(trie, "blah", "s5"); + printf("%s\n", (char *)Trie_get(trie, "blah")); + + printf("%p\n", Trie_get(trie, "foobar")); + printf("%d\n", Trie_len(trie)); + + Trie_set(trie, "blah", "snew"); + printf("%s\n", (char *)Trie_get(trie, "blah")); + + Trie_del(trie); +} + +#if 0 +int main() { + test(); +} +#endif diff --git a/binaries/src/globplot/biopython-1.50/Bio/trie.h b/binaries/src/globplot/biopython-1.50/Bio/trie.h new file mode 100644 index 0000000..eb85549 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/trie.h @@ -0,0 +1,129 @@ +typedef struct _Trie *Trie; + + + +/* Trie_new + * -------- + * Create a new trie. Return a Trie structure, which is an abstract + * data structure. The client should not have to know about the + * details of this structure. When finished, each Trie should be + * freed with Trie_del. + */ +Trie Trie_new(void); + + +/* Trie_del + * -------- + * Free a Trie data structure. + */ +void Trie_del(Trie trie); + + +/* Trie_set + * -------- + * Set a string in the Trie to some value. Returns a 0 if the + * function succeeded. + */ +int Trie_set(Trie trie, const unsigned char *key, const void *value); + +/* Trie_get + * -------- + * Lookup whether a key exists in the Trie. Returns the value that + * was previous set in the Trie, or NULL if it doesn't exist. + */ +void *Trie_get(const Trie trie, const unsigned char *key); + + +/* Trie_get_approximate + * -------------------- + * Lookup whether a key exists in the Trie, allowing for mismatches to + * the dictionary. Passes back values using a callback function. + */ +void +Trie_get_approximate(const Trie trie, const unsigned char *key, const int k, + void (*callback)(const unsigned char *key, + const void *value, + const int mismatches, + void *data), + void *data + ); + +/* Trie_len + * -------- + * Return the number of strings in the trie. + */ +int Trie_len(const Trie trie); + + +/* Trie_has_key + * ------------ + * Return whether a key exists in the trie. + */ +int Trie_has_key(const Trie trie, const unsigned char *key); + + +/* Trie_has_prefix + * --------------- + * Return whether a string is a prefix of a key in the trie. + */ +int Trie_has_prefix(const Trie trie, const unsigned char *prefix); + + +/* Trie_with_prefix + * ---------------- + * Iterate over all the keys in the trie that start with a prefix. 
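+ *
+ * (Editorial usage sketch, not part of the original header:
+ *
+ *     static void print_key(const unsigned char *key,
+ *                           const void *value, void *data)
+ *     {
+ *         printf("%s\n", key);
+ *     }
+ *
+ *     Trie_with_prefix(trie, (const unsigned char *)"he",
+ *                      print_key, NULL);
+ *
+ * prints every stored key beginning with "he".)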
+ */ +void Trie_with_prefix(const Trie trie, const unsigned char *prefix, + void (*callback)(const unsigned char *key, + const void *value, + void *data), + void *data + ); + + +/* Trie_iterate + * ------------ + * Iterate through everything stored in the trie. callback is a + * function that gets called for each thing in the trie. It is called + * in arbitrary order. data is a pointer to some arbitrary data and + * gets passed unchanged to the callback. + */ +void Trie_iterate(const Trie trie, + void (*callback)(const unsigned char *key, + const void *value, + void *data), + void *data + ); + +/* Trie_serialize + * -------------- + * Serialize a tree into a stream of bytes. This function takes a + * callback 'write' that should take a pointer to data and the length + * of the data in bytes. This will be called repeatedly until the + * whole Trie is serialized. When it is done, this function will call + * 'write' with a length of 0. Since the values are handled by the + * client, this function also takes a callback function 'write_value' + * so that the client can serialize their own values. + * + * This function is platform-dependent, so byte streams created on one + * machine may not necessarily port to another. + */ +int Trie_serialize(const Trie trie, + int (*write)(const void *towrite, const int length, + void *data), + int (*write_value)(const void *value, void *data), + void *data); + + + +/* Trie_deserialize + * ---------------- + * Deserialize a tree that was previously serialized with + * Trie_serialize. This function takes a callback 'read' that should + * read 'length' bytes and save it to 'wasread'. 'read_value' should + * read a value and return a pointer to it. 'data' is a pointer that + * will be passed unchanged to 'read' and 'read_value'. + */ +Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data), + void *(*read_value)(void *data), + void *data); diff --git a/binaries/src/globplot/biopython-1.50/Bio/triefind.py b/binaries/src/globplot/biopython-1.50/Bio/triefind.py new file mode 100644 index 0000000..da862e5 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/triefind.py @@ -0,0 +1,92 @@ +""" +Given a trie, find all occurrences of a word in the trie in a string. + +Like searching a string for a substring, except that the substring is +any word in a trie. + +Functions: +match Find longest key in a trie matching the beginning of the string. +match_all Find all keys in a trie matching the beginning of the string. +find Find keys in a trie matching anywhere in a string. +find_words Find keys in a trie matching whole words in a string. + +""" +import string +import re + +def match(string, trie): + """match(string, trie) -> longest key or None + + Find the longest key in the trie that matches the beginning of the + string. + + """ + longest = None + for i in range(len(string)): + substr = string[:i+1] + if not trie.has_prefix(substr): + break + if trie.has_key(substr): + longest = substr + return longest + +def match_all(string, trie): + """match_all(string, trie) -> list of keys + + Find all the keys in the trie that matches the beginning of the + string. + + """ + matches = [] + for i in range(len(string)): + substr = string[:i+1] + if not trie.has_prefix(substr): + break + if trie.has_key(substr): + matches.append(substr) + return matches + +def find(string, trie): + """find(string, trie) -> list of tuples (key, start, end) + + Find all the keys in the trie that match anywhere in the string. 
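+
+    Editorial example (hypothetical, assuming a trie holding the keys
+    "he" and "hello"):
+
+        >>> find("why hello world", trie)
+        [('he', 4, 6), ('hello', 4, 9)]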
+
+    """
+    results = []
+    start = 0 # index to start the search
+    while start < len(string):
+        # Look for a match.
+        keys = match_all(string[start:], trie)
+        for key in keys:
+            results.append((key, start, start+len(key)))
+        start += 1
+    return results
+
+DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
+
+def find_words(string, trie):
+    """find_words(string, trie) -> list of tuples (key, start, end)
+
+    Find all the keys in the trie that match full words in the string.
+    Word boundaries are defined as any punctuation or whitespace.
+
+    """
+    _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
+
+    results = []
+    start = 0 # index of word boundary
+    while start < len(string):
+        # Look for a match.
+        keys = match_all(string[start:], trie)
+        for key in keys:
+            l = len(key)
+            # Make sure it ends at a boundary.
+            if start+l == len(string) or \
+               _boundary_re.match(string[start+l]):
+                results.append((key, start, start+l))
+        # Move forward to the next boundary.
+        m = _boundary_re.search(string, start)
+        if m is None:
+            break
+        start = m.end()
+    return results
diff --git a/binaries/src/globplot/biopython-1.50/Bio/triemodule.c b/binaries/src/globplot/biopython-1.50/Bio/triemodule.c
new file mode 100644
index 0000000..a33214b
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/Bio/triemodule.c
@@ -0,0 +1,661 @@
+#include <Python.h>
+#include <marshal.h>
+#include "trie.h"
+
+#if PY_VERSION_HEX < 0x02050000
+#define Py_ssize_t int
+#endif
+
+
+
+staticforward PyTypeObject Trie_Type;
+
+typedef struct {
+    PyObject_HEAD
+    Trie trie;
+} trieobject;
+
+static PyObject*
+trie_trie(PyObject* self, PyObject* args)
+{
+    trieobject* trieobj;
+    Trie trie;
+
+    if (!PyArg_ParseTuple(args,":trie"))
+        return NULL;
+    if(!(trie = Trie_new()))
+        return PyErr_NoMemory();
+    if(!(trieobj = PyObject_New(trieobject, &Trie_Type)))
+        return NULL;
+    trieobj->trie = trie;
+    return (PyObject*)trieobj;
+}
+
+static void
+_decref_objects(const unsigned char *key, const void *value, void *data)
+{
+    Py_DECREF((PyObject *)value);
+}
+
+static void
+trie_dealloc(PyObject* self)
+{
+    trieobject *mp = (trieobject *)self;
+    Trie_iterate(mp->trie, _decref_objects, NULL);
+    Trie_del(mp->trie);
+    PyObject_Del(self);
+}
+
+static Py_ssize_t
+trie_length(trieobject *mp)
+{
+    return Trie_len(mp->trie);
+}
+
+static PyObject *
+trie_subscript(trieobject *mp, PyObject *py_key)
+{
+    unsigned char *key;
+    PyObject *py_value;
+
+    /* Make sure key is a string. */
+    if(!PyString_Check(py_key)) {
+        PyErr_SetString(PyExc_TypeError, "key must be a string");
+        return NULL;
+    }
+    key = (unsigned char *)PyString_AS_STRING(py_key);
+    py_value = (PyObject *)Trie_get(mp->trie, key);
+    if(py_value == NULL)
+        PyErr_SetString(PyExc_KeyError, (char *)key);
+    else
+        Py_INCREF(py_value);
+    return py_value;
+}
+
+static int
+trie_ass_sub(trieobject *mp, PyObject *py_key, PyObject *py_value)
+{
+    unsigned char *key;
+    PyObject *py_prev;
+
+    /* Make sure key is a string. */
+    if(!PyString_Check(py_key)) {
+        PyErr_SetString(PyExc_TypeError, "key must be a string");
+        return -1;
+    }
+    key = (unsigned char *)PyString_AS_STRING((char *)py_key);
+
+    /* Check to see whether something already exists at that key. If
+       there's already an object there, then I will have to remove it.
+    */
+    py_prev = (PyObject *)Trie_get(mp->trie, key);
+    if(py_prev) {
+        Py_DECREF(py_prev);
+    }
+
+    /* The client wants to delete a key from a dictionary. The Trie
+       API doesn't support this, so I will just overwrite it with
+       NULL.
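+       (Editorial note: from Python this path is `del d[key]`; the node
+       keeps its slot but its value becomes NULL, so the key stops
+       showing up in keys() and len(d).)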
*/ + if(!py_value) { + /* If the key doesn't exist, raise a KeyError. */ + if(!py_prev) { + PyErr_SetString(PyExc_KeyError, (char *)key); + return -1; + } + Trie_set(mp->trie, key, NULL); + } + /* The client wants to set a key in the dictionary. */ + else { + Py_INCREF(py_value); + if(Trie_set(mp->trie, key, py_value)) { + PyErr_SetString(PyExc_AssertionError, "error setting trie"); + return -1; + } + } + return 0; +} + +static char has_key__doc__[] = +"D.has_key(k) -> 1 if D has a key k, else 0"; + +static PyObject * +trie_has_key(trieobject *mp, PyObject *py_key) +{ + unsigned char *key; + int has_key; + + /* Make sure key is a string. */ + if(!PyString_Check(py_key)) { + PyErr_SetString(PyExc_TypeError, "key must be a string"); + return NULL; + } + key = (unsigned char *)PyString_AS_STRING(py_key); + has_key = Trie_has_key(mp->trie, key); + return PyInt_FromLong((long)has_key); +} + +static PyObject * +trie_has_key_onearg(trieobject *mp, PyObject *py_args) +{ + PyObject *py_arg; + if(!PyArg_ParseTuple(py_args, "O", &py_arg)) + return NULL; + return trie_has_key(mp, py_arg); +} + + + +static char has_prefix__doc__[] = +"D.has_prefix(k) -> 1 if D has a prefix k, else 0"; + +static PyObject * +trie_has_prefix(trieobject *mp, PyObject *py_prefix) +{ + unsigned char *prefix; + int has_prefix; + + /* Make sure prefix is a string. */ + if(!PyString_Check(py_prefix)) { + PyErr_SetString(PyExc_TypeError, "k must be a string"); + return NULL; + } + prefix = (unsigned char *)PyString_AS_STRING(py_prefix); + has_prefix = Trie_has_prefix(mp->trie, prefix); + return PyInt_FromLong((long)has_prefix); +} + +static PyObject * +trie_has_prefix_onearg(trieobject *mp, PyObject *py_args) +{ + PyObject *py_arg; + if(!PyArg_ParseTuple(py_args, "O", &py_arg)) + return NULL; + return trie_has_prefix(mp, py_arg); +} + +static char with_prefix__doc__[] = +"D.with_prefix(prefix) -> list of D's keys that begins with prefix"; + +static void +_trie_with_prefix_helper(const unsigned char *key, const void *value, + void *data) +{ + PyObject *py_list = (PyObject *)data; + PyObject *py_key; + + if(PyErr_Occurred()) + return; + + if(!(py_key = PyString_FromString((const char *)key))) + return; + PyList_Append(py_list, py_key); + Py_DECREF(py_key); +} + +static PyObject * +trie_with_prefix(trieobject *mp, PyObject *py_prefix) +{ + unsigned char *prefix; + PyObject *py_list; + + /* Make sure prefix is a string. 
*/ + if(!PyString_Check(py_prefix)) { + PyErr_SetString(PyExc_TypeError, "k must be a string"); + return NULL; + } + prefix = (unsigned char *)PyString_AS_STRING(py_prefix); + + if(!(py_list = PyList_New(0))) + return NULL; + Trie_with_prefix(mp->trie, prefix, + _trie_with_prefix_helper, (void *)py_list); + if(PyErr_Occurred()) { + Py_DECREF(py_list); + return NULL; + } + return py_list; +} + +static PyObject * +trie_with_prefix_onearg(trieobject *mp, PyObject *py_args) +{ + PyObject *py_arg; + if(!PyArg_ParseTuple(py_args, "O", &py_arg)) + return NULL; + return trie_with_prefix(mp, py_arg); +} + + +static char keys__doc__[] = +"D.keys() -> list of D's keys"; + +static void +_trie_keys_helper(const unsigned char *key, const void *value, void *data) +{ + PyObject *py_list = (PyObject *)data; + PyObject *py_key; + + if(PyErr_Occurred()) + return; + + if(!(py_key = PyString_FromString((char *)key))) + return; + PyList_Append(py_list, py_key); + Py_DECREF(py_key); +} + +static PyObject * +trie_keys(trieobject *mp) +{ + PyObject *py_list; + + if(!(py_list = PyList_New(0))) + return NULL; + Trie_iterate(mp->trie, _trie_keys_helper, (void *)py_list); + if(PyErr_Occurred()) { + Py_DECREF(py_list); + return NULL; + } + return py_list; +} + +static PyObject * +trie_keys_noargs(trieobject *mp, PyObject *py_args) +{ + if(PyTuple_Size(py_args) != 0) { + PyErr_SetString(PyExc_ValueError, "no args expected"); + return NULL; + } + return trie_keys(mp); +} + +static char values__doc__[] = +"D.values() -> list of D's values"; + +static void +_trie_values_helper(const unsigned char *key, const void *value, void *data) +{ + PyObject *py_list = (PyObject *)data; + if(PyErr_Occurred()) + return; + PyList_Append(py_list, (PyObject *)value); +} + +static PyObject * +trie_values(trieobject *mp) +{ + PyObject *py_list; + + if(!(py_list = PyList_New(0))) + return NULL; + Trie_iterate(mp->trie, _trie_values_helper, (void *)py_list); + if(PyErr_Occurred()) { + Py_DECREF(py_list); + return NULL; + } + return py_list; +} + +static PyObject * +trie_values_noargs(trieobject *mp, PyObject *py_args) +{ + if(PyTuple_Size(py_args) != 0) { + PyErr_SetString(PyExc_ValueError, "no args expected"); + return NULL; + } + return trie_values(mp); +} + +static char get__doc__[] = +"D.get(k[,d]) -> D[k] if D.has_key(k), else d. d defaults to None."; + +static PyObject * +trie_get(trieobject *mp, PyObject *args) +{ + unsigned char *key; + PyObject *py_value; + PyObject *py_failobj = Py_None; + + if (!PyArg_ParseTuple(args, "s|O:get", &key, &py_failobj)) + return NULL; + py_value = (PyObject *)Trie_get(mp->trie, key); + if(!py_value) + py_value = py_failobj; + Py_INCREF(py_value); + return py_value; +} + +static char get_approximate__doc__[] = +"D.get_approximate(key, k) -> List of (key, value, mismatches) in D, allowing up to k mismatches in key."; + +void +_trie_get_approximate_helper(const unsigned char *key, const void *value, + const int mismatches, void *data) +{ + /* Append a tuple of (key, value) to data, which is a PyList. 
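+       (Editorial precision: the tuple appended below is actually the
+       triple (key, value, mismatches).) */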
*/ + PyObject *py_list = (PyObject *)data, + *py_value = (PyObject *)value, + *py_key, + *py_tuple, + *py_mismatches; + + if(PyErr_Occurred()) + return; + + if(!(py_key = PyString_FromString((const char *)key))) + return; + if(!(py_mismatches = PyInt_FromLong(mismatches))) { + Py_DECREF(py_key); + return; + } + Py_INCREF(py_value); + + if(!(py_tuple = PyTuple_New(3))) { + Py_DECREF(py_key); + Py_DECREF(py_mismatches); + Py_DECREF(py_value); + return; + } + PyTuple_SetItem(py_tuple, 0, py_key); + PyTuple_SetItem(py_tuple, 1, py_value); + PyTuple_SetItem(py_tuple, 2, py_mismatches); + PyList_Append(py_list, py_tuple); + Py_DECREF(py_tuple); +} + +static PyObject * +trie_get_approximate(trieobject *mp, PyObject *args) +{ + unsigned char *key; + int k; + PyObject *py_list; + + if (!PyArg_ParseTuple(args, "si:get_approximate", &key, &k)) + return NULL; + + if(!(py_list = PyList_New(0))) + return NULL; + Trie_get_approximate(mp->trie, key, k, + _trie_get_approximate_helper, (void *)py_list); + if(PyErr_Occurred()) { + Py_DECREF(py_list); + return NULL; + } + return py_list; +} + +static long +trie_nohash(PyObject *self) +{ + PyErr_SetString(PyExc_TypeError, "trie objects are unhashable"); + return -1; +} + +static PyMappingMethods trie_as_mapping = { +/* The first member of PyMappingMethods was redefined in Python 2.5. */ +#if PY_VERSION_HEX < 0x02050000 + (inquiry)trie_length, /*mp_length*/ +#else + (lenfunc)trie_length, /*mp_length*/ +#endif + (binaryfunc)trie_subscript, /*mp_subscript*/ + (objobjargproc)trie_ass_sub /*mp_ass_subscript*/ +}; + +static PyMethodDef trieobj_methods[] = { + /* METH_O and METH_NOARGS require Python 2.2. + {"has_key", (PyCFunction)trie_has_key, METH_O, + has_key__doc__}, + {"has_prefix", (PyCFunction)trie_has_prefix, METH_O, + has_prefix__doc__}, + {"with_prefix", (PyCFunction)trie_with_prefix, METH_O, + with_prefix__doc__}, + {"keys", (PyCFunction)trie_keys, METH_NOARGS, + keys__doc__}, + {"values", (PyCFunction)trie_values, METH_NOARGS, + values__doc__}, + */ + + {"has_key", (PyCFunction)trie_has_key_onearg, METH_VARARGS, + has_key__doc__}, + {"has_prefix", (PyCFunction)trie_has_prefix_onearg, METH_VARARGS, + has_prefix__doc__}, + {"with_prefix", (PyCFunction)trie_with_prefix_onearg, METH_VARARGS, + with_prefix__doc__}, + {"keys", (PyCFunction)trie_keys_noargs, METH_VARARGS, + keys__doc__}, + {"values", (PyCFunction)trie_values_noargs, METH_VARARGS, + values__doc__}, + + {"get", (PyCFunction)trie_get, METH_VARARGS, + get__doc__}, + {"get_approximate", (PyCFunction)trie_get_approximate, METH_VARARGS, + get_approximate__doc__}, + {NULL, NULL} /* sentinel */ +}; + +static PyObject *trie_getattr(PyObject *obj, char *name) +{ + return Py_FindMethod(trieobj_methods, (PyObject *)obj, name); + +} + +static PyTypeObject Trie_Type = { + PyObject_HEAD_INIT(NULL) + 0, + "trie", + sizeof(trieobject), + 0, + trie_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + trie_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + &trie_as_mapping, /*tp_as_mapping*/ + trie_nohash, /*tp_hash */ +}; + +static int +_write_to_handle(const void *towrite, const int length, void *handle) +{ + PyObject *py_handle = (PyObject *)handle, + *py_retval = NULL; + int success = 0; + + if(!length) + return 1; + + if(!(py_retval = PyObject_CallMethod(py_handle, "write", "s#", + towrite, length))) + goto _write_to_handle_cleanup; + success = 1; + + _write_to_handle_cleanup: + if(py_retval) { + Py_DECREF(py_retval); + } + return success; 
+} + +int _write_value_to_handle(const void *value, void *handle) +{ + PyObject *py_value = (PyObject *)value, + *py_marshalled = NULL; + char *marshalled; + Py_ssize_t length; + int success = 0; + +#ifdef Py_MARSHAL_VERSION + if(!(py_marshalled = + PyMarshal_WriteObjectToString(py_value, Py_MARSHAL_VERSION))) + goto _write_value_to_handle_cleanup; +#else + if(!(py_marshalled = PyMarshal_WriteObjectToString(py_value))) + goto _write_value_to_handle_cleanup; +#endif + if(PyString_AsStringAndSize(py_marshalled, &marshalled, &length) == -1) + goto _write_value_to_handle_cleanup; + if(!_write_to_handle(&length, sizeof(length), handle)) + goto _write_value_to_handle_cleanup; + if (length != (int)length) + goto _write_value_to_handle_cleanup; + if(!_write_to_handle(marshalled, (int)length, handle)) + goto _write_value_to_handle_cleanup; + success = 1; + + _write_value_to_handle_cleanup: + if(py_marshalled) { + Py_DECREF(py_marshalled); + } + + return success; +} + +static PyObject * +trie_save(PyObject *self, PyObject *args) +{ + PyObject *py_handle, + *py_trie; + trieobject *mp; + + if(!PyArg_ParseTuple(args, "OO:save", &py_handle, &py_trie)) + return NULL; + mp = (trieobject *)py_trie; + if(!Trie_serialize(mp->trie, _write_to_handle, _write_value_to_handle, + (void *)py_handle)) { + if(!PyErr_Occurred()) + PyErr_SetString(PyExc_RuntimeError, + "saving failed for some reason"); + return NULL; + } + Py_INCREF(Py_None); + return Py_None; +} + +static int +_read_from_handle(void *wasread, const int length, void *handle) +{ + PyObject *py_handle = (PyObject *)handle, + *py_retval = NULL; + void *retval; + int success = 0; + PyBufferProcs *buffer; + int segment; + int bytes_read, bytes_left; + + if(!length) + return 1; + + if(!(py_retval = PyObject_CallMethod(py_handle, "read", "i", length))) + goto _read_from_handle_cleanup; + if(!py_retval->ob_type->tp_as_buffer) { + PyErr_SetString(PyExc_ValueError, "read method should return buffer"); + goto _read_from_handle_cleanup; + } + if(!(py_retval->ob_type->tp_flags & Py_TPFLAGS_DEFAULT)) { + PyErr_SetString(PyExc_ValueError, "no bf_getcharbuffer slot"); + goto _read_from_handle_cleanup; + } + buffer = py_retval->ob_type->tp_as_buffer; + if(!buffer->bf_getreadbuffer) { + PyErr_SetString(PyExc_ValueError, "no bf_getreadbuffer"); + goto _read_from_handle_cleanup; + } + + bytes_left = length; + segment = 0; + while(bytes_left > 0) { + if((bytes_read = buffer->bf_getreadbuffer(py_retval, + segment, &retval)) == -1) + goto _read_from_handle_cleanup; + memcpy(wasread, retval, bytes_read); + wasread = (void *)((char *)wasread + bytes_read); + bytes_left -= bytes_read; + segment += 1; + } + + success = 1; + + _read_from_handle_cleanup: + if(py_retval) { + Py_DECREF(py_retval); + } + return success; +} + +#define MAX_KEY_LENGTH 2000 +static void * +_read_value_from_handle(void *handle) +{ + Py_ssize_t length; + char KEY[MAX_KEY_LENGTH]; + + if(!_read_from_handle((void *)&length, sizeof(length), (void *)handle)) + return NULL; + if(length < 0 || length >= MAX_KEY_LENGTH) + return NULL; + if(!_read_from_handle((void *)KEY, length, (void *)handle)) + return NULL; + return PyMarshal_ReadObjectFromString(KEY, length); +} + + +static PyObject * +trie_load(PyObject *self, PyObject *args) +{ + PyObject *py_handle; + Trie trie; + trieobject *trieobj; + + if(!PyArg_ParseTuple(args, "O:load", &py_handle)) + return NULL; + + if(!(trie = Trie_deserialize(_read_from_handle, _read_value_from_handle, + py_handle))) { + if(!PyErr_Occurred()) + PyErr_SetString(PyExc_RuntimeError, 
+ "loading failed for some reason"); + return NULL; + } + + if(!(trieobj = PyObject_New(trieobject, &Trie_Type))) { + Trie_del(trie); + return NULL; + } + trieobj->trie = trie; + return (PyObject *)trieobj; +} + +static PyMethodDef trie_methods[] = { + {"trie", trie_trie, METH_VARARGS, + "trie() -> new Trie object."}, + {"load", trie_load, METH_VARARGS, + "load(handle) -> trie object"}, + {"save", trie_save, METH_VARARGS, + "save(handle, trie), save a trie object to a handle"}, + {NULL, NULL, 0, NULL} +}; + +static char trie__doc__[] = +"\ +This module implements a trie data structure. This allows an O(M)\n\ +lookup of a string in a dictionary, where M is the length of the\n\ +string. It also supports approximate matches.\n\ +\n\ +Functions:\n\ +trie Create a new trie object.\n\ +save Save a trie to a handle.\n\ +load Load a trie from a handle.\n\ +\n\ +"; + +DL_EXPORT(void) +inittrie(void) +{ + Trie_Type.ob_type = &PyType_Type; + + (void) Py_InitModule3("trie", trie_methods, trie__doc__); +} diff --git a/binaries/src/globplot/biopython-1.50/Bio/utils.py b/binaries/src/globplot/biopython-1.50/Bio/utils.py new file mode 100644 index 0000000..bf0d317 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/utils.py @@ -0,0 +1,163 @@ +# Copyright 2000 by Andrew Dalke. +# All rights reserved. +# This code is part of the Biopython distribution and governed by its +# license. Please see the LICENSE file that should have been included +# as part of this package. + +"""Miscellaneous functions for dealing with sequences (obsolete?).""" + +import Seq +import Alphabet + +from PropertyManager import default_manager + +def translate(seq, id = None): + """Translate a sequence (DEPRECATED).""" + import warnings + warnings.warn("Bio.utils.translate() has been deprecated, and we" \ + +" intend to remove it in a future release of Biopython."\ + +" Please use the translate method or function in Bio.Seq"\ + +" instead, as described in the Tutorial.", + DeprecationWarning) + if id is None: + s = "translator" + else: + s = "translator.id.%d" % id + translator = default_manager.resolve(seq.alphabet, s) + return translator.translate(seq) + +def translate_to_stop(seq, id = None): + """Translate a sequence up to the first in frame stop codon (DEPRECATED).""" + import warnings + warnings.warn("Bio.utils.translate_to_stop() has been deprecated, and we" \ + +" intend to remove it in a future release of Biopython."\ + +" Please use the translate method or function in Bio.Seq"\ + +" instead, as described in the Tutorial.", + DeprecationWarning) + if id is None: + s = "translator" + else: + s = "translator.id.%d" % id + translator = default_manager.resolve(seq.alphabet, s) + return translator.translate_to_stop(seq) + +def back_translate(seq, id = None): + """Back-translate a sequence (DEPRECATED).""" + import warnings + warnings.warn("Bio.utils.back_translate() has been deprecated, and we" \ + +" intend to remove it in a future release of Biopython."\ + +" If you use it, please tell us on the mailing list.", + DeprecationWarning) + if id is None: + s = "translator" + else: + s = "translator.id.%d" % id + translator = default_manager.resolve(seq.alphabet, s) + return translator.back_translate(seq) + + +def transcribe(seq): + """Transcribe a sequence (DEPRECATED).""" + import warnings + warnings.warn("Bio.utils.transcribe() has been deprecated, and we" \ + +" intend to remove it in a future release of Biopython."\ + +" Please use the transcribe method or function in"\ + +" Bio.Seq instead, as described in the 
Tutorial.", + DeprecationWarning) + transcriber = default_manager.resolve(seq.alphabet, "transcriber") + return transcriber.transcribe(seq) + +def back_transcribe(seq): + """Back-transcribe a sequence (DEPRECATED).""" + import warnings + warnings.warn("Bio.utils.back_transcribe() has been deprecated, and we" \ + +" intend to remove it in a future release of Biopython."\ + +" Please use the back_transcribe method or function in"\ + +" Bio.Seq instead, as described in the Tutorial.", + DeprecationWarning) + transcriber = default_manager.resolve(seq.alphabet, "transcriber") + return transcriber.back_transcribe(seq) + +def ungap(seq): + """given a sequence with gap encoding, return the ungapped sequence""" + #TODO - Fix this? It currently assumes the outmost AlphabetEncoder + #is for the gap. Consider HasStopCodon(Gapped(Protein())) as a test case. + gap = seq.gap_char + letters = [] + for c in seq.data: + if c != gap: + letters.append(c) + return Seq.Seq("".join(letters), seq.alphabet.alphabet) + +def verify_alphabet(seq): + letters = {} + for c in seq.alphabet.letters: + letters[c] = 1 + try: + for c in seq.data: + letters[c] + except KeyError: + return 0 + return 1 + +def count_monomers(seq): + dict = {} +# bugfix: string.count(s,c) raises an AttributeError. Iddo Friedberg 16 Mar. 04 +# s = buffer(seq.data) # works for strings and array.arrays + for c in seq.alphabet.letters: + dict[c] = seq.data.count(c) + return dict + +def percent_monomers(seq): + dict2 = {} + seq_len = len(seq) + dict = count_monomers(seq) + for m in dict: + dict2[m] = dict[m] * 100. / seq_len + return dict2 + +def sum(seq, table, zero = 0.0): + total = zero + for c in getattr(seq, "data", seq): + total = total + table[c] + return total + +# For ranged addition +def sum_2ple(seq, table, zero = (0.0, 0.0)): + x, y = zero + data = getattr(seq, "data", seq) + for c in data: + x2, y2 = table[c] + x = x + x2 + y = y + y2 + return (x, y) + +def total_weight(seq, weight_table = None): + if weight_table is None: + weight_table = default_manager.resolve(seq.alphabet, "weight_table") + return sum(seq, weight_table) + +def total_weight_range(seq, weight_table = None): + if weight_table is None: + weight_table = default_manager.resolve(seq.alphabet, "weight_range_table") + return sum_2ple(seq, weight_table) + +def reduce_sequence(seq, reduction_table,new_alphabet=None): + """ given an amino-acid sequence, return it in reduced alphabet form based + on the letter-translation table passed. Some "standard" tables are in + Alphabet.Reduced. + seq: a Seq.Seq type sequence + reduction_table: a dictionary whose keys are the "from" alphabet, and values + are the "to" alphabet""" + if new_alphabet is None: + new_alphabet = Alphabet.single_letter_alphabet + new_alphabet.letters = '' + for letter in reduction_table: + new_alphabet.letters += letter + new_alphabet.size = len(new_alphabet.letters) + new_seq = Seq.Seq('',new_alphabet) + for letter in seq: + new_seq += reduction_table[letter] + return new_seq + + diff --git a/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/__init__.py new file mode 100644 index 0000000..b2f2e70 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/__init__.py @@ -0,0 +1,2 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" +# This is a Python module. 
diff --git a/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/embl.py b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/embl.py new file mode 100644 index 0000000..b30b368 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/embl.py @@ -0,0 +1,85 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" +# Not clear on the distinction, if any, between 'embl' and 'embl/65'. This +# code might apply to either or both. + +# See 'http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html' for a +# definition of this file format. + +# This code only makes a best effort--the output may not be strictly valid. +# So, for example, the EMBL ID is supposed to be alphanumeric, starting with a +# letter, but we don't check for this, etc. + + +# Example: +# ID AA03518 standard; DNA; FUN; 237 BP. +# XX +# AC U03518; +# XX +# DE Aspergillus awamori internal transcribed spacer 1 (ITS1) and 18S +# DE rRNA and 5.8S rRNA genes, partial sequence. +# XX +# SQ Sequence 237 BP; 41 A; 77 C; 67 G; 52 T; 0 other; +# aacctgcgga aggatcatta ccgagtgcgg gtcctttggg cccaacctcc catccgtgtc 60 +# tattgtaccc tgttgcttcg gcgggcccgc cgcttgtcgg ccgccggggg ggcgcctctg 120 +# ccccccgggc ccgtgcccgc cggagacccc aacacgaaca ctgtctgaaa gcgtgcagtc 180 +# tgagttgatt gaatgcaatc agttaaaact ttcaacaatg gatctcttgg ttccggc 237 +# // + + +import textwrap + +from Bio import Alphabet +from Bio import Writer + +class WriteEmbl(Writer.Writer): + def __init__(self, outfile): + Writer.Writer.__init__(self, outfile) + + def write(self, record): + seq = record.seq + assert seq.alphabet.size == 1, "cannot handle alphabet of size %d" % \ + seq.alphabet.size + data = seq.data + upperdata = data.upper() + +# It'd be nice if the alphabet was usefully set, but for many interesting +# cases (e.g., reading from FASTA files), it's not. 
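+# For illustration only (assuming the IUPAC alphabets from Bio.Alphabet
+# are importable here), a caller that does know the molecule type can set
+# the alphabet explicitly before calling write():
+#
+#     from Bio.Alphabet import IUPAC
+#     record.seq.alphabet = IUPAC.unambiguous_rna
+#
+# With that set, the isinstance() check below reports "mRNA" rather than
+# falling through to the "DNA" default.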
+ + if isinstance(seq.alphabet, Alphabet.RNAAlphabet): + molecule = 'mRNA' + letters = ['A', 'C', 'G', 'U'] + else: + molecule = 'DNA' + letters = ['A', 'C', 'G', 'T'] + + division = 'UNC' # unknown + + self.outfile.write("ID %s standard; %s; %s; %d BP.\n" + % (record.id, molecule, division, len(data))) + + desclist = textwrap.wrap(record.description, 74) + for l in desclist: + self.outfile.write("DE %s\n" % l) + + counts = [ upperdata.count(l) for l in letters ] + othercount = len(upperdata) - sum(counts) + + countstring = ''.join([ " %d %s;" % p for p in zip(counts, letters) ]) + + self.outfile.write("SQ Sequence %s BP;%s %d other;\n" + % (len(data), countstring, othercount)) + + rowlength = 60 + blocklength = 10 + for i in xrange(0, len(data), rowlength): + self.outfile.write(" " * 5) + row = data[i:i+rowlength] + for b in xrange(0, rowlength, blocklength): + block = row[b:b+blocklength] + self.outfile.write("%-*s" % (blocklength+1, block)) + self.outfile.write("%9d\n" % min(i+rowlength, len(data))) + + self.outfile.write("//\n") + + +make_writer = WriteEmbl diff --git a/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/empty.py b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/empty.py new file mode 100644 index 0000000..b54f072 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/empty.py @@ -0,0 +1,7 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" +from Bio import Writer + +class WriteEmpty(Writer.Writer): + pass + +make_writer = WriteEmpty diff --git a/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/fasta.py b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/fasta.py new file mode 100644 index 0000000..0b48366 --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/writers/SeqRecord/fasta.py @@ -0,0 +1,21 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" +from Bio import Writer + +class WriteFasta(Writer.Writer): + def __init__(self, outfile, seqwidth = 72): + Writer.Writer.__init__(self, outfile) + assert seqwidth > 0, seqwidth + self.seqwidth = seqwidth + + def write(self, record): + self.outfile.write(">%s %s\n" % (record.id, record.description)) + seq = record.seq + assert seq.alphabet.size == 1, "cannot handle alphabet of size %d" % \ + seq.alphabet.size + seq = seq.data + seqwidth = self.seqwidth + for i in range(0, len(seq), seqwidth): + self.outfile.write(seq[i:i+seqwidth]) + self.outfile.write("\n") + +make_writer = WriteFasta diff --git a/binaries/src/globplot/biopython-1.50/Bio/writers/__init__.py b/binaries/src/globplot/biopython-1.50/Bio/writers/__init__.py new file mode 100644 index 0000000..98bb12f --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/Bio/writers/__init__.py @@ -0,0 +1,3 @@ +"""Part of an old unused and undocumented sequence writing framework (DEPRECATED).""" +# This is a Python module. +# (there are more files underneath this directory) diff --git a/binaries/src/globplot/biopython-1.50/CONTRIB b/binaries/src/globplot/biopython-1.50/CONTRIB new file mode 100644 index 0000000..8791f7d --- /dev/null +++ b/binaries/src/globplot/biopython-1.50/CONTRIB @@ -0,0 +1,53 @@ +CONTRIBUTORS +============ + +This is a list of people who have made contributions to Biopython. +This is certainly not comprehensive, and if you've been overlooked +(sorry!), please mention it on the development mailing list. 
+
+Cecilia Alsmark
+Tiago Antao
+Sebastian Bassi
+Bill Barnard
+Yves Bastide
+Yair Benita
+Peter Bienstman
+Bob Bussell
+Diego Brouard
+James Casbon
+Hye-Shik Chang
+Jeffrey Chang
+Brad Chapman
+Peter Cock
+Marc Colosimo
+Cymon J Cox
+Gavin E Crooks
+Andrew Dalke
+Michiel de Hoon
+Sjoerd de Vries
+Iddo Friedberg
+Bertrand Frottier
+Jason A. Hackney
+Thomas Hamelryck
+Michael Hoffman
+Yu Huang
+Frank Kauff
+Andreas Kuntzagk
+Michal Kurowski
+Chris Lasher
+Gaetan Lehman
+Katharine Lindner
+Tarjei Mikkelsen
+Cheng Soon Ong
+Mike Poidinger
+Leighton Pritchard
+Wolfgang Schueler
+Peter Slickers
+Thomas Sicheritz-Ponten
+Frederic Sohm
+Thomas Rosleff Soerensen
+Johann Visagie
+Dan Vogel
+David Weisman
+Bartek Wilczynski
+Harry Zuzan
diff --git a/binaries/src/globplot/biopython-1.50/LICENSE b/binaries/src/globplot/biopython-1.50/LICENSE
new file mode 100644
index 0000000..3595ec5
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/LICENSE
@@ -0,0 +1,19 @@
+                 Biopython License Agreement
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/binaries/src/globplot/biopython-1.50/PKG-INFO b/binaries/src/globplot/biopython-1.50/PKG-INFO
new file mode 100644
index 0000000..7fbacbb
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/PKG-INFO
@@ -0,0 +1,11 @@
+Metadata-Version: 1.0
+Name: biopython
+Version: 1.50
+Summary: Freely available tools for computational molecular biology.
+Home-page: http://www.biopython.org/
+Author: The Biopython Consortium
+Author-email: biopython@biopython.org
+License: UNKNOWN
+Download-URL: http://biopython.org/DIST/
+Description: UNKNOWN
+Platform: UNKNOWN
diff --git a/binaries/src/globplot/biopython-1.50/README b/binaries/src/globplot/biopython-1.50/README
new file mode 100644
index 0000000..9c473ae
--- /dev/null
+++ b/binaries/src/globplot/biopython-1.50/README
@@ -0,0 +1,164 @@
+#### THIS IS A REDUCED DISTRIBUTION OF BIOPYTHON ####
+
+**Biopython README file**
+
+ "The Biopython Project":http://www.biopython.org/ is an
+international association of developers of freely available Python
+tools for computational molecular biology.
+
+biopython.org provides an online resource for modules, scripts, and
+web links for developers of Python-based software for life science
+research. Thanks to bioperl, we can also provide web, FTP and CVS
+space for individuals and organizations wishing to distribute or
+otherwise make available standalone scripts & code.
+
+This Biopython package is made available under generous terms. Please
+see the LICENSE file for further details.
+
+
+**For the impatient**
+
+To build and install Biopython, download and unzip the source code,
+go to this directory at the command line, and type:
+
+python setup.py build
+python setup.py test
+python setup.py install
+
+**System Requirements**
+
+o "Python 2.3, 2.4, 2.5 or 2.6":http://www.python.org/
+ Note that Biopython 1.50 is expected to be our final release to support
+ Python 2.3. Given that Python 2.6 is still very new, it would be prudent
+ to opt for Python 2.5 or 2.4 at this time.
+
+o "NumPy":http://numpy.scipy.org/ (optional, but strongly recommended)
+ This package is only used in the computationally-oriented modules.
+ It is required for Bio.Cluster, Bio.PDB and a few other modules. If you
+ think you might need these modules, then please install NumPy first BEFORE
+ installing Biopython. The older Numeric library is no longer supported in
+ Biopython.
+
+o "ReportLab":http://www.reportlab.org/downloads.html (optional)
+ This package is only used in Bio.Graphics, so if you do not need this
+ functionality, you will not need to install this package. You can install
+ it later if needed.
+
+o "psycopg":http://initd.org/software/psycopg/ (optional) or
+ "pgdb":http://www.druid.net/pygresql/ (optional)
+ These packages are used by BioSQL to access a PostgreSQL database.
+
+o "MySQLdb":http://sourceforge.net/projects/mysql-python (optional)
+ This package is used by BioSQL or Bio.GFF to access a MySQL database.
+
+o "mxTextTools":http://www.egenix.com/files/python/mxTextTools.html (unlikely)
+ You probably won't need mxTextTools. This was used in some of Biopython's
+ older parsers, and Martel/Mindy, but these have all been deprecated now.
+
+In addition there are a number of useful third party tools you may wish to
+install such as standalone NCBI BLAST or ClustalW.
+
+
+**Installation**
+
+*** Make sure that Python is installed correctly ***
+
+Installation should be as simple as going to the biopython source code
+directory, and typing:
+
+ 'python setup.py build'
+ 'python setup.py test'
+ 'sudo python setup.py install'
+
+If you need to do additional configuration, e.g. changing the base
+directory, please type 'python setup.py', or see the documentation for
+Distutils.
+
+
+**Testing**
+
+Biopython includes a suite of regression tests to check if everything is
+running correctly. To do the tests, go to the biopython source code directory
+and type:
+
+ 'python setup.py test'
+
+Do not panic if you see messages warning of skipped tests:
+ test_DocSQL ... skipping. Install MySQLdb if you want to use Bio.DocSQL.
+
+This most likely means that a package is not installed. You can
+ignore this if it occurs in the tests for a module that you were not
+planning on using. If you did want to use that module, please install
+the required dependency and re-run the tests.
+
+
+**Bugs**
+
+While we try to ship a robust package, bugs inevitably pop up. If you
+are having problems that might be caused by a bug in Biopython, it is
+possible that it has already been identified. Search the
+"bug database":http://bugzilla.open-bio.org/ and mailing lists
+to see if it has already been reported (and hopefully fixed).
+
+If you suspect the problem lies within a parser, it is likely that the
+data format has changed and broken the parsing code. (The BLAST and
+GenBank formats seem to be particularly fragile.) Thus, the parsing
+code in Biopython is sometimes updated faster than we can build Biopython
+releases. You can get the most recent parser by pulling the relevant
+files (e.g.
the ones in Bio.SeqIO or Bio.Blast) out of +"anonymous cvs":http://cvs.biopython.org/ . +However, be careful when doing this, because the code in CVS is not as +well-tested as released code, and may contain new dependencies. + +Finally, you can send a bug report to the bug database or +biopython@biopython.org. In the bug report, please let us know 1) +which operating system and hardware you are using, 2) Python version, +3) Biopython version (or CVS version/date), 4) traceback that occurs, +5) offending code, and 6) data file that causes the problem. + + + +**Contributing, Bug Reports** + +Biopython is run by volunteers from all over the world, with many +types of backgrounds. We are always looking for people interested in +helping with code development, web-site management, documentation +writing, technical administration, and whatever else comes up. + +If you wish to contribute, please visit the +"web site":http://www.biopython.org +and join our "mailing list":http://biopython.org/wiki/Mailing_lists + + + +**Distribution Structure** + +README -- This file. + +NEWS -- Release notes and news + +LICENSE -- What you can do with the code. + +CONTRIB -- An (incomplete) list of people who helped Biopython in + one way or another. + +DEPRECATED -- Contains information about modules in Biopython that are + removed or no longer recommended for use, and how to update + code that uses those modules. + +MANIFEST.in -- Tells distutils what files to distribute + +setup.py -- Installation file. + +Bio/ -- The main code base code. + +Martel/ -- Code for the Martel parsing system, once used in many + Biopython parsers but now deprecated. + +BioSQL/ -- Code for using Biopython with BioSQL databases. + +Doc/ -- Documentation. + +Scripts/ -- Miscellaneous, possibly useful, standalone scripts + +Tests/ -- Regression testing code diff --git a/binaries/src/globplot/sav_gol b/binaries/src/globplot/sav_gol new file mode 100644 index 0000000..ef46f55 Binary files /dev/null and b/binaries/src/globplot/sav_gol differ
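For completeness, the Python-level API exposed by the trie extension module above (trie, save, load, and the mapping methods registered in trieobj_methods) can be exercised as follows. This is a minimal sketch with throwaway data, assuming the extension compiles and is importable as Bio.trie as in a stock Biopython 1.50 build (Python 2 syntax):

    from Bio import trie

    t = trie.trie()                      # new, empty trie
    t["hello"] = 5                       # keys must be strings; values are arbitrary objects
    print t.has_key("hello")             # 1
    print t.has_prefix("hel")            # 1
    print t.with_prefix("he")            # ['hello']
    print t.get("missing", 42)           # 42, as with dict.get
    print t.get_approximate("hullo", 1)  # [('hello', 5, 1)]: up to 1 mismatch allowed

    # Save/load round-trip through an ordinary file handle; values are
    # serialized with Python's marshal format, as in _write_value_to_handle.
    out = open("demo.trie", "wb")
    trie.save(out, t)
    out.close()
    t2 = trie.load(open("demo.trie", "rb"))
    print t2.keys(), t2.values()         # ['hello'] [5]

The mapping-protocol slots in trie_as_mapping are what make the t[key] subscript syntax work, while has_key and friends go through the METH_VARARGS wrappers since the METH_O and METH_NOARGS variants are commented out for pre-2.2 compatibility.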