--- /dev/null
+#!/usr/bin/env python
+# Created: Wed May 29 08:07:18 2002
+# thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
+# Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
+# All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Miscellaneous functions for dealing with sequences."""
+
+import re, time
+from Bio import SeqIO
+from Bio import Translate
+from Bio.Seq import Seq
+from Bio import Alphabet
+from Bio.Alphabet import IUPAC
+from Bio.Data import IUPACData, CodonTable
+
+
+######################################
+# DNA
+######################
+# {{{
+
+def reverse(seq):
+ """Reverse the sequence. Works on string sequences.
+
+ e.g.
+ >>> reverse("ACGGT")
+ 'TGGCA'
+
+ """
+ r = list(seq)
+ r.reverse()
+ return ''.join(r)
+
+def GC(seq):
+ """Calculates G+C content, returns the percentage (float between 0 and 100).
+
+ Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
+ when counting the G and C content. The percentage is calculated against
+ the full length, e.g.:
+
+ >>> from Bio.SeqUtils import GC
+ >>> GC("ACTGN")
+ 40.0
+
+ Note that this will return zero for an empty sequence.
+ """
+ try :
+ gc = sum(map(seq.count,['G','C','g','c','S','s']))
+ return gc*100.0/len(seq)
+ except ZeroDivisionError :
+ return 0.0
+
+
+def GC123(seq):
+ """Calculates total G+C content plus first, second and third positions.
+
+ Returns a tuple of four floats (percentages between 0 and 100) for the
+ entire sequence, and the three codon positions. e.g.
+
+ >>> from Bio.SeqUtils import GC123
+ >>> GC123("ACTGTN")
+ (40.0, 50.0, 50.0, 0.0)
+
+ Copes with mixed case sequences, but does NOT deal with ambiguous
+ nucleotides.
+ """
+ d= {}
+ for nt in ['A','T','G','C']:
+ d[nt] = [0,0,0]
+
+ for i in range(0,len(seq),3):
+ codon = seq[i:i+3]
+ if len(codon) <3: codon += ' '
+ for pos in range(0,3):
+ for nt in ['A','T','G','C']:
+ if codon[pos] == nt or codon[pos] == nt.lower():
+ d[nt][pos] += 1
+ gc = {}
+ gcall = 0
+ nall = 0
+ for i in range(0,3):
+ try:
+ n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
+ gc[i] = (d['G'][i] + d['C'][i])*100.0/n
+ except:
+ gc[i] = 0
+
+ gcall = gcall + d['G'][i] + d['C'][i]
+ nall = nall + n
+
+ gcall = 100.0*gcall/nall
+ return gcall, gc[0], gc[1], gc[2]
+
+def GC_skew(seq, window = 100):
+ """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
+
+ Returns a list of ratios (floats), controlled by the length of the sequence
+ and the size of the window.
+
+ Does NOT look at any ambiguous nucleotides.
+ """
+ # 8/19/03: Iddo: added lowercase
+ values = []
+ for i in range(0, len(seq), window):
+ s = seq[i: i + window]
+ g = s.count('G') + s.count('g')
+ c = s.count('C') + s.count('c')
+ skew = (g-c)/float(g+c)
+ values.append(skew)
+ return values
+
+from math import pi, sin, cos, log
+def xGC_skew(seq, window = 1000, zoom = 100,
+ r = 300, px = 100, py = 100):
+ """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
+ from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
+ VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
+ yscroll = Scrollbar(orient = VERTICAL)
+ xscroll = Scrollbar(orient = HORIZONTAL)
+ canvas = Canvas(yscrollcommand = yscroll.set,
+ xscrollcommand = xscroll.set, background = 'white')
+ win = canvas.winfo_toplevel()
+ win.geometry('700x700')
+
+ yscroll.config(command = canvas.yview)
+ xscroll.config(command = canvas.xview)
+ yscroll.pack(side = RIGHT, fill = Y)
+ xscroll.pack(side = BOTTOM, fill = X)
+ canvas.pack(fill=BOTH, side = LEFT, expand = 1)
+ canvas.update()
+
+ X0, Y0 = r + px, r + py
+ x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
+
+ ty = Y0
+ canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
+ ty +=20
+ canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
+ ty +=20
+ canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
+ ty +=20
+ canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
+ ty +=20
+ canvas.create_oval(x1,y1, x2, y2)
+
+ acc = 0
+ start = 0
+ for gc in GC_skew(seq, window):
+ r1 = r
+ acc+=gc
+ # GC skew
+ alpha = pi - (2*pi*start)/len(seq)
+ r2 = r1 - gc*zoom
+ x1 = X0 + r1 * sin(alpha)
+ y1 = Y0 + r1 * cos(alpha)
+ x2 = X0 + r2 * sin(alpha)
+ y2 = Y0 + r2 * cos(alpha)
+ canvas.create_line(x1,y1,x2,y2, fill = 'blue')
+ # accumulated GC skew
+ r1 = r - 50
+ r2 = r1 - acc
+ x1 = X0 + r1 * sin(alpha)
+ y1 = Y0 + r1 * cos(alpha)
+ x2 = X0 + r2 * sin(alpha)
+ y2 = Y0 + r2 * cos(alpha)
+ canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
+
+ canvas.update()
+ start += window
+
+ canvas.configure(scrollregion = canvas.bbox(ALL))
+
+def molecular_weight(seq):
+ """Calculate the molecular weight of a DNA sequence."""
+ if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
+ weight_table = IUPACData.unambiguous_dna_weights
+ #TODO, use a generator expession once we drop Python 2.3?
+ #e.g. return sum(weight_table[x] for x in seq)
+ total = 0
+ for x in seq:
+ total += weight_table[x]
+ return total
+
+def nt_search(seq, subseq):
+ """Search for a DNA subseq in sequence.
+
+ use ambiguous values (like N = A or T or C or G, R = A or G etc.)
+ searches only on forward strand
+ """
+ pattern = ''
+ for nt in subseq:
+ value = IUPACData.ambiguous_dna_values[nt]
+ if len(value) == 1:
+ pattern += value
+ else:
+ pattern += '[%s]' % value
+
+ pos = -1
+ result = [pattern]
+ l = len(seq)
+ while True:
+ pos+=1
+ s = seq[pos:]
+ m = re.search(pattern, s)
+ if not m: break
+ pos += int(m.start(0))
+ result.append(pos)
+ return result
+
+# }}}
+
+######################################
+# Protein
+######################
+# {{{
+
+# temporary hack for exception free translation of "dirty" DNA
+# should be moved to ???
+
+class ProteinX(Alphabet.ProteinAlphabet):
+ letters = IUPACData.extended_protein_letters + "X"
+
+proteinX = ProteinX()
+
+class MissingTable:
+ def __init__(self, table):
+ self._table = table
+ def get(self, codon, stop_symbol):
+ try:
+ return self._table.get(codon, stop_symbol)
+ except CodonTable.TranslationError:
+ return 'X'
+
+def makeTableX(table):
+ assert table.protein_alphabet == IUPAC.extended_protein
+ return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX,
+ MissingTable(table.forward_table),
+ table.back_table, table.start_codons,
+ table.stop_codons)
+
+# end of hacks
+
+def seq3(seq):
+ """Turn a one letter code protein sequence into one with three letter codes.
+
+ The single input argument 'seq' should be a protein sequence using single
+ letter codes, either as a python string or as a Seq or MutableSeq object.
+
+ This function returns the amino acid sequence as a string using the three
+ letter amino acid codes. Output follows the IUPAC standard (including
+ ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
+ for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
+ character (including possible gap characters), is changed into 'Xaa'.
+
+ e.g.
+ >>> from Bio.SeqUtils import seq3
+ >>> seq3("MAIVMGRWKGAR*")
+ 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
+
+ This function was inspired by BioPerl's seq3.
+ """
+ threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
+ 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
+ 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
+ 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
+ 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
+ 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
+ 'U':'Sel', 'O':'Pyl', 'J':'Xle',
+ }
+ #We use a default of 'Xaa' for undefined letters
+ #Note this will map '-' to 'Xaa' which may be undesirable!
+ return ''.join([threecode.get(aa,'Xaa') for aa in seq])
+
+
+# }}}
+
+######################################
+# Mixed ???
+######################
+# {{{
+
+def translate(seq, frame = 1, genetic_code = 1, translator = None):
+ """Translation of DNA in one of the six different reading frames (DEPRECATED).
+
+ Use the Bio.Seq.Translate function, or the Seq object's translate method
+ instead:
+
+ >>> from Bio.Seq import Seq
+ >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
+ >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
+ >>> for frame in [0,1,2] :
+ ... print my_seq[frame:].translate()
+ ...
+ MAIVMGR*KGAR*
+ WPL*WAAERVPDS
+ GHCNGPLKGCPIV
+ >>> for frame in [0,1,2] :
+ ... print my_seq.reverse_complement()[frame:].translate()
+ ...
+ YYRAPFQRPITMA
+ TIGHPFSGPLQWP
+ LSGTLSAAHYNGH
+ """
+ import warnings
+ warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
+ +" to remove it in a future release of Biopython. Please use"\
+ +" the method or function in Bio.Seq instead, as described in"\
+ +" the Tutorial.", DeprecationWarning)
+
+ if frame not in [1,2,3,-1,-2,-3]:
+ raise ValueError('invalid frame')
+
+ if not translator:
+ table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
+ translator = Translate.Translator(table)
+
+ #Does this frame calculation do something sensible? No RC taken!
+ return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
+
+def GC_Frame(seq, genetic_code = 1):
+ """Just an alias for six_frame_translations (OBSOLETE).
+
+ Use six_frame_translation directly, as this function may be deprecated
+ in a future release."""
+ return six_frame_translations(seq, genetic_code)
+
+def six_frame_translations(seq, genetic_code = 1):
+ """Formatted string showing the 6 frame translations and GC content.
+
+ nice looking 6 frame translation with GC content - code from xbbtools
+ similar to DNA Striders six-frame translation
+
+ e.g.
+ from Bio.SeqUtils import six_frame_translations
+ print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
+ """
+ from Bio.Seq import reverse_complement, translate
+ anti = reverse_complement(seq)
+ comp = anti[::-1]
+ length = len(seq)
+ frames = {}
+ for i in range(0,3):
+ frames[i+1] = translate(seq[i:], genetic_code)
+ frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
+
+ # create header
+ if length > 20:
+ short = '%s ... %s' % (seq[:10], seq[-10:])
+ else:
+ short = seq
+ #TODO? Remove the date as this would spoil any unit test...
+ date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
+ header = 'GC_Frame: %s, ' % date
+ for nt in ['a','t','g','c']:
+ header += '%s:%d ' % (nt, seq.count(nt.upper()))
+
+ header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
+ res = header
+
+ for i in range(0,length,60):
+ subseq = seq[i:i+60]
+ csubseq = comp[i:i+60]
+ p = i/3
+ res = res + '%d/%d\n' % (i+1, i/3+1)
+ res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
+ res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
+ res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
+ # seq
+ res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
+ res = res + csubseq.lower() + '\n'
+ # - frames
+ res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
+ res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
+ res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
+ return res
+
+# }}}
+
+######################################
+# FASTA file utilities
+######################
+# {{{
+
+def fasta_uniqids(file):
+ """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
+
+ file - a FASTA format filename to read in.
+
+ No return value, the output is written to screen.
+ """
+ dict = {}
+ txt = open(file).read()
+ entries = []
+ for entry in txt.split('>')[1:]:
+ name, seq= entry.split('\n',1)
+ name = name.split()[0].split(',')[0]
+
+ if name in dict:
+ n = 1
+ while 1:
+ n = n + 1
+ _name = name + str(n)
+ if _name not in dict:
+ name = _name
+ break
+
+ dict[name] = seq
+
+ for name, seq in dict.items():
+ print '>%s\n%s' % (name, seq)
+
+def quick_FASTA_reader(file):
+ """Simple FASTA reader, returning a list of string tuples.
+
+ The single argument 'file' should be the filename of a FASTA format file.
+ This function will open and read in the entire file, constructing a list
+ of all the records, each held as a tuple of strings (the sequence name or
+ title, and its sequence).
+
+ This function was originally intended for use on large files, where its
+ low overhead makes it very fast. However, because it returns the data as
+ a single in memory list, this can require a lot of RAM on large files.
+
+ You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
+ allows you to iterate over the records one by one (avoiding having all the
+ records in memory at once). Using Bio.SeqIO also makes it easy to switch
+ between different input file formats. However, please note that rather
+ than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
+ """
+ #Want to split on "\n>" not just ">" in case there are any extra ">"
+ #in the name/description. So, in order to make sure we also split on
+ #the first entry, prepend a "\n" to the start of the file.
+ handle = open(file)
+ txt = "\n" + handle.read()
+ handle.close()
+ entries = []
+ for entry in txt.split('\n>')[1:]:
+ name,seq= entry.split('\n',1)
+ seq = seq.replace('\n','').replace(' ','').upper()
+ entries.append((name, seq))
+ return entries
+
+def apply_on_multi_fasta(file, function, *args):
+ """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
+
+ file - filename of a FASTA format file
+ function - the function you wish to invoke on each record
+ *args - any extra arguments you want passed to the function
+
+ This function will iterate over each record in a FASTA file as SeqRecord
+ objects, calling your function with the record (and supplied args) as
+ arguments.
+
+ This function returns a list. For those records where your function
+ returns a value, this is taken as a sequence and used to construct a
+ FASTA format string. If your function never has a return value, this
+ means apply_on_multi_fasta will return an empty list.
+ """
+ try:
+ f = globals()[function]
+ except:
+ raise NotImplementedError("%s not implemented" % function)
+
+ handle = open(file, 'r')
+ records = SeqIO.parse(handle, "fasta")
+ results = []
+ for record in records:
+ arguments = [record.sequence]
+ for arg in args: arguments.append(arg)
+ result = f(*arguments)
+ if result:
+ results.append('>%s\n%s' % (record.name, result))
+ handle.close()
+ return results
+
+def quicker_apply_on_multi_fasta(file, function, *args):
+ """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
+
+ file - filename of a FASTA format file
+ function - the function you wish to invoke on each record
+ *args - any extra arguments you want passed to the function
+
+ This function will use quick_FASTA_reader to load every record in the
+ FASTA file into memory as a list of tuples. For each record, it will
+ call your supplied function with the record as a tuple of the name and
+ sequence as strings (plus any supplied args).
+
+ This function returns a list. For those records where your function
+ returns a value, this is taken as a sequence and used to construct a
+ FASTA format string. If your function never has a return value, this
+ means quicker_apply_on_multi_fasta will return an empty list.
+ """
+ try:
+ f = globals()[function]
+ except:
+ raise NotImplementedError("%s not implemented" % function)
+
+ entries = quick_FASTA_reader(file)
+ results = []
+ for name, seq in entries:
+ arguments = [seq]
+ for arg in args: arguments.append(arg)
+ result = f(*arguments)
+ if result:
+ results.append('>%s\n%s' % (name, result))
+ handle.close()
+ return results
+
+# }}}
+
+######################################
+# Main
+#####################
+# {{{
+
+if __name__ == '__main__':
+ import sys, getopt
+ # crude command line options to use most functions directly on a FASTA file
+ options = {'apply_on_multi_fasta':0,
+ 'quick':0,
+ 'uniq_ids':0,
+ }
+
+ optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
+ 'help', 'quick', 'uniq_ids', 'search='])
+ for arg in optlist:
+ if arg[0] in ['-h', '--help']:
+ pass
+ elif arg[0] in ['--describe']:
+ # get all new functions from this file
+ mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
+ mol_funcs.sort()
+ print 'available functions:'
+ for f in mol_funcs: print '\t--%s' % f
+ print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
+
+ sys.exit(0)
+ elif arg[0] in ['--apply_on_multi_fasta']:
+ options['apply_on_multi_fasta'] = arg[1]
+ elif arg[0] in ['--search']:
+ options['search'] = arg[1]
+ else:
+ key = re.search('-*(.+)', arg[0]).group(1)
+ options[key] = 1
+
+
+ if options.get('apply_on_multi_fasta'):
+ file = args[0]
+ function = options['apply_on_multi_fasta']
+ arguments = []
+ if options.get('search'):
+ arguments = options['search']
+ if function == 'xGC_skew':
+ arguments = 1000
+ if options.get('quick'):
+ results = quicker_apply_on_multi_fasta(file, function, arguments)
+ else:
+ results = apply_on_multi_fasta(file, function, arguments)
+ for result in results: print result
+
+ elif options.get('uniq_ids'):
+ file = args[0]
+ fasta_uniqids(file)
+
+# }}}
+