2 # Created: Wed May 29 08:07:18 2002
3 # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
4 # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
6 # This code is part of the Biopython distribution and governed by its
7 # license. Please see the LICENSE file that should have been included
8 # as part of this package.
10 """Miscellaneous functions for dealing with sequences."""
14 from Bio import Translate
15 from Bio.Seq import Seq
16 from Bio import Alphabet
17 from Bio.Alphabet import IUPAC
18 from Bio.Data import IUPACData, CodonTable
21 ######################################
23 ######################
27 """Reverse the sequence. Works on string sequences.
39 """Calculates G+C content, returns the percentage (float between 0 and 100).
41 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
42 when counting the G and C content. The percentage is calculated against
43 the full length, e.g.:
45 >>> from Bio.SeqUtils import GC
49 Note that this will return zero for an empty sequence.
52 gc = sum(map(seq.count,['G','C','g','c','S','s']))
53 return gc*100.0/len(seq)
54 except ZeroDivisionError :
59 """Calculates total G+C content plus first, second and third positions.
61 Returns a tuple of four floats (percentages between 0 and 100) for the
62 entire sequence, and the three codon positions. e.g.
64 >>> from Bio.SeqUtils import GC123
66 (40.0, 50.0, 50.0, 0.0)
68 Copes with mixed case sequences, but does NOT deal with ambiguous
72 for nt in ['A','T','G','C']:
75 for i in range(0,len(seq),3):
77 if len(codon) <3: codon += ' '
78 for pos in range(0,3):
79 for nt in ['A','T','G','C']:
80 if codon[pos] == nt or codon[pos] == nt.lower():
87 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
88 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
92 gcall = gcall + d['G'][i] + d['C'][i]
95 gcall = 100.0*gcall/nall
96 return gcall, gc[0], gc[1], gc[2]
98 def GC_skew(seq, window = 100):
99 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
101 Returns a list of ratios (floats), controlled by the length of the sequence
102 and the size of the window.
104 Does NOT look at any ambiguous nucleotides.
106 # 8/19/03: Iddo: added lowercase
108 for i in range(0, len(seq), window):
109 s = seq[i: i + window]
110 g = s.count('G') + s.count('g')
111 c = s.count('C') + s.count('c')
112 skew = (g-c)/float(g+c)
116 from math import pi, sin, cos, log
117 def xGC_skew(seq, window = 1000, zoom = 100,
118 r = 300, px = 100, py = 100):
119 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
120 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
121 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
122 yscroll = Scrollbar(orient = VERTICAL)
123 xscroll = Scrollbar(orient = HORIZONTAL)
124 canvas = Canvas(yscrollcommand = yscroll.set,
125 xscrollcommand = xscroll.set, background = 'white')
126 win = canvas.winfo_toplevel()
127 win.geometry('700x700')
129 yscroll.config(command = canvas.yview)
130 xscroll.config(command = canvas.xview)
131 yscroll.pack(side = RIGHT, fill = Y)
132 xscroll.pack(side = BOTTOM, fill = X)
133 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
136 X0, Y0 = r + px, r + py
137 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
140 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
142 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
144 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
146 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
148 canvas.create_oval(x1,y1, x2, y2)
152 for gc in GC_skew(seq, window):
156 alpha = pi - (2*pi*start)/len(seq)
158 x1 = X0 + r1 * sin(alpha)
159 y1 = Y0 + r1 * cos(alpha)
160 x2 = X0 + r2 * sin(alpha)
161 y2 = Y0 + r2 * cos(alpha)
162 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
163 # accumulated GC skew
166 x1 = X0 + r1 * sin(alpha)
167 y1 = Y0 + r1 * cos(alpha)
168 x2 = X0 + r2 * sin(alpha)
169 y2 = Y0 + r2 * cos(alpha)
170 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
175 canvas.configure(scrollregion = canvas.bbox(ALL))
177 def molecular_weight(seq):
178 """Calculate the molecular weight of a DNA sequence."""
179 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
180 weight_table = IUPACData.unambiguous_dna_weights
181 #TODO, use a generator expession once we drop Python 2.3?
182 #e.g. return sum(weight_table[x] for x in seq)
185 total += weight_table[x]
188 def nt_search(seq, subseq):
189 """Search for a DNA subseq in sequence.
191 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
192 searches only on forward strand
196 value = IUPACData.ambiguous_dna_values[nt]
200 pattern += '[%s]' % value
208 m = re.search(pattern, s)
210 pos += int(m.start(0))
216 ######################################
218 ######################
221 # temporary hack for exception free translation of "dirty" DNA
222 # should be moved to ???
224 class ProteinX(Alphabet.ProteinAlphabet):
225 letters = IUPACData.extended_protein_letters + "X"
227 proteinX = ProteinX()
230 def __init__(self, table):
232 def get(self, codon, stop_symbol):
234 return self._table.get(codon, stop_symbol)
235 except CodonTable.TranslationError:
238 def makeTableX(table):
239 assert table.protein_alphabet == IUPAC.extended_protein
240 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX,
241 MissingTable(table.forward_table),
242 table.back_table, table.start_codons,
248 """Turn a one letter code protein sequence into one with three letter codes.
250 The single input argument 'seq' should be a protein sequence using single
251 letter codes, either as a python string or as a Seq or MutableSeq object.
253 This function returns the amino acid sequence as a string using the three
254 letter amino acid codes. Output follows the IUPAC standard (including
255 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
256 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
257 character (including possible gap characters), is changed into 'Xaa'.
260 >>> from Bio.SeqUtils import seq3
261 >>> seq3("MAIVMGRWKGAR*")
262 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
264 This function was inspired by BioPerl's seq3.
266 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
267 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
268 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
269 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
270 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
271 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
272 'U':'Sel', 'O':'Pyl', 'J':'Xle',
274 #We use a default of 'Xaa' for undefined letters
275 #Note this will map '-' to 'Xaa' which may be undesirable!
276 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
281 ######################################
283 ######################
286 def translate(seq, frame = 1, genetic_code = 1, translator = None):
287 """Translation of DNA in one of the six different reading frames (DEPRECATED).
289 Use the Bio.Seq.Translate function, or the Seq object's translate method
292 >>> from Bio.Seq import Seq
293 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
294 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
295 >>> for frame in [0,1,2] :
296 ... print my_seq[frame:].translate()
301 >>> for frame in [0,1,2] :
302 ... print my_seq.reverse_complement()[frame:].translate()
309 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
310 +" to remove it in a future release of Biopython. Please use"\
311 +" the method or function in Bio.Seq instead, as described in"\
312 +" the Tutorial.", DeprecationWarning)
314 if frame not in [1,2,3,-1,-2,-3]:
315 raise ValueError('invalid frame')
318 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
319 translator = Translate.Translator(table)
321 #Does this frame calculation do something sensible? No RC taken!
322 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
324 def GC_Frame(seq, genetic_code = 1):
325 """Just an alias for six_frame_translations (OBSOLETE).
327 Use six_frame_translation directly, as this function may be deprecated
328 in a future release."""
329 return six_frame_translations(seq, genetic_code)
331 def six_frame_translations(seq, genetic_code = 1):
332 """Formatted string showing the 6 frame translations and GC content.
334 nice looking 6 frame translation with GC content - code from xbbtools
335 similar to DNA Striders six-frame translation
338 from Bio.SeqUtils import six_frame_translations
339 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
341 from Bio.Seq import reverse_complement, translate
342 anti = reverse_complement(seq)
347 frames[i+1] = translate(seq[i:], genetic_code)
348 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
352 short = '%s ... %s' % (seq[:10], seq[-10:])
355 #TODO? Remove the date as this would spoil any unit test...
356 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
357 header = 'GC_Frame: %s, ' % date
358 for nt in ['a','t','g','c']:
359 header += '%s:%d ' % (nt, seq.count(nt.upper()))
361 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
364 for i in range(0,length,60):
366 csubseq = comp[i:i+60]
368 res = res + '%d/%d\n' % (i+1, i/3+1)
369 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
370 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
371 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
373 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
374 res = res + csubseq.lower() + '\n'
376 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
377 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
378 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
383 ######################################
384 # FASTA file utilities
385 ######################
388 def fasta_uniqids(file):
389 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
391 file - a FASTA format filename to read in.
393 No return value, the output is written to screen.
396 txt = open(file).read()
398 for entry in txt.split('>')[1:]:
399 name, seq= entry.split('\n',1)
400 name = name.split()[0].split(',')[0]
406 _name = name + str(n)
407 if _name not in dict:
413 for name, seq in dict.items():
414 print '>%s\n%s' % (name, seq)
416 def quick_FASTA_reader(file):
417 """Simple FASTA reader, returning a list of string tuples.
419 The single argument 'file' should be the filename of a FASTA format file.
420 This function will open and read in the entire file, constructing a list
421 of all the records, each held as a tuple of strings (the sequence name or
422 title, and its sequence).
424 This function was originally intended for use on large files, where its
425 low overhead makes it very fast. However, because it returns the data as
426 a single in memory list, this can require a lot of RAM on large files.
428 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
429 allows you to iterate over the records one by one (avoiding having all the
430 records in memory at once). Using Bio.SeqIO also makes it easy to switch
431 between different input file formats. However, please note that rather
432 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
434 #Want to split on "\n>" not just ">" in case there are any extra ">"
435 #in the name/description. So, in order to make sure we also split on
436 #the first entry, prepend a "\n" to the start of the file.
438 txt = "\n" + handle.read()
441 for entry in txt.split('\n>')[1:]:
442 name,seq= entry.split('\n',1)
443 seq = seq.replace('\n','').replace(' ','').upper()
444 entries.append((name, seq))
447 def apply_on_multi_fasta(file, function, *args):
448 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
450 file - filename of a FASTA format file
451 function - the function you wish to invoke on each record
452 *args - any extra arguments you want passed to the function
454 This function will iterate over each record in a FASTA file as SeqRecord
455 objects, calling your function with the record (and supplied args) as
458 This function returns a list. For those records where your function
459 returns a value, this is taken as a sequence and used to construct a
460 FASTA format string. If your function never has a return value, this
461 means apply_on_multi_fasta will return an empty list.
464 f = globals()[function]
466 raise NotImplementedError("%s not implemented" % function)
468 handle = open(file, 'r')
469 records = SeqIO.parse(handle, "fasta")
471 for record in records:
472 arguments = [record.sequence]
473 for arg in args: arguments.append(arg)
474 result = f(*arguments)
476 results.append('>%s\n%s' % (record.name, result))
480 def quicker_apply_on_multi_fasta(file, function, *args):
481 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
483 file - filename of a FASTA format file
484 function - the function you wish to invoke on each record
485 *args - any extra arguments you want passed to the function
487 This function will use quick_FASTA_reader to load every record in the
488 FASTA file into memory as a list of tuples. For each record, it will
489 call your supplied function with the record as a tuple of the name and
490 sequence as strings (plus any supplied args).
492 This function returns a list. For those records where your function
493 returns a value, this is taken as a sequence and used to construct a
494 FASTA format string. If your function never has a return value, this
495 means quicker_apply_on_multi_fasta will return an empty list.
498 f = globals()[function]
500 raise NotImplementedError("%s not implemented" % function)
502 entries = quick_FASTA_reader(file)
504 for name, seq in entries:
506 for arg in args: arguments.append(arg)
507 result = f(*arguments)
509 results.append('>%s\n%s' % (name, result))
515 ######################################
517 #####################
520 if __name__ == '__main__':
522 # crude command line options to use most functions directly on a FASTA file
523 options = {'apply_on_multi_fasta':0,
528 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
529 'help', 'quick', 'uniq_ids', 'search='])
531 if arg[0] in ['-h', '--help']:
533 elif arg[0] in ['--describe']:
534 # get all new functions from this file
535 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
537 print 'available functions:'
538 for f in mol_funcs: print '\t--%s' % f
539 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
542 elif arg[0] in ['--apply_on_multi_fasta']:
543 options['apply_on_multi_fasta'] = arg[1]
544 elif arg[0] in ['--search']:
545 options['search'] = arg[1]
547 key = re.search('-*(.+)', arg[0]).group(1)
551 if options.get('apply_on_multi_fasta'):
553 function = options['apply_on_multi_fasta']
555 if options.get('search'):
556 arguments = options['search']
557 if function == 'xGC_skew':
559 if options.get('quick'):
560 results = quicker_apply_on_multi_fasta(file, function, arguments)
562 results = apply_on_multi_fasta(file, function, arguments)
563 for result in results: print result
565 elif options.get('uniq_ids'):