__init__.py

   1 #!/usr/bin/env python
   2 # Created: Wed May 29 08:07:18 2002
   3 # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
   4 # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
   5 # All rights reserved.
   6 # This code is part of the Biopython distribution and governed by its
   7 # license.  Please see the LICENSE file that should have been included
   8 # as part of this package.
   9
  10 """Miscellaneous functions for dealing with sequences."""
  11
  12 import re, time
  13 from Bio import SeqIO
  14 from Bio import Translate
  15 from Bio.Seq import Seq
  16 from Bio import Alphabet
  17 from Bio.Alphabet import IUPAC
  18 from Bio.Data import IUPACData, CodonTable
  19
  20
  21 ######################################
  22 # DNA
  23 ######################
  24 # {{{
  25
  26 def reverse(seq):
  27     """Reverse the sequence. Works on string sequences.
  28
  29     e.g.
  30     >>> reverse("ACGGT")
  31     'TGGCA'
  32
  33     """
  34     r = list(seq)
  35     r.reverse()
  36     return ''.join(r)
  37
  38 def GC(seq):
  39     """Calculates G+C content, returns the percentage (float between 0 and 100).
  40
  41     Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
  42     when counting the G and C content.  The percentage is calculated against
  43     the full length, e.g.:
  44
  45     >>> from Bio.SeqUtils import GC
  46     >>> GC("ACTGN")
  47     40.0
  48
  49     Note that this will return zero for an empty sequence.
  50     """
  51     try :
  52         gc = sum(map(seq.count,['G','C','g','c','S','s']))
  53         return gc*100.0/len(seq)
  54     except ZeroDivisionError :
  55         return 0.0
  56
  57
  58 def GC123(seq):
  59     """Calculates total G+C content plus first, second and third positions.
  60
  61     Returns a tuple of four floats (percentages between 0 and 100) for the
  62     entire sequence, and the three codon positions.  e.g.
  63
  64     >>> from Bio.SeqUtils import GC123
  65     >>> GC123("ACTGTN")
  66     (40.0, 50.0, 50.0, 0.0)
  67
  68     Copes with mixed case sequences, but does NOT deal with ambiguous
  69     nucleotides.
  70     """
  71     d= {}
  72     for nt in ['A','T','G','C']:
  73        d[nt] = [0,0,0]
  74
  75     for i in range(0,len(seq),3):
  76         codon = seq[i:i+3]
  77         if len(codon) <3: codon += '  '
  78         for pos in range(0,3):
  79             for nt in ['A','T','G','C']:
  80                 if codon[pos] == nt or codon[pos] == nt.lower():
  81                     d[nt][pos] += 1
  82     gc = {}
  83     gcall = 0
  84     nall = 0
  85     for i in range(0,3):
  86         try:
  87             n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
  88             gc[i] = (d['G'][i] + d['C'][i])*100.0/n
  89         except:
  90             gc[i] = 0
  91
  92         gcall = gcall + d['G'][i] + d['C'][i]
  93         nall = nall + n
  94
  95     gcall = 100.0*gcall/nall
  96     return gcall, gc[0], gc[1], gc[2]
  97
  98 def GC_skew(seq, window = 100):
  99     """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
 100
 101     Returns a list of ratios (floats), controlled by the length of the sequence
 102     and the size of the window.
 103
 104     Does NOT look at any ambiguous nucleotides.
 105     """
 106     # 8/19/03: Iddo: added lowercase
 107     values = []
 108     for i in range(0, len(seq), window):
 109         s = seq[i: i + window]
 110         g = s.count('G') + s.count('g')
 111         c = s.count('C') + s.count('c')
 112         skew = (g-c)/float(g+c)
 113         values.append(skew)
 114     return values
 115
 116 from math import pi, sin, cos, log
 117 def xGC_skew(seq, window = 1000, zoom = 100,
 118                          r = 300, px = 100, py = 100):
 119     """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
 120     from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
 121                         VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
 122     yscroll = Scrollbar(orient = VERTICAL)
 123     xscroll = Scrollbar(orient = HORIZONTAL)
 124     canvas = Canvas(yscrollcommand = yscroll.set,
 125                     xscrollcommand = xscroll.set, background = 'white')
 126     win = canvas.winfo_toplevel()
 127     win.geometry('700x700')
 128
 129     yscroll.config(command = canvas.yview)
 130     xscroll.config(command = canvas.xview)
 131     yscroll.pack(side = RIGHT, fill = Y)
 132     xscroll.pack(side = BOTTOM, fill = X)
 133     canvas.pack(fill=BOTH, side = LEFT, expand = 1)
 134     canvas.update()
 135
 136     X0, Y0  = r + px, r + py
 137     x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
 138
 139     ty = Y0
 140     canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
 141     ty +=20
 142     canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
 143     ty +=20
 144     canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
 145     ty +=20
 146     canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
 147     ty +=20
 148     canvas.create_oval(x1,y1, x2, y2)
 149
 150     acc = 0
 151     start = 0
 152     for gc in GC_skew(seq, window):
 153         r1 = r
 154         acc+=gc
 155         # GC skew
 156         alpha = pi - (2*pi*start)/len(seq)
 157         r2 = r1 - gc*zoom
 158         x1 = X0 + r1 * sin(alpha)
 159         y1 = Y0 + r1 * cos(alpha)
 160         x2 = X0 + r2 * sin(alpha)
 161         y2 = Y0 + r2 * cos(alpha)
 162         canvas.create_line(x1,y1,x2,y2, fill = 'blue')
 163         # accumulated GC skew
 164         r1 = r - 50
 165         r2 = r1 - acc
 166         x1 = X0 + r1 * sin(alpha)
 167         y1 = Y0 + r1 * cos(alpha)
 168         x2 = X0 + r2 * sin(alpha)
 169         y2 = Y0 + r2 * cos(alpha)
 170         canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
 171
 172         canvas.update()
 173         start += window
 174
 175     canvas.configure(scrollregion = canvas.bbox(ALL))
 176
 177 def molecular_weight(seq):
 178     """Calculate the molecular weight of a DNA sequence."""
 179     if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
 180     weight_table = IUPACData.unambiguous_dna_weights
 181     #TODO, use a generator expession once we drop Python 2.3?
 182     #e.g. return sum(weight_table[x] for x in seq)
 183     total = 0
 184     for x in seq:
 185         total += weight_table[x]
 186     return total
 187
 188 def nt_search(seq, subseq):
 189     """Search for a DNA subseq in sequence.
 190
 191     use ambiguous values (like N = A or T or C or G, R = A or G etc.)
 192     searches only on forward strand
 193     """
 194     pattern = ''
 195     for nt in subseq:
 196         value = IUPACData.ambiguous_dna_values[nt]
 197         if len(value) == 1:
 198             pattern += value
 199         else:
 200             pattern += '[%s]' % value
 201
 202     pos = -1
 203     result = [pattern]
 204     l = len(seq)
 205     while True:
 206         pos+=1
 207         s = seq[pos:]
 208         m = re.search(pattern, s)
 209         if not m: break
 210         pos += int(m.start(0))
 211         result.append(pos)
 212     return result
 213
 214 # }}}
 215
 216 ######################################
 217 # Protein
 218 ######################
 219 # {{{
 220
 221 # temporary hack for exception free translation of "dirty" DNA
 222 # should be moved to ???
 223
 224 class ProteinX(Alphabet.ProteinAlphabet):
 225     letters = IUPACData.extended_protein_letters + "X"
 226
 227 proteinX = ProteinX()
 228
 229 class MissingTable:
 230     def __init__(self, table):
 231         self._table = table
 232     def get(self, codon, stop_symbol):
 233         try:
 234             return self._table.get(codon, stop_symbol)
 235         except CodonTable.TranslationError:
 236             return 'X'
 237
 238 def makeTableX(table):
 239     assert table.protein_alphabet == IUPAC.extended_protein
 240     return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX,
 241                                  MissingTable(table.forward_table),
 242                                  table.back_table, table.start_codons,
 243                                  table.stop_codons)
 244
 245 # end of hacks
 246
 247 def seq3(seq):
 248     """Turn a one letter code protein sequence into one with three letter codes.
 249
 250     The single input argument 'seq' should be a protein sequence using single
 251     letter codes, either as a python string or as a Seq or MutableSeq object.
 252
 253     This function returns the amino acid sequence as a string using the three
 254     letter amino acid codes. Output follows the IUPAC standard (including
 255     ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
 256     for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.  Any unknown
 257     character (including possible gap characters), is changed into 'Xaa'.
 258
 259     e.g.
 260     >>> from Bio.SeqUtils import seq3
 261     >>> seq3("MAIVMGRWKGAR*")
 262     'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
 263
 264     This function was inspired by BioPerl's seq3.
 265     """
 266     threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
 267                  'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
 268                  'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
 269                  'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
 270                  'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
 271                  'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
 272                  'U':'Sel', 'O':'Pyl', 'J':'Xle',
 273                  }
 274     #We use a default of 'Xaa' for undefined letters
 275     #Note this will map '-' to 'Xaa' which may be undesirable!
 276     return ''.join([threecode.get(aa,'Xaa') for aa in seq])
 277
 278
 279 # }}}
 280
 281 ######################################
 282 # Mixed ???
 283 ######################
 284 # {{{
 285
 286 def translate(seq, frame = 1, genetic_code = 1, translator = None):
 287     """Translation of DNA in one of the six different reading frames (DEPRECATED).
 288
 289     Use the Bio.Seq.Translate function, or the Seq object's translate method
 290     instead:
 291
 292     >>> from Bio.Seq import Seq
 293     >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
 294     >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
 295     >>> for frame in [0,1,2] :
 296     ...    print my_seq[frame:].translate()
 297     ...
 298     MAIVMGR*KGAR*
 299     WPL*WAAERVPDS
 300     GHCNGPLKGCPIV
 301     >>> for frame in [0,1,2] :
 302     ...     print my_seq.reverse_complement()[frame:].translate()
 303     ...
 304     YYRAPFQRPITMA
 305     TIGHPFSGPLQWP
 306     LSGTLSAAHYNGH
 307     """
 308     import warnings
 309     warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
 310                   +" to remove it in a future release of Biopython.  Please use"\
 311                   +" the method or function in Bio.Seq instead, as described in"\
 312                   +" the Tutorial.", DeprecationWarning)
 313
 314     if frame not in [1,2,3,-1,-2,-3]:
 315         raise ValueError('invalid frame')
 316
 317     if not translator:
 318         table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
 319         translator = Translate.Translator(table)
 320
 321     #Does this frame calculation do something sensible?  No RC taken!
 322     return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
 323
 324 def GC_Frame(seq, genetic_code = 1):
 325     """Just an alias for six_frame_translations (OBSOLETE).
 326
 327     Use six_frame_translation directly, as this function may be deprecated
 328     in a future release."""
 329     return six_frame_translations(seq, genetic_code)
 330
 331 def six_frame_translations(seq, genetic_code = 1):
 332     """Formatted string showing the 6 frame translations and GC content.
 333
 334     nice looking 6 frame translation with GC content - code from xbbtools
 335     similar to DNA Striders six-frame translation
 336
 337     e.g.
 338     from Bio.SeqUtils import six_frame_translations
 339     print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
 340     """
 341     from Bio.Seq import reverse_complement, translate
 342     anti = reverse_complement(seq)
 343     comp = anti[::-1]
 344     length = len(seq)
 345     frames = {}
 346     for i in range(0,3):
 347         frames[i+1]  = translate(seq[i:], genetic_code)
 348         frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
 349
 350     # create header
 351     if length > 20:
 352         short = '%s ... %s' % (seq[:10], seq[-10:])
 353     else:
 354         short = seq
 355     #TODO? Remove the date as this would spoil any unit test...
 356     date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
 357     header = 'GC_Frame: %s, ' % date
 358     for nt in ['a','t','g','c']:
 359         header += '%s:%d ' % (nt, seq.count(nt.upper()))
 360
 361     header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
 362     res = header
 363
 364     for i in range(0,length,60):
 365         subseq = seq[i:i+60]
 366         csubseq = comp[i:i+60]
 367         p = i/3
 368         res = res + '%d/%d\n' % (i+1, i/3+1)
 369         res = res + '  ' + '  '.join(map(None,frames[3][p:p+20])) + '\n'
 370         res = res + ' ' + '  '.join(map(None,frames[2][p:p+20])) + '\n'
 371         res = res + '  '.join(map(None,frames[1][p:p+20])) + '\n'
 372         # seq
 373         res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
 374         res = res + csubseq.lower() + '\n'
 375         # - frames
 376         res = res + '  '.join(map(None,frames[-2][p:p+20]))  +' \n'
 377         res = res + ' ' + '  '.join(map(None,frames[-1][p:p+20])) + '\n'
 378         res = res + '  ' + '  '.join(map(None,frames[-3][p:p+20])) + '\n\n'
 379     return res
 380
 381 # }}}
 382
 383 ######################################
 384 # FASTA file utilities
 385 ######################
 386 # {{{
 387
 388 def fasta_uniqids(file):
 389     """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
 390
 391     file - a FASTA format filename to read in.
 392
 393     No return value, the output is written to screen.
 394     """
 395     dict = {}
 396     txt = open(file).read()
 397     entries = []
 398     for entry in txt.split('>')[1:]:
 399         name, seq= entry.split('\n',1)
 400         name = name.split()[0].split(',')[0]
 401
 402         if name in dict:
 403             n = 1
 404             while 1:
 405                 n = n + 1
 406                 _name = name + str(n)
 407                 if _name not in dict:
 408                     name = _name
 409                     break
 410
 411         dict[name] = seq
 412
 413     for name, seq in dict.items():
 414         print '>%s\n%s' % (name, seq)
 415
 416 def quick_FASTA_reader(file):
 417     """Simple FASTA reader, returning a list of string tuples.
 418
 419     The single argument 'file' should be the filename of a FASTA format file.
 420     This function will open and read in the entire file, constructing a list
 421     of all the records, each held as a tuple of strings (the sequence name or
 422     title, and its sequence).
 423
 424     This function was originally intended for use on large files, where its
 425     low overhead makes it very fast.  However, because it returns the data as
 426     a single in memory list, this can require a lot of RAM on large files.
 427
 428     You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
 429     allows you to iterate over the records one by one (avoiding having all the
 430     records in memory at once).  Using Bio.SeqIO also makes it easy to switch
 431     between different input file formats.  However, please note that rather
 432     than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
 433     """
 434     #Want to split on "\n>" not just ">" in case there are any extra ">"
 435     #in the name/description.  So, in order to make sure we also split on
 436     #the first entry, prepend a "\n" to the start of the file.
 437     handle = open(file)
 438     txt = "\n" + handle.read()
 439     handle.close()
 440     entries = []
 441     for entry in txt.split('\n>')[1:]:
 442         name,seq= entry.split('\n',1)
 443         seq = seq.replace('\n','').replace(' ','').upper()
 444         entries.append((name, seq))
 445     return entries
 446
 447 def apply_on_multi_fasta(file, function, *args):
 448     """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
 449
 450     file - filename of a FASTA format file
 451     function - the function you wish to invoke on each record
 452     *args - any extra arguments you want passed to the function
 453
 454     This function will iterate over each record in a FASTA file as SeqRecord
 455     objects, calling your function with the record (and supplied args) as
 456     arguments.
 457
 458     This function returns a list.  For those records where your function
 459     returns a value, this is taken as a sequence and used to construct a
 460     FASTA format string.  If your function never has a return value, this
 461     means apply_on_multi_fasta will return an empty list.
 462     """
 463     try:
 464         f = globals()[function]
 465     except:
 466         raise NotImplementedError("%s not implemented" % function)
 467
 468     handle = open(file, 'r')
 469     records = SeqIO.parse(handle, "fasta")
 470     results = []
 471     for record in records:
 472         arguments = [record.sequence]
 473         for arg in args: arguments.append(arg)
 474         result = f(*arguments)
 475         if result:
 476             results.append('>%s\n%s' % (record.name, result))
 477     handle.close()
 478     return results
 479
 480 def quicker_apply_on_multi_fasta(file, function, *args):
 481     """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
 482
 483     file - filename of a FASTA format file
 484     function - the function you wish to invoke on each record
 485     *args - any extra arguments you want passed to the function
 486
 487     This function will use quick_FASTA_reader to load every record in the
 488     FASTA file into memory as a list of tuples.  For each record, it will
 489     call your supplied function with the record as a tuple of the name and
 490     sequence as strings (plus any supplied args).
 491
 492     This function returns a list.  For those records where your function
 493     returns a value, this is taken as a sequence and used to construct a
 494     FASTA format string.  If your function never has a return value, this
 495     means quicker_apply_on_multi_fasta will return an empty list.
 496     """
 497     try:
 498         f = globals()[function]
 499     except:
 500         raise NotImplementedError("%s not implemented" % function)
 501
 502     entries = quick_FASTA_reader(file)
 503     results = []
 504     for name, seq in entries:
 505         arguments = [seq]
 506         for arg in args: arguments.append(arg)
 507         result = f(*arguments)
 508         if result:
 509             results.append('>%s\n%s' % (name, result))
 510     handle.close()
 511     return results
 512
 513 # }}}
 514
 515 ######################################
 516 # Main
 517 #####################
 518 # {{{
 519
 520 if __name__ == '__main__':
 521    import sys, getopt
 522    # crude command line options to use most functions directly on a FASTA file
 523    options = {'apply_on_multi_fasta':0,
 524               'quick':0,
 525               'uniq_ids':0,
 526               }
 527
 528    optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
 529                                                     'help', 'quick', 'uniq_ids', 'search='])
 530    for arg in optlist:
 531       if arg[0] in ['-h', '--help']:
 532          pass
 533       elif arg[0] in ['--describe']:
 534          # get all new functions from this file
 535          mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
 536          mol_funcs.sort()
 537          print 'available functions:'
 538          for f in mol_funcs: print '\t--%s' % f
 539          print '\n\ne.g.\n./sequtils.py  --apply_on_multi_fasta GC test.fas'
 540
 541          sys.exit(0)
 542       elif arg[0] in ['--apply_on_multi_fasta']:
 543          options['apply_on_multi_fasta'] = arg[1]
 544       elif arg[0] in ['--search']:
 545          options['search'] = arg[1]
 546       else:
 547          key = re.search('-*(.+)', arg[0]).group(1)
 548          options[key] = 1
 549
 550
 551    if options.get('apply_on_multi_fasta'):
 552       file = args[0]
 553       function = options['apply_on_multi_fasta']
 554       arguments = []
 555       if options.get('search'):
 556          arguments = options['search']
 557       if function == 'xGC_skew':
 558          arguments = 1000
 559       if options.get('quick'):
 560          results = quicker_apply_on_multi_fasta(file, function, arguments)
 561       else:
 562          results = apply_on_multi_fasta(file, function, arguments)
 563       for result in results: print result
 564
 565    elif options.get('uniq_ids'):
 566       file = args[0]
 567       fasta_uniqids(file)
 568
 569 # }}}
 570