Copying Bio-python to globplot to satisfy the dependency
[jabaws.git] / binaries / src / globplot / biopython-1.50 / Bio / SeqUtils / lcc.py
1 # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
2 # All rights reserved.  This code is part of the Biopython 
3 # distribution and governed by its license.
4 # Please see the LICENSE file that should have been included as part
5 # of this package.
6
7 import math
8
9 def lcc_mult(seq,wsize):
10     """Local Composition Complexity (LCC) values over sliding window.
11
12     Returns a list of floats, the LCC values for a sliding window over
13     the sequence.
14
15     seq - an unambiguous DNA sequence (a string or Seq object)
16     wsize - window size, integer
17
18     The result is the same as applying lcc_simp multiple times, but this
19     version is optimized for speed. The optimization works by using the
20     value of previous window as a base to compute the next one."""
21     l2=math.log(2)
22     tamseq=len(seq)
23     try :
24         #Assume its a string
25         upper = seq.upper()
26     except AttributeError :
27         #Should be a Seq object then
28         upper = seq.tostring().upper()
29     compone=[0]
30     lccsal=[0]
31     for i in range(wsize):
32         compone.append(((i+1)/float(wsize))*
33                        ((math.log((i+1)/float(wsize)))/l2))
34     window=seq[0:wsize]
35     cant_a=window.count('A')
36     cant_c=window.count('C')
37     cant_t=window.count('T')
38     cant_g=window.count('G')
39     term_a=compone[cant_a]
40     term_c=compone[cant_c]
41     term_t=compone[cant_t]
42     term_g=compone[cant_g]
43     lccsal.append(-(term_a+term_c+term_t+term_g))
44     tail=seq[0]
45     for x in range (tamseq-wsize):
46         window=upper[x+1:wsize+x+1]
47         if tail==window[-1]:
48             lccsal.append(lccsal[-1])
49         elif tail=='A':
50             cant_a=cant_a-1
51             if window.endswith('C'):
52                 cant_c=cant_c+1
53                 term_a=compone[cant_a]
54                 term_c=compone[cant_c]
55                 lccsal.append(-(term_a+term_c+term_t+term_g))
56             elif window.endswith('T'):
57                 cant_t=cant_t+1
58                 term_a=compone[cant_a]
59                 term_t=compone[cant_t]
60                 lccsal.append(-(term_a+term_c+term_t+term_g))
61             elif window.endswith('G'):
62                 cant_g=cant_g+1
63                 term_a=compone[cant_a]
64                 term_g=compone[cant_g]
65                 lccsal.append(-(term_a+term_c+term_t+term_g))
66         elif tail=='C':
67             cant_c=cant_c-1
68             if window.endswith('A'):
69                 cant_a=cant_a+1
70                 term_a=compone[cant_a]
71                 term_c=compone[cant_c]
72                 lccsal.append(-(term_a+term_c+term_t+term_g))
73             elif window.endswith('T'):
74                 cant_t=cant_t+1
75                 term_c=compone[cant_c]
76                 term_t=compone[cant_t]
77                 lccsal.append(-(term_a+term_c+term_t+term_g))
78             elif window.endswith('G'):
79                 cant_g=cant_g+1
80                 term_c=compone[cant_c]
81                 term_g=compone[cant_g]
82                 lccsal.append(-(term_a+term_c+term_t+term_g))
83         elif tail=='T':
84             cant_t=cant_t-1
85             if window.endswith('A'):
86                 cant_a=cant_a+1
87                 term_a=compone[cant_a]
88                 term_t=compone[cant_t]
89                 lccsal.append(-(term_a+term_c+term_t+term_g))
90             elif window.endswith('C'):
91                 cant_c=cant_c+1
92                 term_c=compone[cant_c]
93                 term_t=compone[cant_t]
94                 lccsal.append(-(term_a+term_c+term_t+term_g))
95             elif window.endswith('G'):
96                 cant_g=cant_g+1
97                 term_t=compone[cant_t]
98                 term_g=compone[cant_g]
99                 lccsal.append(-(term_a+term_c+term_t+term_g))
100         elif tail=='G':
101             cant_g=cant_g-1
102             if window.endswith('A'):
103                 cant_a=cant_a+1
104                 term_a=compone[cant_a]
105                 term_g=compone[cant_g]
106                 lccsal.append(-(term_a+term_c+term_t+term_g))
107             elif window.endswith('C'):
108                 cant_c=cant_c+1
109                 term_c=compone[cant_c]
110                 term_g=compone[cant_g]
111                 lccsal.append(-(term_a+term_c+term_t+term_g))
112             elif window.endswith('T'):
113                 cant_t=cant_t+1
114                 term_t=compone[cant_t]
115                 term_g=compone[cant_g]
116                 lccsal.append(-(term_a+term_c+term_t+term_g))
117         tail=window[0]
118     return lccsal
119
120 def lcc_simp(seq):
121     """Local Composition Complexity (LCC) for a sequence.
122
123     seq - an unambiguous DNA sequence (a string or Seq object)
124     
125     Returns the Local Composition Complexity (LCC) value for the entire
126     sequence (as a float).
127
128     Reference:
129     Andrzej K Konopka (2005) Sequence Complexity and Composition
130     DOI: 10.1038/npg.els.0005260
131     """
132     wsize=len(seq)
133     try :
134         #Assume its a string
135         upper = seq.upper()
136     except AttributeError :
137         #Should be a Seq object then
138         upper = seq.tostring().upper()
139     l2=math.log(2)
140     if 'A' not in seq:
141         term_a=0
142         # Check to avoid calculating the log of 0.
143     else:
144         term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
145                                                           /float(wsize)))/l2)
146     if 'C' not in seq:
147         term_c=0
148     else:
149         term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
150                                                           /float(wsize)))/l2)
151     if 'T' not in seq:
152         term_t=0
153     else:
154         term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
155                                                           /float(wsize)))/l2)
156     if 'G' not in seq:
157         term_g=0
158     else:
159         term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
160                                                           /float(wsize)))/l2)
161     lccsal=-(term_a+term_c+term_t+term_g)
162     return lccsal