--- /dev/null
+# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
+# All rights reserved. This code is part of the Biopython
+# distribution and governed by its license.
+# Please see the LICENSE file that should have been included as part
+# of this package.
+
+import math
+
+def lcc_mult(seq,wsize):
+ """Local Composition Complexity (LCC) values over sliding window.
+
+ Returns a list of floats, the LCC values for a sliding window over
+ the sequence.
+
+ seq - an unambiguous DNA sequence (a string or Seq object)
+ wsize - window size, integer
+
+ The result is the same as applying lcc_simp multiple times, but this
+ version is optimized for speed. The optimization works by using the
+ value of previous window as a base to compute the next one."""
+ l2=math.log(2)
+ tamseq=len(seq)
+ try :
+ #Assume its a string
+ upper = seq.upper()
+ except AttributeError :
+ #Should be a Seq object then
+ upper = seq.tostring().upper()
+ compone=[0]
+ lccsal=[0]
+ for i in range(wsize):
+ compone.append(((i+1)/float(wsize))*
+ ((math.log((i+1)/float(wsize)))/l2))
+ window=seq[0:wsize]
+ cant_a=window.count('A')
+ cant_c=window.count('C')
+ cant_t=window.count('T')
+ cant_g=window.count('G')
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ tail=seq[0]
+ for x in range (tamseq-wsize):
+ window=upper[x+1:wsize+x+1]
+ if tail==window[-1]:
+ lccsal.append(lccsal[-1])
+ elif tail=='A':
+ cant_a=cant_a-1
+ if window.endswith('C'):
+ cant_c=cant_c+1
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_a=compone[cant_a]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_a=compone[cant_a]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='C':
+ cant_c=cant_c-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_c=compone[cant_c]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_c=compone[cant_c]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='T':
+ cant_t=cant_t-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('C'):
+ cant_c=cant_c+1
+ term_c=compone[cant_c]
+ term_t=compone[cant_t]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('G'):
+ cant_g=cant_g+1
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif tail=='G':
+ cant_g=cant_g-1
+ if window.endswith('A'):
+ cant_a=cant_a+1
+ term_a=compone[cant_a]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('C'):
+ cant_c=cant_c+1
+ term_c=compone[cant_c]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ elif window.endswith('T'):
+ cant_t=cant_t+1
+ term_t=compone[cant_t]
+ term_g=compone[cant_g]
+ lccsal.append(-(term_a+term_c+term_t+term_g))
+ tail=window[0]
+ return lccsal
+
+def lcc_simp(seq):
+ """Local Composition Complexity (LCC) for a sequence.
+
+ seq - an unambiguous DNA sequence (a string or Seq object)
+
+ Returns the Local Composition Complexity (LCC) value for the entire
+ sequence (as a float).
+
+ Reference:
+ Andrzej K Konopka (2005) Sequence Complexity and Composition
+ DOI: 10.1038/npg.els.0005260
+ """
+ wsize=len(seq)
+ try :
+ #Assume its a string
+ upper = seq.upper()
+ except AttributeError :
+ #Should be a Seq object then
+ upper = seq.tostring().upper()
+ l2=math.log(2)
+ if 'A' not in seq:
+ term_a=0
+ # Check to avoid calculating the log of 0.
+ else:
+ term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
+ /float(wsize)))/l2)
+ if 'C' not in seq:
+ term_c=0
+ else:
+ term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
+ /float(wsize)))/l2)
+ if 'T' not in seq:
+ term_t=0
+ else:
+ term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
+ /float(wsize)))/l2)
+ if 'G' not in seq:
+ term_g=0
+ else:
+ term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
+ /float(wsize)))/l2)
+ lccsal=-(term_a+term_c+term_t+term_g)
+ return lccsal