+++ /dev/null
-# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
-# All rights reserved. This code is part of the Biopython
-# distribution and governed by its license.
-# Please see the LICENSE file that should have been included as part
-# of this package.
-
-import math
-
-def lcc_mult(seq,wsize):
- """Local Composition Complexity (LCC) values over sliding window.
-
- Returns a list of floats, the LCC values for a sliding window over
- the sequence.
-
- seq - an unambiguous DNA sequence (a string or Seq object)
- wsize - window size, integer
-
- The result is the same as applying lcc_simp multiple times, but this
- version is optimized for speed. The optimization works by using the
- value of previous window as a base to compute the next one."""
- l2=math.log(2)
- tamseq=len(seq)
- try :
- #Assume its a string
- upper = seq.upper()
- except AttributeError :
- #Should be a Seq object then
- upper = seq.tostring().upper()
- compone=[0]
- lccsal=[0]
- for i in range(wsize):
- compone.append(((i+1)/float(wsize))*
- ((math.log((i+1)/float(wsize)))/l2))
- window=seq[0:wsize]
- cant_a=window.count('A')
- cant_c=window.count('C')
- cant_t=window.count('T')
- cant_g=window.count('G')
- term_a=compone[cant_a]
- term_c=compone[cant_c]
- term_t=compone[cant_t]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- tail=seq[0]
- for x in range (tamseq-wsize):
- window=upper[x+1:wsize+x+1]
- if tail==window[-1]:
- lccsal.append(lccsal[-1])
- elif tail=='A':
- cant_a=cant_a-1
- if window.endswith('C'):
- cant_c=cant_c+1
- term_a=compone[cant_a]
- term_c=compone[cant_c]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('T'):
- cant_t=cant_t+1
- term_a=compone[cant_a]
- term_t=compone[cant_t]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('G'):
- cant_g=cant_g+1
- term_a=compone[cant_a]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif tail=='C':
- cant_c=cant_c-1
- if window.endswith('A'):
- cant_a=cant_a+1
- term_a=compone[cant_a]
- term_c=compone[cant_c]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('T'):
- cant_t=cant_t+1
- term_c=compone[cant_c]
- term_t=compone[cant_t]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('G'):
- cant_g=cant_g+1
- term_c=compone[cant_c]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif tail=='T':
- cant_t=cant_t-1
- if window.endswith('A'):
- cant_a=cant_a+1
- term_a=compone[cant_a]
- term_t=compone[cant_t]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('C'):
- cant_c=cant_c+1
- term_c=compone[cant_c]
- term_t=compone[cant_t]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('G'):
- cant_g=cant_g+1
- term_t=compone[cant_t]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif tail=='G':
- cant_g=cant_g-1
- if window.endswith('A'):
- cant_a=cant_a+1
- term_a=compone[cant_a]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('C'):
- cant_c=cant_c+1
- term_c=compone[cant_c]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- elif window.endswith('T'):
- cant_t=cant_t+1
- term_t=compone[cant_t]
- term_g=compone[cant_g]
- lccsal.append(-(term_a+term_c+term_t+term_g))
- tail=window[0]
- return lccsal
-
-def lcc_simp(seq):
- """Local Composition Complexity (LCC) for a sequence.
-
- seq - an unambiguous DNA sequence (a string or Seq object)
-
- Returns the Local Composition Complexity (LCC) value for the entire
- sequence (as a float).
-
- Reference:
- Andrzej K Konopka (2005) Sequence Complexity and Composition
- DOI: 10.1038/npg.els.0005260
- """
- wsize=len(seq)
- try :
- #Assume its a string
- upper = seq.upper()
- except AttributeError :
- #Should be a Seq object then
- upper = seq.tostring().upper()
- l2=math.log(2)
- if 'A' not in seq:
- term_a=0
- # Check to avoid calculating the log of 0.
- else:
- term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
- /float(wsize)))/l2)
- if 'C' not in seq:
- term_c=0
- else:
- term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
- /float(wsize)))/l2)
- if 'T' not in seq:
- term_t=0
- else:
- term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
- /float(wsize)))/l2)
- if 'G' not in seq:
- term_g=0
- else:
- term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
- /float(wsize)))/l2)
- lccsal=-(term_a+term_c+term_t+term_g)
- return lccsal