Copying Bio-python to globplot to satisfy the dependency
[jabaws.git] / binaries / src / globplot / biopython-1.50 / Bio / SeqUtils / lcc.py
diff --git a/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/lcc.py b/binaries/src/globplot/biopython-1.50/Bio/SeqUtils/lcc.py
new file mode 100644 (file)
index 0000000..1fb71a2
--- /dev/null
@@ -0,0 +1,162 @@
+# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
+# All rights reserved.  This code is part of the Biopython 
+# distribution and governed by its license.
+# Please see the LICENSE file that should have been included as part
+# of this package.
+
+import math
+
+def lcc_mult(seq,wsize):
+    """Local Composition Complexity (LCC) values over sliding window.
+
+    Returns a list of floats, the LCC values for a sliding window over
+    the sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+    wsize - window size, integer
+
+    The result is the same as applying lcc_simp multiple times, but this
+    version is optimized for speed. The optimization works by using the
+    value of previous window as a base to compute the next one."""
+    l2=math.log(2)
+    tamseq=len(seq)
+    try :
+        #Assume its a string
+        upper = seq.upper()
+    except AttributeError :
+        #Should be a Seq object then
+        upper = seq.tostring().upper()
+    compone=[0]
+    lccsal=[0]
+    for i in range(wsize):
+        compone.append(((i+1)/float(wsize))*
+                       ((math.log((i+1)/float(wsize)))/l2))
+    window=seq[0:wsize]
+    cant_a=window.count('A')
+    cant_c=window.count('C')
+    cant_t=window.count('T')
+    cant_g=window.count('G')
+    term_a=compone[cant_a]
+    term_c=compone[cant_c]
+    term_t=compone[cant_t]
+    term_g=compone[cant_g]
+    lccsal.append(-(term_a+term_c+term_t+term_g))
+    tail=seq[0]
+    for x in range (tamseq-wsize):
+        window=upper[x+1:wsize+x+1]
+        if tail==window[-1]:
+            lccsal.append(lccsal[-1])
+        elif tail=='A':
+            cant_a=cant_a-1
+            if window.endswith('C'):
+                cant_c=cant_c+1
+                term_a=compone[cant_a]
+                term_c=compone[cant_c]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('T'):
+                cant_t=cant_t+1
+                term_a=compone[cant_a]
+                term_t=compone[cant_t]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('G'):
+                cant_g=cant_g+1
+                term_a=compone[cant_a]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+        elif tail=='C':
+            cant_c=cant_c-1
+            if window.endswith('A'):
+                cant_a=cant_a+1
+                term_a=compone[cant_a]
+                term_c=compone[cant_c]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('T'):
+                cant_t=cant_t+1
+                term_c=compone[cant_c]
+                term_t=compone[cant_t]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('G'):
+                cant_g=cant_g+1
+                term_c=compone[cant_c]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+        elif tail=='T':
+            cant_t=cant_t-1
+            if window.endswith('A'):
+                cant_a=cant_a+1
+                term_a=compone[cant_a]
+                term_t=compone[cant_t]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('C'):
+                cant_c=cant_c+1
+                term_c=compone[cant_c]
+                term_t=compone[cant_t]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('G'):
+                cant_g=cant_g+1
+                term_t=compone[cant_t]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+        elif tail=='G':
+            cant_g=cant_g-1
+            if window.endswith('A'):
+                cant_a=cant_a+1
+                term_a=compone[cant_a]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('C'):
+                cant_c=cant_c+1
+                term_c=compone[cant_c]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+            elif window.endswith('T'):
+                cant_t=cant_t+1
+                term_t=compone[cant_t]
+                term_g=compone[cant_g]
+                lccsal.append(-(term_a+term_c+term_t+term_g))
+        tail=window[0]
+    return lccsal
+
+def lcc_simp(seq):
+    """Local Composition Complexity (LCC) for a sequence.
+
+    seq - an unambiguous DNA sequence (a string or Seq object)
+    
+    Returns the Local Composition Complexity (LCC) value for the entire
+    sequence (as a float).
+
+    Reference:
+    Andrzej K Konopka (2005) Sequence Complexity and Composition
+    DOI: 10.1038/npg.els.0005260
+    """
+    wsize=len(seq)
+    try :
+        #Assume its a string
+        upper = seq.upper()
+    except AttributeError :
+        #Should be a Seq object then
+        upper = seq.tostring().upper()
+    l2=math.log(2)
+    if 'A' not in seq:
+        term_a=0
+       # Check to avoid calculating the log of 0.
+    else:
+        term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
+                                                          /float(wsize)))/l2)
+    if 'C' not in seq:
+        term_c=0
+    else:
+        term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
+                                                          /float(wsize)))/l2)
+    if 'T' not in seq:
+        term_t=0
+    else:
+        term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
+                                                          /float(wsize)))/l2)
+    if 'G' not in seq:
+        term_g=0
+    else:
+        term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
+                                                          /float(wsize)))/l2)
+    lccsal=-(term_a+term_c+term_t+term_g)
+    return lccsal