Disembl binaries and its dependancies e.g. minimized BioPython distribution and sovgo...
[jabaws.git] / binaries / src / disembl / DisEMBL.py.original
diff --git a/binaries/src/disembl/DisEMBL.py.original b/binaries/src/disembl/DisEMBL.py.original
new file mode 100644 (file)
index 0000000..0a306d4
--- /dev/null
@@ -0,0 +1,180 @@
+#!/usr/local/bin/python
+# Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
+# The DisEMBL is licensed under the GPL license
+# (http://www.opensource.org/licenses/gpl-license.php)
+# DisEMBL pipeline
+
+from string import *
+from sys import argv
+from Bio import File
+from Bio import Fasta
+import fpformat
+import sys
+import tempfile
+import os
+from os import system,popen3
+
+# change these to the correct paths
+NN_bin = '/PATH/DisEMBL-1.4/disembl'
+SG_bin = '/PATH/DisEMBL-1.4/sav_gol'
+
+def JensenNet(sequence):
+    outFile = tempfile.mktemp()
+    inFile= tempfile.mktemp()
+    open(inFile,'w').write(sequence+'\n')
+    system(NN_bin + '< ' + inFile +' > ' + outFile)
+    REM465 = []
+    COILS = []
+    HOTLOOPS = []
+    resultsFile = open(outFile,'r')
+    results = resultsFile.readlines()
+    resultsFile.close()
+    for result in results:
+        coil = float(fpformat.fix(split(result)[0],6))
+        COILS.append(coil)
+        hotloop = float(fpformat.fix(split(result)[1],6))
+        HOTLOOPS.append(hotloop)
+        rem465 = float(fpformat.fix(split(result)[2],6))
+        REM465.append(rem465)
+    os.remove(inFile)
+    os.remove(outFile)
+    return COILS, HOTLOOPS, REM465
+
+
+def SavitzkyGolay(window,derivative,datalist):
+    if len(datalist) < 2*window:
+        window = len(datalist)/2
+    elif window == 0:
+        window = 1
+    stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
+    for data in datalist:
+        stdin.write(`data`+'\n')
+    try:
+        stdin.close()
+    except:
+        print stderr.readlines()
+    results = stdout.readlines()
+    stdout.close()
+    SG_results = []
+    for result in results:
+        f = float(fpformat.fix(result,6))
+        if f < 0:
+            SG_results.append(0)
+        else:
+            SG_results.append(f)
+    return SG_results
+
+def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
+    slices = []
+    inSlice = 0
+    for i in range(len(NNdata)):
+        if inSlice:
+            if NNdata[i] < expect_val:
+                if maxSlice >= fold*expect_val:
+                    slices.append([beginSlice, endSlice])
+                inSlice = 0
+            else:
+                endSlice += 1
+                if NNdata[i] > maxSlice:
+                    maxSlice = NNdata[i]
+        elif NNdata[i] >= expect_val:
+            beginSlice = i
+            endSlice = i
+            inSlice = 1
+            maxSlice = NNdata[i]
+    if inSlice and maxSlice >= fold*expect_val:
+        slices.append([beginSlice, endSlice])
+
+    i = 0
+    while i < len(slices):
+        if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
+            slices[i] = [ slices[i][0], slices[i+1][1] ]
+            del slices[i+1]
+        elif slices[i][1]-slices[i][0]+1 < peak_frame:
+            del slices[i]
+        else:
+            i += 1
+    return slices
+
+
+def reportSlicesTXT(slices, sequence):
+    if slices == []:
+        s = lower(sequence)
+    else:
+        if slices[0][0] > 0:
+            s = lower(sequence[0:slices[0][0]])
+        else:
+            s = ''
+        for i in range(len(slices)):
+            if i > 0:
+                sys.stdout.write(', ')
+            sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
+            s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
+            if i < len(slices)-1:
+                s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
+            elif slices[i][1] < len(sequence)-1:
+                s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
+    print ''
+    print s
+
+
+
+def runDisEMBLpipeline():
+    try:
+        smooth_frame = int(sys.argv[1])
+        peak_frame = int(sys.argv[2])
+        join_frame = int(sys.argv[3])
+        fold_coils = float(sys.argv[4])
+        fold_hotloops = float(sys.argv[5])
+        fold_rem465 = float(sys.argv[6])
+        file = str(sys.argv[7])
+        try:
+            mode = sys.argv[8]
+        except:
+            mode = 'default'
+    except:
+        print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file [mode]\n'
+        print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2  fasta_file'
+        print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
+        raise SystemExit
+    db = open(file,'r')
+    parser = Fasta.RecordParser()
+    iterator = Fasta.Iterator(db,parser)
+    print ' ____  _     _____ __  __ ____  _       _  _  _'
+    print '|  _ \(_)___| ____|  \/  | __ )| |     / || || |'
+    print '| | | | / __|  _| | |\/| |  _ \| |     | || || |_'
+    print '| |_| | \__ \ |___| |  | | |_) | |___  | ||__   _|'
+    print '|____/|_|___/_____|_|  |_|____/|_____| |_(_) |_|'
+    print '# Copyright (C) 2004 - Rune Linding & Lars Juhl Jensen '
+    print '# EMBL Biocomputing Unit - Heidelberg - Germany        '
+    print '#'
+    while 1:
+        try:
+            cur_record = iterator.next()
+            sequence = upper(cur_record.sequence)
+            # Run NN
+            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
+            # Run Savitzky-Golay
+            REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
+            COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
+            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
+            if mode == 'default':
+                sys.stdout.write('> '+cur_record.title+'_COILS ')
+                reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
+                sys.stdout.write('> '+cur_record.title+'_REM465 ')
+                reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
+                sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
+                reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
+                sys.stdout.write('\n')
+            elif mode == 'scores':
+                sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
+                for i in range(len(REM465_smooth)):
+                    sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
+            else:
+                sys.stderr.write('Wrong mode given: '+mode+'\n')
+                raise SystemExit
+        except AttributeError:
+            break
+    return
+
+runDisEMBLpipeline()