Disembl binaries and its dependancies e.g. minimized BioPython distribution and sovgo...
[jabaws.git] / binaries / src / disembl / DisEMBL.py
1 #!/usr/bin/python
2 # Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
3 # The DisEMBL is licensed under the GPL license
4 # (http://www.opensource.org/licenses/gpl-license.php)
5 # DisEMBL pipeline
6
7
8 from string import *
9 import sys,re
10 from sys import argv
11 import os
12 from os import system,popen3
13
14 relpath = re.sub("/DisEMBL.py$","",argv[0])
15 cwd = re.sub("/$","", os.getcwd())
16 newpath =cwd+"/"+relpath+"/biopython-1.50"
17 sys.path.append(newpath)
18
19 from Bio import File
20 from Bio import Fasta
21
22
23 import fpformat
24 import tempfile
25
26
27 # change these to the correct paths
28 NN_bin = relpath + "/" + 'disembl'
29 SG_bin = relpath + "/" + 'sav_gol'
30
31 def JensenNet(sequence):
32     outFile = tempfile.mktemp()
33     inFile= tempfile.mktemp()
34     open(inFile,'w').write(sequence+'\n')
35     system(NN_bin + '< ' + inFile +' > ' + outFile)
36     REM465 = []
37     COILS = []
38     HOTLOOPS = []
39     resultsFile = open(outFile,'r')
40     results = resultsFile.readlines()
41     resultsFile.close()
42     for result in results:
43         coil = float(fpformat.fix(split(result)[0],6))
44         COILS.append(coil)
45         hotloop = float(fpformat.fix(split(result)[1],6))
46         HOTLOOPS.append(hotloop)
47         rem465 = float(fpformat.fix(split(result)[2],6))
48         REM465.append(rem465)
49     os.remove(inFile)
50     os.remove(outFile)
51     return COILS, HOTLOOPS, REM465
52
53
54 def SavitzkyGolay(window,derivative,datalist):
55     if len(datalist) < 2*window:
56         window = len(datalist)/2
57     elif window == 0:
58         window = 1
59     stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
60     for data in datalist:
61         stdin.write(`data`+'\n')
62     try:
63         stdin.close()
64     except:
65         print stderr.readlines()
66     results = stdout.readlines()
67     stdout.close()
68     SG_results = []
69     for result in results:
70         f = float(fpformat.fix(result,6))
71         if f < 0:
72             SG_results.append(0)
73         else:
74             SG_results.append(f)
75     return SG_results
76
77 def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
78     slices = []
79     inSlice = 0
80     for i in range(len(NNdata)):
81         if inSlice:
82             if NNdata[i] < expect_val:
83                 if maxSlice >= fold*expect_val:
84                     slices.append([beginSlice, endSlice])
85                 inSlice = 0
86             else:
87                 endSlice += 1
88                 if NNdata[i] > maxSlice:
89                     maxSlice = NNdata[i]
90         elif NNdata[i] >= expect_val:
91             beginSlice = i
92             endSlice = i
93             inSlice = 1
94             maxSlice = NNdata[i]
95     if inSlice and maxSlice >= fold*expect_val:
96         slices.append([beginSlice, endSlice])
97
98     i = 0
99     while i < len(slices):
100         if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
101             slices[i] = [ slices[i][0], slices[i+1][1] ]
102             del slices[i+1]
103         elif slices[i][1]-slices[i][0]+1 < peak_frame:
104             del slices[i]
105         else:
106             i += 1
107     return slices
108
109
110 def reportSlicesTXT(slices, sequence):
111     if slices == []:
112         s = lower(sequence)
113     else:
114         if slices[0][0] > 0:
115             s = lower(sequence[0:slices[0][0]])
116         else:
117             s = ''
118         for i in range(len(slices)):
119             if i > 0:
120                 sys.stdout.write(', ')
121             sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
122             s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
123             if i < len(slices)-1:
124                 s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
125             elif slices[i][1] < len(sequence)-1:
126                 s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
127     print ''
128     print s
129
130
131
132 def runDisEMBLpipeline():
133     try:
134         smooth_frame = int(sys.argv[1])
135         peak_frame = int(sys.argv[2])
136         join_frame = int(sys.argv[3])
137         fold_coils = float(sys.argv[4])
138         fold_hotloops = float(sys.argv[5])
139         fold_rem465 = float(sys.argv[6])
140         #file = str(sys.argv[7])
141         try:
142             mode = sys.argv[7]
143         except:
144             mode = 'default'
145     except:
146         print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file [mode]\n'
147         print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2  fasta_file'
148         print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
149         raise SystemExit
150     db = sys.stdin
151     parser = Fasta.RecordParser()
152     iterator = Fasta.Iterator(db,parser)
153     while 1:
154         try:
155             cur_record = iterator.next()
156             sequence = upper(cur_record.sequence)
157             # Run NN
158             COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
159             # Run Savitzky-Golay
160             REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
161             COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
162             HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
163             if mode == 'default':
164                 sys.stdout.write('> '+cur_record.title+'_COILS ')
165                 reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
166                 sys.stdout.write('> '+cur_record.title+'_REM465 ')
167                 reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
168                 sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
169                 reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
170                 sys.stdout.write('\n')
171             elif mode == 'scores':
172                 sys.stdout.write('> '+cur_record.title+'\n')
173                 sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
174                 for i in range(len(REM465_smooth)):
175                     sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
176             else:
177                 sys.stderr.write('Wrong mode given: '+mode+'\n')
178                 raise SystemExit
179         except AttributeError:
180             break
181     return
182
183 runDisEMBLpipeline()