d9618e23d3a83112db113961579cc6eeeb430471
[jabaws.git] / binaries / src / disembl / DisEMBL.py
1 #!/usr/bin/python
2 # Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
3 # The DisEMBL is licensed under the GPL license
4 # (http://www.opensource.org/licenses/gpl-license.php)
5 # DisEMBL pipeline
6
7
8 from string import *
9 import sys,re
10 from sys import argv
11 import os
12 from os import system,popen3
13
14 relpath = re.sub("/DisEMBL.py$","",argv[0])
15 # cwd = re.sub("/$","", os.getcwd())
16 newpath = relpath + "/biopython-1.50"
17 sys.path.append(newpath)
18
19 from Bio import File
20 from Bio import Fasta
21
22 import fpformat
23 import tempfile
24
25
26 # change these to the correct paths
27 NN_bin = relpath + "/" + 'disembl'
28 SG_bin = relpath + "/" + 'sav_gol'
29
30 def JensenNet(sequence):
31     outFile = tempfile.mktemp()
32     inFile= tempfile.mktemp()
33     open(inFile,'w').write(sequence+'\n')
34     system(NN_bin + '< ' + inFile +' > ' + outFile)
35     REM465 = []
36     COILS = []
37     HOTLOOPS = []
38     resultsFile = open(outFile,'r')
39     results = resultsFile.readlines()
40     resultsFile.close()
41     for result in results:
42         coil = float(fpformat.fix(split(result)[0],6))
43         COILS.append(coil)
44         hotloop = float(fpformat.fix(split(result)[1],6))
45         HOTLOOPS.append(hotloop)
46         rem465 = float(fpformat.fix(split(result)[2],6))
47         REM465.append(rem465)
48     os.remove(inFile)
49     os.remove(outFile)
50     return COILS, HOTLOOPS, REM465
51
52
53 def SavitzkyGolay(window,derivative,datalist):
54     if len(datalist) < 2*window:
55         window = len(datalist)/2
56     elif window == 0:
57         window = 1
58     stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
59     for data in datalist:
60         stdin.write(`data`+'\n')
61     try:
62         stdin.close()
63     except:
64         print stderr.readlines()
65     results = stdout.readlines()
66     stdout.close()
67     SG_results = []
68     for result in results:
69         f = float(fpformat.fix(result,6))
70         if f < 0:
71             SG_results.append(0)
72         else:
73             SG_results.append(f)
74     return SG_results
75
76 def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
77     slices = []
78     inSlice = 0
79     for i in range(len(NNdata)):
80         if inSlice:
81             if NNdata[i] < expect_val:
82                 if maxSlice >= fold*expect_val:
83                     slices.append([beginSlice, endSlice])
84                 inSlice = 0
85             else:
86                 endSlice += 1
87                 if NNdata[i] > maxSlice:
88                     maxSlice = NNdata[i]
89         elif NNdata[i] >= expect_val:
90             beginSlice = i
91             endSlice = i
92             inSlice = 1
93             maxSlice = NNdata[i]
94     if inSlice and maxSlice >= fold*expect_val:
95         slices.append([beginSlice, endSlice])
96
97     i = 0
98     while i < len(slices):
99         if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
100             slices[i] = [ slices[i][0], slices[i+1][1] ]
101             del slices[i+1]
102         elif slices[i][1]-slices[i][0]+1 < peak_frame:
103             del slices[i]
104         else:
105             i += 1
106     return slices
107
108
109 def reportSlicesTXT(slices, sequence):
110     if slices == []:
111         s = lower(sequence)
112     else:
113         if slices[0][0] > 0:
114             s = lower(sequence[0:slices[0][0]])
115         else:
116             s = ''
117         for i in range(len(slices)):
118             if i > 0:
119                 sys.stdout.write(', ')
120             sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
121             s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
122             if i < len(slices)-1:
123                 s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
124             elif slices[i][1] < len(sequence)-1:
125                 s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
126     print ''
127     print s
128
129
130
131 def runDisEMBLpipeline():
132     try:
133         smooth_frame = 8
134         peak_frame = 8
135         join_frame = 4
136         fold_coils = 1.2
137         fold_hotloops = 1.4
138         fold_rem465 = 1.2
139         mode = 'scores'
140         try:
141             file = open(sys.argv[1],'r')
142         except:
143             mode = 'default'
144     except:
145         print '\nDisEMBL.py sequence_file \n'
146         print 'A default run would be: ./DisEMBL.py fasta_file'
147         print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
148         raise SystemExit
149     #db = sys.stdin
150     parser = Fasta.RecordParser()
151     iterator = Fasta.Iterator(file,parser)
152     while 1:
153         try:
154             cur_record = iterator.next()
155             sequence = upper(cur_record.sequence)
156             # Run NN
157             COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
158             # Run Savitzky-Golay
159             REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
160             COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
161             HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
162             if mode == 'default':
163                 sys.stdout.write('> '+cur_record.title+'_COILS ')
164                 reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
165                 sys.stdout.write('> '+cur_record.title+'_REM465 ')
166                 reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
167                 sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
168                 reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
169                 sys.stdout.write('\n')
170             elif mode == 'scores':
171                 sys.stdout.write('> '+cur_record.title+'\n')
172                 sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
173                 for i in range(len(REM465_smooth)):
174                     sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
175             else:
176                 sys.stderr.write('Wrong mode given: '+mode+'\n')
177                 raise SystemExit
178         except AttributeError:
179             break
180     file.close()
181     return
182
183 runDisEMBLpipeline()