ClustalO servlets
[jabaws.git] / binaries / src / disembl / DisEMBL.py.original
1 #!/usr/local/bin/python
2 # Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
3 # The DisEMBL is licensed under the GPL license
4 # (http://www.opensource.org/licenses/gpl-license.php)
5 # DisEMBL pipeline
6
7 from string import *
8 from sys import argv
9 from Bio import File
10 from Bio import Fasta
11 import fpformat
12 import sys
13 import tempfile
14 import os
15 from os import system,popen3
16
17 # change these to the correct paths
18 NN_bin = '/PATH/DisEMBL-1.4/disembl'
19 SG_bin = '/PATH/DisEMBL-1.4/sav_gol'
20
21 def JensenNet(sequence):
22     outFile = tempfile.mktemp()
23     inFile= tempfile.mktemp()
24     open(inFile,'w').write(sequence+'\n')
25     system(NN_bin + '< ' + inFile +' > ' + outFile)
26     REM465 = []
27     COILS = []
28     HOTLOOPS = []
29     resultsFile = open(outFile,'r')
30     results = resultsFile.readlines()
31     resultsFile.close()
32     for result in results:
33         coil = float(fpformat.fix(split(result)[0],6))
34         COILS.append(coil)
35         hotloop = float(fpformat.fix(split(result)[1],6))
36         HOTLOOPS.append(hotloop)
37         rem465 = float(fpformat.fix(split(result)[2],6))
38         REM465.append(rem465)
39     os.remove(inFile)
40     os.remove(outFile)
41     return COILS, HOTLOOPS, REM465
42
43
44 def SavitzkyGolay(window,derivative,datalist):
45     if len(datalist) < 2*window:
46         window = len(datalist)/2
47     elif window == 0:
48         window = 1
49     stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
50     for data in datalist:
51         stdin.write(`data`+'\n')
52     try:
53         stdin.close()
54     except:
55         print stderr.readlines()
56     results = stdout.readlines()
57     stdout.close()
58     SG_results = []
59     for result in results:
60         f = float(fpformat.fix(result,6))
61         if f < 0:
62             SG_results.append(0)
63         else:
64             SG_results.append(f)
65     return SG_results
66
67 def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
68     slices = []
69     inSlice = 0
70     for i in range(len(NNdata)):
71         if inSlice:
72             if NNdata[i] < expect_val:
73                 if maxSlice >= fold*expect_val:
74                     slices.append([beginSlice, endSlice])
75                 inSlice = 0
76             else:
77                 endSlice += 1
78                 if NNdata[i] > maxSlice:
79                     maxSlice = NNdata[i]
80         elif NNdata[i] >= expect_val:
81             beginSlice = i
82             endSlice = i
83             inSlice = 1
84             maxSlice = NNdata[i]
85     if inSlice and maxSlice >= fold*expect_val:
86         slices.append([beginSlice, endSlice])
87
88     i = 0
89     while i < len(slices):
90         if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
91             slices[i] = [ slices[i][0], slices[i+1][1] ]
92             del slices[i+1]
93         elif slices[i][1]-slices[i][0]+1 < peak_frame:
94             del slices[i]
95         else:
96             i += 1
97     return slices
98
99
100 def reportSlicesTXT(slices, sequence):
101     if slices == []:
102         s = lower(sequence)
103     else:
104         if slices[0][0] > 0:
105             s = lower(sequence[0:slices[0][0]])
106         else:
107             s = ''
108         for i in range(len(slices)):
109             if i > 0:
110                 sys.stdout.write(', ')
111             sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
112             s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
113             if i < len(slices)-1:
114                 s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
115             elif slices[i][1] < len(sequence)-1:
116                 s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
117     print ''
118     print s
119
120
121
122 def runDisEMBLpipeline():
123     try:
124         smooth_frame = int(sys.argv[1])
125         peak_frame = int(sys.argv[2])
126         join_frame = int(sys.argv[3])
127         fold_coils = float(sys.argv[4])
128         fold_hotloops = float(sys.argv[5])
129         fold_rem465 = float(sys.argv[6])
130         file = str(sys.argv[7])
131         try:
132             mode = sys.argv[8]
133         except:
134             mode = 'default'
135     except:
136         print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file [mode]\n'
137         print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2  fasta_file'
138         print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
139         raise SystemExit
140     db = open(file,'r')
141     parser = Fasta.RecordParser()
142     iterator = Fasta.Iterator(db,parser)
143     print ' ____  _     _____ __  __ ____  _       _  _  _'
144     print '|  _ \(_)___| ____|  \/  | __ )| |     / || || |'
145     print '| | | | / __|  _| | |\/| |  _ \| |     | || || |_'
146     print '| |_| | \__ \ |___| |  | | |_) | |___  | ||__   _|'
147     print '|____/|_|___/_____|_|  |_|____/|_____| |_(_) |_|'
148     print '# Copyright (C) 2004 - Rune Linding & Lars Juhl Jensen '
149     print '# EMBL Biocomputing Unit - Heidelberg - Germany        '
150     print '#'
151     while 1:
152         try:
153             cur_record = iterator.next()
154             sequence = upper(cur_record.sequence)
155             # Run NN
156             COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
157             # Run Savitzky-Golay
158             REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
159             COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
160             HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
161             if mode == 'default':
162                 sys.stdout.write('> '+cur_record.title+'_COILS ')
163                 reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
164                 sys.stdout.write('> '+cur_record.title+'_REM465 ')
165                 reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
166                 sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
167                 reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
168                 sys.stdout.write('\n')
169             elif mode == 'scores':
170                 sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
171                 for i in range(len(REM465_smooth)):
172                     sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
173             else:
174                 sys.stderr.write('Wrong mode given: '+mode+'\n')
175                 raise SystemExit
176         except AttributeError:
177             break
178     return
179
180 runDisEMBLpipeline()