Minor changes
[jabaws.git] / binaries / src / globplot / GlobPipe.py.original
1 #!/usr/bin/env python
2 # Copyright (C) 2003 Rune Linding - EMBL
3 # GlobPlot TM
4 # GlobPlot is licensed under the Academic Free license
5
6 from string import *
7 from sys import argv
8 import sys
9 import os
10 from os import system,popen3
11
12 relpath = re.sub("/GlobPipe.py$","",argv[0])
13 cwd = re.sub("/$","", os.getcwd())
14 print '!'  +os.getcwd()
15 print '!!' +cwd
16 newpath =cwd+"/"+relpath+"/biopython-1.50"
17 sys.path.append(newpath)
18
19 import Bio
20 from Bio import File
21 from Bio import Fasta
22 import fpformat
23 import tempfile
24 import math
25
26 # Russell/Linding
27 RL = {'N':0.229885057471264,'P':0.552316012226663,'Q':-0.187676577424997,'A':-0.261538461538462,'R':-0.176592654077609, \
28       'S':0.142883029808825,'C':-0.0151515151515152,'T':0.00887797506611258,'D':0.227629796839729,'E':-0.204684629516228, \
29       'V':-0.386174834235195,'F':-0.225572305974316,'W':-0.243375458622095,'G':0.433225711769886,'H':-0.00121743364986608, \
30       'Y':-0.20750516775322,'I':-0.422234699606962,'K':-0.100092289621613,'L':-0.337933495925287,'M':-0.225903614457831}
31
32 def Sum(seq,par_dict):
33     sum = 0
34     results = []
35     raws = []
36     sums = []
37     p = 1
38     for residue in seq:
39         try:
40             parameter = par_dict[residue]
41         except:
42             parameter = 0
43         if p == 1:
44             sum = parameter
45         else:
46             sum = sum + parameter#*math.log10(p)
47         ssum = float(fpformat.fix(sum,10))
48         sums.append(ssum)
49         p +=1
50     return sums
51
52 def getSlices(dydx_data, DOM_join_frame, DOM_peak_frame, DIS_join_frame, DIS_peak_frame):
53     DOMslices = []
54     DISslices = []
55     in_DOMslice = 0
56     in_DISslice = 0
57     beginDOMslice = 0
58     endDOMslice = 0
59     beginDISslice = 0
60     endDISslice = 0
61     for i in range( len(dydx_data) ):
62     #close dom slice
63         if in_DOMslice and dydx_data[i] > 0:
64             DOMslices.append([beginDOMslice, endDOMslice])
65             in_DOMslice = 0
66     #close dis slice
67         elif in_DISslice and dydx_data[i] < 0:
68             DISslices.append([beginDISslice, endDISslice])
69             in_DISslice = 0
70         # elseif inSlice expandslice
71         elif in_DOMslice:
72             endDOMslice += 1
73         elif in_DISslice:
74             endDISslice += 1
75     # if not in slice and dydx !== 0 start slice
76         if dydx_data[i] > 0 and not in_DISslice:
77             beginDISslice = i
78             endDISslice = i
79             in_DISslice = 1
80         elif dydx_data[i] < 0 and not in_DOMslice:
81             beginDOMslice = i
82             endDOMslice = i
83             in_DOMslice = 1
84     #last slice
85     if in_DOMslice:
86         DOMslices.append([beginDOMslice, endDOMslice])
87     if in_DISslice:
88         DISslices.append([beginDISslice,endDISslice])
89     k = 0
90     l = 0
91     while k < len(DOMslices):
92         if k+1 < len(DOMslices) and DOMslices[k+1][0]-DOMslices[k][1] < DOM_join_frame:
93             DOMslices[k] = [ DOMslices[k][0], DOMslices[k+1][1] ]
94             del DOMslices[k+1]
95         elif DOMslices[k][1]-DOMslices[k][0]+1 < DOM_peak_frame:
96             del DOMslices[k]
97         else:
98             k += 1
99     while l < len(DISslices):
100         if l+1 < len(DISslices) and DISslices[l+1][0]-DISslices[l][1] < DIS_join_frame:
101             DISslices[l] = [ DISslices[l][0], DISslices[l+1][1] ]
102             del DISslices[l+1]
103         elif DISslices[l][1]-DISslices[l][0]+1 < DIS_peak_frame:
104             del DISslices[l]
105         else:
106             l += 1
107     return DOMslices, DISslices
108
109
110 def SavitzkyGolay(window,derivative,datalist):
111     SG_bin = '/homes/pvtroshin/soft/GlobPipe-2.3/sav_gol -V0 '
112     stdin, stdout, stderr = popen3(SG_bin + '-D' + str(derivative) + ' -n' + str(window)+','+str(window))
113     for data in datalist:
114         stdin.write(`data`+'\n')
115     try:
116         stdin.close()
117     except:
118         print stderr.readlines()
119     results = stdout.readlines()
120     stdout.close()
121     SG_results = []
122     for result in results:
123         SG_results.append(float(fpformat.fix(result,6)))
124     return SG_results
125
126
127 def reportSlicesTXT(slices, sequence, maskFlag):
128     if maskFlag == 'DOM':
129         coordstr = '|GlobDoms:'
130     elif maskFlag == 'DIS':
131         coordstr = '|Disorder:'
132     else:
133         raise SystemExit
134     if slices == []:
135         #by default the sequence is in uppercase which is our search space
136         s = sequence
137     else:
138         # insert seq before first slide
139         if slices[0][0] > 0:
140             s = sequence[0:slices[0][0]]
141         else:
142             s = ''
143         for i in range(len(slices)):
144             #skip first slice
145             if i > 0:
146                 coordstr = coordstr + ', '
147             coordstr = coordstr + str(slices[i][0]+1) + '-' + str(slices[i][1]+1)
148             #insert the actual slice
149             if maskFlag == 'DOM':
150                 s = s + lower(sequence[slices[i][0]:(slices[i][1]+1)])
151                 if i < len(slices)-1:
152                     s = s + upper(sequence[(slices[i][1]+1):(slices[i+1][0])])
153                 #last slice
154                 elif slices[i][1] < len(sequence)-1:
155                     s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
156             elif maskFlag == 'DIS':
157                 s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
158                 #insert untouched seq between disorder segments, 2-run labelling
159                 if i < len(slices)-1:
160                     s = s + sequence[(slices[i][1]+1):(slices[i+1][0])]
161                 #last slice
162                 elif slices[i][1] < len(sequence)-1:
163                     s = s + sequence[(slices[i][1]+1):(len(sequence))]
164     return s,coordstr
165
166
167 def runGlobPlot():
168     try:
169         smoothFrame = int(sys.argv[1])
170         DOM_joinFrame = int(sys.argv[2])
171         DOM_peakFrame = int(sys.argv[3])
172         DIS_joinFrame = int(sys.argv[4])
173         DIS_peakFrame = int(sys.argv[5])
174         file = str(sys.argv[6])
175         db = open(file,'r')
176     except:
177         print 'Usage:'
178         print '         ./GlobPipe.py SmoothFrame DOMjoinFrame DOMpeakFrame DISjoinFrame DISpeakFrame FASTAfile'
179         print '         Optimised for ELM: ./GlobPlot.py 10 8 75 8 8 sequence_file'
180         print '         Webserver settings: ./GlobPlot.py 10 15 74 4 5 sequence_file'
181         raise SystemExit
182     parser = Fasta.RecordParser()
183     iterator = Fasta.Iterator(db,parser)
184     while 1:
185         try:
186             cur_record = iterator.next()
187             #uppercase is searchspace
188             seq = upper(cur_record.sequence)
189             # sum function
190             sum_vector = Sum(seq,RL)
191             # Run Savitzky-Golay
192             smooth = SavitzkyGolay(`smoothFrame`,0, sum_vector)
193             dydx_vector = SavitzkyGolay(`smoothFrame`,1, sum_vector)
194             #test
195             sumHEAD = sum_vector[:smoothFrame]
196             sumTAIL = sum_vector[len(sum_vector)-smoothFrame:]
197             newHEAD = []
198             newTAIL = []
199             for i in range(len(sumHEAD)):
200                 try:
201                     dHEAD = (sumHEAD[i+1]-sumHEAD[i])/2
202                 except:
203                     dHEAD = (sumHEAD[i]-sumHEAD[i-1])/2
204                 try:
205                     dTAIL = (sumTAIL[i+1]-sumTAIL[i])/2
206                 except:
207                     dTAIL = (sumTAIL[i]-sumTAIL[i-1])/2
208                 newHEAD.append(dHEAD)
209                 newTAIL.append(dTAIL)
210             dydx_vector[:smoothFrame] = newHEAD
211             dydx_vector[len(dydx_vector)-smoothFrame:] = newTAIL
212             globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
213             s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
214             s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
215             sys.stdout.write('>'+cur_record.title+coordstrDOM+coordstrDIS+'\n')
216             print s_final
217             print '\n'
218         except AttributeError:
219             break
220     return
221
222 runGlobPlot()