2 # Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
3 # The DisEMBL is licensed under the GPL license
4 # (http://www.opensource.org/licenses/gpl-license.php)
12 from os import system,popen3
14 relpath = re.sub("/DisEMBL.py$","",argv[0])
15 # cwd = re.sub("/$","", os.getcwd())
16 newpath = relpath + "/biopython-1.50"
17 sys.path.append(newpath)
26 # change these to the correct paths
27 NN_bin = relpath + "/" + 'disembl'
28 SG_bin = relpath + "/" + 'sav_gol'
30 def JensenNet(sequence):
31 outFile = tempfile.mktemp()
32 inFile= tempfile.mktemp()
33 open(inFile,'w').write(sequence+'\n')
34 system(NN_bin + '< ' + inFile +' > ' + outFile)
38 resultsFile = open(outFile,'r')
39 results = resultsFile.readlines()
41 for result in results:
42 coil = float(fpformat.fix(split(result)[0],6))
44 hotloop = float(fpformat.fix(split(result)[1],6))
45 HOTLOOPS.append(hotloop)
46 rem465 = float(fpformat.fix(split(result)[2],6))
50 return COILS, HOTLOOPS, REM465
53 def SavitzkyGolay(window,derivative,datalist):
54 if len(datalist) < 2*window:
55 window = len(datalist)/2
58 stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
60 stdin.write(`data`+'\n')
64 print stderr.readlines()
65 results = stdout.readlines()
68 for result in results:
69 f = float(fpformat.fix(result,6))
76 def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
79 for i in range(len(NNdata)):
81 if NNdata[i] < expect_val:
82 if maxSlice >= fold*expect_val:
83 slices.append([beginSlice, endSlice])
87 if NNdata[i] > maxSlice:
89 elif NNdata[i] >= expect_val:
94 if inSlice and maxSlice >= fold*expect_val:
95 slices.append([beginSlice, endSlice])
98 while i < len(slices):
99 if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
100 slices[i] = [ slices[i][0], slices[i+1][1] ]
102 elif slices[i][1]-slices[i][0]+1 < peak_frame:
109 def reportSlicesTXT(slices, sequence):
114 s = lower(sequence[0:slices[0][0]])
117 for i in range(len(slices)):
119 sys.stdout.write(', ')
120 sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
121 s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
122 if i < len(slices)-1:
123 s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
124 elif slices[i][1] < len(sequence)-1:
125 s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
131 def runDisEMBLpipeline():
141 file = open(sys.argv[1],'r')
145 print '\nDisEMBL.py sequence_file \n'
146 print 'A default run would be: ./DisEMBL.py fasta_file'
147 print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
150 parser = Fasta.RecordParser()
151 iterator = Fasta.Iterator(file,parser)
154 cur_record = iterator.next()
155 sequence = upper(cur_record.sequence)
157 COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
159 REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
160 COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
161 HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
162 if mode == 'default':
163 sys.stdout.write('> '+cur_record.title+'_COILS ')
164 reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
165 sys.stdout.write('> '+cur_record.title+'_REM465 ')
166 reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
167 sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
168 reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
169 sys.stdout.write('\n')
170 elif mode == 'scores':
171 sys.stdout.write('> '+cur_record.title+'\n')
172 sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
173 for i in range(len(REM465_smooth)):
174 sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
176 sys.stderr.write('Wrong mode given: '+mode+'\n')
178 except AttributeError: