+++ /dev/null
-#!/usr/local/bin/python
-# Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
-# The DisEMBL is licensed under the GPL license
-# (http://www.opensource.org/licenses/gpl-license.php)
-# DisEMBL pipeline
-
-from string import *
-from sys import argv
-from Bio import File
-from Bio import Fasta
-import fpformat
-import sys
-import tempfile
-import os
-from os import system,popen3
-
-# change these to the correct paths
-NN_bin = '/PATH/DisEMBL-1.4/disembl'
-SG_bin = '/PATH/DisEMBL-1.4/sav_gol'
-
-def JensenNet(sequence):
- outFile = tempfile.mktemp()
- inFile= tempfile.mktemp()
- open(inFile,'w').write(sequence+'\n')
- system(NN_bin + '< ' + inFile +' > ' + outFile)
- REM465 = []
- COILS = []
- HOTLOOPS = []
- resultsFile = open(outFile,'r')
- results = resultsFile.readlines()
- resultsFile.close()
- for result in results:
- coil = float(fpformat.fix(split(result)[0],6))
- COILS.append(coil)
- hotloop = float(fpformat.fix(split(result)[1],6))
- HOTLOOPS.append(hotloop)
- rem465 = float(fpformat.fix(split(result)[2],6))
- REM465.append(rem465)
- os.remove(inFile)
- os.remove(outFile)
- return COILS, HOTLOOPS, REM465
-
-
-def SavitzkyGolay(window,derivative,datalist):
- if len(datalist) < 2*window:
- window = len(datalist)/2
- elif window == 0:
- window = 1
- stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
- for data in datalist:
- stdin.write(`data`+'\n')
- try:
- stdin.close()
- except:
- print stderr.readlines()
- results = stdout.readlines()
- stdout.close()
- SG_results = []
- for result in results:
- f = float(fpformat.fix(result,6))
- if f < 0:
- SG_results.append(0)
- else:
- SG_results.append(f)
- return SG_results
-
-def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
- slices = []
- inSlice = 0
- for i in range(len(NNdata)):
- if inSlice:
- if NNdata[i] < expect_val:
- if maxSlice >= fold*expect_val:
- slices.append([beginSlice, endSlice])
- inSlice = 0
- else:
- endSlice += 1
- if NNdata[i] > maxSlice:
- maxSlice = NNdata[i]
- elif NNdata[i] >= expect_val:
- beginSlice = i
- endSlice = i
- inSlice = 1
- maxSlice = NNdata[i]
- if inSlice and maxSlice >= fold*expect_val:
- slices.append([beginSlice, endSlice])
-
- i = 0
- while i < len(slices):
- if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
- slices[i] = [ slices[i][0], slices[i+1][1] ]
- del slices[i+1]
- elif slices[i][1]-slices[i][0]+1 < peak_frame:
- del slices[i]
- else:
- i += 1
- return slices
-
-
-def reportSlicesTXT(slices, sequence):
- if slices == []:
- s = lower(sequence)
- else:
- if slices[0][0] > 0:
- s = lower(sequence[0:slices[0][0]])
- else:
- s = ''
- for i in range(len(slices)):
- if i > 0:
- sys.stdout.write(', ')
- sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
- s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
- if i < len(slices)-1:
- s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
- elif slices[i][1] < len(sequence)-1:
- s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
- print ''
- print s
-
-
-
-def runDisEMBLpipeline():
- try:
- smooth_frame = int(sys.argv[1])
- peak_frame = int(sys.argv[2])
- join_frame = int(sys.argv[3])
- fold_coils = float(sys.argv[4])
- fold_hotloops = float(sys.argv[5])
- fold_rem465 = float(sys.argv[6])
- file = str(sys.argv[7])
- try:
- mode = sys.argv[8]
- except:
- mode = 'default'
- except:
- print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file [mode]\n'
- print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2 fasta_file'
- print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format'
- raise SystemExit
- db = open(file,'r')
- parser = Fasta.RecordParser()
- iterator = Fasta.Iterator(db,parser)
- print ' ____ _ _____ __ __ ____ _ _ _ _'
- print '| _ \(_)___| ____| \/ | __ )| | / || || |'
- print '| | | | / __| _| | |\/| | _ \| | | || || |_'
- print '| |_| | \__ \ |___| | | | |_) | |___ | ||__ _|'
- print '|____/|_|___/_____|_| |_|____/|_____| |_(_) |_|'
- print '# Copyright (C) 2004 - Rune Linding & Lars Juhl Jensen '
- print '# EMBL Biocomputing Unit - Heidelberg - Germany '
- print '#'
- while 1:
- try:
- cur_record = iterator.next()
- sequence = upper(cur_record.sequence)
- # Run NN
- COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
- # Run Savitzky-Golay
- REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
- COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
- HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
- if mode == 'default':
- sys.stdout.write('> '+cur_record.title+'_COILS ')
- reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
- sys.stdout.write('> '+cur_record.title+'_REM465 ')
- reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
- sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ')
- reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
- sys.stdout.write('\n')
- elif mode == 'scores':
- sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
- for i in range(len(REM465_smooth)):
- sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n')
- else:
- sys.stderr.write('Wrong mode given: '+mode+'\n')
- raise SystemExit
- except AttributeError:
- break
- return
-
-runDisEMBLpipeline()