#!/usr/local/bin/python # Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL # The DisEMBL is licensed under the GPL license # (http://www.opensource.org/licenses/gpl-license.php) # DisEMBL pipeline from string import * from sys import argv from Bio import File from Bio import Fasta import fpformat import sys import tempfile import os from os import system,popen3 # change these to the correct paths NN_bin = '/PATH/DisEMBL-1.4/disembl' SG_bin = '/PATH/DisEMBL-1.4/sav_gol' def JensenNet(sequence): outFile = tempfile.mktemp() inFile= tempfile.mktemp() open(inFile,'w').write(sequence+'\n') system(NN_bin + '< ' + inFile +' > ' + outFile) REM465 = [] COILS = [] HOTLOOPS = [] resultsFile = open(outFile,'r') results = resultsFile.readlines() resultsFile.close() for result in results: coil = float(fpformat.fix(split(result)[0],6)) COILS.append(coil) hotloop = float(fpformat.fix(split(result)[1],6)) HOTLOOPS.append(hotloop) rem465 = float(fpformat.fix(split(result)[2],6)) REM465.append(rem465) os.remove(inFile) os.remove(outFile) return COILS, HOTLOOPS, REM465 def SavitzkyGolay(window,derivative,datalist): if len(datalist) < 2*window: window = len(datalist)/2 elif window == 0: window = 1 stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window)) for data in datalist: stdin.write(`data`+'\n') try: stdin.close() except: print stderr.readlines() results = stdout.readlines() stdout.close() SG_results = [] for result in results: f = float(fpformat.fix(result,6)) if f < 0: SG_results.append(0) else: SG_results.append(f) return SG_results def getSlices(NNdata, fold, join_frame, peak_frame, expect_val): slices = [] inSlice = 0 for i in range(len(NNdata)): if inSlice: if NNdata[i] < expect_val: if maxSlice >= fold*expect_val: slices.append([beginSlice, endSlice]) inSlice = 0 else: endSlice += 1 if NNdata[i] > maxSlice: maxSlice = NNdata[i] elif NNdata[i] >= expect_val: beginSlice = i endSlice = i inSlice = 1 maxSlice = NNdata[i] if inSlice and maxSlice >= fold*expect_val: slices.append([beginSlice, endSlice]) i = 0 while i < len(slices): if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame: slices[i] = [ slices[i][0], slices[i+1][1] ] del slices[i+1] elif slices[i][1]-slices[i][0]+1 < peak_frame: del slices[i] else: i += 1 return slices def reportSlicesTXT(slices, sequence): if slices == []: s = lower(sequence) else: if slices[0][0] > 0: s = lower(sequence[0:slices[0][0]]) else: s = '' for i in range(len(slices)): if i > 0: sys.stdout.write(', ') sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) ) s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)]) if i < len(slices)-1: s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])]) elif slices[i][1] < len(sequence)-1: s = s + lower(sequence[(slices[i][1]+1):(len(sequence))]) print '' print s def runDisEMBLpipeline(): try: smooth_frame = int(sys.argv[1]) peak_frame = int(sys.argv[2]) join_frame = int(sys.argv[3]) fold_coils = float(sys.argv[4]) fold_hotloops = float(sys.argv[5]) fold_rem465 = float(sys.argv[6]) file = str(sys.argv[7]) try: mode = sys.argv[8] except: mode = 'default' except: print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file [mode]\n' print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2 fasta_file' print 'Mode: "default"(nothing) or "scores" which will give scores per residue in TAB seperated format' raise SystemExit db = open(file,'r') parser = Fasta.RecordParser() iterator = Fasta.Iterator(db,parser) print ' ____ _ _____ __ __ ____ _ _ _ _' print '| _ \(_)___| ____| \/ | __ )| | / || || |' print '| | | | / __| _| | |\/| | _ \| | | || || |_' print '| |_| | \__ \ |___| | | | |_) | |___ | ||__ _|' print '|____/|_|___/_____|_| |_|____/|_____| |_(_) |_|' print '# Copyright (C) 2004 - Rune Linding & Lars Juhl Jensen ' print '# EMBL Biocomputing Unit - Heidelberg - Germany ' print '#' while 1: try: cur_record = iterator.next() sequence = upper(cur_record.sequence) # Run NN COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence) # Run Savitzky-Golay REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw) COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw) HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw) if mode == 'default': sys.stdout.write('> '+cur_record.title+'_COILS ') reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence ) sys.stdout.write('> '+cur_record.title+'_REM465 ') reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence ) sys.stdout.write('> '+cur_record.title+'_HOTLOOPS ') reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence ) sys.stdout.write('\n') elif mode == 'scores': sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n') for i in range(len(REM465_smooth)): sys.stdout.write(sequence[i]+'\t'+fpformat.fix(COILS_smooth[i],5)+'\t'+fpformat.fix(REM465_smooth[i],5)+'\t'+fpformat.fix(HOTLOOPS_smooth[i],5)+'\n') else: sys.stderr.write('Wrong mode given: '+mode+'\n') raise SystemExit except AttributeError: break return runDisEMBLpipeline()