3 # script to run a specified jpred pipeline on a given
4 # sequence. Helps to separate the running of Jpred/Jnet
5 # from any pre-/post-processing of data.
9 run_jpred.pl - a wrapper to run Jpred
18 use lib "/homes/www-jpred/new_site/bin64";
22 use lib '/homes/chris/lib';
26 my $pipe = 'jpred3'; # specify which Jpred pipeline to use (not sure which ones work anymore)
27 my $jnet = '2.2'; # specify which version of Jnet to use (input paramters changed following 2.1)
28 my $useLocal; # do prediction on cluster node's local disk
29 my $useDB = 0; # find query sequences in DB for array jobs
30 my $path; # output path for results
31 my $fasta; # input sequence in Fasta format
39 'use-local' => \$useLocal,
46 pod2usage(-verbose => 2) if $man;
47 pod2usage(-verbose => 1) if $help;
49 ## check to see if in array context
50 my $seqID = $ENV{'SGE_TASK_ID'};
51 undef $seqID if ($seqID eq 'undefined');
53 pod2usage(-msg => 'Please give a sequence filename or run as an array job on the cluster', -verbose => 0) if (!$fasta && !$seqID);
54 pod2usage(-msg => 'Please give an output path', -verbose => 0) if !$path;
56 chdir($path) or die "ERROR - unable to cd to \'$path\': $!\nDied";
61 ## we're cluster array context
62 printf "On cluster node: %s\n", `hostname`;
65 ## find temporary directory on local disk
66 $clustDir = $ENV{TMPDIR};
67 print "Cluster TMPDIR: $clustDir\n";
71 ##retrieve data from DB
72 print "Retrieving $seqID from JnetDB...\n";
73 my $dbh = connect_DB('ro');
74 my $seq = get_seq($dbh, $seqID);
75 die "ERROR - '$seqID' not found in DB.\nDied" unless ($seq);
78 $fasta = "$clustDir/$seqID.fasta";
80 $fasta = "$path/$seqID.fasta";
83 ## write seq data to file
84 open(my $fh, ">$fasta") or die "ERROR - '$fasta': $!";
85 print $fh ">$seqID\n$seq\n";
88 ## else get it from the file
89 $fasta = "$seqID.fasta";
90 die "ERROR - file '$fasta' doesn't exits.\nDied" unless (-e $fasta);
93 system("cp $fasta $clustDir/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$clustDir/$seqID.fasta'\nDied";
95 system("cp $fasta $path/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$path/$seqID.fasta'\nDied";
100 if ($fasta =~ /\/*([^\/]+)?\.fas/) {
102 } elsif ($fasta =~ /\/*([^\/]+)?\.msf/) {
103 print "Found an alignement file...\n";
107 die "ERROR - unable to match for \'$fasta\'";
111 die "ERROR - unknown error ;)";
114 if (-e "$seqID.jnet") {
115 print "Prediction already exists for '$seqID'. Stopping...\n";
116 my @del = glob "*.e*.$seqID";
121 if ($pipe eq 'new') {
123 # cp seq input file to out dir if not already there
125 mkdir($seqID) or die "ERROR - can't mkdir \'$seqID\': $!\nDied";
126 chdir($seqID) or die "ERROR - can't chdir \'$seqID\': $!\nDied";
127 if (!-e "$seqID.fasta") {
128 system("cp $path/$fasta $seqID.fasta") == 0 or die "ERROR - \'cp $path/$fasta $path/$seqID/$seqID.fasta\' failed to run";
131 my $cmd = "$JPREDROOT/new_pipeline/jpred --seq $seqID.fasta --output $seqID --db swall";
132 system($cmd) == 0 or error("$JPREDROOT/new_pipeline/jpred", $seqID);
133 unless ($?) { # when Jnet successfully finishes, copy output files to directory where 'validate_jnet' can find them.
135 system("ln -s $seqID/$seqID.jnet $seqID.jnet");
136 system("ln -s $seqID/$seqID.align $seqID.align");
137 my @clust = glob "$path/*.$seqID";
146 # if input is an alignment reformat it and generate an HMM profile
148 reformat($fasta, $seqID);
149 my $error = get_hmm($seqID);
151 error($error, $seqID);
158 # cp seq input file to out dir if not already there
160 if (!-e "$seqID.fasta") {
161 system("cp $fasta $seqID.fasta") == 0 or die "ERROR - \'cp $fasta $path/$seqID.fasta\' failed to run";
165 # check we know which pipeline to use and run it
167 if ($pipe eq 'cuff') {
168 my $error = old_pipeline($seqID);
170 error($error, $seqID);
173 } elsif ($pipe eq 'jpred2') {
174 my $error = jpred2_pipe($seqID);
176 error($error, $seqID);
179 } elsif ($pipe eq 'jpred3') {
186 my $error = jpred3_pipe($seqID, $dir);
188 error($error, $seqID);
192 print "Rsyncing...\n";
193 system("rsync -az $clustDir/$seqID.* $path/") == 0 or die "ERROR - system() rsync command fails: $?\nDied";
197 die "ERROR - unknown pipeline \'$pipe\'\n";
205 # QSUB IGNORE $cmd = "$BINDIR/jnet_tng --cross 1 --hmm_prof $seqID.hmmprof --psi_prof $seqID.pssm --freq $seqID.freq > $seqID.jnet";
208 # only JNet 2.2 can be run with a pre-generated alignment
210 $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof > $seqID.jnet";
213 # generate correct command sytax dependent on version of JNet
216 $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof --psiprof $seqID.pssm --psifreq $seqID.freq > $seqID.jnet";
217 } elsif ($jnet eq '2') {
218 $cmd = "$BINDIR/jnet.orig -c $seqID.align $seqID.hmmprof $seqID.pssm $seqID.freq > $seqID.jnet";
220 die "ERROR - unknown version of JNet \'$jnet\'\n";
223 print "Running: $cmd\n";
224 system($cmd) == 0 or error("$BINDIR/jnet", $seqID);
230 my ($seqID, $localDir) = @_;
232 my $run_cmd = "/homes/www-jpred/new_site/jpred_new/jpred --seq $localDir/$seqID.fasta --output $localDir/$seqID --db uniref90 --verbose";
233 print "--CMD $run_cmd\n";
234 system($run_cmd) == 0 or return('jpred3');
240 # original JNet pipeline used for the James Cuff version
241 # of the software. Runs all of the necessary scripts to generate
244 # NB to compare with the 2000 paper requires at least 2 hits to
245 # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
247 my ($seqID, $seq) = @_;
249 my $cmd = "$BINDIR/blastpgp -d $SWALLFILT -b20000 -a2 -j3 -e0.05 -h0.01 -m6 -i $seqID.fasta -Q $seqID.pssm -o $seqID.blast";
250 print "Running: $cmd\n";
251 system($cmd) == 0 or return("$BINDIR/blastpgp");
253 print "Runing: $BINDIR/parse_psi -freq $seqID.blast > $seqID.freq\n";
254 system("$BINDIR/parse_psi -freq $seqID.blast > $seqID.freq") == 0
255 or return("$BINDIR/parse_psi -freq");
257 print "Runing: $BINDIR/parse_psi -ungap $seqID.blast > $seqID.align\n";
258 system("$BINDIR/parse_psi -ungap $seqID.blast > $seqID.align") == 0
259 or return("$BINDIR/parse_psi -ungap");
260 #my $num = `grep -c '>' $seqID.align`;
261 #return("Insufficient PSI-BLAST hits") if ($num < 3); # need at least 2 hits to query (2+1 = 3) to get a decent profile
263 print "Running: $BINDIR/fasta2msf < $seqID.align > $seqID.msf\n";
264 system("$BINDIR/fasta2msf < $seqID.align > $seqID.msf") == 0
265 or return("$BINDIR/fasta2msf");
267 print "Running: $BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf";
268 system("$BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf") == 0
269 or return("$BINDIR/hmmbuild");
271 print "Running: $BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof\n";
272 system("$BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof") == 0
273 or return("$BINDIR/hmmconver");
275 print "Running: $BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof\n";
276 system("$BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof") == 0
277 or return("$BINDIR/hmm2profile");
278 unlink("${seqID}_tmp.prof", "${seqID}_tmp.hmm");
285 # current pipeline as used by the Jpred server.
287 # NB to compare with the 2000 paper requires at least 2 hits to
288 # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
290 my ($seqID, $seq) = @_;
292 my $cmd = "$BINDIR/psiblast --query $seqID.fasta --profile $seqID.profile --blast $seqID.blast > $seqID.alignment";
293 print "Running: $cmd\n";
294 system($cmd) == 0 or return("$BINDIR/psiblast");
295 #my $num = `grep -c '>' $seqID.alignment`;
296 #return("Insufficient PSI-BLAST hits") if ($num < 3); # need at least 2 hits to query (2+1 = 3) to get a decent profile
298 $cmd = "$BINDIR/reduce --limit 1000 < $seqID.alignment > $seqID.align";
299 print "Running: $cmd\n";
300 system($cmd) == 0 or return("$BINDIR/reduce");
302 $cmd = "$BINDIR/nonred --cutoff 75 --in $seqID.align --out $seqID.oc > $seqID.align.fasta";
303 print "Running: $cmd\n";
304 system($cmd) == 0 or return("$BINDIR/nonred");
306 $cmd = "$BINDIR/trim_seqs --factor 1.5 < $seqID.align.fasta > ${seqID}_tmp";
307 print "Running: $cmd\n";
308 system($cmd) == 0 or return("$BINDIR/trim_seqs");
309 if (fasta_chk_no_seqs("${seqID}_tmp") < 2) {
310 warn "Warning - Trimmed too many sequences from alignment. Now using all BLAST hits.\n";
311 system("cp $seqID.alignment $seqID.align.fasta") == 0 or return("\'cp $seqID.alignment $seqID.align.fasta\'");
313 rename("${seqID}_tmp", "$seqID.align.fasta");
316 #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align.fasta > $seqID.align.msf";
317 #print "Running: $cmd\n";
318 #system($cmd) == 0 or return("$BINDIR/readseq");
320 $cmd = "$BINDIR/clean_fasta $seqID.align.fasta > ${seqID}_tmp";
321 print "Running: $cmd\n";
322 system($cmd) == 0 or return("$BINDIR/clean_fasta");
323 rename("${seqID}_tmp", "$seqID.align.fasta");
325 $cmd = "$BINDIR/fasta2upper < $seqID.align.fasta > ${seqID}_tmp";
326 print "Running: $cmd\n";
327 system($cmd) == 0 or return("$BINDIR/fasta2upper");
328 rename("${seqID}_tmp", "$seqID.align.fasta");
330 $cmd = "$BINDIR/fasta2jnet < $seqID.align.fasta > $seqID.align";
331 print "Running: $cmd\n";
332 system($cmd) == 0 or return("$BINDIR/fasta2jnet");
334 #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align > $seqID.align.msf";
335 #print "Running: $cmd\n";
336 #system($cmd) == 0 or return("$BINDIR/readseq");
338 $cmd = "$BINDIR/profile < $seqID.align > $seqID.freq";
339 print "Running: $cmd\n";
340 system($cmd) == 0 or return("$BINDIR/profile");
342 $cmd = "$BINDIR/getpssm < $seqID.profile > $seqID.pssm";
343 print "Running: $cmd\n";
344 system($cmd) == 0 or return("$BINDIR/getpssm");
346 ## use fasta input rather than msf as sometimes fails. Is it due to faults in readseq above?
347 #$cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
348 $cmd = "$BINDIR/gethmm -in $seqID.align -out $seqID.hmmprof > /dev/null";
349 print "Running: $cmd\n";
350 system($cmd) == 0 or return("$BINDIR/gethmm");
357 my ($app, $seq) = @_;
359 warn "ERROR - $app failed to run for $seq\n";
364 my ($file, $seqID) = @_;
366 system("$BINDIR/readseq -fMSF -a -p < $file > $seqID.align.msf");
368 error("$BINDIR/readseq", $seqID);
371 system("$BINDIR/readseq -f8 -a -p < $seqID.align.msf > $seqID.align.fas")
378 $cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
379 print "Running: $cmd\n";
380 system($cmd) == 0 or return("$BINDIR/gethmm");
385 sub fasta_chk_no_seqs { # taken from 'webrun' in the Jpred suite of scripts
387 open my $fh, $file or die "$!: $file";
390 while (<$fh>) { $i++ if /^>/ }
397 run_jpred.pl --seq <sequence filename> --out <path> --pipe <name> --jnet <version> [--use-local] [--use-db] [--help] [--man]
401 Jpred is made up of a whole host of scripts and programs which prepare the inputs/outputs to/from the JNet code. Doing this manually is pretty painful and not recommended :-)
403 This wrapper should make life a lot easier whereby the user simply supplies a sequence (or a multiple alignment) and the script does the rest.
405 Due to all the programs that this wrapper encompasses there are many output files generated (at least one for each step), but the main Jpred prediction is found in the .jnet file. All the other files may be of use, but that's up to you to decide...
407 Now the script can be run as an array job while connecting to the jnetDB. This should make running large Jpred runs on the cluster much more efficient.
413 =item B<--seq> <sequence filename>
415 Optional: if a filename is not supplied then cluster array context is assumed.
417 This can either be a fasta-formatted file with a single sequence or a multiple sequence alignment in B<msf> format. The file should have the extension .fas* or .msf
419 =item B<--out> <path>
423 Path to where the results files are to be stored.
425 =item B<--pipe> <name>
427 Select which pipeline is required. There are currently three options: 'cuff', 'jpred2' and 'jpred3'. As the names suggest the 'cuff' pipeline is the one used (as far as I can tell) by James Cuff and the 'server' is the one used by the, erm, server.
431 =item B<--jnet> <version>
433 Select which version of JNet is required to generate the secondary structure prediction. There are currently two options: '2' and '2.2'. JNet v2 is the original version as created by James Cuff and v2.2 is the bug-fixed version updated by Jon Barber as used by the server.
439 For cluster use only. Use the local disk as a working directory. [Default: off]
443 In array context retrieve the files from the Jnet database, otherwise assume they're on disk. [Default: off]
447 I B<strongly> recommend that users only run using the default settings for --pipe and --jnet as things are liable to break otherwise. Those options are mainly for me to do benchmarking between different versions and are specific to my setup. I don't think this is a problem as the defaults simply use what the Jpred server uses, which will be what you want 99.9% of the time. The other 0.01% is your problem ;-)
451 To simply run on a machine:
453 run_jpred.pl --seq test.fas --out my_results/
455 will run PSI-BLAST and everything else on test.fas and the JNet output will be found in my_results/test.jnet
457 To submit to the cluster nodes:
459 qsub -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --seqtest.fas --out my_results/
461 and the results will be found in the same place.
463 To submit an array job for sequences 100-200 from jnetDB:
465 qsub -t 100-200 -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --out my_results/
469 There are currenly no known bugs...
477 Chris Cole <christian@cole.name>