JPRED-2 Initial commit of software for the Jpred website (some files excluded due...

[jpred.git] / websoft / bin / run_jpred.pl
diff --git a/websoft/bin/run_jpred.pl b/websoft/bin/run_jpred.pl

new file mode 100755 (executable)

index 0000000..78c40d5
--- /dev/null
+++ b/websoft/bin/run_jpred.pl
@@ -0,0 +1,479 @@
+#!/usr/bin/perl
+
+# script to run a specified jpred pipeline on a given
+# sequence. Helps to separate the running of Jpred/Jnet
+# from any pre-/post-processing of data.
+
+=head1 NAME
+
+run_jpred.pl - a wrapper to run Jpred
+
+=cut 
+
+use warnings;
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+use lib "/homes/www-jpred/new_site/bin64";
+
+use jpred;
+
+use lib '/homes/chris/lib';
+
+use Jpred::jnetDB;
+
+my $pipe = 'jpred3';  # specify which Jpred pipeline to use (not sure which ones work anymore)
+my $jnet = '2.2';     # specify which version of Jnet to use (input paramters changed following 2.1)
+my $useLocal;         # do prediction on cluster node's local disk
+my $useDB = 0;        # find query sequences in DB for array jobs
+my $path;             # output path for results
+my $fasta;            # input sequence in Fasta format
+my $help;
+my $man;
+
+GetOptions(
+   'seq=s'   => \$fasta,
+   'pipe=s'  => \$pipe,
+   'out=s'   => \$path,
+   'use-local' => \$useLocal,
+   'jnet=s'  => \$jnet,
+   'use-db!' => \$useDB,
+   'help|?'  => \$help,
+   'man'     => \$man
+) or pod2usage(0);
+
+pod2usage(-verbose => 2) if $man;
+pod2usage(-verbose => 1) if $help;
+
+## check to see if in array context
+my $seqID = $ENV{'SGE_TASK_ID'};
+undef $seqID if ($seqID eq 'undefined');
+
+pod2usage(-msg => 'Please give a sequence filename or run as an array job on the cluster', -verbose => 0) if (!$fasta && !$seqID);
+pod2usage(-msg => 'Please give an output path', -verbose => 0) if !$path;
+
+chdir($path) or die "ERROR - unable to cd to \'$path\': $!\nDied";
+
+my $clustDir;
+my $msf = 0;
+if ($seqID) {
+   ## we're cluster array context
+   printf "On cluster node: %s\n", `hostname`;
+
+   if ($useLocal) {
+      ## find temporary directory on local disk
+      $clustDir = $ENV{TMPDIR};
+      print "Cluster TMPDIR: $clustDir\n"; 
+   }
+   
+   if ($useDB) {
+      ##retrieve data from DB
+      print "Retrieving $seqID from JnetDB...\n";
+      my $dbh = connect_DB('ro');
+      my $seq = get_seq($dbh, $seqID);
+      die "ERROR - '$seqID' not found in DB.\nDied" unless ($seq);
+      $dbh->disconnect();
+      if ($clustDir) {
+         $fasta = "$clustDir/$seqID.fasta";
+      } else {
+         $fasta = "$path/$seqID.fasta";
+      }
+   
+      ## write seq data to file 
+      open(my $fh, ">$fasta") or die "ERROR - '$fasta': $!";
+      print $fh ">$seqID\n$seq\n";
+      close($fh);
+   } else {
+      ## else get it from the file
+      $fasta = "$seqID.fasta";
+      die "ERROR - file '$fasta' doesn't exits.\nDied" unless (-e $fasta);
+
+      if ($clustDir) {
+         system("cp $fasta $clustDir/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$clustDir/$seqID.fasta'\nDied";
+      } else {
+         system("cp $fasta $path/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$path/$seqID.fasta'\nDied";
+      }
+      
+   }
+} elsif ($fasta) {
+   if ($fasta =~ /\/*([^\/]+)?\.fas/) {
+      $seqID = $1;
+   } elsif ($fasta =~ /\/*([^\/]+)?\.msf/) {
+      print "Found an alignement file...\n";
+      $seqID = $1;
+      $msf = 1;
+   } else {
+      die "ERROR - unable to match for \'$fasta\'";
+   }
+      
+} else {
+   die "ERROR - unknown error ;)";
+}
+
+if (-e "$seqID.jnet") {
+   print "Prediction already exists for '$seqID'. Stopping...\n";
+   my @del = glob "*.e*.$seqID";
+   unlink(@del);
+   exit;
+}
+
+if ($pipe eq 'new') {
+   #
+   # cp seq input file to out dir if not already there
+   #
+   mkdir($seqID) or die "ERROR - can't mkdir \'$seqID\': $!\nDied";
+   chdir($seqID) or die "ERROR - can't chdir \'$seqID\': $!\nDied";
+   if (!-e "$seqID.fasta") {
+      system("cp $path/$fasta $seqID.fasta") == 0 or die "ERROR - \'cp $path/$fasta $path/$seqID/$seqID.fasta\' failed to run";
+   }
+   
+   my $cmd = "$JPREDROOT/new_pipeline/jpred --seq $seqID.fasta --output $seqID --db swall";
+   system($cmd) == 0 or error("$JPREDROOT/new_pipeline/jpred", $seqID);
+   unless ($?) {   # when Jnet successfully finishes, copy output files to directory where 'validate_jnet' can find them.
+      chdir('..');
+      system("ln -s $seqID/$seqID.jnet $seqID.jnet");
+      system("ln -s $seqID/$seqID.align $seqID.align");
+      my @clust = glob "$path/*.$seqID";
+      unlink(@clust);
+   }
+   exit;
+}
+
+
+if ($msf) {
+   #
+   # if input is an alignment reformat it and generate an HMM profile
+   #
+   reformat($fasta, $seqID);
+   my $error = get_hmm($seqID);
+   if ($error) {
+      error($error, $seqID);
+      exit(1);
+   }  
+   
+   
+} else {
+   #
+   # cp seq input file to out dir if not already there
+   #
+   if (!-e "$seqID.fasta") {
+      system("cp $fasta $seqID.fasta") == 0 or die "ERROR - \'cp $fasta $path/$seqID.fasta\' failed to run";
+   }
+
+   #
+   # check we know which pipeline to use and run it
+   #
+   if ($pipe eq 'cuff') {
+       my $error = old_pipeline($seqID);
+      if ($error) {
+               error($error, $seqID);
+         exit(1);
+      }
+   } elsif ($pipe eq 'jpred2')  {
+       my $error = jpred2_pipe($seqID);
+      if ($error) {
+         error($error, $seqID);
+         exit(1);
+      }
+   } elsif ($pipe eq 'jpred3')  {
+      my $dir;
+      if ($useLocal) {
+         $dir = $clustDir;
+      } else {
+         $dir = $path;
+      }
+       my $error = jpred3_pipe($seqID, $dir);
+      if ($error) {
+         error($error, $seqID);
+         exit(1);
+      }
+      if ($useLocal) {
+         print "Rsyncing...\n";
+         system("rsync -az $clustDir/$seqID.* $path/") == 0 or die "ERROR - system() rsync command fails: $?\nDied";
+      }
+      exit;
+   } else {
+      die "ERROR - unknown pipeline \'$pipe\'\n";
+   }
+}
+
+#
+# run Jnet
+#
+my $cmd;
+# QSUB IGNORE   $cmd = "$BINDIR/jnet_tng --cross 1 --hmm_prof $seqID.hmmprof --psi_prof $seqID.pssm --freq $seqID.freq > $seqID.jnet";
+if ($msf) {
+   #
+   #  only JNet 2.2 can be run with a pre-generated alignment
+   #
+   $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof > $seqID.jnet";
+} else {
+   #
+   #  generate correct command sytax dependent on version of JNet
+   #
+   if ($jnet > 2.1) {
+          $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof --psiprof $seqID.pssm --psifreq $seqID.freq > $seqID.jnet";
+   } elsif ($jnet eq '2') {
+          $cmd = "$BINDIR/jnet.orig -c $seqID.align $seqID.hmmprof $seqID.pssm $seqID.freq > $seqID.jnet";
+   } else {
+      die "ERROR - unknown version of JNet \'$jnet\'\n";
+   }
+}
+print "Running: $cmd\n";
+system($cmd) == 0      or error("$BINDIR/jnet", $seqID);
+exit(1) if $?;
+
+exit(0);
+
+sub jpred3_pipe {
+   my ($seqID, $localDir) = @_;
+   
+   my $run_cmd = "/homes/www-jpred/new_site/jpred_new/jpred --seq $localDir/$seqID.fasta --output $localDir/$seqID --db uniref90 --verbose";
+   print "--CMD $run_cmd\n";
+   system($run_cmd) == 0 or return('jpred3');
+   return(0);
+}
+
+sub old_pipeline {
+       
+   # original JNet pipeline used for the James Cuff version
+   # of the software. Runs all of the necessary scripts to generate
+   # input for JNet.
+   
+   # NB to compare with the 2000 paper requires at least 2 hits to 
+   # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
+   
+   my ($seqID, $seq) = @_;
+   
+   my $cmd = "$BINDIR/blastpgp -d $SWALLFILT -b20000 -a2 -j3 -e0.05 -h0.01 -m6 -i $seqID.fasta -Q $seqID.pssm -o $seqID.blast";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/blastpgp");
+   
+   print "Runing: $BINDIR/parse_psi -freq $seqID.blast > $seqID.freq\n";
+   system("$BINDIR/parse_psi -freq $seqID.blast > $seqID.freq") == 0
+       or return("$BINDIR/parse_psi -freq");
+   
+   print "Runing: $BINDIR/parse_psi -ungap $seqID.blast > $seqID.align\n";
+   system("$BINDIR/parse_psi -ungap $seqID.blast > $seqID.align")      == 0
+       or return("$BINDIR/parse_psi -ungap");
+   #my $num = `grep -c '>' $seqID.align`;
+   #return("Insufficient PSI-BLAST hits") if ($num < 3);   # need at least 2 hits to query (2+1 = 3) to get a decent profile
+   
+   print "Running: $BINDIR/fasta2msf < $seqID.align > $seqID.msf\n";
+   system("$BINDIR/fasta2msf < $seqID.align > $seqID.msf") == 0
+       or return("$BINDIR/fasta2msf");
+   
+   print "Running: $BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf";
+   system("$BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf") == 0
+       or return("$BINDIR/hmmbuild");
+      
+   print "Running: $BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof\n";
+   system("$BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof") == 0
+       or return("$BINDIR/hmmconver");
+      
+   print "Running: $BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof\n";
+   system("$BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof") == 0
+       or return("$BINDIR/hmm2profile");
+   unlink("${seqID}_tmp.prof", "${seqID}_tmp.hmm");
+   
+   return(0);
+}
+
+sub jpred2_pipe {
+       
+   # current pipeline as used by the Jpred server.
+   
+   # NB to compare with the 2000 paper requires at least 2 hits to 
+   # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
+   
+   my ($seqID, $seq) = @_;
+   
+   my $cmd = "$BINDIR/psiblast --query $seqID.fasta --profile $seqID.profile --blast $seqID.blast > $seqID.alignment";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/psiblast");
+   #my $num = `grep -c '>' $seqID.alignment`;
+   #return("Insufficient PSI-BLAST hits") if ($num < 3);   # need at least 2 hits to query (2+1 = 3) to get a decent profile
+   
+   $cmd = "$BINDIR/reduce --limit 1000 < $seqID.alignment > $seqID.align";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/reduce");
+   
+   $cmd = "$BINDIR/nonred --cutoff 75 --in $seqID.align --out $seqID.oc > $seqID.align.fasta";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/nonred");
+   
+   $cmd = "$BINDIR/trim_seqs --factor 1.5 < $seqID.align.fasta > ${seqID}_tmp";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/trim_seqs");
+   if (fasta_chk_no_seqs("${seqID}_tmp") < 2) {
+      warn "Warning - Trimmed too many sequences from alignment. Now using all BLAST hits.\n";
+      system("cp $seqID.alignment $seqID.align.fasta") == 0 or return("\'cp $seqID.alignment $seqID.align.fasta\'");
+   } else {
+      rename("${seqID}_tmp", "$seqID.align.fasta");
+   }
+   
+   #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align.fasta > $seqID.align.msf";
+   #print "Running: $cmd\n";
+   #system($cmd) == 0 or return("$BINDIR/readseq");
+   
+   $cmd = "$BINDIR/clean_fasta $seqID.align.fasta > ${seqID}_tmp";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/clean_fasta");
+   rename("${seqID}_tmp", "$seqID.align.fasta");
+   
+   $cmd = "$BINDIR/fasta2upper < $seqID.align.fasta > ${seqID}_tmp";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/fasta2upper");
+   rename("${seqID}_tmp", "$seqID.align.fasta");
+   
+   $cmd = "$BINDIR/fasta2jnet < $seqID.align.fasta > $seqID.align";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/fasta2jnet");
+   
+   #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align > $seqID.align.msf";
+   #print "Running: $cmd\n";
+   #system($cmd) == 0 or return("$BINDIR/readseq");
+   
+   $cmd = "$BINDIR/profile < $seqID.align > $seqID.freq";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/profile");
+   
+   $cmd = "$BINDIR/getpssm < $seqID.profile > $seqID.pssm";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/getpssm");
+   
+   ## use fasta input rather than msf as sometimes fails. Is it due to faults in readseq above?
+   #$cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
+   $cmd = "$BINDIR/gethmm -in $seqID.align -out $seqID.hmmprof > /dev/null";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/gethmm");
+   
+   return(0);
+}
+
+sub error {
+       
+   my ($app, $seq) = @_;
+   
+   warn "ERROR - $app failed to run for $seq\n";
+}
+
+sub reformat {
+   
+   my ($file, $seqID) = @_;
+   
+   system("$BINDIR/readseq -fMSF -a -p < $file > $seqID.align.msf");
+   if ($?) {
+      error("$BINDIR/readseq", $seqID);
+      exit;
+   }
+   system("$BINDIR/readseq -f8 -a -p < $seqID.align.msf > $seqID.align.fas")
+   
+}
+
+sub get_hmm {
+   my ($seqID) = @_;
+   
+   $cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
+   print "Running: $cmd\n";
+   system($cmd) == 0 or return("$BINDIR/gethmm");
+   
+   return(0);
+
+}
+sub fasta_chk_no_seqs {  # taken from 'webrun' in the Jpred suite of scripts
+   my $file = shift;
+   open my $fh, $file or die "$!: $file";
+   my $i = 0;
+   local $/ = "\n";
+   while (<$fh>) { $i++ if /^>/ }
+   close $fh;
+   return $i;
+}
+
+=head1 SYNOPSIS
+
+run_jpred.pl  --seq <sequence filename>  --out <path>  --pipe <name>  --jnet <version> [--use-local] [--use-db]  [--help] [--man]
+
+=head1 DESCRIPTION
+
+Jpred is made up of a whole host of scripts and programs which prepare the inputs/outputs to/from the JNet code. Doing this manually is pretty painful and not recommended :-)
+
+This wrapper should make life a lot easier whereby the user simply supplies a sequence (or a multiple alignment) and the script does the rest.
+
+Due to all the programs that this wrapper encompasses there are many output files generated (at least one for each step), but the main Jpred prediction is found in the .jnet file. All the other files may be of use, but that's up to you to decide...
+
+Now the script can be run as an array job while connecting to the jnetDB. This should make running large Jpred runs on the cluster much more efficient.
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--seq> <sequence filename>
+
+Optional: if a filename is not supplied then cluster array context is assumed.
+
+This can either be a fasta-formatted file with a single sequence or a multiple sequence alignment in B<msf> format. The file should have the extension .fas* or .msf
+
+=item B<--out> <path>
+
+Required.
+
+Path to where the results files are to be stored.
+
+=item B<--pipe> <name>
+
+Select which pipeline is required. There are currently three options: 'cuff', 'jpred2' and 'jpred3'. As the names suggest the 'cuff' pipeline is the one used (as far as I can tell) by James Cuff and the 'server' is the one used by the, erm, server.
+
+Default: 'jpred3'
+
+=item B<--jnet> <version>
+
+Select which version of JNet is required to generate the secondary structure prediction. There are currently two options: '2' and '2.2'. JNet v2 is the original version as created by James Cuff and v2.2 is the bug-fixed version updated by Jon Barber as used by the server.
+
+Default: 2.2
+
+=item B<--use-local>
+
+For cluster use only. Use the local disk as a working directory. [Default: off]
+
+=item B<--use-db>
+
+In array context retrieve the files from the Jnet database, otherwise assume they're on disk. [Default: off]
+
+=back
+
+I B<strongly> recommend that users only run using the default settings for --pipe and --jnet as things are liable to break otherwise. Those options are mainly for me to do benchmarking between different versions and are specific to my setup. I don't think this is a problem as the defaults simply use what the Jpred server uses, which will be what you want 99.9% of the time. The other 0.01% is your problem ;-)
+
+=head1 EXAMPLES
+
+To simply run on a machine:
+
+ run_jpred.pl  --seq test.fas --out my_results/
+ 
+will run PSI-BLAST and everything else on test.fas and the JNet output will be found in my_results/test.jnet
+
+To submit to the cluster nodes:
+
+ qsub -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --seqtest.fas --out my_results/
+  
+and the results will be found in the same place.
+
+To submit an array job for sequences 100-200 from jnetDB:
+
+ qsub -t 100-200 -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --out my_results/
+
+=head1 BUGS
+
+There are currenly no known bugs...
+
+=head1 VERSION
+
+This is version 0.5
+
+=head1 AUTHOR
+
+Chris Cole <christian@cole.name>
+
+=cut