websoft/bin/run_jpred.pl

   1 #!/usr/bin/perl
   2
   3 # script to run a specified jpred pipeline on a given
   4 # sequence. Helps to separate the running of Jpred/Jnet
   5 # from any pre-/post-processing of data.
   6
   7 =head1 NAME
   8
   9 run_jpred.pl - a wrapper to run Jpred
  10
  11 =cut
  12
  13 use warnings;
  14 use strict;
  15 use Getopt::Long;
  16 use Pod::Usage;
  17
  18 use lib "/homes/www-jpred/new_site/bin64";
  19
  20 use jpred;
  21
  22 use lib '/homes/chris/lib';
  23
  24 use Jpred::jnetDB;
  25
  26 my $pipe = 'jpred3';  # specify which Jpred pipeline to use (not sure which ones work anymore)
  27 my $jnet = '2.2';     # specify which version of Jnet to use (input paramters changed following 2.1)
  28 my $useLocal;         # do prediction on cluster node's local disk
  29 my $useDB = 0;        # find query sequences in DB for array jobs
  30 my $path;             # output path for results
  31 my $fasta;            # input sequence in Fasta format
  32 my $help;
  33 my $man;
  34
  35 GetOptions(
  36    'seq=s'   => \$fasta,
  37    'pipe=s'  => \$pipe,
  38    'out=s'   => \$path,
  39    'use-local' => \$useLocal,
  40    'jnet=s'  => \$jnet,
  41    'use-db!' => \$useDB,
  42    'help|?'  => \$help,
  43    'man'     => \$man
  44 ) or pod2usage(0);
  45
  46 pod2usage(-verbose => 2) if $man;
  47 pod2usage(-verbose => 1) if $help;
  48
  49 ## check to see if in array context
  50 my $seqID = $ENV{'SGE_TASK_ID'};
  51 undef $seqID if ($seqID eq 'undefined');
  52
  53 pod2usage(-msg => 'Please give a sequence filename or run as an array job on the cluster', -verbose => 0) if (!$fasta && !$seqID);
  54 pod2usage(-msg => 'Please give an output path', -verbose => 0) if !$path;
  55
  56 chdir($path) or die "ERROR - unable to cd to \'$path\': $!\nDied";
  57
  58 my $clustDir;
  59 my $msf = 0;
  60 if ($seqID) {
  61    ## we're cluster array context
  62    printf "On cluster node: %s\n", `hostname`;
  63
  64    if ($useLocal) {
  65       ## find temporary directory on local disk
  66       $clustDir = $ENV{TMPDIR};
  67       print "Cluster TMPDIR: $clustDir\n";
  68    }
  69
  70    if ($useDB) {
  71       ##retrieve data from DB
  72       print "Retrieving $seqID from JnetDB...\n";
  73       my $dbh = connect_DB('ro');
  74       my $seq = get_seq($dbh, $seqID);
  75       die "ERROR - '$seqID' not found in DB.\nDied" unless ($seq);
  76       $dbh->disconnect();
  77       if ($clustDir) {
  78          $fasta = "$clustDir/$seqID.fasta";
  79       } else {
  80          $fasta = "$path/$seqID.fasta";
  81       }
  82
  83       ## write seq data to file
  84       open(my $fh, ">$fasta") or die "ERROR - '$fasta': $!";
  85       print $fh ">$seqID\n$seq\n";
  86       close($fh);
  87    } else {
  88       ## else get it from the file
  89       $fasta = "$seqID.fasta";
  90       die "ERROR - file '$fasta' doesn't exits.\nDied" unless (-e $fasta);
  91
  92       if ($clustDir) {
  93          system("cp $fasta $clustDir/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$clustDir/$seqID.fasta'\nDied";
  94       } else {
  95          system("cp $fasta $path/$seqID.fasta") == 0 or die "ERROR - unable to copy '$fasta' to '$path/$seqID.fasta'\nDied";
  96       }
  97
  98    }
  99 } elsif ($fasta) {
 100    if ($fasta =~ /\/*([^\/]+)?\.fas/) {
 101       $seqID = $1;
 102    } elsif ($fasta =~ /\/*([^\/]+)?\.msf/) {
 103       print "Found an alignement file...\n";
 104       $seqID = $1;
 105       $msf = 1;
 106    } else {
 107       die "ERROR - unable to match for \'$fasta\'";
 108    }
 109
 110 } else {
 111    die "ERROR - unknown error ;)";
 112 }
 113
 114 if (-e "$seqID.jnet") {
 115    print "Prediction already exists for '$seqID'. Stopping...\n";
 116    my @del = glob "*.e*.$seqID";
 117    unlink(@del);
 118    exit;
 119 }
 120
 121 if ($pipe eq 'new') {
 122    #
 123    # cp seq input file to out dir if not already there
 124    #
 125    mkdir($seqID) or die "ERROR - can't mkdir \'$seqID\': $!\nDied";
 126    chdir($seqID) or die "ERROR - can't chdir \'$seqID\': $!\nDied";
 127    if (!-e "$seqID.fasta") {
 128       system("cp $path/$fasta $seqID.fasta") == 0 or die "ERROR - \'cp $path/$fasta $path/$seqID/$seqID.fasta\' failed to run";
 129    }
 130
 131    my $cmd = "$JPREDROOT/new_pipeline/jpred --seq $seqID.fasta --output $seqID --db swall";
 132    system($cmd) == 0 or error("$JPREDROOT/new_pipeline/jpred", $seqID);
 133    unless ($?) {   # when Jnet successfully finishes, copy output files to directory where 'validate_jnet' can find them.
 134       chdir('..');
 135       system("ln -s $seqID/$seqID.jnet $seqID.jnet");
 136       system("ln -s $seqID/$seqID.align $seqID.align");
 137       my @clust = glob "$path/*.$seqID";
 138       unlink(@clust);
 139    }
 140    exit;
 141 }
 142
 143
 144 if ($msf) {
 145    #
 146    # if input is an alignment reformat it and generate an HMM profile
 147    #
 148    reformat($fasta, $seqID);
 149    my $error = get_hmm($seqID);
 150    if ($error) {
 151       error($error, $seqID);
 152       exit(1);
 153    }
 154
 155
 156 } else {
 157    #
 158    # cp seq input file to out dir if not already there
 159    #
 160    if (!-e "$seqID.fasta") {
 161       system("cp $fasta $seqID.fasta") == 0 or die "ERROR - \'cp $fasta $path/$seqID.fasta\' failed to run";
 162    }
 163
 164    #
 165    # check we know which pipeline to use and run it
 166    #
 167    if ($pipe eq 'cuff') {
 168         my $error = old_pipeline($seqID);
 169       if ($error) {
 170                 error($error, $seqID);
 171          exit(1);
 172       }
 173    } elsif ($pipe eq 'jpred2')  {
 174         my $error = jpred2_pipe($seqID);
 175       if ($error) {
 176          error($error, $seqID);
 177          exit(1);
 178       }
 179    } elsif ($pipe eq 'jpred3')  {
 180       my $dir;
 181       if ($useLocal) {
 182          $dir = $clustDir;
 183       } else {
 184          $dir = $path;
 185       }
 186         my $error = jpred3_pipe($seqID, $dir);
 187       if ($error) {
 188          error($error, $seqID);
 189          exit(1);
 190       }
 191       if ($useLocal) {
 192          print "Rsyncing...\n";
 193          system("rsync -az $clustDir/$seqID.* $path/") == 0 or die "ERROR - system() rsync command fails: $?\nDied";
 194       }
 195       exit;
 196    } else {
 197       die "ERROR - unknown pipeline \'$pipe\'\n";
 198    }
 199 }
 200
 201 #
 202 # run Jnet
 203 #
 204 my $cmd;
 205 # QSUB IGNORE   $cmd = "$BINDIR/jnet_tng --cross 1 --hmm_prof $seqID.hmmprof --psi_prof $seqID.pssm --freq $seqID.freq > $seqID.jnet";
 206 if ($msf) {
 207    #
 208    #  only JNet 2.2 can be run with a pre-generated alignment
 209    #
 210    $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof > $seqID.jnet";
 211 } else {
 212    #
 213    #  generate correct command sytax dependent on version of JNet
 214    #
 215    if ($jnet > 2.1) {
 216            $cmd = "$BINDIR/jnet --concise --sequence $seqID.align --hmmer $seqID.hmmprof --psiprof $seqID.pssm --psifreq $seqID.freq > $seqID.jnet";
 217    } elsif ($jnet eq '2') {
 218            $cmd = "$BINDIR/jnet.orig -c $seqID.align $seqID.hmmprof $seqID.pssm $seqID.freq > $seqID.jnet";
 219    } else {
 220       die "ERROR - unknown version of JNet \'$jnet\'\n";
 221    }
 222 }
 223 print "Running: $cmd\n";
 224 system($cmd) == 0       or error("$BINDIR/jnet", $seqID);
 225 exit(1) if $?;
 226
 227 exit(0);
 228
 229 sub jpred3_pipe {
 230    my ($seqID, $localDir) = @_;
 231
 232    my $run_cmd = "/homes/www-jpred/new_site/jpred_new/jpred --seq $localDir/$seqID.fasta --output $localDir/$seqID --db uniref90 --verbose";
 233    print "--CMD $run_cmd\n";
 234    system($run_cmd) == 0 or return('jpred3');
 235    return(0);
 236 }
 237
 238 sub old_pipeline {
 239
 240    # original JNet pipeline used for the James Cuff version
 241    # of the software. Runs all of the necessary scripts to generate
 242    # input for JNet.
 243
 244    # NB to compare with the 2000 paper requires at least 2 hits to
 245    # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
 246
 247    my ($seqID, $seq) = @_;
 248
 249    my $cmd = "$BINDIR/blastpgp -d $SWALLFILT -b20000 -a2 -j3 -e0.05 -h0.01 -m6 -i $seqID.fasta -Q $seqID.pssm -o $seqID.blast";
 250    print "Running: $cmd\n";
 251    system($cmd) == 0 or return("$BINDIR/blastpgp");
 252
 253    print "Runing: $BINDIR/parse_psi -freq $seqID.blast > $seqID.freq\n";
 254    system("$BINDIR/parse_psi -freq $seqID.blast > $seqID.freq") == 0
 255         or return("$BINDIR/parse_psi -freq");
 256
 257    print "Runing: $BINDIR/parse_psi -ungap $seqID.blast > $seqID.align\n";
 258    system("$BINDIR/parse_psi -ungap $seqID.blast > $seqID.align")       == 0
 259         or return("$BINDIR/parse_psi -ungap");
 260    #my $num = `grep -c '>' $seqID.align`;
 261    #return("Insufficient PSI-BLAST hits") if ($num < 3);   # need at least 2 hits to query (2+1 = 3) to get a decent profile
 262
 263    print "Running: $BINDIR/fasta2msf < $seqID.align > $seqID.msf\n";
 264    system("$BINDIR/fasta2msf < $seqID.align > $seqID.msf") == 0
 265         or return("$BINDIR/fasta2msf");
 266
 267    print "Running: $BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf";
 268    system("$BINDIR/hmmbuild -F --fast --gapmax 1 --wblosum ${seqID}_tmp.hmm $seqID.msf") == 0
 269         or return("$BINDIR/hmmbuild");
 270
 271    print "Running: $BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof\n";
 272    system("$BINDIR/hmmconvert -F -p ${seqID}_tmp.hmm ${seqID}_tmp.prof") == 0
 273         or return("$BINDIR/hmmconver");
 274
 275    print "Running: $BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof\n";
 276    system("$BINDIR/hmm2profile ${seqID}_tmp.prof > $seqID.hmmprof") == 0
 277         or return("$BINDIR/hmm2profile");
 278    unlink("${seqID}_tmp.prof", "${seqID}_tmp.hmm");
 279
 280    return(0);
 281 }
 282
 283 sub jpred2_pipe {
 284
 285    # current pipeline as used by the Jpred server.
 286
 287    # NB to compare with the 2000 paper requires at least 2 hits to
 288    # the query for PSI-BLAST runs. NOT USED AT THE MOMENT
 289
 290    my ($seqID, $seq) = @_;
 291
 292    my $cmd = "$BINDIR/psiblast --query $seqID.fasta --profile $seqID.profile --blast $seqID.blast > $seqID.alignment";
 293    print "Running: $cmd\n";
 294    system($cmd) == 0 or return("$BINDIR/psiblast");
 295    #my $num = `grep -c '>' $seqID.alignment`;
 296    #return("Insufficient PSI-BLAST hits") if ($num < 3);   # need at least 2 hits to query (2+1 = 3) to get a decent profile
 297
 298    $cmd = "$BINDIR/reduce --limit 1000 < $seqID.alignment > $seqID.align";
 299    print "Running: $cmd\n";
 300    system($cmd) == 0 or return("$BINDIR/reduce");
 301
 302    $cmd = "$BINDIR/nonred --cutoff 75 --in $seqID.align --out $seqID.oc > $seqID.align.fasta";
 303    print "Running: $cmd\n";
 304    system($cmd) == 0 or return("$BINDIR/nonred");
 305
 306    $cmd = "$BINDIR/trim_seqs --factor 1.5 < $seqID.align.fasta > ${seqID}_tmp";
 307    print "Running: $cmd\n";
 308    system($cmd) == 0 or return("$BINDIR/trim_seqs");
 309    if (fasta_chk_no_seqs("${seqID}_tmp") < 2) {
 310       warn "Warning - Trimmed too many sequences from alignment. Now using all BLAST hits.\n";
 311       system("cp $seqID.alignment $seqID.align.fasta") == 0 or return("\'cp $seqID.alignment $seqID.align.fasta\'");
 312    } else {
 313       rename("${seqID}_tmp", "$seqID.align.fasta");
 314    }
 315
 316    #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align.fasta > $seqID.align.msf";
 317    #print "Running: $cmd\n";
 318    #system($cmd) == 0 or return("$BINDIR/readseq");
 319
 320    $cmd = "$BINDIR/clean_fasta $seqID.align.fasta > ${seqID}_tmp";
 321    print "Running: $cmd\n";
 322    system($cmd) == 0 or return("$BINDIR/clean_fasta");
 323    rename("${seqID}_tmp", "$seqID.align.fasta");
 324
 325    $cmd = "$BINDIR/fasta2upper < $seqID.align.fasta > ${seqID}_tmp";
 326    print "Running: $cmd\n";
 327    system($cmd) == 0 or return("$BINDIR/fasta2upper");
 328    rename("${seqID}_tmp", "$seqID.align.fasta");
 329
 330    $cmd = "$BINDIR/fasta2jnet < $seqID.align.fasta > $seqID.align";
 331    print "Running: $cmd\n";
 332    system($cmd) == 0 or return("$BINDIR/fasta2jnet");
 333
 334    #$cmd = "$JAVABIN -jar $BINDIR/readseq.jar -fMSF -a -p < $seqID.align > $seqID.align.msf";
 335    #print "Running: $cmd\n";
 336    #system($cmd) == 0 or return("$BINDIR/readseq");
 337
 338    $cmd = "$BINDIR/profile < $seqID.align > $seqID.freq";
 339    print "Running: $cmd\n";
 340    system($cmd) == 0 or return("$BINDIR/profile");
 341
 342    $cmd = "$BINDIR/getpssm < $seqID.profile > $seqID.pssm";
 343    print "Running: $cmd\n";
 344    system($cmd) == 0 or return("$BINDIR/getpssm");
 345
 346    ## use fasta input rather than msf as sometimes fails. Is it due to faults in readseq above?
 347    #$cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
 348    $cmd = "$BINDIR/gethmm -in $seqID.align -out $seqID.hmmprof > /dev/null";
 349    print "Running: $cmd\n";
 350    system($cmd) == 0 or return("$BINDIR/gethmm");
 351
 352    return(0);
 353 }
 354
 355 sub error {
 356
 357    my ($app, $seq) = @_;
 358
 359    warn "ERROR - $app failed to run for $seq\n";
 360 }
 361
 362 sub reformat {
 363
 364    my ($file, $seqID) = @_;
 365
 366    system("$BINDIR/readseq -fMSF -a -p < $file > $seqID.align.msf");
 367    if ($?) {
 368       error("$BINDIR/readseq", $seqID);
 369       exit;
 370    }
 371    system("$BINDIR/readseq -f8 -a -p < $seqID.align.msf > $seqID.align.fas")
 372
 373 }
 374
 375 sub get_hmm {
 376    my ($seqID) = @_;
 377
 378    $cmd = "$BINDIR/gethmm -in $seqID.align.msf -out $seqID.hmmprof > /dev/null";
 379    print "Running: $cmd\n";
 380    system($cmd) == 0 or return("$BINDIR/gethmm");
 381
 382    return(0);
 383
 384 }
 385 sub fasta_chk_no_seqs {  # taken from 'webrun' in the Jpred suite of scripts
 386    my $file = shift;
 387    open my $fh, $file or die "$!: $file";
 388    my $i = 0;
 389    local $/ = "\n";
 390    while (<$fh>) { $i++ if /^>/ }
 391    close $fh;
 392    return $i;
 393 }
 394
 395 =head1 SYNOPSIS
 396
 397 run_jpred.pl  --seq <sequence filename>  --out <path>  --pipe <name>  --jnet <version> [--use-local] [--use-db]  [--help] [--man]
 398
 399 =head1 DESCRIPTION
 400
 401 Jpred is made up of a whole host of scripts and programs which prepare the inputs/outputs to/from the JNet code. Doing this manually is pretty painful and not recommended :-)
 402
 403 This wrapper should make life a lot easier whereby the user simply supplies a sequence (or a multiple alignment) and the script does the rest.
 404
 405 Due to all the programs that this wrapper encompasses there are many output files generated (at least one for each step), but the main Jpred prediction is found in the .jnet file. All the other files may be of use, but that's up to you to decide...
 406
 407 Now the script can be run as an array job while connecting to the jnetDB. This should make running large Jpred runs on the cluster much more efficient.
 408
 409 =head1 OPTIONS
 410
 411 =over 8
 412
 413 =item B<--seq> <sequence filename>
 414
 415 Optional: if a filename is not supplied then cluster array context is assumed.
 416
 417 This can either be a fasta-formatted file with a single sequence or a multiple sequence alignment in B<msf> format. The file should have the extension .fas* or .msf
 418
 419 =item B<--out> <path>
 420
 421 Required.
 422
 423 Path to where the results files are to be stored.
 424
 425 =item B<--pipe> <name>
 426
 427 Select which pipeline is required. There are currently three options: 'cuff', 'jpred2' and 'jpred3'. As the names suggest the 'cuff' pipeline is the one used (as far as I can tell) by James Cuff and the 'server' is the one used by the, erm, server.
 428
 429 Default: 'jpred3'
 430
 431 =item B<--jnet> <version>
 432
 433 Select which version of JNet is required to generate the secondary structure prediction. There are currently two options: '2' and '2.2'. JNet v2 is the original version as created by James Cuff and v2.2 is the bug-fixed version updated by Jon Barber as used by the server.
 434
 435 Default: 2.2
 436
 437 =item B<--use-local>
 438
 439 For cluster use only. Use the local disk as a working directory. [Default: off]
 440
 441 =item B<--use-db>
 442
 443 In array context retrieve the files from the Jnet database, otherwise assume they're on disk. [Default: off]
 444
 445 =back
 446
 447 I B<strongly> recommend that users only run using the default settings for --pipe and --jnet as things are liable to break otherwise. Those options are mainly for me to do benchmarking between different versions and are specific to my setup. I don't think this is a problem as the defaults simply use what the Jpred server uses, which will be what you want 99.9% of the time. The other 0.01% is your problem ;-)
 448
 449 =head1 EXAMPLES
 450
 451 To simply run on a machine:
 452
 453  run_jpred.pl  --seq test.fas --out my_results/
 454
 455 will run PSI-BLAST and everything else on test.fas and the JNet output will be found in my_results/test.jnet
 456
 457 To submit to the cluster nodes:
 458
 459  qsub -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --seqtest.fas --out my_results/
 460
 461 and the results will be found in the same place.
 462
 463 To submit an array job for sequences 100-200 from jnetDB:
 464
 465  qsub -t 100-200 -q 64bit.q -cwd -S /usr/bin/perl run_jpred.pl --out my_results/
 466
 467 =head1 BUGS
 468
 469 There are currenly no known bugs...
 470
 471 =head1 VERSION
 472
 473 This is version 0.5
 474
 475 =head1 AUTHOR
 476
 477 Chris Cole <christian@cole.name>
 478
 479 =cut