#!/usr/bin/perl =head1 NAME run_large_data.pl - script to run a large dataset against Jpred using the cluster =cut use strict; use warnings; use Getopt::Long; use Pod::Usage; use File::Basename; # temporary. Should put this module somewhere more global. use lib '/homes/ccole/lib'; use Cluster::ArrayJob 0.3; my $file; my $scriptName = "jpredLarge.pl"; my $title = 'jpredLarge'; my $VERBOSE = 0; my $DEBUG = 0; my $help; my $man; GetOptions ( 'in=s' => \$file, 'title=s' => \$title, 'verbose!' => \$VERBOSE, 'debug!' => \$DEBUG, 'man' => \$man, 'help|?' => \$help, ) or pod2usage(); pod2usage(-verbose => 2) if ($man); pod2usage(-verbose => 1) if ($help); pod2usage(-msg => 'Please supply a valid filename.') if (!$file or !-e $file); print "Reading fasta file...\n"; my $total = split_fasta($file); print "Created $total fasta files\n"; write_script($scriptName); ## set up Array Job and submit perl script ## Just use all the defaults. print "Submitting $total Jpred jobs to the cluster as an array job...\n" if $VERBOSE; my $sgeArray = Cluster::ArrayJob->new(); $sgeArray->taskRange("1-$total"); $sgeArray->queue('64bit-pri.q'); $sgeArray->setCWD(); $sgeArray->jobname($title); $sgeArray->setJobShare(0); $sgeArray->setPriority(-100); $sgeArray->setResourceRequest( 'ram' => '6G' ); $sgeArray->setEnv( 'PERL5LIB' => '/homes/www-jpred/live/lib' ); $sgeArray->submit($scriptName); print "Checking status of array job...\n" if $VERBOSE; while (1) { my $status = $sgeArray->getStatus() or die "ERROR - unable to get SGE job status: ", $sgeArray->error(); if ($status eq '-1') { print "Job has finished\n" if $VERBOSE; last; } elsif ($status eq '1') { die "ERROR - unable to get SGE job status: ", $sgeArray->error(); } else { my $out = ''; foreach my $k (sort keys %$status) { $out .= " $k:".$status->{$k}; } print "Job status: $out\n" if $VERBOSE; } sleep(30); } ## Finally, check that all jobs completed successfully. my $jobID = $sgeArray->jobid(); my @files = glob "$title.e$jobID.*"; my $errors = 0; ## check that the correct number of SGE files exist - should be the same as the number of tasks my $nFiles = scalar @files; if ($nFiles != $total) { warn "Warning - found $nFiles SGE output files where $total expected\n"; ++$errors; } ## check that the right number of Jnet outputs were generated my @jnets = glob "*.jnet"; my $nJnets = scalar @jnets; if ($total != $nJnets) { warn "Warning - found $nJnets Jnet predictions where $total expected\n"; } print "All Jpred searches completed!\n"; exit; ## write script with code from __DATA__ below sub write_script { my $file = shift; open(my $OUT, ">$file") or die "ERROR - unable to open file '$file': $!\n"; while () { print $OUT $_; } close($OUT); close(DATA); } ## short function to split a Fasta file ## into one file per sequence sub split_fasta { my $file = shift; open(my $FAS, "<", $file) or die "ERROR - unable to open '$file': ${!}\nDied"; my $num = 0; my $OUT; while(<$FAS>) { ## check first line has appriate start if ($. == 1) { die "ERROR - this file does not appear to be a Fasta file\n" unless (/^>/); } ## for each new record open a new file and close the preceeding one if (/^>/) { ++$num; my $out = "$num.fasta"; close($OUT) if ($OUT); # needed for first one; can't close if nothing's open open($OUT, ">", $out) or die "ERROR - unable to open '$out' for write: ${!}\nDied"; } print $OUT $_; } close($OUT); return($num) } =head1 SYNOPSIS run_large_data.pl --in [--verbose] [--debug] [--man] [--help] =head1 DESCRIPTION Running a large set of sequences against Jpred can be a but of a pain as Jpred can be very memory intensive and can take a while. Therefore, use of the cluster is best, but managing the jobs can be a bit fiddly. So, this script does it all for you! Just provide a Fasta file with all the sequences you want to run and the script will submit each of them to the cluster separately, monitor their progress and report if there have been any failures. Job done! =head1 OPTIONS =over 5 =item B<--in> Input file (fasta format). =item B<--verbose|--no-verbose> Toggle verbosity. [default:none] =item B<--debug|--no-debug> Toggle debugging output. [default:none] =item B<--help> Brief help. =item B<--man> Full manpage of program. =back =head1 AUTHOR Chris Cole =cut __DATA__ #!/usr/bin/perl use strict; use warnings; my $pwd = $ENV{PWD}; # get current directory my $dir = $ENV{TMPDIR}; # get SGE tmpdir my $task = $ENV{'SGE_TASK_ID'}; # get SGE array task ID die "ERROR - not in SGE array job context. Please submit as an array job.\n" if (!$task or $task eq 'undefined'); die "ERROR - file '$task.fasta' not found at '$pwd'. Check path.\n" unless (-e "$task.fasta"); if (-s "$task.jnet") { print "A Jnet prediction already exists for '$task.fasta'. Skipping...\n"; exit(); } my $cmd = "cd $dir && /homes/www-jpred/live/jpred/jpred --sequence $pwd/$task.fasta --output $task"; print "Running CMD: $cmd\n"; system($cmd) == 0 or die "ERROR - system() died\n"; if (-s "$dir/$task.jnet") { print "Jnet successful! Copying files from cluster to $pwd..."; system("cp $dir/$task.* $pwd/") ==0 or die "ERROR - system() died\n"; print "Done\n"; } else { #my $out = `ls -ltr $dir`; #print "ls -ltr:\n$out\n"; print "No Jnet prediction found for '$task'. Something failed...\n" }