5 run_large_data.pl - script to run a large dataset against Jpred using the cluster
16 # temporary. Should put this module somewhere more global.
17 use lib '/homes/ccole/lib';
18 use Cluster::ArrayJob 0.3;
21 my $scriptName = "jpredLarge.pl";
22 my $title = 'jpredLarge';
31 'verbose!' => \$VERBOSE,
37 pod2usage(-verbose => 2) if ($man);
38 pod2usage(-verbose => 1) if ($help);
39 pod2usage(-msg => 'Please supply a valid filename.') if (!$file or !-e $file);
41 print "Reading fasta file...\n";
42 my $total = split_fasta($file);
43 print "Created $total fasta files\n";
45 write_script($scriptName);
48 ## set up Array Job and submit perl script
49 ## Construct with the defaults, then set queue, job name, share, priority, RAM and PERL5LIB explicitly below.
50 print "Submitting $total Jpred jobs to the cluster as an array job...\n" if $VERBOSE;
51 my $sgeArray = Cluster::ArrayJob->new();
52 $sgeArray->taskRange("1-$total");
53 $sgeArray->queue('64bit-pri.q');
55 $sgeArray->jobname($title);
56 $sgeArray->setJobShare(0);
57 $sgeArray->setPriority(-100);
58 $sgeArray->setResourceRequest( 'ram' => '6G' );
59 $sgeArray->setEnv( 'PERL5LIB' => '/homes/www-jpred/live/lib' );
60 $sgeArray->submit($scriptName);
62 print "Checking status of array job...\n" if $VERBOSE;
64 my $status = $sgeArray->getStatus() or die "ERROR - unable to get SGE job status: ", $sgeArray->error();
65 if ($status eq '-1') {
66 print "Job has finished\n" if $VERBOSE;
68 } elsif ($status eq '1') {
69 die "ERROR - unable to get SGE job status: ", $sgeArray->error();
72 foreach my $k (sort keys %$status) {
73 $out .= " $k:".$status->{$k};
75 print "Job status: $out\n" if $VERBOSE;
80 ## Finally, check that all jobs completed successfully.
81 my $jobID = $sgeArray->jobid();
82 my @files = glob "$title.e$jobID.*";
85 ## check that the correct number of SGE files exist - should be the same as the number of tasks
86 my $nFiles = scalar @files;
87 if ($nFiles != $total) {
88 warn "Warning - found $nFiles SGE output files where $total expected\n";
92 ## check that the right number of Jnet outputs were generated
93 my @jnets = glob "*.jnet";
94 my $nJnets = scalar @jnets;
95 if ($total != $nJnets) {
96 warn "Warning - found $nJnets Jnet predictions where $total expected\n";
98 print "All Jpred searches completed!\n";
102 ## write script with code from __DATA__ below
106 open(my $OUT, '>', $file) or die "ERROR - unable to open file '$file': $!\n"; # 3-arg open (as in split_fasta): mode is fixed and never parsed out of the filename
114 ## short function to split a Fasta file
115 ## into one file per sequence
119 open(my $FAS, "<", $file) or die "ERROR - unable to open '$file': ${!}\nDied";
123 ## check first line has appropriate start
125 die "ERROR - this file does not appear to be a Fasta file\n" unless (/^>/);
127 ## for each new record open a new file and close the preceding one
130 my $out = "$num.fasta";
131 close($OUT) if ($OUT); # needed for first one; can't close if nothing's open
132 open($OUT, ">", $out) or die "ERROR - unable to open '$out' for write: ${!}\nDied";
143 run_large_data.pl --in <file> [--verbose] [--debug] [--man] [--help]
147 Running a large set of sequences against Jpred can be a bit of a pain as Jpred can be very memory intensive and can take a while.
149 Therefore, use of the cluster is best, but managing the jobs can be a bit fiddly. So, this script does it all for you!
151 Just provide a Fasta file with all the sequences you want to run and the script will submit each of them to the cluster separately, monitor their progress and report if there have been any failures. Job done!
159 Input file (fasta format).
161 =item B<--verbose|--no-verbose>
163 Toggle verbosity. [default:none]
165 =item B<--debug|--no-debug>
167 Toggle debugging output. [default:none]
175 Full manpage of program.
181 Chris Cole <christian@cole.name>
191 my $pwd = $ENV{PWD}; # get current directory
192 my $dir = $ENV{TMPDIR}; # get SGE tmpdir
193 my $task = $ENV{'SGE_TASK_ID'}; # get SGE array task ID
194 die "ERROR - not in SGE array job context. Please submit as an array job.\n" if (!$task or $task eq 'undefined');
196 die "ERROR - file '$task.fasta' not found at '$pwd'. Check path.\n" unless (-e "$task.fasta");
197 if (-s "$task.jnet") {
198 print "A Jnet prediction already exists for '$task.fasta'. Skipping...\n";
202 my $cmd = "cd $dir && /homes/www-jpred/live/jpred/jpred --sequence $pwd/$task.fasta --output $task";
203 print "Running CMD: $cmd\n";
204 system($cmd) == 0 or die "ERROR - system() died\n";
205 if (-s "$dir/$task.jnet") {
206 print "Jnet successful! Copying files from cluster to $pwd...";
207 system("cp $dir/$task.* $pwd/") ==0 or die "ERROR - system() died\n";
210 #my $out = `ls -ltr $dir`;
211 #print "ls -ltr:\n$out\n";
212 print "No Jnet prediction found for '$task'. Something failed...\n"