#!/usr/bin/perl

=head1 NAME

run_large_data.pl - script to run a large dataset against Jpred using the cluster

=cut

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use File::Basename;

# temporary. Should put this module somewhere more global.
use lib '/homes/ccole/lib';
use Cluster::ArrayJob 0.3;

my $file;
my $scriptName = "jpredLarge.pl";
my $title = 'jpredLarge';
my $VERBOSE = 0;
my $DEBUG = 0;
my $help;
my $man;

GetOptions (
   'in=s'      => \$file,
   'title=s'   => \$title,
   'verbose!'  => \$VERBOSE,
   'debug!'    => \$DEBUG,
   'man'       => \$man,
   'help|?'    => \$help,
) or pod2usage();

pod2usage(-verbose => 2) if ($man);
pod2usage(-verbose => 1) if ($help);
pod2usage(-msg => 'Please supply a valid filename.') if (!$file or !-e $file);

print "Reading fasta file...\n";
my $total = split_fasta($file);
print "Created $total fasta files\n";

write_script($scriptName);


## set up Array Job and submit perl script
## Just use all the defaults.
print "Submitting $total Jpred jobs to the cluster as an array job...\n" if $VERBOSE;
my $sgeArray = Cluster::ArrayJob->new();
$sgeArray->taskRange("1-$total");
$sgeArray->queue('64bit-pri.q');
$sgeArray->setCWD();
$sgeArray->jobname($title);
$sgeArray->setJobShare(0);
$sgeArray->setPriority(-100);
$sgeArray->setResourceRequest( 'ram' => '6G' );
$sgeArray->setEnv( 'PERL5LIB' => '/homes/www-jpred/live/lib' );
$sgeArray->submit($scriptName);

print "Checking status of array job...\n" if $VERBOSE;
while (1) {
   my $status = $sgeArray->getStatus() or die "ERROR - unable to get SGE job status: ", $sgeArray->error();
   if ($status eq '-1') {
      print "Job has finished\n" if $VERBOSE;
      last;
   } elsif ($status eq '1') {
      die "ERROR - unable to get SGE job status: ", $sgeArray->error();
   } else {
      my $out = '';
      foreach my $k (sort keys %$status) {
         $out .= " $k:".$status->{$k};
      }
      print "Job status: $out\n" if $VERBOSE;
   }
   sleep(30);
}

## Finally, check that all jobs completed successfully.
my $jobID = $sgeArray->jobid();
my @files = glob "$title.e$jobID.*";

my $errors = 0;
## check that the correct number of SGE files exist - should be the same as the number of tasks
my $nFiles = scalar @files;
if ($nFiles != $total) {
   warn "Warning - found $nFiles SGE output files where $total expected\n";
   ++$errors;
}

## check that the right number of Jnet outputs were generated
my @jnets = glob "*.jnet";
my $nJnets = scalar @jnets;
if ($total != $nJnets) {
   warn "Warning - found $nJnets Jnet predictions where $total expected\n";
}
print "All Jpred searches completed!\n";
exit;


## write script with code from __DATA__ below
sub write_script {
   my $file = shift;
   
   open(my $OUT, ">$file") or die "ERROR - unable to open file '$file': $!\n";
   while (<DATA>) {
      print $OUT $_;
   }
   close($OUT);
   close(DATA);
}

## short function to split a Fasta file
## into one file per sequence
sub split_fasta {
   my $file = shift;
   
   open(my $FAS, "<", $file) or die "ERROR - unable to open '$file': ${!}\nDied";
   my $num = 0;
   my $OUT;
   while(<$FAS>) {
      ## check first line has appriate start
      if ($. == 1) {
         die "ERROR - this file does not appear to be a Fasta file\n" unless (/^>/);
      }
      ## for each new record open a new file and close the preceeding one
      if (/^>/) {
         ++$num;
         my $out = "$num.fasta";
         close($OUT) if ($OUT); # needed for first one; can't close if nothing's open
         open($OUT, ">", $out) or die "ERROR - unable to open '$out' for write: ${!}\nDied";
      }
      print $OUT $_;
      
   }
   close($OUT);
   return($num)
}

=head1 SYNOPSIS

run_large_data.pl --in <file> [--verbose] [--debug] [--man] [--help]

=head1 DESCRIPTION

Running a large set of sequences against Jpred can be a but of a pain as Jpred can be very memory intensive and can take a while.

Therefore, use of the cluster is best, but managing the jobs can be a bit fiddly. So, this script does it all for you!

Just provide a Fasta file with all the sequences you want to run and the script will submit each of them to the cluster separately, monitor their progress and report if there have been any failures. Job done!

=head1 OPTIONS

=over 5

=item B<--in>

Input file (fasta format).

=item B<--verbose|--no-verbose>

Toggle verbosity. [default:none]

=item B<--debug|--no-debug>

Toggle debugging output. [default:none]

=item B<--help>

Brief help.

=item B<--man>

Full manpage of program.

=back

=head1 AUTHOR

Chris Cole <christian@cole.name>

=cut

__DATA__
#!/usr/bin/perl

use strict;
use warnings;

my $pwd = $ENV{PWD};  # get current directory
my $dir = $ENV{TMPDIR};  # get SGE tmpdir
my $task = $ENV{'SGE_TASK_ID'};  # get SGE array task ID
die "ERROR - not in SGE array job context. Please submit as an array job.\n" if (!$task or $task eq 'undefined');

die "ERROR - file '$task.fasta' not found at '$pwd'. Check path.\n" unless (-e "$task.fasta");
if (-s "$task.jnet") {
   print "A Jnet prediction already exists for '$task.fasta'. Skipping...\n";
   exit();
}

my $cmd = "cd $dir && /homes/www-jpred/live/jpred/jpred --sequence $pwd/$task.fasta --output $task";
print "Running CMD: $cmd\n";
system($cmd) == 0 or die "ERROR - system() died\n";
if (-s "$dir/$task.jnet") {
   print "Jnet successful! Copying files from cluster to $pwd...";
   system("cp $dir/$task.* $pwd/") ==0 or die "ERROR - system() died\n";
   print "Done\n";
} else {
   #my $out = `ls -ltr $dir`;
   #print "ls -ltr:\n$out\n";
   print "No Jnet prediction found for '$task'. Something failed...\n"
}