JWS-117 Compiled all tools with ./compilebin.sh and some were missing related files.

[jabaws.git] / binaries / src / jpred / jpred.pl
diff --git a/binaries/src/jpred/jpred.pl b/binaries/src/jpred/jpred.pl

old mode 100755 (executable)

new mode 100644 (file)

index 085b515..bd23814
--- a/binaries/src/jpred/jpred.pl
+++ b/binaries/src/jpred/jpred.pl
@@ -6,55 +6,80 @@ jpred - Secondary structure prediction program
  
  =head1 SYNOPSIS
  
-  jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
+./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  
  =head1 DESCRIPTION
  
-This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence. 
-It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
-
-The program is primarily design for use by the Jpred server, but it can be used directly by any user. 
-Some of the output may not be meaningful if used directly - it can be safely ignored.
+This is a program for predicting the secondary structure of a multiple sequence alignment or a protein sequence. 
+The input file can be stored in 3 formats: FASTA, MSF, or BLC. 
+For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and 
+predicting the secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, 
+created from the alignment, is used in Jnet.
  
  =head1 OPTIONS
  
  =over 5
  
-=item -sequence path to input FASTA file
+=item -in <FILE1>
+
+The path to the sequence file (in FASTA, MSF, or BLC format)
+
+=item -output <FILEPREFIX>
+
+A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
+
+=item -outfile <FILE2>
+
+An output file, by default it is undefined and the -output option is working
  
-The path to the sequence (in FASTA format) you want to predict.
+=item -logfile <FILE3>
  
-=item -output file
+Logging file, by default it is undefined and logging switched off
  
-A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
+=item -dbpath PATH
  
-=item --db database
+Path to the uniref database used for PSI-BLAST querying. default value: .
  
-Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
+=item -dbname database
  
-Default: uniref90
+Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
+Default value: uniref90
  
  =item -psi path
  
  Path to a PSIBLAST output file.
  
-=item --pred-nohits
+=item -ncpu NNN
+
+Number of CPU used by jpred.pl. Maximum value is 8
+Default: NNN = 1
+
+=item -pred-nohits
  
  Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  
-=item --verbose
+=item -no-final
+
+By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
+The option untoggles this step and stops jpred at the concise file generation
+
+=item -jabaws
+
+Redefines some basic variables in order to use the script within JABAWS
+
+=item -verbose
  
  Verbose mode. Print more information while running jpred.
  
-=item --debug
+=item -debug
  
  Debug mode. Print debugging information while running jpred. 
  
-=item --help
+=item -help
  
  Gives help on the programs usage.
  
-=item --man
+=item -man
  
  Displays this man page.
  
@@ -66,7 +91,7 @@ Can't cope with gaps in the query sequence.
  
  Doesn't accept alignments.
  
-If you find any others please contact me.
+If you find any others please contact authors.
  
  =head1 AUTHORS
  
@@ -87,15 +112,13 @@ use Carp;
  use Data::Dumper;
  use File::Temp;
  
-#use UNIVERSAL;
-
  # path to Jpred modules
  use FindBin qw($Bin);
  use lib "$Bin/lib";
  
  # internal jpred modules
  use Jpred;
-use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
  use HMMER::Profile;
  use HMMER::Profile::Jnet;
  use PSIBLAST;
@@ -157,27 +180,40 @@ our $GAP = '-';
  
  #####################################################################################################
  my $infile;
-my $output;
+my $infastafile;
+my $format;
+my $goal = "align";
+my $seqgoal;
+my $outfile;
+my $prefix;
  my $psiblast;
-my $ncpu       = 1;           # number of CPUs for psiblast calculations
-my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
-my $db_path    = ".";
-my $db_entry   = "unknown";
+my $jabaws;
+my $logfile;
+my $ncpu       = 1;                              # number of CPUs for psiblast calculations
+my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
+my $db_path    = "/homes/www-jpred/databases";
+my $db_entry   = "cluster";
+my $nofinal;
  
  my ( $help, $man, $DEBUG, $VERBOSE );
  
  GetOptions(
-  "in|sequence=s" => \$infile,
-  "out|output=s"  => \$output,
-  "psi=s"         => \$psiblast,
-  "dbname=s"      => \$db_entry,
-  "dbpath=s"      => \$db_path,
-  "ncpu=s"        => \$ncpu,
-  "pred-nohits"   => \$predNoHits,
-  "help"          => \$help,
-  "man"           => \$man,
-  "debug"         => \$DEBUG,
-  "verbose"       => \$VERBOSE
+  "in=s"        => \$infile,
+  "output=s"    => \$prefix,
+  "outfile=s"   => \$outfile,
+  "logfile=s"   => \$logfile,
+  "psi=s"       => \$psiblast,
+  "dbname=s"    => \$db_entry,
+  "dbpath=s"    => \$db_path,
+  "ncpu=s"      => \$ncpu,
+  "pred-nohits" => \$predNoHits,
+  "no-final"    => \$nofinal,
+  "jabaws"      => \$jabaws,
+
+  "help"    => \$help,
+  "man"     => \$man,
+  "debug"   => \$DEBUG,
+  "verbose" => \$VERBOSE
  ) or pod2usage(2);
  pod2usage(1) if $help;
  pod2usage( verbose => 2 ) if $man;
@@ -200,231 +236,647 @@ my $database = {
  
    ## database used for ported Jpred
    ported_db => {
-    database   => $db_path . "/databases/uniref90.filt",
-    unfiltered => $db_path . "/databases/uniref90",
+    database   => $db_path . "/uniref90.filt",
+    unfiltered => $db_path . "/uniref90",
    },
+
    ## cluster-specific path for Jpred
    cluster => {
-    database   => "/homes/www-jpred/databases/uniref90.filt",
-    unfiltered => "/homes/www-jpred/databases/uniref90",
+    database   => $db_path . "/uniref90.filt",
+    unfiltered => $db_path . "/uniref90",
    },
  
    ## these other DBs are experimental ones used during development.
    ## check they exist before using them.
+  # generic entry for use with validate_jnet.pl
+  # real db location defined by validate_jnet.pl
    swall => {
-
-    # generic entry for use with validate_jnet.pl
-    # real db location defined by validate_jnet.pl
      database   => $db_path . "/swall/swall.filt",
      unfiltered => $db_path . "/swall/swall",
    },
  
+  # Path to PSIBLAST db
    uniprot => {
-
-    # Path to PSIBLAST db
      database   => $db_path . "/3/swall.filt",
      unfiltered => $db_path . "/3/swall",
    },
+
    uniref50 => {
      database   => $db_path . "/6/swall.filt",
      unfiltered => $db_path . "/6/swall",
    },
  };
  
-pod2usage(' --sequence argument not provided') unless $infile;
-die "--db $db_entry not recognised" unless exists $database->{$db_entry};
-$output = $infile . ".res" unless $output;
-$ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
+my $dpf = $database->{$db_entry}{database}.'.pal';
+my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
+pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
+pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
+pod2usage("ERROR! UNIREF filtered   database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
+pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
  
-for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
+#####################################################################################################
+# lots of preparation steps
+if ( defined $prefix ) {
+  unless ( defined $outfile ) {
+    $outfile = $prefix . ".res.fasta";
+  }
+} else {
+  if ( defined $outfile ) {
+    print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
+    $prefix = $outfile . ".tmp";
+  } else {
+    print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
+    $prefix  = $infile . ".res";
+    $outfile = $prefix . ".jnet";
+  }
+}
+
+if ( 1 > $ncpu or 8 < $ncpu ) {
+  print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
+  $ncpu = 1;
+}
+
+$VERBOSE = 1 if $DEBUG;
+
+my $LOG;
+if ( defined $logfile ) {
+  open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
+}
+
+for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
    my ( $key, $value ) = @{$_};
    defined $value or next;
-  errlog( join( ": ", $key, $value ), "\n" );
+  my $par = join( ": ", $key, $value );
+  print "$par\n" if $DEBUG;
+  print $LOG "$par\n" if $LOG;
  }
  
-#####################################################################################################
+if ( defined $jabaws ) {
+  setup_jpred_env($Bin);
+  setup_env($Bin);
+}
  my $platform = check_OS();
  print "JPRED: checking platiform... $platform\n" if $DEBUG;
+print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
  
  #####################################################################################################
-my $query = FASTA::File->new( read_file => $infile );
-
-my @seqs;
-my $psi = PSIBLAST->new;
-unless ( defined $psiblast && -e $psiblast ) {
-  ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
-  # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
-  # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
-  # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
-  my $psi_run = PSIBLAST::Run->new(
-    debug    => $DEBUG,
-    args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
-    path     => $psiblastbin,
-    input    => $infile,
-    output   => "$output.blast",
-    matrix   => "$output.profile",
-    database => $database->{$db_entry}{database},
-  );
-
-  # For reduced databases, get the size of the orginal and use this as
-  # the equivilent DB size
-  if ( $db_entry =~ /^sp_red_/ ) {
-    ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
-    $psi_run->args(
-      $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
-    );
+# check input file format
+my $nseq = check_FASTA_format($infile);
+if ( 0 < $nseq ) {
+  $format = "fasta";
+  if ( 1 == $nseq ) {
+    # one FASTA record
+    $goal = 'seq';
+  } else {
+    unless ( 0 < check_FASTA_alignment($infile)) {
+      die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n";
+    }
    }
-  print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
-  print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
-
-  print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
-  $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
-
-  $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
-  system("gzip -9f $output.blast");
+} elsif ( 0 < check_MSF_format($infile) ) {
+  $format = "msf";
+} elsif ( 0 < check_BLC_format($infile) ) {
+  $format = "blc";
  } else {
-  if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
-  else                         { $psi->read_file($psiblast) }                        # CC ditto above
+  die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
  }
+$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
  
  #####################################################################################################
-# Convert the last itteration into a collection of PSISEQ objects
-for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
-  my ( $id, $start, $align ) = @{$_};
-  push @seqs,
-    PSISEQ->new(
-    id    => $id,
-    start => $start,
-    align => [ split( //, $align ) ]
+if ( 'seq' eq $goal ) {
+  my $query = FASTA::File->new( read_file => $infile );
+  my @seqs;
+  my $psi = PSIBLAST->new;
+  unless ( defined $psiblast && -e $psiblast ) {
+    ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
+    # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
+    my $psi_run = PSIBLAST::Run->new(
+      debug    => $DEBUG,
+      args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
+      path     => $psiblastbin,
+      input    => $infile,
+      output   => "$prefix.blast",
+      matrix   => "$prefix.profile",
+      database => $database->{$db_entry}{database},
      );
-}
+
+    # For reduced databases, get the size of the orginal and use this as
+    # the equivilent DB size
+    if ( $db_entry =~ /^sp_red_/ ) {
+      ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
+      $psi_run->args(
+        $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
+      );
+    }
+    print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
+    print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
+    print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
+    print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
+
+    print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+    print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
+    $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
+
+    $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+    system("gzip -9f $prefix.blast");
+  } else {
+    if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
+    else                         { $psi->read_file($psiblast) }                        # CC ditto above
+  }
+
+#####################################################################################################
+  # Convert the last itteration into a collection of PSISEQ objects
+  for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
+    my ( $id, $start, $align ) = @{$_};
+    push @seqs,
+      PSISEQ->new(
+      id    => $id,
+      start => $start,
+      align => [ split( //, $align ) ]
+      );
+  }
  
  #####################################################################################################
-## When there are no PSI-BLAST hits generate an HMM from the query
-## and run Jnet against that only.
-## Accuracy won't be as good, but at least it's a prediction
-if ( @seqs == 0 ) {
-  if ( $predNoHits == 0 ) {
-    warn "\nJPRED: Warning! no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
-    print ">>100% complete\n";
-    exit;
+  # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
+  # Accuracy won't be as good, but at least it's a prediction
+  if ( @seqs == 0 ) {
+    if ( $predNoHits == 0 ) {
+      warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
+      print ">>100% complete\n";
+      print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
+      print $LOG ">>100% complete\n"                                                                         if $LOG;
+      close($LOG)                                                                                            if $LOG;
+      exit;
+    } else {
+      print ">>50% complete\n";
+      warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
+      print "Running HMMer on query...\n"                                                            if $VERBOSE;
+      print $LOG ">>50% complete\n"                                                                  if $LOG;
+      print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
+      print $LOG "Running HMMer on query...\n"                                                       if $LOG;
+
+      # copy input query to alignment
+      #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
+      open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
+      open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
+      while (<$ifh>) { print $ofh, $_ }
+      close $ifh;
+      close $ofh;
+
+      # Temp files required for HMMer
+      my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+      system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
+      system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+      unlink $hmmbuild_out;
+
+      # Read in the HMMER file
+      my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+      # Convert from HMMER::Profile to HMMER::Profile::Jnet
+      my $hmmer_jnet = HMMER::Profile::Jnet->new;
+      $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
+      $hmmer_jnet->write_file("$prefix.hmm");
+      print ">>70% complete\n";
+      print $LOG ">>70% complete\n" if $LOG;
+
+      # Run Jnet for the prediction
+      print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+      print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+      my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+      print ">>90% complete\n";
+      print $LOG ">>90% complete\n" if $LOG;
+      print $jnetlog                if $VERBOSE;
+      print $LOG $jnetlog           if $LOG;
+    }
    } else {
+    psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
+    print ">>40% complete\n";
+    print $LOG ">>40% complete\n" if $LOG;
+
+    #####################################################################################################
+    # Make PSIBLAST truncated alignments the right length
+    print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
+    print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
+    @seqs = extend( $query, @seqs );
+    psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove masking from PSIBLAST alignment
+    print "Unmasking the alignments...\n" if $VERBOSE;
+    print $LOG "Unmasking the alignments...\n" if $LOG;
+    my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
+    remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
+    psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Convert the sequences to upper case
+    print "Converting sequences to the same case...\n" if $VERBOSE;
+    print $LOG "Converting sequences to the same case...\n" if $LOG;
+    toupper($_) for @seqs;    # CC sub is below
+    psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
+
+    #####################################################################################################
+    # Remove excessive sequences
+    print "Remove excessive sequences...\n" if $VERBOSE;
+    print $LOG "Remove excessive sequences...\n" if $LOG;
+    @seqs = reduce( $MAX_ALIGN, @seqs );
+    psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove sequences that are too long or too short
+    print "Remove sequences which too long or short...\n" if $VERBOSE;
+    print $LOG "Remove sequences which too long or short...\n" if $LOG;
+    @seqs = del_long_seqs( 50, @seqs );
+    psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove redundant sequences based upon pairwise identity and OC clustering
+    print "Remove redundant sequences...\n" if $VERBOSE;
+    print $LOG "Remove redundant sequences...\n" if $LOG;
+    @seqs = nonred( $NR_CUT, @seqs );
+    psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Check that we haven't got rid of all of the sequences
+    if ( @seqs < 2 ) {
+      warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
+      print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
+      @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+    }
+    unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
+
+    # Remove gaps in the query sequence and same positions in the alignment
+    print "Removing gaps in the query sequence...\n" if $VERBOSE;
+    print $LOG "Removing gaps in the query sequence...\n" if $LOG;
+    degap(@seqs);
+    psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
+
+    # Output the alignment for the prediction
+    print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
+    print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
+    psiseq2fasta( "$prefix.align", @seqs );
+
+    #####################################################################################################
+    # Equivilent to getpssm script
+    print "Output the PSSM matrix from the PSI-BLAST profile...\n";
+    print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
+    my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
+    $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
      print ">>50% complete\n";
-    warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
-    print "Running HMMer on query...\n" if $VERBOSE;
-
-    # copy input query to alignment
-    #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
-    open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
-    open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
-    while (<$ifh>) { print $ofh, $_ }
-    close $ifh;
-    close $ofh;
-
-    # Temp files required for HMMer
-    my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
-
-    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
-    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
-
-    # Read in the HMMER file
-    my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
-
-    # Convert from HMMER::Profile to HMMER::Profile::Jnet
-    my $hmmer_jnet = HMMER::Profile::Jnet->new;
-    $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
-    $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
+    print $LOG ">>50% complete\n" if $LOG;
+
+    #####################################################################################################
+    # Run HMMER on the sequences
+    print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
+    my $hmmer = hmmer(@seqs);             # CC sub is below
+    $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
      print ">>70% complete\n";
-
+    print $LOG ">>70% complete\n" if $LOG;
+#####################################################################################################
      # Run Jnet for the prediction
-    print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
-    jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
+    print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
+    my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
+    print ">>90% complete\n";
+    print $LOG ">>90% complete\n" if $LOG;
+    print $jnetlog                if $VERBOSE;
+    print $LOG $jnetlog           if $LOG;
    }
  } else {
-  psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
-  print ">>40% complete\n";
-
-  #####################################################################################################
-  # Make PSIBLAST truncated alignments the right length
-  print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
-  @seqs = extend( $query, @seqs );                  # CC sub si below
-  psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove masking from PSIBLAST alignment
-  print "Unmasking the alignments...\n" if $VERBOSE;
-  my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
-  remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
-  psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Convert the sequences to upper case
-  print "Converting sequences to the same case...\n" if $VERBOSE;
-  toupper($_) for @seqs;    # CC sub is below
-  psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
-
-  #####################################################################################################
-  # Remove excessive sequences
-  print "Remove excessive sequences...\n" if $VERBOSE;
-  @seqs = reduce( $MAX_ALIGN, @seqs );
-  psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove sequences that are too long or too short
-  print "Remove sequences which too long or short...\n" if $VERBOSE;
-  @seqs = del_long_seqs( 50, @seqs );
-  psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove redundant sequences based upon pairwise identity and OC clustering
-  print "Remove redundant sequences...\n" if $VERBOSE;
-  @seqs = nonred( $NR_CUT, @seqs );
-  psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Check that we haven't got rid of all of the sequences
-  if ( @seqs < 2 ) {
-    warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
-    @seqs = fasta2psiseq("${output}_backup.fasta.gz");
-  }
-  unlink("${output}_backup.fasta.gz") unless $DEBUG;
-
-  # Remove gaps in the query sequence and same positions in the alignment
-  print "Removing gaps in the query sequence...\n" if $VERBOSE;
-  degap(@seqs);
-  psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
-
-  # Output the alignment for the prediction
-  print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
-  psiseq2fasta( "$output.align", @seqs );
-
-  #####################################################################################################
-  # Equivilent to getpssm script
-  print "Output the PSSM matrix from the PSI-BLAST profile...\n";
-  my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
-  $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
-  print ">>50% complete\n";
-
-  #####################################################################################################
-  # Run HMMER on the sequences
-  print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
-  my $hmmer = hmmer(@seqs);             # CC sub is below
-  $hmmer->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
+  if ( 'fasta' eq $format ) {
+    print "Read FASTA file\n";
+    my $hmmer1 = hmm_for_align($infile);
+    $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'msf' eq $format ) {
+    msf2fasta( $infile, $infastafile );
+    my $hmmer2 = hmm_for_align($infastafile);
+    $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'blc' eq $format ) {
+    my $tmpfile = File::Temp->new->filename;
+    blc2msf( $infile, $tmpfile );
+    msf2fasta( $infile, $infastafile );
+    my $hmmer3 = hmm_for_align($infastafile);
+    $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } else {
+    die "ERROR! unknown input format '$format'. exit...\n";
+  }
    print ">>70% complete\n";
-
-  #####################################################################################################
+  print $LOG ">>70% complete\n" if $LOG;
+#####################################################################################################
    # Run Jnet for the prediction
-  print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
-  jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
+  print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+  print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+  my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
+  print ">>90% complete\n";
+  print $LOG ">>90% complete\n" if $LOG;
+  print $jnetlog                if $VERBOSE;
+  print $LOG $jnetlog           if $LOG;
+}
+
+unless ( defined $nofinal ) {
+  my $aligfile      = $prefix . ".align";
+  my $jnetfile      = $prefix . ".jnet";
+  my $jnetfastafile = $prefix . ".jnet.fasta";
+  $aligfile = $infile if ( 'fasta' eq $format );
+  $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
+  concise2fasta( $jnetfile, $jnetfastafile );
+  open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
+  open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
+  open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  while (<$IN2>) { print $OUT $_; }
+  while (<$IN1>) { print $OUT $_; }
+  close($IN1);
+  close($IN2);
+  close($OUT);
+  unlink $jnetfastafile;
+  my $seqs = FASTA::File->new( read_file => $outfile );
+  open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
+  $seqs->write($OUT2);
+  close($OUT2);
+} else {
+  if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
+    rename $prefix . ".jnet", $outfile;
+  }
  }
  
  print ">>100% complete\n";
  print "Jpred Finished\n";
+print $LOG ">>100% complete\n" if $LOG;
+print $LOG "Jpred Finished\n"  if $LOG;
+close($LOG)                    if $LOG;
  exit;
  
  #####################################################################################################
  # Functions
  #####################################################################################################
+sub check_FASTA_format {
+  my $infile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  my $check_first_line = 1;
+  my $nseq             = 0;
+  local $/ = "\n>";
+  while (<$IN>) {
+    if ($check_first_line) {
+      return 0 unless (/^>/);
+      $check_first_line = 0;
+    }
+    s/^>//g;
+    s/>$//g;
+
+    my ( $id, @seqs ) = split /\n/, $_;
+    return 0 unless ( defined $id or @seqs );
+    my $seq = join( "", @seqs );
+    return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+    ++$nseq;
+  }
+  close($IN);
+
+  return $nseq;
+}
+#####################################################################################################
+sub check_FASTA_alignment {
+  my $infile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  my $check_first_line = 1;
+  my $nseq             = 0;
+  my $seqlen = -1;
+  local $/ = "\n>";
+  while (<$IN>) {
+    if ($check_first_line) {
+      return 0 unless (/^>/);
+      $check_first_line = 0;
+    }
+    s/^>//g;
+    s/>$//g;
+
+    my ( $id, @seqs ) = split /\n/, $_;
+    return 0 unless ( defined $id or @seqs );
+    my $seq = join( "", @seqs );
+    return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+    if (-1 == $seqlen) {
+      $seqlen = length ($seq);
+    } else {
+      return 0 if ($seqlen != length ($seq) );
+    }
+    ++$nseq;
+  }
+  close($IN);
+
+  return $nseq;
+}
+#####################################################################################################
+sub check_MSF_format {
+  my $infile = shift;
+  $? = 0;
+  system("$readseq -fMSF -a -p < $infile > /dev/null");
+  return 0 if ($?);
+
+  return 1;
+}
+#####################################################################################################
+sub check_BLC_format {
+  my $infile = shift;
+
+  my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
+  open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmpfile2\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmpfile1");
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
+  unlink $tmpfile1;
+  unlink $tmpfile2;
+  return 0 if ($?);
+
+  return 1;
+}
+############################################################################################################################################
+# convertor BLC -> MSF
+sub msf2fasta {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  my $sequence = "";
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  while (<$IN>) {
+    $sequence .= $_;
+  }
+  close($IN);
+
+  $sequence =~ s/~/\./g;
+
+  ## check for non-unique sequence names - readseq has problems with these
+  my @lines = split /\n/, $sequence;
+  my %names;
+  foreach my $line (@lines) {
+    if ( $line =~ /Name:\s+(\S+)/ ) {
+      die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
+      $names{$1}++;
+    }
+  }
+
+  my $tmpfile1 = File::Temp->new->filename;
+  my $tmpfile2 = File::Temp->new->filename;
+  open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
+  print $FILE $sequence;
+  close $FILE;
+
+  $? = 0;
+  system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile1;
+
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile2;
+
+  die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
+}
+
+############################################################################################################################################
+# convertor BLC -> MSF
+sub blc2msf {
+  my ( $in, $out ) = @_;
+
+  my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
+
+  open my $fh, ">$tmp" or die "$tmp: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmp_pir\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmp");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  system("$readseq -f=MSF -o=$out -a $tmp_pir");
+
+  unlink $tmp;
+  unlink $tmp_pir;
+}
+
+#####################################################################################################
+
+=begin :private
+
+=head2 fasta2concise ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
+
+=cut
+
+sub fasta2concise {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
+  my ( $seq, @seqs, @title );
+  while (<$IN>) {
+    if (s/^>//) {
+      if ($seq) {
+        $seq =~ s/\n|\s//g;
+        $seq =~ s/(.)/$1,/g;
+        push @seqs, $seq;
+        $seq = "";
+      }
+      chomp;
+      push @title, $_;
+    } else {
+      chomp;
+      $seq .= $_;
+    }
+  }
+  $seq =~ s/\n|\s//g;
+  $seq =~ s/(.)/$1,/g;
+  push @seqs, $seq;
+  close($IN);
+
+  if ( @title != @seqs ) {
+    die("non matching number of titles and sequences!\n");
+  }
+
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  foreach ( 0 .. $#title ) {
+    print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
+  }
+  close($OUT);
+}
+
+=begin :private
+
+=head2 concise2fasta ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
+
+=cut
+
+#####################################################################################################
+sub concise2fasta {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  my ( @seqs, %seq, @pred, %pred );
+
+  my @var = (
+    "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
+    "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
+  );
+
+  # parse input concise file
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
+  while (<$IN>) {
+    unless (/^\n/) {
+      my ( $id, $seq ) = split( ":", $_ );
+      if ( defined $id and defined $seq ) {    # Check we have proper values
+        $seq =~ s/,//g;
+        chomp $seq;
+        if ( $id =~ /;/ ) {                    # Then it's an alignment
+          @_ = split( ";", $id );
+          push @seqs, $_[1];
+          $seq{ $_[1] } = $seq;
+        }
+        foreach my $v (@var) {
+          if ( $v eq $id ) {
+            push @pred, $v;
+            $pred{$v} = $seq;
+          }
+        }
+      }
+    }
+  }
+  close($IN);
+
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+
+  # print out sequences found with PSI-BLAST
+  foreach (@seqs) {
+    $seq{$_} =~ s/(.{72})/$1\n/g;
+    print $OUT ">$_\n$seq{$_}\n";
+  }
+
+  # print out predictions
+  foreach my $p (@pred) {
+    $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
+    $pred{$p} =~ s/B/E/g;
+    $pred{$p} =~ s/G/H/g;
+
+    if ( $p =~ /SOL/ ) {
+      $pred{$p} =~ s/E/B/g;
+    }
+    $pred{$p} =~ s/(.{72})/$1\n/g;
+    print $OUT ">$p\n$pred{$p}\n";
+  }
+  close($OUT);
+}
  
  =begin :private
  
@@ -716,6 +1168,41 @@ Runs Jnet. Pass the paths of the filenames to do the prediction on
  
  =cut
  
+#################################################################################################################################################
+
+=head2 get_hmm($alignFile, $name)
+
+Adapted from the hmmer() function in Jpred as written by JDB.
+
+Generates an HMMer profile output compatible with Jnet.
+
+Equivilent to gethmm script.
+
+=cut
+
+sub hmm_for_align {
+  my $fastafile = shift;
+
+  # Temp files required
+  print "hmm_for_align: fastafile = $fastafile\n";
+  my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+
+  system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
+  system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+  # Read in the HMMER file
+  my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Read in the HMMER file
+  my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Convert from HMMER::Profile to HMMER::Profile::Jnet
+  my $hmmer_jnet = HMMER::Profile::Jnet->new;
+  $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
+
+  return $hmmer_jnet;
+}
+
  #####################################################################################################
  sub jnet {
    my ( $hmmer, $outfile, $pssm ) = @_;
@@ -729,6 +1216,21 @@ sub jnet {
      system("$jnet --concise --hmmer $hmmer > $outfile");
    }
    check( "jnet", $? ) or exit 1;
+  my $res     = "";
+  my $logging = "";
+  open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  while (<$IN>) {
+    if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
+      $res .= $_;
+    } else {
+      $logging .= $_;
+    }
+  }
+  close $IN;
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  print $OUT "\n" . $res;
+  close $OUT;
+  return $logging;
  }
  
  =head1 psiseq2fasta($path, @PSISEQ)
@@ -839,6 +1341,8 @@ sub fasta2psiseq_lowmemory {
    return @seqs;
  }
  
+#####################################################################################################
+
  =head1 extend($FASTA::File, @PSISEQ)
  
  Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues 
@@ -847,7 +1351,6 @@ present in the alignment. The other sequences are extended with gap characters.
  
  =cut
  
-#####################################################################################################
  sub extend {
    my ( $query, @seqs ) = @_;
  
@@ -898,13 +1401,14 @@ sub extend {
    return @seqs;
  }
  
+#####################################################################################################
+
  =head1 $int = fasta_seq_length($path)
  
  Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
  
  =cut
  
-#####################################################################################################
  sub fasta_seq_length {
    my ($fn) = @_;
    open my $fh, $fn or croak "Can't open file $fn: $!\n";
@@ -919,12 +1423,3 @@ sub fasta_seq_length {
  
    return $length;
  }
-
-=head1 log(@messages)
-
-Report messages to STDERR
-
-=cut
-
-#####################################################################################################
-sub errlog { print STDERR @_ }