Add SS prediction for multiple aligment into jpred.pl. Before this was a part of...

[jabaws.git] / binaries / src / jpred / jpred.pl
diff --git a/binaries/src/jpred/jpred.pl b/binaries/src/jpred/jpred.pl

index 74206dd..f5910fc 100755 (executable)
--- a/binaries/src/jpred/jpred.pl
+++ b/binaries/src/jpred/jpred.pl
@@ -6,23 +6,26 @@ jpred - Secondary structure prediction program
  
  =head1 SYNOPSIS
  
-./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
+./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  
  =head1 DESCRIPTION
  
-This is a program which predicts the secondary structure of a protein sequence given a path to a 
-FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
-
-The program is primarily design for use by the Jpred server, but it can be used directly by any user. 
-Some of the output may not be meaningful if used directly - it can be safely ignored.
+This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence 
+(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC. 
+For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the 
+secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
  
  =head1 OPTIONS
  
  =over 5
  
-=item -in/-sequence <FILE1>
+=item -in <FILE1>
+
+The path to the sequence file (in FASTA, MSF, or BLC format)
  
-The path to the sequence file (in FASTA format) you want to predict.
+=item -seq
+
+The input file is a FASTA file with one sequence only.
  
  =item -output <FILEPREFIX>
  
@@ -112,15 +115,13 @@ use Carp;
  use Data::Dumper;
  use File::Temp;
  
-#use UNIVERSAL;
-
  # path to Jpred modules
  use FindBin qw($Bin);
  use lib "$Bin/lib";
  
  # internal jpred modules
  use Jpred;
-use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
  use HMMER::Profile;
  use HMMER::Profile::Jnet;
  use PSIBLAST;
@@ -182,13 +183,17 @@ our $GAP = '-';
  
  #####################################################################################################
  my $infile;
+my $infastafile;
+my $format;
+my $goal = "align";
+my $seqgoal;
  my $outfile;
  my $prefix;
  my $psiblast;
  my $jabaws;
  my $logfile;
-my $ncpu       = 1;           # number of CPUs for psiblast calculations
-my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
+my $ncpu       = 1;                              # number of CPUs for psiblast calculations
+my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
  my $db_path    = "/homes/www-jpred/databases";
  my $db_entry   = "cluster";
  my $nofinal;
@@ -196,30 +201,34 @@ my $nofinal;
  my ( $help, $man, $DEBUG, $VERBOSE );
  
  GetOptions(
-  "in|sequence=s" => \$infile,
-  "output=s"      => \$prefix,
-  "outfile=s"     => \$outfile,
-  "logfile=s"     => \$logfile,
-  "psi=s"         => \$psiblast,
-  "dbname=s"      => \$db_entry,
-  "dbpath=s"      => \$db_path,
-  "ncpu=s"        => \$ncpu,
-  "pred-nohits"   => \$predNoHits,
-  "no-final"      => \$nofinal,
-  "jabaws"        => \$jabaws,
-  "help"          => \$help,
-  "man"           => \$man,
-  "debug"         => \$DEBUG,
-  "verbose"       => \$VERBOSE
+  "in=s"        => \$infile,
+  "output=s"    => \$prefix,
+  "outfile=s"   => \$outfile,
+  "logfile=s"   => \$logfile,
+  "psi=s"       => \$psiblast,
+  "dbname=s"    => \$db_entry,
+  "dbpath=s"    => \$db_path,
+  "ncpu=s"      => \$ncpu,
+  "pred-nohits" => \$predNoHits,
+  "no-final"    => \$nofinal,
+  "seq"         => \$seqgoal,
+  "jabaws"      => \$jabaws,
+
+  "help"    => \$help,
+  "man"     => \$man,
+  "debug"   => \$DEBUG,
+  "verbose" => \$VERBOSE
  ) or pod2usage(2);
  pod2usage(1) if $help;
  pod2usage( verbose => 2 ) if $man;
  
+$goal = "seq" if ( defined $seqgoal );
+
  #####################################################################################################
  # Key to database information and information for accessing them
-if (defined $jabaws) {
-  $db_path    = "/data/UNIREFdb";
-  $db_entry   = "cluster";
+if ( defined $jabaws ) {
+  $db_path  = "/data/UNIREFdb";
+  $db_entry = "cluster";
  }
  
  my $database = {
@@ -270,22 +279,24 @@ my $database = {
  };
  
  my $dp = $database->{$db_entry}{database};
-pod2usage("ERROR! Input file with a sequence should be provided with -sequence/-in")                              unless $infile;
-pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database" )           unless exists $database->{$db_entry};
+pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
+pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
  pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
  
+#####################################################################################################
+# lots of preparation steps
  if ( defined $prefix ) {
-  unless (defined $outfile) {
-    $outfile = $prefix. ".res.fasta";
+  unless ( defined $outfile ) {
+    $outfile = $prefix . ".res.fasta";
    }
  } else {
-  if (defined $outfile) {
+  if ( defined $outfile ) {
      print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
      $prefix = $outfile . ".tmp";
    } else {
      print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
-    $prefix = $infile . ".res";
-    $outfile = $prefix. ".jnet";
+    $prefix  = $infile . ".res";
+    $outfile = $prefix . ".jnet";
    }
  }
  
@@ -297,7 +308,7 @@ if ( 1 > $ncpu or 8 < $ncpu ) {
  $VERBOSE = 1 if $DEBUG;
  
  my $LOG;
-if (defined $logfile) {
+if ( defined $logfile ) {
    open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
  }
  
@@ -309,8 +320,7 @@ for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefi
    print $LOG "$par\n" if $LOG;
  }
  
-#####################################################################################################
-if (defined $jabaws) {
+if ( defined $jabaws ) {
    setup_jpred_env($Bin);
    setup_env($Bin);
  }
@@ -319,215 +329,269 @@ print "JPRED: checking platiform... $platform\n" if $DEBUG;
  print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
  
  #####################################################################################################
-my $query = FASTA::File->new( read_file => $infile );
-
-my @seqs;
-my $psi = PSIBLAST->new;
-unless ( defined $psiblast && -e $psiblast ) {
-  ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
-  # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
-  # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
-  # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
-  my $psi_run = PSIBLAST::Run->new(
-    debug    => $DEBUG,
-    args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
-    path     => $psiblastbin,
-    input    => $infile,
-    output   => "$prefix.blast",
-    matrix   => "$prefix.profile",
-    database => $database->{$db_entry}{database},
-  );
-
-  # For reduced databases, get the size of the orginal and use this as
-  # the equivilent DB size
-  if ( $db_entry =~ /^sp_red_/ ) {
-    ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
-    $psi_run->args(
-      $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
-    );
+# check input file format
+if ( 'seq' eq $goal ) {
+  $format = "seq";
+  if ( 1 != check_FASTA_format($infile) ) {
+    die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
    }
-  print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
-  print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
-  print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
-  print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
-
-  print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
-  print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
-  $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
-
-  $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
-  system("gzip -9f $prefix.blast");
  } else {
-  if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
-  else                         { $psi->read_file($psiblast) }                        # CC ditto above
+  my $nseq = check_FASTA_format($infile);
+  if ( 0 < $nseq ) {
+    $format = "fasta";
+    if ( 1 == $nseq ) {
+      die "\nERROR! jpred requires alignment with more than 1 sequence\n       if you provide only one sequence use the -seq option.\n";
+    }
+  } elsif ( 0 < check_MSF_format($infile) ) {
+    $format = "msf";
+  } elsif ( 0 < check_BLC_format($infile) ) {
+    $format = "blc";
+  } else {
+    die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
+  }
  }
+$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
  
  #####################################################################################################
-# Convert the last itteration into a collection of PSISEQ objects
-for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
-  my ( $id, $start, $align ) = @{$_};
-  push @seqs,
-    PSISEQ->new(
-    id    => $id,
-    start => $start,
-    align => [ split( //, $align ) ]
+if ( 'seq' eq $goal ) {
+  my $query = FASTA::File->new( read_file => $infile );
+  my @seqs;
+  my $psi = PSIBLAST->new;
+  unless ( defined $psiblast && -e $psiblast ) {
+    ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
+    # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
+    my $psi_run = PSIBLAST::Run->new(
+      debug    => $DEBUG,
+      args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
+      path     => $psiblastbin,
+      input    => $infile,
+      output   => "$prefix.blast",
+      matrix   => "$prefix.profile",
+      database => $database->{$db_entry}{database},
      );
-}
+
+    # For reduced databases, get the size of the orginal and use this as
+    # the equivilent DB size
+    if ( $db_entry =~ /^sp_red_/ ) {
+      ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
+      $psi_run->args(
+        $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
+      );
+    }
+    print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
+    print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
+    print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
+    print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
+
+    print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+    print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
+    $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
+
+    $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+    system("gzip -9f $prefix.blast");
+  } else {
+    if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
+    else                         { $psi->read_file($psiblast) }                        # CC ditto above
+  }
+
+#####################################################################################################
+  # Convert the last itteration into a collection of PSISEQ objects
+  for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
+    my ( $id, $start, $align ) = @{$_};
+    push @seqs,
+      PSISEQ->new(
+      id    => $id,
+      start => $start,
+      align => [ split( //, $align ) ]
+      );
+  }
  
  #####################################################################################################
-## When there are no PSI-BLAST hits generate an HMM from the query
-## and run Jnet against that only.
-## Accuracy won't be as good, but at least it's a prediction
-if ( @seqs == 0 ) {
-  if ( $predNoHits == 0 ) {
-    warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
-    print ">>100% complete\n";
-    print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
-    print $LOG ">>100% complete\n" if $LOG;
-    close ($LOG) if $LOG;
-    exit;
+  # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
+  # Accuracy won't be as good, but at least it's a prediction
+  if ( @seqs == 0 ) {
+    if ( $predNoHits == 0 ) {
+      warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
+      print ">>100% complete\n";
+      print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
+      print $LOG ">>100% complete\n"                                                                         if $LOG;
+      close($LOG)                                                                                            if $LOG;
+      exit;
+    } else {
+      print ">>50% complete\n";
+      warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
+      print "Running HMMer on query...\n"                                                            if $VERBOSE;
+      print $LOG ">>50% complete\n"                                                                  if $LOG;
+      print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
+      print $LOG "Running HMMer on query...\n"                                                       if $LOG;
+
+      # copy input query to alignment
+      #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
+      open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
+      open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
+      while (<$ifh>) { print $ofh, $_ }
+      close $ifh;
+      close $ofh;
+
+      # Temp files required for HMMer
+      my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+      system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
+      system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+      unlink $hmmbuild_out;
+
+      # Read in the HMMER file
+      my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+      # Convert from HMMER::Profile to HMMER::Profile::Jnet
+      my $hmmer_jnet = HMMER::Profile::Jnet->new;
+      $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
+      $hmmer_jnet->write_file("$prefix.hmm");
+      print ">>70% complete\n";
+      print $LOG ">>70% complete\n" if $LOG;
+
+      # Run Jnet for the prediction
+      print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+      print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+      my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+      print ">>90% complete\n";
+      print $LOG ">>90% complete\n" if $LOG;
+      print $jnetlog                if $VERBOSE;
+      print $LOG $jnetlog           if $LOG;
+    }
    } else {
+    psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
+    print ">>40% complete\n";
+    print $LOG ">>40% complete\n" if $LOG;
+
+    #####################################################################################################
+    # Make PSIBLAST truncated alignments the right length
+    print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
+    print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
+    @seqs = extend( $query, @seqs );
+    psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove masking from PSIBLAST alignment
+    print "Unmasking the alignments...\n" if $VERBOSE;
+    print $LOG "Unmasking the alignments...\n" if $LOG;
+    my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
+    remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
+    psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Convert the sequences to upper case
+    print "Converting sequences to the same case...\n" if $VERBOSE;
+    print $LOG "Converting sequences to the same case...\n" if $LOG;
+    toupper($_) for @seqs;    # CC sub is below
+    psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
+
+    #####################################################################################################
+    # Remove excessive sequences
+    print "Remove excessive sequences...\n" if $VERBOSE;
+    print $LOG "Remove excessive sequences...\n" if $LOG;
+    @seqs = reduce( $MAX_ALIGN, @seqs );
+    psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove sequences that are too long or too short
+    print "Remove sequences which too long or short...\n" if $VERBOSE;
+    print $LOG "Remove sequences which too long or short...\n" if $LOG;
+    @seqs = del_long_seqs( 50, @seqs );
+    psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove redundant sequences based upon pairwise identity and OC clustering
+    print "Remove redundant sequences...\n" if $VERBOSE;
+    print $LOG "Remove redundant sequences...\n" if $LOG;
+    @seqs = nonred( $NR_CUT, @seqs );
+    psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Check that we haven't got rid of all of the sequences
+    if ( @seqs < 2 ) {
+      warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
+      print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
+      @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+    }
+    unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
+
+    # Remove gaps in the query sequence and same positions in the alignment
+    print "Removing gaps in the query sequence...\n" if $VERBOSE;
+    print $LOG "Removing gaps in the query sequence...\n" if $LOG;
+    degap(@seqs);
+    psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
+
+    # Output the alignment for the prediction
+    print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
+    print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
+    psiseq2fasta( "$prefix.align", @seqs );
+
+    #####################################################################################################
+    # Equivilent to getpssm script
+    print "Output the PSSM matrix from the PSI-BLAST profile...\n";
+    print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
+    my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
+    $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
      print ">>50% complete\n";
-    warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
-    print "Running HMMer on query...\n" if $VERBOSE;
      print $LOG ">>50% complete\n" if $LOG;
-    print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
-    print $LOG "Running HMMer on query...\n" if $LOG;
-
-    # copy input query to alignment
-    #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
-    open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
-    open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
-    while (<$ifh>) { print $ofh, $_ }
-    close $ifh;
-    close $ofh;
-
-    # Temp files required for HMMer
-    my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
-
-    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
-    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
-
-    # Read in the HMMER file
-    my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
-
-    # Convert from HMMER::Profile to HMMER::Profile::Jnet
-    my $hmmer_jnet = HMMER::Profile::Jnet->new;
-    $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
-    $hmmer_jnet->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
+
+    #####################################################################################################
+    # Run HMMER on the sequences
+    print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
+    my $hmmer = hmmer(@seqs);             # CC sub is below
+    $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
      print ">>70% complete\n";
      print $LOG ">>70% complete\n" if $LOG;
-
+#####################################################################################################
      # Run Jnet for the prediction
-    print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
-    print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
-    my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+    print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
+    my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
      print ">>90% complete\n";
      print $LOG ">>90% complete\n" if $LOG;
-    print $jnetlog if $VERBOSE;
-    print $LOG $jnetlog if $LOG;
+    print $jnetlog                if $VERBOSE;
+    print $LOG $jnetlog           if $LOG;
    }
  } else {
-  psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
-  print ">>40% complete\n";
-  print $LOG ">>40% complete\n" if $LOG;
-
-  #####################################################################################################
-  # Make PSIBLAST truncated alignments the right length
-  print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
-  print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
-  @seqs = extend( $query, @seqs );
-  psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove masking from PSIBLAST alignment
-  print "Unmasking the alignments...\n" if $VERBOSE;
-  print $LOG "Unmasking the alignments...\n" if $LOG;
-  my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
-  remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
-  psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Convert the sequences to upper case
-  print "Converting sequences to the same case...\n" if $VERBOSE;
-  print $LOG "Converting sequences to the same case...\n" if $LOG;
-  toupper($_) for @seqs;    # CC sub is below
-  psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
-
-  #####################################################################################################
-  # Remove excessive sequences
-  print "Remove excessive sequences...\n" if $VERBOSE;
-  print $LOG "Remove excessive sequences...\n" if $LOG;
-  @seqs = reduce( $MAX_ALIGN, @seqs );
-  psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove sequences that are too long or too short
-  print "Remove sequences which too long or short...\n" if $VERBOSE;
-  print $LOG "Remove sequences which too long or short...\n" if $LOG;
-  @seqs = del_long_seqs( 50, @seqs );
-  psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Remove redundant sequences based upon pairwise identity and OC clustering
-  print "Remove redundant sequences...\n" if $VERBOSE;
-  print $LOG "Remove redundant sequences...\n" if $LOG;
-  @seqs = nonred( $NR_CUT, @seqs );
-  psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
-
-  #####################################################################################################
-  # Check that we haven't got rid of all of the sequences
-  if ( @seqs < 2 ) {
-    warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
-    print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
-    @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+  if ( 'fasta' eq $format ) {
+    print "Read FASTA file\n";
+    my $hmmer1 = hmm_for_align($infile);
+    $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'msf' eq $format ) {
+    msf2fasta( $infile, $infastafile );
+    my $hmmer2 = hmm_for_align($infastafile);
+    $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'blc' eq $format ) {
+    my $tmpfile = File::Temp->new->filename;
+    blc2msf( $infile, $tmpfile );
+    msf2fasta( $infile, $infastafile );
+    my $hmmer3 = hmm_for_align($infastafile);
+    $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } else {
+    die "ERROR! unknown input format '$format'. exit...\n";
    }
-  unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
-
-  # Remove gaps in the query sequence and same positions in the alignment
-  print "Removing gaps in the query sequence...\n" if $VERBOSE;
-  print $LOG "Removing gaps in the query sequence...\n" if $LOG;
-  degap(@seqs);
-  psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
-
-  # Output the alignment for the prediction
-  print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
-  print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
-  psiseq2fasta( "$prefix.align", @seqs );
-
-  #####################################################################################################
-  # Equivilent to getpssm script
-  print "Output the PSSM matrix from the PSI-BLAST profile...\n";
-  print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
-  my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
-  $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
-  print ">>50% complete\n";
-  print $LOG ">>50% complete\n" if $LOG;
-
-  #####################################################################################################
-  # Run HMMER on the sequences
-  print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
-  print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
-  my $hmmer = hmmer(@seqs);             # CC sub is below
-  $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
    print ">>70% complete\n";
    print $LOG ">>70% complete\n" if $LOG;
-
-  #####################################################################################################
+#####################################################################################################
    # Run Jnet for the prediction
-  print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
-  print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
-  my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
+  print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+  print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+  my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
    print ">>90% complete\n";
    print $LOG ">>90% complete\n" if $LOG;
-  print $jnetlog if $VERBOSE;
-  print $LOG $jnetlog if $LOG;
+  print $jnetlog                if $VERBOSE;
+  print $LOG $jnetlog           if $LOG;
  }
  
  unless ( defined $nofinal ) {
    my $aligfile      = $prefix . ".align";
    my $jnetfile      = $prefix . ".jnet";
    my $jnetfastafile = $prefix . ".jnet.fasta";
+  $aligfile = $infile if ( 'fasta' eq $format );
+  $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
    concise2fasta( $jnetfile, $jnetfastafile );
    open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
    open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
@@ -538,22 +602,146 @@ unless ( defined $nofinal ) {
    close($IN2);
    close($OUT);
    unlink $jnetfastafile;
+  my $seqs = FASTA::File->new( read_file => $outfile );
+  open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
+  $seqs->write($OUT2);
+  close($OUT2);
  } else {
-  if (defined $outfile and $prefix.".jnet" ne $outfile) {
-    rename $prefix.".jnet", $outfile;
+  if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
+    rename $prefix . ".jnet", $outfile;
    }
  }
  
  print ">>100% complete\n";
  print "Jpred Finished\n";
  print $LOG ">>100% complete\n" if $LOG;
-print $LOG "Jpred Finished\n" if $LOG;
-close ($LOG) if $LOG;
+print $LOG "Jpred Finished\n"  if $LOG;
+close($LOG)                    if $LOG;
  exit;
  
  #####################################################################################################
  # Functions
  #####################################################################################################
+sub check_FASTA_format {
+  my $infile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  my $check_first_line = 1;
+  my $nseq             = 0;
+  local $/ = "\n>";
+  while (<$IN>) {
+    if ($check_first_line) {
+      return 0 unless (/^>/);
+      $check_first_line = 0;
+    }
+    s/^>//g;
+    s/>$//g;
+
+    my ( $id, @seqs ) = split /\n/, $_;
+    return 0 unless ( defined $id or @seqs );
+    my $seq = join( "", @seqs );
+    return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+    ++$nseq;
+  }
+  close($IN);
+
+  return $nseq;
+}
+#####################################################################################################
+sub check_MSF_format {
+  my $infile = shift;
+  $? = 0;
+  system("$readseq -fMSF -a -p < $infile > /dev/null");
+  return 0 if ($?);
+
+  return 1;
+}
+#####################################################################################################
+sub check_BLC_format {
+  my $infile = shift;
+
+  my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
+  open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmpfile2\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmpfile1");
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
+  unlink $tmpfile1;
+  unlink $tmpfile2;
+  return 0 if ($?);
+
+  return 1;
+}
+############################################################################################################################################
+# convertor BLC -> MSF
+sub msf2fasta {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  my $sequence = "";
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  while (<$IN>) {
+    $sequence .= $_;
+  }
+  close($IN);
+
+  $sequence =~ s/~/\./g;
+
+  ## check for non-unique sequence names - readseq has problems with these
+  my @lines = split /\n/, $sequence;
+  my %names;
+  foreach my $line (@lines) {
+    if ( $line =~ /Name:\s+(\S+)/ ) {
+      die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
+      $names{$1}++;
+    }
+  }
+
+  my $tmpfile1 = File::Temp->new->filename;
+  my $tmpfile2 = File::Temp->new->filename;
+  open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
+  print $FILE $sequence;
+  close $FILE;
+
+  $? = 0;
+  system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile1;
+
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile2;
+
+  die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
+}
+
+############################################################################################################################################
+# convertor BLC -> MSF
+sub blc2msf {
+  my ( $in, $out ) = @_;
+
+  my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
+
+  open my $fh, ">$tmp" or die "$tmp: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmp_pir\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmp");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  system("$readseq -f=MSF -o=$out -a $tmp_pir");
+
+  unlink $tmp;
+  unlink $tmp_pir;
+}
  
  #####################################################################################################
  
@@ -959,6 +1147,41 @@ Runs Jnet. Pass the paths of the filenames to do the prediction on
  
  =cut
  
+#################################################################################################################################################
+
+=head2 get_hmm($alignFile, $name)
+
+Adapted from the hmmer() function in Jpred as written by JDB.
+
+Generates an HMMer profile output compatible with Jnet.
+
+Equivilent to gethmm script.
+
+=cut
+
+sub hmm_for_align {
+  my $fastafile = shift;
+
+  # Temp files required
+  print "hmm_for_align: fastafile = $fastafile\n";
+  my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+
+  system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
+  system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+  # Read in the HMMER file
+  my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Read in the HMMER file
+  my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Convert from HMMER::Profile to HMMER::Profile::Jnet
+  my $hmmer_jnet = HMMER::Profile::Jnet->new;
+  $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
+
+  return $hmmer_jnet;
+}
+
  #####################################################################################################
  sub jnet {
    my ( $hmmer, $outfile, $pssm ) = @_;
@@ -972,7 +1195,7 @@ sub jnet {
      system("$jnet --concise --hmmer $hmmer > $outfile");
    }
    check( "jnet", $? ) or exit 1;
-  my $res = "";
+  my $res     = "";
    my $logging = "";
    open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
    while (<$IN>) {
@@ -984,7 +1207,7 @@ sub jnet {
    }
    close $IN;
    open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
-  print $OUT "\n".$res;
+  print $OUT "\n" . $res;
    close $OUT;
    return $logging;
  }
@@ -1097,6 +1320,8 @@ sub fasta2psiseq_lowmemory {
    return @seqs;
  }
  
+#####################################################################################################
+
  =head1 extend($FASTA::File, @PSISEQ)
  
  Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues 
@@ -1105,7 +1330,6 @@ present in the alignment. The other sequences are extended with gap characters.
  
  =cut
  
-#####################################################################################################
  sub extend {
    my ( $query, @seqs ) = @_;
  
@@ -1156,13 +1380,14 @@ sub extend {
    return @seqs;
  }
  
+#####################################################################################################
+
  =head1 $int = fasta_seq_length($path)
  
  Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
  
  =cut
  
-#####################################################################################################
  sub fasta_seq_length {
    my ($fn) = @_;
    open my $fh, $fn or croak "Can't open file $fn: $!\n";
@@ -1177,9 +1402,3 @@ sub fasta_seq_length {
  
    return $length;
  }
-
-=head1 log(@messages)
-
-Report messages to STDERR
-
-=cut