Add an old jpred.pl script

author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>

Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)

committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>

Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)
author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)
committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)
diff --git a/binaries/src/jpred/jpred_old.pl b/binaries/src/jpred/jpred_old.pl

new file mode 100755 (executable)

index 0000000..99d7fdc
--- /dev/null
+++ b/binaries/src/jpred/jpred_old.pl
@@ -0,0 +1,1401 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+jpred - Secondary structure prediction program
+
+=head1 SYNOPSIS
+
+./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
+
+=head1 DESCRIPTION
+
+This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence 
+(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC. 
+For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the 
+secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
+
+=head1 OPTIONS
+
+=over 5
+
+=item -in <FILE1>
+
+The path to the sequence file (in FASTA, MSF, or BLC format)
+
+=item -seq
+
+The input file is a FASTA file with one sequence only.
+
+=item -output <FILEPREFIX>
+
+A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
+
+=item -outfile <FILE2>
+
+An output file, by default it is undefined and the -output option is working
+
+=item -logfile <FILE3>
+
+Logging file, by default it is undefined and logging switched off
+
+=item -dbpath PATH
+
+Path to the uniref database used for PSI-BLAST querying. default value: .
+
+=item -dbname database
+
+Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
+Default value: uniref90
+
+=item -psi path
+
+Path to a PSIBLAST output file.
+
+=item -ncpu NNN
+
+Number of CPU used by jpred.pl. Maximum value is 8
+Default: NNN = 1
+
+=item -pred-nohits
+
+Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
+
+=item -no-final
+
+By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
+The option untoggles this step and stops jpred at the concise file generation
+
+=item -jabaws
+
+Redefines some basic variables in order to use the script within JABAWS
+
+=item -verbose
+
+Verbose mode. Print more information while running jpred.
+
+=item -debug
+
+Debug mode. Print debugging information while running jpred. 
+
+=item -help
+
+Gives help on the programs usage.
+
+=item -man
+
+Displays this man page.
+
+=back
+
+=head1 BUGS
+
+Can't cope with gaps in the query sequence.
+
+Doesn't accept alignments.
+
+If you find any others please contact authors.
+
+=head1 AUTHORS
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+Chris Cole <christian@cole.name>
+Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
+
+=cut
+
+## TODO check for gaps in query sequence
+##      check for disallowed chars in sequence
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+use Carp;
+use Data::Dumper;
+use File::Temp;
+
+# path to Jpred modules
+use FindBin qw($Bin);
+use lib "$Bin/lib";
+
+# internal jpred modules
+use Jpred;
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
+use HMMER::Profile;
+use HMMER::Profile::Jnet;
+use PSIBLAST;
+use PSIBLAST::PSSM;
+use PSIBLAST::Run;
+use FASTA::File;
+use FASTA;
+use Index;
+use Pairwise;
+use OC;
+use Utils qw(profile);
+use Run qw(check);
+
+our $VERSION = '3.0.2';
+my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
+my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
+
+# Gap characters in sequences
+our $GAP = '-';
+
+#####################################################################################################
+# Light object to hold sequence information, light to prevent memory overload
+# for big search results. Even so this is quite memory intensive, I can't see
+# how to reduce the size further.
+{
+
+  package PSISEQ;
+
+  sub new {
+    my ( $class, %args ) = @_;
+    my $self = [];
+    bless $self, $class;
+    for ( keys %args ) { $self->$_( $args{$_} ) }
+    return $self;
+  }
+
+  sub id {
+    return defined $_[1]
+      ? $_[0]->[0] = $_[1]
+      : $_[0]->[0];
+  }
+
+  sub start {
+    return defined $_[1]
+      ? $_[0]->[1] = $_[1]
+      : $_[0]->[1];
+  }
+
+  sub align {
+    if ( defined $_[1] ) {
+      ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
+
+      $_[0]->[2] = join( "", @{ $_[1] } );
+    } else {
+      return [ split( //, $_[0]->[2] ) ];
+    }
+  }
+}
+
+#####################################################################################################
+my $infile;
+my $infastafile;
+my $format;
+my $goal = "align";
+my $seqgoal;
+my $outfile;
+my $prefix;
+my $psiblast;
+my $jabaws;
+my $logfile;
+my $ncpu       = 1;                              # number of CPUs for psiblast calculations
+my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
+my $db_path    = "/homes/www-jpred/databases";
+my $db_entry   = "cluster";
+my $nofinal;
+
+my ( $help, $man, $DEBUG, $VERBOSE );
+
+GetOptions(
+  "in=s"        => \$infile,
+  "output=s"    => \$prefix,
+  "outfile=s"   => \$outfile,
+  "logfile=s"   => \$logfile,
+  "psi=s"       => \$psiblast,
+  "dbname=s"    => \$db_entry,
+  "dbpath=s"    => \$db_path,
+  "ncpu=s"      => \$ncpu,
+  "pred-nohits" => \$predNoHits,
+  "no-final"    => \$nofinal,
+  "seq"         => \$seqgoal,
+  "jabaws"      => \$jabaws,
+
+  "help"    => \$help,
+  "man"     => \$man,
+  "debug"   => \$DEBUG,
+  "verbose" => \$VERBOSE
+) or pod2usage(2);
+pod2usage(1) if $help;
+pod2usage( verbose => 2 ) if $man;
+
+$goal = "seq" if ( defined $seqgoal );
+
+#####################################################################################################
+# Key to database information and information for accessing them
+my $database = {
+
+  ## default database used by Jpred
+  uniref90 => {
+    database   => $db_path . "/uniref90.filt",
+    unfiltered => $db_path . "/uniref90",
+  },
+
+  ## database used during Jnet training
+  training => {
+    database   => $db_path . "/training/uniref90.filt",
+    unfiltered => $db_path . "/training/uniref90",
+  },
+
+  ## database used for ported Jpred
+  ported_db => {
+    database   => $db_path . "/uniref90.filt",
+    unfiltered => $db_path . "/uniref90",
+  },
+
+  ## cluster-specific path for Jpred
+  cluster => {
+    database   => $db_path . "/uniref90.filt",
+    unfiltered => $db_path . "/uniref90",
+  },
+
+  ## these other DBs are experimental ones used during development.
+  ## check they exist before using them.
+  # generic entry for use with validate_jnet.pl
+  # real db location defined by validate_jnet.pl
+  swall => {
+    database   => $db_path . "/swall/swall.filt",
+    unfiltered => $db_path . "/swall/swall",
+  },
+
+  # Path to PSIBLAST db
+  uniprot => {
+    database   => $db_path . "/3/swall.filt",
+    unfiltered => $db_path . "/3/swall",
+  },
+
+  uniref50 => {
+    database   => $db_path . "/6/swall.filt",
+    unfiltered => $db_path . "/6/swall",
+  },
+};
+
+my $dpf = $database->{$db_entry}{database}.'.pal';
+my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
+pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
+pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
+pod2usage("ERROR! UNIREF filtered   database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
+pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
+
+#####################################################################################################
+# lots of preparation steps
+if ( defined $prefix ) {
+  unless ( defined $outfile ) {
+    $outfile = $prefix . ".res.fasta";
+  }
+} else {
+  if ( defined $outfile ) {
+    print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
+    $prefix = $outfile . ".tmp";
+  } else {
+    print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
+    $prefix  = $infile . ".res";
+    $outfile = $prefix . ".jnet";
+  }
+}
+
+if ( 1 > $ncpu or 8 < $ncpu ) {
+  print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
+  $ncpu = 1;
+}
+
+$VERBOSE = 1 if $DEBUG;
+
+my $LOG;
+if ( defined $logfile ) {
+  open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
+}
+
+for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
+  my ( $key, $value ) = @{$_};
+  defined $value or next;
+  my $par = join( ": ", $key, $value );
+  print "$par\n" if $DEBUG;
+  print $LOG "$par\n" if $LOG;
+}
+
+if ( defined $jabaws ) {
+  setup_jpred_env($Bin);
+  setup_env($Bin);
+}
+my $platform = check_OS();
+print "JPRED: checking platiform... $platform\n" if $DEBUG;
+print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
+
+#####################################################################################################
+# check input file format
+if ( 'seq' eq $goal ) {
+  $format = "seq";
+  if ( 1 != check_FASTA_format($infile) ) {
+    die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
+  }
+} else {
+  my $nseq = check_FASTA_format($infile);
+  if ( 0 < $nseq ) {
+    $format = "fasta";
+    if ( 1 == $nseq ) {
+      die "\nERROR! jpred requires alignment with more than 1 sequence\n       if you provide only one sequence use the -seq option.\n";
+    }
+  } elsif ( 0 < check_MSF_format($infile) ) {
+    $format = "msf";
+  } elsif ( 0 < check_BLC_format($infile) ) {
+    $format = "blc";
+  } else {
+    die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
+  }
+}
+$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
+
+#####################################################################################################
+if ( 'seq' eq $goal ) {
+  my $query = FASTA::File->new( read_file => $infile );
+  my @seqs;
+  my $psi = PSIBLAST->new;
+  unless ( defined $psiblast && -e $psiblast ) {
+    ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
+    # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
+    # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
+    my $psi_run = PSIBLAST::Run->new(
+      debug    => $DEBUG,
+      args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
+      path     => $psiblastbin,
+      input    => $infile,
+      output   => "$prefix.blast",
+      matrix   => "$prefix.profile",
+      database => $database->{$db_entry}{database},
+    );
+
+    # For reduced databases, get the size of the orginal and use this as
+    # the equivilent DB size
+    if ( $db_entry =~ /^sp_red_/ ) {
+      ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
+      $psi_run->args(
+        $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
+      );
+    }
+    print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
+    print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
+    print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
+    print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
+
+    print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+    print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
+    $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
+
+    $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+    system("gzip -9f $prefix.blast");
+  } else {
+    if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
+    else                         { $psi->read_file($psiblast) }                        # CC ditto above
+  }
+
+#####################################################################################################
+  # Convert the last itteration into a collection of PSISEQ objects
+  for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
+    my ( $id, $start, $align ) = @{$_};
+    push @seqs,
+      PSISEQ->new(
+      id    => $id,
+      start => $start,
+      align => [ split( //, $align ) ]
+      );
+  }
+
+#####################################################################################################
+  # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
+  # Accuracy won't be as good, but at least it's a prediction
+  if ( @seqs == 0 ) {
+    if ( $predNoHits == 0 ) {
+      warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
+      print ">>100% complete\n";
+      print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
+      print $LOG ">>100% complete\n"                                                                         if $LOG;
+      close($LOG)                                                                                            if $LOG;
+      exit;
+    } else {
+      print ">>50% complete\n";
+      warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
+      print "Running HMMer on query...\n"                                                            if $VERBOSE;
+      print $LOG ">>50% complete\n"                                                                  if $LOG;
+      print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
+      print $LOG "Running HMMer on query...\n"                                                       if $LOG;
+
+      # copy input query to alignment
+      #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
+      open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
+      open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
+      while (<$ifh>) { print $ofh, $_ }
+      close $ifh;
+      close $ofh;
+
+      # Temp files required for HMMer
+      my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+      system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
+      system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+      unlink $hmmbuild_out;
+
+      # Read in the HMMER file
+      my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+      # Convert from HMMER::Profile to HMMER::Profile::Jnet
+      my $hmmer_jnet = HMMER::Profile::Jnet->new;
+      $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
+      $hmmer_jnet->write_file("$prefix.hmm");
+      print ">>70% complete\n";
+      print $LOG ">>70% complete\n" if $LOG;
+
+      # Run Jnet for the prediction
+      print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+      print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+      my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+      print ">>90% complete\n";
+      print $LOG ">>90% complete\n" if $LOG;
+      print $jnetlog                if $VERBOSE;
+      print $LOG $jnetlog           if $LOG;
+    }
+  } else {
+    psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
+    print ">>40% complete\n";
+    print $LOG ">>40% complete\n" if $LOG;
+
+    #####################################################################################################
+    # Make PSIBLAST truncated alignments the right length
+    print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
+    print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
+    @seqs = extend( $query, @seqs );
+    psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove masking from PSIBLAST alignment
+    print "Unmasking the alignments...\n" if $VERBOSE;
+    print $LOG "Unmasking the alignments...\n" if $LOG;
+    my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
+    remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
+    psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Convert the sequences to upper case
+    print "Converting sequences to the same case...\n" if $VERBOSE;
+    print $LOG "Converting sequences to the same case...\n" if $LOG;
+    toupper($_) for @seqs;    # CC sub is below
+    psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
+
+    #####################################################################################################
+    # Remove excessive sequences
+    print "Remove excessive sequences...\n" if $VERBOSE;
+    print $LOG "Remove excessive sequences...\n" if $LOG;
+    @seqs = reduce( $MAX_ALIGN, @seqs );
+    psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove sequences that are too long or too short
+    print "Remove sequences which too long or short...\n" if $VERBOSE;
+    print $LOG "Remove sequences which too long or short...\n" if $LOG;
+    @seqs = del_long_seqs( 50, @seqs );
+    psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Remove redundant sequences based upon pairwise identity and OC clustering
+    print "Remove redundant sequences...\n" if $VERBOSE;
+    print $LOG "Remove redundant sequences...\n" if $LOG;
+    @seqs = nonred( $NR_CUT, @seqs );
+    psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
+
+    #####################################################################################################
+    # Check that we haven't got rid of all of the sequences
+    if ( @seqs < 2 ) {
+      warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
+      print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
+      @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+    }
+    unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
+
+    # Remove gaps in the query sequence and same positions in the alignment
+    print "Removing gaps in the query sequence...\n" if $VERBOSE;
+    print $LOG "Removing gaps in the query sequence...\n" if $LOG;
+    degap(@seqs);
+    psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
+
+    # Output the alignment for the prediction
+    print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
+    print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
+    psiseq2fasta( "$prefix.align", @seqs );
+
+    #####################################################################################################
+    # Equivilent to getpssm script
+    print "Output the PSSM matrix from the PSI-BLAST profile...\n";
+    print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
+    my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
+    $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
+    print ">>50% complete\n";
+    print $LOG ">>50% complete\n" if $LOG;
+
+    #####################################################################################################
+    # Run HMMER on the sequences
+    print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
+    my $hmmer = hmmer(@seqs);             # CC sub is below
+    $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
+    print ">>70% complete\n";
+    print $LOG ">>70% complete\n" if $LOG;
+#####################################################################################################
+    # Run Jnet for the prediction
+    print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
+    print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
+    my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
+    print ">>90% complete\n";
+    print $LOG ">>90% complete\n" if $LOG;
+    print $jnetlog                if $VERBOSE;
+    print $LOG $jnetlog           if $LOG;
+  }
+} else {
+  if ( 'fasta' eq $format ) {
+    print "Read FASTA file\n";
+    my $hmmer1 = hmm_for_align($infile);
+    $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'msf' eq $format ) {
+    msf2fasta( $infile, $infastafile );
+    my $hmmer2 = hmm_for_align($infastafile);
+    $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } elsif ( 'blc' eq $format ) {
+    my $tmpfile = File::Temp->new->filename;
+    blc2msf( $infile, $tmpfile );
+    msf2fasta( $infile, $infastafile );
+    my $hmmer3 = hmm_for_align($infastafile);
+    $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
+  } else {
+    die "ERROR! unknown input format '$format'. exit...\n";
+  }
+  print ">>70% complete\n";
+  print $LOG ">>70% complete\n" if $LOG;
+#####################################################################################################
+  # Run Jnet for the prediction
+  print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+  print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+  my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
+  print ">>90% complete\n";
+  print $LOG ">>90% complete\n" if $LOG;
+  print $jnetlog                if $VERBOSE;
+  print $LOG $jnetlog           if $LOG;
+}
+
+unless ( defined $nofinal ) {
+  my $aligfile      = $prefix . ".align";
+  my $jnetfile      = $prefix . ".jnet";
+  my $jnetfastafile = $prefix . ".jnet.fasta";
+  $aligfile = $infile if ( 'fasta' eq $format );
+  $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
+  concise2fasta( $jnetfile, $jnetfastafile );
+  open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
+  open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
+  open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  while (<$IN2>) { print $OUT $_; }
+  while (<$IN1>) { print $OUT $_; }
+  close($IN1);
+  close($IN2);
+  close($OUT);
+  unlink $jnetfastafile;
+  my $seqs = FASTA::File->new( read_file => $outfile );
+  open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
+  $seqs->write($OUT2);
+  close($OUT2);
+} else {
+  if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
+    rename $prefix . ".jnet", $outfile;
+  }
+}
+
+print ">>100% complete\n";
+print "Jpred Finished\n";
+print $LOG ">>100% complete\n" if $LOG;
+print $LOG "Jpred Finished\n"  if $LOG;
+close($LOG)                    if $LOG;
+exit;
+
+#####################################################################################################
+# Functions
+#####################################################################################################
+sub check_FASTA_format {
+  my $infile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  my $check_first_line = 1;
+  my $nseq             = 0;
+  local $/ = "\n>";
+  while (<$IN>) {
+    if ($check_first_line) {
+      return 0 unless (/^>/);
+      $check_first_line = 0;
+    }
+    s/^>//g;
+    s/>$//g;
+
+    my ( $id, @seqs ) = split /\n/, $_;
+    return 0 unless ( defined $id or @seqs );
+    my $seq = join( "", @seqs );
+    return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+    ++$nseq;
+  }
+  close($IN);
+
+  return $nseq;
+}
+#####################################################################################################
+sub check_MSF_format {
+  my $infile = shift;
+  $? = 0;
+  system("$readseq -fMSF -a -p < $infile > /dev/null");
+  return 0 if ($?);
+
+  return 1;
+}
+#####################################################################################################
+sub check_BLC_format {
+  my $infile = shift;
+
+  my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
+  open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmpfile2\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmpfile1");
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
+  unlink $tmpfile1;
+  unlink $tmpfile2;
+  return 0 if ($?);
+
+  return 1;
+}
+############################################################################################################################################
+# convertor BLC -> MSF
+sub msf2fasta {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  my $sequence = "";
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+  while (<$IN>) {
+    $sequence .= $_;
+  }
+  close($IN);
+
+  $sequence =~ s/~/\./g;
+
+  ## check for non-unique sequence names - readseq has problems with these
+  my @lines = split /\n/, $sequence;
+  my %names;
+  foreach my $line (@lines) {
+    if ( $line =~ /Name:\s+(\S+)/ ) {
+      die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
+      $names{$1}++;
+    }
+  }
+
+  my $tmpfile1 = File::Temp->new->filename;
+  my $tmpfile2 = File::Temp->new->filename;
+  open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
+  print $FILE $sequence;
+  close $FILE;
+
+  $? = 0;
+  system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile1;
+
+  $? = 0;
+  system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  unlink $tmpfile2;
+
+  die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
+}
+
+############################################################################################################################################
+# convertor BLC -> MSF
+sub blc2msf {
+  my ( $in, $out ) = @_;
+
+  my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
+
+  open my $fh, ">$tmp" or die "$tmp: $!\n";
+  print $fh "silent_mode\nblock_file $infile\n";
+  print $fh "output_file /dev/null\nmax_nseq 2000\n";
+  print $fh "pir_save $tmp_pir\nsetup\n";
+  close $fh;
+
+  $? = 0;
+  system("$alscript -s -f $tmp");
+  die "Reformatting of $infile failed. exit...\n" if ($?);
+  system("$readseq -f=MSF -o=$out -a $tmp_pir");
+
+  unlink $tmp;
+  unlink $tmp_pir;
+}
+
+#####################################################################################################
+
+=begin :private
+
+=head2 fasta2concise ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
+
+=cut
+
+sub fasta2concise {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
+  my ( $seq, @seqs, @title );
+  while (<$IN>) {
+    if (s/^>//) {
+      if ($seq) {
+        $seq =~ s/\n|\s//g;
+        $seq =~ s/(.)/$1,/g;
+        push @seqs, $seq;
+        $seq = "";
+      }
+      chomp;
+      push @title, $_;
+    } else {
+      chomp;
+      $seq .= $_;
+    }
+  }
+  $seq =~ s/\n|\s//g;
+  $seq =~ s/(.)/$1,/g;
+  push @seqs, $seq;
+  close($IN);
+
+  if ( @title != @seqs ) {
+    die("non matching number of titles and sequences!\n");
+  }
+
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  foreach ( 0 .. $#title ) {
+    print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
+  }
+  close($OUT);
+}
+
+=begin :private
+
+=head2 concise2fasta ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
+
+=cut
+
+#####################################################################################################
+sub concise2fasta {
+  my $infile  = shift;
+  my $outfile = shift;
+
+  my ( @seqs, %seq, @pred, %pred );
+
+  my @var = (
+    "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
+    "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
+  );
+
+  # parse input concise file
+  open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
+  while (<$IN>) {
+    unless (/^\n/) {
+      my ( $id, $seq ) = split( ":", $_ );
+      if ( defined $id and defined $seq ) {    # Check we have proper values
+        $seq =~ s/,//g;
+        chomp $seq;
+        if ( $id =~ /;/ ) {                    # Then it's an alignment
+          @_ = split( ";", $id );
+          push @seqs, $_[1];
+          $seq{ $_[1] } = $seq;
+        }
+        foreach my $v (@var) {
+          if ( $v eq $id ) {
+            push @pred, $v;
+            $pred{$v} = $seq;
+          }
+        }
+      }
+    }
+  }
+  close($IN);
+
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+
+  # print out sequences found with PSI-BLAST
+  foreach (@seqs) {
+    $seq{$_} =~ s/(.{72})/$1\n/g;
+    print $OUT ">$_\n$seq{$_}\n";
+  }
+
+  # print out predictions
+  foreach my $p (@pred) {
+    $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
+    $pred{$p} =~ s/B/E/g;
+    $pred{$p} =~ s/G/H/g;
+
+    if ( $p =~ /SOL/ ) {
+      $pred{$p} =~ s/E/B/g;
+    }
+    $pred{$p} =~ s/(.{72})/$1\n/g;
+    print $OUT ">$p\n$pred{$p}\n";
+  }
+  close($OUT);
+}
+
+=begin :private
+
+=head2 $PSISEQ = remove_seq_masks($PSISEQ)
+
+Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. 
+Does it in place.
+
+=cut
+
+#####################################################################################################
+sub remove_seq_masks {
+  my $idx = shift;
+  my $seq = shift;
+
+  # Only bother if we have a sequence which has been masked
+  return unless grep { uc $_ eq 'X' } @{ $seq->align };
+
+  my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
+  my $searchdb = $database->{$db_entry}{unfiltered};
+  $start--;    # We need this to be zero based
+
+  my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
+  my @seqs = $f->get_entries;
+  my $db   = shift @seqs;
+
+  unless ($db) {
+    confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
+  } elsif (@seqs) {
+    confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
+  }
+
+  my @db_seq = @{ $db->seq };
+
+  # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
+  my $i = 0;
+  for ( 0 .. $#align ) {
+    next if $align[$_] =~ /$GAP/;
+
+    if ( $align[$_] eq 'X' ) {
+      unless ( defined $db_seq[ $i + $start ] ) {
+        croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
+      }
+      $align[$_] = $db_seq[ $i + $start ];
+    }
+    $i++;
+  }
+  $seq->align( \@align );
+}
+
+=head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
+
+Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
+
+Equivilent to reduce script.
+
+=cut
+
+#####################################################################################################
+sub reduce {
+  my ( $max, @seqs ) = @_;
+
+  my $num = int( @seqs / $max );
+  my @res = shift @seqs;
+
+  # Don't bother if we have less than the required sequences
+  if ( @seqs < $max or 0 == $num ) {
+    return @res, @seqs;
+  }
+
+  for ( 0 .. $#seqs ) {
+    push @res, $seqs[$_] if ( 0 == $_ % $num );
+  }
+
+  return @res;
+}
+
+=head2 nonred($cutoff, @PSISEQS);
+
+Removes redundant sequences at $cutoff level of sequence identity.
+
+Equivilent to nonred script.
+
+=cut
+
+#####################################################################################################
+sub nonred {
+  my ( $cutoff, @seqs ) = @_;
+
+  # Run pairwise and then OC and remove similar sequences
+  unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
+  unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
+  if     ( @seqs == 1 ) {
+    warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
+    return @seqs;
+  }
+
+  # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
+  my $fasta = FASTA::File->new;
+  foreach (@seqs) {
+    my $f = FASTA->new( id => $_->id );
+    $f->seq( @{ $_->align } );
+    $fasta->add_entries($f);
+  }
+
+  # Run pairwise
+  my $pair = Pairwise->new( path => $pairwise );
+  my $foo = join "\n", $pair->run($fasta) or die $!;
+
+  # Run OC
+  my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
+  $ocres->read_string( join "", $ocres->run($foo) );
+
+  # Now found those entries that are unrelated
+  my @keep;
+  for ( $ocres->get_groups ) {
+    my ( $group, $score, $size, @labels ) = @{$_};
+
+    # Keep the QUERY from the cluster, otherwise just get onelt
+    if ( grep { /QUERY/ } @labels ) {
+
+      # Make sure the query stays at the start
+      unshift @keep, "QUERY";
+      next;
+    } else {
+      push @keep, shift @labels;
+    }
+  }
+  push @keep, $ocres->get_unclustered;
+
+  # Return the matching entries.
+  # We use the temporay to mimic the selection of sequences in the nonred
+  # script, which took the sequences in the order they occured in the file.
+  my @filtered_res;
+  for (@seqs) {
+    my $label = $_->id;
+    if ( grep { $_ =~ /^$label$/ } @keep ) {
+      push @filtered_res, $_;
+    }
+  }
+
+  return @filtered_res;
+}
+
+=head2 @seqs = del_long_seqs($percentage, @PSISEQS)
+
+Removes those sequences that are more than $percentage longer or shorter than the first 
+sequence in @seqs. @seqs is an array of PSISEQ objects.
+
+Equivilent to trim_seqs script.
+
+=cut
+
+#####################################################################################################
+sub del_long_seqs {
+  my ( $factor, @seqs ) = @_;
+
+  # Keep the query sequence (we assume query is the 1st FASTA record)
+  my $query = shift @seqs;
+  my @res   = $query;
+
+  # Find the length of the query sequence without any gaps
+  my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
+
+  # Calculate the upper and lower allowed bounds
+  my ( $upper, $lower );
+  {
+    my $bounds = ( $length / 100 ) * $factor;
+    ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
+  }
+
+  # Now try each sequence and see how long they are
+  for (@seqs) {
+    my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
+    if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
+  }
+
+  return @res;
+}
+
+=head2 toupper ($PSISEQ)
+
+Converts sequence held in PSISEQ object to uppercase.
+
+=cut
+
+#####################################################################################################
+sub toupper {
+  $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
+}
+
+=head2 degap($PSISEQ_QUERY, @PSISEQS)
+
+Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
+
+Equivilent to fasta2jnet script.
+
+=cut
+
+#####################################################################################################
+sub degap {
+  my (@seqs) = @_;
+
+  # Find where the gaps in the query sequence are
+  my @gaps;
+  my $i = 0;
+  for ( @{ $seqs[0]->align } ) {
+    push @gaps, $i if $_ !~ /$GAP/;
+    $i++;
+  }
+
+  return unless @gaps;
+
+  # Remove the gaps in all the sequences.
+  # Derefences the array reference and takes a slice inside the method call
+  # arguments, then coerce back to an array ref.
+  for (@seqs) {
+    $_->align( [ @{ $_->align }[@gaps] ] );
+  }
+}
+
+=head2 getfreq($filename, @PSISEQS)
+
+Creates a PSIBLAST like profile and writes it to the $filename.
+
+Equivilant to profile or getfreq (older) script. This doesn't create the same answer as 
+the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, 
+but it's *not* the same.
+
+=cut
+
+#####################################################################################################
+sub getfreq {
+  my ( $fn, @seqs ) = @_;
+
+  #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
+
+  # Create a PSIBLAST like profile from the alignment
+  # This doesn't greate the same result as old Jpred, I think this is because
+  # differences in the input between old and new
+
+  # For testing
+  #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
+  #   my @profile = profile(
+  #      map { join "", @{ $_->seq } } $f->get_entries
+  #   );
+
+  my @profile = profile( map { join "", @{ $_->align } } @seqs );
+
+  open my $fh, ">$fn" or die "JPRED: $fn: $!";
+  print $fh join( " ", @{$_} ), "\n" for @profile;
+}
+
+=head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
+
+Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
+
+Equivilent to gethmm script.
+
+=cut
+
+#####################################################################################################
+sub hmmer {
+  my (@seqs) = @_;
+
+  # Temp files required
+  my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
+
+  # Create the FASTA file
+  psiseq2fasta( $tmp_fasta, @seqs );
+
+  # Run HMMer
+  system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
+  system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+  # Read in the HMMER file
+  my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Convert from HMMER::Profile to HMMER::Profile::Jnet
+  my $hmmer_jnet = HMMER::Profile::Jnet->new;
+  $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
+
+  return $hmmer_jnet;
+}
+
+=head2 jnet($hmmer, $out, $pssm )
+
+Runs Jnet. Pass the paths of the filenames to do the prediction on
+
+=cut
+
+#################################################################################################################################################
+
+=head2 get_hmm($alignFile, $name)
+
+Adapted from the hmmer() function in Jpred as written by JDB.
+
+Generates an HMMer profile output compatible with Jnet.
+
+Equivilent to gethmm script.
+
+=cut
+
+sub hmm_for_align {
+  my $fastafile = shift;
+
+  # Temp files required
+  print "hmm_for_align: fastafile = $fastafile\n";
+  my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+
+  system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
+  system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+  # Read in the HMMER file
+  my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Read in the HMMER file
+  my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+  # Convert from HMMER::Profile to HMMER::Profile::Jnet
+  my $hmmer_jnet = HMMER::Profile::Jnet->new;
+  $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
+
+  return $hmmer_jnet;
+}
+
+#####################################################################################################
+sub jnet {
+  my ( $hmmer, $outfile, $pssm ) = @_;
+  if ($pssm) {
+
+    # run Jnet prediction with all the profile data
+    system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
+  } else {
+
+    # run Jnet prediction with only HMMer and alignment data
+    system("$jnet --concise --hmmer $hmmer > $outfile");
+  }
+  check( "jnet", $? ) or exit 1;
+  my $res     = "";
+  my $logging = "";
+  open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  while (<$IN>) {
+    if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
+      $res .= $_;
+    } else {
+      $logging .= $_;
+    }
+  }
+  close $IN;
+  open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+  print $OUT "\n" . $res;
+  close $OUT;
+  return $logging;
+}
+
+=head1 psiseq2fasta($path, @PSISEQ)
+
+Writes a FASTA file given an array of PSISEQ objects.
+
+=cut
+
+#####################################################################################################
+sub psiseq2fasta {
+  my ( $fn, @seqs ) = @_;
+
+  #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
+  confess "JPRED: not passed PSISEQs" unless @seqs;
+
+  #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
+
+  # Changed from using FASTA::File object due to seg faulting on some large,
+  # long alignments, presumably due to running out of memory.
+  #   my $fasta = FASTA::File->new;
+  #   $fasta->add_entries(
+  #      map {
+  #         my $f = FASTA->new(id => $_->id);
+  #         $f->seq(@{$_->align});
+  #         $f;
+  #      } @seqs
+  #   );
+  #   $fasta->write_file($fn);
+
+  my $fh;
+  $fn =~ /\.gz$/
+    ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
+    : ( open $fh, ">$fn" or die "JPRED: $!" );
+
+  for (@seqs) {
+    print $fh ">", $_->id, "\n";
+    my $seq = join( "", @{ $_->align } );
+    $seq =~ s/(.{72})/$1\n/g;
+    chomp $seq;
+    print $fh $seq . "\n";
+  }
+  close $fh;
+}
+
+=head1 @PSISEQ = fasta2psiseq($path)
+
+Reads in a FASTA file and returns an array of PSISEQ objects.
+
+=cut
+
+#####################################################################################################
+sub fasta2psiseq {
+  my ($fn) = @_;
+
+  my $fasta =
+    $fn =~ /\.gz$/
+    ? FASTA::File->new( read_gzip_file => $fn )
+    : FASTA::File->new( read_file      => $fn );
+  $fasta or die "JPRED: $!";
+
+  my @seqs;
+  for ( $fasta->get_entries ) {
+    my $seq = PSISEQ->new;
+    $seq->id( $_->id );
+    $seq->align( [ @{ $_->seq } ] );
+    push @seqs, $seq;
+  }
+
+  return @seqs;
+}
+
+#####################################################################################################
+sub fasta2psiseq_lowmemory {
+  my $filename = shift;
+
+  local $/ = "\n";
+  my $fh;
+  $filename =~ /\.gz$/
+    ? ( open $fh, "gzip -dc $filename|" or die $! )
+    : ( open $fh, $filename or die $! );
+
+  my @seqs;
+  my ( $id, $seq );
+  while (<$fh>) {
+    chomp;
+    if (s/^>//) {
+      $seq =~ s/ //g if defined $seq;
+
+      if ( $id and $seq ) {
+        my $psi = PSISEQ->new( id => $id );
+        $psi->align( [ split //, $seq ] );
+        push @seqs, $psi;
+        undef $id;
+        undef $seq;
+      } else {
+        $id = $_;
+      }
+    } else {
+      $seq .= $_;
+    }
+  }
+  if ( $id and $seq ) {
+    my $psi = PSISEQ->new( id => $id );
+    $psi->seq($seq);
+    push @seqs, $psi;
+  }
+
+  return @seqs;
+}
+
+#####################################################################################################
+
+=head1 extend($FASTA::File, @PSISEQ)
+
+Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues 
+against the query, this function extends the alignment so that all of the query is 
+present in the alignment. The other sequences are extended with gap characters.
+
+=cut
+
+sub extend {
+  my ( $query, @seqs ) = @_;
+
+  #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
+  #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
+
+  # Get the query sequence
+  my $q_query = $query->get_entry(0);
+
+  # Get the query sequence from the PSIBLAST alignment
+  my $a_query = shift @seqs;
+
+  # The gap size required to expand the alignment
+  my ( $start_gap_length, $end_gap_length );
+  {
+
+    # Make handling the sequence easier
+    my $q_seq = join "", @{ [ $q_query->seq ] };
+    my $a_seq = join "", @{ $a_query->align };
+
+    $q_seq =~ s/-//g;
+    $a_seq =~ s/-//g;
+
+    ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
+
+    $start_gap_length = index( $q_seq, $a_seq );
+    croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
+
+    $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
+  }
+
+  # Add the gaps to the end of the alignments
+  for (@seqs) {
+    $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
+  }
+
+  # Put the query sequence back
+  unshift @seqs,
+    PSISEQ->new(
+    id    => $a_query->id,
+    align => [
+      ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
+      @{ $a_query->align },
+      ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
+    ]
+    );
+
+  return @seqs;
+}
+
+#####################################################################################################
+
+=head1 $int = fasta_seq_length($path)
+
+Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
+
+=cut
+
+sub fasta_seq_length {
+  my ($fn) = @_;
+  open my $fh, $fn or croak "Can't open file $fn: $!\n";
+
+  my $length = 0;
+  local $_, $/ = "\n";
+  while (<$fh>) {
+    next if /^>/;
+    chomp;
+    $length += tr///c;
+  }
+
+  return $length;
+}
author	Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
	Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)
committer	Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
	Mon, 14 Oct 2013 14:26:30 +0000 (15:26 +0100)