binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
  14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
  15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
  16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
  17
  18 =head1 OPTIONS
  19
  20 =over 5
  21
  22 =item -in <FILE1>
  23
  24 The path to the sequence file (in FASTA, MSF, or BLC format)
  25
  26 =item -seq
  27
  28 The input file is a FASTA file with one sequence only.
  29
  30 =item -output <FILEPREFIX>
  31
  32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  33
  34 =item -outfile <FILE2>
  35
  36 An output file, by default it is undefined and the -output option is working
  37
  38 =item -logfile <FILE3>
  39
  40 Logging file, by default it is undefined and logging switched off
  41
  42 =item -dbpath PATH
  43
  44 Path to the uniref database used for PSI-BLAST querying. default value: .
  45
  46 =item -dbname database
  47
  48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  49 Default value: uniref90
  50
  51 =item -psi path
  52
  53 Path to a PSIBLAST output file.
  54
  55 =item -ncpu NNN
  56
  57 Number of CPU used by jpred.pl. Maximum value is 8
  58 Default: NNN = 1
  59
  60 =item -pred-nohits
  61
  62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  63
  64 =item -no-final
  65
  66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  67 The option untoggles this step and stops jpred at the concise file generation
  68
  69 =item -jabaws
  70
  71 Redefined basic variable in order to use the script within JABAWS
  72
  73 =item -verbose
  74
  75 Verbose mode. Print more information while running jpred.
  76
  77 =item -debug
  78
  79 Debug mode. Print debugging information while running jpred.
  80
  81 =item -help
  82
  83 Gives help on the programs usage.
  84
  85 =item -man
  86
  87 Displays this man page.
  88
  89 =back
  90
  91 =head1 BUGS
  92
  93 Can't cope with gaps in the query sequence.
  94
  95 Doesn't accept alignments.
  96
  97 If you find any others please contact authors.
  98
  99 =head1 AUTHORS
 100
 101 Jonathan Barber <jon@compbio.dundee.ac.uk>
 102 Chris Cole <christian@cole.name>
 103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 104
 105 =cut
 106
 107 ## TODO check for gaps in query sequence
 108 ##      check for disallowed chars in sequence
 109
 110 use strict;
 111 use warnings;
 112 use Getopt::Long;
 113 use Pod::Usage;
 114 use Carp;
 115 use Data::Dumper;
 116 use File::Temp;
 117
 118 # path to Jpred modules
 119 use FindBin qw($Bin);
 120 use lib "$Bin/lib";
 121
 122 # internal jpred modules
 123 use Jpred;
 124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
 125 use HMMER::Profile;
 126 use HMMER::Profile::Jnet;
 127 use PSIBLAST;
 128 use PSIBLAST::PSSM;
 129 use PSIBLAST::Run;
 130 use FASTA::File;
 131 use FASTA;
 132 use Index;
 133 use Pairwise;
 134 use OC;
 135 use Utils qw(profile);
 136 use Run qw(check);
 137
 138 our $VERSION = '3.0.2';
 139 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 140 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 141
 142 # Gap characters in sequences
 143 our $GAP = '-';
 144
 145 #####################################################################################################
 146 # Light object to hold sequence information, light to prevent memory overload
 147 # for big search results. Even so this is quite memory intensive, I can't see
 148 # how to reduce the size further.
 149 {
 150
 151   package PSISEQ;
 152
 153   sub new {
 154     my ( $class, %args ) = @_;
 155     my $self = [];
 156     bless $self, $class;
 157     for ( keys %args ) { $self->$_( $args{$_} ) }
 158     return $self;
 159   }
 160
 161   sub id {
 162     return defined $_[1]
 163       ? $_[0]->[0] = $_[1]
 164       : $_[0]->[0];
 165   }
 166
 167   sub start {
 168     return defined $_[1]
 169       ? $_[0]->[1] = $_[1]
 170       : $_[0]->[1];
 171   }
 172
 173   sub align {
 174     if ( defined $_[1] ) {
 175       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 176
 177       $_[0]->[2] = join( "", @{ $_[1] } );
 178     } else {
 179       return [ split( //, $_[0]->[2] ) ];
 180     }
 181   }
 182 }
 183
 184 #####################################################################################################
 185 my $infile;
 186 my $infastafile;
 187 my $format;
 188 my $goal = "align";
 189 my $seqgoal;
 190 my $outfile;
 191 my $prefix;
 192 my $psiblast;
 193 my $jabaws;
 194 my $logfile;
 195 my $ncpu       = 1;                              # number of CPUs for psiblast calculations
 196 my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
 197 my $db_path    = "/homes/www-jpred/databases";
 198 my $db_entry   = "cluster";
 199 my $nofinal;
 200
 201 my ( $help, $man, $DEBUG, $VERBOSE );
 202
 203 GetOptions(
 204   "in=s"        => \$infile,
 205   "output=s"    => \$prefix,
 206   "outfile=s"   => \$outfile,
 207   "logfile=s"   => \$logfile,
 208   "psi=s"       => \$psiblast,
 209   "dbname=s"    => \$db_entry,
 210   "dbpath=s"    => \$db_path,
 211   "ncpu=s"      => \$ncpu,
 212   "pred-nohits" => \$predNoHits,
 213   "no-final"    => \$nofinal,
 214   "seq"         => \$seqgoal,
 215   "jabaws"      => \$jabaws,
 216
 217   "help"    => \$help,
 218   "man"     => \$man,
 219   "debug"   => \$DEBUG,
 220   "verbose" => \$VERBOSE
 221 ) or pod2usage(2);
 222 pod2usage(1) if $help;
 223 pod2usage( verbose => 2 ) if $man;
 224
 225 $goal = "seq" if ( defined $seqgoal );
 226
 227 #####################################################################################################
 228 # Key to database information and information for accessing them
 229 if ( defined $jabaws ) {
 230   $db_path  = "/homes/www-jpred/databases";
 231   $db_entry = "cluster";
 232 }
 233
 234 my $database = {
 235
 236   ## default database used by Jpred
 237   uniref90 => {
 238     database   => $db_path . "/uniref90.filt",
 239     unfiltered => $db_path . "/uniref90",
 240   },
 241
 242   ## database used during Jnet training
 243   training => {
 244     database   => $db_path . "/training/uniref90.filt",
 245     unfiltered => $db_path . "/training/uniref90",
 246   },
 247
 248   ## database used for ported Jpred
 249   ported_db => {
 250     database   => $db_path . "/uniref90.filt",
 251     unfiltered => $db_path . "/uniref90",
 252   },
 253
 254   ## cluster-specific path for Jpred
 255   cluster => {
 256     database   => $db_path . "/uniref90.filt",
 257     unfiltered => $db_path . "/uniref90",
 258   },
 259
 260   ## these other DBs are experimental ones used during development.
 261   ## check they exist before using them.
 262   # generic entry for use with validate_jnet.pl
 263   # real db location defined by validate_jnet.pl
 264   swall => {
 265     database   => $db_path . "/swall/swall.filt",
 266     unfiltered => $db_path . "/swall/swall",
 267   },
 268
 269   # Path to PSIBLAST db
 270   uniprot => {
 271     database   => $db_path . "/3/swall.filt",
 272     unfiltered => $db_path . "/3/swall",
 273   },
 274
 275   uniref50 => {
 276     database   => $db_path . "/6/swall.filt",
 277     unfiltered => $db_path . "/6/swall",
 278   },
 279 };
 280
 281 my $dpf = $database->{$db_entry}{database}.'.pal';
 282 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
 283 pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
 284 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
 285 pod2usage("ERROR! UNIREF filtered   database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
 286 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
 287
 288 #####################################################################################################
 289 # lots of preparation steps
 290 if ( defined $prefix ) {
 291   unless ( defined $outfile ) {
 292     $outfile = $prefix . ".res.fasta";
 293   }
 294 } else {
 295   if ( defined $outfile ) {
 296     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 297     $prefix = $outfile . ".tmp";
 298   } else {
 299     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 300     $prefix  = $infile . ".res";
 301     $outfile = $prefix . ".jnet";
 302   }
 303 }
 304
 305 if ( 1 > $ncpu or 8 < $ncpu ) {
 306   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 307   $ncpu = 1;
 308 }
 309
 310 $VERBOSE = 1 if $DEBUG;
 311
 312 my $LOG;
 313 if ( defined $logfile ) {
 314   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 315 }
 316
 317 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 318   my ( $key, $value ) = @{$_};
 319   defined $value or next;
 320   my $par = join( ": ", $key, $value );
 321   print "$par\n" if $DEBUG;
 322   print $LOG "$par\n" if $LOG;
 323 }
 324
 325 if ( defined $jabaws ) {
 326   setup_jpred_env($Bin);
 327   setup_env($Bin);
 328 }
 329 my $platform = check_OS();
 330 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 331 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 332
 333 #####################################################################################################
 334 # check input file format
 335 if ( 'seq' eq $goal ) {
 336   $format = "seq";
 337   if ( 1 != check_FASTA_format($infile) ) {
 338     die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
 339   }
 340 } else {
 341   my $nseq = check_FASTA_format($infile);
 342   if ( 0 < $nseq ) {
 343     $format = "fasta";
 344     if ( 1 == $nseq ) {
 345       die "\nERROR! jpred requires alignment with more than 1 sequence\n       if you provide only one sequence use the -seq option.\n";
 346     }
 347   } elsif ( 0 < check_MSF_format($infile) ) {
 348     $format = "msf";
 349   } elsif ( 0 < check_BLC_format($infile) ) {
 350     $format = "blc";
 351   } else {
 352     die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
 353   }
 354 }
 355 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
 356
 357 #####################################################################################################
 358 if ( 'seq' eq $goal ) {
 359   my $query = FASTA::File->new( read_file => $infile );
 360   my @seqs;
 361   my $psi = PSIBLAST->new;
 362   unless ( defined $psiblast && -e $psiblast ) {
 363     ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 364     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 365     # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 366     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 367     my $psi_run = PSIBLAST::Run->new(
 368       debug    => $DEBUG,
 369       args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 370       path     => $psiblastbin,
 371       input    => $infile,
 372       output   => "$prefix.blast",
 373       matrix   => "$prefix.profile",
 374       database => $database->{$db_entry}{database},
 375     );
 376
 377     # For reduced databases, get the size of the orginal and use this as
 378     # the equivilent DB size
 379     if ( $db_entry =~ /^sp_red_/ ) {
 380       ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 381       $psi_run->args(
 382         $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 383       );
 384     }
 385     print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
 386     print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
 387     print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 388     print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 389
 390     print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 391     print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 392     $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 393
 394     $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 395     system("gzip -9f $prefix.blast");
 396   } else {
 397     if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 398     else                         { $psi->read_file($psiblast) }                        # CC ditto above
 399   }
 400
 401 #####################################################################################################
 402   # Convert the last itteration into a collection of PSISEQ objects
 403   for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 404     my ( $id, $start, $align ) = @{$_};
 405     push @seqs,
 406       PSISEQ->new(
 407       id    => $id,
 408       start => $start,
 409       align => [ split( //, $align ) ]
 410       );
 411   }
 412
 413 #####################################################################################################
 414   # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
 415   # Accuracy won't be as good, but at least it's a prediction
 416   if ( @seqs == 0 ) {
 417     if ( $predNoHits == 0 ) {
 418       warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 419       print ">>100% complete\n";
 420       print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 421       print $LOG ">>100% complete\n"                                                                         if $LOG;
 422       close($LOG)                                                                                            if $LOG;
 423       exit;
 424     } else {
 425       print ">>50% complete\n";
 426       warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 427       print "Running HMMer on query...\n"                                                            if $VERBOSE;
 428       print $LOG ">>50% complete\n"                                                                  if $LOG;
 429       print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 430       print $LOG "Running HMMer on query...\n"                                                       if $LOG;
 431
 432       # copy input query to alignment
 433       #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 434       open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 435       open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 436       while (<$ifh>) { print $ofh, $_ }
 437       close $ifh;
 438       close $ofh;
 439
 440       # Temp files required for HMMer
 441       my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 442       system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 443       system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 444       unlink $hmmbuild_out;
 445
 446       # Read in the HMMER file
 447       my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 448
 449       # Convert from HMMER::Profile to HMMER::Profile::Jnet
 450       my $hmmer_jnet = HMMER::Profile::Jnet->new;
 451       $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 452       $hmmer_jnet->write_file("$prefix.hmm");
 453       print ">>70% complete\n";
 454       print $LOG ">>70% complete\n" if $LOG;
 455
 456       # Run Jnet for the prediction
 457       print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 458       print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 459       my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 460       print ">>90% complete\n";
 461       print $LOG ">>90% complete\n" if $LOG;
 462       print $jnetlog                if $VERBOSE;
 463       print $LOG $jnetlog           if $LOG;
 464     }
 465   } else {
 466     psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 467     print ">>40% complete\n";
 468     print $LOG ">>40% complete\n" if $LOG;
 469
 470     #####################################################################################################
 471     # Make PSIBLAST truncated alignments the right length
 472     print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 473     print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 474     @seqs = extend( $query, @seqs );
 475     psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 476
 477     #####################################################################################################
 478     # Remove masking from PSIBLAST alignment
 479     print "Unmasking the alignments...\n" if $VERBOSE;
 480     print $LOG "Unmasking the alignments...\n" if $LOG;
 481     my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 482     remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 483     psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 484
 485     #####################################################################################################
 486     # Convert the sequences to upper case
 487     print "Converting sequences to the same case...\n" if $VERBOSE;
 488     print $LOG "Converting sequences to the same case...\n" if $LOG;
 489     toupper($_) for @seqs;    # CC sub is below
 490     psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 491
 492     #####################################################################################################
 493     # Remove excessive sequences
 494     print "Remove excessive sequences...\n" if $VERBOSE;
 495     print $LOG "Remove excessive sequences...\n" if $LOG;
 496     @seqs = reduce( $MAX_ALIGN, @seqs );
 497     psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 498
 499     #####################################################################################################
 500     # Remove sequences that are too long or too short
 501     print "Remove sequences which too long or short...\n" if $VERBOSE;
 502     print $LOG "Remove sequences which too long or short...\n" if $LOG;
 503     @seqs = del_long_seqs( 50, @seqs );
 504     psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 505
 506     #####################################################################################################
 507     # Remove redundant sequences based upon pairwise identity and OC clustering
 508     print "Remove redundant sequences...\n" if $VERBOSE;
 509     print $LOG "Remove redundant sequences...\n" if $LOG;
 510     @seqs = nonred( $NR_CUT, @seqs );
 511     psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 512
 513     #####################################################################################################
 514     # Check that we haven't got rid of all of the sequences
 515     if ( @seqs < 2 ) {
 516       warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 517       print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 518       @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 519     }
 520     unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 521
 522     # Remove gaps in the query sequence and same positions in the alignment
 523     print "Removing gaps in the query sequence...\n" if $VERBOSE;
 524     print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 525     degap(@seqs);
 526     psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 527
 528     # Output the alignment for the prediction
 529     print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 530     print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 531     psiseq2fasta( "$prefix.align", @seqs );
 532
 533     #####################################################################################################
 534     # Equivilent to getpssm script
 535     print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 536     print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 537     my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 538     $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 539     print ">>50% complete\n";
 540     print $LOG ">>50% complete\n" if $LOG;
 541
 542     #####################################################################################################
 543     # Run HMMER on the sequences
 544     print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 545     print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 546     my $hmmer = hmmer(@seqs);             # CC sub is below
 547     $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 548     print ">>70% complete\n";
 549     print $LOG ">>70% complete\n" if $LOG;
 550 #####################################################################################################
 551     # Run Jnet for the prediction
 552     print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 553     print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 554     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 555     print ">>90% complete\n";
 556     print $LOG ">>90% complete\n" if $LOG;
 557     print $jnetlog                if $VERBOSE;
 558     print $LOG $jnetlog           if $LOG;
 559   }
 560 } else {
 561   if ( 'fasta' eq $format ) {
 562     print "Read FASTA file\n";
 563     my $hmmer1 = hmm_for_align($infile);
 564     $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 565   } elsif ( 'msf' eq $format ) {
 566     msf2fasta( $infile, $infastafile );
 567     my $hmmer2 = hmm_for_align($infastafile);
 568     $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 569   } elsif ( 'blc' eq $format ) {
 570     my $tmpfile = File::Temp->new->filename;
 571     blc2msf( $infile, $tmpfile );
 572     msf2fasta( $infile, $infastafile );
 573     my $hmmer3 = hmm_for_align($infastafile);
 574     $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 575   } else {
 576     die "ERROR! unknown input format '$format'. exit...\n";
 577   }
 578   print ">>70% complete\n";
 579   print $LOG ">>70% complete\n" if $LOG;
 580 #####################################################################################################
 581   # Run Jnet for the prediction
 582   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 583   print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 584   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
 585   print ">>90% complete\n";
 586   print $LOG ">>90% complete\n" if $LOG;
 587   print $jnetlog                if $VERBOSE;
 588   print $LOG $jnetlog           if $LOG;
 589 }
 590
 591 unless ( defined $nofinal ) {
 592   my $aligfile      = $prefix . ".align";
 593   my $jnetfile      = $prefix . ".jnet";
 594   my $jnetfastafile = $prefix . ".jnet.fasta";
 595   $aligfile = $infile if ( 'fasta' eq $format );
 596   $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
 597   concise2fasta( $jnetfile, $jnetfastafile );
 598   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 599   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 600   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 601   while (<$IN2>) { print $OUT $_; }
 602   while (<$IN1>) { print $OUT $_; }
 603   close($IN1);
 604   close($IN2);
 605   close($OUT);
 606   unlink $jnetfastafile;
 607   my $seqs = FASTA::File->new( read_file => $outfile );
 608   open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
 609   $seqs->write($OUT2);
 610   close($OUT2);
 611 } else {
 612   if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
 613     rename $prefix . ".jnet", $outfile;
 614   }
 615 }
 616
 617 print ">>100% complete\n";
 618 print "Jpred Finished\n";
 619 print $LOG ">>100% complete\n" if $LOG;
 620 print $LOG "Jpred Finished\n"  if $LOG;
 621 close($LOG)                    if $LOG;
 622 exit;
 623
 624 #####################################################################################################
 625 # Functions
 626 #####################################################################################################
 627 sub check_FASTA_format {
 628   my $infile = shift;
 629
 630   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 631   my $check_first_line = 1;
 632   my $nseq             = 0;
 633   local $/ = "\n>";
 634   while (<$IN>) {
 635     if ($check_first_line) {
 636       return 0 unless (/^>/);
 637       $check_first_line = 0;
 638     }
 639     s/^>//g;
 640     s/>$//g;
 641
 642     my ( $id, @seqs ) = split /\n/, $_;
 643     return 0 unless ( defined $id or @seqs );
 644     my $seq = join( "", @seqs );
 645     return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
 646     ++$nseq;
 647   }
 648   close($IN);
 649
 650   return $nseq;
 651 }
 652 #####################################################################################################
 653 sub check_MSF_format {
 654   my $infile = shift;
 655   $? = 0;
 656   system("$readseq -fMSF -a -p < $infile > /dev/null");
 657   return 0 if ($?);
 658
 659   return 1;
 660 }
 661 #####################################################################################################
 662 sub check_BLC_format {
 663   my $infile = shift;
 664
 665   my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
 666   open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
 667   print $fh "silent_mode\nblock_file $infile\n";
 668   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 669   print $fh "pir_save $tmpfile2\nsetup\n";
 670   close $fh;
 671
 672   $? = 0;
 673   system("$alscript -s -f $tmpfile1");
 674   $? = 0;
 675   system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
 676   unlink $tmpfile1;
 677   unlink $tmpfile2;
 678   return 0 if ($?);
 679
 680   return 1;
 681 }
 682 ############################################################################################################################################
 683 # convertor BLC -> MSF
 684 sub msf2fasta {
 685   my $infile  = shift;
 686   my $outfile = shift;
 687
 688   my $sequence = "";
 689   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 690   while (<$IN>) {
 691     $sequence .= $_;
 692   }
 693   close($IN);
 694
 695   $sequence =~ s/~/\./g;
 696
 697   ## check for non-unique sequence names - readseq has problems with these
 698   my @lines = split /\n/, $sequence;
 699   my %names;
 700   foreach my $line (@lines) {
 701     if ( $line =~ /Name:\s+(\S+)/ ) {
 702       die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
 703       $names{$1}++;
 704     }
 705   }
 706
 707   my $tmpfile1 = File::Temp->new->filename;
 708   my $tmpfile2 = File::Temp->new->filename;
 709   open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
 710   print $FILE $sequence;
 711   close $FILE;
 712
 713   $? = 0;
 714   system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
 715   die "Reformatting of $infile failed. exit...\n" if ($?);
 716   unlink $tmpfile1;
 717
 718   $? = 0;
 719   system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
 720   die "Reformatting of $infile failed. exit...\n" if ($?);
 721   unlink $tmpfile2;
 722
 723   die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
 724 }
 725
 726 ############################################################################################################################################
 727 # convertor BLC -> MSF
 728 sub blc2msf {
 729   my ( $in, $out ) = @_;
 730
 731   my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
 732
 733   open my $fh, ">$tmp" or die "$tmp: $!\n";
 734   print $fh "silent_mode\nblock_file $infile\n";
 735   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 736   print $fh "pir_save $tmp_pir\nsetup\n";
 737   close $fh;
 738
 739   $? = 0;
 740   system("$alscript -s -f $tmp");
 741   die "Reformatting of $infile failed. exit...\n" if ($?);
 742   system("$readseq -f=MSF -o=$out -a $tmp_pir");
 743
 744   unlink $tmp;
 745   unlink $tmp_pir;
 746 }
 747
 748 #####################################################################################################
 749
 750 =begin :private
 751
 752 =head2 fasta2concise ($infile, $outfile)
 753
 754 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 755
 756 =cut
 757
 758 sub fasta2concise {
 759   my $infile  = shift;
 760   my $outfile = shift;
 761
 762   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 763   my ( $seq, @seqs, @title );
 764   while (<$IN>) {
 765     if (s/^>//) {
 766       if ($seq) {
 767         $seq =~ s/\n|\s//g;
 768         $seq =~ s/(.)/$1,/g;
 769         push @seqs, $seq;
 770         $seq = "";
 771       }
 772       chomp;
 773       push @title, $_;
 774     } else {
 775       chomp;
 776       $seq .= $_;
 777     }
 778   }
 779   $seq =~ s/\n|\s//g;
 780   $seq =~ s/(.)/$1,/g;
 781   push @seqs, $seq;
 782   close($IN);
 783
 784   if ( @title != @seqs ) {
 785     die("non matching number of titles and sequences!\n");
 786   }
 787
 788   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 789   foreach ( 0 .. $#title ) {
 790     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 791   }
 792   close($OUT);
 793 }
 794
 795 =begin :private
 796
 797 =head2 concise2fasta ($infile, $outfile)
 798
 799 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 800
 801 =cut
 802
 803 #####################################################################################################
 804 sub concise2fasta {
 805   my $infile  = shift;
 806   my $outfile = shift;
 807
 808   my ( @seqs, %seq, @pred, %pred );
 809
 810   my @var = (
 811     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 812     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 813   );
 814
 815   # parse input concise file
 816   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 817   while (<$IN>) {
 818     unless (/^\n/) {
 819       my ( $id, $seq ) = split( ":", $_ );
 820       if ( defined $id and defined $seq ) {    # Check we have proper values
 821         $seq =~ s/,//g;
 822         chomp $seq;
 823         if ( $id =~ /;/ ) {                    # Then it's an alignment
 824           @_ = split( ";", $id );
 825           push @seqs, $_[1];
 826           $seq{ $_[1] } = $seq;
 827         }
 828         foreach my $v (@var) {
 829           if ( $v eq $id ) {
 830             push @pred, $v;
 831             $pred{$v} = $seq;
 832           }
 833         }
 834       }
 835     }
 836   }
 837   close($IN);
 838
 839   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 840
 841   # print out sequences found with PSI-BLAST
 842   foreach (@seqs) {
 843     $seq{$_} =~ s/(.{72})/$1\n/g;
 844     print $OUT ">$_\n$seq{$_}\n";
 845   }
 846
 847   # print out predictions
 848   foreach my $p (@pred) {
 849     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 850     $pred{$p} =~ s/B/E/g;
 851     $pred{$p} =~ s/G/H/g;
 852
 853     if ( $p =~ /SOL/ ) {
 854       $pred{$p} =~ s/E/B/g;
 855     }
 856     $pred{$p} =~ s/(.{72})/$1\n/g;
 857     print $OUT ">$p\n$pred{$p}\n";
 858   }
 859   close($OUT);
 860 }
 861
 862 =begin :private
 863
 864 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 865
 866 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 867 Does it in place.
 868
 869 =cut
 870
 871 #####################################################################################################
 872 sub remove_seq_masks {
 873   my $idx = shift;
 874   my $seq = shift;
 875
 876   # Only bother if we have a sequence which has been masked
 877   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 878
 879   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 880   my $searchdb = $database->{$db_entry}{unfiltered};
 881   $start--;    # We need this to be zero based
 882
 883   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 884   my @seqs = $f->get_entries;
 885   my $db   = shift @seqs;
 886
 887   unless ($db) {
 888     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 889   } elsif (@seqs) {
 890     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 891   }
 892
 893   my @db_seq = @{ $db->seq };
 894
 895   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 896   my $i = 0;
 897   for ( 0 .. $#align ) {
 898     next if $align[$_] =~ /$GAP/;
 899
 900     if ( $align[$_] eq 'X' ) {
 901       unless ( defined $db_seq[ $i + $start ] ) {
 902         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 903       }
 904       $align[$_] = $db_seq[ $i + $start ];
 905     }
 906     $i++;
 907   }
 908   $seq->align( \@align );
 909 }
 910
 911 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 912
 913 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 914
 915 Equivilent to reduce script.
 916
 917 =cut
 918
 919 #####################################################################################################
 920 sub reduce {
 921   my ( $max, @seqs ) = @_;
 922
 923   my $num = int( @seqs / $max );
 924   my @res = shift @seqs;
 925
 926   # Don't bother if we have less than the required sequences
 927   if ( @seqs < $max or 0 == $num ) {
 928     return @res, @seqs;
 929   }
 930
 931   for ( 0 .. $#seqs ) {
 932     push @res, $seqs[$_] if ( 0 == $_ % $num );
 933   }
 934
 935   return @res;
 936 }
 937
 938 =head2 nonred($cutoff, @PSISEQS);
 939
 940 Removes redundant sequences at $cutoff level of sequence identity.
 941
 942 Equivilent to nonred script.
 943
 944 =cut
 945
 946 #####################################################################################################
 947 sub nonred {
 948   my ( $cutoff, @seqs ) = @_;
 949
 950   # Run pairwise and then OC and remove similar sequences
 951   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 952   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 953   if     ( @seqs == 1 ) {
 954     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 955     return @seqs;
 956   }
 957
 958   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 959   my $fasta = FASTA::File->new;
 960   foreach (@seqs) {
 961     my $f = FASTA->new( id => $_->id );
 962     $f->seq( @{ $_->align } );
 963     $fasta->add_entries($f);
 964   }
 965
 966   # Run pairwise
 967   my $pair = Pairwise->new( path => $pairwise );
 968   my $foo = join "\n", $pair->run($fasta) or die $!;
 969
 970   # Run OC
 971   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 972   $ocres->read_string( join "", $ocres->run($foo) );
 973
 974   # Now found those entries that are unrelated
 975   my @keep;
 976   for ( $ocres->get_groups ) {
 977     my ( $group, $score, $size, @labels ) = @{$_};
 978
 979     # Keep the QUERY from the cluster, otherwise just get onelt
 980     if ( grep { /QUERY/ } @labels ) {
 981
 982       # Make sure the query stays at the start
 983       unshift @keep, "QUERY";
 984       next;
 985     } else {
 986       push @keep, shift @labels;
 987     }
 988   }
 989   push @keep, $ocres->get_unclustered;
 990
 991   # Return the matching entries.
 992   # We use the temporay to mimic the selection of sequences in the nonred
 993   # script, which took the sequences in the order they occured in the file.
 994   my @filtered_res;
 995   for (@seqs) {
 996     my $label = $_->id;
 997     if ( grep { $_ =~ /^$label$/ } @keep ) {
 998       push @filtered_res, $_;
 999     }
1000   }
1001
1002   return @filtered_res;
1003 }
1004
1005 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1006
1007 Removes those sequences that are more than $percentage longer or shorter than the first
1008 sequence in @seqs. @seqs is an array of PSISEQ objects.
1009
1010 Equivilent to trim_seqs script.
1011
1012 =cut
1013
1014 #####################################################################################################
1015 sub del_long_seqs {
1016   my ( $factor, @seqs ) = @_;
1017
1018   # Keep the query sequence (we assume query is the 1st FASTA record)
1019   my $query = shift @seqs;
1020   my @res   = $query;
1021
1022   # Find the length of the query sequence without any gaps
1023   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1024
1025   # Calculate the upper and lower allowed bounds
1026   my ( $upper, $lower );
1027   {
1028     my $bounds = ( $length / 100 ) * $factor;
1029     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1030   }
1031
1032   # Now try each sequence and see how long they are
1033   for (@seqs) {
1034     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1035     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1036   }
1037
1038   return @res;
1039 }
1040
1041 =head2 toupper ($PSISEQ)
1042
1043 Converts sequence held in PSISEQ object to uppercase.
1044
1045 =cut
1046
1047 #####################################################################################################
1048 sub toupper {
1049   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1050 }
1051
1052 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1053
1054 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1055
1056 Equivilent to fasta2jnet script.
1057
1058 =cut
1059
1060 #####################################################################################################
1061 sub degap {
1062   my (@seqs) = @_;
1063
1064   # Find where the gaps in the query sequence are
1065   my @gaps;
1066   my $i = 0;
1067   for ( @{ $seqs[0]->align } ) {
1068     push @gaps, $i if $_ !~ /$GAP/;
1069     $i++;
1070   }
1071
1072   return unless @gaps;
1073
1074   # Remove the gaps in all the sequences.
1075   # Derefences the array reference and takes a slice inside the method call
1076   # arguments, then coerce back to an array ref.
1077   for (@seqs) {
1078     $_->align( [ @{ $_->align }[@gaps] ] );
1079   }
1080 }
1081
1082 =head2 getfreq($filename, @PSISEQS)
1083
1084 Creates a PSIBLAST like profile and writes it to the $filename.
1085
1086 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1087 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1088 but it's *not* the same.
1089
1090 =cut
1091
1092 #####################################################################################################
1093 sub getfreq {
1094   my ( $fn, @seqs ) = @_;
1095
1096   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1097
1098   # Create a PSIBLAST like profile from the alignment
1099   # This doesn't greate the same result as old Jpred, I think this is because
1100   # differences in the input between old and new
1101
1102   # For testing
1103   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1104   #   my @profile = profile(
1105   #      map { join "", @{ $_->seq } } $f->get_entries
1106   #   );
1107
1108   my @profile = profile( map { join "", @{ $_->align } } @seqs );
1109
1110   open my $fh, ">$fn" or die "JPRED: $fn: $!";
1111   print $fh join( " ", @{$_} ), "\n" for @profile;
1112 }
1113
1114 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1115
1116 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1117
1118 Equivilent to gethmm script.
1119
1120 =cut
1121
1122 #####################################################################################################
1123 sub hmmer {
1124   my (@seqs) = @_;
1125
1126   # Temp files required
1127   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1128
1129   # Create the FASTA file
1130   psiseq2fasta( $tmp_fasta, @seqs );
1131
1132   # Run HMMer
1133   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1134   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1135
1136   # Read in the HMMER file
1137   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1138
1139   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1140   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1141   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1142
1143   return $hmmer_jnet;
1144 }
1145
1146 =head2 jnet($hmmer, $out, $pssm )
1147
1148 Runs Jnet. Pass the paths of the filenames to do the prediction on
1149
1150 =cut
1151
1152 #################################################################################################################################################
1153
1154 =head2 get_hmm($alignFile, $name)
1155
1156 Adapted from the hmmer() function in Jpred as written by JDB.
1157
1158 Generates an HMMer profile output compatible with Jnet.
1159
1160 Equivilent to gethmm script.
1161
1162 =cut
1163
1164 sub hmm_for_align {
1165   my $fastafile = shift;
1166
1167   # Temp files required
1168   print "hmm_for_align: fastafile = $fastafile\n";
1169   my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1170
1171   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1172   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1173
1174   # Read in the HMMER file
1175   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1176
1177   # Read in the HMMER file
1178   my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1179
1180   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1181   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1182   $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1183
1184   return $hmmer_jnet;
1185 }
1186
1187 #####################################################################################################
1188 sub jnet {
1189   my ( $hmmer, $outfile, $pssm ) = @_;
1190   if ($pssm) {
1191
1192     # run Jnet prediction with all the profile data
1193     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1194   } else {
1195
1196     # run Jnet prediction with only HMMer and alignment data
1197     system("$jnet --concise --hmmer $hmmer > $outfile");
1198   }
1199   check( "jnet", $? ) or exit 1;
1200   my $res     = "";
1201   my $logging = "";
1202   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1203   while (<$IN>) {
1204     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1205       $res .= $_;
1206     } else {
1207       $logging .= $_;
1208     }
1209   }
1210   close $IN;
1211   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1212   print $OUT "\n" . $res;
1213   close $OUT;
1214   return $logging;
1215 }
1216
1217 =head1 psiseq2fasta($path, @PSISEQ)
1218
1219 Writes a FASTA file given an array of PSISEQ objects.
1220
1221 =cut
1222
1223 #####################################################################################################
1224 sub psiseq2fasta {
1225   my ( $fn, @seqs ) = @_;
1226
1227   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1228   confess "JPRED: not passed PSISEQs" unless @seqs;
1229
1230   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1231
1232   # Changed from using FASTA::File object due to seg faulting on some large,
1233   # long alignments, presumably due to running out of memory.
1234   #   my $fasta = FASTA::File->new;
1235   #   $fasta->add_entries(
1236   #      map {
1237   #         my $f = FASTA->new(id => $_->id);
1238   #         $f->seq(@{$_->align});
1239   #         $f;
1240   #      } @seqs
1241   #   );
1242   #   $fasta->write_file($fn);
1243
1244   my $fh;
1245   $fn =~ /\.gz$/
1246     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1247     : ( open $fh, ">$fn" or die "JPRED: $!" );
1248
1249   for (@seqs) {
1250     print $fh ">", $_->id, "\n";
1251     my $seq = join( "", @{ $_->align } );
1252     $seq =~ s/(.{72})/$1\n/g;
1253     chomp $seq;
1254     print $fh $seq . "\n";
1255   }
1256   close $fh;
1257 }
1258
1259 =head1 @PSISEQ = fasta2psiseq($path)
1260
1261 Reads in a FASTA file and returns an array of PSISEQ objects.
1262
1263 =cut
1264
1265 #####################################################################################################
1266 sub fasta2psiseq {
1267   my ($fn) = @_;
1268
1269   my $fasta =
1270     $fn =~ /\.gz$/
1271     ? FASTA::File->new( read_gzip_file => $fn )
1272     : FASTA::File->new( read_file      => $fn );
1273   $fasta or die "JPRED: $!";
1274
1275   my @seqs;
1276   for ( $fasta->get_entries ) {
1277     my $seq = PSISEQ->new;
1278     $seq->id( $_->id );
1279     $seq->align( [ @{ $_->seq } ] );
1280     push @seqs, $seq;
1281   }
1282
1283   return @seqs;
1284 }
1285
1286 #####################################################################################################
1287 sub fasta2psiseq_lowmemory {
1288   my $filename = shift;
1289
1290   local $/ = "\n";
1291   my $fh;
1292   $filename =~ /\.gz$/
1293     ? ( open $fh, "gzip -dc $filename|" or die $! )
1294     : ( open $fh, $filename or die $! );
1295
1296   my @seqs;
1297   my ( $id, $seq );
1298   while (<$fh>) {
1299     chomp;
1300     if (s/^>//) {
1301       $seq =~ s/ //g if defined $seq;
1302
1303       if ( $id and $seq ) {
1304         my $psi = PSISEQ->new( id => $id );
1305         $psi->align( [ split //, $seq ] );
1306         push @seqs, $psi;
1307         undef $id;
1308         undef $seq;
1309       } else {
1310         $id = $_;
1311       }
1312     } else {
1313       $seq .= $_;
1314     }
1315   }
1316   if ( $id and $seq ) {
1317     my $psi = PSISEQ->new( id => $id );
1318     $psi->seq($seq);
1319     push @seqs, $psi;
1320   }
1321
1322   return @seqs;
1323 }
1324
1325 #####################################################################################################
1326
1327 =head1 extend($FASTA::File, @PSISEQ)
1328
1329 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1330 against the query, this function extends the alignment so that all of the query is
1331 present in the alignment. The other sequences are extended with gap characters.
1332
1333 =cut
1334
1335 sub extend {
1336   my ( $query, @seqs ) = @_;
1337
1338   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1339   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1340
1341   # Get the query sequence
1342   my $q_query = $query->get_entry(0);
1343
1344   # Get the query sequence from the PSIBLAST alignment
1345   my $a_query = shift @seqs;
1346
1347   # The gap size required to expand the alignment
1348   my ( $start_gap_length, $end_gap_length );
1349   {
1350
1351     # Make handling the sequence easier
1352     my $q_seq = join "", @{ [ $q_query->seq ] };
1353     my $a_seq = join "", @{ $a_query->align };
1354
1355     $q_seq =~ s/-//g;
1356     $a_seq =~ s/-//g;
1357
1358     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1359
1360     $start_gap_length = index( $q_seq, $a_seq );
1361     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1362
1363     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1364   }
1365
1366   # Add the gaps to the end of the alignments
1367   for (@seqs) {
1368     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1369   }
1370
1371   # Put the query sequence back
1372   unshift @seqs,
1373     PSISEQ->new(
1374     id    => $a_query->id,
1375     align => [
1376       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1377       @{ $a_query->align },
1378       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1379     ]
1380     );
1381
1382   return @seqs;
1383 }
1384
1385 #####################################################################################################
1386
1387 =head1 $int = fasta_seq_length($path)
1388
1389 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1390
1391 =cut
1392
1393 sub fasta_seq_length {
1394   my ($fn) = @_;
1395   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1396
1397   my $length = 0;
1398   local $_, $/ = "\n";
1399   while (<$fh>) {
1400     next if /^>/;
1401     chomp;
1402     $length += tr///c;
1403   }
1404
1405   return $length;
1406 }