jpred/jpred_jabaws.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program for predicting the secondary structure of a multiple sequence alignment or a protein sequence.
  14 The input file can be stored in 3 formats: FASTA, MSF, or BLC.
  15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and
  16 predicting the secondary structure with Jnet. For the multiple sequence alignment only the HMM profile,
  17 created from the alignment, is used in Jnet.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -in <FILE1>
  24
  25 The path to the sequence file (in FASTA, MSF, or BLC format)
  26
  27 =item -output <FILEPREFIX>
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  30
  31 =item -outfile <FILE2>
  32
  33 An output file, by default it is undefined and the -output option is working
  34
  35 =item -logfile <FILE3>
  36
  37 Logging file, by default it is undefined and logging switched off
  38
  39 =item -dbpath PATH
  40
  41 Path to the uniref database used for PSI-BLAST querying. default value: .
  42
  43 =item -dbname database
  44
  45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  46 Default value: uniref90
  47
  48 =item -psi path
  49
  50 Path to a PSIBLAST output file.
  51
  52 =item -ncpu NNN
  53
  54 Number of CPU used by jpred.pl. Maximum value is 8
  55 Default: NNN = 1
  56
  57 =item -pred-nohits
  58
  59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  60
  61 =item -no-final
  62
  63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  64 The option untoggles this step and stops jpred at the concise file generation
  65
  66 =item -jabaws
  67
  68 Redefines some basic variables in order to use the script within JABAWS
  69
  70 =item -verbose
  71
  72 Verbose mode. Print more information while running jpred.
  73
  74 =item -debug
  75
  76 Debug mode. Print debugging information while running jpred.
  77
  78 =item -help
  79
  80 Gives help on the programs usage.
  81
  82 =item -man
  83
  84 Displays this man page.
  85
  86 =back
  87
  88 =head1 BUGS
  89
  90 Can't cope with gaps in the query sequence.
  91
  92 Doesn't accept alignments.
  93
  94 If you find any others please contact authors.
  95
  96 =head1 AUTHORS
  97
  98 Jonathan Barber <jon@compbio.dundee.ac.uk>
  99 Chris Cole <christian@cole.name>
 100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 101
 102 =cut
 103
 104 ## TODO check for gaps in query sequence
 105 ##      check for disallowed chars in sequence
 106
 107 use strict;
 108 use warnings;
 109 use Getopt::Long;
 110 use Pod::Usage;
 111 use Carp;
 112 use Data::Dumper;
 113 use File::Temp;
 114
 115 # path to Jpred modules
 116 use FindBin qw($Bin);
 117 use lib "$Bin/lib";
 118
 119 # internal jpred modules
 120 use Jpred;
 121 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
 122 use HMMER::Profile;
 123 use HMMER::Profile::Jnet;
 124 use PSIBLAST;
 125 use PSIBLAST::PSSM;
 126 use PSIBLAST::Run;
 127 use FASTA::File;
 128 use FASTA;
 129 use Index;
 130 use Pairwise;
 131 use OC;
 132 use Utils qw(profile);
 133 use Run qw(check);
 134
 135 our $VERSION = '3.0.2';
 136 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 137 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 138
 139 # Gap characters in sequences
 140 our $GAP = '-';
 141
 142 #####################################################################################################
 143 # Light object to hold sequence information, light to prevent memory overload
 144 # for big search results. Even so this is quite memory intensive, I can't see
 145 # how to reduce the size further.
 146 {
 147
 148   package PSISEQ;
 149
 150   sub new {
 151     my ( $class, %args ) = @_;
 152     my $self = [];
 153     bless $self, $class;
 154     for ( keys %args ) { $self->$_( $args{$_} ) }
 155     return $self;
 156   }
 157
 158   sub id {
 159     return defined $_[1]
 160       ? $_[0]->[0] = $_[1]
 161       : $_[0]->[0];
 162   }
 163
 164   sub start {
 165     return defined $_[1]
 166       ? $_[0]->[1] = $_[1]
 167       : $_[0]->[1];
 168   }
 169
 170   sub align {
 171     if ( defined $_[1] ) {
 172       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 173
 174       $_[0]->[2] = join( "", @{ $_[1] } );
 175     } else {
 176       return [ split( //, $_[0]->[2] ) ];
 177     }
 178   }
 179 }
 180
 181 #####################################################################################################
 182 my $infile;
 183 my $infastafile;
 184 my $format;
 185 my $goal = "align";
 186 my $seqgoal;
 187 my $outfile;
 188 my $prefix;
 189 my $psiblast;
 190 my $jabaws;
 191 my $logfile;
 192 my $ncpu       = 1;                              # number of CPUs for psiblast calculations
 193 my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
 194 my $db_path    = "/homes/www-jpred/databases";
 195 my $db_entry   = "cluster";
 196 my $nofinal;
 197
 198 my ( $help, $man, $DEBUG, $VERBOSE );
 199
 200 GetOptions(
 201   "in=s"        => \$infile,
 202   "output=s"    => \$prefix,
 203   "outfile=s"   => \$outfile,
 204   "logfile=s"   => \$logfile,
 205   "psi=s"       => \$psiblast,
 206   "dbname=s"    => \$db_entry,
 207   "dbpath=s"    => \$db_path,
 208   "ncpu=s"      => \$ncpu,
 209   "pred-nohits" => \$predNoHits,
 210   "no-final"    => \$nofinal,
 211   "jabaws"      => \$jabaws,
 212
 213   "help"    => \$help,
 214   "man"     => \$man,
 215   "debug"   => \$DEBUG,
 216   "verbose" => \$VERBOSE
 217 ) or pod2usage(2);
 218 pod2usage(1) if $help;
 219 pod2usage( verbose => 2 ) if $man;
 220
 221 #####################################################################################################
 222 # Key to database information and information for accessing them
 223 my $database = {
 224
 225   ## default database used by Jpred
 226   uniref90 => {
 227     database   => $db_path . "/uniref90.filt",
 228     unfiltered => $db_path . "/uniref90",
 229   },
 230
 231   ## database used during Jnet training
 232   training => {
 233     database   => $db_path . "/training/uniref90.filt",
 234     unfiltered => $db_path . "/training/uniref90",
 235   },
 236
 237   ## database used for ported Jpred
 238   ported_db => {
 239     database   => $db_path . "/uniref90.filt",
 240     unfiltered => $db_path . "/uniref90",
 241   },
 242
 243   ## cluster-specific path for Jpred
 244   cluster => {
 245     database   => $db_path . "/uniref90.filt",
 246     unfiltered => $db_path . "/uniref90",
 247   },
 248
 249   ## these other DBs are experimental ones used during development.
 250   ## check they exist before using them.
 251   # generic entry for use with validate_jnet.pl
 252   # real db location defined by validate_jnet.pl
 253   swall => {
 254     database   => $db_path . "/swall/swall.filt",
 255     unfiltered => $db_path . "/swall/swall",
 256   },
 257
 258   # Path to PSIBLAST db
 259   uniprot => {
 260     database   => $db_path . "/3/swall.filt",
 261     unfiltered => $db_path . "/3/swall",
 262   },
 263
 264   uniref50 => {
 265     database   => $db_path . "/6/swall.filt",
 266     unfiltered => $db_path . "/6/swall",
 267   },
 268 };
 269
 270 my $dpf = $database->{$db_entry}{database}.'.pal';
 271 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
 272 pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
 273 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
 274 pod2usage("ERROR! UNIREF filtered   database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
 275 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
 276
 277 #####################################################################################################
 278 # lots of preparation steps
 279 if ( defined $prefix ) {
 280   unless ( defined $outfile ) {
 281     $outfile = $prefix . ".res.fasta";
 282   }
 283 } else {
 284   if ( defined $outfile ) {
 285     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 286     $prefix = $outfile . ".tmp";
 287   } else {
 288     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 289     $prefix  = $infile . ".res";
 290     $outfile = $prefix . ".jnet";
 291   }
 292 }
 293
 294 if ( 1 > $ncpu or 8 < $ncpu ) {
 295   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 296   $ncpu = 1;
 297 }
 298
 299 $VERBOSE = 1 if $DEBUG;
 300
 301 my $LOG;
 302 if ( defined $logfile ) {
 303   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 304 }
 305
 306 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 307   my ( $key, $value ) = @{$_};
 308   defined $value or next;
 309   my $par = join( ": ", $key, $value );
 310   print "$par\n" if $DEBUG;
 311   print $LOG "$par\n" if $LOG;
 312 }
 313
 314 if ( defined $jabaws ) {
 315   setup_jpred_env($Bin);
 316   setup_env($Bin);
 317 }
 318 my $platform = check_OS();
 319 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 320 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 321
 322 #####################################################################################################
 323 # check input file format
 324 my $nseq = check_FASTA_format($infile);
 325 if ( 0 < $nseq ) {
 326   $format = "fasta";
 327   if ( 1 == $nseq ) {
 328     # one FASTA record
 329     $goal = 'seq';
 330   } else {
 331     unless ( 0 < check_FASTA_alignment($infile)) {
 332       die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n";
 333     }
 334   }
 335 } elsif ( 0 < check_MSF_format($infile) ) {
 336   $format = "msf";
 337 } elsif ( 0 < check_BLC_format($infile) ) {
 338   $format = "blc";
 339 } else {
 340   die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
 341 }
 342 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
 343
 344 #####################################################################################################
 345 if ( 'seq' eq $goal ) {
 346   my $query = FASTA::File->new( read_file => $infile );
 347   my @seqs;
 348   my $psi = PSIBLAST->new;
 349   unless ( defined $psiblast && -e $psiblast ) {
 350     ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 351     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 352     # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 353     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 354     my $psi_run = PSIBLAST::Run->new(
 355       debug    => $DEBUG,
 356       args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 357       path     => $psiblastbin,
 358       input    => $infile,
 359       output   => "$prefix.blast",
 360       matrix   => "$prefix.profile",
 361       database => $database->{$db_entry}{database},
 362     );
 363
 364     # For reduced databases, get the size of the orginal and use this as
 365     # the equivilent DB size
 366     if ( $db_entry =~ /^sp_red_/ ) {
 367       ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 368       $psi_run->args(
 369         $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 370       );
 371     }
 372     print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
 373     print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
 374     print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 375     print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 376
 377     print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 378     print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 379     $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 380
 381     $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 382     system("gzip -9f $prefix.blast");
 383   } else {
 384     if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 385     else                         { $psi->read_file($psiblast) }                        # CC ditto above
 386   }
 387
 388 #####################################################################################################
 389   # Convert the last itteration into a collection of PSISEQ objects
 390   for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 391     my ( $id, $start, $align ) = @{$_};
 392     push @seqs,
 393       PSISEQ->new(
 394       id    => $id,
 395       start => $start,
 396       align => [ split( //, $align ) ]
 397       );
 398   }
 399
 400 #####################################################################################################
 401   # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
 402   # Accuracy won't be as good, but at least it's a prediction
 403   if ( @seqs == 0 ) {
 404     if ( $predNoHits == 0 ) {
 405       warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 406       print ">>100% complete\n";
 407       print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 408       print $LOG ">>100% complete\n"                                                                         if $LOG;
 409       close($LOG)                                                                                            if $LOG;
 410       exit;
 411     } else {
 412       print ">>50% complete\n";
 413       warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 414       print "Running HMMer on query...\n"                                                            if $VERBOSE;
 415       print $LOG ">>50% complete\n"                                                                  if $LOG;
 416       print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 417       print $LOG "Running HMMer on query...\n"                                                       if $LOG;
 418
 419       # copy input query to alignment
 420       #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 421       open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 422       open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 423       while (<$ifh>) { print $ofh, $_ }
 424       close $ifh;
 425       close $ofh;
 426
 427       # Temp files required for HMMer
 428       my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 429       system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 430       system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 431       unlink $hmmbuild_out;
 432
 433       # Read in the HMMER file
 434       my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 435
 436       # Convert from HMMER::Profile to HMMER::Profile::Jnet
 437       my $hmmer_jnet = HMMER::Profile::Jnet->new;
 438       $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 439       $hmmer_jnet->write_file("$prefix.hmm");
 440       print ">>70% complete\n";
 441       print $LOG ">>70% complete\n" if $LOG;
 442
 443       # Run Jnet for the prediction
 444       print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 445       print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 446       my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 447       print ">>90% complete\n";
 448       print $LOG ">>90% complete\n" if $LOG;
 449       print $jnetlog                if $VERBOSE;
 450       print $LOG $jnetlog           if $LOG;
 451     }
 452   } else {
 453     psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 454     print ">>40% complete\n";
 455     print $LOG ">>40% complete\n" if $LOG;
 456
 457     #####################################################################################################
 458     # Make PSIBLAST truncated alignments the right length
 459     print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 460     print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 461     @seqs = extend( $query, @seqs );
 462     psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 463
 464     #####################################################################################################
 465     # Remove masking from PSIBLAST alignment
 466     print "Unmasking the alignments...\n" if $VERBOSE;
 467     print $LOG "Unmasking the alignments...\n" if $LOG;
 468     my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 469     remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 470     psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 471
 472     #####################################################################################################
 473     # Convert the sequences to upper case
 474     print "Converting sequences to the same case...\n" if $VERBOSE;
 475     print $LOG "Converting sequences to the same case...\n" if $LOG;
 476     toupper($_) for @seqs;    # CC sub is below
 477     psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 478
 479     #####################################################################################################
 480     # Remove excessive sequences
 481     print "Remove excessive sequences...\n" if $VERBOSE;
 482     print $LOG "Remove excessive sequences...\n" if $LOG;
 483     @seqs = reduce( $MAX_ALIGN, @seqs );
 484     psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 485
 486     #####################################################################################################
 487     # Remove sequences that are too long or too short
 488     print "Remove sequences which too long or short...\n" if $VERBOSE;
 489     print $LOG "Remove sequences which too long or short...\n" if $LOG;
 490     @seqs = del_long_seqs( 50, @seqs );
 491     psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 492
 493     #####################################################################################################
 494     # Remove redundant sequences based upon pairwise identity and OC clustering
 495     print "Remove redundant sequences...\n" if $VERBOSE;
 496     print $LOG "Remove redundant sequences...\n" if $LOG;
 497     @seqs = nonred( $NR_CUT, @seqs );
 498     psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 499
 500     #####################################################################################################
 501     # Check that we haven't got rid of all of the sequences
 502     if ( @seqs < 2 ) {
 503       warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 504       print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 505       @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 506     }
 507     unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 508
 509     # Remove gaps in the query sequence and same positions in the alignment
 510     print "Removing gaps in the query sequence...\n" if $VERBOSE;
 511     print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 512     degap(@seqs);
 513     psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 514
 515     # Output the alignment for the prediction
 516     print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 517     print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 518     psiseq2fasta( "$prefix.align", @seqs );
 519
 520     #####################################################################################################
 521     # Equivilent to getpssm script
 522     print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 523     print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 524     my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 525     $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 526     print ">>50% complete\n";
 527     print $LOG ">>50% complete\n" if $LOG;
 528
 529     #####################################################################################################
 530     # Run HMMER on the sequences
 531     print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 532     print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 533     my $hmmer = hmmer(@seqs);             # CC sub is below
 534     $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 535     print ">>70% complete\n";
 536     print $LOG ">>70% complete\n" if $LOG;
 537 #####################################################################################################
 538     # Run Jnet for the prediction
 539     print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 540     print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 541     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 542     print ">>90% complete\n";
 543     print $LOG ">>90% complete\n" if $LOG;
 544     print $jnetlog                if $VERBOSE;
 545     print $LOG $jnetlog           if $LOG;
 546   }
 547 } else {
 548   if ( 'fasta' eq $format ) {
 549     print "Read FASTA file\n";
 550     my $hmmer1 = hmm_for_align($infile);
 551     $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 552   } elsif ( 'msf' eq $format ) {
 553     msf2fasta( $infile, $infastafile );
 554     my $hmmer2 = hmm_for_align($infastafile);
 555     $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 556   } elsif ( 'blc' eq $format ) {
 557     my $tmpfile = File::Temp->new->filename;
 558     blc2msf( $infile, $tmpfile );
 559     msf2fasta( $infile, $infastafile );
 560     my $hmmer3 = hmm_for_align($infastafile);
 561     $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 562   } else {
 563     die "ERROR! unknown input format '$format'. exit...\n";
 564   }
 565   print ">>70% complete\n";
 566   print $LOG ">>70% complete\n" if $LOG;
 567 #####################################################################################################
 568   # Run Jnet for the prediction
 569   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 570   print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 571   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
 572   print ">>90% complete\n";
 573   print $LOG ">>90% complete\n" if $LOG;
 574   print $jnetlog                if $VERBOSE;
 575   print $LOG $jnetlog           if $LOG;
 576 }
 577
 578 unless ( defined $nofinal ) {
 579   my $aligfile      = $prefix . ".align";
 580   my $jnetfile      = $prefix . ".jnet";
 581   my $jnetfastafile = $prefix . ".jnet.fasta";
 582   $aligfile = $infile if ( 'fasta' eq $format );
 583   $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
 584   concise2fasta( $jnetfile, $jnetfastafile );
 585   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 586   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 587   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 588   while (<$IN2>) { print $OUT $_; }
 589   while (<$IN1>) { print $OUT $_; }
 590   close($IN1);
 591   close($IN2);
 592   close($OUT);
 593   unlink $jnetfastafile;
 594   my $seqs = FASTA::File->new( read_file => $outfile );
 595   open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
 596   $seqs->write($OUT2);
 597   close($OUT2);
 598 } else {
 599   if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
 600     rename $prefix . ".jnet", $outfile;
 601   }
 602 }
 603
 604 print ">>100% complete\n";
 605 print "Jpred Finished\n";
 606 print $LOG ">>100% complete\n" if $LOG;
 607 print $LOG "Jpred Finished\n"  if $LOG;
 608 close($LOG)                    if $LOG;
 609 exit;
 610
 611 #####################################################################################################
 612 # Functions
 613 #####################################################################################################
 614 sub check_FASTA_format {
 615   my $infile = shift;
 616
 617   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 618   my $check_first_line = 1;
 619   my $nseq             = 0;
 620   local $/ = "\n>";
 621   while (<$IN>) {
 622     if ($check_first_line) {
 623       return 0 unless (/^>/);
 624       $check_first_line = 0;
 625     }
 626     s/^>//g;
 627     s/>$//g;
 628
 629     my ( $id, @seqs ) = split /\n/, $_;
 630     return 0 unless ( defined $id or @seqs );
 631     my $seq = join( "", @seqs );
 632     return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
 633     ++$nseq;
 634   }
 635   close($IN);
 636
 637   return $nseq;
 638 }
 639 #####################################################################################################
 640 sub check_FASTA_alignment {
 641   my $infile = shift;
 642
 643   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 644   my $check_first_line = 1;
 645   my $nseq             = 0;
 646   my $seqlen = -1;
 647   local $/ = "\n>";
 648   while (<$IN>) {
 649     if ($check_first_line) {
 650       return 0 unless (/^>/);
 651       $check_first_line = 0;
 652     }
 653     s/^>//g;
 654     s/>$//g;
 655
 656     my ( $id, @seqs ) = split /\n/, $_;
 657     return 0 unless ( defined $id or @seqs );
 658     my $seq = join( "", @seqs );
 659     return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
 660     if (-1 == $seqlen) {
 661       $seqlen = length ($seq);
 662     } else {
 663       return 0 if ($seqlen != length ($seq) );
 664     }
 665     ++$nseq;
 666   }
 667   close($IN);
 668
 669   return $nseq;
 670 }
 671 #####################################################################################################
 672 sub check_MSF_format {
 673   my $infile = shift;
 674   $? = 0;
 675   system("$readseq -fMSF -a -p < $infile > /dev/null");
 676   return 0 if ($?);
 677
 678   return 1;
 679 }
 680 #####################################################################################################
 681 sub check_BLC_format {
 682   my $infile = shift;
 683
 684   my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
 685   open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
 686   print $fh "silent_mode\nblock_file $infile\n";
 687   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 688   print $fh "pir_save $tmpfile2\nsetup\n";
 689   close $fh;
 690
 691   $? = 0;
 692   system("$alscript -s -f $tmpfile1");
 693   $? = 0;
 694   system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
 695   unlink $tmpfile1;
 696   unlink $tmpfile2;
 697   return 0 if ($?);
 698
 699   return 1;
 700 }
 701 ############################################################################################################################################
 702 # convertor BLC -> MSF
 703 sub msf2fasta {
 704   my $infile  = shift;
 705   my $outfile = shift;
 706
 707   my $sequence = "";
 708   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 709   while (<$IN>) {
 710     $sequence .= $_;
 711   }
 712   close($IN);
 713
 714   $sequence =~ s/~/\./g;
 715
 716   ## check for non-unique sequence names - readseq has problems with these
 717   my @lines = split /\n/, $sequence;
 718   my %names;
 719   foreach my $line (@lines) {
 720     if ( $line =~ /Name:\s+(\S+)/ ) {
 721       die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
 722       $names{$1}++;
 723     }
 724   }
 725
 726   my $tmpfile1 = File::Temp->new->filename;
 727   my $tmpfile2 = File::Temp->new->filename;
 728   open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
 729   print $FILE $sequence;
 730   close $FILE;
 731
 732   $? = 0;
 733   system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
 734   die "Reformatting of $infile failed. exit...\n" if ($?);
 735   unlink $tmpfile1;
 736
 737   $? = 0;
 738   system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
 739   die "Reformatting of $infile failed. exit...\n" if ($?);
 740   unlink $tmpfile2;
 741
 742   die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
 743 }
 744
 745 ############################################################################################################################################
 746 # convertor BLC -> MSF
 747 sub blc2msf {
 748   my ( $in, $out ) = @_;
 749
 750   my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
 751
 752   open my $fh, ">$tmp" or die "$tmp: $!\n";
 753   print $fh "silent_mode\nblock_file $infile\n";
 754   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 755   print $fh "pir_save $tmp_pir\nsetup\n";
 756   close $fh;
 757
 758   $? = 0;
 759   system("$alscript -s -f $tmp");
 760   die "Reformatting of $infile failed. exit...\n" if ($?);
 761   system("$readseq -f=MSF -o=$out -a $tmp_pir");
 762
 763   unlink $tmp;
 764   unlink $tmp_pir;
 765 }
 766
 767 #####################################################################################################
 768
 769 =begin :private
 770
 771 =head2 fasta2concise ($infile, $outfile)
 772
 773 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 774
 775 =cut
 776
 777 sub fasta2concise {
 778   my $infile  = shift;
 779   my $outfile = shift;
 780
 781   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 782   my ( $seq, @seqs, @title );
 783   while (<$IN>) {
 784     if (s/^>//) {
 785       if ($seq) {
 786         $seq =~ s/\n|\s//g;
 787         $seq =~ s/(.)/$1,/g;
 788         push @seqs, $seq;
 789         $seq = "";
 790       }
 791       chomp;
 792       push @title, $_;
 793     } else {
 794       chomp;
 795       $seq .= $_;
 796     }
 797   }
 798   $seq =~ s/\n|\s//g;
 799   $seq =~ s/(.)/$1,/g;
 800   push @seqs, $seq;
 801   close($IN);
 802
 803   if ( @title != @seqs ) {
 804     die("non matching number of titles and sequences!\n");
 805   }
 806
 807   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 808   foreach ( 0 .. $#title ) {
 809     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 810   }
 811   close($OUT);
 812 }
 813
 814 =begin :private
 815
 816 =head2 concise2fasta ($infile, $outfile)
 817
 818 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 819
 820 =cut
 821
 822 #####################################################################################################
 823 sub concise2fasta {
 824   my $infile  = shift;
 825   my $outfile = shift;
 826
 827   my ( @seqs, %seq, @pred, %pred );
 828
 829   my @var = (
 830     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 831     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 832   );
 833
 834   # parse input concise file
 835   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 836   while (<$IN>) {
 837     unless (/^\n/) {
 838       my ( $id, $seq ) = split( ":", $_ );
 839       if ( defined $id and defined $seq ) {    # Check we have proper values
 840         $seq =~ s/,//g;
 841         chomp $seq;
 842         if ( $id =~ /;/ ) {                    # Then it's an alignment
 843           @_ = split( ";", $id );
 844           push @seqs, $_[1];
 845           $seq{ $_[1] } = $seq;
 846         }
 847         foreach my $v (@var) {
 848           if ( $v eq $id ) {
 849             push @pred, $v;
 850             $pred{$v} = $seq;
 851           }
 852         }
 853       }
 854     }
 855   }
 856   close($IN);
 857
 858   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 859
 860   # print out sequences found with PSI-BLAST
 861   foreach (@seqs) {
 862     $seq{$_} =~ s/(.{72})/$1\n/g;
 863     print $OUT ">$_\n$seq{$_}\n";
 864   }
 865
 866   # print out predictions
 867   foreach my $p (@pred) {
 868     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 869     $pred{$p} =~ s/B/E/g;
 870     $pred{$p} =~ s/G/H/g;
 871
 872     if ( $p =~ /SOL/ ) {
 873       $pred{$p} =~ s/E/B/g;
 874     }
 875     $pred{$p} =~ s/(.{72})/$1\n/g;
 876     print $OUT ">$p\n$pred{$p}\n";
 877   }
 878   close($OUT);
 879 }
 880
 881 =begin :private
 882
 883 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 884
 885 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 886 Does it in place.
 887
 888 =cut
 889
 890 #####################################################################################################
 891 sub remove_seq_masks {
 892   my $idx = shift;
 893   my $seq = shift;
 894
 895   # Only bother if we have a sequence which has been masked
 896   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 897
 898   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 899   my $searchdb = $database->{$db_entry}{unfiltered};
 900   $start--;    # We need this to be zero based
 901
 902   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 903   my @seqs = $f->get_entries;
 904   my $db   = shift @seqs;
 905
 906   unless ($db) {
 907     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 908   } elsif (@seqs) {
 909     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 910   }
 911
 912   my @db_seq = @{ $db->seq };
 913
 914   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 915   my $i = 0;
 916   for ( 0 .. $#align ) {
 917     next if $align[$_] =~ /$GAP/;
 918
 919     if ( $align[$_] eq 'X' ) {
 920       unless ( defined $db_seq[ $i + $start ] ) {
 921         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 922       }
 923       $align[$_] = $db_seq[ $i + $start ];
 924     }
 925     $i++;
 926   }
 927   $seq->align( \@align );
 928 }
 929
 930 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 931
 932 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 933
 934 Equivilent to reduce script.
 935
 936 =cut
 937
 938 #####################################################################################################
 939 sub reduce {
 940   my ( $max, @seqs ) = @_;
 941
 942   my $num = int( @seqs / $max );
 943   my @res = shift @seqs;
 944
 945   # Don't bother if we have less than the required sequences
 946   if ( @seqs < $max or 0 == $num ) {
 947     return @res, @seqs;
 948   }
 949
 950   for ( 0 .. $#seqs ) {
 951     push @res, $seqs[$_] if ( 0 == $_ % $num );
 952   }
 953
 954   return @res;
 955 }
 956
 957 =head2 nonred($cutoff, @PSISEQS);
 958
 959 Removes redundant sequences at $cutoff level of sequence identity.
 960
 961 Equivilent to nonred script.
 962
 963 =cut
 964
 965 #####################################################################################################
 966 sub nonred {
 967   my ( $cutoff, @seqs ) = @_;
 968
 969   # Run pairwise and then OC and remove similar sequences
 970   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 971   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 972   if     ( @seqs == 1 ) {
 973     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 974     return @seqs;
 975   }
 976
 977   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 978   my $fasta = FASTA::File->new;
 979   foreach (@seqs) {
 980     my $f = FASTA->new( id => $_->id );
 981     $f->seq( @{ $_->align } );
 982     $fasta->add_entries($f);
 983   }
 984
 985   # Run pairwise
 986   my $pair = Pairwise->new( path => $pairwise );
 987   my $foo = join "\n", $pair->run($fasta) or die $!;
 988
 989   # Run OC
 990   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 991   $ocres->read_string( join "", $ocres->run($foo) );
 992
 993   # Now found those entries that are unrelated
 994   my @keep;
 995   for ( $ocres->get_groups ) {
 996     my ( $group, $score, $size, @labels ) = @{$_};
 997
 998     # Keep the QUERY from the cluster, otherwise just get onelt
 999     if ( grep { /QUERY/ } @labels ) {
1000
1001       # Make sure the query stays at the start
1002       unshift @keep, "QUERY";
1003       next;
1004     } else {
1005       push @keep, shift @labels;
1006     }
1007   }
1008   push @keep, $ocres->get_unclustered;
1009
1010   # Return the matching entries.
1011   # We use the temporay to mimic the selection of sequences in the nonred
1012   # script, which took the sequences in the order they occured in the file.
1013   my @filtered_res;
1014   for (@seqs) {
1015     my $label = $_->id;
1016     if ( grep { $_ =~ /^$label$/ } @keep ) {
1017       push @filtered_res, $_;
1018     }
1019   }
1020
1021   return @filtered_res;
1022 }
1023
1024 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1025
1026 Removes those sequences that are more than $percentage longer or shorter than the first
1027 sequence in @seqs. @seqs is an array of PSISEQ objects.
1028
1029 Equivilent to trim_seqs script.
1030
1031 =cut
1032
1033 #####################################################################################################
1034 sub del_long_seqs {
1035   my ( $factor, @seqs ) = @_;
1036
1037   # Keep the query sequence (we assume query is the 1st FASTA record)
1038   my $query = shift @seqs;
1039   my @res   = $query;
1040
1041   # Find the length of the query sequence without any gaps
1042   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1043
1044   # Calculate the upper and lower allowed bounds
1045   my ( $upper, $lower );
1046   {
1047     my $bounds = ( $length / 100 ) * $factor;
1048     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1049   }
1050
1051   # Now try each sequence and see how long they are
1052   for (@seqs) {
1053     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1054     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1055   }
1056
1057   return @res;
1058 }
1059
1060 =head2 toupper ($PSISEQ)
1061
1062 Converts sequence held in PSISEQ object to uppercase.
1063
1064 =cut
1065
1066 #####################################################################################################
1067 sub toupper {
1068   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1069 }
1070
1071 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1072
1073 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1074
1075 Equivilent to fasta2jnet script.
1076
1077 =cut
1078
1079 #####################################################################################################
1080 sub degap {
1081   my (@seqs) = @_;
1082
1083   # Find where the gaps in the query sequence are
1084   my @gaps;
1085   my $i = 0;
1086   for ( @{ $seqs[0]->align } ) {
1087     push @gaps, $i if $_ !~ /$GAP/;
1088     $i++;
1089   }
1090
1091   return unless @gaps;
1092
1093   # Remove the gaps in all the sequences.
1094   # Derefences the array reference and takes a slice inside the method call
1095   # arguments, then coerce back to an array ref.
1096   for (@seqs) {
1097     $_->align( [ @{ $_->align }[@gaps] ] );
1098   }
1099 }
1100
1101 =head2 getfreq($filename, @PSISEQS)
1102
1103 Creates a PSIBLAST like profile and writes it to the $filename.
1104
1105 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1106 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1107 but it's *not* the same.
1108
1109 =cut
1110
1111 #####################################################################################################
1112 sub getfreq {
1113   my ( $fn, @seqs ) = @_;
1114
1115   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1116
1117   # Create a PSIBLAST like profile from the alignment
1118   # This doesn't greate the same result as old Jpred, I think this is because
1119   # differences in the input between old and new
1120
1121   # For testing
1122   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1123   #   my @profile = profile(
1124   #      map { join "", @{ $_->seq } } $f->get_entries
1125   #   );
1126
1127   my @profile = profile( map { join "", @{ $_->align } } @seqs );
1128
1129   open my $fh, ">$fn" or die "JPRED: $fn: $!";
1130   print $fh join( " ", @{$_} ), "\n" for @profile;
1131 }
1132
1133 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1134
1135 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1136
1137 Equivilent to gethmm script.
1138
1139 =cut
1140
1141 #####################################################################################################
1142 sub hmmer {
1143   my (@seqs) = @_;
1144
1145   # Temp files required
1146   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1147
1148   # Create the FASTA file
1149   psiseq2fasta( $tmp_fasta, @seqs );
1150
1151   # Run HMMer
1152   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1153   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1154
1155   # Read in the HMMER file
1156   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1157
1158   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1159   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1160   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1161
1162   return $hmmer_jnet;
1163 }
1164
1165 =head2 jnet($hmmer, $out, $pssm )
1166
1167 Runs Jnet. Pass the paths of the filenames to do the prediction on
1168
1169 =cut
1170
1171 #################################################################################################################################################
1172
1173 =head2 get_hmm($alignFile, $name)
1174
1175 Adapted from the hmmer() function in Jpred as written by JDB.
1176
1177 Generates an HMMer profile output compatible with Jnet.
1178
1179 Equivilent to gethmm script.
1180
1181 =cut
1182
1183 sub hmm_for_align {
1184   my $fastafile = shift;
1185
1186   # Temp files required
1187   print "hmm_for_align: fastafile = $fastafile\n";
1188   my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1189
1190   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1191   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1192
1193   # Read in the HMMER file
1194   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1195
1196   # Read in the HMMER file
1197   my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1198
1199   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1200   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1201   $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1202
1203   return $hmmer_jnet;
1204 }
1205
1206 #####################################################################################################
1207 sub jnet {
1208   my ( $hmmer, $outfile, $pssm ) = @_;
1209   if ($pssm) {
1210
1211     # run Jnet prediction with all the profile data
1212     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1213   } else {
1214
1215     # run Jnet prediction with only HMMer and alignment data
1216     system("$jnet --concise --hmmer $hmmer > $outfile");
1217   }
1218   check( "jnet", $? ) or exit 1;
1219   my $res     = "";
1220   my $logging = "";
1221   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1222   while (<$IN>) {
1223     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1224       $res .= $_;
1225     } else {
1226       $logging .= $_;
1227     }
1228   }
1229   close $IN;
1230   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1231   print $OUT "\n" . $res;
1232   close $OUT;
1233   return $logging;
1234 }
1235
1236 =head1 psiseq2fasta($path, @PSISEQ)
1237
1238 Writes a FASTA file given an array of PSISEQ objects.
1239
1240 =cut
1241
1242 #####################################################################################################
1243 sub psiseq2fasta {
1244   my ( $fn, @seqs ) = @_;
1245
1246   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1247   confess "JPRED: not passed PSISEQs" unless @seqs;
1248
1249   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1250
1251   # Changed from using FASTA::File object due to seg faulting on some large,
1252   # long alignments, presumably due to running out of memory.
1253   #   my $fasta = FASTA::File->new;
1254   #   $fasta->add_entries(
1255   #      map {
1256   #         my $f = FASTA->new(id => $_->id);
1257   #         $f->seq(@{$_->align});
1258   #         $f;
1259   #      } @seqs
1260   #   );
1261   #   $fasta->write_file($fn);
1262
1263   my $fh;
1264   $fn =~ /\.gz$/
1265     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1266     : ( open $fh, ">$fn" or die "JPRED: $!" );
1267
1268   for (@seqs) {
1269     print $fh ">", $_->id, "\n";
1270     my $seq = join( "", @{ $_->align } );
1271     $seq =~ s/(.{72})/$1\n/g;
1272     chomp $seq;
1273     print $fh $seq . "\n";
1274   }
1275   close $fh;
1276 }
1277
1278 =head1 @PSISEQ = fasta2psiseq($path)
1279
1280 Reads in a FASTA file and returns an array of PSISEQ objects.
1281
1282 =cut
1283
1284 #####################################################################################################
1285 sub fasta2psiseq {
1286   my ($fn) = @_;
1287
1288   my $fasta =
1289     $fn =~ /\.gz$/
1290     ? FASTA::File->new( read_gzip_file => $fn )
1291     : FASTA::File->new( read_file      => $fn );
1292   $fasta or die "JPRED: $!";
1293
1294   my @seqs;
1295   for ( $fasta->get_entries ) {
1296     my $seq = PSISEQ->new;
1297     $seq->id( $_->id );
1298     $seq->align( [ @{ $_->seq } ] );
1299     push @seqs, $seq;
1300   }
1301
1302   return @seqs;
1303 }
1304
1305 #####################################################################################################
1306 sub fasta2psiseq_lowmemory {
1307   my $filename = shift;
1308
1309   local $/ = "\n";
1310   my $fh;
1311   $filename =~ /\.gz$/
1312     ? ( open $fh, "gzip -dc $filename|" or die $! )
1313     : ( open $fh, $filename or die $! );
1314
1315   my @seqs;
1316   my ( $id, $seq );
1317   while (<$fh>) {
1318     chomp;
1319     if (s/^>//) {
1320       $seq =~ s/ //g if defined $seq;
1321
1322       if ( $id and $seq ) {
1323         my $psi = PSISEQ->new( id => $id );
1324         $psi->align( [ split //, $seq ] );
1325         push @seqs, $psi;
1326         undef $id;
1327         undef $seq;
1328       } else {
1329         $id = $_;
1330       }
1331     } else {
1332       $seq .= $_;
1333     }
1334   }
1335   if ( $id and $seq ) {
1336     my $psi = PSISEQ->new( id => $id );
1337     $psi->seq($seq);
1338     push @seqs, $psi;
1339   }
1340
1341   return @seqs;
1342 }
1343
1344 #####################################################################################################
1345
1346 =head1 extend($FASTA::File, @PSISEQ)
1347
1348 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1349 against the query, this function extends the alignment so that all of the query is
1350 present in the alignment. The other sequences are extended with gap characters.
1351
1352 =cut
1353
1354 sub extend {
1355   my ( $query, @seqs ) = @_;
1356
1357   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1358   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1359
1360   # Get the query sequence
1361   my $q_query = $query->get_entry(0);
1362
1363   # Get the query sequence from the PSIBLAST alignment
1364   my $a_query = shift @seqs;
1365
1366   # The gap size required to expand the alignment
1367   my ( $start_gap_length, $end_gap_length );
1368   {
1369
1370     # Make handling the sequence easier
1371     my $q_seq = join "", @{ [ $q_query->seq ] };
1372     my $a_seq = join "", @{ $a_query->align };
1373
1374     $q_seq =~ s/-//g;
1375     $a_seq =~ s/-//g;
1376
1377     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1378
1379     $start_gap_length = index( $q_seq, $a_seq );
1380     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1381
1382     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1383   }
1384
1385   # Add the gaps to the end of the alignments
1386   for (@seqs) {
1387     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1388   }
1389
1390   # Put the query sequence back
1391   unshift @seqs,
1392     PSISEQ->new(
1393     id    => $a_query->id,
1394     align => [
1395       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1396       @{ $a_query->align },
1397       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1398     ]
1399     );
1400
1401   return @seqs;
1402 }
1403
1404 #####################################################################################################
1405
1406 =head1 $int = fasta_seq_length($path)
1407
1408 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1409
1410 =cut
1411
1412 sub fasta_seq_length {
1413   my ($fn) = @_;
1414   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1415
1416   my $length = 0;
1417   local $_, $/ = "\n";
1418   while (<$fh>) {
1419     next if /^>/;
1420     chomp;
1421     $length += tr///c;
1422   }
1423
1424   return $length;
1425 }