binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
  14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
  15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
  16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
  17
  18 =head1 OPTIONS
  19
  20 =over 5
  21
  22 =item -in <FILE1>
  23
  24 The path to the sequence file (in FASTA, MSF, or BLC format)
  25
  26 =item -seq
  27
  28 The input file is a FASTA file with one sequence only.
  29
  30 =item -output <FILEPREFIX>
  31
  32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  33
  34 =item -outfile <FILE2>
  35
  36 An output file, by default it is undefined and the -output option is working
  37
  38 =item -logfile <FILE3>
  39
  40 Logging file, by default it is undefined and logging switched off
  41
  42 =item -dbpath PATH
  43
  44 Path to the uniref database used for PSI-BLAST querying. default value: .
  45
  46 =item -dbname database
  47
  48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  49 Default value: uniref90
  50
  51 =item -psi path
  52
  53 Path to a PSIBLAST output file.
  54
  55 =item -ncpu NNN
  56
  57 Number of CPU used by jpred.pl. Maximum value is 8
  58 Default: NNN = 1
  59
  60 =item -pred-nohits
  61
  62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  63
  64 =item -no-final
  65
  66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  67 The option untoggles this step and stops jpred at the concise file generation
  68
  69 =item -jabaws
  70
  71 Redefined basic variable in order to use the script within JABAWS
  72
  73 =item -verbose
  74
  75 Verbose mode. Print more information while running jpred.
  76
  77 =item -debug
  78
  79 Debug mode. Print debugging information while running jpred.
  80
  81 =item -help
  82
  83 Gives help on the programs usage.
  84
  85 =item -man
  86
  87 Displays this man page.
  88
  89 =back
  90
  91 =head1 BUGS
  92
  93 Can't cope with gaps in the query sequence.
  94
  95 Doesn't accept alignments.
  96
  97 If you find any others please contact authors.
  98
  99 =head1 AUTHORS
 100
 101 Jonathan Barber <jon@compbio.dundee.ac.uk>
 102 Chris Cole <christian@cole.name>
 103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 104
 105 =cut
 106
 107 ## TODO check for gaps in query sequence
 108 ##      check for disallowed chars in sequence
 109
 110 use strict;
 111 use warnings;
 112 use Getopt::Long;
 113 use Pod::Usage;
 114 use Carp;
 115 use Data::Dumper;
 116 use File::Temp;
 117
 118 # path to Jpred modules
 119 use FindBin qw($Bin);
 120 use lib "$Bin/lib";
 121
 122 # internal jpred modules
 123 use Jpred;
 124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
 125 use HMMER::Profile;
 126 use HMMER::Profile::Jnet;
 127 use PSIBLAST;
 128 use PSIBLAST::PSSM;
 129 use PSIBLAST::Run;
 130 use FASTA::File;
 131 use FASTA;
 132 use Index;
 133 use Pairwise;
 134 use OC;
 135 use Utils qw(profile);
 136 use Run qw(check);
 137
 138 our $VERSION = '3.0.2';
 139 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 140 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 141
 142 # Gap characters in sequences
 143 our $GAP = '-';
 144
 145 #####################################################################################################
 146 # Light object to hold sequence information, light to prevent memory overload
 147 # for big search results. Even so this is quite memory intensive, I can't see
 148 # how to reduce the size further.
 149 {
 150
 151   package PSISEQ;
 152
 153   sub new {
 154     my ( $class, %args ) = @_;
 155     my $self = [];
 156     bless $self, $class;
 157     for ( keys %args ) { $self->$_( $args{$_} ) }
 158     return $self;
 159   }
 160
 161   sub id {
 162     return defined $_[1]
 163       ? $_[0]->[0] = $_[1]
 164       : $_[0]->[0];
 165   }
 166
 167   sub start {
 168     return defined $_[1]
 169       ? $_[0]->[1] = $_[1]
 170       : $_[0]->[1];
 171   }
 172
 173   sub align {
 174     if ( defined $_[1] ) {
 175       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 176
 177       $_[0]->[2] = join( "", @{ $_[1] } );
 178     } else {
 179       return [ split( //, $_[0]->[2] ) ];
 180     }
 181   }
 182 }
 183
 184 #####################################################################################################
 185 my $infile;
 186 my $infastafile;
 187 my $format;
 188 my $goal = "align";
 189 my $seqgoal;
 190 my $outfile;
 191 my $prefix;
 192 my $psiblast;
 193 my $jabaws;
 194 my $logfile;
 195 my $ncpu       = 1;                              # number of CPUs for psiblast calculations
 196 my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
 197 my $db_path    = "/homes/www-jpred/databases";
 198 my $db_entry   = "cluster";
 199 my $nofinal;
 200
 201 my ( $help, $man, $DEBUG, $VERBOSE );
 202
 203 GetOptions(
 204   "in=s"        => \$infile,
 205   "output=s"    => \$prefix,
 206   "outfile=s"   => \$outfile,
 207   "logfile=s"   => \$logfile,
 208   "psi=s"       => \$psiblast,
 209   "dbname=s"    => \$db_entry,
 210   "dbpath=s"    => \$db_path,
 211   "ncpu=s"      => \$ncpu,
 212   "pred-nohits" => \$predNoHits,
 213   "no-final"    => \$nofinal,
 214   "seq"         => \$seqgoal,
 215   "jabaws"      => \$jabaws,
 216
 217   "help"    => \$help,
 218   "man"     => \$man,
 219   "debug"   => \$DEBUG,
 220   "verbose" => \$VERBOSE
 221 ) or pod2usage(2);
 222 pod2usage(1) if $help;
 223 pod2usage( verbose => 2 ) if $man;
 224
 225 $goal = "seq" if ( defined $seqgoal );
 226
 227 #####################################################################################################
 228 # Key to database information and information for accessing them
 229 if ( defined $jabaws ) {
 230   $db_path  = "/data/UNIREFdb";
 231   $db_entry = "cluster";
 232 }
 233
 234 my $database = {
 235
 236   ## default database used by Jpred
 237   uniref90 => {
 238     database   => $db_path . "/uniref90.filt",
 239     unfiltered => $db_path . "/uniref90",
 240   },
 241
 242   ## database used during Jnet training
 243   training => {
 244     database   => $db_path . "/training/uniref90.filt",
 245     unfiltered => $db_path . "/training/uniref90",
 246   },
 247
 248   ## database used for ported Jpred
 249   ported_db => {
 250     database   => $db_path . "/uniref90.filt",
 251     unfiltered => $db_path . "/uniref90",
 252   },
 253
 254   ## cluster-specific path for Jpred
 255   cluster => {
 256     database   => $db_path . "/uniref90.filt",
 257     unfiltered => $db_path . "/uniref90",
 258   },
 259
 260   ## these other DBs are experimental ones used during development.
 261   ## check they exist before using them.
 262   # generic entry for use with validate_jnet.pl
 263   # real db location defined by validate_jnet.pl
 264   swall => {
 265     database   => $db_path . "/swall/swall.filt",
 266     unfiltered => $db_path . "/swall/swall",
 267   },
 268
 269   # Path to PSIBLAST db
 270   uniprot => {
 271     database   => $db_path . "/3/swall.filt",
 272     unfiltered => $db_path . "/3/swall",
 273   },
 274
 275   uniref50 => {
 276     database   => $db_path . "/6/swall.filt",
 277     unfiltered => $db_path . "/6/swall",
 278   },
 279 };
 280
 281 my $dp = $database->{$db_entry}{database};
 282 pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
 283 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
 284 pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
 285
 286 #####################################################################################################
 287 # lots of preparation steps
 288 if ( defined $prefix ) {
 289   unless ( defined $outfile ) {
 290     $outfile = $prefix . ".res.fasta";
 291   }
 292 } else {
 293   if ( defined $outfile ) {
 294     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 295     $prefix = $outfile . ".tmp";
 296   } else {
 297     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 298     $prefix  = $infile . ".res";
 299     $outfile = $prefix . ".jnet";
 300   }
 301 }
 302
 303 if ( 1 > $ncpu or 8 < $ncpu ) {
 304   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 305   $ncpu = 1;
 306 }
 307
 308 $VERBOSE = 1 if $DEBUG;
 309
 310 my $LOG;
 311 if ( defined $logfile ) {
 312   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 313 }
 314
 315 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 316   my ( $key, $value ) = @{$_};
 317   defined $value or next;
 318   my $par = join( ": ", $key, $value );
 319   print "$par\n" if $DEBUG;
 320   print $LOG "$par\n" if $LOG;
 321 }
 322
 323 if ( defined $jabaws ) {
 324   setup_jpred_env($Bin);
 325   setup_env($Bin);
 326 }
 327 my $platform = check_OS();
 328 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 329 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 330
 331 #####################################################################################################
 332 # check input file format
 333 if ( 'seq' eq $goal ) {
 334   $format = "seq";
 335   if ( 1 != check_FASTA_format($infile) ) {
 336     die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
 337   }
 338 } else {
 339   my $nseq = check_FASTA_format($infile);
 340   if ( 0 < $nseq ) {
 341     $format = "fasta";
 342     if ( 1 == $nseq ) {
 343       die "\nERROR! jpred requires alignment with more than 1 sequence\n       if you provide only one sequence use the -seq option.\n";
 344     }
 345   } elsif ( 0 < check_MSF_format($infile) ) {
 346     $format = "msf";
 347   } elsif ( 0 < check_BLC_format($infile) ) {
 348     $format = "blc";
 349   } else {
 350     die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
 351   }
 352 }
 353 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
 354
 355 #####################################################################################################
 356 if ( 'seq' eq $goal ) {
 357   my $query = FASTA::File->new( read_file => $infile );
 358   my @seqs;
 359   my $psi = PSIBLAST->new;
 360   unless ( defined $psiblast && -e $psiblast ) {
 361     ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 362     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 363     # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 364     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 365     my $psi_run = PSIBLAST::Run->new(
 366       debug    => $DEBUG,
 367       args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 368       path     => $psiblastbin,
 369       input    => $infile,
 370       output   => "$prefix.blast",
 371       matrix   => "$prefix.profile",
 372       database => $database->{$db_entry}{database},
 373     );
 374
 375     # For reduced databases, get the size of the orginal and use this as
 376     # the equivilent DB size
 377     if ( $db_entry =~ /^sp_red_/ ) {
 378       ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 379       $psi_run->args(
 380         $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 381       );
 382     }
 383     print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
 384     print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
 385     print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 386     print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 387
 388     print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 389     print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 390     $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 391
 392     $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 393     system("gzip -9f $prefix.blast");
 394   } else {
 395     if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 396     else                         { $psi->read_file($psiblast) }                        # CC ditto above
 397   }
 398
 399 #####################################################################################################
 400   # Convert the last itteration into a collection of PSISEQ objects
 401   for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 402     my ( $id, $start, $align ) = @{$_};
 403     push @seqs,
 404       PSISEQ->new(
 405       id    => $id,
 406       start => $start,
 407       align => [ split( //, $align ) ]
 408       );
 409   }
 410
 411 #####################################################################################################
 412   # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
 413   # Accuracy won't be as good, but at least it's a prediction
 414   if ( @seqs == 0 ) {
 415     if ( $predNoHits == 0 ) {
 416       warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 417       print ">>100% complete\n";
 418       print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 419       print $LOG ">>100% complete\n"                                                                         if $LOG;
 420       close($LOG)                                                                                            if $LOG;
 421       exit;
 422     } else {
 423       print ">>50% complete\n";
 424       warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 425       print "Running HMMer on query...\n"                                                            if $VERBOSE;
 426       print $LOG ">>50% complete\n"                                                                  if $LOG;
 427       print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 428       print $LOG "Running HMMer on query...\n"                                                       if $LOG;
 429
 430       # copy input query to alignment
 431       #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 432       open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 433       open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 434       while (<$ifh>) { print $ofh, $_ }
 435       close $ifh;
 436       close $ofh;
 437
 438       # Temp files required for HMMer
 439       my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 440       system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 441       system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 442       unlink $hmmbuild_out;
 443
 444       # Read in the HMMER file
 445       my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 446
 447       # Convert from HMMER::Profile to HMMER::Profile::Jnet
 448       my $hmmer_jnet = HMMER::Profile::Jnet->new;
 449       $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 450       $hmmer_jnet->write_file("$prefix.hmm");
 451       print ">>70% complete\n";
 452       print $LOG ">>70% complete\n" if $LOG;
 453
 454       # Run Jnet for the prediction
 455       print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 456       print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 457       my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 458       print ">>90% complete\n";
 459       print $LOG ">>90% complete\n" if $LOG;
 460       print $jnetlog                if $VERBOSE;
 461       print $LOG $jnetlog           if $LOG;
 462     }
 463   } else {
 464     psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 465     print ">>40% complete\n";
 466     print $LOG ">>40% complete\n" if $LOG;
 467
 468     #####################################################################################################
 469     # Make PSIBLAST truncated alignments the right length
 470     print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 471     print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 472     @seqs = extend( $query, @seqs );
 473     psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 474
 475     #####################################################################################################
 476     # Remove masking from PSIBLAST alignment
 477     print "Unmasking the alignments...\n" if $VERBOSE;
 478     print $LOG "Unmasking the alignments...\n" if $LOG;
 479     my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 480     remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 481     psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 482
 483     #####################################################################################################
 484     # Convert the sequences to upper case
 485     print "Converting sequences to the same case...\n" if $VERBOSE;
 486     print $LOG "Converting sequences to the same case...\n" if $LOG;
 487     toupper($_) for @seqs;    # CC sub is below
 488     psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 489
 490     #####################################################################################################
 491     # Remove excessive sequences
 492     print "Remove excessive sequences...\n" if $VERBOSE;
 493     print $LOG "Remove excessive sequences...\n" if $LOG;
 494     @seqs = reduce( $MAX_ALIGN, @seqs );
 495     psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 496
 497     #####################################################################################################
 498     # Remove sequences that are too long or too short
 499     print "Remove sequences which too long or short...\n" if $VERBOSE;
 500     print $LOG "Remove sequences which too long or short...\n" if $LOG;
 501     @seqs = del_long_seqs( 50, @seqs );
 502     psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 503
 504     #####################################################################################################
 505     # Remove redundant sequences based upon pairwise identity and OC clustering
 506     print "Remove redundant sequences...\n" if $VERBOSE;
 507     print $LOG "Remove redundant sequences...\n" if $LOG;
 508     @seqs = nonred( $NR_CUT, @seqs );
 509     psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 510
 511     #####################################################################################################
 512     # Check that we haven't got rid of all of the sequences
 513     if ( @seqs < 2 ) {
 514       warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 515       print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 516       @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 517     }
 518     unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 519
 520     # Remove gaps in the query sequence and same positions in the alignment
 521     print "Removing gaps in the query sequence...\n" if $VERBOSE;
 522     print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 523     degap(@seqs);
 524     psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 525
 526     # Output the alignment for the prediction
 527     print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 528     print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 529     psiseq2fasta( "$prefix.align", @seqs );
 530
 531     #####################################################################################################
 532     # Equivilent to getpssm script
 533     print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 534     print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 535     my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 536     $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 537     print ">>50% complete\n";
 538     print $LOG ">>50% complete\n" if $LOG;
 539
 540     #####################################################################################################
 541     # Run HMMER on the sequences
 542     print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 543     print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 544     my $hmmer = hmmer(@seqs);             # CC sub is below
 545     $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 546     print ">>70% complete\n";
 547     print $LOG ">>70% complete\n" if $LOG;
 548 #####################################################################################################
 549     # Run Jnet for the prediction
 550     print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 551     print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 552     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 553     print ">>90% complete\n";
 554     print $LOG ">>90% complete\n" if $LOG;
 555     print $jnetlog                if $VERBOSE;
 556     print $LOG $jnetlog           if $LOG;
 557   }
 558 } else {
 559   if ( 'fasta' eq $format ) {
 560     print "Read FASTA file\n";
 561     my $hmmer1 = hmm_for_align($infile);
 562     $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 563   } elsif ( 'msf' eq $format ) {
 564     msf2fasta( $infile, $infastafile );
 565     my $hmmer2 = hmm_for_align($infastafile);
 566     $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 567   } elsif ( 'blc' eq $format ) {
 568     my $tmpfile = File::Temp->new->filename;
 569     blc2msf( $infile, $tmpfile );
 570     msf2fasta( $infile, $infastafile );
 571     my $hmmer3 = hmm_for_align($infastafile);
 572     $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 573   } else {
 574     die "ERROR! unknown input format '$format'. exit...\n";
 575   }
 576   print ">>70% complete\n";
 577   print $LOG ">>70% complete\n" if $LOG;
 578 #####################################################################################################
 579   # Run Jnet for the prediction
 580   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 581   print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 582   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
 583   print ">>90% complete\n";
 584   print $LOG ">>90% complete\n" if $LOG;
 585   print $jnetlog                if $VERBOSE;
 586   print $LOG $jnetlog           if $LOG;
 587 }
 588
 589 unless ( defined $nofinal ) {
 590   my $aligfile      = $prefix . ".align";
 591   my $jnetfile      = $prefix . ".jnet";
 592   my $jnetfastafile = $prefix . ".jnet.fasta";
 593   $aligfile = $infile if ( 'fasta' eq $format );
 594   $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
 595   concise2fasta( $jnetfile, $jnetfastafile );
 596   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 597   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 598   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 599   while (<$IN2>) { print $OUT $_; }
 600   while (<$IN1>) { print $OUT $_; }
 601   close($IN1);
 602   close($IN2);
 603   close($OUT);
 604   unlink $jnetfastafile;
 605   my $seqs = FASTA::File->new( read_file => $outfile );
 606   open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
 607   $seqs->write($OUT2);
 608   close($OUT2);
 609 } else {
 610   if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
 611     rename $prefix . ".jnet", $outfile;
 612   }
 613 }
 614
 615 print ">>100% complete\n";
 616 print "Jpred Finished\n";
 617 print $LOG ">>100% complete\n" if $LOG;
 618 print $LOG "Jpred Finished\n"  if $LOG;
 619 close($LOG)                    if $LOG;
 620 exit;
 621
 622 #####################################################################################################
 623 # Functions
 624 #####################################################################################################
 625 sub check_FASTA_format {
 626   my $infile = shift;
 627
 628   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 629   my $check_first_line = 1;
 630   my $nseq             = 0;
 631   local $/ = "\n>";
 632   while (<$IN>) {
 633     if ($check_first_line) {
 634       return 0 unless (/^>/);
 635       $check_first_line = 0;
 636     }
 637     s/^>//g;
 638     s/>$//g;
 639
 640     my ( $id, @seqs ) = split /\n/, $_;
 641     return 0 unless ( defined $id or @seqs );
 642     my $seq = join( "", @seqs );
 643     return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
 644     ++$nseq;
 645   }
 646   close($IN);
 647
 648   return $nseq;
 649 }
 650 #####################################################################################################
 651 sub check_MSF_format {
 652   my $infile = shift;
 653   $? = 0;
 654   system("$readseq -fMSF -a -p < $infile > /dev/null");
 655   return 0 if ($?);
 656
 657   return 1;
 658 }
 659 #####################################################################################################
 660 sub check_BLC_format {
 661   my $infile = shift;
 662
 663   my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
 664   open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
 665   print $fh "silent_mode\nblock_file $infile\n";
 666   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 667   print $fh "pir_save $tmpfile2\nsetup\n";
 668   close $fh;
 669
 670   $? = 0;
 671   system("$alscript -s -f $tmpfile1");
 672   $? = 0;
 673   system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
 674   unlink $tmpfile1;
 675   unlink $tmpfile2;
 676   return 0 if ($?);
 677
 678   return 1;
 679 }
 680 ############################################################################################################################################
 681 # convertor BLC -> MSF
 682 sub msf2fasta {
 683   my $infile  = shift;
 684   my $outfile = shift;
 685
 686   my $sequence = "";
 687   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 688   while (<$IN>) {
 689     $sequence .= $_;
 690   }
 691   close($IN);
 692
 693   $sequence =~ s/~/\./g;
 694
 695   ## check for non-unique sequence names - readseq has problems with these
 696   my @lines = split /\n/, $sequence;
 697   my %names;
 698   foreach my $line (@lines) {
 699     if ( $line =~ /Name:\s+(\S+)/ ) {
 700       die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
 701       $names{$1}++;
 702     }
 703   }
 704
 705   my $tmpfile1 = File::Temp->new->filename;
 706   my $tmpfile2 = File::Temp->new->filename;
 707   open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
 708   print $FILE $sequence;
 709   close $FILE;
 710
 711   $? = 0;
 712   system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
 713   die "Reformatting of $infile failed. exit...\n" if ($?);
 714   unlink $tmpfile1;
 715
 716   $? = 0;
 717   system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
 718   die "Reformatting of $infile failed. exit...\n" if ($?);
 719   unlink $tmpfile2;
 720
 721   die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
 722 }
 723
 724 ############################################################################################################################################
 725 # convertor BLC -> MSF
 726 sub blc2msf {
 727   my ( $in, $out ) = @_;
 728
 729   my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
 730
 731   open my $fh, ">$tmp" or die "$tmp: $!\n";
 732   print $fh "silent_mode\nblock_file $infile\n";
 733   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 734   print $fh "pir_save $tmp_pir\nsetup\n";
 735   close $fh;
 736
 737   $? = 0;
 738   system("$alscript -s -f $tmp");
 739   die "Reformatting of $infile failed. exit...\n" if ($?);
 740   system("$readseq -f=MSF -o=$out -a $tmp_pir");
 741
 742   unlink $tmp;
 743   unlink $tmp_pir;
 744 }
 745
 746 #####################################################################################################
 747
 748 =begin :private
 749
 750 =head2 fasta2concise ($infile, $outfile)
 751
 752 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 753
 754 =cut
 755
 756 sub fasta2concise {
 757   my $infile  = shift;
 758   my $outfile = shift;
 759
 760   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 761   my ( $seq, @seqs, @title );
 762   while (<$IN>) {
 763     if (s/^>//) {
 764       if ($seq) {
 765         $seq =~ s/\n|\s//g;
 766         $seq =~ s/(.)/$1,/g;
 767         push @seqs, $seq;
 768         $seq = "";
 769       }
 770       chomp;
 771       push @title, $_;
 772     } else {
 773       chomp;
 774       $seq .= $_;
 775     }
 776   }
 777   $seq =~ s/\n|\s//g;
 778   $seq =~ s/(.)/$1,/g;
 779   push @seqs, $seq;
 780   close($IN);
 781
 782   if ( @title != @seqs ) {
 783     die("non matching number of titles and sequences!\n");
 784   }
 785
 786   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 787   foreach ( 0 .. $#title ) {
 788     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 789   }
 790   close($OUT);
 791 }
 792
 793 =begin :private
 794
 795 =head2 concise2fasta ($infile, $outfile)
 796
 797 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 798
 799 =cut
 800
 801 #####################################################################################################
 802 sub concise2fasta {
 803   my $infile  = shift;
 804   my $outfile = shift;
 805
 806   my ( @seqs, %seq, @pred, %pred );
 807
 808   my @var = (
 809     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 810     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 811   );
 812
 813   # parse input concise file
 814   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 815   while (<$IN>) {
 816     unless (/^\n/) {
 817       my ( $id, $seq ) = split( ":", $_ );
 818       if ( defined $id and defined $seq ) {    # Check we have proper values
 819         $seq =~ s/,//g;
 820         chomp $seq;
 821         if ( $id =~ /;/ ) {                    # Then it's an alignment
 822           @_ = split( ";", $id );
 823           push @seqs, $_[1];
 824           $seq{ $_[1] } = $seq;
 825         }
 826         foreach my $v (@var) {
 827           if ( $v eq $id ) {
 828             push @pred, $v;
 829             $pred{$v} = $seq;
 830           }
 831         }
 832       }
 833     }
 834   }
 835   close($IN);
 836
 837   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 838
 839   # print out sequences found with PSI-BLAST
 840   foreach (@seqs) {
 841     $seq{$_} =~ s/(.{72})/$1\n/g;
 842     print $OUT ">$_\n$seq{$_}\n";
 843   }
 844
 845   # print out predictions
 846   foreach my $p (@pred) {
 847     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 848     $pred{$p} =~ s/B/E/g;
 849     $pred{$p} =~ s/G/H/g;
 850
 851     if ( $p =~ /SOL/ ) {
 852       $pred{$p} =~ s/E/B/g;
 853     }
 854     $pred{$p} =~ s/(.{72})/$1\n/g;
 855     print $OUT ">$p\n$pred{$p}\n";
 856   }
 857   close($OUT);
 858 }
 859
 860 =begin :private
 861
 862 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 863
 864 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 865 Does it in place.
 866
 867 =cut
 868
 869 #####################################################################################################
 870 sub remove_seq_masks {
 871   my $idx = shift;
 872   my $seq = shift;
 873
 874   # Only bother if we have a sequence which has been masked
 875   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 876
 877   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 878   my $searchdb = $database->{$db_entry}{unfiltered};
 879   $start--;    # We need this to be zero based
 880
 881   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 882   my @seqs = $f->get_entries;
 883   my $db   = shift @seqs;
 884
 885   unless ($db) {
 886     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 887   } elsif (@seqs) {
 888     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 889   }
 890
 891   my @db_seq = @{ $db->seq };
 892
 893   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 894   my $i = 0;
 895   for ( 0 .. $#align ) {
 896     next if $align[$_] =~ /$GAP/;
 897
 898     if ( $align[$_] eq 'X' ) {
 899       unless ( defined $db_seq[ $i + $start ] ) {
 900         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 901       }
 902       $align[$_] = $db_seq[ $i + $start ];
 903     }
 904     $i++;
 905   }
 906   $seq->align( \@align );
 907 }
 908
 909 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 910
 911 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 912
 913 Equivilent to reduce script.
 914
 915 =cut
 916
 917 #####################################################################################################
 918 sub reduce {
 919   my ( $max, @seqs ) = @_;
 920
 921   my $num = int( @seqs / $max );
 922   my @res = shift @seqs;
 923
 924   # Don't bother if we have less than the required sequences
 925   if ( @seqs < $max or 0 == $num ) {
 926     return @res, @seqs;
 927   }
 928
 929   for ( 0 .. $#seqs ) {
 930     push @res, $seqs[$_] if ( 0 == $_ % $num );
 931   }
 932
 933   return @res;
 934 }
 935
 936 =head2 nonred($cutoff, @PSISEQS);
 937
 938 Removes redundant sequences at $cutoff level of sequence identity.
 939
 940 Equivilent to nonred script.
 941
 942 =cut
 943
 944 #####################################################################################################
 945 sub nonred {
 946   my ( $cutoff, @seqs ) = @_;
 947
 948   # Run pairwise and then OC and remove similar sequences
 949   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 950   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 951   if     ( @seqs == 1 ) {
 952     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 953     return @seqs;
 954   }
 955
 956   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 957   my $fasta = FASTA::File->new;
 958   foreach (@seqs) {
 959     my $f = FASTA->new( id => $_->id );
 960     $f->seq( @{ $_->align } );
 961     $fasta->add_entries($f);
 962   }
 963
 964   # Run pairwise
 965   my $pair = Pairwise->new( path => $pairwise );
 966   my $foo = join "\n", $pair->run($fasta) or die $!;
 967
 968   # Run OC
 969   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 970   $ocres->read_string( join "", $ocres->run($foo) );
 971
 972   # Now found those entries that are unrelated
 973   my @keep;
 974   for ( $ocres->get_groups ) {
 975     my ( $group, $score, $size, @labels ) = @{$_};
 976
 977     # Keep the QUERY from the cluster, otherwise just get onelt
 978     if ( grep { /QUERY/ } @labels ) {
 979
 980       # Make sure the query stays at the start
 981       unshift @keep, "QUERY";
 982       next;
 983     } else {
 984       push @keep, shift @labels;
 985     }
 986   }
 987   push @keep, $ocres->get_unclustered;
 988
 989   # Return the matching entries.
 990   # We use the temporay to mimic the selection of sequences in the nonred
 991   # script, which took the sequences in the order they occured in the file.
 992   my @filtered_res;
 993   for (@seqs) {
 994     my $label = $_->id;
 995     if ( grep { $_ =~ /^$label$/ } @keep ) {
 996       push @filtered_res, $_;
 997     }
 998   }
 999
1000   return @filtered_res;
1001 }
1002
1003 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1004
1005 Removes those sequences that are more than $percentage longer or shorter than the first
1006 sequence in @seqs. @seqs is an array of PSISEQ objects.
1007
1008 Equivilent to trim_seqs script.
1009
1010 =cut
1011
1012 #####################################################################################################
1013 sub del_long_seqs {
1014   my ( $factor, @seqs ) = @_;
1015
1016   # Keep the query sequence (we assume query is the 1st FASTA record)
1017   my $query = shift @seqs;
1018   my @res   = $query;
1019
1020   # Find the length of the query sequence without any gaps
1021   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1022
1023   # Calculate the upper and lower allowed bounds
1024   my ( $upper, $lower );
1025   {
1026     my $bounds = ( $length / 100 ) * $factor;
1027     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1028   }
1029
1030   # Now try each sequence and see how long they are
1031   for (@seqs) {
1032     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1033     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1034   }
1035
1036   return @res;
1037 }
1038
1039 =head2 toupper ($PSISEQ)
1040
1041 Converts sequence held in PSISEQ object to uppercase.
1042
1043 =cut
1044
1045 #####################################################################################################
1046 sub toupper {
1047   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1048 }
1049
1050 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1051
1052 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1053
1054 Equivilent to fasta2jnet script.
1055
1056 =cut
1057
1058 #####################################################################################################
1059 sub degap {
1060   my (@seqs) = @_;
1061
1062   # Find where the gaps in the query sequence are
1063   my @gaps;
1064   my $i = 0;
1065   for ( @{ $seqs[0]->align } ) {
1066     push @gaps, $i if $_ !~ /$GAP/;
1067     $i++;
1068   }
1069
1070   return unless @gaps;
1071
1072   # Remove the gaps in all the sequences.
1073   # Derefences the array reference and takes a slice inside the method call
1074   # arguments, then coerce back to an array ref.
1075   for (@seqs) {
1076     $_->align( [ @{ $_->align }[@gaps] ] );
1077   }
1078 }
1079
1080 =head2 getfreq($filename, @PSISEQS)
1081
1082 Creates a PSIBLAST like profile and writes it to the $filename.
1083
1084 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1085 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1086 but it's *not* the same.
1087
1088 =cut
1089
1090 #####################################################################################################
1091 sub getfreq {
1092   my ( $fn, @seqs ) = @_;
1093
1094   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1095
1096   # Create a PSIBLAST like profile from the alignment
1097   # This doesn't greate the same result as old Jpred, I think this is because
1098   # differences in the input between old and new
1099
1100   # For testing
1101   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1102   #   my @profile = profile(
1103   #      map { join "", @{ $_->seq } } $f->get_entries
1104   #   );
1105
1106   my @profile = profile( map { join "", @{ $_->align } } @seqs );
1107
1108   open my $fh, ">$fn" or die "JPRED: $fn: $!";
1109   print $fh join( " ", @{$_} ), "\n" for @profile;
1110 }
1111
1112 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1113
1114 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1115
1116 Equivilent to gethmm script.
1117
1118 =cut
1119
1120 #####################################################################################################
1121 sub hmmer {
1122   my (@seqs) = @_;
1123
1124   # Temp files required
1125   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1126
1127   # Create the FASTA file
1128   psiseq2fasta( $tmp_fasta, @seqs );
1129
1130   # Run HMMer
1131   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1132   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1133
1134   # Read in the HMMER file
1135   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1136
1137   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1138   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1139   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1140
1141   return $hmmer_jnet;
1142 }
1143
1144 =head2 jnet($hmmer, $out, $pssm )
1145
1146 Runs Jnet. Pass the paths of the filenames to do the prediction on
1147
1148 =cut
1149
1150 #################################################################################################################################################
1151
1152 =head2 get_hmm($alignFile, $name)
1153
1154 Adapted from the hmmer() function in Jpred as written by JDB.
1155
1156 Generates an HMMer profile output compatible with Jnet.
1157
1158 Equivilent to gethmm script.
1159
1160 =cut
1161
1162 sub hmm_for_align {
1163   my $fastafile = shift;
1164
1165   # Temp files required
1166   print "hmm_for_align: fastafile = $fastafile\n";
1167   my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1168
1169   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1170   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1171
1172   # Read in the HMMER file
1173   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1174
1175   # Read in the HMMER file
1176   my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1177
1178   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1179   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1180   $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1181
1182   return $hmmer_jnet;
1183 }
1184
1185 #####################################################################################################
1186 sub jnet {
1187   my ( $hmmer, $outfile, $pssm ) = @_;
1188   if ($pssm) {
1189
1190     # run Jnet prediction with all the profile data
1191     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1192   } else {
1193
1194     # run Jnet prediction with only HMMer and alignment data
1195     system("$jnet --concise --hmmer $hmmer > $outfile");
1196   }
1197   check( "jnet", $? ) or exit 1;
1198   my $res     = "";
1199   my $logging = "";
1200   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1201   while (<$IN>) {
1202     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1203       $res .= $_;
1204     } else {
1205       $logging .= $_;
1206     }
1207   }
1208   close $IN;
1209   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1210   print $OUT "\n" . $res;
1211   close $OUT;
1212   return $logging;
1213 }
1214
1215 =head1 psiseq2fasta($path, @PSISEQ)
1216
1217 Writes a FASTA file given an array of PSISEQ objects.
1218
1219 =cut
1220
1221 #####################################################################################################
1222 sub psiseq2fasta {
1223   my ( $fn, @seqs ) = @_;
1224
1225   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1226   confess "JPRED: not passed PSISEQs" unless @seqs;
1227
1228   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1229
1230   # Changed from using FASTA::File object due to seg faulting on some large,
1231   # long alignments, presumably due to running out of memory.
1232   #   my $fasta = FASTA::File->new;
1233   #   $fasta->add_entries(
1234   #      map {
1235   #         my $f = FASTA->new(id => $_->id);
1236   #         $f->seq(@{$_->align});
1237   #         $f;
1238   #      } @seqs
1239   #   );
1240   #   $fasta->write_file($fn);
1241
1242   my $fh;
1243   $fn =~ /\.gz$/
1244     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1245     : ( open $fh, ">$fn" or die "JPRED: $!" );
1246
1247   for (@seqs) {
1248     print $fh ">", $_->id, "\n";
1249     my $seq = join( "", @{ $_->align } );
1250     $seq =~ s/(.{72})/$1\n/g;
1251     chomp $seq;
1252     print $fh $seq . "\n";
1253   }
1254   close $fh;
1255 }
1256
1257 =head1 @PSISEQ = fasta2psiseq($path)
1258
1259 Reads in a FASTA file and returns an array of PSISEQ objects.
1260
1261 =cut
1262
1263 #####################################################################################################
1264 sub fasta2psiseq {
1265   my ($fn) = @_;
1266
1267   my $fasta =
1268     $fn =~ /\.gz$/
1269     ? FASTA::File->new( read_gzip_file => $fn )
1270     : FASTA::File->new( read_file      => $fn );
1271   $fasta or die "JPRED: $!";
1272
1273   my @seqs;
1274   for ( $fasta->get_entries ) {
1275     my $seq = PSISEQ->new;
1276     $seq->id( $_->id );
1277     $seq->align( [ @{ $_->seq } ] );
1278     push @seqs, $seq;
1279   }
1280
1281   return @seqs;
1282 }
1283
1284 #####################################################################################################
1285 sub fasta2psiseq_lowmemory {
1286   my $filename = shift;
1287
1288   local $/ = "\n";
1289   my $fh;
1290   $filename =~ /\.gz$/
1291     ? ( open $fh, "gzip -dc $filename|" or die $! )
1292     : ( open $fh, $filename or die $! );
1293
1294   my @seqs;
1295   my ( $id, $seq );
1296   while (<$fh>) {
1297     chomp;
1298     if (s/^>//) {
1299       $seq =~ s/ //g if defined $seq;
1300
1301       if ( $id and $seq ) {
1302         my $psi = PSISEQ->new( id => $id );
1303         $psi->align( [ split //, $seq ] );
1304         push @seqs, $psi;
1305         undef $id;
1306         undef $seq;
1307       } else {
1308         $id = $_;
1309       }
1310     } else {
1311       $seq .= $_;
1312     }
1313   }
1314   if ( $id and $seq ) {
1315     my $psi = PSISEQ->new( id => $id );
1316     $psi->seq($seq);
1317     push @seqs, $psi;
1318   }
1319
1320   return @seqs;
1321 }
1322
1323 #####################################################################################################
1324
1325 =head1 extend($FASTA::File, @PSISEQ)
1326
1327 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1328 against the query, this function extends the alignment so that all of the query is
1329 present in the alignment. The other sequences are extended with gap characters.
1330
1331 =cut
1332
1333 sub extend {
1334   my ( $query, @seqs ) = @_;
1335
1336   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1337   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1338
1339   # Get the query sequence
1340   my $q_query = $query->get_entry(0);
1341
1342   # Get the query sequence from the PSIBLAST alignment
1343   my $a_query = shift @seqs;
1344
1345   # The gap size required to expand the alignment
1346   my ( $start_gap_length, $end_gap_length );
1347   {
1348
1349     # Make handling the sequence easier
1350     my $q_seq = join "", @{ [ $q_query->seq ] };
1351     my $a_seq = join "", @{ $a_query->align };
1352
1353     $q_seq =~ s/-//g;
1354     $a_seq =~ s/-//g;
1355
1356     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1357
1358     $start_gap_length = index( $q_seq, $a_seq );
1359     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1360
1361     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1362   }
1363
1364   # Add the gaps to the end of the alignments
1365   for (@seqs) {
1366     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1367   }
1368
1369   # Put the query sequence back
1370   unshift @seqs,
1371     PSISEQ->new(
1372     id    => $a_query->id,
1373     align => [
1374       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1375       @{ $a_query->align },
1376       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1377     ]
1378     );
1379
1380   return @seqs;
1381 }
1382
1383 #####################################################################################################
1384
1385 =head1 $int = fasta_seq_length($path)
1386
1387 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1388
1389 =cut
1390
1391 sub fasta_seq_length {
1392   my ($fn) = @_;
1393   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1394
1395   my $length = 0;
1396   local $_, $/ = "\n";
1397   while (<$fh>) {
1398     next if /^>/;
1399     chomp;
1400     $length += tr///c;
1401   }
1402
1403   return $length;
1404 }