binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
  14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
  15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
  16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
  17
  18 =head1 OPTIONS
  19
  20 =over 5
  21
  22 =item -in <FILE1>
  23
  24 The path to the sequence file (in FASTA, MSF, or BLC format)
  25
  26 =item -seq
  27
  28 The input file is a FASTA file with one sequence only.
  29
  30 =item -output <FILEPREFIX>
  31
  32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  33
  34 =item -outfile <FILE2>
  35
  36 An output file, by default it is undefined and the -output option is working
  37
  38 =item -logfile <FILE3>
  39
  40 Logging file, by default it is undefined and logging switched off
  41
  42 =item -dbpath PATH
  43
  44 Path to the uniref database used for PSI-BLAST querying. default value: .
  45
  46 =item -dbname database
  47
  48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  49 Default value: uniref90
  50
  51 =item -psi path
  52
  53 Path to a PSIBLAST output file.
  54
  55 =item -ncpu NNN
  56
  57 Number of CPU used by jpred.pl. Maximum value is 8
  58 Default: NNN = 1
  59
  60 =item -pred-nohits
  61
  62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  63
  64 =item -no-final
  65
  66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
  67 The option untoggles this step and stops jpred at the concise file generation
  68
  69 =item -jabaws
  70
  71 Redefines some basic variables in order to use the script within JABAWS
  72
  73 =item -verbose
  74
  75 Verbose mode. Print more information while running jpred.
  76
  77 =item -debug
  78
  79 Debug mode. Print debugging information while running jpred.
  80
  81 =item -help
  82
  83 Gives help on the programs usage.
  84
  85 =item -man
  86
  87 Displays this man page.
  88
  89 =back
  90
  91 =head1 BUGS
  92
  93 Can't cope with gaps in the query sequence.
  94
  95 Doesn't accept alignments.
  96
  97 If you find any others please contact authors.
  98
  99 =head1 AUTHORS
 100
 101 Jonathan Barber <jon@compbio.dundee.ac.uk>
 102 Chris Cole <christian@cole.name>
 103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
 104
 105 =cut
 106
 107 ## TODO check for gaps in query sequence
 108 ##      check for disallowed chars in sequence
 109
 110 use strict;
 111 use warnings;
 112 use Getopt::Long;
 113 use Pod::Usage;
 114 use Carp;
 115 use Data::Dumper;
 116 use File::Temp;
 117
 118 # path to Jpred modules
 119 use FindBin qw($Bin);
 120 use lib "$Bin/lib";
 121
 122 # internal jpred modules
 123 use Jpred;
 124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
 125 use HMMER::Profile;
 126 use HMMER::Profile::Jnet;
 127 use PSIBLAST;
 128 use PSIBLAST::PSSM;
 129 use PSIBLAST::Run;
 130 use FASTA::File;
 131 use FASTA;
 132 use Index;
 133 use Pairwise;
 134 use OC;
 135 use Utils qw(profile);
 136 use Run qw(check);
 137
 138 our $VERSION = '3.0.2';
 139 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 140 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 141
 142 # Gap characters in sequences
 143 our $GAP = '-';
 144
 145 #####################################################################################################
 146 # Light object to hold sequence information, light to prevent memory overload
 147 # for big search results. Even so this is quite memory intensive, I can't see
 148 # how to reduce the size further.
 149 {
 150
 151   package PSISEQ;
 152
 153   sub new {
 154     my ( $class, %args ) = @_;
 155     my $self = [];
 156     bless $self, $class;
 157     for ( keys %args ) { $self->$_( $args{$_} ) }
 158     return $self;
 159   }
 160
 161   sub id {
 162     return defined $_[1]
 163       ? $_[0]->[0] = $_[1]
 164       : $_[0]->[0];
 165   }
 166
 167   sub start {
 168     return defined $_[1]
 169       ? $_[0]->[1] = $_[1]
 170       : $_[0]->[1];
 171   }
 172
 173   sub align {
 174     if ( defined $_[1] ) {
 175       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 176
 177       $_[0]->[2] = join( "", @{ $_[1] } );
 178     } else {
 179       return [ split( //, $_[0]->[2] ) ];
 180     }
 181   }
 182 }
 183
 184 #####################################################################################################
 185 my $infile;
 186 my $infastafile;
 187 my $format;
 188 my $goal = "align";
 189 my $seqgoal;
 190 my $outfile;
 191 my $prefix;
 192 my $psiblast;
 193 my $jabaws;
 194 my $logfile;
 195 my $ncpu       = 1;                              # number of CPUs for psiblast calculations
 196 my $predNoHits = 0;                              # define whether to make predictions when no PSI-BLAST hits are found
 197 my $db_path    = "/homes/www-jpred/databases";
 198 my $db_entry   = "cluster";
 199 my $nofinal;
 200
 201 my ( $help, $man, $DEBUG, $VERBOSE );
 202
 203 GetOptions(
 204   "in=s"        => \$infile,
 205   "output=s"    => \$prefix,
 206   "outfile=s"   => \$outfile,
 207   "logfile=s"   => \$logfile,
 208   "psi=s"       => \$psiblast,
 209   "dbname=s"    => \$db_entry,
 210   "dbpath=s"    => \$db_path,
 211   "ncpu=s"      => \$ncpu,
 212   "pred-nohits" => \$predNoHits,
 213   "no-final"    => \$nofinal,
 214   "seq"         => \$seqgoal,
 215   "jabaws"      => \$jabaws,
 216
 217   "help"    => \$help,
 218   "man"     => \$man,
 219   "debug"   => \$DEBUG,
 220   "verbose" => \$VERBOSE
 221 ) or pod2usage(2);
 222 pod2usage(1) if $help;
 223 pod2usage( verbose => 2 ) if $man;
 224
 225 $goal = "seq" if ( defined $seqgoal );
 226
 227 #####################################################################################################
 228 # Key to database information and information for accessing them
 229 my $database = {
 230
 231   ## default database used by Jpred
 232   uniref90 => {
 233     database   => $db_path . "/uniref90.filt",
 234     unfiltered => $db_path . "/uniref90",
 235   },
 236
 237   ## database used during Jnet training
 238   training => {
 239     database   => $db_path . "/training/uniref90.filt",
 240     unfiltered => $db_path . "/training/uniref90",
 241   },
 242
 243   ## database used for ported Jpred
 244   ported_db => {
 245     database   => $db_path . "/uniref90.filt",
 246     unfiltered => $db_path . "/uniref90",
 247   },
 248
 249   ## cluster-specific path for Jpred
 250   cluster => {
 251     database   => $db_path . "/uniref90.filt",
 252     unfiltered => $db_path . "/uniref90",
 253   },
 254
 255   ## these other DBs are experimental ones used during development.
 256   ## check they exist before using them.
 257   # generic entry for use with validate_jnet.pl
 258   # real db location defined by validate_jnet.pl
 259   swall => {
 260     database   => $db_path . "/swall/swall.filt",
 261     unfiltered => $db_path . "/swall/swall",
 262   },
 263
 264   # Path to PSIBLAST db
 265   uniprot => {
 266     database   => $db_path . "/3/swall.filt",
 267     unfiltered => $db_path . "/3/swall",
 268   },
 269
 270   uniref50 => {
 271     database   => $db_path . "/6/swall.filt",
 272     unfiltered => $db_path . "/6/swall",
 273   },
 274 };
 275
 276 my $dpf = $database->{$db_entry}{database}.'.pal';
 277 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
 278 pod2usage("ERROR! Input file should be provided with -sequence/-in")                                              unless $infile;
 279 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database")            unless exists $database->{$db_entry};
 280 pod2usage("ERROR! UNIREF filtered   database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
 281 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
 282
 283 #####################################################################################################
 284 # lots of preparation steps
 285 if ( defined $prefix ) {
 286   unless ( defined $outfile ) {
 287     $outfile = $prefix . ".res.fasta";
 288   }
 289 } else {
 290   if ( defined $outfile ) {
 291     print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
 292     $prefix = $outfile . ".tmp";
 293   } else {
 294     print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
 295     $prefix  = $infile . ".res";
 296     $outfile = $prefix . ".jnet";
 297   }
 298 }
 299
 300 if ( 1 > $ncpu or 8 < $ncpu ) {
 301   print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
 302   $ncpu = 1;
 303 }
 304
 305 $VERBOSE = 1 if $DEBUG;
 306
 307 my $LOG;
 308 if ( defined $logfile ) {
 309   open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
 310 }
 311
 312 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
 313   my ( $key, $value ) = @{$_};
 314   defined $value or next;
 315   my $par = join( ": ", $key, $value );
 316   print "$par\n" if $DEBUG;
 317   print $LOG "$par\n" if $LOG;
 318 }
 319
 320 if ( defined $jabaws ) {
 321   setup_jpred_env($Bin);
 322   setup_env($Bin);
 323 }
 324 my $platform = check_OS();
 325 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 326 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
 327
 328 #####################################################################################################
 329 # check input file format
 330 if ( 'seq' eq $goal ) {
 331   $format = "seq";
 332   if ( 1 != check_FASTA_format($infile) ) {
 333     die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
 334   }
 335 } else {
 336   my $nseq = check_FASTA_format($infile);
 337   if ( 0 < $nseq ) {
 338     $format = "fasta";
 339     if ( 1 == $nseq ) {
 340       die "\nERROR! jpred requires alignment with more than 1 sequence\n       if you provide only one sequence use the -seq option.\n";
 341     }
 342   } elsif ( 0 < check_MSF_format($infile) ) {
 343     $format = "msf";
 344   } elsif ( 0 < check_BLC_format($infile) ) {
 345     $format = "blc";
 346   } else {
 347     die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
 348   }
 349 }
 350 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
 351
 352 #####################################################################################################
 353 if ( 'seq' eq $goal ) {
 354   my $query = FASTA::File->new( read_file => $infile );
 355   my @seqs;
 356   my $psi = PSIBLAST->new;
 357   unless ( defined $psiblast && -e $psiblast ) {
 358     ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 359     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 360     # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 361     # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 362     my $psi_run = PSIBLAST::Run->new(
 363       debug    => $DEBUG,
 364       args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 365       path     => $psiblastbin,
 366       input    => $infile,
 367       output   => "$prefix.blast",
 368       matrix   => "$prefix.profile",
 369       database => $database->{$db_entry}{database},
 370     );
 371
 372     # For reduced databases, get the size of the orginal and use this as
 373     # the equivilent DB size
 374     if ( $db_entry =~ /^sp_red_/ ) {
 375       ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 376       $psi_run->args(
 377         $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 378       );
 379     }
 380     print "BLAST matrix: $ENV{BLASTMAT}\n"      if $DEBUG;
 381     print "BLASTDB path: $ENV{BLASTDB}\n"       if $DEBUG;
 382     print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
 383     print $LOG "BLASTDB path: $ENV{BLASTDB}\n"  if $LOG;
 384
 385     print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 386     print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
 387     $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 388
 389     $psi->read_file("$prefix.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 390     system("gzip -9f $prefix.blast");
 391   } else {
 392     if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 393     else                         { $psi->read_file($psiblast) }                        # CC ditto above
 394   }
 395
 396 #####################################################################################################
 397   # Convert the last itteration into a collection of PSISEQ objects
 398   for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 399     my ( $id, $start, $align ) = @{$_};
 400     push @seqs,
 401       PSISEQ->new(
 402       id    => $id,
 403       start => $start,
 404       align => [ split( //, $align ) ]
 405       );
 406   }
 407
 408 #####################################################################################################
 409   # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
 410   # Accuracy won't be as good, but at least it's a prediction
 411   if ( @seqs == 0 ) {
 412     if ( $predNoHits == 0 ) {
 413       warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 414       print ">>100% complete\n";
 415       print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
 416       print $LOG ">>100% complete\n"                                                                         if $LOG;
 417       close($LOG)                                                                                            if $LOG;
 418       exit;
 419     } else {
 420       print ">>50% complete\n";
 421       warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 422       print "Running HMMer on query...\n"                                                            if $VERBOSE;
 423       print $LOG ">>50% complete\n"                                                                  if $LOG;
 424       print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
 425       print $LOG "Running HMMer on query...\n"                                                       if $LOG;
 426
 427       # copy input query to alignment
 428       #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
 429       open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 430       open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
 431       while (<$ifh>) { print $ofh, $_ }
 432       close $ifh;
 433       close $ofh;
 434
 435       # Temp files required for HMMer
 436       my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 437       system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 438       system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 439       unlink $hmmbuild_out;
 440
 441       # Read in the HMMER file
 442       my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 443
 444       # Convert from HMMER::Profile to HMMER::Profile::Jnet
 445       my $hmmer_jnet = HMMER::Profile::Jnet->new;
 446       $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 447       $hmmer_jnet->write_file("$prefix.hmm");
 448       print ">>70% complete\n";
 449       print $LOG ">>70% complete\n" if $LOG;
 450
 451       # Run Jnet for the prediction
 452       print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 453       print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 454       my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
 455       print ">>90% complete\n";
 456       print $LOG ">>90% complete\n" if $LOG;
 457       print $jnetlog                if $VERBOSE;
 458       print $LOG $jnetlog           if $LOG;
 459     }
 460   } else {
 461     psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
 462     print ">>40% complete\n";
 463     print $LOG ">>40% complete\n" if $LOG;
 464
 465     #####################################################################################################
 466     # Make PSIBLAST truncated alignments the right length
 467     print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 468     print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
 469     @seqs = extend( $query, @seqs );
 470     psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 471
 472     #####################################################################################################
 473     # Remove masking from PSIBLAST alignment
 474     print "Unmasking the alignments...\n" if $VERBOSE;
 475     print $LOG "Unmasking the alignments...\n" if $LOG;
 476     my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 477     remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 478     psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 479
 480     #####################################################################################################
 481     # Convert the sequences to upper case
 482     print "Converting sequences to the same case...\n" if $VERBOSE;
 483     print $LOG "Converting sequences to the same case...\n" if $LOG;
 484     toupper($_) for @seqs;    # CC sub is below
 485     psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
 486
 487     #####################################################################################################
 488     # Remove excessive sequences
 489     print "Remove excessive sequences...\n" if $VERBOSE;
 490     print $LOG "Remove excessive sequences...\n" if $LOG;
 491     @seqs = reduce( $MAX_ALIGN, @seqs );
 492     psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 493
 494     #####################################################################################################
 495     # Remove sequences that are too long or too short
 496     print "Remove sequences which too long or short...\n" if $VERBOSE;
 497     print $LOG "Remove sequences which too long or short...\n" if $LOG;
 498     @seqs = del_long_seqs( 50, @seqs );
 499     psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 500
 501     #####################################################################################################
 502     # Remove redundant sequences based upon pairwise identity and OC clustering
 503     print "Remove redundant sequences...\n" if $VERBOSE;
 504     print $LOG "Remove redundant sequences...\n" if $LOG;
 505     @seqs = nonred( $NR_CUT, @seqs );
 506     psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 507
 508     #####################################################################################################
 509     # Check that we haven't got rid of all of the sequences
 510     if ( @seqs < 2 ) {
 511       warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 512       print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
 513       @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
 514     }
 515     unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
 516
 517     # Remove gaps in the query sequence and same positions in the alignment
 518     print "Removing gaps in the query sequence...\n" if $VERBOSE;
 519     print $LOG "Removing gaps in the query sequence...\n" if $LOG;
 520     degap(@seqs);
 521     psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 522
 523     # Output the alignment for the prediction
 524     print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 525     print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
 526     psiseq2fasta( "$prefix.align", @seqs );
 527
 528     #####################################################################################################
 529     # Equivilent to getpssm script
 530     print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 531     print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
 532     my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
 533     $pssm->write_file("$prefix.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 534     print ">>50% complete\n";
 535     print $LOG ">>50% complete\n" if $LOG;
 536
 537     #####################################################################################################
 538     # Run HMMER on the sequences
 539     print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 540     print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
 541     my $hmmer = hmmer(@seqs);             # CC sub is below
 542     $hmmer->write_file("$prefix.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 543     print ">>70% complete\n";
 544     print $LOG ">>70% complete\n" if $LOG;
 545 #####################################################################################################
 546     # Run Jnet for the prediction
 547     print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 548     print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
 549     my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) );    # CC sub is below
 550     print ">>90% complete\n";
 551     print $LOG ">>90% complete\n" if $LOG;
 552     print $jnetlog                if $VERBOSE;
 553     print $LOG $jnetlog           if $LOG;
 554   }
 555 } else {
 556   if ( 'fasta' eq $format ) {
 557     print "Read FASTA file\n";
 558     my $hmmer1 = hmm_for_align($infile);
 559     $hmmer1->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 560   } elsif ( 'msf' eq $format ) {
 561     msf2fasta( $infile, $infastafile );
 562     my $hmmer2 = hmm_for_align($infastafile);
 563     $hmmer2->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 564   } elsif ( 'blc' eq $format ) {
 565     my $tmpfile = File::Temp->new->filename;
 566     blc2msf( $infile, $tmpfile );
 567     msf2fasta( $infile, $infastafile );
 568     my $hmmer3 = hmm_for_align($infastafile);
 569     $hmmer3->write_file("$prefix.hmm");                              # write_file is in the Read.pm called from the HMMER::Profile module
 570   } else {
 571     die "ERROR! unknown input format '$format'. exit...\n";
 572   }
 573   print ">>70% complete\n";
 574   print $LOG ">>70% complete\n" if $LOG;
 575 #####################################################################################################
 576   # Run Jnet for the prediction
 577   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 578   print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
 579   my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );           # CC sub is below
 580   print ">>90% complete\n";
 581   print $LOG ">>90% complete\n" if $LOG;
 582   print $jnetlog                if $VERBOSE;
 583   print $LOG $jnetlog           if $LOG;
 584 }
 585
 586 unless ( defined $nofinal ) {
 587   my $aligfile      = $prefix . ".align";
 588   my $jnetfile      = $prefix . ".jnet";
 589   my $jnetfastafile = $prefix . ".jnet.fasta";
 590   $aligfile = $infile if ( 'fasta' eq $format );
 591   $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
 592   concise2fasta( $jnetfile, $jnetfastafile );
 593   open( my $IN1, "<", $aligfile )      or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 594   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 595   open( my $OUT, ">", $outfile )       or die "ERROR! unable to open '$outfile': ${!}\nDied";
 596   while (<$IN2>) { print $OUT $_; }
 597   while (<$IN1>) { print $OUT $_; }
 598   close($IN1);
 599   close($IN2);
 600   close($OUT);
 601   unlink $jnetfastafile;
 602   my $seqs = FASTA::File->new( read_file => $outfile );
 603   open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
 604   $seqs->write($OUT2);
 605   close($OUT2);
 606 } else {
 607   if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
 608     rename $prefix . ".jnet", $outfile;
 609   }
 610 }
 611
 612 print ">>100% complete\n";
 613 print "Jpred Finished\n";
 614 print $LOG ">>100% complete\n" if $LOG;
 615 print $LOG "Jpred Finished\n"  if $LOG;
 616 close($LOG)                    if $LOG;
 617 exit;
 618
 619 #####################################################################################################
 620 # Functions
 621 #####################################################################################################
 622 sub check_FASTA_format {
 623   my $infile = shift;
 624
 625   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 626   my $check_first_line = 1;
 627   my $nseq             = 0;
 628   local $/ = "\n>";
 629   while (<$IN>) {
 630     if ($check_first_line) {
 631       return 0 unless (/^>/);
 632       $check_first_line = 0;
 633     }
 634     s/^>//g;
 635     s/>$//g;
 636
 637     my ( $id, @seqs ) = split /\n/, $_;
 638     return 0 unless ( defined $id or @seqs );
 639     my $seq = join( "", @seqs );
 640     return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
 641     ++$nseq;
 642   }
 643   close($IN);
 644
 645   return $nseq;
 646 }
 647 #####################################################################################################
 648 sub check_MSF_format {
 649   my $infile = shift;
 650   $? = 0;
 651   system("$readseq -fMSF -a -p < $infile > /dev/null");
 652   return 0 if ($?);
 653
 654   return 1;
 655 }
 656 #####################################################################################################
 657 sub check_BLC_format {
 658   my $infile = shift;
 659
 660   my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
 661   open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
 662   print $fh "silent_mode\nblock_file $infile\n";
 663   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 664   print $fh "pir_save $tmpfile2\nsetup\n";
 665   close $fh;
 666
 667   $? = 0;
 668   system("$alscript -s -f $tmpfile1");
 669   $? = 0;
 670   system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
 671   unlink $tmpfile1;
 672   unlink $tmpfile2;
 673   return 0 if ($?);
 674
 675   return 1;
 676 }
 677 ############################################################################################################################################
 678 # convertor BLC -> MSF
 679 sub msf2fasta {
 680   my $infile  = shift;
 681   my $outfile = shift;
 682
 683   my $sequence = "";
 684   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
 685   while (<$IN>) {
 686     $sequence .= $_;
 687   }
 688   close($IN);
 689
 690   $sequence =~ s/~/\./g;
 691
 692   ## check for non-unique sequence names - readseq has problems with these
 693   my @lines = split /\n/, $sequence;
 694   my %names;
 695   foreach my $line (@lines) {
 696     if ( $line =~ /Name:\s+(\S+)/ ) {
 697       die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
 698       $names{$1}++;
 699     }
 700   }
 701
 702   my $tmpfile1 = File::Temp->new->filename;
 703   my $tmpfile2 = File::Temp->new->filename;
 704   open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
 705   print $FILE $sequence;
 706   close $FILE;
 707
 708   $? = 0;
 709   system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
 710   die "Reformatting of $infile failed. exit...\n" if ($?);
 711   unlink $tmpfile1;
 712
 713   $? = 0;
 714   system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
 715   die "Reformatting of $infile failed. exit...\n" if ($?);
 716   unlink $tmpfile2;
 717
 718   die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
 719 }
 720
 721 ############################################################################################################################################
 722 # convertor BLC -> MSF
 723 sub blc2msf {
 724   my ( $in, $out ) = @_;
 725
 726   my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
 727
 728   open my $fh, ">$tmp" or die "$tmp: $!\n";
 729   print $fh "silent_mode\nblock_file $infile\n";
 730   print $fh "output_file /dev/null\nmax_nseq 2000\n";
 731   print $fh "pir_save $tmp_pir\nsetup\n";
 732   close $fh;
 733
 734   $? = 0;
 735   system("$alscript -s -f $tmp");
 736   die "Reformatting of $infile failed. exit...\n" if ($?);
 737   system("$readseq -f=MSF -o=$out -a $tmp_pir");
 738
 739   unlink $tmp;
 740   unlink $tmp_pir;
 741 }
 742
 743 #####################################################################################################
 744
 745 =begin :private
 746
 747 =head2 fasta2concise ($infile, $outfile)
 748
 749 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
 750
 751 =cut
 752
 753 sub fasta2concise {
 754   my $infile  = shift;
 755   my $outfile = shift;
 756
 757   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 758   my ( $seq, @seqs, @title );
 759   while (<$IN>) {
 760     if (s/^>//) {
 761       if ($seq) {
 762         $seq =~ s/\n|\s//g;
 763         $seq =~ s/(.)/$1,/g;
 764         push @seqs, $seq;
 765         $seq = "";
 766       }
 767       chomp;
 768       push @title, $_;
 769     } else {
 770       chomp;
 771       $seq .= $_;
 772     }
 773   }
 774   $seq =~ s/\n|\s//g;
 775   $seq =~ s/(.)/$1,/g;
 776   push @seqs, $seq;
 777   close($IN);
 778
 779   if ( @title != @seqs ) {
 780     die("non matching number of titles and sequences!\n");
 781   }
 782
 783   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 784   foreach ( 0 .. $#title ) {
 785     print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 786   }
 787   close($OUT);
 788 }
 789
 790 =begin :private
 791
 792 =head2 concise2fasta ($infile, $outfile)
 793
 794 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
 795
 796 =cut
 797
 798 #####################################################################################################
 799 sub concise2fasta {
 800   my $infile  = shift;
 801   my $outfile = shift;
 802
 803   my ( @seqs, %seq, @pred, %pred );
 804
 805   my @var = (
 806     "Lupas_21",  "Lupas_14", "Lupas_28", "MULTCOIL",  "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
 807     "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0",        "jnetpred",       "jpred"
 808   );
 809
 810   # parse input concise file
 811   open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
 812   while (<$IN>) {
 813     unless (/^\n/) {
 814       my ( $id, $seq ) = split( ":", $_ );
 815       if ( defined $id and defined $seq ) {    # Check we have proper values
 816         $seq =~ s/,//g;
 817         chomp $seq;
 818         if ( $id =~ /;/ ) {                    # Then it's an alignment
 819           @_ = split( ";", $id );
 820           push @seqs, $_[1];
 821           $seq{ $_[1] } = $seq;
 822         }
 823         foreach my $v (@var) {
 824           if ( $v eq $id ) {
 825             push @pred, $v;
 826             $pred{$v} = $seq;
 827           }
 828         }
 829       }
 830     }
 831   }
 832   close($IN);
 833
 834   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 835
 836   # print out sequences found with PSI-BLAST
 837   foreach (@seqs) {
 838     $seq{$_} =~ s/(.{72})/$1\n/g;
 839     print $OUT ">$_\n$seq{$_}\n";
 840   }
 841
 842   # print out predictions
 843   foreach my $p (@pred) {
 844     $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
 845     $pred{$p} =~ s/B/E/g;
 846     $pred{$p} =~ s/G/H/g;
 847
 848     if ( $p =~ /SOL/ ) {
 849       $pred{$p} =~ s/E/B/g;
 850     }
 851     $pred{$p} =~ s/(.{72})/$1\n/g;
 852     print $OUT ">$p\n$pred{$p}\n";
 853   }
 854   close($OUT);
 855 }
 856
 857 =begin :private
 858
 859 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 860
 861 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 862 Does it in place.
 863
 864 =cut
 865
 866 #####################################################################################################
 867 sub remove_seq_masks {
 868   my $idx = shift;
 869   my $seq = shift;
 870
 871   # Only bother if we have a sequence which has been masked
 872   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 873
 874   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 875   my $searchdb = $database->{$db_entry}{unfiltered};
 876   $start--;    # We need this to be zero based
 877
 878   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 879   my @seqs = $f->get_entries;
 880   my $db   = shift @seqs;
 881
 882   unless ($db) {
 883     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 884   } elsif (@seqs) {
 885     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 886   }
 887
 888   my @db_seq = @{ $db->seq };
 889
 890   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 891   my $i = 0;
 892   for ( 0 .. $#align ) {
 893     next if $align[$_] =~ /$GAP/;
 894
 895     if ( $align[$_] eq 'X' ) {
 896       unless ( defined $db_seq[ $i + $start ] ) {
 897         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 898       }
 899       $align[$_] = $db_seq[ $i + $start ];
 900     }
 901     $i++;
 902   }
 903   $seq->align( \@align );
 904 }
 905
 906 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 907
 908 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 909
 910 Equivilent to reduce script.
 911
 912 =cut
 913
 914 #####################################################################################################
 915 sub reduce {
 916   my ( $max, @seqs ) = @_;
 917
 918   my $num = int( @seqs / $max );
 919   my @res = shift @seqs;
 920
 921   # Don't bother if we have less than the required sequences
 922   if ( @seqs < $max or 0 == $num ) {
 923     return @res, @seqs;
 924   }
 925
 926   for ( 0 .. $#seqs ) {
 927     push @res, $seqs[$_] if ( 0 == $_ % $num );
 928   }
 929
 930   return @res;
 931 }
 932
 933 =head2 nonred($cutoff, @PSISEQS);
 934
 935 Removes redundant sequences at $cutoff level of sequence identity.
 936
 937 Equivilent to nonred script.
 938
 939 =cut
 940
 941 #####################################################################################################
 942 sub nonred {
 943   my ( $cutoff, @seqs ) = @_;
 944
 945   # Run pairwise and then OC and remove similar sequences
 946   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 947   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 948   if     ( @seqs == 1 ) {
 949     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 950     return @seqs;
 951   }
 952
 953   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 954   my $fasta = FASTA::File->new;
 955   foreach (@seqs) {
 956     my $f = FASTA->new( id => $_->id );
 957     $f->seq( @{ $_->align } );
 958     $fasta->add_entries($f);
 959   }
 960
 961   # Run pairwise
 962   my $pair = Pairwise->new( path => $pairwise );
 963   my $foo = join "\n", $pair->run($fasta) or die $!;
 964
 965   # Run OC
 966   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 967   $ocres->read_string( join "", $ocres->run($foo) );
 968
 969   # Now found those entries that are unrelated
 970   my @keep;
 971   for ( $ocres->get_groups ) {
 972     my ( $group, $score, $size, @labels ) = @{$_};
 973
 974     # Keep the QUERY from the cluster, otherwise just get onelt
 975     if ( grep { /QUERY/ } @labels ) {
 976
 977       # Make sure the query stays at the start
 978       unshift @keep, "QUERY";
 979       next;
 980     } else {
 981       push @keep, shift @labels;
 982     }
 983   }
 984   push @keep, $ocres->get_unclustered;
 985
 986   # Return the matching entries.
 987   # We use the temporay to mimic the selection of sequences in the nonred
 988   # script, which took the sequences in the order they occured in the file.
 989   my @filtered_res;
 990   for (@seqs) {
 991     my $label = $_->id;
 992     if ( grep { $_ =~ /^$label$/ } @keep ) {
 993       push @filtered_res, $_;
 994     }
 995   }
 996
 997   return @filtered_res;
 998 }
 999
1000 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1001
1002 Removes those sequences that are more than $percentage longer or shorter than the first
1003 sequence in @seqs. @seqs is an array of PSISEQ objects.
1004
1005 Equivilent to trim_seqs script.
1006
1007 =cut
1008
1009 #####################################################################################################
1010 sub del_long_seqs {
1011   my ( $factor, @seqs ) = @_;
1012
1013   # Keep the query sequence (we assume query is the 1st FASTA record)
1014   my $query = shift @seqs;
1015   my @res   = $query;
1016
1017   # Find the length of the query sequence without any gaps
1018   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1019
1020   # Calculate the upper and lower allowed bounds
1021   my ( $upper, $lower );
1022   {
1023     my $bounds = ( $length / 100 ) * $factor;
1024     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1025   }
1026
1027   # Now try each sequence and see how long they are
1028   for (@seqs) {
1029     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1030     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1031   }
1032
1033   return @res;
1034 }
1035
1036 =head2 toupper ($PSISEQ)
1037
1038 Converts sequence held in PSISEQ object to uppercase.
1039
1040 =cut
1041
1042 #####################################################################################################
1043 sub toupper {
1044   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1045 }
1046
1047 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1048
1049 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1050
1051 Equivilent to fasta2jnet script.
1052
1053 =cut
1054
1055 #####################################################################################################
1056 sub degap {
1057   my (@seqs) = @_;
1058
1059   # Find where the gaps in the query sequence are
1060   my @gaps;
1061   my $i = 0;
1062   for ( @{ $seqs[0]->align } ) {
1063     push @gaps, $i if $_ !~ /$GAP/;
1064     $i++;
1065   }
1066
1067   return unless @gaps;
1068
1069   # Remove the gaps in all the sequences.
1070   # Derefences the array reference and takes a slice inside the method call
1071   # arguments, then coerce back to an array ref.
1072   for (@seqs) {
1073     $_->align( [ @{ $_->align }[@gaps] ] );
1074   }
1075 }
1076
1077 =head2 getfreq($filename, @PSISEQS)
1078
1079 Creates a PSIBLAST like profile and writes it to the $filename.
1080
1081 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1082 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1083 but it's *not* the same.
1084
1085 =cut
1086
1087 #####################################################################################################
1088 sub getfreq {
1089   my ( $fn, @seqs ) = @_;
1090
1091   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1092
1093   # Create a PSIBLAST like profile from the alignment
1094   # This doesn't greate the same result as old Jpred, I think this is because
1095   # differences in the input between old and new
1096
1097   # For testing
1098   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1099   #   my @profile = profile(
1100   #      map { join "", @{ $_->seq } } $f->get_entries
1101   #   );
1102
1103   my @profile = profile( map { join "", @{ $_->align } } @seqs );
1104
1105   open my $fh, ">$fn" or die "JPRED: $fn: $!";
1106   print $fh join( " ", @{$_} ), "\n" for @profile;
1107 }
1108
1109 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1110
1111 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1112
1113 Equivilent to gethmm script.
1114
1115 =cut
1116
1117 #####################################################################################################
1118 sub hmmer {
1119   my (@seqs) = @_;
1120
1121   # Temp files required
1122   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1123
1124   # Create the FASTA file
1125   psiseq2fasta( $tmp_fasta, @seqs );
1126
1127   # Run HMMer
1128   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1129   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1130
1131   # Read in the HMMER file
1132   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1133
1134   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1135   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1136   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1137
1138   return $hmmer_jnet;
1139 }
1140
1141 =head2 jnet($hmmer, $out, $pssm )
1142
1143 Runs Jnet. Pass the paths of the filenames to do the prediction on
1144
1145 =cut
1146
1147 #################################################################################################################################################
1148
1149 =head2 get_hmm($alignFile, $name)
1150
1151 Adapted from the hmmer() function in Jpred as written by JDB.
1152
1153 Generates an HMMer profile output compatible with Jnet.
1154
1155 Equivilent to gethmm script.
1156
1157 =cut
1158
1159 sub hmm_for_align {
1160   my $fastafile = shift;
1161
1162   # Temp files required
1163   print "hmm_for_align: fastafile = $fastafile\n";
1164   my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1165
1166   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1167   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1168
1169   # Read in the HMMER file
1170   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1171
1172   # Read in the HMMER file
1173   my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1174
1175   # Convert from HMMER::Profile to HMMER::Profile::Jnet
1176   my $hmmer_jnet = HMMER::Profile::Jnet->new;
1177   $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1178
1179   return $hmmer_jnet;
1180 }
1181
1182 #####################################################################################################
1183 sub jnet {
1184   my ( $hmmer, $outfile, $pssm ) = @_;
1185   if ($pssm) {
1186
1187     # run Jnet prediction with all the profile data
1188     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1189   } else {
1190
1191     # run Jnet prediction with only HMMer and alignment data
1192     system("$jnet --concise --hmmer $hmmer > $outfile");
1193   }
1194   check( "jnet", $? ) or exit 1;
1195   my $res     = "";
1196   my $logging = "";
1197   open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1198   while (<$IN>) {
1199     if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1200       $res .= $_;
1201     } else {
1202       $logging .= $_;
1203     }
1204   }
1205   close $IN;
1206   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1207   print $OUT "\n" . $res;
1208   close $OUT;
1209   return $logging;
1210 }
1211
1212 =head1 psiseq2fasta($path, @PSISEQ)
1213
1214 Writes a FASTA file given an array of PSISEQ objects.
1215
1216 =cut
1217
1218 #####################################################################################################
1219 sub psiseq2fasta {
1220   my ( $fn, @seqs ) = @_;
1221
1222   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1223   confess "JPRED: not passed PSISEQs" unless @seqs;
1224
1225   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1226
1227   # Changed from using FASTA::File object due to seg faulting on some large,
1228   # long alignments, presumably due to running out of memory.
1229   #   my $fasta = FASTA::File->new;
1230   #   $fasta->add_entries(
1231   #      map {
1232   #         my $f = FASTA->new(id => $_->id);
1233   #         $f->seq(@{$_->align});
1234   #         $f;
1235   #      } @seqs
1236   #   );
1237   #   $fasta->write_file($fn);
1238
1239   my $fh;
1240   $fn =~ /\.gz$/
1241     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1242     : ( open $fh, ">$fn" or die "JPRED: $!" );
1243
1244   for (@seqs) {
1245     print $fh ">", $_->id, "\n";
1246     my $seq = join( "", @{ $_->align } );
1247     $seq =~ s/(.{72})/$1\n/g;
1248     chomp $seq;
1249     print $fh $seq . "\n";
1250   }
1251   close $fh;
1252 }
1253
1254 =head1 @PSISEQ = fasta2psiseq($path)
1255
1256 Reads in a FASTA file and returns an array of PSISEQ objects.
1257
1258 =cut
1259
1260 #####################################################################################################
1261 sub fasta2psiseq {
1262   my ($fn) = @_;
1263
1264   my $fasta =
1265     $fn =~ /\.gz$/
1266     ? FASTA::File->new( read_gzip_file => $fn )
1267     : FASTA::File->new( read_file      => $fn );
1268   $fasta or die "JPRED: $!";
1269
1270   my @seqs;
1271   for ( $fasta->get_entries ) {
1272     my $seq = PSISEQ->new;
1273     $seq->id( $_->id );
1274     $seq->align( [ @{ $_->seq } ] );
1275     push @seqs, $seq;
1276   }
1277
1278   return @seqs;
1279 }
1280
1281 #####################################################################################################
1282 sub fasta2psiseq_lowmemory {
1283   my $filename = shift;
1284
1285   local $/ = "\n";
1286   my $fh;
1287   $filename =~ /\.gz$/
1288     ? ( open $fh, "gzip -dc $filename|" or die $! )
1289     : ( open $fh, $filename or die $! );
1290
1291   my @seqs;
1292   my ( $id, $seq );
1293   while (<$fh>) {
1294     chomp;
1295     if (s/^>//) {
1296       $seq =~ s/ //g if defined $seq;
1297
1298       if ( $id and $seq ) {
1299         my $psi = PSISEQ->new( id => $id );
1300         $psi->align( [ split //, $seq ] );
1301         push @seqs, $psi;
1302         undef $id;
1303         undef $seq;
1304       } else {
1305         $id = $_;
1306       }
1307     } else {
1308       $seq .= $_;
1309     }
1310   }
1311   if ( $id and $seq ) {
1312     my $psi = PSISEQ->new( id => $id );
1313     $psi->seq($seq);
1314     push @seqs, $psi;
1315   }
1316
1317   return @seqs;
1318 }
1319
1320 #####################################################################################################
1321
1322 =head1 extend($FASTA::File, @PSISEQ)
1323
1324 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1325 against the query, this function extends the alignment so that all of the query is
1326 present in the alignment. The other sequences are extended with gap characters.
1327
1328 =cut
1329
1330 sub extend {
1331   my ( $query, @seqs ) = @_;
1332
1333   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1334   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
1335
1336   # Get the query sequence
1337   my $q_query = $query->get_entry(0);
1338
1339   # Get the query sequence from the PSIBLAST alignment
1340   my $a_query = shift @seqs;
1341
1342   # The gap size required to expand the alignment
1343   my ( $start_gap_length, $end_gap_length );
1344   {
1345
1346     # Make handling the sequence easier
1347     my $q_seq = join "", @{ [ $q_query->seq ] };
1348     my $a_seq = join "", @{ $a_query->align };
1349
1350     $q_seq =~ s/-//g;
1351     $a_seq =~ s/-//g;
1352
1353     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1354
1355     $start_gap_length = index( $q_seq, $a_seq );
1356     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1357
1358     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1359   }
1360
1361   # Add the gaps to the end of the alignments
1362   for (@seqs) {
1363     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1364   }
1365
1366   # Put the query sequence back
1367   unshift @seqs,
1368     PSISEQ->new(
1369     id    => $a_query->id,
1370     align => [
1371       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1372       @{ $a_query->align },
1373       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1374     ]
1375     );
1376
1377   return @seqs;
1378 }
1379
1380 #####################################################################################################
1381
1382 =head1 $int = fasta_seq_length($path)
1383
1384 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1385
1386 =cut
1387
1388 sub fasta_seq_length {
1389   my ($fn) = @_;
1390   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1391
1392   my $length = 0;
1393   local $_, $/ = "\n";
1394   while (<$fh>) {
1395     next if /^>/;
1396     chomp;
1397     $length += tr///c;
1398   }
1399
1400   return $length;
1401 }