binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9   jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence.
  14 It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -sequence path to input FASTA file
  24
  25 The path to the sequence (in FASTA format) you want to predict.
  26
  27 =item -output file
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
  30
  31 =item --db database
  32
  33 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
  34
  35 Default: uniref90
  36
  37 =item -psi path
  38
  39 Path to a PSIBLAST output file.
  40
  41 =item --pred-nohits
  42
  43 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  44
  45 =item --verbose
  46
  47 Verbose mode. Print more information while running jpred.
  48
  49 =item --debug
  50
  51 Debug mode. Print debugging information while running jpred.
  52
  53 =item --help
  54
  55 Gives help on the programs usage.
  56
  57 =item --man
  58
  59 Displays this man page.
  60
  61 =back
  62
  63 =head1 BUGS
  64
  65 Can't cope with gaps in the query sequence.
  66
  67 Doesn't accept alignments.
  68
  69 If you find any others please contact me.
  70
  71 =head1 AUTHORS
  72
  73 Jonathan Barber <jon@compbio.dundee.ac.uk>
  74 Chris Cole <christian@cole.name>
  75 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
  76
  77 =cut
  78
  79 ## TODO check for gaps in query sequence
  80 ##      check for disallowed chars in sequence
  81
  82 use strict;
  83 use warnings;
  84 use Getopt::Long;
  85 use Pod::Usage;
  86 use Carp;
  87 use Data::Dumper;
  88 use File::Temp;
  89
  90 #use UNIVERSAL;
  91
  92 # path to Jpred modules
  93 use FindBin qw($Bin);
  94 use lib "$Bin/lib";
  95
  96 # internal jpred modules
  97 use Jpred;
  98 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
  99 use HMMER::Profile;
 100 use HMMER::Profile::Jnet;
 101 use PSIBLAST;
 102 use PSIBLAST::PSSM;
 103 use PSIBLAST::Run;
 104 use FASTA::File;
 105 use FASTA;
 106 use Index;
 107 use Pairwise;
 108 use OC;
 109 use Utils qw(profile);
 110 use Run qw(check);
 111
 112 our $VERSION = '3.0.2';
 113 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 114 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 115
 116 # Gap characters in sequences
 117 our $GAP = '-';
 118
 119 #####################################################################################################
 120 # Light object to hold sequence information, light to prevent memory overload
 121 # for big search results. Even so this is quite memory intensive, I can't see
 122 # how to reduce the size further.
 123 {
 124
 125   package PSISEQ;
 126
 127   sub new {
 128     my ( $class, %args ) = @_;
 129     my $self = [];
 130     bless $self, $class;
 131     for ( keys %args ) { $self->$_( $args{$_} ) }
 132     return $self;
 133   }
 134
 135   sub id {
 136     return defined $_[1]
 137       ? $_[0]->[0] = $_[1]
 138       : $_[0]->[0];
 139   }
 140
 141   sub start {
 142     return defined $_[1]
 143       ? $_[0]->[1] = $_[1]
 144       : $_[0]->[1];
 145   }
 146
 147   sub align {
 148     if ( defined $_[1] ) {
 149       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 150
 151       $_[0]->[2] = join( "", @{ $_[1] } );
 152     } else {
 153       return [ split( //, $_[0]->[2] ) ];
 154     }
 155   }
 156 }
 157
 158 #####################################################################################################
 159 my $infile;
 160 my $output;
 161 my $psiblast;
 162 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 163 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 164 my $db_path    = ".";
 165 my $db_entry   = "unknown";
 166
 167 my ( $help, $man, $DEBUG, $VERBOSE );
 168
 169 GetOptions(
 170   "in|sequence=s" => \$infile,
 171   "out|output=s"  => \$output,
 172   "psi=s"         => \$psiblast,
 173   "dbname=s"      => \$db_entry,
 174   "dbpath=s"      => \$db_path,
 175   "ncpu=s"        => \$ncpu,
 176   "pred-nohits"   => \$predNoHits,
 177   "help"          => \$help,
 178   "man"           => \$man,
 179   "debug"         => \$DEBUG,
 180   "verbose"       => \$VERBOSE
 181 ) or pod2usage(2);
 182 pod2usage(1) if $help;
 183 pod2usage( verbose => 2 ) if $man;
 184
 185 #####################################################################################################
 186 # Key to database information and information for accessing them
 187 my $database = {
 188
 189   ## default database used by Jpred
 190   uniref90 => {
 191     database   => $db_path . "/uniref90.filt",
 192     unfiltered => $db_path . "/uniref90",
 193   },
 194
 195   ## database used during Jnet training
 196   training => {
 197     database   => $db_path . "/training/uniref90.filt",
 198     unfiltered => $db_path . "/training/uniref90",
 199   },
 200
 201   ## database used for ported Jpred
 202   ported_db => {
 203     database   => $db_path . "/databases/uniref90.filt",
 204     unfiltered => $db_path . "/databases/uniref90",
 205   },
 206   ## cluster-specific path for Jpred
 207   cluster => {
 208     database   => "/homes/www-jpred/databases/uniref90.filt",
 209     unfiltered => "/homes/www-jpred/databases/uniref90",
 210   },
 211
 212   ## these other DBs are experimental ones used during development.
 213   ## check they exist before using them.
 214   swall => {
 215
 216     # generic entry for use with validate_jnet.pl
 217     # real db location defined by validate_jnet.pl
 218     database   => $db_path . "/swall/swall.filt",
 219     unfiltered => $db_path . "/swall/swall",
 220   },
 221
 222   uniprot => {
 223
 224     # Path to PSIBLAST db
 225     database   => $db_path . "/3/swall.filt",
 226     unfiltered => $db_path . "/3/swall",
 227   },
 228   uniref50 => {
 229     database   => $db_path . "/6/swall.filt",
 230     unfiltered => $db_path . "/6/swall",
 231   },
 232 };
 233
 234 pod2usage(' --sequence argument not provided') unless $infile;
 235 die "--db $db_entry not recognised" unless exists $database->{$db_entry};
 236 $output = $infile . ".res" unless $output;
 237 $ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
 238
 239 for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
 240   my ( $key, $value ) = @{$_};
 241   defined $value or next;
 242   errlog( join( ": ", $key, $value ), "\n" );
 243 }
 244
 245 #####################################################################################################
 246 my $platform = check_OS();
 247 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 248
 249 #####################################################################################################
 250 my $query = FASTA::File->new( read_file => $infile );
 251
 252 my @seqs;
 253 my $psi = PSIBLAST->new;
 254 unless ( defined $psiblast && -e $psiblast ) {
 255   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 256   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 257   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 258   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 259   my $psi_run = PSIBLAST::Run->new(
 260     debug    => $DEBUG,
 261     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 262     path     => $psiblastbin,
 263     input    => $infile,
 264     output   => "$output.blast",
 265     matrix   => "$output.profile",
 266     database => $database->{$db_entry}{database},
 267   );
 268
 269   # For reduced databases, get the size of the orginal and use this as
 270   # the equivilent DB size
 271   if ( $db_entry =~ /^sp_red_/ ) {
 272     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 273     $psi_run->args(
 274       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 275     );
 276   }
 277   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 278   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 279
 280   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 281   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 282
 283   $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 284   system("gzip -9f $output.blast");
 285 } else {
 286   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 287   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 288 }
 289
 290 #####################################################################################################
 291 # Convert the last itteration into a collection of PSISEQ objects
 292 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 293   my ( $id, $start, $align ) = @{$_};
 294   push @seqs,
 295     PSISEQ->new(
 296     id    => $id,
 297     start => $start,
 298     align => [ split( //, $align ) ]
 299     );
 300 }
 301
 302 #####################################################################################################
 303 ## When there are no PSI-BLAST hits generate an HMM from the query
 304 ## and run Jnet against that only.
 305 ## Accuracy won't be as good, but at least it's a prediction
 306 if ( @seqs == 0 ) {
 307   if ( $predNoHits == 0 ) {
 308     warn "\nJPRED: Warning! no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
 309     print ">>100% complete\n";
 310     exit;
 311   } else {
 312     print ">>50% complete\n";
 313     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 314     print "Running HMMer on query...\n" if $VERBOSE;
 315
 316     # copy input query to alignment
 317     #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 318     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 319     open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
 320     while (<$ifh>) { print $ofh, $_ }
 321     close $ifh;
 322     close $ofh;
 323
 324     # Temp files required for HMMer
 325     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 326
 327     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 328     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 329
 330     # Read in the HMMER file
 331     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 332
 333     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 334     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 335     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 336     $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 337     print ">>70% complete\n";
 338
 339     # Run Jnet for the prediction
 340     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 341     jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
 342   }
 343 } else {
 344   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
 345   print ">>40% complete\n";
 346
 347   #####################################################################################################
 348   # Make PSIBLAST truncated alignments the right length
 349   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 350   @seqs = extend( $query, @seqs );                  # CC sub si below
 351   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 352
 353   #####################################################################################################
 354   # Remove masking from PSIBLAST alignment
 355   print "Unmasking the alignments...\n" if $VERBOSE;
 356   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 357   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 358   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 359
 360   #####################################################################################################
 361   # Convert the sequences to upper case
 362   print "Converting sequences to the same case...\n" if $VERBOSE;
 363   toupper($_) for @seqs;    # CC sub is below
 364   psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
 365
 366   #####################################################################################################
 367   # Remove excessive sequences
 368   print "Remove excessive sequences...\n" if $VERBOSE;
 369   @seqs = reduce( $MAX_ALIGN, @seqs );
 370   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 371
 372   #####################################################################################################
 373   # Remove sequences that are too long or too short
 374   print "Remove sequences which too long or short...\n" if $VERBOSE;
 375   @seqs = del_long_seqs( 50, @seqs );
 376   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 377
 378   #####################################################################################################
 379   # Remove redundant sequences based upon pairwise identity and OC clustering
 380   print "Remove redundant sequences...\n" if $VERBOSE;
 381   @seqs = nonred( $NR_CUT, @seqs );
 382   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 383
 384   #####################################################################################################
 385   # Check that we haven't got rid of all of the sequences
 386   if ( @seqs < 2 ) {
 387     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 388     @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 389   }
 390   unlink("${output}_backup.fasta.gz") unless $DEBUG;
 391
 392   # Remove gaps in the query sequence and same positions in the alignment
 393   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 394   degap(@seqs);
 395   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 396
 397   # Output the alignment for the prediction
 398   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 399   psiseq2fasta( "$output.align", @seqs );
 400
 401   #####################################################################################################
 402   # Equivilent to getpssm script
 403   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 404   my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
 405   $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 406   print ">>50% complete\n";
 407
 408   #####################################################################################################
 409   # Run HMMER on the sequences
 410   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 411   my $hmmer = hmmer(@seqs);             # CC sub is below
 412   $hmmer->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 413   print ">>70% complete\n";
 414
 415   #####################################################################################################
 416   # Run Jnet for the prediction
 417   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 418   jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
 419 }
 420
 421 print ">>100% complete\n";
 422 print "Jpred Finished\n";
 423 exit;
 424
 425 #####################################################################################################
 426 # Functions
 427 #####################################################################################################
 428
 429 =begin :private
 430
 431 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 432
 433 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 434 Does it in place.
 435
 436 =cut
 437
 438 #####################################################################################################
 439 sub remove_seq_masks {
 440   my $idx = shift;
 441   my $seq = shift;
 442
 443   # Only bother if we have a sequence which has been masked
 444   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 445
 446   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 447   my $searchdb = $database->{$db_entry}{unfiltered};
 448   $start--;    # We need this to be zero based
 449
 450   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 451   my @seqs = $f->get_entries;
 452   my $db   = shift @seqs;
 453
 454   unless ($db) {
 455     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 456   } elsif (@seqs) {
 457     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 458   }
 459
 460   my @db_seq = @{ $db->seq };
 461
 462   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 463   my $i = 0;
 464   for ( 0 .. $#align ) {
 465     next if $align[$_] =~ /$GAP/;
 466
 467     if ( $align[$_] eq 'X' ) {
 468       unless ( defined $db_seq[ $i + $start ] ) {
 469         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 470       }
 471       $align[$_] = $db_seq[ $i + $start ];
 472     }
 473     $i++;
 474   }
 475   $seq->align( \@align );
 476 }
 477
 478 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 479
 480 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 481
 482 Equivilent to reduce script.
 483
 484 =cut
 485
 486 #####################################################################################################
 487 sub reduce {
 488   my ( $max, @seqs ) = @_;
 489
 490   my $num = int( @seqs / $max );
 491   my @res = shift @seqs;
 492
 493   # Don't bother if we have less than the required sequences
 494   if ( @seqs < $max or 0 == $num ) {
 495     return @res, @seqs;
 496   }
 497
 498   for ( 0 .. $#seqs ) {
 499     push @res, $seqs[$_] if ( 0 == $_ % $num );
 500   }
 501
 502   return @res;
 503 }
 504
 505 =head2 nonred($cutoff, @PSISEQS);
 506
 507 Removes redundant sequences at $cutoff level of sequence identity.
 508
 509 Equivilent to nonred script.
 510
 511 =cut
 512
 513 #####################################################################################################
 514 sub nonred {
 515   my ( $cutoff, @seqs ) = @_;
 516
 517   # Run pairwise and then OC and remove similar sequences
 518   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 519   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 520   if     ( @seqs == 1 ) {
 521     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 522     return @seqs;
 523   }
 524
 525   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 526   my $fasta = FASTA::File->new;
 527   foreach (@seqs) {
 528     my $f = FASTA->new( id => $_->id );
 529     $f->seq( @{ $_->align } );
 530     $fasta->add_entries($f);
 531   }
 532
 533   # Run pairwise
 534   my $pair = Pairwise->new( path => $pairwise );
 535   my $foo = join "\n", $pair->run($fasta) or die $!;
 536
 537   # Run OC
 538   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 539   $ocres->read_string( join "", $ocres->run($foo) );
 540
 541   # Now found those entries that are unrelated
 542   my @keep;
 543   for ( $ocres->get_groups ) {
 544     my ( $group, $score, $size, @labels ) = @{$_};
 545
 546     # Keep the QUERY from the cluster, otherwise just get onelt
 547     if ( grep { /QUERY/ } @labels ) {
 548
 549       # Make sure the query stays at the start
 550       unshift @keep, "QUERY";
 551       next;
 552     } else {
 553       push @keep, shift @labels;
 554     }
 555   }
 556   push @keep, $ocres->get_unclustered;
 557
 558   # Return the matching entries.
 559   # We use the temporay to mimic the selection of sequences in the nonred
 560   # script, which took the sequences in the order they occured in the file.
 561   my @filtered_res;
 562   for (@seqs) {
 563     my $label = $_->id;
 564     if ( grep { $_ =~ /^$label$/ } @keep ) {
 565       push @filtered_res, $_;
 566     }
 567   }
 568
 569   return @filtered_res;
 570 }
 571
 572 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 573
 574 Removes those sequences that are more than $percentage longer or shorter than the first
 575 sequence in @seqs. @seqs is an array of PSISEQ objects.
 576
 577 Equivilent to trim_seqs script.
 578
 579 =cut
 580
 581 #####################################################################################################
 582 sub del_long_seqs {
 583   my ( $factor, @seqs ) = @_;
 584
 585   # Keep the query sequence (we assume query is the 1st FASTA record)
 586   my $query = shift @seqs;
 587   my @res   = $query;
 588
 589   # Find the length of the query sequence without any gaps
 590   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 591
 592   # Calculate the upper and lower allowed bounds
 593   my ( $upper, $lower );
 594   {
 595     my $bounds = ( $length / 100 ) * $factor;
 596     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 597   }
 598
 599   # Now try each sequence and see how long they are
 600   for (@seqs) {
 601     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 602     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 603   }
 604
 605   return @res;
 606 }
 607
 608 =head2 toupper ($PSISEQ)
 609
 610 Converts sequence held in PSISEQ object to uppercase.
 611
 612 =cut
 613
 614 #####################################################################################################
 615 sub toupper {
 616   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 617 }
 618
 619 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 620
 621 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 622
 623 Equivilent to fasta2jnet script.
 624
 625 =cut
 626
 627 #####################################################################################################
 628 sub degap {
 629   my (@seqs) = @_;
 630
 631   # Find where the gaps in the query sequence are
 632   my @gaps;
 633   my $i = 0;
 634   for ( @{ $seqs[0]->align } ) {
 635     push @gaps, $i if $_ !~ /$GAP/;
 636     $i++;
 637   }
 638
 639   return unless @gaps;
 640
 641   # Remove the gaps in all the sequences.
 642   # Derefences the array reference and takes a slice inside the method call
 643   # arguments, then coerce back to an array ref.
 644   for (@seqs) {
 645     $_->align( [ @{ $_->align }[@gaps] ] );
 646   }
 647 }
 648
 649 =head2 getfreq($filename, @PSISEQS)
 650
 651 Creates a PSIBLAST like profile and writes it to the $filename.
 652
 653 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 654 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 655 but it's *not* the same.
 656
 657 =cut
 658
 659 #####################################################################################################
 660 sub getfreq {
 661   my ( $fn, @seqs ) = @_;
 662
 663   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 664
 665   # Create a PSIBLAST like profile from the alignment
 666   # This doesn't greate the same result as old Jpred, I think this is because
 667   # differences in the input between old and new
 668
 669   # For testing
 670   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 671   #   my @profile = profile(
 672   #      map { join "", @{ $_->seq } } $f->get_entries
 673   #   );
 674
 675   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 676
 677   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 678   print $fh join( " ", @{$_} ), "\n" for @profile;
 679 }
 680
 681 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 682
 683 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 684
 685 Equivilent to gethmm script.
 686
 687 =cut
 688
 689 #####################################################################################################
 690 sub hmmer {
 691   my (@seqs) = @_;
 692
 693   # Temp files required
 694   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 695
 696   # Create the FASTA file
 697   psiseq2fasta( $tmp_fasta, @seqs );
 698
 699   # Run HMMer
 700   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 701   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 702
 703   # Read in the HMMER file
 704   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 705
 706   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 707   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 708   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 709
 710   return $hmmer_jnet;
 711 }
 712
 713 =head2 jnet($hmmer, $out, $pssm )
 714
 715 Runs Jnet. Pass the paths of the filenames to do the prediction on
 716
 717 =cut
 718
 719 #####################################################################################################
 720 sub jnet {
 721   my ( $hmmer, $outfile, $pssm ) = @_;
 722   if ($pssm) {
 723
 724     # run Jnet prediction with all the profile data
 725     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 726   } else {
 727
 728     # run Jnet prediction with only HMMer and alignment data
 729     system("$jnet --concise --hmmer $hmmer > $outfile");
 730   }
 731   check( "jnet", $? ) or exit 1;
 732 }
 733
 734 =head1 psiseq2fasta($path, @PSISEQ)
 735
 736 Writes a FASTA file given an array of PSISEQ objects.
 737
 738 =cut
 739
 740 #####################################################################################################
 741 sub psiseq2fasta {
 742   my ( $fn, @seqs ) = @_;
 743
 744   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 745   confess "JPRED: not passed PSISEQs" unless @seqs;
 746
 747   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 748
 749   # Changed from using FASTA::File object due to seg faulting on some large,
 750   # long alignments, presumably due to running out of memory.
 751   #   my $fasta = FASTA::File->new;
 752   #   $fasta->add_entries(
 753   #      map {
 754   #         my $f = FASTA->new(id => $_->id);
 755   #         $f->seq(@{$_->align});
 756   #         $f;
 757   #      } @seqs
 758   #   );
 759   #   $fasta->write_file($fn);
 760
 761   my $fh;
 762   $fn =~ /\.gz$/
 763     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
 764     : ( open $fh, ">$fn" or die "JPRED: $!" );
 765
 766   for (@seqs) {
 767     print $fh ">", $_->id, "\n";
 768     my $seq = join( "", @{ $_->align } );
 769     $seq =~ s/(.{72})/$1\n/g;
 770     chomp $seq;
 771     print $fh $seq . "\n";
 772   }
 773   close $fh;
 774 }
 775
 776 =head1 @PSISEQ = fasta2psiseq($path)
 777
 778 Reads in a FASTA file and returns an array of PSISEQ objects.
 779
 780 =cut
 781
 782 #####################################################################################################
 783 sub fasta2psiseq {
 784   my ($fn) = @_;
 785
 786   my $fasta =
 787     $fn =~ /\.gz$/
 788     ? FASTA::File->new( read_gzip_file => $fn )
 789     : FASTA::File->new( read_file      => $fn );
 790   $fasta or die "JPRED: $!";
 791
 792   my @seqs;
 793   for ( $fasta->get_entries ) {
 794     my $seq = PSISEQ->new;
 795     $seq->id( $_->id );
 796     $seq->align( [ @{ $_->seq } ] );
 797     push @seqs, $seq;
 798   }
 799
 800   return @seqs;
 801 }
 802
 803 #####################################################################################################
 804 sub fasta2psiseq_lowmemory {
 805   my $filename = shift;
 806
 807   local $/ = "\n";
 808   my $fh;
 809   $filename =~ /\.gz$/
 810     ? ( open $fh, "gzip -dc $filename|" or die $! )
 811     : ( open $fh, $filename or die $! );
 812
 813   my @seqs;
 814   my ( $id, $seq );
 815   while (<$fh>) {
 816     chomp;
 817     if (s/^>//) {
 818       $seq =~ s/ //g if defined $seq;
 819
 820       if ( $id and $seq ) {
 821         my $psi = PSISEQ->new( id => $id );
 822         $psi->align( [ split //, $seq ] );
 823         push @seqs, $psi;
 824         undef $id;
 825         undef $seq;
 826       } else {
 827         $id = $_;
 828       }
 829     } else {
 830       $seq .= $_;
 831     }
 832   }
 833   if ( $id and $seq ) {
 834     my $psi = PSISEQ->new( id => $id );
 835     $psi->seq($seq);
 836     push @seqs, $psi;
 837   }
 838
 839   return @seqs;
 840 }
 841
 842 =head1 extend($FASTA::File, @PSISEQ)
 843
 844 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
 845 against the query, this function extends the alignment so that all of the query is
 846 present in the alignment. The other sequences are extended with gap characters.
 847
 848 =cut
 849
 850 #####################################################################################################
 851 sub extend {
 852   my ( $query, @seqs ) = @_;
 853
 854   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 855   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
 856
 857   # Get the query sequence
 858   my $q_query = $query->get_entry(0);
 859
 860   # Get the query sequence from the PSIBLAST alignment
 861   my $a_query = shift @seqs;
 862
 863   # The gap size required to expand the alignment
 864   my ( $start_gap_length, $end_gap_length );
 865   {
 866
 867     # Make handling the sequence easier
 868     my $q_seq = join "", @{ [ $q_query->seq ] };
 869     my $a_seq = join "", @{ $a_query->align };
 870
 871     $q_seq =~ s/-//g;
 872     $a_seq =~ s/-//g;
 873
 874     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
 875
 876     $start_gap_length = index( $q_seq, $a_seq );
 877     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 878
 879     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 880   }
 881
 882   # Add the gaps to the end of the alignments
 883   for (@seqs) {
 884     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
 885   }
 886
 887   # Put the query sequence back
 888   unshift @seqs,
 889     PSISEQ->new(
 890     id    => $a_query->id,
 891     align => [
 892       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
 893       @{ $a_query->align },
 894       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
 895     ]
 896     );
 897
 898   return @seqs;
 899 }
 900
 901 =head1 $int = fasta_seq_length($path)
 902
 903 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 904
 905 =cut
 906
 907 #####################################################################################################
 908 sub fasta_seq_length {
 909   my ($fn) = @_;
 910   open my $fh, $fn or croak "Can't open file $fn: $!\n";
 911
 912   my $length = 0;
 913   local $_, $/ = "\n";
 914   while (<$fh>) {
 915     next if /^>/;
 916     chomp;
 917     $length += tr///c;
 918   }
 919
 920   return $length;
 921 }
 922
 923 =head1 log(@messages)
 924
 925 Report messages to STDERR
 926
 927 =cut
 928
 929 #####################################################################################################
 930 sub errlog { print STDERR @_ }