binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in/-sequence <FILE1> [-out/-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a protein sequence given a path to a
  14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -in/-sequence FILE
  24
  25 The path to the sequence file (in FASTA format) you want to predict.
  26
  27 =item -out/-output FILEPREFIX
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  30
  31 =item -dbpath PATH
  32
  33 Path to the uniref database used for PSI-BLAST querying.
  34 Default: .
  35
  36 =item -dbname database
  37
  38 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  39 Default: uniref90
  40
  41 =item -psi path
  42
  43 Path to a PSIBLAST output file.
  44
  45 =item -ncpu NNN
  46
  47 Number of CPU used by jpred.pl. Maximum value is 8
  48 Default: NNN = 1
  49
  50 =item -pred-nohits
  51
  52 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  53
  54 =item -verbose
  55
  56 Verbose mode. Print more information while running jpred.
  57
  58 =item -debug
  59
  60 Debug mode. Print debugging information while running jpred.
  61
  62 =item -help
  63
  64 Gives help on the programs usage.
  65
  66 =item -man
  67
  68 Displays this man page.
  69
  70 =back
  71
  72 =head1 BUGS
  73
  74 Can't cope with gaps in the query sequence.
  75
  76 Doesn't accept alignments.
  77
  78 If you find any others please contact authors.
  79
  80 =head1 AUTHORS
  81
  82 Jonathan Barber <jon@compbio.dundee.ac.uk>
  83 Chris Cole <christian@cole.name>
  84 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
  85
  86 =cut
  87
  88 ## TODO check for gaps in query sequence
  89 ##      check for disallowed chars in sequence
  90
  91 use strict;
  92 use warnings;
  93 use Getopt::Long;
  94 use Pod::Usage;
  95 use Carp;
  96 use Data::Dumper;
  97 use File::Temp;
  98
  99 #use UNIVERSAL;
 100
 101 # path to Jpred modules
 102 use FindBin qw($Bin);
 103 use lib "$Bin/lib";
 104
 105 # internal jpred modules
 106 use Jpred;
 107 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
 108 use HMMER::Profile;
 109 use HMMER::Profile::Jnet;
 110 use PSIBLAST;
 111 use PSIBLAST::PSSM;
 112 use PSIBLAST::Run;
 113 use FASTA::File;
 114 use FASTA;
 115 use Index;
 116 use Pairwise;
 117 use OC;
 118 use Utils qw(profile);
 119 use Run qw(check);
 120
 121 our $VERSION = '3.0.2';
 122 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 123 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 124
 125 # Gap characters in sequences
 126 our $GAP = '-';
 127
 128 #####################################################################################################
 129 # Light object to hold sequence information, light to prevent memory overload
 130 # for big search results. Even so this is quite memory intensive, I can't see
 131 # how to reduce the size further.
 132 {
 133
 134   package PSISEQ;
 135
 136   sub new {
 137     my ( $class, %args ) = @_;
 138     my $self = [];
 139     bless $self, $class;
 140     for ( keys %args ) { $self->$_( $args{$_} ) }
 141     return $self;
 142   }
 143
 144   sub id {
 145     return defined $_[1]
 146       ? $_[0]->[0] = $_[1]
 147       : $_[0]->[0];
 148   }
 149
 150   sub start {
 151     return defined $_[1]
 152       ? $_[0]->[1] = $_[1]
 153       : $_[0]->[1];
 154   }
 155
 156   sub align {
 157     if ( defined $_[1] ) {
 158       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 159
 160       $_[0]->[2] = join( "", @{ $_[1] } );
 161     } else {
 162       return [ split( //, $_[0]->[2] ) ];
 163     }
 164   }
 165 }
 166
 167 #####################################################################################################
 168 my $infile;
 169 my $output;
 170 my $psiblast;
 171 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 172 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 173 my $db_path    = ".";
 174 my $db_entry   = "unknown";
 175
 176 my ( $help, $man, $DEBUG, $VERBOSE );
 177
 178 GetOptions(
 179   "in|sequence=s" => \$infile,
 180   "out|output=s"  => \$output,
 181   "psi=s"         => \$psiblast,
 182   "dbname=s"      => \$db_entry,
 183   "dbpath=s"      => \$db_path,
 184   "ncpu=s"        => \$ncpu,
 185   "pred-nohits"   => \$predNoHits,
 186   "help"          => \$help,
 187   "man"           => \$man,
 188   "debug"         => \$DEBUG,
 189   "verbose"       => \$VERBOSE
 190 ) or pod2usage(2);
 191 pod2usage(1) if $help;
 192 pod2usage( verbose => 2 ) if $man;
 193
 194 #####################################################################################################
 195 # Key to database information and information for accessing them
 196 my $database = {
 197
 198   ## default database used by Jpred
 199   uniref90 => {
 200     database   => $db_path . "/uniref90.filt",
 201     unfiltered => $db_path . "/uniref90",
 202   },
 203
 204   ## database used during Jnet training
 205   training => {
 206     database   => $db_path . "/training/uniref90.filt",
 207     unfiltered => $db_path . "/training/uniref90",
 208   },
 209
 210   ## database used for ported Jpred
 211   ported_db => {
 212     database   => $db_path . "/databases/uniref90.filt",
 213     unfiltered => $db_path . "/databases/uniref90",
 214   },
 215   ## cluster-specific path for Jpred
 216   cluster => {
 217     database   => "/homes/www-jpred/databases/uniref90.filt",
 218     unfiltered => "/homes/www-jpred/databases/uniref90",
 219   },
 220
 221   ## these other DBs are experimental ones used during development.
 222   ## check they exist before using them.
 223   swall => {
 224     # generic entry for use with validate_jnet.pl
 225     # real db location defined by validate_jnet.pl
 226     database   => $db_path . "/swall/swall.filt",
 227     unfiltered => $db_path . "/swall/swall",
 228   },
 229
 230   uniprot => {
 231     # Path to PSIBLAST db
 232     database   => $db_path . "/3/swall.filt",
 233     unfiltered => $db_path . "/3/swall",
 234   },
 235
 236   uniref50 => {
 237     database   => $db_path . "/6/swall.filt",
 238     unfiltered => $db_path . "/6/swall",
 239   },
 240 };
 241
 242 pod2usage(' -sequence argument not provided') unless $infile;
 243 die "-db $db_entry not recognised" unless exists $database->{$db_entry};
 244 $output = $infile . ".res" unless $output;
 245 $ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
 246
 247 for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
 248   my ( $key, $value ) = @{$_};
 249   defined $value or next;
 250   errlog( join( ": ", $key, $value ), "\n" );
 251 }
 252
 253 #####################################################################################################
 254 my $platform = check_OS();
 255 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 256
 257 #####################################################################################################
 258 my $query = FASTA::File->new( read_file => $infile );
 259
 260 my @seqs;
 261 my $psi = PSIBLAST->new;
 262 unless ( defined $psiblast && -e $psiblast ) {
 263   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 264   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 265   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 266   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 267   my $psi_run = PSIBLAST::Run->new(
 268     debug    => $DEBUG,
 269     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 270     path     => $psiblastbin,
 271     input    => $infile,
 272     output   => "$output.blast",
 273     matrix   => "$output.profile",
 274     database => $database->{$db_entry}{database},
 275   );
 276
 277   # For reduced databases, get the size of the orginal and use this as
 278   # the equivilent DB size
 279   if ( $db_entry =~ /^sp_red_/ ) {
 280     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 281     $psi_run->args(
 282       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 283     );
 284   }
 285   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 286   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 287
 288   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 289   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 290
 291   $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 292   system("gzip -9f $output.blast");
 293 } else {
 294   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 295   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 296 }
 297
 298 #####################################################################################################
 299 # Convert the last itteration into a collection of PSISEQ objects
 300 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 301   my ( $id, $start, $align ) = @{$_};
 302   push @seqs,
 303     PSISEQ->new(
 304     id    => $id,
 305     start => $start,
 306     align => [ split( //, $align ) ]
 307     );
 308 }
 309
 310 #####################################################################################################
 311 ## When there are no PSI-BLAST hits generate an HMM from the query
 312 ## and run Jnet against that only.
 313 ## Accuracy won't be as good, but at least it's a prediction
 314 if ( @seqs == 0 ) {
 315   if ( $predNoHits == 0 ) {
 316     warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 317     print ">>100% complete\n";
 318     exit;
 319   } else {
 320     print ">>50% complete\n";
 321     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 322     print "Running HMMer on query...\n" if $VERBOSE;
 323
 324     # copy input query to alignment
 325     #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 326     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 327     open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
 328     while (<$ifh>) { print $ofh, $_ }
 329     close $ifh;
 330     close $ofh;
 331
 332     # Temp files required for HMMer
 333     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 334
 335     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 336     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 337
 338     # Read in the HMMER file
 339     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 340
 341     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 342     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 343     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 344     $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 345     print ">>70% complete\n";
 346
 347     # Run Jnet for the prediction
 348     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 349     jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
 350   }
 351 } else {
 352   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
 353   print ">>40% complete\n";
 354
 355   #####################################################################################################
 356   # Make PSIBLAST truncated alignments the right length
 357   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 358   @seqs = extend( $query, @seqs );                  # CC sub si below
 359   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 360
 361   #####################################################################################################
 362   # Remove masking from PSIBLAST alignment
 363   print "Unmasking the alignments...\n" if $VERBOSE;
 364   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 365   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 366   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 367
 368   #####################################################################################################
 369   # Convert the sequences to upper case
 370   print "Converting sequences to the same case...\n" if $VERBOSE;
 371   toupper($_) for @seqs;    # CC sub is below
 372   psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
 373
 374   #####################################################################################################
 375   # Remove excessive sequences
 376   print "Remove excessive sequences...\n" if $VERBOSE;
 377   @seqs = reduce( $MAX_ALIGN, @seqs );
 378   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 379
 380   #####################################################################################################
 381   # Remove sequences that are too long or too short
 382   print "Remove sequences which too long or short...\n" if $VERBOSE;
 383   @seqs = del_long_seqs( 50, @seqs );
 384   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 385
 386   #####################################################################################################
 387   # Remove redundant sequences based upon pairwise identity and OC clustering
 388   print "Remove redundant sequences...\n" if $VERBOSE;
 389   @seqs = nonred( $NR_CUT, @seqs );
 390   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 391
 392   #####################################################################################################
 393   # Check that we haven't got rid of all of the sequences
 394   if ( @seqs < 2 ) {
 395     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 396     @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 397   }
 398   unlink("${output}_backup.fasta.gz") unless $DEBUG;
 399
 400   # Remove gaps in the query sequence and same positions in the alignment
 401   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 402   degap(@seqs);
 403   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 404
 405   # Output the alignment for the prediction
 406   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 407   psiseq2fasta( "$output.align", @seqs );
 408
 409   #####################################################################################################
 410   # Equivilent to getpssm script
 411   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 412   my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
 413   $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 414   print ">>50% complete\n";
 415
 416   #####################################################################################################
 417   # Run HMMER on the sequences
 418   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 419   my $hmmer = hmmer(@seqs);             # CC sub is below
 420   $hmmer->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 421   print ">>70% complete\n";
 422
 423   #####################################################################################################
 424   # Run Jnet for the prediction
 425   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 426   jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
 427 }
 428
 429 print ">>100% complete\n";
 430 print "Jpred Finished\n";
 431 exit;
 432
 433 #####################################################################################################
 434 # Functions
 435 #####################################################################################################
 436
 437 =begin :private
 438
 439 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 440
 441 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 442 Does it in place.
 443
 444 =cut
 445
 446 #####################################################################################################
 447 sub remove_seq_masks {
 448   my $idx = shift;
 449   my $seq = shift;
 450
 451   # Only bother if we have a sequence which has been masked
 452   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 453
 454   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 455   my $searchdb = $database->{$db_entry}{unfiltered};
 456   $start--;    # We need this to be zero based
 457
 458   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 459   my @seqs = $f->get_entries;
 460   my $db   = shift @seqs;
 461
 462   unless ($db) {
 463     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 464   } elsif (@seqs) {
 465     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 466   }
 467
 468   my @db_seq = @{ $db->seq };
 469
 470   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 471   my $i = 0;
 472   for ( 0 .. $#align ) {
 473     next if $align[$_] =~ /$GAP/;
 474
 475     if ( $align[$_] eq 'X' ) {
 476       unless ( defined $db_seq[ $i + $start ] ) {
 477         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 478       }
 479       $align[$_] = $db_seq[ $i + $start ];
 480     }
 481     $i++;
 482   }
 483   $seq->align( \@align );
 484 }
 485
 486 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 487
 488 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 489
 490 Equivilent to reduce script.
 491
 492 =cut
 493
 494 #####################################################################################################
 495 sub reduce {
 496   my ( $max, @seqs ) = @_;
 497
 498   my $num = int( @seqs / $max );
 499   my @res = shift @seqs;
 500
 501   # Don't bother if we have less than the required sequences
 502   if ( @seqs < $max or 0 == $num ) {
 503     return @res, @seqs;
 504   }
 505
 506   for ( 0 .. $#seqs ) {
 507     push @res, $seqs[$_] if ( 0 == $_ % $num );
 508   }
 509
 510   return @res;
 511 }
 512
 513 =head2 nonred($cutoff, @PSISEQS);
 514
 515 Removes redundant sequences at $cutoff level of sequence identity.
 516
 517 Equivilent to nonred script.
 518
 519 =cut
 520
 521 #####################################################################################################
 522 sub nonred {
 523   my ( $cutoff, @seqs ) = @_;
 524
 525   # Run pairwise and then OC and remove similar sequences
 526   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 527   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 528   if     ( @seqs == 1 ) {
 529     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 530     return @seqs;
 531   }
 532
 533   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 534   my $fasta = FASTA::File->new;
 535   foreach (@seqs) {
 536     my $f = FASTA->new( id => $_->id );
 537     $f->seq( @{ $_->align } );
 538     $fasta->add_entries($f);
 539   }
 540
 541   # Run pairwise
 542   my $pair = Pairwise->new( path => $pairwise );
 543   my $foo = join "\n", $pair->run($fasta) or die $!;
 544
 545   # Run OC
 546   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 547   $ocres->read_string( join "", $ocres->run($foo) );
 548
 549   # Now found those entries that are unrelated
 550   my @keep;
 551   for ( $ocres->get_groups ) {
 552     my ( $group, $score, $size, @labels ) = @{$_};
 553
 554     # Keep the QUERY from the cluster, otherwise just get onelt
 555     if ( grep { /QUERY/ } @labels ) {
 556
 557       # Make sure the query stays at the start
 558       unshift @keep, "QUERY";
 559       next;
 560     } else {
 561       push @keep, shift @labels;
 562     }
 563   }
 564   push @keep, $ocres->get_unclustered;
 565
 566   # Return the matching entries.
 567   # We use the temporay to mimic the selection of sequences in the nonred
 568   # script, which took the sequences in the order they occured in the file.
 569   my @filtered_res;
 570   for (@seqs) {
 571     my $label = $_->id;
 572     if ( grep { $_ =~ /^$label$/ } @keep ) {
 573       push @filtered_res, $_;
 574     }
 575   }
 576
 577   return @filtered_res;
 578 }
 579
 580 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 581
 582 Removes those sequences that are more than $percentage longer or shorter than the first
 583 sequence in @seqs. @seqs is an array of PSISEQ objects.
 584
 585 Equivilent to trim_seqs script.
 586
 587 =cut
 588
 589 #####################################################################################################
 590 sub del_long_seqs {
 591   my ( $factor, @seqs ) = @_;
 592
 593   # Keep the query sequence (we assume query is the 1st FASTA record)
 594   my $query = shift @seqs;
 595   my @res   = $query;
 596
 597   # Find the length of the query sequence without any gaps
 598   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 599
 600   # Calculate the upper and lower allowed bounds
 601   my ( $upper, $lower );
 602   {
 603     my $bounds = ( $length / 100 ) * $factor;
 604     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 605   }
 606
 607   # Now try each sequence and see how long they are
 608   for (@seqs) {
 609     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 610     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 611   }
 612
 613   return @res;
 614 }
 615
 616 =head2 toupper ($PSISEQ)
 617
 618 Converts sequence held in PSISEQ object to uppercase.
 619
 620 =cut
 621
 622 #####################################################################################################
 623 sub toupper {
 624   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 625 }
 626
 627 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 628
 629 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 630
 631 Equivilent to fasta2jnet script.
 632
 633 =cut
 634
 635 #####################################################################################################
 636 sub degap {
 637   my (@seqs) = @_;
 638
 639   # Find where the gaps in the query sequence are
 640   my @gaps;
 641   my $i = 0;
 642   for ( @{ $seqs[0]->align } ) {
 643     push @gaps, $i if $_ !~ /$GAP/;
 644     $i++;
 645   }
 646
 647   return unless @gaps;
 648
 649   # Remove the gaps in all the sequences.
 650   # Derefences the array reference and takes a slice inside the method call
 651   # arguments, then coerce back to an array ref.
 652   for (@seqs) {
 653     $_->align( [ @{ $_->align }[@gaps] ] );
 654   }
 655 }
 656
 657 =head2 getfreq($filename, @PSISEQS)
 658
 659 Creates a PSIBLAST like profile and writes it to the $filename.
 660
 661 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 662 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 663 but it's *not* the same.
 664
 665 =cut
 666
 667 #####################################################################################################
 668 sub getfreq {
 669   my ( $fn, @seqs ) = @_;
 670
 671   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 672
 673   # Create a PSIBLAST like profile from the alignment
 674   # This doesn't greate the same result as old Jpred, I think this is because
 675   # differences in the input between old and new
 676
 677   # For testing
 678   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 679   #   my @profile = profile(
 680   #      map { join "", @{ $_->seq } } $f->get_entries
 681   #   );
 682
 683   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 684
 685   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 686   print $fh join( " ", @{$_} ), "\n" for @profile;
 687 }
 688
 689 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 690
 691 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 692
 693 Equivilent to gethmm script.
 694
 695 =cut
 696
 697 #####################################################################################################
 698 sub hmmer {
 699   my (@seqs) = @_;
 700
 701   # Temp files required
 702   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 703
 704   # Create the FASTA file
 705   psiseq2fasta( $tmp_fasta, @seqs );
 706
 707   # Run HMMer
 708   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 709   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 710
 711   # Read in the HMMER file
 712   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 713
 714   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 715   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 716   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 717
 718   return $hmmer_jnet;
 719 }
 720
 721 =head2 jnet($hmmer, $out, $pssm )
 722
 723 Runs Jnet. Pass the paths of the filenames to do the prediction on
 724
 725 =cut
 726
 727 #####################################################################################################
 728 sub jnet {
 729   my ( $hmmer, $outfile, $pssm ) = @_;
 730   if ($pssm) {
 731
 732     # run Jnet prediction with all the profile data
 733     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 734   } else {
 735
 736     # run Jnet prediction with only HMMer and alignment data
 737     system("$jnet --concise --hmmer $hmmer > $outfile");
 738   }
 739   check( "jnet", $? ) or exit 1;
 740 }
 741
 742 =head1 psiseq2fasta($path, @PSISEQ)
 743
 744 Writes a FASTA file given an array of PSISEQ objects.
 745
 746 =cut
 747
 748 #####################################################################################################
 749 sub psiseq2fasta {
 750   my ( $fn, @seqs ) = @_;
 751
 752   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 753   confess "JPRED: not passed PSISEQs" unless @seqs;
 754
 755   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 756
 757   # Changed from using FASTA::File object due to seg faulting on some large,
 758   # long alignments, presumably due to running out of memory.
 759   #   my $fasta = FASTA::File->new;
 760   #   $fasta->add_entries(
 761   #      map {
 762   #         my $f = FASTA->new(id => $_->id);
 763   #         $f->seq(@{$_->align});
 764   #         $f;
 765   #      } @seqs
 766   #   );
 767   #   $fasta->write_file($fn);
 768
 769   my $fh;
 770   $fn =~ /\.gz$/
 771     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
 772     : ( open $fh, ">$fn" or die "JPRED: $!" );
 773
 774   for (@seqs) {
 775     print $fh ">", $_->id, "\n";
 776     my $seq = join( "", @{ $_->align } );
 777     $seq =~ s/(.{72})/$1\n/g;
 778     chomp $seq;
 779     print $fh $seq . "\n";
 780   }
 781   close $fh;
 782 }
 783
 784 =head1 @PSISEQ = fasta2psiseq($path)
 785
 786 Reads in a FASTA file and returns an array of PSISEQ objects.
 787
 788 =cut
 789
 790 #####################################################################################################
 791 sub fasta2psiseq {
 792   my ($fn) = @_;
 793
 794   my $fasta =
 795     $fn =~ /\.gz$/
 796     ? FASTA::File->new( read_gzip_file => $fn )
 797     : FASTA::File->new( read_file      => $fn );
 798   $fasta or die "JPRED: $!";
 799
 800   my @seqs;
 801   for ( $fasta->get_entries ) {
 802     my $seq = PSISEQ->new;
 803     $seq->id( $_->id );
 804     $seq->align( [ @{ $_->seq } ] );
 805     push @seqs, $seq;
 806   }
 807
 808   return @seqs;
 809 }
 810
 811 #####################################################################################################
 812 sub fasta2psiseq_lowmemory {
 813   my $filename = shift;
 814
 815   local $/ = "\n";
 816   my $fh;
 817   $filename =~ /\.gz$/
 818     ? ( open $fh, "gzip -dc $filename|" or die $! )
 819     : ( open $fh, $filename or die $! );
 820
 821   my @seqs;
 822   my ( $id, $seq );
 823   while (<$fh>) {
 824     chomp;
 825     if (s/^>//) {
 826       $seq =~ s/ //g if defined $seq;
 827
 828       if ( $id and $seq ) {
 829         my $psi = PSISEQ->new( id => $id );
 830         $psi->align( [ split //, $seq ] );
 831         push @seqs, $psi;
 832         undef $id;
 833         undef $seq;
 834       } else {
 835         $id = $_;
 836       }
 837     } else {
 838       $seq .= $_;
 839     }
 840   }
 841   if ( $id and $seq ) {
 842     my $psi = PSISEQ->new( id => $id );
 843     $psi->seq($seq);
 844     push @seqs, $psi;
 845   }
 846
 847   return @seqs;
 848 }
 849
 850 =head1 extend($FASTA::File, @PSISEQ)
 851
 852 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
 853 against the query, this function extends the alignment so that all of the query is
 854 present in the alignment. The other sequences are extended with gap characters.
 855
 856 =cut
 857
 858 #####################################################################################################
 859 sub extend {
 860   my ( $query, @seqs ) = @_;
 861
 862   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 863   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
 864
 865   # Get the query sequence
 866   my $q_query = $query->get_entry(0);
 867
 868   # Get the query sequence from the PSIBLAST alignment
 869   my $a_query = shift @seqs;
 870
 871   # The gap size required to expand the alignment
 872   my ( $start_gap_length, $end_gap_length );
 873   {
 874
 875     # Make handling the sequence easier
 876     my $q_seq = join "", @{ [ $q_query->seq ] };
 877     my $a_seq = join "", @{ $a_query->align };
 878
 879     $q_seq =~ s/-//g;
 880     $a_seq =~ s/-//g;
 881
 882     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
 883
 884     $start_gap_length = index( $q_seq, $a_seq );
 885     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 886
 887     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 888   }
 889
 890   # Add the gaps to the end of the alignments
 891   for (@seqs) {
 892     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
 893   }
 894
 895   # Put the query sequence back
 896   unshift @seqs,
 897     PSISEQ->new(
 898     id    => $a_query->id,
 899     align => [
 900       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
 901       @{ $a_query->align },
 902       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
 903     ]
 904     );
 905
 906   return @seqs;
 907 }
 908
 909 =head1 $int = fasta_seq_length($path)
 910
 911 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 912
 913 =cut
 914
 915 #####################################################################################################
 916 sub fasta_seq_length {
 917   my ($fn) = @_;
 918   open my $fh, $fn or croak "Can't open file $fn: $!\n";
 919
 920   my $length = 0;
 921   local $_, $/ = "\n";
 922   while (<$fh>) {
 923     next if /^>/;
 924     chomp;
 925     $length += tr///c;
 926   }
 927
 928   return $length;
 929 }
 930
 931 =head1 log(@messages)
 932
 933 Report messages to STDERR
 934
 935 =cut
 936
 937 #####################################################################################################
 938 sub errlog { print STDERR @_ }