jpred/jpred

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9   jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence.
  14 It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item --sequence path
  24
  25 The path to the sequence (in FASTA format) you want to predict.
  26
  27 =item --output file
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
  30
  31 =item --db database
  32
  33 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
  34
  35 Default: uniref90
  36
  37 =item --psi path
  38
  39 Path to a PSIBLAST output file.
  40
  41 =item --pred-nohits
  42
  43 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  44
  45 =item --verbose
  46
  47 Verbose mode. Print more information while running jpred.
  48
  49 =item --debug
  50
  51 Debug mode. Print debugging information while running jpred.
  52
  53 =item --help
  54
  55 Gives help on the programs usage.
  56
  57 =item --man
  58
  59 Displays this man page.
  60
  61 =back
  62
  63 =head1 BUGS
  64
  65 Can't cope with gaps in the query sequence.
  66
  67 Doesn't accept alignments.
  68
  69 If you find any others please contact me.
  70
  71 =head1 AUTHORS
  72
  73 Jonathan Barber <jon@compbio.dundee.ac.uk>
  74
  75 Chris Cole <christian@cole.name> (current maintainer)
  76
  77 =cut
  78
  79 ## TODO check for gaps in query sequence
  80 ##      check for disallowed chars in sequence
  81
  82 use strict;
  83 use warnings;
  84 use Getopt::Long;
  85 use Pod::Usage;
  86 use Carp;
  87 use UNIVERSAL qw(isa);
  88 use Data::Dumper;
  89 use File::Temp;
  90
  91 use FindBin qw($Bin);
  92 use lib "$Bin/../lib";
  93 #use lib "lib";
  94 #use lib "/home/asherstnev/Projects/Jpred/jpred/trunk/lib";
  95
  96 use Jpred;    # needed to define BLAST environment variables
  97
  98 # my modules
  99 use HMMER::Profile;
 100 use HMMER::Profile::Jnet;
 101
 102 use PSIBLAST;
 103 use PSIBLAST::PSSM;
 104 use PSIBLAST::Run;
 105
 106 use FASTA::File;
 107 use FASTA;
 108
 109 use Index;
 110 use OC;
 111 use Utils qw(profile);
 112 use Run qw(check);
 113 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin);
 114
 115 our $VERSION   = '3.0.2';
 116 my $MAX_ALIGN = 1000;      # maximum number of alignment sequences allowed
 117 my $NR_CUT    = 75;        # cut-off seqeunce ID value for defining non-redundancy
 118
 119 # Light object to hold sequence information, light to prevent memory overload
 120 # for big search results. Even so this is quite memory intensive, I can't see
 121 # how to reduce the size further.
 122 {
 123
 124   package PSISEQ;
 125
 126   #use Compress::LZF qw(compress decompress);
 127
 128   sub new {
 129     my ( $class, %args ) = @_;
 130     my $self = [];
 131     bless $self, $class;
 132     for ( keys %args ) { $self->$_( $args{$_} ) }
 133     return $self;
 134   }
 135
 136   sub id {
 137     return defined $_[1]
 138       ? $_[0]->[0] = $_[1]
 139       : $_[0]->[0];
 140   }
 141
 142   sub start {
 143     return defined $_[1]
 144       ? $_[0]->[1] = $_[1]
 145       : $_[0]->[1];
 146   }
 147
 148   sub align {
 149     if ( defined $_[1] ) {
 150       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 151
 152       #$_[0]->[2] = compress(join "", @{ $_[1] });
 153       $_[0]->[2] = join( "", @{ $_[1] } );
 154     } else {
 155
 156       #return [ split //, decompress($_[0]->[2]) ];
 157       return [ split( //, $_[0]->[2] ) ];
 158     }
 159   }
 160 }
 161
 162 # Key to database information and information for accessing them
 163 my $dbPATH   = '/homes/chris/projects/Jnet/db';
 164 my $db_entry = "uniref90";
 165 my $database = {
 166
 167   ## default database used by Jpred
 168   uniref90 => {
 169     database   => 'uniref90.filt',
 170     unfiltered => 'uniref90',
 171   },
 172
 173   ## database used during Jnet training
 174   training => {
 175     database   => 'training/uniref90.filt',
 176     unfiltered => 'training/uniref90',
 177   },
 178
 179   ## cluster-specific path for Jpred
 180   cluster => {
 181     database   => '/local/gjb_lab/www-jpred/uniref90.filt',
 182     unfiltered => '/local/gjb_lab/www-jpred/uniref90',
 183   },
 184
 185   ## these other DBs are experimental ones used during development.
 186   ## check they exist before using them.
 187   swall => {
 188
 189     # generic entry for use with validate_jnet.pl
 190     # real db location defined by validate_jnet.pl
 191     database   => "$dbPATH/swall/swall.filt",    # CC new
 192     unfiltered => "$dbPATH/swall/swall",
 193   },
 194   uniprot => {
 195
 196     # Path to PSIBLAST db
 197     database   => "$dbPATH/3/swall.filt",        # CC new
 198     unfiltered => "$dbPATH/3/swall",
 199   },
 200   uniref50 => {
 201     database   => "$dbPATH/6/swall.filt",
 202     unfiltered => "$dbPATH/6/swall",
 203   },
 204 };
 205
 206 # Gap characters in sequences
 207 our $GAP = '-';
 208
 209 my ( $help, $man, $path, $output, $psiblast, $DEBUG, $VERBOSE );
 210 my $predNoHits = 0;    # define whether to make predictions when no PSI-BLAST hits are found
 211 GetOptions(
 212   "help"        => \$help,
 213   "man"         => \$man,
 214   "verbose"     => \$VERBOSE,
 215   "sequence=s"  => \$path,
 216   "output=s"    => \$output,
 217   "psi=s"       => \$psiblast,
 218   "db=s"        => \$db_entry,
 219   "pred-nohits" => \$predNoHits,
 220   "debug"       => \$DEBUG
 221 ) or pod2usage(2);
 222 pod2usage(1) if $help;
 223 pod2usage( verbose => 2 ) if $man;
 224
 225 pod2usage(' --sequence argument not provided') unless $path;
 226 die "--db $db_entry not recognised" unless exists $database->{$db_entry};
 227 $output = $path unless $output;
 228
 229 for ( [ "path", $path ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
 230   my ( $key, $value ) = @{$_};
 231   defined $value or next;
 232   errlog( join( ": ", $key, $value ), "\n" );
 233 }
 234
 235 my $query = FASTA::File->new( read_file => $path );
 236
 237 my @seqs;
 238 if (1) {
 239   my $psi = PSIBLAST->new;    # CC PSIBLAST.pm doesn't have a new() class??
 240   unless ( defined $psiblast && -e $psiblast ) {
 241     my $psi_run = PSIBLAST::Run->new(
 242       debug => $DEBUG,
 243
 244       #         path => "/software/jpred_bin/blastpgp",
 245       path     => $psiblastbin,                       # CC new path
 246       input    => $path,
 247       output   => "$output.blast",
 248       matrix   => "$output.profile",
 249       database => $database->{$db_entry}{database},
 250       ## potentially a better set of parameters (esp. -b & -v)
 251       ## which avoid PSI-BLAST dying with large number of hits
 252       # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 253       # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
 254       # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 255     );
 256
 257     # For reduced databases, get the size of the orginal and use this as
 258     # the equivilent DB size
 259     #print $db_entry, "\n";
 260     if ( $db_entry =~ /^sp_red_/ ) {
 261       ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 262       $psi_run->args(
 263         $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 264       );
 265     }
 266     print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 267     print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 268
 269     print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 270     $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 271
 272     $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 273     system("gzip -9f $output.blast");
 274   } else {
 275     if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 276     else                         { $psi->read_file($psiblast) }                        # CC ditto above
 277   }
 278
 279   # Convert the last itteration into a collection of PSISEQ objects
 280   for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 281     my ( $id, $start, $align ) = @{$_};
 282     push @seqs,
 283       PSISEQ->new(
 284       id    => $id,
 285       start => $start,
 286       align => [ split( //, $align ) ]
 287       );
 288   }
 289 }
 290
 291 ## When there are no PSI-BLAST hits generate an HMM from the query
 292 ## and run Jnet against that only.
 293 ## Accuracy won't be as good, but at least it's a prediction
 294 if ( @seqs == 0 ) {
 295   if ( $predNoHits == 0 ) {
 296     warn "\nJPRED: Warning - no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
 297     print ">>100% complete\n";
 298     exit;
 299   }
 300   print ">>50% complete\n";
 301   warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 302   print "Running HMMer on query...\n" if $VERBOSE;
 303
 304   # copy input query to alignment
 305   system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 306
 307   # Temp files required for HMMer
 308   my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 309
 310   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $path");
 311   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 312
 313   # Read in the HMMER file
 314   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 315
 316   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 317   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 318   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 319   $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 320   print ">>70% complete\n";
 321
 322   # Run Jnet for the prediction
 323   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 324   jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
 325
 326   print "Jpred Finished\n";
 327   exit;
 328 }
 329
 330 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
 331 print ">>40% complete\n";
 332
 333 # Make PSIBLAST truncated alignments the right length
 334 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 335 @seqs = extend( $query, @seqs );                  # CC sub si below
 336 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 337
 338 # Remove masking from PSIBLAST alignment
 339 print "Unmasking the alignments...\n" if $VERBOSE;
 340 dex($_) for @seqs[ 1 .. $#seqs ];                 ## <-- CC dies here in dex() - found below
 341 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 342
 343 # Convert the sequences to upper case
 344 print "Converting sequences to the same case...\n" if $VERBOSE;
 345 toupper($_) for @seqs;                            # CC sub is below
 346 psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
 347
 348 # Remove excessive sequences
 349 print "Remove excessive sequences...\n" if $VERBOSE;
 350 @seqs = reduce( $MAX_ALIGN, @seqs );              # CC sub is below
 351 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 352
 353 # Remove sequences that are too long or too short
 354 print "Remove sequences which too long or short...\n" if $VERBOSE;
 355 @seqs = del_long_seqs( 50, @seqs );               # CC sub is below
 356 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 357
 358 # Remove redundant sequences based upon pairwise identity and OC clustering
 359 print "Remove redundant sequences...\n" if $VERBOSE;
 360 @seqs = nonred( $NR_CUT, @seqs );                 # CC sub is below
 361 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 362
 363 # Check that we haven't got rid of all of the sequences
 364 if ( @seqs < 2 ) {
 365   warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 366   @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 367 }
 368 unlink("${output}_backup.fasta.gz") unless $DEBUG;
 369
 370 # Remove gaps in the query sequence and same positions in the alignment
 371 print "Removing gaps in the query sequence...\n" if $VERBOSE;
 372 degap(@seqs);                                     # CC sub is below
 373 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 374
 375 # Output the alignment for the prediction
 376 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 377 psiseq2fasta( "$output.align", @seqs );
 378
 379 {
 380   # Equivilent to getpssm script
 381   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 382   my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
 383   $pssm->write_file("$output.pssm");              # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 384 }
 385 print ">>50% complete\n";
 386
 387 # Run HMMER on the sequences
 388 {
 389   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 390   my $hmmer = hmmer(@seqs);                       # CC sub is below
 391   $hmmer->write_file("$output.hmm");              # write_file is in the Read.pm called from the HMMER::Profile module
 392 }
 393 print ">>70% complete\n";
 394
 395 # Run Jnet for the prediction
 396 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 397 jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
 398
 399 print "Jpred Finished\n";
 400
 401 # Then do an accuracy prediction
 402
 403 ############################################################
 404 # Functions
 405 ############################################################
 406
 407 =begin :private
 408
 409 =head2 $PSISEQ = dex($PSISEQ)
 410
 411 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. Does it in place.
 412
 413 =cut
 414
 415 BEGIN {
 416   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 417
 418   sub dex {
 419     my ($seq) = @_;
 420
 421     croak "JPRED: dex: can only take a PSISEQ object as an argument" unless isa $seq, 'PSISEQ';
 422
 423     # Only bother if we have a sequence which has been masked
 424     return unless grep { uc $_ eq 'X' } @{ $seq->align };
 425
 426     my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 427
 428     #$id = "$id"; # Add the EMBOSS USA
 429     #$id = "sprot_37:$id"; # Add the EMBOSS USA
 430     #$id = "sprot_43:$id"; # Add the EMBOSS USA
 431     #      $id = $database->{$db_entry}{emboss} . ":$id";  # commented out to try FastaCMD module
 432     my $searchdb = $database->{$db_entry}{unfiltered};
 433     $start--;    # We need this to be zero based
 434
 435     my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;    # CC sub is from Index::EMBOSS
 436     my @seqs = $f->get_entries;                                                                                           ## Sub is from Sequence::File
 437     my $db   = shift @seqs;
 438
 439     unless ($db) {
 440       confess "JPRED: dex: Accession number $id returned no sequences";
 441     } elsif (@seqs) {
 442       confess "JPRED: dex: Accession number $id returned more than one sequence";
 443     }
 444
 445     my @db_seq = @{ $db->seq };
 446
 447     # $i is a pointer into the original, ungapped, unmasked sequence. $_ is
 448     # for the aligned sequence
 449     my $i = 0;
 450     for ( 0 .. $#align ) {
 451       next if $align[$_] =~ /$GAP/;
 452
 453       if ( $align[$_] eq 'X' ) {
 454         unless ( defined $db_seq[ $i + $start ] ) {
 455           croak "JPRED: dex: the database sequence is not as long as the query sequence!";
 456         }
 457         $align[$_] = $db_seq[ $i + $start ];
 458       }
 459       $i++;
 460     }
 461
 462     $seq->align( \@align );
 463   }
 464 }
 465
 466 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 467
 468 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 469
 470 Equivilent to reduce script.
 471
 472 =cut
 473
 474 sub reduce {
 475   my ( $max, @seqs ) = @_;
 476
 477   my $num = int( @seqs / $max );
 478
 479   my @res = shift @seqs;
 480
 481   # Don't bother if we have less than the required sequences
 482   if ( @seqs < $max ) { return @res, @seqs }
 483   if ( $num == 0 )    { return @res, @seqs }
 484
 485   for ( 0 .. $#seqs ) { push @res, $seqs[$_] if $_ % $num == 0 }
 486
 487   return @res;
 488 }
 489
 490 =head2 nonred($cutoff, @PSISEQS);
 491
 492 Removes redundant sequences at $cutoff level of sequence identity.
 493
 494 Equivilent to nonred script.
 495
 496 =cut
 497
 498 sub nonred {
 499   my ( $cutoff, @seqs ) = @_;
 500
 501   # Run pairwise and then OC
 502   # Remove similar sequences
 503
 504   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 505   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 506   if     ( @seqs == 1 ) {
 507     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 508     return @seqs;
 509   }
 510
 511   my $fasta = FASTA::File->new;
 512
 513   # Convert the sequences into a FASTA object, which is what
 514   # pairwise expects.
 515   $fasta->add_entries(
 516     map {
 517       my $f = FASTA->new( id => $_->id );
 518       $f->seq( @{ $_->align } );
 519       $f;
 520       } @seqs
 521   );
 522
 523   # Run pairwise via wrappers
 524   use Pairwise;
 525   my $pair = Pairwise->new( path => $pairwise );
 526   my $foo = join "\n", $pair->run($fasta) or die $!;
 527
 528   # Run OC
 529   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 530   $ocres->read_string( join "", $ocres->run($foo) );
 531
 532   # Now found those entries that aren't related
 533   my @keep;
 534   {
 535     for ( $ocres->get_groups ) {
 536       my ( $group, $score, $size, @labels ) = @{$_};
 537
 538       # Keep the QUERY from the cluster, otherwise just get one
 539       if ( grep { /QUERY/ } @labels ) {
 540
 541         # Make sure the query stays at the start
 542         unshift @keep, "QUERY";
 543         next;
 544       } else {
 545         push @keep, shift @labels;
 546       }
 547     }
 548
 549     push @keep, $ocres->get_unclustered;
 550   }
 551
 552   # Return the matching entries.
 553   my @return;
 554
 555   # We use the temporay to mimic the selection of sequences in the nonred
 556   # script, which took the sequences in the order they occured in the file.
 557   for (@seqs) {
 558     my $label = $_->id;
 559     if ( grep { $_ =~ /^$label$/ } @keep ) {
 560       push @return, $_;
 561     }
 562   }
 563
 564   return @return;
 565 }
 566
 567 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 568
 569 Removes those sequences that are more than $percentage longer or shorter than the first
 570 sequence in @seqs. @seqs is an array of PSISEQ objects.
 571
 572 Equivilent to trim_seqs script.
 573
 574 =cut
 575
 576 sub del_long_seqs {
 577   my ( $factor, @seqs ) = @_;
 578
 579   # Keep the query sequence
 580   my $query = shift @seqs;
 581   my @res   = $query;
 582
 583   # Find the length of the query sequence without any gaps
 584   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 585
 586   # Calculate the upper and lower allowed bounds
 587   my ( $upper, $lower );
 588   {
 589     my $bounds = ( $length / 100 ) * $factor;
 590     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 591   }
 592
 593   # Now try each sequence and see how long they are
 594   for (@seqs) {
 595     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 596     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 597   }
 598
 599   return @res;
 600 }
 601
 602 =head2 toupper ($PSISEQ)
 603
 604 Converts sequence held in PSISEQ object to uppercase.
 605
 606 =cut
 607
 608 sub toupper {
 609   croak "JPRED: toupper: argument not a PSISEQ object" unless isa $_[0], 'PSISEQ';
 610   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 611 }
 612
 613 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 614
 615 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 616
 617 Equivilent to fasta2jnet script.
 618
 619 =cut
 620
 621 sub degap {
 622   my (@seqs) = @_;
 623
 624   # Find where the gaps in the query sequence are
 625   my @gaps;
 626   {
 627     my $i = 0;
 628     for ( @{ $seqs[0]->align } ) {
 629       push @gaps, $i if $_ !~ /$GAP/;
 630       $i++;
 631     }
 632   }
 633
 634   return unless @gaps;
 635
 636   # Remove the gaps in all the sequences.
 637   # Derefences the array reference and takes a slice inside the method call
 638   # arguments, then coerce back to an array ref.
 639   for (@seqs) {
 640     $_->align( [ @{ $_->align }[@gaps] ] );
 641   }
 642 }
 643
 644 =head2 getfreq($filename, @PSISEQS)
 645
 646 Creates a PSIBLAST like profile and writes it to the $filename.
 647
 648 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 649 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 650 but it's *not* the same.
 651
 652 =cut
 653
 654 sub getfreq {
 655   my ( $fn, @seqs ) = @_;
 656
 657   croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 658
 659   # Create a PSIBLAST like profile from the alignment
 660   # This doesn't greate the same result as old Jpred, I think this is because
 661   # differences in the input between old and new
 662
 663   # For testing
 664   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 665   #   my @profile = profile(
 666   #      map { join "", @{ $_->seq } } $f->get_entries
 667   #   );
 668
 669   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 670
 671   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 672   print $fh join( " ", @{$_} ), "\n" for @profile;
 673 }
 674
 675 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 676
 677 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 678
 679 Equivilent to gethmm script.
 680
 681 =cut
 682
 683 sub hmmer {
 684   my (@seqs) = @_;
 685
 686   # Temp files required
 687   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 688
 689   # Create the FASTA file
 690   psiseq2fasta( $tmp_fasta, @seqs );
 691
 692   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 693   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 694
 695   # Read in the HMMER file
 696   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 697
 698   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 699   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 700   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 701
 702   return $hmmer_jnet;
 703 }
 704
 705 =head2 jnet($hmmer, $out, $pssm )
 706
 707 Runs Jnet. Pass the paths of the filenames to do the prediction on
 708
 709 =cut
 710
 711 sub jnet {
 712   my ( $hmmer, $outfile, $pssm ) = @_;
 713   if ($pssm) {
 714
 715     # run Jnet prediction with all the profile data
 716     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 717   } else {
 718
 719     # run Jnet prediction with only HMMer and alignment data
 720     system("$jnet --concise --hmmer $hmmer > $outfile");
 721   }
 722   check( "jnet", $? ) or exit 1;
 723 }
 724
 725 =head1 psiseq2fasta($path, @PSISEQ)
 726
 727 Writes a FASTA file given an array of PSISEQ objects.
 728
 729 =cut
 730
 731 sub psiseq2fasta {
 732   my ( $fn, @seqs ) = @_;
 733
 734   confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 735   @seqs or confess "JPRED: not passed PSISEQs";
 736
 737   confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 738
 739   # Changed from using FASTA::File object due to seg faulting on some large,
 740   # long alignments, presumably due to running out of memory.
 741   #   my $fasta = FASTA::File->new;
 742   #   $fasta->add_entries(
 743   #      map {
 744   #         my $f = FASTA->new(id => $_->id);
 745   #         $f->seq(@{$_->align});
 746   #         $f;
 747   #      } @seqs
 748   #   );
 749   #   $fasta->write_file($fn);
 750
 751   my $fh;
 752   $fn =~ /\.gz$/
 753     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
 754     : ( open $fh, ">$fn" or die "JPRED: $!" );
 755
 756   for (@seqs) {
 757     print $fh ">", $_->id, "\n";
 758     my $seq = join( "", @{ $_->align } );
 759     $seq =~ s/(.{72})/$1\n/g;
 760     chomp $seq;
 761     print $fh $seq . "\n";
 762   }
 763   close $fh;
 764 }
 765
 766 =head1 @PSISEQ = fasta2psiseq($path)
 767
 768 Reads in a FASTA file and returns an array of PSISEQ objects.
 769
 770 =cut
 771
 772 sub fasta2psiseq {
 773   my ($fn) = @_;
 774
 775   my $fasta =
 776     $fn =~ /\.gz$/
 777     ? FASTA::File->new( read_gzip_file => $fn )
 778     : FASTA::File->new( read_file      => $fn );
 779   $fasta or die "JPRED: $!";
 780
 781   my @seqs;
 782   for ( $fasta->get_entries ) {
 783     my $seq = PSISEQ->new;
 784     $seq->id( $_->id );
 785     $seq->align( [ @{ $_->seq } ] );
 786     push @seqs, $seq;
 787   }
 788
 789   return @seqs;
 790 }
 791
 792 sub fasta2psiseq_lowmem {
 793   my ($fn) = @_;
 794
 795   local $/ = "\n";
 796   my $fh;
 797   $fn =~ /\.gz$/
 798     ? ( open $fh, "gzip -dc $fn|" or die $! )
 799     : ( open $fh, $fn or die $! );
 800
 801   my @seqs;
 802
 803   my ( $id, $seq );
 804   while (<$fh>) {
 805     chomp;
 806     if (s/^>//) {
 807       $seq =~ s/ //g if defined $seq;
 808
 809       if ( $id and $seq ) {
 810         my $psi = PSISEQ->new( id => $id );
 811         $psi->align( [ split //, $seq ] );
 812         push @seqs, $psi;
 813         undef $id;
 814         undef $seq;
 815       } else {
 816         $id = $_;
 817       }
 818     } else {
 819       $seq .= $_;
 820     }
 821   }
 822   if ( $id and $seq ) {
 823     my $psi = PSISEQ->new( id => $id );
 824     $psi->seq($seq);
 825     push @seqs, $psi;
 826   }
 827
 828   return @seqs;
 829 }
 830
 831 =head1 die_if_no_seqs($size, $message, @seqs)
 832
 833 Exits with error message if @seqs is not at least $size.
 834
 835 =cut
 836
 837 sub die_if_no_seqs {
 838   my ( $size, $message, @seqs ) = @_;
 839
 840   confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 841
 842   chomp $message;
 843   die "JPRED: $message\n" unless @seqs > $size;
 844 }
 845
 846 =head1 extend($FASTA::File, @PSISEQ)
 847
 848 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
 849 against the query, this function extends the alignment so that all of the query is
 850 present in the alignment. The other sequences are extended with gap characters.
 851
 852 =cut
 853
 854 sub extend {
 855   my ( $query, @seqs ) = @_;
 856
 857   croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 858   croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
 859
 860   # Get the query sequence
 861   my $q_query = $query->get_entry(0);
 862
 863   # Get the query sequence from the PSIBLAST alignment
 864   my $a_query = shift @seqs;
 865
 866   # The gap size required to expand the alignment
 867   my ( $start_gap_length, $end_gap_length );
 868   {
 869
 870     # Make handling the sequence easier
 871     my $q_seq = join "", @{ [ $q_query->seq ] };
 872     my $a_seq = join "", @{ $a_query->align };
 873
 874     $q_seq =~ s/-//g;
 875     $a_seq =~ s/-//g;
 876
 877     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
 878
 879     $start_gap_length = index( $q_seq, $a_seq );
 880     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 881
 882     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 883   }
 884
 885   # Add the gaps to the end of the alignments
 886   for (@seqs) {
 887     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
 888   }
 889
 890   # Put the query sequence back
 891   unshift @seqs,
 892     PSISEQ->new(
 893     id    => $a_query->id,
 894     align => [
 895       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
 896       @{ $a_query->align },
 897       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
 898     ]
 899     );
 900
 901   return @seqs;
 902 }
 903
 904 =head1 $int = fasta_seq_length($path)
 905
 906 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 907
 908 =cut
 909
 910 sub fasta_seq_length {
 911   my ($fn) = @_;
 912   open my $fh, $fn or croak "Can't open file $fn: $!\n";
 913
 914   my $length = 0;
 915   local $_, $/ = "\n";
 916   while (<$fh>) {
 917     next if /^>/;
 918     chomp;
 919     $length += tr///c;
 920   }
 921
 922   return $length;
 923 }
 924
 925 =head1 log(@messages)
 926
 927 Report messages to STDERR
 928
 929 =cut
 930
 931 sub errlog { print STDERR @_ }