jpred/jpred

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9   jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  14
  15 The program is primarily design for use by the Jpred server, but it can be used directly by any user. Some of the output may not be meaningful if used directly - it can be safely ignored.
  16
  17 =head1 OPTIONS
  18
  19 =over 5
  20
  21 =item --sequence path
  22
  23 The path to the sequence (in FASTA format) you want to predict.
  24
  25 =item --output file
  26
  27 A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
  28
  29 =item --db database
  30
  31 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
  32
  33 Default: uniref90
  34
  35 =item --psi path
  36
  37 Path to a PSIBLAST output file.
  38
  39 =item --pred-nohits
  40
  41 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  42
  43 =item --verbose
  44
  45 Verbose mode. Print more information while running jpred.
  46
  47 =item --debug
  48
  49 Debug mode. Print debugging information while running jpred.
  50
  51 =item --help
  52
  53 Gives help on the programs usage.
  54
  55 =item --man
  56
  57 Displays this man page.
  58
  59 =back
  60
  61 =head1 BUGS
  62
  63 Can't cope with gaps in the query sequence.
  64
  65 Doesn't accept alignments.
  66
  67 If you find any others please contact me.
  68
  69 =head1 AUTHORS
  70
  71 Jonathan Barber <jon@compbio.dundee.ac.uk>
  72
  73 Chris Cole <christian@cole.name> (current maintainer)
  74
  75 =cut
  76
  77 ## TODO check for gaps in query sequence
  78 ##      check for disallowed chars in sequence
  79
  80 use strict;
  81 use warnings;
  82 use Getopt::Long qw(:config auto_version);
  83 use Pod::Usage;
  84 use Carp;
  85 use UNIVERSAL qw(isa);
  86 use Data::Dumper;
  87 use File::Temp;
  88
  89 use FindBin qw($Bin);
  90 use lib "$Bin/../lib";
  91
  92 use Jpred;  # needed to define BLAST environment variables
  93
  94 # my modules
  95 use HMMER::Profile;
  96 use HMMER::Profile::Jnet;
  97
  98 use PSIBLAST;
  99 use PSIBLAST::PSSM;
 100 use PSIBLAST::Run;
 101
 102 use FASTA::File;
 103 use FASTA;
 104
 105 use Index;
 106 use OC;
 107 use Utils qw(profile);
 108 use Run qw(check);
 109 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin);
 110
 111 our $VERSION = '3.0.2';
 112 my $MAX_ALIGN = 1000;  # maximum number of alignment sequences allowed
 113 my $NR_CUT = 75;       # cut-off seqeunce ID value for defining non-redundancy
 114
 115 # Light object to hold sequence information, light to prevent memory overload
 116 # for big search results. Even so this is quite memory intensive, I can't see
 117 # how to reduce the size further.
 118 {
 119    package PSISEQ;
 120    #use Compress::LZF qw(compress decompress);
 121
 122    sub new {
 123       my ($class, %args) = @_;
 124       my $self = [];
 125       bless $self, $class;
 126       for (keys %args) { $self->$_($args{$_}) }
 127       return $self;
 128    }
 129
 130    sub id {
 131       return defined $_[1] ?
 132          $_[0]->[0] = $_[1] :
 133          $_[0]->[0];
 134    }
 135
 136    sub start {
 137       return defined $_[1] ?
 138          $_[0]->[1] = $_[1] :
 139          $_[0]->[1];
 140    }
 141
 142    sub align {
 143       if (defined $_[1]) {
 144          ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 145          #$_[0]->[2] = compress(join "", @{ $_[1] });
 146          $_[0]->[2] = join("", @{ $_[1] });
 147       }
 148       else {
 149          #return [ split //, decompress($_[0]->[2]) ];
 150          return [ split(//, $_[0]->[2]) ];
 151       }
 152    }
 153 }
 154
 155 # Key to database information and information for accessing them
 156 my $dbPATH = '/homes/chris/projects/Jnet/db';
 157 my $db_entry = "uniref90";
 158 my $database = {
 159
 160    ## default database used by Jpred
 161    uniref90 => {
 162       database => 'uniref90.filt',
 163       unfiltered => 'uniref90',
 164    },
 165
 166    ## database used during Jnet training
 167    training => {
 168       database => 'training/uniref90.filt',
 169       unfiltered => 'training/uniref90',
 170    },
 171
 172    ## cluster-specific path for Jpred
 173    cluster => {
 174       database => '/local/gjb_lab/www-jpred/uniref90.filt',
 175       unfiltered => '/local/gjb_lab/www-jpred/uniref90',
 176    },
 177
 178    ## these other DBs are experimental ones used during development.
 179    ## check they exist before using them.
 180    swall => {
 181       # generic entry for use with validate_jnet.pl
 182       # real db location defined by validate_jnet.pl
 183       database => "$dbPATH/swall/swall.filt",  # CC new
 184       unfiltered => "$dbPATH/swall/swall",
 185    },
 186    uniprot => {
 187       # Path to PSIBLAST db
 188       database => "$dbPATH/3/swall.filt",  # CC new
 189       unfiltered => "$dbPATH/3/swall",
 190    },
 191    uniref50 => {
 192       database => "$dbPATH/6/swall.filt",
 193       unfiltered => "$dbPATH/6/swall",
 194    },
 195 };
 196
 197 # Gap characters in sequences
 198 our $GAP = '-';
 199
 200 my ($help, $man, $path, $output, $psiblast, $DEBUG, $VERBOSE);
 201 my $predNoHits = 0;  # define whether to make predictions when no PSI-BLAST hits are found
 202 GetOptions(
 203    "help" => \$help,
 204    "man" => \$man,
 205    "verbose" => \$VERBOSE,
 206    "sequence=s" => \$path,
 207    "output=s" => \$output,
 208    "psi=s" => \$psiblast,
 209    "db=s" => \$db_entry,
 210    "pred-nohits" => \$predNoHits,
 211    "debug" => \$DEBUG
 212 ) or pod2usage(2);
 213 pod2usage(1) if $help;
 214 pod2usage(verbose => 2) if $man;
 215
 216 pod2usage(' --sequence argument not provided') unless $path;
 217 die "--db $db_entry not recognised" unless exists $database->{$db_entry};
 218 $output = $path unless $output;
 219
 220 for (
 221    [ "path", $path ],
 222    [ "output", $output ],
 223    [ "psi", $psiblast ],
 224    [ "db", $db_entry]
 225    ) {
 226    my ($key, $value) = @{$_};
 227    defined $value or next;
 228    errlog(join(": ", $key, $value), "\n");
 229 }
 230
 231 my $query = FASTA::File->new(read_file => $path);
 232
 233 my @seqs;
 234 if (1) {
 235    my $psi = PSIBLAST->new;        # CC PSIBLAST.pm doesn't have a new() class??
 236    unless (defined $psiblast && -e $psiblast) {
 237       my $psi_run = PSIBLAST::Run->new(
 238          debug => $DEBUG,
 239 #         path => "/software/jpred_bin/blastpgp",
 240          path => $psiblastbin,  # CC new path
 241          input => $path,
 242          output => "$output.blast",
 243          matrix => "$output.profile",
 244          database => $database->{$db_entry}{database},
 245          ## potentially a better set of parameters (esp. -b & -v)
 246          ## which avoid PSI-BLAST dying with large number of hits
 247          # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 248          # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
 249          # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 250       );
 251
 252       # For reduced databases, get the size of the orginal and use this as
 253       # the equivilent DB size
 254       #print $db_entry, "\n";
 255       if ($db_entry =~ /^sp_red_/) {
 256          (my $entry = $db_entry) =~ s/^sp_red_/sp_/;
 257          $psi_run->args( $psi_run->args . " -z " .
 258             fasta_seq_length($database->{$entry}{database})   # CC sub is below
 259          );
 260       }
 261       print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 262       print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
 263
 264       print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 265       $psi_run->run or die;    # CC sub is from PSIBLAST::Run
 266
 267       $psi->read_file("$output.blast");    # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 268       system("gzip -9f $output.blast");
 269    }
 270    else {
 271       if ($psiblast =~ /.gz$/) { $psi->read_gzip_file($psiblast) }    # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 272       else { $psi->read_file($psiblast) }  # CC ditto above
 273    }
 274
 275    # Convert the last itteration into a collection of PSISEQ objects
 276    for ($psi->get_all) {     # CC sub is from PSIBLAST.pm
 277       my ($id, $start, $align) = @{$_};
 278       push @seqs, PSISEQ->new(
 279          id => $id,
 280          start => $start,
 281          align => [ split(//, $align) ]
 282       );
 283    }
 284 }
 285
 286 ## When there are no PSI-BLAST hits generate an HMM from the query
 287 ## and run Jnet against that only.
 288 ## Accuracy won't be as good, but at least it's a prediction
 289 if (@seqs == 0) {
 290    if ($predNoHits == 0) {
 291       warn "\nJPRED: Warning - no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
 292       print ">>100% complete\n";
 293       exit;
 294    }
 295    print ">>50% complete\n";
 296    warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 297    print "Running HMMer on query...\n" if $VERBOSE;
 298
 299    # copy input query to alignment
 300    system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 301
 302    # Temp files required for HMMer
 303    my ($hmmbuild_out, $hmmconvert_out) = map {
 304       File::Temp->new->filename
 305    } 1..2;
 306
 307    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $path");
 308    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 309
 310    # Read in the HMMER file
 311    my $hmmer = HMMER::Profile->new(
 312       read_file => $hmmconvert_out
 313    );
 314
 315    # Convert from HMMER::Profile to HMMER::Profile::Jnet
 316    my $hmmer_jnet = HMMER::Profile::Jnet->new;
 317    $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
 318    $hmmer_jnet->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
 319    print ">>70% complete\n";
 320
 321    # Run Jnet for the prediction
 322    print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 323    jnet(map { "$output.$_" } qw(hmm jnet));   # CC sub is below
 324
 325    print "Jpred Finished\n";
 326    exit;
 327 }
 328
 329 psiseq2fasta("0.fasta.gz", @seqs) if $DEBUG;     # CC sub is below
 330 print ">>40% complete\n";
 331
 332 # Make PSIBLAST truncated alignments the right length
 333 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 334 @seqs = extend($query, @seqs);         # CC sub si below
 335 psiseq2fasta("1.fasta.gz", @seqs) if $DEBUG;
 336
 337 # Remove masking from PSIBLAST alignment
 338 print "Unmasking the alignments...\n" if $VERBOSE;
 339 dex($_) for @seqs[ 1..$#seqs ];        ## <-- CC dies here in dex() - found below
 340 psiseq2fasta("2.fasta.gz", @seqs) if $DEBUG;
 341
 342 # Convert the sequences to upper case
 343 print "Converting sequences to the same case...\n" if $VERBOSE;
 344 toupper($_) for @seqs;                 # CC sub is below
 345 psiseq2fasta("${output}_backup.fasta.gz", @seqs);
 346
 347 # Remove excessive sequences
 348 print "Remove excessive sequences...\n" if $VERBOSE;
 349 @seqs = reduce($MAX_ALIGN, @seqs);           # CC sub is below
 350 psiseq2fasta("3.fasta.gz", @seqs) if $DEBUG;
 351
 352 # Remove sequences that are too long or too short
 353 print "Remove sequences which too long or short...\n" if $VERBOSE;
 354 @seqs = del_long_seqs(50, @seqs);      # CC sub is below
 355 psiseq2fasta("4.fasta.gz", @seqs) if $DEBUG;
 356
 357 # Remove redundant sequences based upon pairwise identity and OC clustering
 358 print "Remove redundant sequences...\n" if $VERBOSE;
 359 @seqs = nonred($NR_CUT, @seqs);             # CC sub is below
 360 psiseq2fasta("5.fasta.gz", @seqs) if $DEBUG;
 361
 362 # Check that we haven't got rid of all of the sequences
 363 if (@seqs < 2) {
 364    warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 365    @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 366 }
 367 unlink("${output}_backup.fasta.gz") unless $DEBUG;
 368
 369 # Remove gaps in the query sequence and same positions in the alignment
 370 print "Removing gaps in the query sequence...\n" if $VERBOSE;
 371 degap(@seqs);                          # CC sub is below
 372 psiseq2fasta("6.fasta.gz", @seqs) if $DEBUG;
 373
 374 # Output the alignment for the prediction
 375 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 376 psiseq2fasta("$output.align", @seqs);
 377
 378 {
 379    # Equivilent to getpssm script
 380    print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 381    my $pssm = PSIBLAST::PSSM->new(read_file => "$output.profile");
 382    $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 383 }
 384 print ">>50% complete\n";
 385
 386 # Run HMMER on the sequences
 387 {
 388    print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 389    my $hmmer = hmmer(@seqs);          # CC sub is below
 390    $hmmer->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
 391 }
 392 print ">>70% complete\n";
 393
 394 # Run Jnet for the prediction
 395 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 396 jnet(map { "$output.$_" } qw(hmm jnet pssm));   # CC sub is below
 397
 398 print "Jpred Finished\n";
 399
 400 # Then do an accuracy prediction
 401
 402 ############################################################
 403 # Functions
 404 ############################################################
 405
 406 =begin :private
 407
 408 =head2 $PSISEQ = dex($PSISEQ)
 409
 410 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. Does it in place.
 411
 412 =cut
 413
 414
 415 BEGIN {
 416    my $idx = Index->new(type => "Index::FastaCMD") or die "Index type cannot be found.";
 417
 418    sub dex {
 419       my ($seq) = @_;
 420
 421       croak "JPRED: dex: can only take a PSISEQ object as an argument" unless isa $seq, 'PSISEQ';
 422
 423       # Only bother if we have a sequence which has been masked
 424       return unless grep { uc $_ eq 'X' } @{ $seq->align };
 425
 426       my ($id, $start, @align) = ($seq->id, $seq->start, @{$seq->align});
 427
 428       #$id = "$id"; # Add the EMBOSS USA
 429       #$id = "sprot_37:$id"; # Add the EMBOSS USA
 430       #$id = "sprot_43:$id"; # Add the EMBOSS USA
 431 #      $id = $database->{$db_entry}{emboss} . ":$id";  # commented out to try FastaCMD module
 432       my $searchdb = $database->{$db_entry}{unfiltered};
 433       $start--; # We need this to be zero based
 434
 435       my $f = $idx->get_sequence($id, $searchdb) or croak "JPRED: No such entry for $id in $searchdb\n" and return;   # CC sub is from Index::EMBOSS
 436       my @seqs = $f->get_entries;       ## Sub is from Sequence::File
 437       my $db = shift @seqs;
 438
 439       unless ($db) {
 440          confess "JPRED: dex: Accession number $id returned no sequences";
 441       }
 442       elsif (@seqs) {
 443          confess "JPRED: dex: Accession number $id returned more than one sequence";
 444       }
 445
 446       my @db_seq = @{ $db->seq };
 447
 448       # $i is a pointer into the original, ungapped, unmasked sequence. $_ is
 449       # for the aligned sequence
 450       my $i = 0;
 451       for (0..$#align) {
 452          next if $align[$_] =~ /$GAP/;
 453
 454          if ($align[$_] eq 'X') {
 455             unless (defined $db_seq[$i + $start]) {
 456                croak "JPRED: dex: the database sequence is not as long as the query sequence!"
 457             }
 458             $align[$_] = $db_seq[$i + $start];
 459          }
 460          $i++;
 461       }
 462
 463       $seq->align(\@align);
 464    }
 465 }
 466
 467 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 468
 469 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 470
 471 Equivilent to reduce script.
 472
 473 =cut
 474
 475 sub reduce {
 476    my ($max, @seqs) = @_;
 477
 478    my $num = int(@seqs / $max);
 479
 480    my @res = shift @seqs;
 481
 482    # Don't bother if we have less than the required sequences
 483    if (@seqs < $max) { return @res, @seqs }
 484    if ($num == 0) { return @res, @seqs }
 485
 486    for (0..$#seqs) { push @res, $seqs[$_] if $_ % $num == 0 }
 487
 488    return @res;
 489 }
 490
 491 =head2 nonred($cutoff, @PSISEQS);
 492
 493 Removes redundant sequences at $cutoff level of sequence identity.
 494
 495 Equivilent to nonred script.
 496
 497 =cut
 498
 499 sub nonred {
 500    my ($cutoff, @seqs) = @_;
 501    # Run pairwise and then OC
 502    # Remove similar sequences
 503
 504    unless (defined $cutoff) { croak "JPRED: nonred() not passed a cutoff" }
 505    unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
 506    if (@seqs == 1) {
 507       warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 508       return @seqs;
 509    }
 510
 511    my $fasta = FASTA::File->new;
 512
 513    # Convert the sequences into a FASTA object, which is what
 514    # pairwise expects.
 515    $fasta->add_entries(
 516       map {
 517          my $f = FASTA->new(id => $_->id);
 518          $f->seq(@{$_->align});
 519          $f;
 520       } @seqs
 521    );
 522
 523    # Run pairwise via wrappers
 524    use Pairwise;
 525    my $pair = Pairwise->new(
 526       path => $pairwise
 527    );
 528    my $foo = join "\n", $pair->run($fasta) or die $!;
 529
 530    # Run OC
 531    my $ocres = OC->new(
 532       path => "$oc sim complete cut $cutoff"
 533    );
 534    $ocres->read_string(join "", $ocres->run($foo) );
 535
 536    # Now found those entries that aren't related
 537    my @keep;
 538    {
 539       for ($ocres->get_groups) {
 540          my ($group, $score, $size, @labels) = @{ $_ };
 541
 542          # Keep the QUERY from the cluster, otherwise just get one
 543          if (grep { /QUERY/ } @labels) {
 544             # Make sure the query stays at the start
 545             unshift @keep, "QUERY";
 546             next;
 547          }
 548          else { push @keep, shift @labels }
 549       }
 550
 551       push @keep, $ocres->get_unclustered;
 552    }
 553
 554    # Return the matching entries.
 555    my @return;
 556
 557    # We use the temporay to mimic the selection of sequences in the nonred
 558    # script, which took the sequences in the order they occured in the file.
 559    for (@seqs) {
 560       my $label = $_->id;
 561       if (grep { $_ =~ /^$label$/ } @keep) {
 562          push @return, $_;
 563       }
 564    }
 565
 566    return @return;
 567 }
 568
 569 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 570
 571 Removes those sequences that are more than $percentage longer or shorter than the first sequence in @seqs. @seqs is an array of PSISEQ objects.
 572
 573 Equivilent to trim_seqs script.
 574
 575 =cut
 576
 577 sub del_long_seqs {
 578    my ($factor, @seqs) = @_;
 579
 580    # Keep the query sequence
 581    my $query = shift @seqs;
 582    my @res = $query;
 583
 584    # Find the length of the query sequence without any gaps
 585    my $length = (() = (join '', @{$query->align}) =~ /[^$GAP]/g);
 586
 587    # Calculate the upper and lower allowed bounds
 588    my ($upper, $lower);
 589    {
 590       my $bounds = ($length / 100) * $factor;
 591       ($upper, $lower) = ($length + $bounds, $length - $bounds);
 592    }
 593
 594    # Now try each sequence and see how long they are
 595    for (@seqs) {
 596       my $l = (() = (join '', @{$_->align}) =~ /[^$GAP]/g);
 597       if ($l >= $lower && $l <= $upper) { push @res, $_ }
 598    }
 599
 600    return @res;
 601 }
 602
 603 =head2 toupper ($PSISEQ)
 604
 605 Converts sequence held in PSISEQ object to uppercase.
 606
 607 =cut
 608
 609 sub toupper {
 610    croak "JPRED: toupper: argument not a PSISEQ object" unless isa $_[0], 'PSISEQ';
 611    $_[0]->align(
 612       [ split //, uc join '', @{ $_[0]->align } ]
 613    );
 614 }
 615
 616 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 617
 618 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 619
 620 Equivilent to fasta2jnet script.
 621
 622 =cut
 623
 624 sub degap {
 625    my (@seqs) = @_;
 626
 627    # Find where the gaps in the query sequence are
 628    my @gaps;
 629    {
 630       my $i = 0;
 631       for (@{$seqs[0]->align}) {
 632          push @gaps, $i if $_ !~ /$GAP/;
 633          $i++;
 634       }
 635    }
 636
 637    return unless @gaps;
 638
 639    # Remove the gaps in all the sequences.
 640    # Derefences the array reference and takes a slice inside the method call
 641    # arguments, then coerce back to an array ref.
 642    for (@seqs) {
 643       $_->align([ @{$_->align}[@gaps] ]);
 644    }
 645 }
 646
 647 =head2 getfreq($filename, @PSISEQS)
 648
 649 Creates a PSIBLAST like profile and writes it to the $filename.
 650
 651 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, but it's *not* the same.
 652
 653 =cut
 654
 655 sub getfreq {
 656    my ($fn, @seqs) = @_;
 657
 658    croak "JPRED: Passed non PSISEQ object" if grep { ! isa($_, 'PSISEQ') } @seqs;
 659
 660    # Create a PSIBLAST like profile from the alignment
 661    # This doesn't greate the same result as old Jpred, I think this is because
 662    # differences in the input between old and new
 663
 664    # For testing
 665 #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 666 #   my @profile = profile(
 667 #      map { join "", @{ $_->seq } } $f->get_entries
 668 #   );
 669
 670    my @profile = profile(
 671       map { join "", @{ $_->align } } @seqs
 672    );
 673
 674    open my $fh, ">$fn" or die "JPRED: $fn: $!";
 675    print $fh join(" ", @{$_}), "\n" for @profile;
 676 }
 677
 678 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 679
 680 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 681
 682 Equivilent to gethmm script.
 683
 684 =cut
 685
 686 sub hmmer {
 687    my (@seqs) = @_;
 688
 689    # Temp files required
 690    my ($hmmbuild_out, $hmmconvert_out, $tmp_fasta) = map {
 691       File::Temp->new->filename
 692    } 1..3;
 693
 694    # Create the FASTA file
 695    psiseq2fasta($tmp_fasta, @seqs);
 696
 697    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 698    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 699
 700    # Read in the HMMER file
 701    my $hmmer = HMMER::Profile->new(
 702       read_file => $hmmconvert_out
 703    );
 704
 705    # Convert from HMMER::Profile to HMMER::Profile::Jnet
 706    my $hmmer_jnet = HMMER::Profile::Jnet->new;
 707    $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
 708
 709    return $hmmer_jnet;
 710 }
 711
 712 =head2 jnet($hmmer, $out, $pssm )
 713
 714 Runs Jnet. Pass the paths of the filenames to do the prediction on
 715
 716 =cut
 717
 718 sub jnet {
 719    my ($hmmer, $outfile, $pssm) = @_;
 720    if ($pssm) {
 721       # run Jnet prediction with all the profile data
 722       system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 723    } else {
 724       # run Jnet prediction with only HMMer and alignment data
 725       system("$jnet --concise --hmmer $hmmer > $outfile");
 726    }
 727    check("jnet", $?) or exit 1;
 728 }
 729
 730 =head1 psiseq2fasta($path, @PSISEQ)
 731
 732 Writes a FASTA file given an array of PSISEQ objects.
 733
 734 =cut
 735
 736 sub psiseq2fasta {
 737    my ($fn, @seqs) = @_;
 738
 739    confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 740    @seqs or confess "JPRED: not passed PSISEQs";
 741
 742    confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 743
 744    # Changed from using FASTA::File object due to seg faulting on some large,
 745    # long alignments, presumably due to running out of memory.
 746 #   my $fasta = FASTA::File->new;
 747 #   $fasta->add_entries(
 748 #      map {
 749 #         my $f = FASTA->new(id => $_->id);
 750 #         $f->seq(@{$_->align});
 751 #         $f;
 752 #      } @seqs
 753 #   );
 754 #   $fasta->write_file($fn);
 755
 756    my $fh;
 757    $fn =~ /\.gz$/ ?
 758       ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" ):
 759       ( open $fh, ">$fn" or die "JPRED: $!" );
 760
 761    for (@seqs) {
 762       print $fh ">", $_->id, "\n";
 763       my $seq = join("", @{ $_->align });
 764       $seq =~ s/(.{72})/$1\n/g;
 765       chomp $seq;
 766       print $fh $seq."\n";
 767    }
 768    close $fh;
 769 }
 770
 771 =head1 @PSISEQ = fasta2psiseq($path)
 772
 773 Reads in a FASTA file and returns an array of PSISEQ objects.
 774
 775 =cut
 776
 777 sub fasta2psiseq {
 778    my ($fn) = @_;
 779
 780    my $fasta = $fn =~ /\.gz$/ ?
 781       FASTA::File->new(read_gzip_file => $fn) :
 782       FASTA::File->new(read_file => $fn);
 783    $fasta or die "JPRED: $!";
 784
 785    my @seqs;
 786    for ($fasta->get_entries) {
 787       my $seq = PSISEQ->new;
 788       $seq->id($_->id);
 789       $seq->align( [ @{ $_->seq } ] );
 790       push @seqs, $seq;
 791    }
 792
 793    return @seqs;
 794 }
 795
 796 sub fasta2psiseq_lowmem {
 797    my ($fn) = @_;
 798
 799    local $/ = "\n";
 800    my $fh;
 801    $fn =~ /\.gz$/ ?
 802       (open $fh, "gzip -dc $fn|" or die $!) :
 803       (open $fh, $fn or die $!);
 804
 805    my @seqs;
 806
 807    my ($id, $seq);
 808    while (<$fh>) {
 809       chomp;
 810       if (s/^>//) {
 811          $seq =~ s/ //g if defined $seq;
 812
 813          if ($id and $seq) {
 814             my $psi = PSISEQ->new(id => $id);
 815             $psi->align([ split //, $seq ]);
 816             push @seqs, $psi;
 817             undef $id; undef $seq;
 818          }
 819          else { $id = $_ }
 820       }
 821       else {
 822          $seq .= $_;
 823       }
 824    }
 825    if ($id and $seq) {
 826       my $psi = PSISEQ->new(id => $id);
 827       $psi->seq($seq);
 828       push @seqs, $psi;
 829    }
 830
 831    return @seqs;
 832 }
 833
 834 =head1 die_if_no_seqs($size, $message, @seqs)
 835
 836 Exits with error message if @seqs is not at least $size.
 837
 838 =cut
 839
 840 sub die_if_no_seqs {
 841    my ($size, $message, @seqs) = @_;
 842
 843    confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 844
 845    chomp $message;
 846    die "JPRED: $message\n" unless @seqs > $size;
 847 }
 848
 849 =head1 extend($FASTA::File, @PSISEQ)
 850
 851 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues against the query, this function extends the alignment so that all of the query is present in the alignment. The other sequences are extended with gap characters.
 852
 853 =cut
 854
 855 sub extend {
 856    my ($query, @seqs) = @_;
 857
 858    croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 859    croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 860
 861    # Get the query sequence
 862    my $q_query = $query->get_entry(0);
 863
 864    # Get the query sequence from the PSIBLAST alignment
 865    my $a_query = shift @seqs;
 866
 867    # The gap size required to expand the alignment
 868    my ($start_gap_length, $end_gap_length);
 869    {
 870       # Make handling the sequence easier
 871       my $q_seq = join "", @{[ $q_query->seq ]};
 872       my $a_seq = join "", @{ $a_query->align };
 873
 874       $q_seq =~ s/-//g;
 875       $a_seq =~ s/-//g;
 876
 877       ($q_seq, $a_seq) = map { uc $_ } $q_seq, $a_seq;
 878
 879       $start_gap_length = index($q_seq, $a_seq);
 880       croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 881
 882       $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 883    }
 884
 885    # Add the gaps to the end of the alignments
 886    for (@seqs) {
 887       $_->align([
 888          ($GAP) x $start_gap_length,
 889          @{ $_->align },
 890          ($GAP) x $end_gap_length
 891       ]);
 892    }
 893
 894    # Put the query sequence back
 895    unshift @seqs, PSISEQ->new(
 896       id => $a_query->id,
 897       align => [
 898          ($start_gap_length ? @{ $q_query->seq }[ 0..($start_gap_length-1) ] : ()),
 899          @{ $a_query->align },
 900          ($end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : ()),
 901       ]
 902    );
 903
 904    return @seqs;
 905 }
 906
 907 =head1 $int = fasta_seq_length($path)
 908
 909 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 910
 911 =cut
 912
 913 sub fasta_seq_length {
 914    my ($fn) = @_;
 915    open my $fh, $fn or croak "Can't open file $fn: $!\n";
 916
 917    my $length = 0;
 918    local $_, $/ = "\n";
 919    while (<$fh>) {
 920       next if /^>/;
 921       chomp;
 922       $length += tr///c;
 923    }
 924
 925    return $length;
 926 }
 927
 928 =head1 log(@messages)
 929
 930 Report messages to STDERR
 931
 932 =cut
 933
 934 sub errlog { print STDERR @_ }