jpred/jpred

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9   jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  14
  15 The program is primarily design for use by the Jpred server, but it can be used directly by any user. Some of the output may not be meaningful if used directly - it can be safely ignored.
  16
  17 =head1 OPTIONS
  18
  19 =over 5
  20
  21 =item --sequence path
  22
  23 The path to the sequence (in FASTA format) you want to predict.
  24
  25 =item --output file
  26
  27 A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
  28
  29 =item --db database
  30
  31 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
  32
  33 Default: uniref90
  34
  35 =item --psi path
  36
  37 Path to a PSIBLAST output file.
  38
  39 =item --pred-nohits
  40
  41 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  42
  43 =item --verbose
  44
  45 Verbose mode. Print more information while running jpred.
  46
  47 =item --debug
  48
  49 Debug mode. Print debugging information while running jpred.
  50
  51 =item --help
  52
  53 Gives help on the programs usage.
  54
  55 =item --man
  56
  57 Displays this man page.
  58
  59 =back
  60
  61 =head1 BUGS
  62
  63 Can't cope with gaps in the query sequence.
  64
  65 Doesn't accept alignments.
  66
  67 If you find any others please contact me.
  68
  69 =head1 AUTHORS
  70
  71 Jonathan Barber <jon@compbio.dundee.ac.uk>
  72
  73 Chris Cole <christian@cole.name> (current maintainer)
  74
  75 =cut
  76
  77 ## TODO check for gaps in query sequence
  78 ##      check for disallowed chars in sequence
  79
  80 use strict;
  81 use warnings;
  82 use Getopt::Long;
  83 use Pod::Usage;
  84 use Carp;
  85 use UNIVERSAL qw(isa);
  86 use Data::Dumper;
  87 use File::Temp;
  88
  89 use Jpred;  # needed to define BLAST environment variables
  90
  91 # my modules
  92 use HMMER::Profile;
  93 use HMMER::Profile::Jnet;
  94
  95 use PSIBLAST;
  96 use PSIBLAST::PSSM;
  97 use PSIBLAST::Run;
  98
  99 use FASTA::File;
 100 use FASTA;
 101
 102 use Index;
 103 use OC;
 104 use Utils qw(profile);
 105 use Run qw(check);
 106 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin);
 107
 108 my $VERSION = '3.0.1';
 109 my $MAX_ALIGN = 1000;  # maximum number of alignment sequences allowed
 110 my $NR_CUT = 75;       # cut-off seqeunce ID value for defining non-redundancy
 111
 112 # Light object to hold sequence information, light to prevent memory overload
 113 # for big search results. Even so this is quite memory intensive, I can't see
 114 # how to reduce the size further.
 115 {
 116    package PSISEQ;
 117    #use Compress::LZF qw(compress decompress);
 118
 119    sub new {
 120       my ($class, %args) = @_;
 121       my $self = [];
 122       bless $self, $class;
 123       for (keys %args) { $self->$_($args{$_}) }
 124       return $self;
 125    }
 126
 127    sub id {
 128       return defined $_[1] ?
 129          $_[0]->[0] = $_[1] :
 130          $_[0]->[0];
 131    }
 132
 133    sub start {
 134       return defined $_[1] ?
 135          $_[0]->[1] = $_[1] :
 136          $_[0]->[1];
 137    }
 138
 139    sub align {
 140       if (defined $_[1]) {
 141          ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 142          #$_[0]->[2] = compress(join "", @{ $_[1] });
 143          $_[0]->[2] = join("", @{ $_[1] });
 144       }
 145       else {
 146          #return [ split //, decompress($_[0]->[2]) ];
 147          return [ split(//, $_[0]->[2]) ];
 148       }
 149    }
 150 }
 151
 152 # Key to database information and information for accessing them
 153 my $dbPATH = '/homes/chris/projects/Jnet/db';
 154 my $db_entry = "uniref90";
 155 my $database = {
 156
 157    ## default database used by Jpred
 158    uniref90 => {
 159       database => 'uniref90.filt',
 160       unfiltered => 'uniref90',
 161    },
 162
 163    ## database used during Jnet training
 164    training => {
 165       database => 'training/uniref90.filt',
 166       unfiltered => 'training/uniref90',
 167    },
 168
 169    ## cluster-specific path for Jpred
 170    cluster => {
 171       database => '/local/gjb_lab/www-jpred/uniref90.filt',
 172       unfiltered => '/local/gjb_lab/www-jpred/uniref90',
 173    },
 174
 175    ## these other DBs are experimental ones used during development.
 176    ## check they exist before using them.
 177    swall => {
 178       # generic entry for use with validate_jnet.pl
 179       # real db location defined by validate_jnet.pl
 180       database => "$dbPATH/swall/swall.filt",  # CC new
 181       unfiltered => "$dbPATH/swall/swall",
 182    },
 183    uniprot => {
 184       # Path to PSIBLAST db
 185       database => "$dbPATH/3/swall.filt",  # CC new
 186       unfiltered => "$dbPATH/3/swall",
 187    },
 188    uniref50 => {
 189       database => "$dbPATH/6/swall.filt",
 190       unfiltered => "$dbPATH/6/swall",
 191    },
 192 };
 193
 194 # Gap characters in sequences
 195 our $GAP = '-';
 196
 197 my ($help, $man, $path, $output, $psiblast, $DEBUG, $VERBOSE);
 198 my $predNoHits = 0;  # define whether to make predictions when no PSI-BLAST hits are found
 199 GetOptions(
 200    "help" => \$help,
 201    "man" => \$man,
 202    "verbose" => \$VERBOSE,
 203    "sequence=s" => \$path,
 204    "output=s" => \$output,
 205    "psi=s" => \$psiblast,
 206    "db=s" => \$db_entry,
 207    "pred-nohits" => \$predNoHits,
 208    "debug" => \$DEBUG
 209 ) or pod2usage(2);
 210 pod2usage(1) if $help;
 211 pod2usage(verbose => 2) if $man;
 212
 213 pod2usage(' --sequence argument not provided') unless $path;
 214 die "--db $db_entry not recognised" unless exists $database->{$db_entry};
 215 $output = $path unless $output;
 216
 217 for (
 218    [ "path", $path ],
 219    [ "output", $output ],
 220    [ "psi", $psiblast ],
 221    [ "db", $db_entry]
 222    ) {
 223    my ($key, $value) = @{$_};
 224    defined $value or next;
 225    errlog(join(": ", $key, $value), "\n");
 226 }
 227
 228 my $query = FASTA::File->new(read_file => $path);
 229
 230 my @seqs;
 231 if (1) {
 232    my $psi = PSIBLAST->new;        # CC PSIBLAST.pm doesn't have a new() class??
 233    unless (defined $psiblast && -e $psiblast) {
 234       my $psi_run = PSIBLAST::Run->new(
 235          debug => $DEBUG,
 236 #         path => "/software/jpred_bin/blastpgp",
 237          path => $psiblastbin,  # CC new path
 238          input => $path,
 239          output => "$output.blast",
 240          matrix => "$output.profile",
 241          database => $database->{$db_entry}{database},
 242          ## potentially a better set of parameters (esp. -b & -v)
 243          ## which avoid PSI-BLAST dying with large number of hits
 244          # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 245          # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
 246          # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 247       );
 248
 249       # For reduced databases, get the size of the orginal and use this as
 250       # the equivilent DB size
 251       #print $db_entry, "\n";
 252       if ($db_entry =~ /^sp_red_/) {
 253          (my $entry = $db_entry) =~ s/^sp_red_/sp_/;
 254          $psi_run->args( $psi_run->args . " -z " .
 255             fasta_seq_length($database->{$entry}{database})   # CC sub is below
 256          );
 257       }
 258       print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 259       print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
 260
 261       print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 262       $psi_run->run or die;    # CC sub is from PSIBLAST::Run
 263
 264       $psi->read_file("$output.blast");    # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 265       system("gzip -9f $output.blast");
 266    }
 267    else {
 268       if ($psiblast =~ /.gz$/) { $psi->read_gzip_file($psiblast) }    # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 269       else { $psi->read_file($psiblast) }  # CC ditto above
 270    }
 271
 272    # Convert the last itteration into a collection of PSISEQ objects
 273    for ($psi->get_all) {     # CC sub is from PSIBLAST.pm
 274       my ($id, $start, $align) = @{$_};
 275       push @seqs, PSISEQ->new(
 276          id => $id,
 277          start => $start,
 278          align => [ split(//, $align) ]
 279       );
 280    }
 281 }
 282
 283 ## When there are no PSI-BLAST hits generate an HMM from the query
 284 ## and run Jnet against that only.
 285 ## Accuracy won't be as good, but at least it's a prediction
 286 if (@seqs == 0) {
 287    if ($predNoHits == 0) {
 288       warn "\nJPRED: Warning - no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
 289       print ">>100% complete\n";
 290       exit;
 291    }
 292    print ">>50% complete\n";
 293    warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 294    print "Running HMMer on query...\n" if $VERBOSE;
 295
 296    # copy input query to alignment
 297    system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 298
 299    # Temp files required for HMMer
 300    my ($hmmbuild_out, $hmmconvert_out) = map {
 301       File::Temp->new->filename
 302    } 1..2;
 303
 304    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $path");
 305    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 306
 307    # Read in the HMMER file
 308    my $hmmer = HMMER::Profile->new(
 309       read_file => $hmmconvert_out
 310    );
 311
 312    # Convert from HMMER::Profile to HMMER::Profile::Jnet
 313    my $hmmer_jnet = HMMER::Profile::Jnet->new;
 314    $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
 315    $hmmer_jnet->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
 316    print ">>70% complete\n";
 317
 318    # Run Jnet for the prediction
 319    print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 320    jnet(map { "$output.$_" } qw(hmm jnet));   # CC sub is below
 321
 322    print "Jpred Finished\n";
 323    exit;
 324 }
 325
 326 psiseq2fasta("0.fasta.gz", @seqs) if $DEBUG;     # CC sub is below
 327 print ">>40% complete\n";
 328
 329 # Make PSIBLAST truncated alignments the right length
 330 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 331 @seqs = extend($query, @seqs);         # CC sub si below
 332 psiseq2fasta("1.fasta.gz", @seqs) if $DEBUG;
 333
 334 # Remove masking from PSIBLAST alignment
 335 print "Unmasking the alignments...\n" if $VERBOSE;
 336 dex($_) for @seqs[ 1..$#seqs ];        ## <-- CC dies here in dex() - found below
 337 psiseq2fasta("2.fasta.gz", @seqs) if $DEBUG;
 338
 339 # Convert the sequences to upper case
 340 print "Converting sequences to the same case...\n" if $VERBOSE;
 341 toupper($_) for @seqs;                 # CC sub is below
 342 psiseq2fasta("${output}_backup.fasta.gz", @seqs);
 343
 344 # Remove excessive sequences
 345 print "Remove excessive sequences...\n" if $VERBOSE;
 346 @seqs = reduce($MAX_ALIGN, @seqs);           # CC sub is below
 347 psiseq2fasta("3.fasta.gz", @seqs) if $DEBUG;
 348
 349 # Remove sequences that are too long or too short
 350 print "Remove sequences which too long or short...\n" if $VERBOSE;
 351 @seqs = del_long_seqs(50, @seqs);      # CC sub is below
 352 psiseq2fasta("4.fasta.gz", @seqs) if $DEBUG;
 353
 354 # Remove redundant sequences based upon pairwise identity and OC clustering
 355 print "Remove redundant sequences...\n" if $VERBOSE;
 356 @seqs = nonred($NR_CUT, @seqs);             # CC sub is below
 357 psiseq2fasta("5.fasta.gz", @seqs) if $DEBUG;
 358
 359 # Check that we haven't got rid of all of the sequences
 360 if (@seqs < 2) {
 361    warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 362    @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 363 }
 364 unlink("${output}_backup.fasta.gz") unless $DEBUG;
 365
 366 # Remove gaps in the query sequence and same positions in the alignment
 367 print "Removing gaps in the query sequence...\n" if $VERBOSE;
 368 degap(@seqs);                          # CC sub is below
 369 psiseq2fasta("6.fasta.gz", @seqs) if $DEBUG;
 370
 371 # Output the alignment for the prediction
 372 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 373 psiseq2fasta("$output.align", @seqs);
 374
 375 {
 376    # Equivilent to getpssm script
 377    print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 378    my $pssm = PSIBLAST::PSSM->new(read_file => "$output.profile");
 379    $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 380 }
 381 print ">>50% complete\n";
 382
 383 # Run HMMER on the sequences
 384 {
 385    print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 386    my $hmmer = hmmer(@seqs);          # CC sub is below
 387    $hmmer->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
 388 }
 389 print ">>70% complete\n";
 390
 391 # Run Jnet for the prediction
 392 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 393 jnet(map { "$output.$_" } qw(hmm jnet pssm));   # CC sub is below
 394
 395 print "Jpred Finished\n";
 396
 397 # Then do an accuracy prediction
 398
 399 ############################################################
 400 # Functions
 401 ############################################################
 402
 403 =begin :private
 404
 405 =head2 $PSISEQ = dex($PSISEQ)
 406
 407 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. Does it in place.
 408
 409 =cut
 410
 411
 412 BEGIN {
 413    my $idx = Index->new(type => "Index::FastaCMD") or die "Index type cannot be found.";
 414
 415    sub dex {
 416       my ($seq) = @_;
 417
 418       croak "JPRED: dex: can only take a PSISEQ object as an argument" unless isa $seq, 'PSISEQ';
 419
 420       # Only bother if we have a sequence which has been masked
 421       return unless grep { uc $_ eq 'X' } @{ $seq->align };
 422
 423       my ($id, $start, @align) = ($seq->id, $seq->start, @{$seq->align});
 424
 425       #$id = "$id"; # Add the EMBOSS USA
 426       #$id = "sprot_37:$id"; # Add the EMBOSS USA
 427       #$id = "sprot_43:$id"; # Add the EMBOSS USA
 428 #      $id = $database->{$db_entry}{emboss} . ":$id";  # commented out to try FastaCMD module
 429       my $searchdb = $database->{$db_entry}{unfiltered};
 430       $start--; # We need this to be zero based
 431
 432       my $f = $idx->get_sequence($id, $searchdb) or croak "JPRED: No such entry for $id in $searchdb\n" and return;   # CC sub is from Index::EMBOSS
 433       my @seqs = $f->get_entries;       ## Sub is from Sequence::File
 434       my $db = shift @seqs;
 435
 436       unless ($db) {
 437          confess "JPRED: dex: Accession number $id returned no sequences";
 438       }
 439       elsif (@seqs) {
 440          confess "JPRED: dex: Accession number $id returned more than one sequence";
 441       }
 442
 443       my @db_seq = @{ $db->seq };
 444
 445       # $i is a pointer into the original, ungapped, unmasked sequence. $_ is
 446       # for the aligned sequence
 447       my $i = 0;
 448       for (0..$#align) {
 449          next if $align[$_] =~ /$GAP/;
 450
 451          if ($align[$_] eq 'X') {
 452             unless (defined $db_seq[$i + $start]) {
 453                croak "JPRED: dex: the database sequence is not as long as the query sequence!"
 454             }
 455             $align[$_] = $db_seq[$i + $start];
 456          }
 457          $i++;
 458       }
 459
 460       $seq->align(\@align);
 461    }
 462 }
 463
 464 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 465
 466 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 467
 468 Equivilent to reduce script.
 469
 470 =cut
 471
 472 sub reduce {
 473    my ($max, @seqs) = @_;
 474
 475    my $num = int(@seqs / $max);
 476
 477    my @res = shift @seqs;
 478
 479    # Don't bother if we have less than the required sequences
 480    if (@seqs < $max) { return @res, @seqs }
 481    if ($num == 0) { return @res, @seqs }
 482
 483    for (0..$#seqs) { push @res, $seqs[$_] if $_ % $num == 0 }
 484
 485    return @res;
 486 }
 487
 488 =head2 nonred($cutoff, @PSISEQS);
 489
 490 Removes redundant sequences at $cutoff level of sequence identity.
 491
 492 Equivilent to nonred script.
 493
 494 =cut
 495
 496 sub nonred {
 497    my ($cutoff, @seqs) = @_;
 498    # Run pairwise and then OC
 499    # Remove similar sequences
 500
 501    unless (defined $cutoff) { croak "JPRED: nonred() not passed a cutoff" }
 502    unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
 503    if (@seqs == 1) {
 504       warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 505       return @seqs;
 506    }
 507
 508    my $fasta = FASTA::File->new;
 509
 510    # Convert the sequences into a FASTA object, which is what
 511    # pairwise expects.
 512    $fasta->add_entries(
 513       map {
 514          my $f = FASTA->new(id => $_->id);
 515          $f->seq(@{$_->align});
 516          $f;
 517       } @seqs
 518    );
 519
 520    # Run pairwise via wrappers
 521    use Pairwise;
 522    my $pair = Pairwise->new(
 523       path => $pairwise
 524    );
 525    my $foo = join "\n", $pair->run($fasta) or die $!;
 526
 527    # Run OC
 528    my $ocres = OC->new(
 529       path => "$oc sim complete cut $cutoff"
 530    );
 531    $ocres->read_string(join "", $ocres->run($foo) );
 532
 533    # Now found those entries that aren't related
 534    my @keep;
 535    {
 536       for ($ocres->get_groups) {
 537          my ($group, $score, $size, @labels) = @{ $_ };
 538
 539          # Keep the QUERY from the cluster, otherwise just get one
 540          if (grep { /QUERY/ } @labels) {
 541             # Make sure the query stays at the start
 542             unshift @keep, "QUERY";
 543             next;
 544          }
 545          else { push @keep, shift @labels }
 546       }
 547
 548       push @keep, $ocres->get_unclustered;
 549    }
 550
 551    # Return the matching entries.
 552    my @return;
 553
 554    # We use the temporay to mimic the selection of sequences in the nonred
 555    # script, which took the sequences in the order they occured in the file.
 556    for (@seqs) {
 557       my $label = $_->id;
 558       if (grep { $_ =~ /^$label$/ } @keep) {
 559          push @return, $_;
 560       }
 561    }
 562
 563    return @return;
 564 }
 565
 566 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 567
 568 Removes those sequences that are more than $percentage longer or shorter than the first sequence in @seqs. @seqs is an array of PSISEQ objects.
 569
 570 Equivilent to trim_seqs script.
 571
 572 =cut
 573
 574 sub del_long_seqs {
 575    my ($factor, @seqs) = @_;
 576
 577    # Keep the query sequence
 578    my $query = shift @seqs;
 579    my @res = $query;
 580
 581    # Find the length of the query sequence without any gaps
 582    my $length = (() = (join '', @{$query->align}) =~ /[^$GAP]/g);
 583
 584    # Calculate the upper and lower allowed bounds
 585    my ($upper, $lower);
 586    {
 587       my $bounds = ($length / 100) * $factor;
 588       ($upper, $lower) = ($length + $bounds, $length - $bounds);
 589    }
 590
 591    # Now try each sequence and see how long they are
 592    for (@seqs) {
 593       my $l = (() = (join '', @{$_->align}) =~ /[^$GAP]/g);
 594       if ($l >= $lower && $l <= $upper) { push @res, $_ }
 595    }
 596
 597    return @res;
 598 }
 599
 600 =head2 toupper ($PSISEQ)
 601
 602 Converts sequence held in PSISEQ object to uppercase.
 603
 604 =cut
 605
 606 sub toupper {
 607    croak "JPRED: toupper: argument not a PSISEQ object" unless isa $_[0], 'PSISEQ';
 608    $_[0]->align(
 609       [ split //, uc join '', @{ $_[0]->align } ]
 610    );
 611 }
 612
 613 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 614
 615 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 616
 617 Equivilent to fasta2jnet script.
 618
 619 =cut
 620
 621 sub degap {
 622    my (@seqs) = @_;
 623
 624    # Find where the gaps in the query sequence are
 625    my @gaps;
 626    {
 627       my $i = 0;
 628       for (@{$seqs[0]->align}) {
 629          push @gaps, $i if $_ !~ /$GAP/;
 630          $i++;
 631       }
 632    }
 633
 634    return unless @gaps;
 635
 636    # Remove the gaps in all the sequences.
 637    # Derefences the array reference and takes a slice inside the method call
 638    # arguments, then coerce back to an array ref.
 639    for (@seqs) {
 640       $_->align([ @{$_->align}[@gaps] ]);
 641    }
 642 }
 643
 644 =head2 getfreq($filename, @PSISEQS)
 645
 646 Creates a PSIBLAST like profile and writes it to the $filename.
 647
 648 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, but it's *not* the same.
 649
 650 =cut
 651
 652 sub getfreq {
 653    my ($fn, @seqs) = @_;
 654
 655    croak "JPRED: Passed non PSISEQ object" if grep { ! isa($_, 'PSISEQ') } @seqs;
 656
 657    # Create a PSIBLAST like profile from the alignment
 658    # This doesn't greate the same result as old Jpred, I think this is because
 659    # differences in the input between old and new
 660
 661    # For testing
 662 #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 663 #   my @profile = profile(
 664 #      map { join "", @{ $_->seq } } $f->get_entries
 665 #   );
 666
 667    my @profile = profile(
 668       map { join "", @{ $_->align } } @seqs
 669    );
 670
 671    open my $fh, ">$fn" or die "JPRED: $fn: $!";
 672    print $fh join(" ", @{$_}), "\n" for @profile;
 673 }
 674
 675 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 676
 677 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 678
 679 Equivilent to gethmm script.
 680
 681 =cut
 682
 683 sub hmmer {
 684    my (@seqs) = @_;
 685
 686    # Temp files required
 687    my ($hmmbuild_out, $hmmconvert_out, $tmp_fasta) = map {
 688       File::Temp->new->filename
 689    } 1..3;
 690
 691    # Create the FASTA file
 692    psiseq2fasta($tmp_fasta, @seqs);
 693
 694    system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 695    system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 696
 697    # Read in the HMMER file
 698    my $hmmer = HMMER::Profile->new(
 699       read_file => $hmmconvert_out
 700    );
 701
 702    # Convert from HMMER::Profile to HMMER::Profile::Jnet
 703    my $hmmer_jnet = HMMER::Profile::Jnet->new;
 704    $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
 705
 706    return $hmmer_jnet;
 707 }
 708
 709 =head2 jnet($hmmer, $out, $pssm )
 710
 711 Runs Jnet. Pass the paths of the filenames to do the prediction on
 712
 713 =cut
 714
 715 sub jnet {
 716    my ($hmmer, $outfile, $pssm) = @_;
 717    if ($pssm) {
 718       # run Jnet prediction with all the profile data
 719       system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 720    } else {
 721       # run Jnet prediction with only HMMer and alignment data
 722       system("$jnet --concise --hmmer $hmmer > $outfile");
 723    }
 724    check("jnet", $?) or exit 1;
 725 }
 726
 727 =head1 psiseq2fasta($path, @PSISEQ)
 728
 729 Writes a FASTA file given an array of PSISEQ objects.
 730
 731 =cut
 732
 733 sub psiseq2fasta {
 734    my ($fn, @seqs) = @_;
 735
 736    confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 737    @seqs or confess "JPRED: not passed PSISEQs";
 738
 739    confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 740
 741    # Changed from using FASTA::File object due to seg faulting on some large,
 742    # long alignments, presumably due to running out of memory.
 743 #   my $fasta = FASTA::File->new;
 744 #   $fasta->add_entries(
 745 #      map {
 746 #         my $f = FASTA->new(id => $_->id);
 747 #         $f->seq(@{$_->align});
 748 #         $f;
 749 #      } @seqs
 750 #   );
 751 #   $fasta->write_file($fn);
 752
 753    my $fh;
 754    $fn =~ /\.gz$/ ?
 755       ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" ):
 756       ( open $fh, ">$fn" or die "JPRED: $!" );
 757
 758    for (@seqs) {
 759       print $fh ">", $_->id, "\n";
 760       my $seq = join("", @{ $_->align });
 761       $seq =~ s/(.{72})/$1\n/g;
 762       chomp $seq;
 763       print $fh $seq."\n";
 764    }
 765    close $fh;
 766 }
 767
 768 =head1 @PSISEQ = fasta2psiseq($path)
 769
 770 Reads in a FASTA file and returns an array of PSISEQ objects.
 771
 772 =cut
 773
 774 sub fasta2psiseq {
 775    my ($fn) = @_;
 776
 777    my $fasta = $fn =~ /\.gz$/ ?
 778       FASTA::File->new(read_gzip_file => $fn) :
 779       FASTA::File->new(read_file => $fn);
 780    $fasta or die "JPRED: $!";
 781
 782    my @seqs;
 783    for ($fasta->get_entries) {
 784       my $seq = PSISEQ->new;
 785       $seq->id($_->id);
 786       $seq->align( [ @{ $_->seq } ] );
 787       push @seqs, $seq;
 788    }
 789
 790    return @seqs;
 791 }
 792
 793 sub fasta2psiseq_lowmem {
 794    my ($fn) = @_;
 795
 796    local $/ = "\n";
 797    my $fh;
 798    $fn =~ /\.gz$/ ?
 799       (open $fh, "gzip -dc $fn|" or die $!) :
 800       (open $fh, $fn or die $!);
 801
 802    my @seqs;
 803
 804    my ($id, $seq);
 805    while (<$fh>) {
 806       chomp;
 807       if (s/^>//) {
 808          $seq =~ s/ //g if defined $seq;
 809
 810          if ($id and $seq) {
 811             my $psi = PSISEQ->new(id => $id);
 812             $psi->align([ split //, $seq ]);
 813             push @seqs, $psi;
 814             undef $id; undef $seq;
 815          }
 816          else { $id = $_ }
 817       }
 818       else {
 819          $seq .= $_;
 820       }
 821    }
 822    if ($id and $seq) {
 823       my $psi = PSISEQ->new(id => $id);
 824       $psi->seq($seq);
 825       push @seqs, $psi;
 826    }
 827
 828    return @seqs;
 829 }
 830
 831 =head1 die_if_no_seqs($size, $message, @seqs)
 832
 833 Exits with error message if @seqs is not at least $size.
 834
 835 =cut
 836
 837 sub die_if_no_seqs {
 838    my ($size, $message, @seqs) = @_;
 839
 840    confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 841
 842    chomp $message;
 843    die "JPRED: $message\n" unless @seqs > $size;
 844 }
 845
 846 =head1 extend($FASTA::File, @PSISEQ)
 847
 848 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues against the query, this function extends the alignment so that all of the query is present in the alignment. The other sequences are extended with gap characters.
 849
 850 =cut
 851
 852 sub extend {
 853    my ($query, @seqs) = @_;
 854
 855    croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 856    croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 857
 858    # Get the query sequence
 859    my $q_query = $query->get_entry(0);
 860
 861    # Get the query sequence from the PSIBLAST alignment
 862    my $a_query = shift @seqs;
 863
 864    # The gap size required to expand the alignment
 865    my ($start_gap_length, $end_gap_length);
 866    {
 867       # Make handling the sequence easier
 868       my $q_seq = join "", @{[ $q_query->seq ]};
 869       my $a_seq = join "", @{ $a_query->align };
 870
 871       $q_seq =~ s/-//g;
 872       $a_seq =~ s/-//g;
 873
 874       ($q_seq, $a_seq) = map { uc $_ } $q_seq, $a_seq;
 875
 876       $start_gap_length = index($q_seq, $a_seq);
 877       croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 878
 879       $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 880    }
 881
 882    # Add the gaps to the end of the alignments
 883    for (@seqs) {
 884       $_->align([
 885          ($GAP) x $start_gap_length,
 886          @{ $_->align },
 887          ($GAP) x $end_gap_length
 888       ]);
 889    }
 890
 891    # Put the query sequence back
 892    unshift @seqs, PSISEQ->new(
 893       id => $a_query->id,
 894       align => [
 895          ($start_gap_length ? @{ $q_query->seq }[ 0..($start_gap_length-1) ] : ()),
 896          @{ $a_query->align },
 897          ($end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : ()),
 898       ]
 899    );
 900
 901    return @seqs;
 902 }
 903
 904 =head1 $int = fasta_seq_length($path)
 905
 906 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 907
 908 =cut
 909
 910 sub fasta_seq_length {
 911    my ($fn) = @_;
 912    open my $fh, $fn or croak "Can't open file $fn: $!\n";
 913
 914    my $length = 0;
 915    local $_, $/ = "\n";
 916    while (<$fh>) {
 917       next if /^>/;
 918       chomp;
 919       $length += tr///c;
 920    }
 921
 922    return $length;
 923 }
 924
 925 =head1 log(@messages)
 926
 927 Report messages to STDERR
 928
 929 =cut
 930
 931 sub errlog { print STDERR @_ }