jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9   jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence.
  14 It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -sequence path to input FASTA file
  24
  25 The path to the sequence (in FASTA format) you want to predict.
  26
  27 =item -output file
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
  30
  31 =item --db database
  32
  33 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
  34
  35 Default: uniref90
  36
  37 =item -psi path
  38
  39 Path to a PSIBLAST output file.
  40
  41 =item --pred-nohits
  42
  43 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  44
  45 =item --verbose
  46
  47 Verbose mode. Print more information while running jpred.
  48
  49 =item --debug
  50
  51 Debug mode. Print debugging information while running jpred.
  52
  53 =item --help
  54
  55 Gives help on the programs usage.
  56
  57 =item --man
  58
  59 Displays this man page.
  60
  61 =back
  62
  63 =head1 BUGS
  64
  65 Can't cope with gaps in the query sequence.
  66
  67 Doesn't accept alignments.
  68
  69 If you find any others please contact me.
  70
  71 =head1 AUTHORS
  72
  73 Jonathan Barber <jon@compbio.dundee.ac.uk>
  74 Chris Cole <christian@cole.name>
  75 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
  76
  77 =cut
  78
  79 ## TODO check for gaps in query sequence
  80 ##      check for disallowed chars in sequence
  81
  82 use strict;
  83 use warnings;
  84 use Getopt::Long;
  85 use Pod::Usage;
  86 use Carp;
  87 use Data::Dumper;
  88 use File::Temp;
  89
  90 #use UNIVERSAL;
  91
  92 # path to Jpred modules
  93 #use FindBin qw($Bin);
  94 #use lib "$Bin/../lib";
  95 use lib "lib";
  96 use lib "/home/asherstnev/Projects/Jpred.project/jpred/branches/portable/lib";
  97
  98 # internal jpred modules
  99 use Jpred;
 100 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
 101 use HMMER::Profile;
 102 use HMMER::Profile::Jnet;
 103 use PSIBLAST;
 104 use PSIBLAST::PSSM;
 105 use PSIBLAST::Run;
 106 use FASTA::File;
 107 use FASTA;
 108 use Index;
 109 use Pairwise;
 110 use OC;
 111 use Utils qw(profile);
 112 use Run qw(check);
 113
 114 our $VERSION = '3.0.2';
 115 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 116 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 117
 118 # Gap characters in sequences
 119 our $GAP = '-';
 120
 121 #####################################################################################################
 122 # Light object to hold sequence information, light to prevent memory overload
 123 # for big search results. Even so this is quite memory intensive, I can't see
 124 # how to reduce the size further.
 125 {
 126
 127   package PSISEQ;
 128
 129   sub new {
 130     my ( $class, %args ) = @_;
 131     my $self = [];
 132     bless $self, $class;
 133     for ( keys %args ) { $self->$_( $args{$_} ) }
 134     return $self;
 135   }
 136
 137   sub id {
 138     return defined $_[1]
 139       ? $_[0]->[0] = $_[1]
 140       : $_[0]->[0];
 141   }
 142
 143   sub start {
 144     return defined $_[1]
 145       ? $_[0]->[1] = $_[1]
 146       : $_[0]->[1];
 147   }
 148
 149   sub align {
 150     if ( defined $_[1] ) {
 151       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 152
 153       $_[0]->[2] = join( "", @{ $_[1] } );
 154     } else {
 155       return [ split( //, $_[0]->[2] ) ];
 156     }
 157   }
 158 }
 159
 160 #####################################################################################################
 161 my $infile;
 162 my $output;
 163 my $psiblast;
 164 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 165 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 166 my $db_path    = ".";
 167 my $db_entry   = "unknown";
 168
 169 my ( $help, $man, $DEBUG, $VERBOSE );
 170
 171 GetOptions(
 172   "in|sequence=s" => \$infile,
 173   "out|output=s"  => \$output,
 174   "psi=s"         => \$psiblast,
 175   "dbname=s"      => \$db_entry,
 176   "dbpath=s"      => \$db_path,
 177   "ncpu=s"        => \$ncpu,
 178   "pred-nohits"   => \$predNoHits,
 179   "help"          => \$help,
 180   "man"           => \$man,
 181   "debug"         => \$DEBUG,
 182   "verbose"       => \$VERBOSE
 183 ) or pod2usage(2);
 184 pod2usage(1) if $help;
 185 pod2usage( verbose => 2 ) if $man;
 186
 187 #####################################################################################################
 188 # Key to database information and information for accessing them
 189 my $database = {
 190
 191   ## default database used by Jpred
 192   uniref90 => {
 193     database   => $db_path . "/uniref90.filt",
 194     unfiltered => $db_path . "/uniref90",
 195   },
 196
 197   ## database used during Jnet training
 198   training => {
 199     database   => $db_path . "/training/uniref90.filt",
 200     unfiltered => $db_path . "/training/uniref90",
 201   },
 202
 203   ## database used for ported Jpred
 204   ported_db => {
 205     database   => $db_path . "/databases/uniref90.filt",
 206     unfiltered => $db_path . "/databases/uniref90",
 207   },
 208   ## cluster-specific path for Jpred
 209   cluster => {
 210     database   => "/homes/www-jpred/databases/uniref90.filt",
 211     unfiltered => "/homes/www-jpred/databases/uniref90",
 212   },
 213
 214   ## these other DBs are experimental ones used during development.
 215   ## check they exist before using them.
 216   swall => {
 217
 218     # generic entry for use with validate_jnet.pl
 219     # real db location defined by validate_jnet.pl
 220     database   => $db_path . "/swall/swall.filt",
 221     unfiltered => $db_path . "/swall/swall",
 222   },
 223
 224   uniprot => {
 225
 226     # Path to PSIBLAST db
 227     database   => $db_path . "/3/swall.filt",
 228     unfiltered => $db_path . "/3/swall",
 229   },
 230   uniref50 => {
 231     database   => $db_path . "/6/swall.filt",
 232     unfiltered => $db_path . "/6/swall",
 233   },
 234 };
 235
 236 pod2usage(' --sequence argument not provided') unless $infile;
 237 die "--db $db_entry not recognised" unless exists $database->{$db_entry};
 238 $output = $infile . ".res" unless $output;
 239 $ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
 240
 241 for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
 242   my ( $key, $value ) = @{$_};
 243   defined $value or next;
 244   errlog( join( ": ", $key, $value ), "\n" );
 245 }
 246
 247 #####################################################################################################
 248 my $platform = check_OS();
 249 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 250
 251 #####################################################################################################
 252 my $query = FASTA::File->new( read_file => $infile );
 253
 254 my @seqs;
 255 my $psi = PSIBLAST->new;
 256 unless ( defined $psiblast && -e $psiblast ) {
 257   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 258   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 259   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 260   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 261   my $psi_run = PSIBLAST::Run->new(
 262     debug    => $DEBUG,
 263     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 264     path     => $psiblastbin,
 265     input    => $infile,
 266     output   => "$output.blast",
 267     matrix   => "$output.profile",
 268     database => $database->{$db_entry}{database},
 269   );
 270
 271   # For reduced databases, get the size of the orginal and use this as
 272   # the equivilent DB size
 273   if ( $db_entry =~ /^sp_red_/ ) {
 274     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 275     $psi_run->args(
 276       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 277     );
 278   }
 279   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 280   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 281
 282   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 283   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 284
 285   $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 286   system("gzip -9f $output.blast");
 287 } else {
 288   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 289   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 290 }
 291
 292 #####################################################################################################
 293 # Convert the last itteration into a collection of PSISEQ objects
 294 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 295   my ( $id, $start, $align ) = @{$_};
 296   push @seqs,
 297     PSISEQ->new(
 298     id    => $id,
 299     start => $start,
 300     align => [ split( //, $align ) ]
 301     );
 302 }
 303
 304 #####################################################################################################
 305 ## When there are no PSI-BLAST hits generate an HMM from the query
 306 ## and run Jnet against that only.
 307 ## Accuracy won't be as good, but at least it's a prediction
 308 if ( @seqs == 0 ) {
 309   if ( $predNoHits == 0 ) {
 310     warn "\nJPRED: Warning! no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
 311     print ">>100% complete\n";
 312     exit;
 313   } else {
 314     print ">>50% complete\n";
 315     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 316     print "Running HMMer on query...\n" if $VERBOSE;
 317
 318     # copy input query to alignment
 319     #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 320     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 321     open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
 322     while (<$ifh>) { print $ofh, $_ }
 323     close $ifh;
 324     close $ofh;
 325
 326     # Temp files required for HMMer
 327     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 328
 329     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 330     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 331
 332     # Read in the HMMER file
 333     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 334
 335     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 336     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 337     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 338     $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 339     print ">>70% complete\n";
 340
 341     # Run Jnet for the prediction
 342     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 343     jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
 344   }
 345 } else {
 346   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
 347   print ">>40% complete\n";
 348
 349   #####################################################################################################
 350   # Make PSIBLAST truncated alignments the right length
 351   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 352   @seqs = extend( $query, @seqs );                  # CC sub si below
 353   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 354
 355   #####################################################################################################
 356   # Remove masking from PSIBLAST alignment
 357   print "Unmasking the alignments...\n" if $VERBOSE;
 358   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 359   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 360   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 361
 362   #####################################################################################################
 363   # Convert the sequences to upper case
 364   print "Converting sequences to the same case...\n" if $VERBOSE;
 365   toupper($_) for @seqs;    # CC sub is below
 366   psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
 367
 368   #####################################################################################################
 369   # Remove excessive sequences
 370   print "Remove excessive sequences...\n" if $VERBOSE;
 371   @seqs = reduce( $MAX_ALIGN, @seqs );
 372   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 373
 374   #####################################################################################################
 375   # Remove sequences that are too long or too short
 376   print "Remove sequences which too long or short...\n" if $VERBOSE;
 377   @seqs = del_long_seqs( 50, @seqs );
 378   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 379
 380   #####################################################################################################
 381   # Remove redundant sequences based upon pairwise identity and OC clustering
 382   print "Remove redundant sequences...\n" if $VERBOSE;
 383   @seqs = nonred( $NR_CUT, @seqs );
 384   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 385
 386   #####################################################################################################
 387   # Check that we haven't got rid of all of the sequences
 388   if ( @seqs < 2 ) {
 389     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 390     @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 391   }
 392   unlink("${output}_backup.fasta.gz") unless $DEBUG;
 393
 394   # Remove gaps in the query sequence and same positions in the alignment
 395   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 396   degap(@seqs);
 397   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 398
 399   # Output the alignment for the prediction
 400   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 401   psiseq2fasta( "$output.align", @seqs );
 402
 403   #####################################################################################################
 404   # Equivilent to getpssm script
 405   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 406   my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
 407   $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 408   print ">>50% complete\n";
 409
 410   #####################################################################################################
 411   # Run HMMER on the sequences
 412   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 413   my $hmmer = hmmer(@seqs);             # CC sub is below
 414   $hmmer->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 415   print ">>70% complete\n";
 416
 417   #####################################################################################################
 418   # Run Jnet for the prediction
 419   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 420   jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
 421 }
 422
 423 print ">>100% complete\n";
 424 print "Jpred Finished\n";
 425 exit;
 426
 427 #####################################################################################################
 428 # Functions
 429 #####################################################################################################
 430
 431 =begin :private
 432
 433 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 434
 435 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 436 Does it in place.
 437
 438 =cut
 439
 440 #####################################################################################################
 441 sub remove_seq_masks {
 442   my $idx = shift;
 443   my $seq = shift;
 444
 445   # Only bother if we have a sequence which has been masked
 446   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 447
 448   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 449   my $searchdb = $database->{$db_entry}{unfiltered};
 450   $start--;    # We need this to be zero based
 451
 452   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 453   my @seqs = $f->get_entries;
 454   my $db   = shift @seqs;
 455
 456   unless ($db) {
 457     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 458   } elsif (@seqs) {
 459     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 460   }
 461
 462   my @db_seq = @{ $db->seq };
 463
 464   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 465   my $i = 0;
 466   for ( 0 .. $#align ) {
 467     next if $align[$_] =~ /$GAP/;
 468
 469     if ( $align[$_] eq 'X' ) {
 470       unless ( defined $db_seq[ $i + $start ] ) {
 471         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 472       }
 473       $align[$_] = $db_seq[ $i + $start ];
 474     }
 475     $i++;
 476   }
 477   $seq->align( \@align );
 478 }
 479
 480 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 481
 482 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 483
 484 Equivilent to reduce script.
 485
 486 =cut
 487
 488 #####################################################################################################
 489 sub reduce {
 490   my ( $max, @seqs ) = @_;
 491
 492   my $num = int( @seqs / $max );
 493   my @res = shift @seqs;
 494
 495   # Don't bother if we have less than the required sequences
 496   if ( @seqs < $max or 0 == $num ) {
 497     return @res, @seqs;
 498   }
 499
 500   for ( 0 .. $#seqs ) {
 501     push @res, $seqs[$_] if ( 0 == $_ % $num );
 502   }
 503
 504   return @res;
 505 }
 506
 507 =head2 nonred($cutoff, @PSISEQS);
 508
 509 Removes redundant sequences at $cutoff level of sequence identity.
 510
 511 Equivilent to nonred script.
 512
 513 =cut
 514
 515 #####################################################################################################
 516 sub nonred {
 517   my ( $cutoff, @seqs ) = @_;
 518
 519   # Run pairwise and then OC and remove similar sequences
 520   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 521   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 522   if     ( @seqs == 1 ) {
 523     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 524     return @seqs;
 525   }
 526
 527   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 528   my $fasta = FASTA::File->new;
 529   foreach (@seqs) {
 530     my $f = FASTA->new( id => $_->id );
 531     $f->seq( @{ $_->align } );
 532     $fasta->add_entries($f);
 533   }
 534
 535   # Run pairwise
 536   my $pair = Pairwise->new( path => $pairwise );
 537   my $foo = join "\n", $pair->run($fasta) or die $!;
 538
 539   # Run OC
 540   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 541   $ocres->read_string( join "", $ocres->run($foo) );
 542
 543   # Now found those entries that are unrelated
 544   my @keep;
 545   for ( $ocres->get_groups ) {
 546     my ( $group, $score, $size, @labels ) = @{$_};
 547
 548     # Keep the QUERY from the cluster, otherwise just get onelt
 549     if ( grep { /QUERY/ } @labels ) {
 550
 551       # Make sure the query stays at the start
 552       unshift @keep, "QUERY";
 553       next;
 554     } else {
 555       push @keep, shift @labels;
 556     }
 557   }
 558   push @keep, $ocres->get_unclustered;
 559
 560   # Return the matching entries.
 561   # We use the temporay to mimic the selection of sequences in the nonred
 562   # script, which took the sequences in the order they occured in the file.
 563   my @filtered_res;
 564   for (@seqs) {
 565     my $label = $_->id;
 566     if ( grep { $_ =~ /^$label$/ } @keep ) {
 567       push @filtered_res, $_;
 568     }
 569   }
 570
 571   return @filtered_res;
 572 }
 573
 574 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 575
 576 Removes those sequences that are more than $percentage longer or shorter than the first
 577 sequence in @seqs. @seqs is an array of PSISEQ objects.
 578
 579 Equivilent to trim_seqs script.
 580
 581 =cut
 582
 583 #####################################################################################################
 584 sub del_long_seqs {
 585   my ( $factor, @seqs ) = @_;
 586
 587   # Keep the query sequence (we assume query is the 1st FASTA record)
 588   my $query = shift @seqs;
 589   my @res   = $query;
 590
 591   # Find the length of the query sequence without any gaps
 592   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 593
 594   # Calculate the upper and lower allowed bounds
 595   my ( $upper, $lower );
 596   {
 597     my $bounds = ( $length / 100 ) * $factor;
 598     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 599   }
 600
 601   # Now try each sequence and see how long they are
 602   for (@seqs) {
 603     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 604     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 605   }
 606
 607   return @res;
 608 }
 609
 610 =head2 toupper ($PSISEQ)
 611
 612 Converts sequence held in PSISEQ object to uppercase.
 613
 614 =cut
 615
 616 #####################################################################################################
 617 sub toupper {
 618   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 619 }
 620
 621 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 622
 623 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 624
 625 Equivilent to fasta2jnet script.
 626
 627 =cut
 628
 629 #####################################################################################################
 630 sub degap {
 631   my (@seqs) = @_;
 632
 633   # Find where the gaps in the query sequence are
 634   my @gaps;
 635   my $i = 0;
 636   for ( @{ $seqs[0]->align } ) {
 637     push @gaps, $i if $_ !~ /$GAP/;
 638     $i++;
 639   }
 640
 641   return unless @gaps;
 642
 643   # Remove the gaps in all the sequences.
 644   # Derefences the array reference and takes a slice inside the method call
 645   # arguments, then coerce back to an array ref.
 646   for (@seqs) {
 647     $_->align( [ @{ $_->align }[@gaps] ] );
 648   }
 649 }
 650
 651 =head2 getfreq($filename, @PSISEQS)
 652
 653 Creates a PSIBLAST like profile and writes it to the $filename.
 654
 655 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 656 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 657 but it's *not* the same.
 658
 659 =cut
 660
 661 #####################################################################################################
 662 sub getfreq {
 663   my ( $fn, @seqs ) = @_;
 664
 665   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 666
 667   # Create a PSIBLAST like profile from the alignment
 668   # This doesn't greate the same result as old Jpred, I think this is because
 669   # differences in the input between old and new
 670
 671   # For testing
 672   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 673   #   my @profile = profile(
 674   #      map { join "", @{ $_->seq } } $f->get_entries
 675   #   );
 676
 677   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 678
 679   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 680   print $fh join( " ", @{$_} ), "\n" for @profile;
 681 }
 682
 683 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 684
 685 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 686
 687 Equivilent to gethmm script.
 688
 689 =cut
 690
 691 #####################################################################################################
 692 sub hmmer {
 693   my (@seqs) = @_;
 694
 695   # Temp files required
 696   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 697
 698   # Create the FASTA file
 699   psiseq2fasta( $tmp_fasta, @seqs );
 700
 701   # Run HMMer
 702   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 703   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 704
 705   # Read in the HMMER file
 706   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 707
 708   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 709   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 710   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 711
 712   return $hmmer_jnet;
 713 }
 714
 715 =head2 jnet($hmmer, $out, $pssm )
 716
 717 Runs Jnet. Pass the paths of the filenames to do the prediction on
 718
 719 =cut
 720
 721 #####################################################################################################
 722 sub jnet {
 723   my ( $hmmer, $outfile, $pssm ) = @_;
 724   if ($pssm) {
 725
 726     # run Jnet prediction with all the profile data
 727     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 728   } else {
 729
 730     # run Jnet prediction with only HMMer and alignment data
 731     system("$jnet --concise --hmmer $hmmer > $outfile");
 732   }
 733   check( "jnet", $? ) or exit 1;
 734 }
 735
 736 =head1 psiseq2fasta($path, @PSISEQ)
 737
 738 Writes a FASTA file given an array of PSISEQ objects.
 739
 740 =cut
 741
 742 #####################################################################################################
 743 sub psiseq2fasta {
 744   my ( $fn, @seqs ) = @_;
 745
 746   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 747   confess "JPRED: not passed PSISEQs" unless @seqs;
 748
 749   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 750
 751   # Changed from using FASTA::File object due to seg faulting on some large,
 752   # long alignments, presumably due to running out of memory.
 753   #   my $fasta = FASTA::File->new;
 754   #   $fasta->add_entries(
 755   #      map {
 756   #         my $f = FASTA->new(id => $_->id);
 757   #         $f->seq(@{$_->align});
 758   #         $f;
 759   #      } @seqs
 760   #   );
 761   #   $fasta->write_file($fn);
 762
 763   my $fh;
 764   $fn =~ /\.gz$/
 765     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
 766     : ( open $fh, ">$fn" or die "JPRED: $!" );
 767
 768   for (@seqs) {
 769     print $fh ">", $_->id, "\n";
 770     my $seq = join( "", @{ $_->align } );
 771     $seq =~ s/(.{72})/$1\n/g;
 772     chomp $seq;
 773     print $fh $seq . "\n";
 774   }
 775   close $fh;
 776 }
 777
 778 =head1 @PSISEQ = fasta2psiseq($path)
 779
 780 Reads in a FASTA file and returns an array of PSISEQ objects.
 781
 782 =cut
 783
 784 #####################################################################################################
 785 sub fasta2psiseq {
 786   my ($fn) = @_;
 787
 788   my $fasta =
 789     $fn =~ /\.gz$/
 790     ? FASTA::File->new( read_gzip_file => $fn )
 791     : FASTA::File->new( read_file      => $fn );
 792   $fasta or die "JPRED: $!";
 793
 794   my @seqs;
 795   for ( $fasta->get_entries ) {
 796     my $seq = PSISEQ->new;
 797     $seq->id( $_->id );
 798     $seq->align( [ @{ $_->seq } ] );
 799     push @seqs, $seq;
 800   }
 801
 802   return @seqs;
 803 }
 804
 805 #####################################################################################################
 806 sub fasta2psiseq_lowmemory {
 807   my $filename = shift;
 808
 809   local $/ = "\n";
 810   my $fh;
 811   $filename =~ /\.gz$/
 812     ? ( open $fh, "gzip -dc $filename|" or die $! )
 813     : ( open $fh, $filename or die $! );
 814
 815   my @seqs;
 816   my ( $id, $seq );
 817   while (<$fh>) {
 818     chomp;
 819     if (s/^>//) {
 820       $seq =~ s/ //g if defined $seq;
 821
 822       if ( $id and $seq ) {
 823         my $psi = PSISEQ->new( id => $id );
 824         $psi->align( [ split //, $seq ] );
 825         push @seqs, $psi;
 826         undef $id;
 827         undef $seq;
 828       } else {
 829         $id = $_;
 830       }
 831     } else {
 832       $seq .= $_;
 833     }
 834   }
 835   if ( $id and $seq ) {
 836     my $psi = PSISEQ->new( id => $id );
 837     $psi->seq($seq);
 838     push @seqs, $psi;
 839   }
 840
 841   return @seqs;
 842 }
 843
 844 =head1 extend($FASTA::File, @PSISEQ)
 845
 846 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
 847 against the query, this function extends the alignment so that all of the query is
 848 present in the alignment. The other sequences are extended with gap characters.
 849
 850 =cut
 851
 852 #####################################################################################################
 853 sub extend {
 854   my ( $query, @seqs ) = @_;
 855
 856   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 857   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
 858
 859   # Get the query sequence
 860   my $q_query = $query->get_entry(0);
 861
 862   # Get the query sequence from the PSIBLAST alignment
 863   my $a_query = shift @seqs;
 864
 865   # The gap size required to expand the alignment
 866   my ( $start_gap_length, $end_gap_length );
 867   {
 868
 869     # Make handling the sequence easier
 870     my $q_seq = join "", @{ [ $q_query->seq ] };
 871     my $a_seq = join "", @{ $a_query->align };
 872
 873     $q_seq =~ s/-//g;
 874     $a_seq =~ s/-//g;
 875
 876     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
 877
 878     $start_gap_length = index( $q_seq, $a_seq );
 879     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
 880
 881     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
 882   }
 883
 884   # Add the gaps to the end of the alignments
 885   for (@seqs) {
 886     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
 887   }
 888
 889   # Put the query sequence back
 890   unshift @seqs,
 891     PSISEQ->new(
 892     id    => $a_query->id,
 893     align => [
 894       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
 895       @{ $a_query->align },
 896       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
 897     ]
 898     );
 899
 900   return @seqs;
 901 }
 902
 903 =head1 $int = fasta_seq_length($path)
 904
 905 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
 906
 907 =cut
 908
 909 #####################################################################################################
 910 sub fasta_seq_length {
 911   my ($fn) = @_;
 912   open my $fh, $fn or croak "Can't open file $fn: $!\n";
 913
 914   my $length = 0;
 915   local $_, $/ = "\n";
 916   while (<$fh>) {
 917     next if /^>/;
 918     chomp;
 919     $length += tr///c;
 920   }
 921
 922   return $length;
 923 }
 924
 925 =head1 log(@messages)
 926
 927 Report messages to STDERR
 928
 929 =cut
 930
 931 #####################################################################################################
 932 sub errlog { print STDERR @_ }