binaries/src/jpred/jpred.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 jpred - Secondary structure prediction program
   6
   7 =head1 SYNOPSIS
   8
   9 ./jpred.pl -in/-sequence <FILE1> [-out/-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-verbose] [-debug] [-help] [-man]
  10
  11 =head1 DESCRIPTION
  12
  13 This is a program which predicts the secondary structure of a protein sequence given a path to a
  14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
  15
  16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
  17 Some of the output may not be meaningful if used directly - it can be safely ignored.
  18
  19 =head1 OPTIONS
  20
  21 =over 5
  22
  23 =item -in/-sequence FILE
  24
  25 The path to the sequence file (in FASTA format) you want to predict.
  26
  27 =item -out/-output FILEPREFIX
  28
  29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
  30
  31 =item -dbpath PATH
  32
  33 Path to the uniref database used for PSI-BLAST querying.
  34 Default: .
  35
  36 =item -dbname database
  37
  38 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
  39 Default: uniref90
  40
  41 =item -psi path
  42
  43 Path to a PSIBLAST output file.
  44
  45 =item -ncpu NNN
  46
  47 Number of CPU used by jpred.pl. Maximum value is 8
  48 Default: NNN = 1
  49
  50 =item -pred-nohits
  51
  52 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
  53
  54 =item -no-final
  55
  56 Untoggle final step of Jpred prediction
  57
  58 =item -verbose
  59
  60 Verbose mode. Print more information while running jpred.
  61
  62 =item -debug
  63
  64 Debug mode. Print debugging information while running jpred.
  65
  66 =item -help
  67
  68 Gives help on the programs usage.
  69
  70 =item -man
  71
  72 Displays this man page.
  73
  74 =back
  75
  76 =head1 BUGS
  77
  78 Can't cope with gaps in the query sequence.
  79
  80 Doesn't accept alignments.
  81
  82 If you find any others please contact authors.
  83
  84 =head1 AUTHORS
  85
  86 Jonathan Barber <jon@compbio.dundee.ac.uk>
  87 Chris Cole <christian@cole.name>
  88 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
  89
  90 =cut
  91
  92 ## TODO check for gaps in query sequence
  93 ##      check for disallowed chars in sequence
  94
  95 use strict;
  96 use warnings;
  97 use Getopt::Long;
  98 use Pod::Usage;
  99 use Carp;
 100 use Data::Dumper;
 101 use File::Temp;
 102
 103 #use UNIVERSAL;
 104
 105 # path to Jpred modules
 106 use FindBin qw($Bin);
 107 use lib "$Bin/lib";
 108
 109 # internal jpred modules
 110 use Jpred;
 111 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
 112 use HMMER::Profile;
 113 use HMMER::Profile::Jnet;
 114 use PSIBLAST;
 115 use PSIBLAST::PSSM;
 116 use PSIBLAST::Run;
 117 use FASTA::File;
 118 use FASTA;
 119 use Index;
 120 use Pairwise;
 121 use OC;
 122 use Utils qw(profile);
 123 use Run qw(check);
 124
 125 our $VERSION = '3.0.2';
 126 my $MAX_ALIGN = 10000;    # maximum number of alignment sequences allowed
 127 my $NR_CUT    = 75;       # cut-off seqeunce ID value for defining non-redundancy
 128
 129 # Gap characters in sequences
 130 our $GAP = '-';
 131
 132 #####################################################################################################
 133 # Light object to hold sequence information, light to prevent memory overload
 134 # for big search results. Even so this is quite memory intensive, I can't see
 135 # how to reduce the size further.
 136 {
 137
 138   package PSISEQ;
 139
 140   sub new {
 141     my ( $class, %args ) = @_;
 142     my $self = [];
 143     bless $self, $class;
 144     for ( keys %args ) { $self->$_( $args{$_} ) }
 145     return $self;
 146   }
 147
 148   sub id {
 149     return defined $_[1]
 150       ? $_[0]->[0] = $_[1]
 151       : $_[0]->[0];
 152   }
 153
 154   sub start {
 155     return defined $_[1]
 156       ? $_[0]->[1] = $_[1]
 157       : $_[0]->[1];
 158   }
 159
 160   sub align {
 161     if ( defined $_[1] ) {
 162       ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
 163
 164       $_[0]->[2] = join( "", @{ $_[1] } );
 165     } else {
 166       return [ split( //, $_[0]->[2] ) ];
 167     }
 168   }
 169 }
 170
 171 #####################################################################################################
 172 my $infile;
 173 my $output;
 174 my $psiblast;
 175 my $ncpu       = 1;           # number of CPUs for psiblast calculations
 176 my $predNoHits = 0;           # define whether to make predictions when no PSI-BLAST hits are found
 177 my $db_path    = ".";
 178 my $db_entry   = "unknown";
 179 my $nofinal;
 180
 181 my ( $help, $man, $DEBUG, $VERBOSE );
 182
 183 GetOptions(
 184   "in|sequence=s" => \$infile,
 185   "out|output=s"  => \$output,
 186   "psi=s"         => \$psiblast,
 187   "dbname=s"      => \$db_entry,
 188   "dbpath=s"      => \$db_path,
 189   "ncpu=s"        => \$ncpu,
 190   "pred-nohits"   => \$predNoHits,
 191   "no-final"      => \$nofinal,
 192   "help"          => \$help,
 193   "man"           => \$man,
 194   "debug"         => \$DEBUG,
 195   "verbose"       => \$VERBOSE
 196 ) or pod2usage(2);
 197 pod2usage(1) if $help;
 198 pod2usage( verbose => 2 ) if $man;
 199
 200 #####################################################################################################
 201 # Key to database information and information for accessing them
 202 my $database = {
 203
 204   ## default database used by Jpred
 205   uniref90 => {
 206     database   => $db_path . "/uniref90.filt",
 207     unfiltered => $db_path . "/uniref90",
 208   },
 209
 210   ## database used during Jnet training
 211   training => {
 212     database   => $db_path . "/training/uniref90.filt",
 213     unfiltered => $db_path . "/training/uniref90",
 214   },
 215
 216   ## database used for ported Jpred
 217   ported_db => {
 218     database   => $db_path . "/uniref90.filt",
 219     unfiltered => $db_path . "/uniref90",
 220   },
 221   ## cluster-specific path for Jpred
 222   cluster => {
 223     database   => "/homes/www-jpred/databases/uniref90.filt",
 224     unfiltered => "/homes/www-jpred/databases/uniref90",
 225   },
 226
 227   ## these other DBs are experimental ones used during development.
 228   ## check they exist before using them.
 229   swall => {
 230
 231     # generic entry for use with validate_jnet.pl
 232     # real db location defined by validate_jnet.pl
 233     database   => $db_path . "/swall/swall.filt",
 234     unfiltered => $db_path . "/swall/swall",
 235   },
 236
 237   uniprot => {
 238
 239     # Path to PSIBLAST db
 240     database   => $db_path . "/3/swall.filt",
 241     unfiltered => $db_path . "/3/swall",
 242   },
 243
 244   uniref50 => {
 245     database   => $db_path . "/6/swall.filt",
 246     unfiltered => $db_path . "/6/swall",
 247   },
 248 };
 249
 250 pod2usage(' -sequence argument not provided') unless $infile;
 251 die "-db $db_entry not recognised" unless exists $database->{$db_entry};
 252 $output = $infile . ".res" unless $output;
 253 $ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
 254
 255 for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
 256   my ( $key, $value ) = @{$_};
 257   defined $value or next;
 258   errlog( join( ": ", $key, $value ), "\n" );
 259 }
 260
 261 #####################################################################################################
 262 my $platform = check_OS();
 263 print "JPRED: checking platiform... $platform\n" if $DEBUG;
 264
 265 #####################################################################################################
 266 my $query = FASTA::File->new( read_file => $infile );
 267
 268 my @seqs;
 269 my $psi = PSIBLAST->new;
 270 unless ( defined $psiblast && -e $psiblast ) {
 271   ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
 272   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
 273   # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3'        # stricter searching criteria
 274   # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
 275   my $psi_run = PSIBLAST::Run->new(
 276     debug    => $DEBUG,
 277     args     => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
 278     path     => $psiblastbin,
 279     input    => $infile,
 280     output   => "$output.blast",
 281     matrix   => "$output.profile",
 282     database => $database->{$db_entry}{database},
 283   );
 284
 285   # For reduced databases, get the size of the orginal and use this as
 286   # the equivilent DB size
 287   if ( $db_entry =~ /^sp_red_/ ) {
 288     ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
 289     $psi_run->args(
 290       $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} )    # CC sub is below
 291     );
 292   }
 293   print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
 294   print "BLASTDB path: $ENV{BLASTDB}\n"  if $DEBUG;
 295
 296   print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
 297   $psi_run->run or die;                                                              # CC sub is from PSIBLAST::Run
 298
 299   $psi->read_file("$output.blast");                                                  # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
 300   system("gzip -9f $output.blast");
 301 } else {
 302   if   ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) }                   # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
 303   else                         { $psi->read_file($psiblast) }                        # CC ditto above
 304 }
 305
 306 #####################################################################################################
 307 # Convert the last itteration into a collection of PSISEQ objects
 308 for ( $psi->get_all ) {                                                              # CC sub is from PSIBLAST.pm
 309   my ( $id, $start, $align ) = @{$_};
 310   push @seqs,
 311     PSISEQ->new(
 312     id    => $id,
 313     start => $start,
 314     align => [ split( //, $align ) ]
 315     );
 316 }
 317
 318 #####################################################################################################
 319 ## When there are no PSI-BLAST hits generate an HMM from the query
 320 ## and run Jnet against that only.
 321 ## Accuracy won't be as good, but at least it's a prediction
 322 if ( @seqs == 0 ) {
 323   if ( $predNoHits == 0 ) {
 324     warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
 325     print ">>100% complete\n";
 326     exit;
 327   } else {
 328     print ">>50% complete\n";
 329     warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
 330     print "Running HMMer on query...\n" if $VERBOSE;
 331
 332     # copy input query to alignment
 333     #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
 334     open( my $ifh, "<", $infile )            or die "JPRED: cannot open file $infile: $!";
 335     open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
 336     while (<$ifh>) { print $ofh, $_ }
 337     close $ifh;
 338     close $ofh;
 339
 340     # Temp files required for HMMer
 341     my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
 342
 343     system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
 344     system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 345
 346     # Read in the HMMER file
 347     my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 348
 349     # Convert from HMMER::Profile to HMMER::Profile::Jnet
 350     my $hmmer_jnet = HMMER::Profile::Jnet->new;
 351     $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 352     $hmmer_jnet->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 353     print ">>70% complete\n";
 354
 355     # Run Jnet for the prediction
 356     print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
 357     jnet( map { "$output.$_" } qw(hmm jnet) );    # CC sub is below
 358   }
 359 } else {
 360   psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;    # CC sub is below
 361   print ">>40% complete\n";
 362
 363   #####################################################################################################
 364   # Make PSIBLAST truncated alignments the right length
 365   print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
 366   @seqs = extend( $query, @seqs );                  # CC sub si below
 367   psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
 368
 369   #####################################################################################################
 370   # Remove masking from PSIBLAST alignment
 371   print "Unmasking the alignments...\n" if $VERBOSE;
 372   my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
 373   remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
 374   psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
 375
 376   #####################################################################################################
 377   # Convert the sequences to upper case
 378   print "Converting sequences to the same case...\n" if $VERBOSE;
 379   toupper($_) for @seqs;    # CC sub is below
 380   psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
 381
 382   #####################################################################################################
 383   # Remove excessive sequences
 384   print "Remove excessive sequences...\n" if $VERBOSE;
 385   @seqs = reduce( $MAX_ALIGN, @seqs );
 386   psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
 387
 388   #####################################################################################################
 389   # Remove sequences that are too long or too short
 390   print "Remove sequences which too long or short...\n" if $VERBOSE;
 391   @seqs = del_long_seqs( 50, @seqs );
 392   psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
 393
 394   #####################################################################################################
 395   # Remove redundant sequences based upon pairwise identity and OC clustering
 396   print "Remove redundant sequences...\n" if $VERBOSE;
 397   @seqs = nonred( $NR_CUT, @seqs );
 398   psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
 399
 400   #####################################################################################################
 401   # Check that we haven't got rid of all of the sequences
 402   if ( @seqs < 2 ) {
 403     warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
 404     @seqs = fasta2psiseq("${output}_backup.fasta.gz");
 405   }
 406   unlink("${output}_backup.fasta.gz") unless $DEBUG;
 407
 408   # Remove gaps in the query sequence and same positions in the alignment
 409   print "Removing gaps in the query sequence...\n" if $VERBOSE;
 410   degap(@seqs);
 411   psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
 412
 413   # Output the alignment for the prediction
 414   print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
 415   psiseq2fasta( "$output.align", @seqs );
 416
 417   #####################################################################################################
 418   # Equivilent to getpssm script
 419   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
 420   my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
 421   $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
 422   print ">>50% complete\n";
 423
 424   #####################################################################################################
 425   # Run HMMER on the sequences
 426   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
 427   my $hmmer = hmmer(@seqs);             # CC sub is below
 428   $hmmer->write_file("$output.hmm");    # write_file is in the Read.pm called from the HMMER::Profile module
 429   print ">>70% complete\n";
 430
 431   #####################################################################################################
 432   # Run Jnet for the prediction
 433   print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
 434   jnet( map { "$output.$_" } qw(hmm jnet pssm) );    # CC sub is below
 435 }
 436
 437 print ">>100% complete\n";
 438
 439 unless ( defined $nofinal ) {
 440   my $aligfile = $output.".align";
 441   my $jnetfile = $output.".jnet";
 442   my $jnetfastafile = $output.".jnet.fasta";
 443   my $resufile = $output.".res";
 444   concise2fasta($jnetfile, $jnetfastafile);
 445   open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
 446   open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
 447   open( my $OUT, ">", $resufile ) or die "ERROR! unable to open '$resufile': ${!}\nDied";
 448   while (<$IN2>) {print $OUT $_;}
 449   while (<$IN1>) {print $OUT $_;}
 450   close($IN1);
 451   close($IN2);
 452   close($OUT);
 453 }
 454
 455 print "Jpred Finished\n";
 456 exit;
 457
 458 #####################################################################################################
 459 # Functions
 460 #####################################################################################################
 461
 462 #####################################################################################################
 463 sub fasta2concise {
 464   my $infile  = shift;
 465   my $outfile = shift;
 466
 467   open( my $IN,  "<", $infile )  or die "ERROR! unable to open '$infile': ${!}\nDied";
 468   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 469
 470   my ( $seq, @seqs, @title );
 471   while (<$IN>) {
 472     if (s/^>//) {
 473       if ($seq) {
 474         $seq =~ s/\n|\s//g;
 475         $seq =~ s/(.)/$1,/g;
 476         push @seqs, $seq;
 477         $seq = "";
 478       }
 479       chomp;
 480       push @title, $_;
 481     } else {
 482       chomp;
 483       $seq .= $_;
 484     }
 485   }
 486   $seq =~ s/\n|\s//g;
 487   $seq =~ s/(.)/$1,/g;
 488   push @seqs, $seq;
 489
 490   if ( @title != @seqs ) { die("non matching number of titles and sequences!\n"); }
 491
 492   foreach ( 0 .. $#title ) {
 493     print "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
 494   }
 495   close($IN);
 496   close($OUT);
 497 }
 498
 499 #####################################################################################################
 500 sub concise2fasta {
 501   my $infile  = shift;
 502   my $outfile = shift;
 503
 504   my ( @seqs, %seq, @pred, %pred );
 505
 506   my @var = (
 507   "Lupas_21",  "Lupas_14", "Lupas_28", "JNETPSSM",  "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETFREQ",
 508   "JNETALIGN", "JNETHMM",  "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
 509   );
 510
 511   open( my $IN,  "<", $infile )  or die "ERROR! unable to open '$infile': ${!}\nDied";
 512   open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
 513
 514   while (<$IN>) {
 515     if (/^\n/) { next; }
 516     my ( $id, $seq ) = split( ":", $_ );
 517     if ( !$id || !$seq ) { next; }    # Check we have proper values
 518     $seq =~ s/,//g;
 519     chomp($seq);
 520     if ( $id =~ /;/ ) {               # Then it's an alignment
 521       @_ = split( ";", $id );
 522       push @seqs, $_[1];
 523       $seq{ $_[1] } = $seq;
 524     }
 525     foreach (@var) {
 526       if ( $_ eq $id ) {
 527         push @pred, $_;
 528         $pred{$_} = $seq;
 529       }
 530     }
 531   }
 532   close($IN);
 533
 534   foreach (@seqs) {
 535     $seq{$_} =~ s/(.{72})/$1\n/g;
 536     print $OUT ">$_\n$seq{$_}\n";
 537   }
 538
 539   foreach (@pred) {
 540     $pred{$_} =~ s/[TCYWXZSI\?_]/-/g;
 541     $pred{$_} =~ s/B/E/g;
 542     $pred{$_} =~ s/G/H/g;
 543
 544     if (/SOL/) { $pred{$_} =~ s/E/B/g; }
 545     $pred{$_} =~ s/(.{72})/$1\n/g;
 546     print $OUT ">$_\n$pred{$_}\n";
 547   }
 548   close($OUT);
 549 }
 550
 551 =begin :private
 552
 553 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
 554
 555 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
 556 Does it in place.
 557
 558 =cut
 559
 560 #####################################################################################################
 561 sub remove_seq_masks {
 562   my $idx = shift;
 563   my $seq = shift;
 564
 565   # Only bother if we have a sequence which has been masked
 566   return unless grep { uc $_ eq 'X' } @{ $seq->align };
 567
 568   my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
 569   my $searchdb = $database->{$db_entry}{unfiltered};
 570   $start--;    # We need this to be zero based
 571
 572   my $f    = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
 573   my @seqs = $f->get_entries;
 574   my $db   = shift @seqs;
 575
 576   unless ($db) {
 577     confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
 578   } elsif (@seqs) {
 579     confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
 580   }
 581
 582   my @db_seq = @{ $db->seq };
 583
 584   # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
 585   my $i = 0;
 586   for ( 0 .. $#align ) {
 587     next if $align[$_] =~ /$GAP/;
 588
 589     if ( $align[$_] eq 'X' ) {
 590       unless ( defined $db_seq[ $i + $start ] ) {
 591         croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
 592       }
 593       $align[$_] = $db_seq[ $i + $start ];
 594     }
 595     $i++;
 596   }
 597   $seq->align( \@align );
 598 }
 599
 600 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
 601
 602 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
 603
 604 Equivilent to reduce script.
 605
 606 =cut
 607
 608 #####################################################################################################
 609 sub reduce {
 610   my ( $max, @seqs ) = @_;
 611
 612   my $num = int( @seqs / $max );
 613   my @res = shift @seqs;
 614
 615   # Don't bother if we have less than the required sequences
 616   if ( @seqs < $max or 0 == $num ) {
 617     return @res, @seqs;
 618   }
 619
 620   for ( 0 .. $#seqs ) {
 621     push @res, $seqs[$_] if ( 0 == $_ % $num );
 622   }
 623
 624   return @res;
 625 }
 626
 627 =head2 nonred($cutoff, @PSISEQS);
 628
 629 Removes redundant sequences at $cutoff level of sequence identity.
 630
 631 Equivilent to nonred script.
 632
 633 =cut
 634
 635 #####################################################################################################
 636 sub nonred {
 637   my ( $cutoff, @seqs ) = @_;
 638
 639   # Run pairwise and then OC and remove similar sequences
 640   unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
 641   unless (@seqs)             { croak "JPRED: No sequences passed to nonred()" }
 642   if     ( @seqs == 1 ) {
 643     warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
 644     return @seqs;
 645   }
 646
 647   # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
 648   my $fasta = FASTA::File->new;
 649   foreach (@seqs) {
 650     my $f = FASTA->new( id => $_->id );
 651     $f->seq( @{ $_->align } );
 652     $fasta->add_entries($f);
 653   }
 654
 655   # Run pairwise
 656   my $pair = Pairwise->new( path => $pairwise );
 657   my $foo = join "\n", $pair->run($fasta) or die $!;
 658
 659   # Run OC
 660   my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
 661   $ocres->read_string( join "", $ocres->run($foo) );
 662
 663   # Now found those entries that are unrelated
 664   my @keep;
 665   for ( $ocres->get_groups ) {
 666     my ( $group, $score, $size, @labels ) = @{$_};
 667
 668     # Keep the QUERY from the cluster, otherwise just get onelt
 669     if ( grep { /QUERY/ } @labels ) {
 670
 671       # Make sure the query stays at the start
 672       unshift @keep, "QUERY";
 673       next;
 674     } else {
 675       push @keep, shift @labels;
 676     }
 677   }
 678   push @keep, $ocres->get_unclustered;
 679
 680   # Return the matching entries.
 681   # We use the temporay to mimic the selection of sequences in the nonred
 682   # script, which took the sequences in the order they occured in the file.
 683   my @filtered_res;
 684   for (@seqs) {
 685     my $label = $_->id;
 686     if ( grep { $_ =~ /^$label$/ } @keep ) {
 687       push @filtered_res, $_;
 688     }
 689   }
 690
 691   return @filtered_res;
 692 }
 693
 694 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
 695
 696 Removes those sequences that are more than $percentage longer or shorter than the first
 697 sequence in @seqs. @seqs is an array of PSISEQ objects.
 698
 699 Equivilent to trim_seqs script.
 700
 701 =cut
 702
 703 #####################################################################################################
 704 sub del_long_seqs {
 705   my ( $factor, @seqs ) = @_;
 706
 707   # Keep the query sequence (we assume query is the 1st FASTA record)
 708   my $query = shift @seqs;
 709   my @res   = $query;
 710
 711   # Find the length of the query sequence without any gaps
 712   my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
 713
 714   # Calculate the upper and lower allowed bounds
 715   my ( $upper, $lower );
 716   {
 717     my $bounds = ( $length / 100 ) * $factor;
 718     ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
 719   }
 720
 721   # Now try each sequence and see how long they are
 722   for (@seqs) {
 723     my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
 724     if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
 725   }
 726
 727   return @res;
 728 }
 729
 730 =head2 toupper ($PSISEQ)
 731
 732 Converts sequence held in PSISEQ object to uppercase.
 733
 734 =cut
 735
 736 #####################################################################################################
 737 sub toupper {
 738   $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
 739 }
 740
 741 =head2 degap($PSISEQ_QUERY, @PSISEQS)
 742
 743 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
 744
 745 Equivilent to fasta2jnet script.
 746
 747 =cut
 748
 749 #####################################################################################################
 750 sub degap {
 751   my (@seqs) = @_;
 752
 753   # Find where the gaps in the query sequence are
 754   my @gaps;
 755   my $i = 0;
 756   for ( @{ $seqs[0]->align } ) {
 757     push @gaps, $i if $_ !~ /$GAP/;
 758     $i++;
 759   }
 760
 761   return unless @gaps;
 762
 763   # Remove the gaps in all the sequences.
 764   # Derefences the array reference and takes a slice inside the method call
 765   # arguments, then coerce back to an array ref.
 766   for (@seqs) {
 767     $_->align( [ @{ $_->align }[@gaps] ] );
 768   }
 769 }
 770
 771 =head2 getfreq($filename, @PSISEQS)
 772
 773 Creates a PSIBLAST like profile and writes it to the $filename.
 774
 775 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
 776 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
 777 but it's *not* the same.
 778
 779 =cut
 780
 781 #####################################################################################################
 782 sub getfreq {
 783   my ( $fn, @seqs ) = @_;
 784
 785   #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
 786
 787   # Create a PSIBLAST like profile from the alignment
 788   # This doesn't greate the same result as old Jpred, I think this is because
 789   # differences in the input between old and new
 790
 791   # For testing
 792   #   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
 793   #   my @profile = profile(
 794   #      map { join "", @{ $_->seq } } $f->get_entries
 795   #   );
 796
 797   my @profile = profile( map { join "", @{ $_->align } } @seqs );
 798
 799   open my $fh, ">$fn" or die "JPRED: $fn: $!";
 800   print $fh join( " ", @{$_} ), "\n" for @profile;
 801 }
 802
 803 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
 804
 805 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
 806
 807 Equivilent to gethmm script.
 808
 809 =cut
 810
 811 #####################################################################################################
 812 sub hmmer {
 813   my (@seqs) = @_;
 814
 815   # Temp files required
 816   my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
 817
 818   # Create the FASTA file
 819   psiseq2fasta( $tmp_fasta, @seqs );
 820
 821   # Run HMMer
 822   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
 823   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
 824
 825   # Read in the HMMER file
 826   my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
 827
 828   # Convert from HMMER::Profile to HMMER::Profile::Jnet
 829   my $hmmer_jnet = HMMER::Profile::Jnet->new;
 830   $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
 831
 832   return $hmmer_jnet;
 833 }
 834
 835 =head2 jnet($hmmer, $out, $pssm )
 836
 837 Runs Jnet. Pass the paths of the filenames to do the prediction on
 838
 839 =cut
 840
 841 #####################################################################################################
 842 sub jnet {
 843   my ( $hmmer, $outfile, $pssm ) = @_;
 844   if ($pssm) {
 845
 846     # run Jnet prediction with all the profile data
 847     system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
 848   } else {
 849
 850     # run Jnet prediction with only HMMer and alignment data
 851     system("$jnet --concise --hmmer $hmmer > $outfile");
 852   }
 853   check( "jnet", $? ) or exit 1;
 854 }
 855
 856 =head1 psiseq2fasta($path, @PSISEQ)
 857
 858 Writes a FASTA file given an array of PSISEQ objects.
 859
 860 =cut
 861
 862 #####################################################################################################
 863 sub psiseq2fasta {
 864   my ( $fn, @seqs ) = @_;
 865
 866   #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
 867   confess "JPRED: not passed PSISEQs" unless @seqs;
 868
 869   #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
 870
 871   # Changed from using FASTA::File object due to seg faulting on some large,
 872   # long alignments, presumably due to running out of memory.
 873   #   my $fasta = FASTA::File->new;
 874   #   $fasta->add_entries(
 875   #      map {
 876   #         my $f = FASTA->new(id => $_->id);
 877   #         $f->seq(@{$_->align});
 878   #         $f;
 879   #      } @seqs
 880   #   );
 881   #   $fasta->write_file($fn);
 882
 883   my $fh;
 884   $fn =~ /\.gz$/
 885     ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
 886     : ( open $fh, ">$fn" or die "JPRED: $!" );
 887
 888   for (@seqs) {
 889     print $fh ">", $_->id, "\n";
 890     my $seq = join( "", @{ $_->align } );
 891     $seq =~ s/(.{72})/$1\n/g;
 892     chomp $seq;
 893     print $fh $seq . "\n";
 894   }
 895   close $fh;
 896 }
 897
 898 =head1 @PSISEQ = fasta2psiseq($path)
 899
 900 Reads in a FASTA file and returns an array of PSISEQ objects.
 901
 902 =cut
 903
 904 #####################################################################################################
 905 sub fasta2psiseq {
 906   my ($fn) = @_;
 907
 908   my $fasta =
 909     $fn =~ /\.gz$/
 910     ? FASTA::File->new( read_gzip_file => $fn )
 911     : FASTA::File->new( read_file      => $fn );
 912   $fasta or die "JPRED: $!";
 913
 914   my @seqs;
 915   for ( $fasta->get_entries ) {
 916     my $seq = PSISEQ->new;
 917     $seq->id( $_->id );
 918     $seq->align( [ @{ $_->seq } ] );
 919     push @seqs, $seq;
 920   }
 921
 922   return @seqs;
 923 }
 924
 925 #####################################################################################################
 926 sub fasta2psiseq_lowmemory {
 927   my $filename = shift;
 928
 929   local $/ = "\n";
 930   my $fh;
 931   $filename =~ /\.gz$/
 932     ? ( open $fh, "gzip -dc $filename|" or die $! )
 933     : ( open $fh, $filename or die $! );
 934
 935   my @seqs;
 936   my ( $id, $seq );
 937   while (<$fh>) {
 938     chomp;
 939     if (s/^>//) {
 940       $seq =~ s/ //g if defined $seq;
 941
 942       if ( $id and $seq ) {
 943         my $psi = PSISEQ->new( id => $id );
 944         $psi->align( [ split //, $seq ] );
 945         push @seqs, $psi;
 946         undef $id;
 947         undef $seq;
 948       } else {
 949         $id = $_;
 950       }
 951     } else {
 952       $seq .= $_;
 953     }
 954   }
 955   if ( $id and $seq ) {
 956     my $psi = PSISEQ->new( id => $id );
 957     $psi->seq($seq);
 958     push @seqs, $psi;
 959   }
 960
 961   return @seqs;
 962 }
 963
 964 =head1 extend($FASTA::File, @PSISEQ)
 965
 966 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
 967 against the query, this function extends the alignment so that all of the query is
 968 present in the alignment. The other sequences are extended with gap characters.
 969
 970 =cut
 971
 972 #####################################################################################################
 973 sub extend {
 974   my ( $query, @seqs ) = @_;
 975
 976   #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
 977   #croak "JPRED: Not PSISEQ"           if grep { not isa $_, 'PSISEQ' } @seqs;
 978
 979   # Get the query sequence
 980   my $q_query = $query->get_entry(0);
 981
 982   # Get the query sequence from the PSIBLAST alignment
 983   my $a_query = shift @seqs;
 984
 985   # The gap size required to expand the alignment
 986   my ( $start_gap_length, $end_gap_length );
 987   {
 988
 989     # Make handling the sequence easier
 990     my $q_seq = join "", @{ [ $q_query->seq ] };
 991     my $a_seq = join "", @{ $a_query->align };
 992
 993     $q_seq =~ s/-//g;
 994     $a_seq =~ s/-//g;
 995
 996     ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
 997
 998     $start_gap_length = index( $q_seq, $a_seq );
 999     croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1000
1001     $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1002   }
1003
1004   # Add the gaps to the end of the alignments
1005   for (@seqs) {
1006     $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1007   }
1008
1009   # Put the query sequence back
1010   unshift @seqs,
1011     PSISEQ->new(
1012     id    => $a_query->id,
1013     align => [
1014       ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1015       @{ $a_query->align },
1016       ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1017     ]
1018     );
1019
1020   return @seqs;
1021 }
1022
1023 =head1 $int = fasta_seq_length($path)
1024
1025 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1026
1027 =cut
1028
1029 #####################################################################################################
1030 sub fasta_seq_length {
1031   my ($fn) = @_;
1032   open my $fh, $fn or croak "Can't open file $fn: $!\n";
1033
1034   my $length = 0;
1035   local $_, $/ = "\n";
1036   while (<$fh>) {
1037     next if /^>/;
1038     chomp;
1039     $length += tr///c;
1040   }
1041
1042   return $length;
1043 }
1044
1045 =head1 log(@messages)
1046
1047 Report messages to STDERR
1048
1049 =cut
1050
1051 #####################################################################################################
1052 sub errlog { print STDERR @_ }