5 jpred - Secondary structure prediction program
9 ./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program which predicts the secondary structure of a protein sequence given a path to a
14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
17 Some of the output may not be meaningful if used directly - it can be safely ignored.
23 =item -in/-sequence <FILE1>
25 The path to the sequence file (in FASTA format) you want to predict.
27 =item -output <FILEPREFIX>
29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
31 =item -outfile <FILE2>
33 An output file, by default it is undefined and the -output option is working
35 =item -logfile <FILE3>
37 Logging file, by default it is undefined and logging switched off
41 Path to the uniref database used for PSI-BLAST querying. default value: .
43 =item -dbname database
45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
46 Default value: uniref90
50 Path to a PSIBLAST output file.
54 Number of CPU used by jpred.pl. Maximum value is 8
59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
64 The option untoggles this step and stops jpred at the concise file generation
68 Redefined basic variable in order to use the script within JABAWS
72 Verbose mode. Print more information while running jpred.
76 Debug mode. Print debugging information while running jpred.
80 Gives help on the programs usage.
84 Displays this man page.
90 Can't cope with gaps in the query sequence.
92 Doesn't accept alignments.
94 If you find any others please contact authors.
98 Jonathan Barber <jon@compbio.dundee.ac.uk>
99 Chris Cole <christian@cole.name>
100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
104 ## TODO check for gaps in query sequence
105 ## check for disallowed chars in sequence
117 # path to Jpred modules
118 use FindBin qw($Bin);
121 # internal jpred modules
123 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
125 use HMMER::Profile::Jnet;
134 use Utils qw(profile);
137 our $VERSION = '3.0.2';
138 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
139 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
141 # Gap characters in sequences
144 #####################################################################################################
145 # Light object to hold sequence information, light to prevent memory overload
146 # for big search results. Even so this is quite memory intensive, I can't see
147 # how to reduce the size further.
153 my ( $class, %args ) = @_;
156 for ( keys %args ) { $self->$_( $args{$_} ) }
173 if ( defined $_[1] ) {
174 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
176 $_[0]->[2] = join( "", @{ $_[1] } );
178 return [ split( //, $_[0]->[2] ) ];
183 #####################################################################################################
190 my $ncpu = 1; # number of CPUs for psiblast calculations
191 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
192 my $db_path = "/homes/www-jpred/databases";
193 my $db_entry = "cluster";
196 my ( $help, $man, $DEBUG, $VERBOSE );
199 "in|sequence=s" => \$infile,
200 "output=s" => \$prefix,
201 "outfile=s" => \$outfile,
202 "logfile=s" => \$logfile,
203 "psi=s" => \$psiblast,
204 "dbname=s" => \$db_entry,
205 "dbpath=s" => \$db_path,
207 "pred-nohits" => \$predNoHits,
208 "no-final" => \$nofinal,
209 "jabaws" => \$jabaws,
213 "verbose" => \$VERBOSE
215 pod2usage(1) if $help;
216 pod2usage( verbose => 2 ) if $man;
218 #####################################################################################################
219 # Key to database information and information for accessing them
222 ## default database used by Jpred
224 database => $db_path . "/uniref90.filt",
225 unfiltered => $db_path . "/uniref90",
228 ## database used during Jnet training
230 database => $db_path . "/training/uniref90.filt",
231 unfiltered => $db_path . "/training/uniref90",
234 ## database used for ported Jpred
236 database => $db_path . "/uniref90.filt",
237 unfiltered => $db_path . "/uniref90",
240 ## cluster-specific path for Jpred
242 database => $db_path . "uniref90.filt",
243 unfiltered => $db_path . "uniref90",
246 ## these other DBs are experimental ones used during development.
247 ## check they exist before using them.
248 # generic entry for use with validate_jnet.pl
249 # real db location defined by validate_jnet.pl
251 database => $db_path . "/swall/swall.filt",
252 unfiltered => $db_path . "/swall/swall",
255 # Path to PSIBLAST db
257 database => $db_path . "/3/swall.filt",
258 unfiltered => $db_path . "/3/swall",
262 database => $db_path . "/6/swall.filt",
263 unfiltered => $db_path . "/6/swall",
267 pod2usage(' ERROR! Input file with a sequence should be provided with -sequence/-in') unless $infile;
268 pod2usage(' ERROR! Unknown database at ' . $db_path . '. Use -dbpath and -dbname for configuring the database' ) unless exists $database->{$db_entry};
269 pod2usage(' ERROR! UNIREF database doesn\'t exist. Use -dbpath and -dbname for configuring the database') unless -f $database->{$db_entry}{database};
271 if ( defined $prefix ) {
272 unless (defined $outfile) {
273 $outfile = $prefix. ".res.fasta";
276 if (defined $outfile) {
277 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
278 $prefix = $outfile . ".tmp";
280 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
281 $prefix = $infile . ".res";
282 $outfile = $prefix. ".jnet";
286 if ( 1 > $ncpu or 8 < $ncpu ) {
287 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
291 $VERBOSE = 1 if $DEBUG;
294 if (defined $logfile) {
295 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
298 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
299 my ( $key, $value ) = @{$_};
300 defined $value or next;
301 my $par = join( ": ", $key, $value );
302 print "$par\n" if $DEBUG;
303 print $LOG "$par\n" if $LOG;
306 #####################################################################################################
307 if (defined $jabaws) {
308 setup_jpred_env($Bin);
311 my $platform = check_OS();
312 print "JPRED: checking platiform... $platform\n" if $DEBUG;
313 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
315 #####################################################################################################
316 my $query = FASTA::File->new( read_file => $infile );
319 my $psi = PSIBLAST->new;
320 unless ( defined $psiblast && -e $psiblast ) {
321 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
322 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
323 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
324 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
325 my $psi_run = PSIBLAST::Run->new(
327 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
328 path => $psiblastbin,
330 output => "$prefix.blast",
331 matrix => "$prefix.profile",
332 database => $database->{$db_entry}{database},
335 # For reduced databases, get the size of the orginal and use this as
336 # the equivilent DB size
337 if ( $db_entry =~ /^sp_red_/ ) {
338 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
340 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
343 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
344 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
345 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
346 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
348 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
349 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
350 $psi_run->run or die; # CC sub is from PSIBLAST::Run
352 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
353 system("gzip -9f $prefix.blast");
355 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
356 else { $psi->read_file($psiblast) } # CC ditto above
359 #####################################################################################################
360 # Convert the last itteration into a collection of PSISEQ objects
361 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
362 my ( $id, $start, $align ) = @{$_};
367 align => [ split( //, $align ) ]
371 #####################################################################################################
372 ## When there are no PSI-BLAST hits generate an HMM from the query
373 ## and run Jnet against that only.
374 ## Accuracy won't be as good, but at least it's a prediction
376 if ( $predNoHits == 0 ) {
377 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
378 print ">>100% complete\n";
379 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
380 print $LOG ">>100% complete\n" if $LOG;
381 close ($LOG) if $LOG;
384 print ">>50% complete\n";
385 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
386 print "Running HMMer on query...\n" if $VERBOSE;
387 print $LOG ">>50% complete\n" if $LOG;
388 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
389 print $LOG "Running HMMer on query...\n" if $LOG;
391 # copy input query to alignment
392 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
393 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
394 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
395 while (<$ifh>) { print $ofh, $_ }
399 # Temp files required for HMMer
400 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
402 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
403 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
405 # Read in the HMMER file
406 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
408 # Convert from HMMER::Profile to HMMER::Profile::Jnet
409 my $hmmer_jnet = HMMER::Profile::Jnet->new;
410 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
411 $hmmer_jnet->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
412 print ">>70% complete\n";
413 print $LOG ">>70% complete\n" if $LOG;
415 # Run Jnet for the prediction
416 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
417 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
418 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
419 print ">>90% complete\n";
420 print $LOG ">>90% complete\n" if $LOG;
421 print $jnetlog if $VERBOSE;
422 print $LOG $jnetlog if $LOG;
425 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
426 print ">>40% complete\n";
427 print $LOG ">>40% complete\n" if $LOG;
429 #####################################################################################################
430 # Make PSIBLAST truncated alignments the right length
431 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
432 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
433 @seqs = extend( $query, @seqs );
434 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
436 #####################################################################################################
437 # Remove masking from PSIBLAST alignment
438 print "Unmasking the alignments...\n" if $VERBOSE;
439 print $LOG "Unmasking the alignments...\n" if $LOG;
440 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
441 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
442 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
444 #####################################################################################################
445 # Convert the sequences to upper case
446 print "Converting sequences to the same case...\n" if $VERBOSE;
447 print $LOG "Converting sequences to the same case...\n" if $LOG;
448 toupper($_) for @seqs; # CC sub is below
449 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
451 #####################################################################################################
452 # Remove excessive sequences
453 print "Remove excessive sequences...\n" if $VERBOSE;
454 print $LOG "Remove excessive sequences...\n" if $LOG;
455 @seqs = reduce( $MAX_ALIGN, @seqs );
456 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
458 #####################################################################################################
459 # Remove sequences that are too long or too short
460 print "Remove sequences which too long or short...\n" if $VERBOSE;
461 print $LOG "Remove sequences which too long or short...\n" if $LOG;
462 @seqs = del_long_seqs( 50, @seqs );
463 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
465 #####################################################################################################
466 # Remove redundant sequences based upon pairwise identity and OC clustering
467 print "Remove redundant sequences...\n" if $VERBOSE;
468 print $LOG "Remove redundant sequences...\n" if $LOG;
469 @seqs = nonred( $NR_CUT, @seqs );
470 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
472 #####################################################################################################
473 # Check that we haven't got rid of all of the sequences
475 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
476 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
477 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
479 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
481 # Remove gaps in the query sequence and same positions in the alignment
482 print "Removing gaps in the query sequence...\n" if $VERBOSE;
483 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
485 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
487 # Output the alignment for the prediction
488 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
489 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
490 psiseq2fasta( "$prefix.align", @seqs );
492 #####################################################################################################
493 # Equivilent to getpssm script
494 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
495 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
496 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
497 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
498 print ">>50% complete\n";
499 print $LOG ">>50% complete\n" if $LOG;
501 #####################################################################################################
502 # Run HMMER on the sequences
503 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
504 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
505 my $hmmer = hmmer(@seqs); # CC sub is below
506 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
507 print ">>70% complete\n";
508 print $LOG ">>70% complete\n" if $LOG;
510 #####################################################################################################
511 # Run Jnet for the prediction
512 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
513 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
514 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
515 print ">>90% complete\n";
516 print $LOG ">>90% complete\n" if $LOG;
517 print $jnetlog if $VERBOSE;
518 print $LOG $jnetlog if $LOG;
521 unless ( defined $nofinal ) {
522 my $aligfile = $prefix . ".align";
523 my $jnetfile = $prefix . ".jnet";
524 my $jnetfastafile = $prefix . ".jnet.fasta";
525 concise2fasta( $jnetfile, $jnetfastafile );
526 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
527 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
528 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
529 while (<$IN2>) { print $OUT $_; }
530 while (<$IN1>) { print $OUT $_; }
534 unlink $jnetfastafile;
536 if (defined $outfile and $prefix.".jnet" ne $outfile) {
537 rename $prefix.".jnet", $outfile;
541 print ">>100% complete\n";
542 print "Jpred Finished\n";
543 print $LOG ">>100% complete\n" if $LOG;
544 print $LOG "Jpred Finished\n" if $LOG;
545 close ($LOG) if $LOG;
548 #####################################################################################################
550 #####################################################################################################
552 #####################################################################################################
556 =head2 fasta2concise ($infile, $outfile)
558 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
566 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
567 my ( $seq, @seqs, @title );
588 if ( @title != @seqs ) {
589 die("non matching number of titles and sequences!\n");
592 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
593 foreach ( 0 .. $#title ) {
594 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
601 =head2 concise2fasta ($infile, $outfile)
603 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
607 #####################################################################################################
612 my ( @seqs, %seq, @pred, %pred );
615 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
616 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
619 # parse input concise file
620 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
623 my ( $id, $seq ) = split( ":", $_ );
624 if ( defined $id and defined $seq ) { # Check we have proper values
627 if ( $id =~ /;/ ) { # Then it's an alignment
628 @_ = split( ";", $id );
630 $seq{ $_[1] } = $seq;
632 foreach my $v (@var) {
643 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
645 # print out sequences found with PSI-BLAST
647 $seq{$_} =~ s/(.{72})/$1\n/g;
648 print $OUT ">$_\n$seq{$_}\n";
651 # print out predictions
652 foreach my $p (@pred) {
653 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
654 $pred{$p} =~ s/B/E/g;
655 $pred{$p} =~ s/G/H/g;
658 $pred{$p} =~ s/E/B/g;
660 $pred{$p} =~ s/(.{72})/$1\n/g;
661 print $OUT ">$p\n$pred{$p}\n";
668 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
670 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
675 #####################################################################################################
676 sub remove_seq_masks {
680 # Only bother if we have a sequence which has been masked
681 return unless grep { uc $_ eq 'X' } @{ $seq->align };
683 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
684 my $searchdb = $database->{$db_entry}{unfiltered};
685 $start--; # We need this to be zero based
687 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
688 my @seqs = $f->get_entries;
689 my $db = shift @seqs;
692 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
694 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
697 my @db_seq = @{ $db->seq };
699 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
701 for ( 0 .. $#align ) {
702 next if $align[$_] =~ /$GAP/;
704 if ( $align[$_] eq 'X' ) {
705 unless ( defined $db_seq[ $i + $start ] ) {
706 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
708 $align[$_] = $db_seq[ $i + $start ];
712 $seq->align( \@align );
715 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
717 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
719 Equivilent to reduce script.
723 #####################################################################################################
725 my ( $max, @seqs ) = @_;
727 my $num = int( @seqs / $max );
728 my @res = shift @seqs;
730 # Don't bother if we have less than the required sequences
731 if ( @seqs < $max or 0 == $num ) {
735 for ( 0 .. $#seqs ) {
736 push @res, $seqs[$_] if ( 0 == $_ % $num );
742 =head2 nonred($cutoff, @PSISEQS);
744 Removes redundant sequences at $cutoff level of sequence identity.
746 Equivilent to nonred script.
750 #####################################################################################################
752 my ( $cutoff, @seqs ) = @_;
754 # Run pairwise and then OC and remove similar sequences
755 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
756 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
758 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
762 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
763 my $fasta = FASTA::File->new;
765 my $f = FASTA->new( id => $_->id );
766 $f->seq( @{ $_->align } );
767 $fasta->add_entries($f);
771 my $pair = Pairwise->new( path => $pairwise );
772 my $foo = join "\n", $pair->run($fasta) or die $!;
775 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
776 $ocres->read_string( join "", $ocres->run($foo) );
778 # Now found those entries that are unrelated
780 for ( $ocres->get_groups ) {
781 my ( $group, $score, $size, @labels ) = @{$_};
783 # Keep the QUERY from the cluster, otherwise just get onelt
784 if ( grep { /QUERY/ } @labels ) {
786 # Make sure the query stays at the start
787 unshift @keep, "QUERY";
790 push @keep, shift @labels;
793 push @keep, $ocres->get_unclustered;
795 # Return the matching entries.
796 # We use the temporay to mimic the selection of sequences in the nonred
797 # script, which took the sequences in the order they occured in the file.
801 if ( grep { $_ =~ /^$label$/ } @keep ) {
802 push @filtered_res, $_;
806 return @filtered_res;
809 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
811 Removes those sequences that are more than $percentage longer or shorter than the first
812 sequence in @seqs. @seqs is an array of PSISEQ objects.
814 Equivilent to trim_seqs script.
818 #####################################################################################################
820 my ( $factor, @seqs ) = @_;
822 # Keep the query sequence (we assume query is the 1st FASTA record)
823 my $query = shift @seqs;
826 # Find the length of the query sequence without any gaps
827 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
829 # Calculate the upper and lower allowed bounds
830 my ( $upper, $lower );
832 my $bounds = ( $length / 100 ) * $factor;
833 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
836 # Now try each sequence and see how long they are
838 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
839 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
845 =head2 toupper ($PSISEQ)
847 Converts sequence held in PSISEQ object to uppercase.
851 #####################################################################################################
853 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
856 =head2 degap($PSISEQ_QUERY, @PSISEQS)
858 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
860 Equivilent to fasta2jnet script.
864 #####################################################################################################
868 # Find where the gaps in the query sequence are
871 for ( @{ $seqs[0]->align } ) {
872 push @gaps, $i if $_ !~ /$GAP/;
878 # Remove the gaps in all the sequences.
879 # Derefences the array reference and takes a slice inside the method call
880 # arguments, then coerce back to an array ref.
882 $_->align( [ @{ $_->align }[@gaps] ] );
886 =head2 getfreq($filename, @PSISEQS)
888 Creates a PSIBLAST like profile and writes it to the $filename.
890 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
891 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
892 but it's *not* the same.
896 #####################################################################################################
898 my ( $fn, @seqs ) = @_;
900 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
902 # Create a PSIBLAST like profile from the alignment
903 # This doesn't greate the same result as old Jpred, I think this is because
904 # differences in the input between old and new
907 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
908 # my @profile = profile(
909 # map { join "", @{ $_->seq } } $f->get_entries
912 my @profile = profile( map { join "", @{ $_->align } } @seqs );
914 open my $fh, ">$fn" or die "JPRED: $fn: $!";
915 print $fh join( " ", @{$_} ), "\n" for @profile;
918 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
920 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
922 Equivilent to gethmm script.
926 #####################################################################################################
930 # Temp files required
931 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
933 # Create the FASTA file
934 psiseq2fasta( $tmp_fasta, @seqs );
937 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
938 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
940 # Read in the HMMER file
941 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
943 # Convert from HMMER::Profile to HMMER::Profile::Jnet
944 my $hmmer_jnet = HMMER::Profile::Jnet->new;
945 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
950 =head2 jnet($hmmer, $out, $pssm )
952 Runs Jnet. Pass the paths of the filenames to do the prediction on
956 #####################################################################################################
958 my ( $hmmer, $outfile, $pssm ) = @_;
961 # run Jnet prediction with all the profile data
962 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
965 # run Jnet prediction with only HMMer and alignment data
966 system("$jnet --concise --hmmer $hmmer > $outfile");
968 check( "jnet", $? ) or exit 1;
971 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
973 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
980 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
981 print $OUT "\n".$res;
986 =head1 psiseq2fasta($path, @PSISEQ)
988 Writes a FASTA file given an array of PSISEQ objects.
992 #####################################################################################################
994 my ( $fn, @seqs ) = @_;
996 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
997 confess "JPRED: not passed PSISEQs" unless @seqs;
999 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1001 # Changed from using FASTA::File object due to seg faulting on some large,
1002 # long alignments, presumably due to running out of memory.
1003 # my $fasta = FASTA::File->new;
1004 # $fasta->add_entries(
1006 # my $f = FASTA->new(id => $_->id);
1007 # $f->seq(@{$_->align});
1011 # $fasta->write_file($fn);
1015 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1016 : ( open $fh, ">$fn" or die "JPRED: $!" );
1019 print $fh ">", $_->id, "\n";
1020 my $seq = join( "", @{ $_->align } );
1021 $seq =~ s/(.{72})/$1\n/g;
1023 print $fh $seq . "\n";
1028 =head1 @PSISEQ = fasta2psiseq($path)
1030 Reads in a FASTA file and returns an array of PSISEQ objects.
1034 #####################################################################################################
1040 ? FASTA::File->new( read_gzip_file => $fn )
1041 : FASTA::File->new( read_file => $fn );
1042 $fasta or die "JPRED: $!";
1045 for ( $fasta->get_entries ) {
1046 my $seq = PSISEQ->new;
1048 $seq->align( [ @{ $_->seq } ] );
1055 #####################################################################################################
1056 sub fasta2psiseq_lowmemory {
1057 my $filename = shift;
1061 $filename =~ /\.gz$/
1062 ? ( open $fh, "gzip -dc $filename|" or die $! )
1063 : ( open $fh, $filename or die $! );
1070 $seq =~ s/ //g if defined $seq;
1072 if ( $id and $seq ) {
1073 my $psi = PSISEQ->new( id => $id );
1074 $psi->align( [ split //, $seq ] );
1085 if ( $id and $seq ) {
1086 my $psi = PSISEQ->new( id => $id );
1094 =head1 extend($FASTA::File, @PSISEQ)
1096 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1097 against the query, this function extends the alignment so that all of the query is
1098 present in the alignment. The other sequences are extended with gap characters.
1102 #####################################################################################################
1104 my ( $query, @seqs ) = @_;
1106 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1107 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1109 # Get the query sequence
1110 my $q_query = $query->get_entry(0);
1112 # Get the query sequence from the PSIBLAST alignment
1113 my $a_query = shift @seqs;
1115 # The gap size required to expand the alignment
1116 my ( $start_gap_length, $end_gap_length );
1119 # Make handling the sequence easier
1120 my $q_seq = join "", @{ [ $q_query->seq ] };
1121 my $a_seq = join "", @{ $a_query->align };
1126 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1128 $start_gap_length = index( $q_seq, $a_seq );
1129 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1131 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1134 # Add the gaps to the end of the alignments
1136 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1139 # Put the query sequence back
1144 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1145 @{ $a_query->align },
1146 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1153 =head1 $int = fasta_seq_length($path)
1155 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1159 #####################################################################################################
1160 sub fasta_seq_length {
1162 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1165 local $_, $/ = "\n";
1175 =head1 log(@messages)
1177 Report messages to STDERR