5 jpred - Secondary structure prediction program
9 ./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program which predicts the secondary structure of a protein sequence given a path to a
14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
17 Some of the output may not be meaningful if used directly - it can be safely ignored.
23 =item -in/-sequence <FILE1>
25 The path to the sequence file (in FASTA format) you want to predict.
27 =item -output <FILEPREFIX>
29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
31 =item -outfile <FILE2>
33 An output file, by default it is undefined and the -output option is working
35 =item -logfile <FILE3>
37 Logging file, by default it is undefined and logging switched off
41 Path to the uniref database used for PSI-BLAST querying. default value: .
43 =item -dbname database
45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
46 Default value: uniref90
50 Path to a PSIBLAST output file.
54 Number of CPU used by jpred.pl. Maximum value is 8
59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
64 The option untoggles this step and stops jpred at the concise file generation
68 Redefined basic variable in order to use the script within JABAWS
72 Verbose mode. Print more information while running jpred.
76 Debug mode. Print debugging information while running jpred.
80 Gives help on the programs usage.
84 Displays this man page.
90 Can't cope with gaps in the query sequence.
92 Doesn't accept alignments.
94 If you find any others please contact authors.
98 Jonathan Barber <jon@compbio.dundee.ac.uk>
99 Chris Cole <christian@cole.name>
100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
104 ## TODO check for gaps in query sequence
105 ## check for disallowed chars in sequence
117 # path to Jpred modules
118 use FindBin qw($Bin);
121 # internal jpred modules
123 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
125 use HMMER::Profile::Jnet;
134 use Utils qw(profile);
137 our $VERSION = '3.0.2';
138 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
139 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
141 # Gap characters in sequences
144 #####################################################################################################
145 # Light object to hold sequence information, light to prevent memory overload
146 # for big search results. Even so this is quite memory intensive, I can't see
147 # how to reduce the size further.
153 my ( $class, %args ) = @_;
156 for ( keys %args ) { $self->$_( $args{$_} ) }
173 if ( defined $_[1] ) {
174 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
176 $_[0]->[2] = join( "", @{ $_[1] } );
178 return [ split( //, $_[0]->[2] ) ];
183 #####################################################################################################
190 my $ncpu = 1; # number of CPUs for psiblast calculations
191 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
192 my $db_path = "/homes/www-jpred/databases";
193 my $db_entry = "cluster";
196 my ( $help, $man, $DEBUG, $VERBOSE );
199 "in|sequence=s" => \$infile,
200 "output=s" => \$prefix,
201 "outfile=s" => \$outfile,
202 "logfile=s" => \$logfile,
203 "psi=s" => \$psiblast,
204 "dbname=s" => \$db_entry,
205 "dbpath=s" => \$db_path,
207 "pred-nohits" => \$predNoHits,
208 "no-final" => \$nofinal,
209 "jabaws" => \$jabaws,
213 "verbose" => \$VERBOSE
215 pod2usage(1) if $help;
216 pod2usage( verbose => 2 ) if $man;
218 #####################################################################################################
219 # Key to database information and information for accessing them
220 if (defined $jabaws) {
221 $db_path = "/data/UNIREFdb";
222 $db_entry = "cluster";
227 ## default database used by Jpred
229 database => $db_path . "/uniref90.filt",
230 unfiltered => $db_path . "/uniref90",
233 ## database used during Jnet training
235 database => $db_path . "/training/uniref90.filt",
236 unfiltered => $db_path . "/training/uniref90",
239 ## database used for ported Jpred
241 database => $db_path . "/uniref90.filt",
242 unfiltered => $db_path . "/uniref90",
245 ## cluster-specific path for Jpred
247 database => $db_path . "/uniref90.filt",
248 unfiltered => $db_path . "/uniref90",
251 ## these other DBs are experimental ones used during development.
252 ## check they exist before using them.
253 # generic entry for use with validate_jnet.pl
254 # real db location defined by validate_jnet.pl
256 database => $db_path . "/swall/swall.filt",
257 unfiltered => $db_path . "/swall/swall",
260 # Path to PSIBLAST db
262 database => $db_path . "/3/swall.filt",
263 unfiltered => $db_path . "/3/swall",
267 database => $db_path . "/6/swall.filt",
268 unfiltered => $db_path . "/6/swall",
272 my $dp = $database->{$db_entry}{database};
273 pod2usage("ERROR! Input file with a sequence should be provided with -sequence/-in") unless $infile;
274 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database" ) unless exists $database->{$db_entry};
275 pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
277 if ( defined $prefix ) {
278 unless (defined $outfile) {
279 $outfile = $prefix. ".res.fasta";
282 if (defined $outfile) {
283 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
284 $prefix = $outfile . ".tmp";
286 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
287 $prefix = $infile . ".res";
288 $outfile = $prefix. ".jnet";
292 if ( 1 > $ncpu or 8 < $ncpu ) {
293 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
297 $VERBOSE = 1 if $DEBUG;
300 if (defined $logfile) {
301 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
304 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
305 my ( $key, $value ) = @{$_};
306 defined $value or next;
307 my $par = join( ": ", $key, $value );
308 print "$par\n" if $DEBUG;
309 print $LOG "$par\n" if $LOG;
312 #####################################################################################################
313 if (defined $jabaws) {
314 setup_jpred_env($Bin);
317 my $platform = check_OS();
318 print "JPRED: checking platiform... $platform\n" if $DEBUG;
319 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
321 #####################################################################################################
322 my $query = FASTA::File->new( read_file => $infile );
325 my $psi = PSIBLAST->new;
326 unless ( defined $psiblast && -e $psiblast ) {
327 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
328 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
329 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
330 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
331 my $psi_run = PSIBLAST::Run->new(
333 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
334 path => $psiblastbin,
336 output => "$prefix.blast",
337 matrix => "$prefix.profile",
338 database => $database->{$db_entry}{database},
341 # For reduced databases, get the size of the orginal and use this as
342 # the equivilent DB size
343 if ( $db_entry =~ /^sp_red_/ ) {
344 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
346 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
349 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
350 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
351 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
352 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
354 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
355 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
356 $psi_run->run or die; # CC sub is from PSIBLAST::Run
358 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
359 system("gzip -9f $prefix.blast");
361 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
362 else { $psi->read_file($psiblast) } # CC ditto above
365 #####################################################################################################
366 # Convert the last itteration into a collection of PSISEQ objects
367 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
368 my ( $id, $start, $align ) = @{$_};
373 align => [ split( //, $align ) ]
377 #####################################################################################################
378 ## When there are no PSI-BLAST hits generate an HMM from the query
379 ## and run Jnet against that only.
380 ## Accuracy won't be as good, but at least it's a prediction
382 if ( $predNoHits == 0 ) {
383 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
384 print ">>100% complete\n";
385 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
386 print $LOG ">>100% complete\n" if $LOG;
387 close ($LOG) if $LOG;
390 print ">>50% complete\n";
391 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
392 print "Running HMMer on query...\n" if $VERBOSE;
393 print $LOG ">>50% complete\n" if $LOG;
394 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
395 print $LOG "Running HMMer on query...\n" if $LOG;
397 # copy input query to alignment
398 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
399 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
400 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
401 while (<$ifh>) { print $ofh, $_ }
405 # Temp files required for HMMer
406 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
408 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
409 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
411 # Read in the HMMER file
412 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
414 # Convert from HMMER::Profile to HMMER::Profile::Jnet
415 my $hmmer_jnet = HMMER::Profile::Jnet->new;
416 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
417 $hmmer_jnet->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
418 print ">>70% complete\n";
419 print $LOG ">>70% complete\n" if $LOG;
421 # Run Jnet for the prediction
422 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
423 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
424 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
425 print ">>90% complete\n";
426 print $LOG ">>90% complete\n" if $LOG;
427 print $jnetlog if $VERBOSE;
428 print $LOG $jnetlog if $LOG;
431 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
432 print ">>40% complete\n";
433 print $LOG ">>40% complete\n" if $LOG;
435 #####################################################################################################
436 # Make PSIBLAST truncated alignments the right length
437 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
438 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
439 @seqs = extend( $query, @seqs );
440 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
442 #####################################################################################################
443 # Remove masking from PSIBLAST alignment
444 print "Unmasking the alignments...\n" if $VERBOSE;
445 print $LOG "Unmasking the alignments...\n" if $LOG;
446 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
447 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
448 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
450 #####################################################################################################
451 # Convert the sequences to upper case
452 print "Converting sequences to the same case...\n" if $VERBOSE;
453 print $LOG "Converting sequences to the same case...\n" if $LOG;
454 toupper($_) for @seqs; # CC sub is below
455 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
457 #####################################################################################################
458 # Remove excessive sequences
459 print "Remove excessive sequences...\n" if $VERBOSE;
460 print $LOG "Remove excessive sequences...\n" if $LOG;
461 @seqs = reduce( $MAX_ALIGN, @seqs );
462 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
464 #####################################################################################################
465 # Remove sequences that are too long or too short
466 print "Remove sequences which too long or short...\n" if $VERBOSE;
467 print $LOG "Remove sequences which too long or short...\n" if $LOG;
468 @seqs = del_long_seqs( 50, @seqs );
469 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
471 #####################################################################################################
472 # Remove redundant sequences based upon pairwise identity and OC clustering
473 print "Remove redundant sequences...\n" if $VERBOSE;
474 print $LOG "Remove redundant sequences...\n" if $LOG;
475 @seqs = nonred( $NR_CUT, @seqs );
476 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
478 #####################################################################################################
479 # Check that we haven't got rid of all of the sequences
481 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
482 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
483 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
485 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
487 # Remove gaps in the query sequence and same positions in the alignment
488 print "Removing gaps in the query sequence...\n" if $VERBOSE;
489 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
491 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
493 # Output the alignment for the prediction
494 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
495 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
496 psiseq2fasta( "$prefix.align", @seqs );
498 #####################################################################################################
499 # Equivilent to getpssm script
500 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
501 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
502 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
503 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
504 print ">>50% complete\n";
505 print $LOG ">>50% complete\n" if $LOG;
507 #####################################################################################################
508 # Run HMMER on the sequences
509 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
510 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
511 my $hmmer = hmmer(@seqs); # CC sub is below
512 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
513 print ">>70% complete\n";
514 print $LOG ">>70% complete\n" if $LOG;
516 #####################################################################################################
517 # Run Jnet for the prediction
518 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
519 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
520 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
521 print ">>90% complete\n";
522 print $LOG ">>90% complete\n" if $LOG;
523 print $jnetlog if $VERBOSE;
524 print $LOG $jnetlog if $LOG;
527 unless ( defined $nofinal ) {
528 my $aligfile = $prefix . ".align";
529 my $jnetfile = $prefix . ".jnet";
530 my $jnetfastafile = $prefix . ".jnet.fasta";
531 concise2fasta( $jnetfile, $jnetfastafile );
532 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
533 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
534 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
535 while (<$IN2>) { print $OUT $_; }
536 while (<$IN1>) { print $OUT $_; }
540 unlink $jnetfastafile;
542 if (defined $outfile and $prefix.".jnet" ne $outfile) {
543 rename $prefix.".jnet", $outfile;
547 print ">>100% complete\n";
548 print "Jpred Finished\n";
549 print $LOG ">>100% complete\n" if $LOG;
550 print $LOG "Jpred Finished\n" if $LOG;
551 close ($LOG) if $LOG;
554 #####################################################################################################
556 #####################################################################################################
558 #####################################################################################################
562 =head2 fasta2concise ($infile, $outfile)
564 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
572 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
573 my ( $seq, @seqs, @title );
594 if ( @title != @seqs ) {
595 die("non matching number of titles and sequences!\n");
598 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
599 foreach ( 0 .. $#title ) {
600 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
607 =head2 concise2fasta ($infile, $outfile)
609 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
613 #####################################################################################################
618 my ( @seqs, %seq, @pred, %pred );
621 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
622 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
625 # parse input concise file
626 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
629 my ( $id, $seq ) = split( ":", $_ );
630 if ( defined $id and defined $seq ) { # Check we have proper values
633 if ( $id =~ /;/ ) { # Then it's an alignment
634 @_ = split( ";", $id );
636 $seq{ $_[1] } = $seq;
638 foreach my $v (@var) {
649 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
651 # print out sequences found with PSI-BLAST
653 $seq{$_} =~ s/(.{72})/$1\n/g;
654 print $OUT ">$_\n$seq{$_}\n";
657 # print out predictions
658 foreach my $p (@pred) {
659 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
660 $pred{$p} =~ s/B/E/g;
661 $pred{$p} =~ s/G/H/g;
664 $pred{$p} =~ s/E/B/g;
666 $pred{$p} =~ s/(.{72})/$1\n/g;
667 print $OUT ">$p\n$pred{$p}\n";
674 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
676 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
681 #####################################################################################################
682 sub remove_seq_masks {
686 # Only bother if we have a sequence which has been masked
687 return unless grep { uc $_ eq 'X' } @{ $seq->align };
689 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
690 my $searchdb = $database->{$db_entry}{unfiltered};
691 $start--; # We need this to be zero based
693 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
694 my @seqs = $f->get_entries;
695 my $db = shift @seqs;
698 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
700 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
703 my @db_seq = @{ $db->seq };
705 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
707 for ( 0 .. $#align ) {
708 next if $align[$_] =~ /$GAP/;
710 if ( $align[$_] eq 'X' ) {
711 unless ( defined $db_seq[ $i + $start ] ) {
712 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
714 $align[$_] = $db_seq[ $i + $start ];
718 $seq->align( \@align );
721 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
723 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
725 Equivilent to reduce script.
729 #####################################################################################################
731 my ( $max, @seqs ) = @_;
733 my $num = int( @seqs / $max );
734 my @res = shift @seqs;
736 # Don't bother if we have less than the required sequences
737 if ( @seqs < $max or 0 == $num ) {
741 for ( 0 .. $#seqs ) {
742 push @res, $seqs[$_] if ( 0 == $_ % $num );
748 =head2 nonred($cutoff, @PSISEQS);
750 Removes redundant sequences at $cutoff level of sequence identity.
752 Equivilent to nonred script.
756 #####################################################################################################
758 my ( $cutoff, @seqs ) = @_;
760 # Run pairwise and then OC and remove similar sequences
761 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
762 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
764 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
768 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
769 my $fasta = FASTA::File->new;
771 my $f = FASTA->new( id => $_->id );
772 $f->seq( @{ $_->align } );
773 $fasta->add_entries($f);
777 my $pair = Pairwise->new( path => $pairwise );
778 my $foo = join "\n", $pair->run($fasta) or die $!;
781 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
782 $ocres->read_string( join "", $ocres->run($foo) );
784 # Now found those entries that are unrelated
786 for ( $ocres->get_groups ) {
787 my ( $group, $score, $size, @labels ) = @{$_};
789 # Keep the QUERY from the cluster, otherwise just get onelt
790 if ( grep { /QUERY/ } @labels ) {
792 # Make sure the query stays at the start
793 unshift @keep, "QUERY";
796 push @keep, shift @labels;
799 push @keep, $ocres->get_unclustered;
801 # Return the matching entries.
802 # We use the temporay to mimic the selection of sequences in the nonred
803 # script, which took the sequences in the order they occured in the file.
807 if ( grep { $_ =~ /^$label$/ } @keep ) {
808 push @filtered_res, $_;
812 return @filtered_res;
815 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
817 Removes those sequences that are more than $percentage longer or shorter than the first
818 sequence in @seqs. @seqs is an array of PSISEQ objects.
820 Equivilent to trim_seqs script.
824 #####################################################################################################
826 my ( $factor, @seqs ) = @_;
828 # Keep the query sequence (we assume query is the 1st FASTA record)
829 my $query = shift @seqs;
832 # Find the length of the query sequence without any gaps
833 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
835 # Calculate the upper and lower allowed bounds
836 my ( $upper, $lower );
838 my $bounds = ( $length / 100 ) * $factor;
839 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
842 # Now try each sequence and see how long they are
844 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
845 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
851 =head2 toupper ($PSISEQ)
853 Converts sequence held in PSISEQ object to uppercase.
857 #####################################################################################################
859 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
862 =head2 degap($PSISEQ_QUERY, @PSISEQS)
864 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
866 Equivilent to fasta2jnet script.
870 #####################################################################################################
874 # Find where the gaps in the query sequence are
877 for ( @{ $seqs[0]->align } ) {
878 push @gaps, $i if $_ !~ /$GAP/;
884 # Remove the gaps in all the sequences.
885 # Derefences the array reference and takes a slice inside the method call
886 # arguments, then coerce back to an array ref.
888 $_->align( [ @{ $_->align }[@gaps] ] );
892 =head2 getfreq($filename, @PSISEQS)
894 Creates a PSIBLAST like profile and writes it to the $filename.
896 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
897 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
898 but it's *not* the same.
902 #####################################################################################################
904 my ( $fn, @seqs ) = @_;
906 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
908 # Create a PSIBLAST like profile from the alignment
909 # This doesn't greate the same result as old Jpred, I think this is because
910 # differences in the input between old and new
913 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
914 # my @profile = profile(
915 # map { join "", @{ $_->seq } } $f->get_entries
918 my @profile = profile( map { join "", @{ $_->align } } @seqs );
920 open my $fh, ">$fn" or die "JPRED: $fn: $!";
921 print $fh join( " ", @{$_} ), "\n" for @profile;
924 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
926 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
928 Equivilent to gethmm script.
932 #####################################################################################################
936 # Temp files required
937 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
939 # Create the FASTA file
940 psiseq2fasta( $tmp_fasta, @seqs );
943 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
944 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
946 # Read in the HMMER file
947 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
949 # Convert from HMMER::Profile to HMMER::Profile::Jnet
950 my $hmmer_jnet = HMMER::Profile::Jnet->new;
951 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
956 =head2 jnet($hmmer, $out, $pssm )
958 Runs Jnet. Pass the paths of the filenames to do the prediction on
962 #####################################################################################################
964 my ( $hmmer, $outfile, $pssm ) = @_;
967 # run Jnet prediction with all the profile data
968 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
971 # run Jnet prediction with only HMMer and alignment data
972 system("$jnet --concise --hmmer $hmmer > $outfile");
974 check( "jnet", $? ) or exit 1;
977 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
979 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
986 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
987 print $OUT "\n".$res;
992 =head1 psiseq2fasta($path, @PSISEQ)
994 Writes a FASTA file given an array of PSISEQ objects.
998 #####################################################################################################
1000 my ( $fn, @seqs ) = @_;
1002 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1003 confess "JPRED: not passed PSISEQs" unless @seqs;
1005 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1007 # Changed from using FASTA::File object due to seg faulting on some large,
1008 # long alignments, presumably due to running out of memory.
1009 # my $fasta = FASTA::File->new;
1010 # $fasta->add_entries(
1012 # my $f = FASTA->new(id => $_->id);
1013 # $f->seq(@{$_->align});
1017 # $fasta->write_file($fn);
1021 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1022 : ( open $fh, ">$fn" or die "JPRED: $!" );
1025 print $fh ">", $_->id, "\n";
1026 my $seq = join( "", @{ $_->align } );
1027 $seq =~ s/(.{72})/$1\n/g;
1029 print $fh $seq . "\n";
1034 =head1 @PSISEQ = fasta2psiseq($path)
1036 Reads in a FASTA file and returns an array of PSISEQ objects.
1040 #####################################################################################################
1046 ? FASTA::File->new( read_gzip_file => $fn )
1047 : FASTA::File->new( read_file => $fn );
1048 $fasta or die "JPRED: $!";
1051 for ( $fasta->get_entries ) {
1052 my $seq = PSISEQ->new;
1054 $seq->align( [ @{ $_->seq } ] );
1061 #####################################################################################################
1062 sub fasta2psiseq_lowmemory {
1063 my $filename = shift;
1067 $filename =~ /\.gz$/
1068 ? ( open $fh, "gzip -dc $filename|" or die $! )
1069 : ( open $fh, $filename or die $! );
1076 $seq =~ s/ //g if defined $seq;
1078 if ( $id and $seq ) {
1079 my $psi = PSISEQ->new( id => $id );
1080 $psi->align( [ split //, $seq ] );
1091 if ( $id and $seq ) {
1092 my $psi = PSISEQ->new( id => $id );
1100 =head1 extend($FASTA::File, @PSISEQ)
1102 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1103 against the query, this function extends the alignment so that all of the query is
1104 present in the alignment. The other sequences are extended with gap characters.
1108 #####################################################################################################
1110 my ( $query, @seqs ) = @_;
1112 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1113 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1115 # Get the query sequence
1116 my $q_query = $query->get_entry(0);
1118 # Get the query sequence from the PSIBLAST alignment
1119 my $a_query = shift @seqs;
1121 # The gap size required to expand the alignment
1122 my ( $start_gap_length, $end_gap_length );
1125 # Make handling the sequence easier
1126 my $q_seq = join "", @{ [ $q_query->seq ] };
1127 my $a_seq = join "", @{ $a_query->align };
1132 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1134 $start_gap_length = index( $q_seq, $a_seq );
1135 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1137 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1140 # Add the gaps to the end of the alignments
1142 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1145 # Put the query sequence back
1150 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1151 @{ $a_query->align },
1152 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1159 =head1 $int = fasta_seq_length($path)
1161 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1165 #####################################################################################################
1166 sub fasta_seq_length {
1168 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1171 local $_, $/ = "\n";
1181 =head1 log(@messages)
1183 Report messages to STDERR