5 jpred - Secondary structure prediction program
9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
24 The path to the sequence file (in FASTA, MSF, or BLC format)
28 The input file is a FASTA file with one sequence only.
30 =item -output <FILEPREFIX>
32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
34 =item -outfile <FILE2>
36 An output file, by default it is undefined and the -output option is working
38 =item -logfile <FILE3>
40 Logging file, by default it is undefined and logging switched off
44 Path to the uniref database used for PSI-BLAST querying. default value: .
46 =item -dbname database
48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
49 Default value: uniref90
53 Path to a PSIBLAST output file.
57 Number of CPU used by jpred.pl. Maximum value is 8
62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
67 The option untoggles this step and stops jpred at the concise file generation
71 Redefines some basic variables in order to use the script within JABAWS
75 Verbose mode. Print more information while running jpred.
79 Debug mode. Print debugging information while running jpred.
83 Gives help on the programs usage.
87 Displays this man page.
93 Can't cope with gaps in the query sequence.
95 Doesn't accept alignments.
97 If you find any others please contact authors.
101 Jonathan Barber <jon@compbio.dundee.ac.uk>
102 Chris Cole <christian@cole.name>
103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
107 ## TODO check for gaps in query sequence
108 ## check for disallowed chars in sequence
118 # path to Jpred modules
119 use FindBin qw($Bin);
122 # internal jpred modules
124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
126 use HMMER::Profile::Jnet;
135 use Utils qw(profile);
138 our $VERSION = '3.0.2';
139 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
140 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
142 # Gap characters in sequences
145 #####################################################################################################
146 # Light object to hold sequence information, light to prevent memory overload
147 # for big search results. Even so this is quite memory intensive, I can't see
148 # how to reduce the size further.
154 my ( $class, %args ) = @_;
157 for ( keys %args ) { $self->$_( $args{$_} ) }
174 if ( defined $_[1] ) {
175 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
177 $_[0]->[2] = join( "", @{ $_[1] } );
179 return [ split( //, $_[0]->[2] ) ];
184 #####################################################################################################
195 my $ncpu = 1; # number of CPUs for psiblast calculations
196 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
197 my $db_path = "/homes/www-jpred/databases";
198 my $db_entry = "cluster";
201 my ( $help, $man, $DEBUG, $VERBOSE );
205 "output=s" => \$prefix,
206 "outfile=s" => \$outfile,
207 "logfile=s" => \$logfile,
208 "psi=s" => \$psiblast,
209 "dbname=s" => \$db_entry,
210 "dbpath=s" => \$db_path,
212 "pred-nohits" => \$predNoHits,
213 "no-final" => \$nofinal,
215 "jabaws" => \$jabaws,
220 "verbose" => \$VERBOSE
222 pod2usage(1) if $help;
223 pod2usage( verbose => 2 ) if $man;
225 $goal = "seq" if ( defined $seqgoal );
227 #####################################################################################################
228 # Key to database information and information for accessing them
231 ## default database used by Jpred
233 database => $db_path . "/uniref90.filt",
234 unfiltered => $db_path . "/uniref90",
237 ## database used during Jnet training
239 database => $db_path . "/training/uniref90.filt",
240 unfiltered => $db_path . "/training/uniref90",
243 ## database used for ported Jpred
245 database => $db_path . "/uniref90.filt",
246 unfiltered => $db_path . "/uniref90",
249 ## cluster-specific path for Jpred
251 database => $db_path . "/uniref90.filt",
252 unfiltered => $db_path . "/uniref90",
255 ## these other DBs are experimental ones used during development.
256 ## check they exist before using them.
257 # generic entry for use with validate_jnet.pl
258 # real db location defined by validate_jnet.pl
260 database => $db_path . "/swall/swall.filt",
261 unfiltered => $db_path . "/swall/swall",
264 # Path to PSIBLAST db
266 database => $db_path . "/3/swall.filt",
267 unfiltered => $db_path . "/3/swall",
271 database => $db_path . "/6/swall.filt",
272 unfiltered => $db_path . "/6/swall",
276 my $dpf = $database->{$db_entry}{database}.'.pal';
277 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
278 pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile;
279 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry};
280 pod2usage("ERROR! UNIREF filtered database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
281 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
283 #####################################################################################################
284 # lots of preparation steps
285 if ( defined $prefix ) {
286 unless ( defined $outfile ) {
287 $outfile = $prefix . ".res.fasta";
290 if ( defined $outfile ) {
291 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
292 $prefix = $outfile . ".tmp";
294 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
295 $prefix = $infile . ".res";
296 $outfile = $prefix . ".jnet";
300 if ( 1 > $ncpu or 8 < $ncpu ) {
301 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
305 $VERBOSE = 1 if $DEBUG;
308 if ( defined $logfile ) {
309 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
312 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
313 my ( $key, $value ) = @{$_};
314 defined $value or next;
315 my $par = join( ": ", $key, $value );
316 print "$par\n" if $DEBUG;
317 print $LOG "$par\n" if $LOG;
320 if ( defined $jabaws ) {
321 setup_jpred_env($Bin);
324 my $platform = check_OS();
325 print "JPRED: checking platiform... $platform\n" if $DEBUG;
326 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
328 #####################################################################################################
329 # check input file format
330 if ( 'seq' eq $goal ) {
332 if ( 1 != check_FASTA_format($infile) ) {
333 die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
336 my $nseq = check_FASTA_format($infile);
340 die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n";
342 } elsif ( 0 < check_MSF_format($infile) ) {
344 } elsif ( 0 < check_BLC_format($infile) ) {
347 die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
350 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
352 #####################################################################################################
353 if ( 'seq' eq $goal ) {
354 my $query = FASTA::File->new( read_file => $infile );
356 my $psi = PSIBLAST->new;
357 unless ( defined $psiblast && -e $psiblast ) {
358 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
359 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
360 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
361 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
362 my $psi_run = PSIBLAST::Run->new(
364 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
365 path => $psiblastbin,
367 output => "$prefix.blast",
368 matrix => "$prefix.profile",
369 database => $database->{$db_entry}{database},
372 # For reduced databases, get the size of the orginal and use this as
373 # the equivilent DB size
374 if ( $db_entry =~ /^sp_red_/ ) {
375 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
377 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
380 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
381 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
382 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
383 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
385 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
386 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
387 $psi_run->run or die; # CC sub is from PSIBLAST::Run
389 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
390 system("gzip -9f $prefix.blast");
392 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
393 else { $psi->read_file($psiblast) } # CC ditto above
396 #####################################################################################################
397 # Convert the last itteration into a collection of PSISEQ objects
398 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
399 my ( $id, $start, $align ) = @{$_};
404 align => [ split( //, $align ) ]
408 #####################################################################################################
409 # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
410 # Accuracy won't be as good, but at least it's a prediction
412 if ( $predNoHits == 0 ) {
413 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
414 print ">>100% complete\n";
415 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
416 print $LOG ">>100% complete\n" if $LOG;
420 print ">>50% complete\n";
421 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
422 print "Running HMMer on query...\n" if $VERBOSE;
423 print $LOG ">>50% complete\n" if $LOG;
424 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
425 print $LOG "Running HMMer on query...\n" if $LOG;
427 # copy input query to alignment
428 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
429 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
430 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
431 while (<$ifh>) { print $ofh, $_ }
435 # Temp files required for HMMer
436 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
437 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
438 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
439 unlink $hmmbuild_out;
441 # Read in the HMMER file
442 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
444 # Convert from HMMER::Profile to HMMER::Profile::Jnet
445 my $hmmer_jnet = HMMER::Profile::Jnet->new;
446 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
447 $hmmer_jnet->write_file("$prefix.hmm");
448 print ">>70% complete\n";
449 print $LOG ">>70% complete\n" if $LOG;
451 # Run Jnet for the prediction
452 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
453 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
454 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
455 print ">>90% complete\n";
456 print $LOG ">>90% complete\n" if $LOG;
457 print $jnetlog if $VERBOSE;
458 print $LOG $jnetlog if $LOG;
461 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
462 print ">>40% complete\n";
463 print $LOG ">>40% complete\n" if $LOG;
465 #####################################################################################################
466 # Make PSIBLAST truncated alignments the right length
467 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
468 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
469 @seqs = extend( $query, @seqs );
470 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
472 #####################################################################################################
473 # Remove masking from PSIBLAST alignment
474 print "Unmasking the alignments...\n" if $VERBOSE;
475 print $LOG "Unmasking the alignments...\n" if $LOG;
476 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
477 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
478 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
480 #####################################################################################################
481 # Convert the sequences to upper case
482 print "Converting sequences to the same case...\n" if $VERBOSE;
483 print $LOG "Converting sequences to the same case...\n" if $LOG;
484 toupper($_) for @seqs; # CC sub is below
485 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
487 #####################################################################################################
488 # Remove excessive sequences
489 print "Remove excessive sequences...\n" if $VERBOSE;
490 print $LOG "Remove excessive sequences...\n" if $LOG;
491 @seqs = reduce( $MAX_ALIGN, @seqs );
492 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
494 #####################################################################################################
495 # Remove sequences that are too long or too short
496 print "Remove sequences which too long or short...\n" if $VERBOSE;
497 print $LOG "Remove sequences which too long or short...\n" if $LOG;
498 @seqs = del_long_seqs( 50, @seqs );
499 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
501 #####################################################################################################
502 # Remove redundant sequences based upon pairwise identity and OC clustering
503 print "Remove redundant sequences...\n" if $VERBOSE;
504 print $LOG "Remove redundant sequences...\n" if $LOG;
505 @seqs = nonred( $NR_CUT, @seqs );
506 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
508 #####################################################################################################
509 # Check that we haven't got rid of all of the sequences
511 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
512 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
513 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
515 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
517 # Remove gaps in the query sequence and same positions in the alignment
518 print "Removing gaps in the query sequence...\n" if $VERBOSE;
519 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
521 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
523 # Output the alignment for the prediction
524 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
525 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
526 psiseq2fasta( "$prefix.align", @seqs );
528 #####################################################################################################
529 # Equivilent to getpssm script
530 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
531 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
532 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
533 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
534 print ">>50% complete\n";
535 print $LOG ">>50% complete\n" if $LOG;
537 #####################################################################################################
538 # Run HMMER on the sequences
539 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
540 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
541 my $hmmer = hmmer(@seqs); # CC sub is below
542 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
543 print ">>70% complete\n";
544 print $LOG ">>70% complete\n" if $LOG;
545 #####################################################################################################
546 # Run Jnet for the prediction
547 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
548 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
549 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
550 print ">>90% complete\n";
551 print $LOG ">>90% complete\n" if $LOG;
552 print $jnetlog if $VERBOSE;
553 print $LOG $jnetlog if $LOG;
556 if ( 'fasta' eq $format ) {
557 print "Read FASTA file\n";
558 my $hmmer1 = hmm_for_align($infile);
559 $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
560 } elsif ( 'msf' eq $format ) {
561 msf2fasta( $infile, $infastafile );
562 my $hmmer2 = hmm_for_align($infastafile);
563 $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
564 } elsif ( 'blc' eq $format ) {
565 my $tmpfile = File::Temp->new->filename;
566 blc2msf( $infile, $tmpfile );
567 msf2fasta( $infile, $infastafile );
568 my $hmmer3 = hmm_for_align($infastafile);
569 $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
571 die "ERROR! unknown input format '$format'. exit...\n";
573 print ">>70% complete\n";
574 print $LOG ">>70% complete\n" if $LOG;
575 #####################################################################################################
576 # Run Jnet for the prediction
577 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
578 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
579 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below
580 print ">>90% complete\n";
581 print $LOG ">>90% complete\n" if $LOG;
582 print $jnetlog if $VERBOSE;
583 print $LOG $jnetlog if $LOG;
586 unless ( defined $nofinal ) {
587 my $aligfile = $prefix . ".align";
588 my $jnetfile = $prefix . ".jnet";
589 my $jnetfastafile = $prefix . ".jnet.fasta";
590 $aligfile = $infile if ( 'fasta' eq $format );
591 $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
592 concise2fasta( $jnetfile, $jnetfastafile );
593 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
594 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
595 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
596 while (<$IN2>) { print $OUT $_; }
597 while (<$IN1>) { print $OUT $_; }
601 unlink $jnetfastafile;
602 my $seqs = FASTA::File->new( read_file => $outfile );
603 open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
607 if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
608 rename $prefix . ".jnet", $outfile;
612 print ">>100% complete\n";
613 print "Jpred Finished\n";
614 print $LOG ">>100% complete\n" if $LOG;
615 print $LOG "Jpred Finished\n" if $LOG;
619 #####################################################################################################
621 #####################################################################################################
622 sub check_FASTA_format {
625 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
626 my $check_first_line = 1;
630 if ($check_first_line) {
631 return 0 unless (/^>/);
632 $check_first_line = 0;
637 my ( $id, @seqs ) = split /\n/, $_;
638 return 0 unless ( defined $id or @seqs );
639 my $seq = join( "", @seqs );
640 return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
647 #####################################################################################################
648 sub check_MSF_format {
651 system("$readseq -fMSF -a -p < $infile > /dev/null");
656 #####################################################################################################
657 sub check_BLC_format {
660 my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
661 open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
662 print $fh "silent_mode\nblock_file $infile\n";
663 print $fh "output_file /dev/null\nmax_nseq 2000\n";
664 print $fh "pir_save $tmpfile2\nsetup\n";
668 system("$alscript -s -f $tmpfile1");
670 system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
677 ############################################################################################################################################
678 # convertor BLC -> MSF
684 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
690 $sequence =~ s/~/\./g;
692 ## check for non-unique sequence names - readseq has problems with these
693 my @lines = split /\n/, $sequence;
695 foreach my $line (@lines) {
696 if ( $line =~ /Name:\s+(\S+)/ ) {
697 die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
702 my $tmpfile1 = File::Temp->new->filename;
703 my $tmpfile2 = File::Temp->new->filename;
704 open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
705 print $FILE $sequence;
709 system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
710 die "Reformatting of $infile failed. exit...\n" if ($?);
714 system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
715 die "Reformatting of $infile failed. exit...\n" if ($?);
718 die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
721 ############################################################################################################################################
722 # convertor BLC -> MSF
724 my ( $in, $out ) = @_;
726 my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
728 open my $fh, ">$tmp" or die "$tmp: $!\n";
729 print $fh "silent_mode\nblock_file $infile\n";
730 print $fh "output_file /dev/null\nmax_nseq 2000\n";
731 print $fh "pir_save $tmp_pir\nsetup\n";
735 system("$alscript -s -f $tmp");
736 die "Reformatting of $infile failed. exit...\n" if ($?);
737 system("$readseq -f=MSF -o=$out -a $tmp_pir");
743 #####################################################################################################
747 =head2 fasta2concise ($infile, $outfile)
749 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
757 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
758 my ( $seq, @seqs, @title );
779 if ( @title != @seqs ) {
780 die("non matching number of titles and sequences!\n");
783 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
784 foreach ( 0 .. $#title ) {
785 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
792 =head2 concise2fasta ($infile, $outfile)
794 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
798 #####################################################################################################
803 my ( @seqs, %seq, @pred, %pred );
806 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
807 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
810 # parse input concise file
811 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
814 my ( $id, $seq ) = split( ":", $_ );
815 if ( defined $id and defined $seq ) { # Check we have proper values
818 if ( $id =~ /;/ ) { # Then it's an alignment
819 @_ = split( ";", $id );
821 $seq{ $_[1] } = $seq;
823 foreach my $v (@var) {
834 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
836 # print out sequences found with PSI-BLAST
838 $seq{$_} =~ s/(.{72})/$1\n/g;
839 print $OUT ">$_\n$seq{$_}\n";
842 # print out predictions
843 foreach my $p (@pred) {
844 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
845 $pred{$p} =~ s/B/E/g;
846 $pred{$p} =~ s/G/H/g;
849 $pred{$p} =~ s/E/B/g;
851 $pred{$p} =~ s/(.{72})/$1\n/g;
852 print $OUT ">$p\n$pred{$p}\n";
859 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
861 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
866 #####################################################################################################
867 sub remove_seq_masks {
871 # Only bother if we have a sequence which has been masked
872 return unless grep { uc $_ eq 'X' } @{ $seq->align };
874 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
875 my $searchdb = $database->{$db_entry}{unfiltered};
876 $start--; # We need this to be zero based
878 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
879 my @seqs = $f->get_entries;
880 my $db = shift @seqs;
883 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
885 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
888 my @db_seq = @{ $db->seq };
890 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
892 for ( 0 .. $#align ) {
893 next if $align[$_] =~ /$GAP/;
895 if ( $align[$_] eq 'X' ) {
896 unless ( defined $db_seq[ $i + $start ] ) {
897 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
899 $align[$_] = $db_seq[ $i + $start ];
903 $seq->align( \@align );
906 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
908 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
910 Equivilent to reduce script.
914 #####################################################################################################
916 my ( $max, @seqs ) = @_;
918 my $num = int( @seqs / $max );
919 my @res = shift @seqs;
921 # Don't bother if we have less than the required sequences
922 if ( @seqs < $max or 0 == $num ) {
926 for ( 0 .. $#seqs ) {
927 push @res, $seqs[$_] if ( 0 == $_ % $num );
933 =head2 nonred($cutoff, @PSISEQS);
935 Removes redundant sequences at $cutoff level of sequence identity.
937 Equivilent to nonred script.
941 #####################################################################################################
943 my ( $cutoff, @seqs ) = @_;
945 # Run pairwise and then OC and remove similar sequences
946 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
947 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
949 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
953 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
954 my $fasta = FASTA::File->new;
956 my $f = FASTA->new( id => $_->id );
957 $f->seq( @{ $_->align } );
958 $fasta->add_entries($f);
962 my $pair = Pairwise->new( path => $pairwise );
963 my $foo = join "\n", $pair->run($fasta) or die $!;
966 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
967 $ocres->read_string( join "", $ocres->run($foo) );
969 # Now found those entries that are unrelated
971 for ( $ocres->get_groups ) {
972 my ( $group, $score, $size, @labels ) = @{$_};
974 # Keep the QUERY from the cluster, otherwise just get onelt
975 if ( grep { /QUERY/ } @labels ) {
977 # Make sure the query stays at the start
978 unshift @keep, "QUERY";
981 push @keep, shift @labels;
984 push @keep, $ocres->get_unclustered;
986 # Return the matching entries.
987 # We use the temporay to mimic the selection of sequences in the nonred
988 # script, which took the sequences in the order they occured in the file.
992 if ( grep { $_ =~ /^$label$/ } @keep ) {
993 push @filtered_res, $_;
997 return @filtered_res;
1000 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1002 Removes those sequences that are more than $percentage longer or shorter than the first
1003 sequence in @seqs. @seqs is an array of PSISEQ objects.
1005 Equivilent to trim_seqs script.
1009 #####################################################################################################
1011 my ( $factor, @seqs ) = @_;
1013 # Keep the query sequence (we assume query is the 1st FASTA record)
1014 my $query = shift @seqs;
1017 # Find the length of the query sequence without any gaps
1018 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1020 # Calculate the upper and lower allowed bounds
1021 my ( $upper, $lower );
1023 my $bounds = ( $length / 100 ) * $factor;
1024 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1027 # Now try each sequence and see how long they are
1029 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1030 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1036 =head2 toupper ($PSISEQ)
1038 Converts sequence held in PSISEQ object to uppercase.
1042 #####################################################################################################
1044 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1047 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1049 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1051 Equivilent to fasta2jnet script.
1055 #####################################################################################################
1059 # Find where the gaps in the query sequence are
1062 for ( @{ $seqs[0]->align } ) {
1063 push @gaps, $i if $_ !~ /$GAP/;
1067 return unless @gaps;
1069 # Remove the gaps in all the sequences.
1070 # Derefences the array reference and takes a slice inside the method call
1071 # arguments, then coerce back to an array ref.
1073 $_->align( [ @{ $_->align }[@gaps] ] );
1077 =head2 getfreq($filename, @PSISEQS)
1079 Creates a PSIBLAST like profile and writes it to the $filename.
1081 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1082 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1083 but it's *not* the same.
1087 #####################################################################################################
1089 my ( $fn, @seqs ) = @_;
1091 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1093 # Create a PSIBLAST like profile from the alignment
1094 # This doesn't greate the same result as old Jpred, I think this is because
1095 # differences in the input between old and new
1098 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1099 # my @profile = profile(
1100 # map { join "", @{ $_->seq } } $f->get_entries
1103 my @profile = profile( map { join "", @{ $_->align } } @seqs );
1105 open my $fh, ">$fn" or die "JPRED: $fn: $!";
1106 print $fh join( " ", @{$_} ), "\n" for @profile;
1109 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1111 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1113 Equivilent to gethmm script.
1117 #####################################################################################################
1121 # Temp files required
1122 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1124 # Create the FASTA file
1125 psiseq2fasta( $tmp_fasta, @seqs );
1128 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1129 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1131 # Read in the HMMER file
1132 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1134 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1135 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1136 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1141 =head2 jnet($hmmer, $out, $pssm )
1143 Runs Jnet. Pass the paths of the filenames to do the prediction on
1147 #################################################################################################################################################
1149 =head2 get_hmm($alignFile, $name)
1151 Adapted from the hmmer() function in Jpred as written by JDB.
1153 Generates an HMMer profile output compatible with Jnet.
1155 Equivilent to gethmm script.
1160 my $fastafile = shift;
1162 # Temp files required
1163 print "hmm_for_align: fastafile = $fastafile\n";
1164 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1166 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1167 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1169 # Read in the HMMER file
1170 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1172 # Read in the HMMER file
1173 my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1175 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1176 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1177 $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1182 #####################################################################################################
1184 my ( $hmmer, $outfile, $pssm ) = @_;
1187 # run Jnet prediction with all the profile data
1188 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1191 # run Jnet prediction with only HMMer and alignment data
1192 system("$jnet --concise --hmmer $hmmer > $outfile");
1194 check( "jnet", $? ) or exit 1;
1197 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1199 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1206 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1207 print $OUT "\n" . $res;
1212 =head1 psiseq2fasta($path, @PSISEQ)
1214 Writes a FASTA file given an array of PSISEQ objects.
1218 #####################################################################################################
1220 my ( $fn, @seqs ) = @_;
1222 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1223 confess "JPRED: not passed PSISEQs" unless @seqs;
1225 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1227 # Changed from using FASTA::File object due to seg faulting on some large,
1228 # long alignments, presumably due to running out of memory.
1229 # my $fasta = FASTA::File->new;
1230 # $fasta->add_entries(
1232 # my $f = FASTA->new(id => $_->id);
1233 # $f->seq(@{$_->align});
1237 # $fasta->write_file($fn);
1241 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1242 : ( open $fh, ">$fn" or die "JPRED: $!" );
1245 print $fh ">", $_->id, "\n";
1246 my $seq = join( "", @{ $_->align } );
1247 $seq =~ s/(.{72})/$1\n/g;
1249 print $fh $seq . "\n";
1254 =head1 @PSISEQ = fasta2psiseq($path)
1256 Reads in a FASTA file and returns an array of PSISEQ objects.
1260 #####################################################################################################
1266 ? FASTA::File->new( read_gzip_file => $fn )
1267 : FASTA::File->new( read_file => $fn );
1268 $fasta or die "JPRED: $!";
1271 for ( $fasta->get_entries ) {
1272 my $seq = PSISEQ->new;
1274 $seq->align( [ @{ $_->seq } ] );
1281 #####################################################################################################
1282 sub fasta2psiseq_lowmemory {
1283 my $filename = shift;
1287 $filename =~ /\.gz$/
1288 ? ( open $fh, "gzip -dc $filename|" or die $! )
1289 : ( open $fh, $filename or die $! );
1296 $seq =~ s/ //g if defined $seq;
1298 if ( $id and $seq ) {
1299 my $psi = PSISEQ->new( id => $id );
1300 $psi->align( [ split //, $seq ] );
1311 if ( $id and $seq ) {
1312 my $psi = PSISEQ->new( id => $id );
1320 #####################################################################################################
1322 =head1 extend($FASTA::File, @PSISEQ)
1324 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1325 against the query, this function extends the alignment so that all of the query is
1326 present in the alignment. The other sequences are extended with gap characters.
1331 my ( $query, @seqs ) = @_;
1333 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1334 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1336 # Get the query sequence
1337 my $q_query = $query->get_entry(0);
1339 # Get the query sequence from the PSIBLAST alignment
1340 my $a_query = shift @seqs;
1342 # The gap size required to expand the alignment
1343 my ( $start_gap_length, $end_gap_length );
1346 # Make handling the sequence easier
1347 my $q_seq = join "", @{ [ $q_query->seq ] };
1348 my $a_seq = join "", @{ $a_query->align };
1353 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1355 $start_gap_length = index( $q_seq, $a_seq );
1356 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1358 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1361 # Add the gaps to the end of the alignments
1363 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1366 # Put the query sequence back
1371 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1372 @{ $a_query->align },
1373 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1380 #####################################################################################################
1382 =head1 $int = fasta_seq_length($path)
1384 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1388 sub fasta_seq_length {
1390 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1393 local $_, $/ = "\n";