5 jpred - Secondary structure prediction program
9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
24 The path to the sequence file (in FASTA, MSF, or BLC format)
28 The input file is a FASTA file with one sequence only.
30 =item -output <FILEPREFIX>
32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
34 =item -outfile <FILE2>
36 An output file, by default it is undefined and the -output option is working
38 =item -logfile <FILE3>
40 Logging file, by default it is undefined and logging switched off
44 Path to the uniref database used for PSI-BLAST querying. default value: .
46 =item -dbname database
48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
49 Default value: uniref90
53 Path to a PSIBLAST output file.
57 Number of CPU used by jpred.pl. Maximum value is 8
62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
67 The option untoggles this step and stops jpred at the concise file generation
71 Redefined basic variable in order to use the script within JABAWS
75 Verbose mode. Print more information while running jpred.
79 Debug mode. Print debugging information while running jpred.
83 Gives help on the programs usage.
87 Displays this man page.
93 Can't cope with gaps in the query sequence.
95 Doesn't accept alignments.
97 If you find any others please contact authors.
101 Jonathan Barber <jon@compbio.dundee.ac.uk>
102 Chris Cole <christian@cole.name>
103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
107 ## TODO check for gaps in query sequence
108 ## check for disallowed chars in sequence
118 # path to Jpred modules
119 use FindBin qw($Bin);
122 # internal jpred modules
124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
126 use HMMER::Profile::Jnet;
135 use Utils qw(profile);
138 our $VERSION = '3.0.2';
139 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
140 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
142 # Gap characters in sequences
145 #####################################################################################################
146 # Light object to hold sequence information, light to prevent memory overload
147 # for big search results. Even so this is quite memory intensive, I can't see
148 # how to reduce the size further.
154 my ( $class, %args ) = @_;
157 for ( keys %args ) { $self->$_( $args{$_} ) }
174 if ( defined $_[1] ) {
175 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
177 $_[0]->[2] = join( "", @{ $_[1] } );
179 return [ split( //, $_[0]->[2] ) ];
184 #####################################################################################################
195 my $ncpu = 1; # number of CPUs for psiblast calculations
196 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
197 my $db_path = "/homes/www-jpred/databases";
198 my $db_entry = "cluster";
201 my ( $help, $man, $DEBUG, $VERBOSE );
205 "output=s" => \$prefix,
206 "outfile=s" => \$outfile,
207 "logfile=s" => \$logfile,
208 "psi=s" => \$psiblast,
209 "dbname=s" => \$db_entry,
210 "dbpath=s" => \$db_path,
212 "pred-nohits" => \$predNoHits,
213 "no-final" => \$nofinal,
215 "jabaws" => \$jabaws,
220 "verbose" => \$VERBOSE
222 pod2usage(1) if $help;
223 pod2usage( verbose => 2 ) if $man;
225 $goal = "seq" if ( defined $seqgoal );
227 #####################################################################################################
228 # Key to database information and information for accessing them
229 if ( defined $jabaws ) {
230 $db_path = "/homes/www-jpred/databases";
231 $db_entry = "cluster";
236 ## default database used by Jpred
238 database => $db_path . "/uniref90.filt",
239 unfiltered => $db_path . "/uniref90",
242 ## database used during Jnet training
244 database => $db_path . "/training/uniref90.filt",
245 unfiltered => $db_path . "/training/uniref90",
248 ## database used for ported Jpred
250 database => $db_path . "/uniref90.filt",
251 unfiltered => $db_path . "/uniref90",
254 ## cluster-specific path for Jpred
256 database => $db_path . "/uniref90.filt",
257 unfiltered => $db_path . "/uniref90",
260 ## these other DBs are experimental ones used during development.
261 ## check they exist before using them.
262 # generic entry for use with validate_jnet.pl
263 # real db location defined by validate_jnet.pl
265 database => $db_path . "/swall/swall.filt",
266 unfiltered => $db_path . "/swall/swall",
269 # Path to PSIBLAST db
271 database => $db_path . "/3/swall.filt",
272 unfiltered => $db_path . "/3/swall",
276 database => $db_path . "/6/swall.filt",
277 unfiltered => $db_path . "/6/swall",
281 my $dpf = $database->{$db_entry}{database}.'.pal';
282 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
283 pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile;
284 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry};
285 pod2usage("ERROR! UNIREF filtered database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
286 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
288 #####################################################################################################
289 # lots of preparation steps
290 if ( defined $prefix ) {
291 unless ( defined $outfile ) {
292 $outfile = $prefix . ".res.fasta";
295 if ( defined $outfile ) {
296 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
297 $prefix = $outfile . ".tmp";
299 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
300 $prefix = $infile . ".res";
301 $outfile = $prefix . ".jnet";
305 if ( 1 > $ncpu or 8 < $ncpu ) {
306 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
310 $VERBOSE = 1 if $DEBUG;
313 if ( defined $logfile ) {
314 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
317 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
318 my ( $key, $value ) = @{$_};
319 defined $value or next;
320 my $par = join( ": ", $key, $value );
321 print "$par\n" if $DEBUG;
322 print $LOG "$par\n" if $LOG;
325 if ( defined $jabaws ) {
326 setup_jpred_env($Bin);
329 my $platform = check_OS();
330 print "JPRED: checking platiform... $platform\n" if $DEBUG;
331 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
333 #####################################################################################################
334 # check input file format
335 if ( 'seq' eq $goal ) {
337 if ( 1 != check_FASTA_format($infile) ) {
338 die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
341 my $nseq = check_FASTA_format($infile);
345 die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n";
347 } elsif ( 0 < check_MSF_format($infile) ) {
349 } elsif ( 0 < check_BLC_format($infile) ) {
352 die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
355 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
357 #####################################################################################################
358 if ( 'seq' eq $goal ) {
359 my $query = FASTA::File->new( read_file => $infile );
361 my $psi = PSIBLAST->new;
362 unless ( defined $psiblast && -e $psiblast ) {
363 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
364 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
365 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
366 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
367 my $psi_run = PSIBLAST::Run->new(
369 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
370 path => $psiblastbin,
372 output => "$prefix.blast",
373 matrix => "$prefix.profile",
374 database => $database->{$db_entry}{database},
377 # For reduced databases, get the size of the orginal and use this as
378 # the equivilent DB size
379 if ( $db_entry =~ /^sp_red_/ ) {
380 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
382 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
385 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
386 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
387 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
388 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
390 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
391 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
392 $psi_run->run or die; # CC sub is from PSIBLAST::Run
394 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
395 system("gzip -9f $prefix.blast");
397 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
398 else { $psi->read_file($psiblast) } # CC ditto above
401 #####################################################################################################
402 # Convert the last itteration into a collection of PSISEQ objects
403 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
404 my ( $id, $start, $align ) = @{$_};
409 align => [ split( //, $align ) ]
413 #####################################################################################################
414 # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
415 # Accuracy won't be as good, but at least it's a prediction
417 if ( $predNoHits == 0 ) {
418 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
419 print ">>100% complete\n";
420 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
421 print $LOG ">>100% complete\n" if $LOG;
425 print ">>50% complete\n";
426 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
427 print "Running HMMer on query...\n" if $VERBOSE;
428 print $LOG ">>50% complete\n" if $LOG;
429 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
430 print $LOG "Running HMMer on query...\n" if $LOG;
432 # copy input query to alignment
433 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
434 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
435 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
436 while (<$ifh>) { print $ofh, $_ }
440 # Temp files required for HMMer
441 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
442 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
443 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
444 unlink $hmmbuild_out;
446 # Read in the HMMER file
447 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
449 # Convert from HMMER::Profile to HMMER::Profile::Jnet
450 my $hmmer_jnet = HMMER::Profile::Jnet->new;
451 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
452 $hmmer_jnet->write_file("$prefix.hmm");
453 print ">>70% complete\n";
454 print $LOG ">>70% complete\n" if $LOG;
456 # Run Jnet for the prediction
457 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
458 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
459 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
460 print ">>90% complete\n";
461 print $LOG ">>90% complete\n" if $LOG;
462 print $jnetlog if $VERBOSE;
463 print $LOG $jnetlog if $LOG;
466 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
467 print ">>40% complete\n";
468 print $LOG ">>40% complete\n" if $LOG;
470 #####################################################################################################
471 # Make PSIBLAST truncated alignments the right length
472 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
473 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
474 @seqs = extend( $query, @seqs );
475 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
477 #####################################################################################################
478 # Remove masking from PSIBLAST alignment
479 print "Unmasking the alignments...\n" if $VERBOSE;
480 print $LOG "Unmasking the alignments...\n" if $LOG;
481 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
482 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
483 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
485 #####################################################################################################
486 # Convert the sequences to upper case
487 print "Converting sequences to the same case...\n" if $VERBOSE;
488 print $LOG "Converting sequences to the same case...\n" if $LOG;
489 toupper($_) for @seqs; # CC sub is below
490 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
492 #####################################################################################################
493 # Remove excessive sequences
494 print "Remove excessive sequences...\n" if $VERBOSE;
495 print $LOG "Remove excessive sequences...\n" if $LOG;
496 @seqs = reduce( $MAX_ALIGN, @seqs );
497 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
499 #####################################################################################################
500 # Remove sequences that are too long or too short
501 print "Remove sequences which too long or short...\n" if $VERBOSE;
502 print $LOG "Remove sequences which too long or short...\n" if $LOG;
503 @seqs = del_long_seqs( 50, @seqs );
504 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
506 #####################################################################################################
507 # Remove redundant sequences based upon pairwise identity and OC clustering
508 print "Remove redundant sequences...\n" if $VERBOSE;
509 print $LOG "Remove redundant sequences...\n" if $LOG;
510 @seqs = nonred( $NR_CUT, @seqs );
511 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
513 #####################################################################################################
514 # Check that we haven't got rid of all of the sequences
516 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
517 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
518 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
520 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
522 # Remove gaps in the query sequence and same positions in the alignment
523 print "Removing gaps in the query sequence...\n" if $VERBOSE;
524 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
526 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
528 # Output the alignment for the prediction
529 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
530 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
531 psiseq2fasta( "$prefix.align", @seqs );
533 #####################################################################################################
534 # Equivilent to getpssm script
535 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
536 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
537 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
538 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
539 print ">>50% complete\n";
540 print $LOG ">>50% complete\n" if $LOG;
542 #####################################################################################################
543 # Run HMMER on the sequences
544 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
545 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
546 my $hmmer = hmmer(@seqs); # CC sub is below
547 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
548 print ">>70% complete\n";
549 print $LOG ">>70% complete\n" if $LOG;
550 #####################################################################################################
551 # Run Jnet for the prediction
552 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
553 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
554 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
555 print ">>90% complete\n";
556 print $LOG ">>90% complete\n" if $LOG;
557 print $jnetlog if $VERBOSE;
558 print $LOG $jnetlog if $LOG;
561 if ( 'fasta' eq $format ) {
562 print "Read FASTA file\n";
563 my $hmmer1 = hmm_for_align($infile);
564 $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
565 } elsif ( 'msf' eq $format ) {
566 msf2fasta( $infile, $infastafile );
567 my $hmmer2 = hmm_for_align($infastafile);
568 $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
569 } elsif ( 'blc' eq $format ) {
570 my $tmpfile = File::Temp->new->filename;
571 blc2msf( $infile, $tmpfile );
572 msf2fasta( $infile, $infastafile );
573 my $hmmer3 = hmm_for_align($infastafile);
574 $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
576 die "ERROR! unknown input format '$format'. exit...\n";
578 print ">>70% complete\n";
579 print $LOG ">>70% complete\n" if $LOG;
580 #####################################################################################################
581 # Run Jnet for the prediction
582 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
583 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
584 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below
585 print ">>90% complete\n";
586 print $LOG ">>90% complete\n" if $LOG;
587 print $jnetlog if $VERBOSE;
588 print $LOG $jnetlog if $LOG;
591 unless ( defined $nofinal ) {
592 my $aligfile = $prefix . ".align";
593 my $jnetfile = $prefix . ".jnet";
594 my $jnetfastafile = $prefix . ".jnet.fasta";
595 $aligfile = $infile if ( 'fasta' eq $format );
596 $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
597 concise2fasta( $jnetfile, $jnetfastafile );
598 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
599 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
600 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
601 while (<$IN2>) { print $OUT $_; }
602 while (<$IN1>) { print $OUT $_; }
606 unlink $jnetfastafile;
607 my $seqs = FASTA::File->new( read_file => $outfile );
608 open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
612 if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
613 rename $prefix . ".jnet", $outfile;
617 print ">>100% complete\n";
618 print "Jpred Finished\n";
619 print $LOG ">>100% complete\n" if $LOG;
620 print $LOG "Jpred Finished\n" if $LOG;
624 #####################################################################################################
626 #####################################################################################################
627 sub check_FASTA_format {
630 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
631 my $check_first_line = 1;
635 if ($check_first_line) {
636 return 0 unless (/^>/);
637 $check_first_line = 0;
642 my ( $id, @seqs ) = split /\n/, $_;
643 return 0 unless ( defined $id or @seqs );
644 my $seq = join( "", @seqs );
645 return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
652 #####################################################################################################
653 sub check_MSF_format {
656 system("$readseq -fMSF -a -p < $infile > /dev/null");
661 #####################################################################################################
662 sub check_BLC_format {
665 my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
666 open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
667 print $fh "silent_mode\nblock_file $infile\n";
668 print $fh "output_file /dev/null\nmax_nseq 2000\n";
669 print $fh "pir_save $tmpfile2\nsetup\n";
673 system("$alscript -s -f $tmpfile1");
675 system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
682 ############################################################################################################################################
683 # convertor BLC -> MSF
689 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
695 $sequence =~ s/~/\./g;
697 ## check for non-unique sequence names - readseq has problems with these
698 my @lines = split /\n/, $sequence;
700 foreach my $line (@lines) {
701 if ( $line =~ /Name:\s+(\S+)/ ) {
702 die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
707 my $tmpfile1 = File::Temp->new->filename;
708 my $tmpfile2 = File::Temp->new->filename;
709 open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
710 print $FILE $sequence;
714 system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
715 die "Reformatting of $infile failed. exit...\n" if ($?);
719 system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
720 die "Reformatting of $infile failed. exit...\n" if ($?);
723 die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
726 ############################################################################################################################################
727 # convertor BLC -> MSF
729 my ( $in, $out ) = @_;
731 my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
733 open my $fh, ">$tmp" or die "$tmp: $!\n";
734 print $fh "silent_mode\nblock_file $infile\n";
735 print $fh "output_file /dev/null\nmax_nseq 2000\n";
736 print $fh "pir_save $tmp_pir\nsetup\n";
740 system("$alscript -s -f $tmp");
741 die "Reformatting of $infile failed. exit...\n" if ($?);
742 system("$readseq -f=MSF -o=$out -a $tmp_pir");
748 #####################################################################################################
752 =head2 fasta2concise ($infile, $outfile)
754 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
762 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
763 my ( $seq, @seqs, @title );
784 if ( @title != @seqs ) {
785 die("non matching number of titles and sequences!\n");
788 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
789 foreach ( 0 .. $#title ) {
790 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
797 =head2 concise2fasta ($infile, $outfile)
799 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
803 #####################################################################################################
808 my ( @seqs, %seq, @pred, %pred );
811 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
812 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
815 # parse input concise file
816 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
819 my ( $id, $seq ) = split( ":", $_ );
820 if ( defined $id and defined $seq ) { # Check we have proper values
823 if ( $id =~ /;/ ) { # Then it's an alignment
824 @_ = split( ";", $id );
826 $seq{ $_[1] } = $seq;
828 foreach my $v (@var) {
839 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
841 # print out sequences found with PSI-BLAST
843 $seq{$_} =~ s/(.{72})/$1\n/g;
844 print $OUT ">$_\n$seq{$_}\n";
847 # print out predictions
848 foreach my $p (@pred) {
849 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
850 $pred{$p} =~ s/B/E/g;
851 $pred{$p} =~ s/G/H/g;
854 $pred{$p} =~ s/E/B/g;
856 $pred{$p} =~ s/(.{72})/$1\n/g;
857 print $OUT ">$p\n$pred{$p}\n";
864 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
866 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
871 #####################################################################################################
872 sub remove_seq_masks {
876 # Only bother if we have a sequence which has been masked
877 return unless grep { uc $_ eq 'X' } @{ $seq->align };
879 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
880 my $searchdb = $database->{$db_entry}{unfiltered};
881 $start--; # We need this to be zero based
883 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
884 my @seqs = $f->get_entries;
885 my $db = shift @seqs;
888 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
890 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
893 my @db_seq = @{ $db->seq };
895 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
897 for ( 0 .. $#align ) {
898 next if $align[$_] =~ /$GAP/;
900 if ( $align[$_] eq 'X' ) {
901 unless ( defined $db_seq[ $i + $start ] ) {
902 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
904 $align[$_] = $db_seq[ $i + $start ];
908 $seq->align( \@align );
911 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
913 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
915 Equivilent to reduce script.
919 #####################################################################################################
921 my ( $max, @seqs ) = @_;
923 my $num = int( @seqs / $max );
924 my @res = shift @seqs;
926 # Don't bother if we have less than the required sequences
927 if ( @seqs < $max or 0 == $num ) {
931 for ( 0 .. $#seqs ) {
932 push @res, $seqs[$_] if ( 0 == $_ % $num );
938 =head2 nonred($cutoff, @PSISEQS);
940 Removes redundant sequences at $cutoff level of sequence identity.
942 Equivilent to nonred script.
946 #####################################################################################################
948 my ( $cutoff, @seqs ) = @_;
950 # Run pairwise and then OC and remove similar sequences
951 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
952 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
954 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
958 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
959 my $fasta = FASTA::File->new;
961 my $f = FASTA->new( id => $_->id );
962 $f->seq( @{ $_->align } );
963 $fasta->add_entries($f);
967 my $pair = Pairwise->new( path => $pairwise );
968 my $foo = join "\n", $pair->run($fasta) or die $!;
971 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
972 $ocres->read_string( join "", $ocres->run($foo) );
974 # Now found those entries that are unrelated
976 for ( $ocres->get_groups ) {
977 my ( $group, $score, $size, @labels ) = @{$_};
979 # Keep the QUERY from the cluster, otherwise just get onelt
980 if ( grep { /QUERY/ } @labels ) {
982 # Make sure the query stays at the start
983 unshift @keep, "QUERY";
986 push @keep, shift @labels;
989 push @keep, $ocres->get_unclustered;
991 # Return the matching entries.
992 # We use the temporay to mimic the selection of sequences in the nonred
993 # script, which took the sequences in the order they occured in the file.
997 if ( grep { $_ =~ /^$label$/ } @keep ) {
998 push @filtered_res, $_;
1002 return @filtered_res;
1005 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1007 Removes those sequences that are more than $percentage longer or shorter than the first
1008 sequence in @seqs. @seqs is an array of PSISEQ objects.
1010 Equivilent to trim_seqs script.
1014 #####################################################################################################
1016 my ( $factor, @seqs ) = @_;
1018 # Keep the query sequence (we assume query is the 1st FASTA record)
1019 my $query = shift @seqs;
1022 # Find the length of the query sequence without any gaps
1023 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1025 # Calculate the upper and lower allowed bounds
1026 my ( $upper, $lower );
1028 my $bounds = ( $length / 100 ) * $factor;
1029 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1032 # Now try each sequence and see how long they are
1034 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1035 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1041 =head2 toupper ($PSISEQ)
1043 Converts sequence held in PSISEQ object to uppercase.
1047 #####################################################################################################
1049 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1052 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1054 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1056 Equivilent to fasta2jnet script.
1060 #####################################################################################################
1064 # Find where the gaps in the query sequence are
1067 for ( @{ $seqs[0]->align } ) {
1068 push @gaps, $i if $_ !~ /$GAP/;
1072 return unless @gaps;
1074 # Remove the gaps in all the sequences.
1075 # Derefences the array reference and takes a slice inside the method call
1076 # arguments, then coerce back to an array ref.
1078 $_->align( [ @{ $_->align }[@gaps] ] );
1082 =head2 getfreq($filename, @PSISEQS)
1084 Creates a PSIBLAST like profile and writes it to the $filename.
1086 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1087 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1088 but it's *not* the same.
1092 #####################################################################################################
1094 my ( $fn, @seqs ) = @_;
1096 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1098 # Create a PSIBLAST like profile from the alignment
1099 # This doesn't greate the same result as old Jpred, I think this is because
1100 # differences in the input between old and new
1103 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1104 # my @profile = profile(
1105 # map { join "", @{ $_->seq } } $f->get_entries
1108 my @profile = profile( map { join "", @{ $_->align } } @seqs );
1110 open my $fh, ">$fn" or die "JPRED: $fn: $!";
1111 print $fh join( " ", @{$_} ), "\n" for @profile;
1114 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1116 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1118 Equivilent to gethmm script.
1122 #####################################################################################################
1126 # Temp files required
1127 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1129 # Create the FASTA file
1130 psiseq2fasta( $tmp_fasta, @seqs );
1133 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1134 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1136 # Read in the HMMER file
1137 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1139 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1140 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1141 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1146 =head2 jnet($hmmer, $out, $pssm )
1148 Runs Jnet. Pass the paths of the filenames to do the prediction on
1152 #################################################################################################################################################
1154 =head2 get_hmm($alignFile, $name)
1156 Adapted from the hmmer() function in Jpred as written by JDB.
1158 Generates an HMMer profile output compatible with Jnet.
1160 Equivilent to gethmm script.
1165 my $fastafile = shift;
1167 # Temp files required
1168 print "hmm_for_align: fastafile = $fastafile\n";
1169 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1171 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1172 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1174 # Read in the HMMER file
1175 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1177 # Read in the HMMER file
1178 my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1180 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1181 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1182 $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1187 #####################################################################################################
1189 my ( $hmmer, $outfile, $pssm ) = @_;
1192 # run Jnet prediction with all the profile data
1193 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1196 # run Jnet prediction with only HMMer and alignment data
1197 system("$jnet --concise --hmmer $hmmer > $outfile");
1199 check( "jnet", $? ) or exit 1;
1202 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1204 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1211 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1212 print $OUT "\n" . $res;
1217 =head1 psiseq2fasta($path, @PSISEQ)
1219 Writes a FASTA file given an array of PSISEQ objects.
1223 #####################################################################################################
1225 my ( $fn, @seqs ) = @_;
1227 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1228 confess "JPRED: not passed PSISEQs" unless @seqs;
1230 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1232 # Changed from using FASTA::File object due to seg faulting on some large,
1233 # long alignments, presumably due to running out of memory.
1234 # my $fasta = FASTA::File->new;
1235 # $fasta->add_entries(
1237 # my $f = FASTA->new(id => $_->id);
1238 # $f->seq(@{$_->align});
1242 # $fasta->write_file($fn);
1246 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1247 : ( open $fh, ">$fn" or die "JPRED: $!" );
1250 print $fh ">", $_->id, "\n";
1251 my $seq = join( "", @{ $_->align } );
1252 $seq =~ s/(.{72})/$1\n/g;
1254 print $fh $seq . "\n";
1259 =head1 @PSISEQ = fasta2psiseq($path)
1261 Reads in a FASTA file and returns an array of PSISEQ objects.
1265 #####################################################################################################
1271 ? FASTA::File->new( read_gzip_file => $fn )
1272 : FASTA::File->new( read_file => $fn );
1273 $fasta or die "JPRED: $!";
1276 for ( $fasta->get_entries ) {
1277 my $seq = PSISEQ->new;
1279 $seq->align( [ @{ $_->seq } ] );
1286 #####################################################################################################
1287 sub fasta2psiseq_lowmemory {
1288 my $filename = shift;
1292 $filename =~ /\.gz$/
1293 ? ( open $fh, "gzip -dc $filename|" or die $! )
1294 : ( open $fh, $filename or die $! );
1301 $seq =~ s/ //g if defined $seq;
1303 if ( $id and $seq ) {
1304 my $psi = PSISEQ->new( id => $id );
1305 $psi->align( [ split //, $seq ] );
1316 if ( $id and $seq ) {
1317 my $psi = PSISEQ->new( id => $id );
1325 #####################################################################################################
1327 =head1 extend($FASTA::File, @PSISEQ)
1329 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1330 against the query, this function extends the alignment so that all of the query is
1331 present in the alignment. The other sequences are extended with gap characters.
1336 my ( $query, @seqs ) = @_;
1338 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1339 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1341 # Get the query sequence
1342 my $q_query = $query->get_entry(0);
1344 # Get the query sequence from the PSIBLAST alignment
1345 my $a_query = shift @seqs;
1347 # The gap size required to expand the alignment
1348 my ( $start_gap_length, $end_gap_length );
1351 # Make handling the sequence easier
1352 my $q_seq = join "", @{ [ $q_query->seq ] };
1353 my $a_seq = join "", @{ $a_query->align };
1358 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1360 $start_gap_length = index( $q_seq, $a_seq );
1361 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1363 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1366 # Add the gaps to the end of the alignments
1368 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1371 # Put the query sequence back
1376 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1377 @{ $a_query->align },
1378 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1385 #####################################################################################################
1387 =head1 $int = fasta_seq_length($path)
1389 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1393 sub fasta_seq_length {
1395 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1398 local $_, $/ = "\n";