5 jpred - Secondary structure prediction program
9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
14 (with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
16 secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
24 The path to the sequence file (in FASTA, MSF, or BLC format)
28 The input file is a FASTA file with one sequence only.
30 =item -output <FILEPREFIX>
32 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
34 =item -outfile <FILE2>
36 An output file, by default it is undefined and the -output option is working
38 =item -logfile <FILE3>
40 Logging file, by default it is undefined and logging switched off
44 Path to the uniref database used for PSI-BLAST querying. default value: .
46 =item -dbname database
48 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
49 Default value: uniref90
53 Path to a PSIBLAST output file.
57 Number of CPU used by jpred.pl. Maximum value is 8
62 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
66 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
67 The option untoggles this step and stops jpred at the concise file generation
71 Redefined basic variable in order to use the script within JABAWS
75 Verbose mode. Print more information while running jpred.
79 Debug mode. Print debugging information while running jpred.
83 Gives help on the programs usage.
87 Displays this man page.
93 Can't cope with gaps in the query sequence.
95 Doesn't accept alignments.
97 If you find any others please contact authors.
101 Jonathan Barber <jon@compbio.dundee.ac.uk>
102 Chris Cole <christian@cole.name>
103 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
107 ## TODO check for gaps in query sequence
108 ## check for disallowed chars in sequence
118 # path to Jpred modules
119 use FindBin qw($Bin);
122 # internal jpred modules
124 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
126 use HMMER::Profile::Jnet;
135 use Utils qw(profile);
138 our $VERSION = '3.0.2';
139 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
140 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
142 # Gap characters in sequences
145 #####################################################################################################
146 # Light object to hold sequence information, light to prevent memory overload
147 # for big search results. Even so this is quite memory intensive, I can't see
148 # how to reduce the size further.
154 my ( $class, %args ) = @_;
157 for ( keys %args ) { $self->$_( $args{$_} ) }
174 if ( defined $_[1] ) {
175 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
177 $_[0]->[2] = join( "", @{ $_[1] } );
179 return [ split( //, $_[0]->[2] ) ];
184 #####################################################################################################
195 my $ncpu = 1; # number of CPUs for psiblast calculations
196 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
197 my $db_path = "/homes/www-jpred/databases";
198 my $db_entry = "cluster";
201 my ( $help, $man, $DEBUG, $VERBOSE );
205 "output=s" => \$prefix,
206 "outfile=s" => \$outfile,
207 "logfile=s" => \$logfile,
208 "psi=s" => \$psiblast,
209 "dbname=s" => \$db_entry,
210 "dbpath=s" => \$db_path,
212 "pred-nohits" => \$predNoHits,
213 "no-final" => \$nofinal,
215 "jabaws" => \$jabaws,
220 "verbose" => \$VERBOSE
222 pod2usage(1) if $help;
223 pod2usage( verbose => 2 ) if $man;
225 $goal = "seq" if ( defined $seqgoal );
227 #####################################################################################################
228 # Key to database information and information for accessing them
229 if ( defined $jabaws ) {
230 $db_path = "/data/UNIREFdb";
231 $db_entry = "cluster";
236 ## default database used by Jpred
238 database => $db_path . "/uniref90.filt",
239 unfiltered => $db_path . "/uniref90",
242 ## database used during Jnet training
244 database => $db_path . "/training/uniref90.filt",
245 unfiltered => $db_path . "/training/uniref90",
248 ## database used for ported Jpred
250 database => $db_path . "/uniref90.filt",
251 unfiltered => $db_path . "/uniref90",
254 ## cluster-specific path for Jpred
256 database => $db_path . "/uniref90.filt",
257 unfiltered => $db_path . "/uniref90",
260 ## these other DBs are experimental ones used during development.
261 ## check they exist before using them.
262 # generic entry for use with validate_jnet.pl
263 # real db location defined by validate_jnet.pl
265 database => $db_path . "/swall/swall.filt",
266 unfiltered => $db_path . "/swall/swall",
269 # Path to PSIBLAST db
271 database => $db_path . "/3/swall.filt",
272 unfiltered => $db_path . "/3/swall",
276 database => $db_path . "/6/swall.filt",
277 unfiltered => $db_path . "/6/swall",
281 my $dp = $database->{$db_entry}{database};
282 pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile;
283 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry};
284 pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
286 #####################################################################################################
287 # lots of preparation steps
288 if ( defined $prefix ) {
289 unless ( defined $outfile ) {
290 $outfile = $prefix . ".res.fasta";
293 if ( defined $outfile ) {
294 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
295 $prefix = $outfile . ".tmp";
297 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
298 $prefix = $infile . ".res";
299 $outfile = $prefix . ".jnet";
303 if ( 1 > $ncpu or 8 < $ncpu ) {
304 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
308 $VERBOSE = 1 if $DEBUG;
311 if ( defined $logfile ) {
312 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
315 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
316 my ( $key, $value ) = @{$_};
317 defined $value or next;
318 my $par = join( ": ", $key, $value );
319 print "$par\n" if $DEBUG;
320 print $LOG "$par\n" if $LOG;
323 if ( defined $jabaws ) {
324 setup_jpred_env($Bin);
327 my $platform = check_OS();
328 print "JPRED: checking platiform... $platform\n" if $DEBUG;
329 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
331 #####################################################################################################
332 # check input file format
333 if ( 'seq' eq $goal ) {
335 if ( 1 != check_FASTA_format($infile) ) {
336 die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
339 my $nseq = check_FASTA_format($infile);
343 die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n";
345 } elsif ( 0 < check_MSF_format($infile) ) {
347 } elsif ( 0 < check_BLC_format($infile) ) {
350 die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
353 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
355 #####################################################################################################
356 if ( 'seq' eq $goal ) {
357 my $query = FASTA::File->new( read_file => $infile );
359 my $psi = PSIBLAST->new;
360 unless ( defined $psiblast && -e $psiblast ) {
361 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
362 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
363 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
364 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
365 my $psi_run = PSIBLAST::Run->new(
367 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
368 path => $psiblastbin,
370 output => "$prefix.blast",
371 matrix => "$prefix.profile",
372 database => $database->{$db_entry}{database},
375 # For reduced databases, get the size of the orginal and use this as
376 # the equivilent DB size
377 if ( $db_entry =~ /^sp_red_/ ) {
378 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
380 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
383 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
384 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
385 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
386 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
388 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
389 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
390 $psi_run->run or die; # CC sub is from PSIBLAST::Run
392 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
393 system("gzip -9f $prefix.blast");
395 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
396 else { $psi->read_file($psiblast) } # CC ditto above
399 #####################################################################################################
400 # Convert the last itteration into a collection of PSISEQ objects
401 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
402 my ( $id, $start, $align ) = @{$_};
407 align => [ split( //, $align ) ]
411 #####################################################################################################
412 # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
413 # Accuracy won't be as good, but at least it's a prediction
415 if ( $predNoHits == 0 ) {
416 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
417 print ">>100% complete\n";
418 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
419 print $LOG ">>100% complete\n" if $LOG;
423 print ">>50% complete\n";
424 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
425 print "Running HMMer on query...\n" if $VERBOSE;
426 print $LOG ">>50% complete\n" if $LOG;
427 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
428 print $LOG "Running HMMer on query...\n" if $LOG;
430 # copy input query to alignment
431 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
432 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
433 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
434 while (<$ifh>) { print $ofh, $_ }
438 # Temp files required for HMMer
439 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
440 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
441 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
442 unlink $hmmbuild_out;
444 # Read in the HMMER file
445 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
447 # Convert from HMMER::Profile to HMMER::Profile::Jnet
448 my $hmmer_jnet = HMMER::Profile::Jnet->new;
449 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
450 $hmmer_jnet->write_file("$prefix.hmm");
451 print ">>70% complete\n";
452 print $LOG ">>70% complete\n" if $LOG;
454 # Run Jnet for the prediction
455 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
456 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
457 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
458 print ">>90% complete\n";
459 print $LOG ">>90% complete\n" if $LOG;
460 print $jnetlog if $VERBOSE;
461 print $LOG $jnetlog if $LOG;
464 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
465 print ">>40% complete\n";
466 print $LOG ">>40% complete\n" if $LOG;
468 #####################################################################################################
469 # Make PSIBLAST truncated alignments the right length
470 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
471 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
472 @seqs = extend( $query, @seqs );
473 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
475 #####################################################################################################
476 # Remove masking from PSIBLAST alignment
477 print "Unmasking the alignments...\n" if $VERBOSE;
478 print $LOG "Unmasking the alignments...\n" if $LOG;
479 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
480 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
481 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
483 #####################################################################################################
484 # Convert the sequences to upper case
485 print "Converting sequences to the same case...\n" if $VERBOSE;
486 print $LOG "Converting sequences to the same case...\n" if $LOG;
487 toupper($_) for @seqs; # CC sub is below
488 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
490 #####################################################################################################
491 # Remove excessive sequences
492 print "Remove excessive sequences...\n" if $VERBOSE;
493 print $LOG "Remove excessive sequences...\n" if $LOG;
494 @seqs = reduce( $MAX_ALIGN, @seqs );
495 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
497 #####################################################################################################
498 # Remove sequences that are too long or too short
499 print "Remove sequences which too long or short...\n" if $VERBOSE;
500 print $LOG "Remove sequences which too long or short...\n" if $LOG;
501 @seqs = del_long_seqs( 50, @seqs );
502 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
504 #####################################################################################################
505 # Remove redundant sequences based upon pairwise identity and OC clustering
506 print "Remove redundant sequences...\n" if $VERBOSE;
507 print $LOG "Remove redundant sequences...\n" if $LOG;
508 @seqs = nonred( $NR_CUT, @seqs );
509 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
511 #####################################################################################################
512 # Check that we haven't got rid of all of the sequences
514 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
515 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
516 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
518 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
520 # Remove gaps in the query sequence and same positions in the alignment
521 print "Removing gaps in the query sequence...\n" if $VERBOSE;
522 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
524 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
526 # Output the alignment for the prediction
527 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
528 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
529 psiseq2fasta( "$prefix.align", @seqs );
531 #####################################################################################################
532 # Equivilent to getpssm script
533 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
534 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
535 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
536 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
537 print ">>50% complete\n";
538 print $LOG ">>50% complete\n" if $LOG;
540 #####################################################################################################
541 # Run HMMER on the sequences
542 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
543 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
544 my $hmmer = hmmer(@seqs); # CC sub is below
545 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
546 print ">>70% complete\n";
547 print $LOG ">>70% complete\n" if $LOG;
548 #####################################################################################################
549 # Run Jnet for the prediction
550 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
551 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
552 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
553 print ">>90% complete\n";
554 print $LOG ">>90% complete\n" if $LOG;
555 print $jnetlog if $VERBOSE;
556 print $LOG $jnetlog if $LOG;
559 if ( 'fasta' eq $format ) {
560 print "Read FASTA file\n";
561 my $hmmer1 = hmm_for_align($infile);
562 $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
563 } elsif ( 'msf' eq $format ) {
564 msf2fasta( $infile, $infastafile );
565 my $hmmer2 = hmm_for_align($infastafile);
566 $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
567 } elsif ( 'blc' eq $format ) {
568 my $tmpfile = File::Temp->new->filename;
569 blc2msf( $infile, $tmpfile );
570 msf2fasta( $infile, $infastafile );
571 my $hmmer3 = hmm_for_align($infastafile);
572 $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
574 die "ERROR! unknown input format '$format'. exit...\n";
576 print ">>70% complete\n";
577 print $LOG ">>70% complete\n" if $LOG;
578 #####################################################################################################
579 # Run Jnet for the prediction
580 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
581 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
582 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below
583 print ">>90% complete\n";
584 print $LOG ">>90% complete\n" if $LOG;
585 print $jnetlog if $VERBOSE;
586 print $LOG $jnetlog if $LOG;
589 unless ( defined $nofinal ) {
590 my $aligfile = $prefix . ".align";
591 my $jnetfile = $prefix . ".jnet";
592 my $jnetfastafile = $prefix . ".jnet.fasta";
593 $aligfile = $infile if ( 'fasta' eq $format );
594 $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
595 concise2fasta( $jnetfile, $jnetfastafile );
596 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
597 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
598 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
599 while (<$IN2>) { print $OUT $_; }
600 while (<$IN1>) { print $OUT $_; }
604 unlink $jnetfastafile;
605 my $seqs = FASTA::File->new( read_file => $outfile );
606 open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
610 if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
611 rename $prefix . ".jnet", $outfile;
615 print ">>100% complete\n";
616 print "Jpred Finished\n";
617 print $LOG ">>100% complete\n" if $LOG;
618 print $LOG "Jpred Finished\n" if $LOG;
622 #####################################################################################################
624 #####################################################################################################
625 sub check_FASTA_format {
628 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
629 my $check_first_line = 1;
633 if ($check_first_line) {
634 return 0 unless (/^>/);
635 $check_first_line = 0;
640 my ( $id, @seqs ) = split /\n/, $_;
641 return 0 unless ( defined $id or @seqs );
642 my $seq = join( "", @seqs );
643 return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
650 #####################################################################################################
651 sub check_MSF_format {
654 system("$readseq -fMSF -a -p < $infile > /dev/null");
659 #####################################################################################################
660 sub check_BLC_format {
663 my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
664 open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
665 print $fh "silent_mode\nblock_file $infile\n";
666 print $fh "output_file /dev/null\nmax_nseq 2000\n";
667 print $fh "pir_save $tmpfile2\nsetup\n";
671 system("$alscript -s -f $tmpfile1");
673 system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
680 ############################################################################################################################################
681 # convertor BLC -> MSF
687 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
693 $sequence =~ s/~/\./g;
695 ## check for non-unique sequence names - readseq has problems with these
696 my @lines = split /\n/, $sequence;
698 foreach my $line (@lines) {
699 if ( $line =~ /Name:\s+(\S+)/ ) {
700 die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
705 my $tmpfile1 = File::Temp->new->filename;
706 my $tmpfile2 = File::Temp->new->filename;
707 open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
708 print $FILE $sequence;
712 system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
713 die "Reformatting of $infile failed. exit...\n" if ($?);
717 system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
718 die "Reformatting of $infile failed. exit...\n" if ($?);
721 die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
724 ############################################################################################################################################
725 # convertor BLC -> MSF
727 my ( $in, $out ) = @_;
729 my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
731 open my $fh, ">$tmp" or die "$tmp: $!\n";
732 print $fh "silent_mode\nblock_file $infile\n";
733 print $fh "output_file /dev/null\nmax_nseq 2000\n";
734 print $fh "pir_save $tmp_pir\nsetup\n";
738 system("$alscript -s -f $tmp");
739 die "Reformatting of $infile failed. exit...\n" if ($?);
740 system("$readseq -f=MSF -o=$out -a $tmp_pir");
746 #####################################################################################################
750 =head2 fasta2concise ($infile, $outfile)
752 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
760 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
761 my ( $seq, @seqs, @title );
782 if ( @title != @seqs ) {
783 die("non matching number of titles and sequences!\n");
786 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
787 foreach ( 0 .. $#title ) {
788 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
795 =head2 concise2fasta ($infile, $outfile)
797 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
801 #####################################################################################################
806 my ( @seqs, %seq, @pred, %pred );
809 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
810 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
813 # parse input concise file
814 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
817 my ( $id, $seq ) = split( ":", $_ );
818 if ( defined $id and defined $seq ) { # Check we have proper values
821 if ( $id =~ /;/ ) { # Then it's an alignment
822 @_ = split( ";", $id );
824 $seq{ $_[1] } = $seq;
826 foreach my $v (@var) {
837 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
839 # print out sequences found with PSI-BLAST
841 $seq{$_} =~ s/(.{72})/$1\n/g;
842 print $OUT ">$_\n$seq{$_}\n";
845 # print out predictions
846 foreach my $p (@pred) {
847 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
848 $pred{$p} =~ s/B/E/g;
849 $pred{$p} =~ s/G/H/g;
852 $pred{$p} =~ s/E/B/g;
854 $pred{$p} =~ s/(.{72})/$1\n/g;
855 print $OUT ">$p\n$pred{$p}\n";
862 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
864 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
869 #####################################################################################################
870 sub remove_seq_masks {
874 # Only bother if we have a sequence which has been masked
875 return unless grep { uc $_ eq 'X' } @{ $seq->align };
877 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
878 my $searchdb = $database->{$db_entry}{unfiltered};
879 $start--; # We need this to be zero based
881 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
882 my @seqs = $f->get_entries;
883 my $db = shift @seqs;
886 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
888 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
891 my @db_seq = @{ $db->seq };
893 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
895 for ( 0 .. $#align ) {
896 next if $align[$_] =~ /$GAP/;
898 if ( $align[$_] eq 'X' ) {
899 unless ( defined $db_seq[ $i + $start ] ) {
900 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
902 $align[$_] = $db_seq[ $i + $start ];
906 $seq->align( \@align );
909 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
911 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
913 Equivilent to reduce script.
917 #####################################################################################################
919 my ( $max, @seqs ) = @_;
921 my $num = int( @seqs / $max );
922 my @res = shift @seqs;
924 # Don't bother if we have less than the required sequences
925 if ( @seqs < $max or 0 == $num ) {
929 for ( 0 .. $#seqs ) {
930 push @res, $seqs[$_] if ( 0 == $_ % $num );
936 =head2 nonred($cutoff, @PSISEQS);
938 Removes redundant sequences at $cutoff level of sequence identity.
940 Equivilent to nonred script.
944 #####################################################################################################
946 my ( $cutoff, @seqs ) = @_;
948 # Run pairwise and then OC and remove similar sequences
949 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
950 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
952 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
956 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
957 my $fasta = FASTA::File->new;
959 my $f = FASTA->new( id => $_->id );
960 $f->seq( @{ $_->align } );
961 $fasta->add_entries($f);
965 my $pair = Pairwise->new( path => $pairwise );
966 my $foo = join "\n", $pair->run($fasta) or die $!;
969 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
970 $ocres->read_string( join "", $ocres->run($foo) );
972 # Now found those entries that are unrelated
974 for ( $ocres->get_groups ) {
975 my ( $group, $score, $size, @labels ) = @{$_};
977 # Keep the QUERY from the cluster, otherwise just get onelt
978 if ( grep { /QUERY/ } @labels ) {
980 # Make sure the query stays at the start
981 unshift @keep, "QUERY";
984 push @keep, shift @labels;
987 push @keep, $ocres->get_unclustered;
989 # Return the matching entries.
990 # We use the temporay to mimic the selection of sequences in the nonred
991 # script, which took the sequences in the order they occured in the file.
995 if ( grep { $_ =~ /^$label$/ } @keep ) {
996 push @filtered_res, $_;
1000 return @filtered_res;
1003 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1005 Removes those sequences that are more than $percentage longer or shorter than the first
1006 sequence in @seqs. @seqs is an array of PSISEQ objects.
1008 Equivilent to trim_seqs script.
1012 #####################################################################################################
1014 my ( $factor, @seqs ) = @_;
1016 # Keep the query sequence (we assume query is the 1st FASTA record)
1017 my $query = shift @seqs;
1020 # Find the length of the query sequence without any gaps
1021 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1023 # Calculate the upper and lower allowed bounds
1024 my ( $upper, $lower );
1026 my $bounds = ( $length / 100 ) * $factor;
1027 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1030 # Now try each sequence and see how long they are
1032 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1033 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1039 =head2 toupper ($PSISEQ)
1041 Converts sequence held in PSISEQ object to uppercase.
1045 #####################################################################################################
1047 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1050 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1052 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1054 Equivilent to fasta2jnet script.
1058 #####################################################################################################
1062 # Find where the gaps in the query sequence are
1065 for ( @{ $seqs[0]->align } ) {
1066 push @gaps, $i if $_ !~ /$GAP/;
1070 return unless @gaps;
1072 # Remove the gaps in all the sequences.
1073 # Derefences the array reference and takes a slice inside the method call
1074 # arguments, then coerce back to an array ref.
1076 $_->align( [ @{ $_->align }[@gaps] ] );
1080 =head2 getfreq($filename, @PSISEQS)
1082 Creates a PSIBLAST like profile and writes it to the $filename.
1084 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1085 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1086 but it's *not* the same.
1090 #####################################################################################################
1092 my ( $fn, @seqs ) = @_;
1094 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1096 # Create a PSIBLAST like profile from the alignment
1097 # This doesn't greate the same result as old Jpred, I think this is because
1098 # differences in the input between old and new
1101 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1102 # my @profile = profile(
1103 # map { join "", @{ $_->seq } } $f->get_entries
1106 my @profile = profile( map { join "", @{ $_->align } } @seqs );
1108 open my $fh, ">$fn" or die "JPRED: $fn: $!";
1109 print $fh join( " ", @{$_} ), "\n" for @profile;
1112 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1114 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1116 Equivilent to gethmm script.
1120 #####################################################################################################
1124 # Temp files required
1125 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1127 # Create the FASTA file
1128 psiseq2fasta( $tmp_fasta, @seqs );
1131 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1132 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1134 # Read in the HMMER file
1135 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1137 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1138 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1139 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1144 =head2 jnet($hmmer, $out, $pssm )
1146 Runs Jnet. Pass the paths of the filenames to do the prediction on
1150 #################################################################################################################################################
1152 =head2 get_hmm($alignFile, $name)
1154 Adapted from the hmmer() function in Jpred as written by JDB.
1156 Generates an HMMer profile output compatible with Jnet.
1158 Equivilent to gethmm script.
1163 my $fastafile = shift;
1165 # Temp files required
1166 print "hmm_for_align: fastafile = $fastafile\n";
1167 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1169 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1170 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1172 # Read in the HMMER file
1173 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1175 # Read in the HMMER file
1176 my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1178 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1179 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1180 $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1185 #####################################################################################################
1187 my ( $hmmer, $outfile, $pssm ) = @_;
1190 # run Jnet prediction with all the profile data
1191 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1194 # run Jnet prediction with only HMMer and alignment data
1195 system("$jnet --concise --hmmer $hmmer > $outfile");
1197 check( "jnet", $? ) or exit 1;
1200 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1202 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1209 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1210 print $OUT "\n" . $res;
1215 =head1 psiseq2fasta($path, @PSISEQ)
1217 Writes a FASTA file given an array of PSISEQ objects.
1221 #####################################################################################################
1223 my ( $fn, @seqs ) = @_;
1225 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1226 confess "JPRED: not passed PSISEQs" unless @seqs;
1228 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1230 # Changed from using FASTA::File object due to seg faulting on some large,
1231 # long alignments, presumably due to running out of memory.
1232 # my $fasta = FASTA::File->new;
1233 # $fasta->add_entries(
1235 # my $f = FASTA->new(id => $_->id);
1236 # $f->seq(@{$_->align});
1240 # $fasta->write_file($fn);
1244 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1245 : ( open $fh, ">$fn" or die "JPRED: $!" );
1248 print $fh ">", $_->id, "\n";
1249 my $seq = join( "", @{ $_->align } );
1250 $seq =~ s/(.{72})/$1\n/g;
1252 print $fh $seq . "\n";
1257 =head1 @PSISEQ = fasta2psiseq($path)
1259 Reads in a FASTA file and returns an array of PSISEQ objects.
1263 #####################################################################################################
1269 ? FASTA::File->new( read_gzip_file => $fn )
1270 : FASTA::File->new( read_file => $fn );
1271 $fasta or die "JPRED: $!";
1274 for ( $fasta->get_entries ) {
1275 my $seq = PSISEQ->new;
1277 $seq->align( [ @{ $_->seq } ] );
1284 #####################################################################################################
1285 sub fasta2psiseq_lowmemory {
1286 my $filename = shift;
1290 $filename =~ /\.gz$/
1291 ? ( open $fh, "gzip -dc $filename|" or die $! )
1292 : ( open $fh, $filename or die $! );
1299 $seq =~ s/ //g if defined $seq;
1301 if ( $id and $seq ) {
1302 my $psi = PSISEQ->new( id => $id );
1303 $psi->align( [ split //, $seq ] );
1314 if ( $id and $seq ) {
1315 my $psi = PSISEQ->new( id => $id );
1323 #####################################################################################################
1325 =head1 extend($FASTA::File, @PSISEQ)
1327 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1328 against the query, this function extends the alignment so that all of the query is
1329 present in the alignment. The other sequences are extended with gap characters.
1334 my ( $query, @seqs ) = @_;
1336 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1337 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1339 # Get the query sequence
1340 my $q_query = $query->get_entry(0);
1342 # Get the query sequence from the PSIBLAST alignment
1343 my $a_query = shift @seqs;
1345 # The gap size required to expand the alignment
1346 my ( $start_gap_length, $end_gap_length );
1349 # Make handling the sequence easier
1350 my $q_seq = join "", @{ [ $q_query->seq ] };
1351 my $a_seq = join "", @{ $a_query->align };
1356 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1358 $start_gap_length = index( $q_seq, $a_seq );
1359 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1361 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1364 # Add the gaps to the end of the alignments
1366 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1369 # Put the query sequence back
1374 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1375 @{ $a_query->align },
1376 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1383 #####################################################################################################
1385 =head1 $int = fasta_seq_length($path)
1387 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1391 sub fasta_seq_length {
1393 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1396 local $_, $/ = "\n";