5 jpred - Secondary structure prediction program
9 ./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
13 This is a program for predicting the secondary structure of a multiple sequence alignment or a protein sequence.
14 The input file can be stored in 3 formats: FASTA, MSF, or BLC.
15 For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and
16 predicting the secondary structure with Jnet. For the multiple sequence alignment only the HMM profile,
17 created from the alignment, is used in Jnet.
25 The path to the sequence file (in FASTA, MSF, or BLC format)
27 =item -output <FILEPREFIX>
29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
31 =item -outfile <FILE2>
33 An output file, by default it is undefined and the -output option is working
35 =item -logfile <FILE3>
37 Logging file, by default it is undefined and logging switched off
41 Path to the uniref database used for PSI-BLAST querying. default value: .
43 =item -dbname database
45 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
46 Default value: uniref90
50 Path to a PSIBLAST output file.
54 Number of CPU used by jpred.pl. Maximum value is 8
59 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
63 By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
64 The option untoggles this step and stops jpred at the concise file generation
68 Redefines some basic variables in order to use the script within JABAWS
72 Verbose mode. Print more information while running jpred.
76 Debug mode. Print debugging information while running jpred.
80 Gives help on the programs usage.
84 Displays this man page.
90 Can't cope with gaps in the query sequence.
92 Doesn't accept alignments.
94 If you find any others please contact authors.
98 Jonathan Barber <jon@compbio.dundee.ac.uk>
99 Chris Cole <christian@cole.name>
100 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
104 ## TODO check for gaps in query sequence
105 ## check for disallowed chars in sequence
115 # path to Jpred modules
116 use FindBin qw($Bin);
119 # internal jpred modules
121 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
123 use HMMER::Profile::Jnet;
132 use Utils qw(profile);
135 our $VERSION = '3.0.2';
136 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
137 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
139 # Gap characters in sequences
142 #####################################################################################################
143 # Light object to hold sequence information, light to prevent memory overload
144 # for big search results. Even so this is quite memory intensive, I can't see
145 # how to reduce the size further.
151 my ( $class, %args ) = @_;
154 for ( keys %args ) { $self->$_( $args{$_} ) }
171 if ( defined $_[1] ) {
172 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
174 $_[0]->[2] = join( "", @{ $_[1] } );
176 return [ split( //, $_[0]->[2] ) ];
181 #####################################################################################################
192 my $ncpu = 1; # number of CPUs for psiblast calculations
193 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
194 my $db_path = "/homes/www-jpred/databases";
195 my $db_entry = "cluster";
198 my ( $help, $man, $DEBUG, $VERBOSE );
202 "output=s" => \$prefix,
203 "outfile=s" => \$outfile,
204 "logfile=s" => \$logfile,
205 "psi=s" => \$psiblast,
206 "dbname=s" => \$db_entry,
207 "dbpath=s" => \$db_path,
209 "pred-nohits" => \$predNoHits,
210 "no-final" => \$nofinal,
211 "jabaws" => \$jabaws,
216 "verbose" => \$VERBOSE
218 pod2usage(1) if $help;
219 pod2usage( verbose => 2 ) if $man;
221 #####################################################################################################
222 # Key to database information and information for accessing them
225 ## default database used by Jpred
227 database => $db_path . "/uniref90.filt",
228 unfiltered => $db_path . "/uniref90",
231 ## database used during Jnet training
233 database => $db_path . "/training/uniref90.filt",
234 unfiltered => $db_path . "/training/uniref90",
237 ## database used for ported Jpred
239 database => $db_path . "/uniref90.filt",
240 unfiltered => $db_path . "/uniref90",
243 ## cluster-specific path for Jpred
245 database => $db_path . "/uniref90.filt",
246 unfiltered => $db_path . "/uniref90",
249 ## these other DBs are experimental ones used during development.
250 ## check they exist before using them.
251 # generic entry for use with validate_jnet.pl
252 # real db location defined by validate_jnet.pl
254 database => $db_path . "/swall/swall.filt",
255 unfiltered => $db_path . "/swall/swall",
258 # Path to PSIBLAST db
260 database => $db_path . "/3/swall.filt",
261 unfiltered => $db_path . "/3/swall",
265 database => $db_path . "/6/swall.filt",
266 unfiltered => $db_path . "/6/swall",
270 my $dpf = $database->{$db_entry}{database}.'.pal';
271 my $dpu = $database->{$db_entry}{unfiltered}.'.pal';
272 pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile;
273 pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry};
274 pod2usage("ERROR! UNIREF filtered database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf;
275 pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu;
277 #####################################################################################################
278 # lots of preparation steps
279 if ( defined $prefix ) {
280 unless ( defined $outfile ) {
281 $outfile = $prefix . ".res.fasta";
284 if ( defined $outfile ) {
285 print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
286 $prefix = $outfile . ".tmp";
288 print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
289 $prefix = $infile . ".res";
290 $outfile = $prefix . ".jnet";
294 if ( 1 > $ncpu or 8 < $ncpu ) {
295 print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
299 $VERBOSE = 1 if $DEBUG;
302 if ( defined $logfile ) {
303 open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
306 for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
307 my ( $key, $value ) = @{$_};
308 defined $value or next;
309 my $par = join( ": ", $key, $value );
310 print "$par\n" if $DEBUG;
311 print $LOG "$par\n" if $LOG;
314 if ( defined $jabaws ) {
315 setup_jpred_env($Bin);
318 my $platform = check_OS();
319 print "JPRED: checking platiform... $platform\n" if $DEBUG;
320 print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
322 #####################################################################################################
323 # check input file format
324 my $nseq = check_FASTA_format($infile);
331 unless ( 0 < check_FASTA_alignment($infile)) {
332 die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n";
335 } elsif ( 0 < check_MSF_format($infile) ) {
337 } elsif ( 0 < check_BLC_format($infile) ) {
340 die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
342 $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
344 #####################################################################################################
345 if ( 'seq' eq $goal ) {
346 my $query = FASTA::File->new( read_file => $infile );
348 my $psi = PSIBLAST->new;
349 unless ( defined $psiblast && -e $psiblast ) {
350 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
351 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
352 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
353 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
354 my $psi_run = PSIBLAST::Run->new(
356 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
357 path => $psiblastbin,
359 output => "$prefix.blast",
360 matrix => "$prefix.profile",
361 database => $database->{$db_entry}{database},
364 # For reduced databases, get the size of the orginal and use this as
365 # the equivilent DB size
366 if ( $db_entry =~ /^sp_red_/ ) {
367 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
369 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
372 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
373 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
374 print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
375 print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
377 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
378 print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
379 $psi_run->run or die; # CC sub is from PSIBLAST::Run
381 $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
382 system("gzip -9f $prefix.blast");
384 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
385 else { $psi->read_file($psiblast) } # CC ditto above
388 #####################################################################################################
389 # Convert the last itteration into a collection of PSISEQ objects
390 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
391 my ( $id, $start, $align ) = @{$_};
396 align => [ split( //, $align ) ]
400 #####################################################################################################
401 # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
402 # Accuracy won't be as good, but at least it's a prediction
404 if ( $predNoHits == 0 ) {
405 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
406 print ">>100% complete\n";
407 print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
408 print $LOG ">>100% complete\n" if $LOG;
412 print ">>50% complete\n";
413 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
414 print "Running HMMer on query...\n" if $VERBOSE;
415 print $LOG ">>50% complete\n" if $LOG;
416 print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
417 print $LOG "Running HMMer on query...\n" if $LOG;
419 # copy input query to alignment
420 #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
421 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
422 open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
423 while (<$ifh>) { print $ofh, $_ }
427 # Temp files required for HMMer
428 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
429 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
430 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
431 unlink $hmmbuild_out;
433 # Read in the HMMER file
434 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
436 # Convert from HMMER::Profile to HMMER::Profile::Jnet
437 my $hmmer_jnet = HMMER::Profile::Jnet->new;
438 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
439 $hmmer_jnet->write_file("$prefix.hmm");
440 print ">>70% complete\n";
441 print $LOG ">>70% complete\n" if $LOG;
443 # Run Jnet for the prediction
444 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
445 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
446 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
447 print ">>90% complete\n";
448 print $LOG ">>90% complete\n" if $LOG;
449 print $jnetlog if $VERBOSE;
450 print $LOG $jnetlog if $LOG;
453 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
454 print ">>40% complete\n";
455 print $LOG ">>40% complete\n" if $LOG;
457 #####################################################################################################
458 # Make PSIBLAST truncated alignments the right length
459 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
460 print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
461 @seqs = extend( $query, @seqs );
462 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
464 #####################################################################################################
465 # Remove masking from PSIBLAST alignment
466 print "Unmasking the alignments...\n" if $VERBOSE;
467 print $LOG "Unmasking the alignments...\n" if $LOG;
468 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
469 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
470 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
472 #####################################################################################################
473 # Convert the sequences to upper case
474 print "Converting sequences to the same case...\n" if $VERBOSE;
475 print $LOG "Converting sequences to the same case...\n" if $LOG;
476 toupper($_) for @seqs; # CC sub is below
477 psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
479 #####################################################################################################
480 # Remove excessive sequences
481 print "Remove excessive sequences...\n" if $VERBOSE;
482 print $LOG "Remove excessive sequences...\n" if $LOG;
483 @seqs = reduce( $MAX_ALIGN, @seqs );
484 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
486 #####################################################################################################
487 # Remove sequences that are too long or too short
488 print "Remove sequences which too long or short...\n" if $VERBOSE;
489 print $LOG "Remove sequences which too long or short...\n" if $LOG;
490 @seqs = del_long_seqs( 50, @seqs );
491 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
493 #####################################################################################################
494 # Remove redundant sequences based upon pairwise identity and OC clustering
495 print "Remove redundant sequences...\n" if $VERBOSE;
496 print $LOG "Remove redundant sequences...\n" if $LOG;
497 @seqs = nonred( $NR_CUT, @seqs );
498 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
500 #####################################################################################################
501 # Check that we haven't got rid of all of the sequences
503 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
504 print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
505 @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
507 unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
509 # Remove gaps in the query sequence and same positions in the alignment
510 print "Removing gaps in the query sequence...\n" if $VERBOSE;
511 print $LOG "Removing gaps in the query sequence...\n" if $LOG;
513 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
515 # Output the alignment for the prediction
516 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
517 print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
518 psiseq2fasta( "$prefix.align", @seqs );
520 #####################################################################################################
521 # Equivilent to getpssm script
522 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
523 print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
524 my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
525 $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
526 print ">>50% complete\n";
527 print $LOG ">>50% complete\n" if $LOG;
529 #####################################################################################################
530 # Run HMMER on the sequences
531 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
532 print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
533 my $hmmer = hmmer(@seqs); # CC sub is below
534 $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
535 print ">>70% complete\n";
536 print $LOG ">>70% complete\n" if $LOG;
537 #####################################################################################################
538 # Run Jnet for the prediction
539 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
540 print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
541 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
542 print ">>90% complete\n";
543 print $LOG ">>90% complete\n" if $LOG;
544 print $jnetlog if $VERBOSE;
545 print $LOG $jnetlog if $LOG;
548 if ( 'fasta' eq $format ) {
549 print "Read FASTA file\n";
550 my $hmmer1 = hmm_for_align($infile);
551 $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
552 } elsif ( 'msf' eq $format ) {
553 msf2fasta( $infile, $infastafile );
554 my $hmmer2 = hmm_for_align($infastafile);
555 $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
556 } elsif ( 'blc' eq $format ) {
557 my $tmpfile = File::Temp->new->filename;
558 blc2msf( $infile, $tmpfile );
559 msf2fasta( $infile, $infastafile );
560 my $hmmer3 = hmm_for_align($infastafile);
561 $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
563 die "ERROR! unknown input format '$format'. exit...\n";
565 print ">>70% complete\n";
566 print $LOG ">>70% complete\n" if $LOG;
567 #####################################################################################################
568 # Run Jnet for the prediction
569 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
570 print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
571 my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below
572 print ">>90% complete\n";
573 print $LOG ">>90% complete\n" if $LOG;
574 print $jnetlog if $VERBOSE;
575 print $LOG $jnetlog if $LOG;
578 unless ( defined $nofinal ) {
579 my $aligfile = $prefix . ".align";
580 my $jnetfile = $prefix . ".jnet";
581 my $jnetfastafile = $prefix . ".jnet.fasta";
582 $aligfile = $infile if ( 'fasta' eq $format );
583 $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
584 concise2fasta( $jnetfile, $jnetfastafile );
585 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
586 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
587 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
588 while (<$IN2>) { print $OUT $_; }
589 while (<$IN1>) { print $OUT $_; }
593 unlink $jnetfastafile;
594 my $seqs = FASTA::File->new( read_file => $outfile );
595 open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
599 if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
600 rename $prefix . ".jnet", $outfile;
604 print ">>100% complete\n";
605 print "Jpred Finished\n";
606 print $LOG ">>100% complete\n" if $LOG;
607 print $LOG "Jpred Finished\n" if $LOG;
611 #####################################################################################################
613 #####################################################################################################
614 sub check_FASTA_format {
617 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
618 my $check_first_line = 1;
622 if ($check_first_line) {
623 return 0 unless (/^>/);
624 $check_first_line = 0;
629 my ( $id, @seqs ) = split /\n/, $_;
630 return 0 unless ( defined $id or @seqs );
631 my $seq = join( "", @seqs );
632 return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
639 #####################################################################################################
640 sub check_FASTA_alignment {
643 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
644 my $check_first_line = 1;
649 if ($check_first_line) {
650 return 0 unless (/^>/);
651 $check_first_line = 0;
656 my ( $id, @seqs ) = split /\n/, $_;
657 return 0 unless ( defined $id or @seqs );
658 my $seq = join( "", @seqs );
659 return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
661 $seqlen = length ($seq);
663 return 0 if ($seqlen != length ($seq) );
671 #####################################################################################################
672 sub check_MSF_format {
675 system("$readseq -fMSF -a -p < $infile > /dev/null");
680 #####################################################################################################
681 sub check_BLC_format {
684 my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
685 open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
686 print $fh "silent_mode\nblock_file $infile\n";
687 print $fh "output_file /dev/null\nmax_nseq 2000\n";
688 print $fh "pir_save $tmpfile2\nsetup\n";
692 system("$alscript -s -f $tmpfile1");
694 system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
701 ############################################################################################################################################
702 # convertor BLC -> MSF
708 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
714 $sequence =~ s/~/\./g;
716 ## check for non-unique sequence names - readseq has problems with these
717 my @lines = split /\n/, $sequence;
719 foreach my $line (@lines) {
720 if ( $line =~ /Name:\s+(\S+)/ ) {
721 die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
726 my $tmpfile1 = File::Temp->new->filename;
727 my $tmpfile2 = File::Temp->new->filename;
728 open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
729 print $FILE $sequence;
733 system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
734 die "Reformatting of $infile failed. exit...\n" if ($?);
738 system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
739 die "Reformatting of $infile failed. exit...\n" if ($?);
742 die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
745 ############################################################################################################################################
746 # convertor BLC -> MSF
748 my ( $in, $out ) = @_;
750 my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
752 open my $fh, ">$tmp" or die "$tmp: $!\n";
753 print $fh "silent_mode\nblock_file $infile\n";
754 print $fh "output_file /dev/null\nmax_nseq 2000\n";
755 print $fh "pir_save $tmp_pir\nsetup\n";
759 system("$alscript -s -f $tmp");
760 die "Reformatting of $infile failed. exit...\n" if ($?);
761 system("$readseq -f=MSF -o=$out -a $tmp_pir");
767 #####################################################################################################
771 =head2 fasta2concise ($infile, $outfile)
773 Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
781 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
782 my ( $seq, @seqs, @title );
803 if ( @title != @seqs ) {
804 die("non matching number of titles and sequences!\n");
807 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
808 foreach ( 0 .. $#title ) {
809 print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
816 =head2 concise2fasta ($infile, $outfile)
818 Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
822 #####################################################################################################
827 my ( @seqs, %seq, @pred, %pred );
830 "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
831 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
834 # parse input concise file
835 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
838 my ( $id, $seq ) = split( ":", $_ );
839 if ( defined $id and defined $seq ) { # Check we have proper values
842 if ( $id =~ /;/ ) { # Then it's an alignment
843 @_ = split( ";", $id );
845 $seq{ $_[1] } = $seq;
847 foreach my $v (@var) {
858 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
860 # print out sequences found with PSI-BLAST
862 $seq{$_} =~ s/(.{72})/$1\n/g;
863 print $OUT ">$_\n$seq{$_}\n";
866 # print out predictions
867 foreach my $p (@pred) {
868 $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
869 $pred{$p} =~ s/B/E/g;
870 $pred{$p} =~ s/G/H/g;
873 $pred{$p} =~ s/E/B/g;
875 $pred{$p} =~ s/(.{72})/$1\n/g;
876 print $OUT ">$p\n$pred{$p}\n";
883 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
885 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
890 #####################################################################################################
891 sub remove_seq_masks {
895 # Only bother if we have a sequence which has been masked
896 return unless grep { uc $_ eq 'X' } @{ $seq->align };
898 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
899 my $searchdb = $database->{$db_entry}{unfiltered};
900 $start--; # We need this to be zero based
902 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
903 my @seqs = $f->get_entries;
904 my $db = shift @seqs;
907 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
909 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
912 my @db_seq = @{ $db->seq };
914 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
916 for ( 0 .. $#align ) {
917 next if $align[$_] =~ /$GAP/;
919 if ( $align[$_] eq 'X' ) {
920 unless ( defined $db_seq[ $i + $start ] ) {
921 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
923 $align[$_] = $db_seq[ $i + $start ];
927 $seq->align( \@align );
930 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
932 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
934 Equivilent to reduce script.
938 #####################################################################################################
940 my ( $max, @seqs ) = @_;
942 my $num = int( @seqs / $max );
943 my @res = shift @seqs;
945 # Don't bother if we have less than the required sequences
946 if ( @seqs < $max or 0 == $num ) {
950 for ( 0 .. $#seqs ) {
951 push @res, $seqs[$_] if ( 0 == $_ % $num );
957 =head2 nonred($cutoff, @PSISEQS);
959 Removes redundant sequences at $cutoff level of sequence identity.
961 Equivilent to nonred script.
965 #####################################################################################################
967 my ( $cutoff, @seqs ) = @_;
969 # Run pairwise and then OC and remove similar sequences
970 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
971 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
973 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
977 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
978 my $fasta = FASTA::File->new;
980 my $f = FASTA->new( id => $_->id );
981 $f->seq( @{ $_->align } );
982 $fasta->add_entries($f);
986 my $pair = Pairwise->new( path => $pairwise );
987 my $foo = join "\n", $pair->run($fasta) or die $!;
990 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
991 $ocres->read_string( join "", $ocres->run($foo) );
993 # Now found those entries that are unrelated
995 for ( $ocres->get_groups ) {
996 my ( $group, $score, $size, @labels ) = @{$_};
998 # Keep the QUERY from the cluster, otherwise just get onelt
999 if ( grep { /QUERY/ } @labels ) {
1001 # Make sure the query stays at the start
1002 unshift @keep, "QUERY";
1005 push @keep, shift @labels;
1008 push @keep, $ocres->get_unclustered;
1010 # Return the matching entries.
1011 # We use the temporay to mimic the selection of sequences in the nonred
1012 # script, which took the sequences in the order they occured in the file.
1016 if ( grep { $_ =~ /^$label$/ } @keep ) {
1017 push @filtered_res, $_;
1021 return @filtered_res;
1024 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
1026 Removes those sequences that are more than $percentage longer or shorter than the first
1027 sequence in @seqs. @seqs is an array of PSISEQ objects.
1029 Equivilent to trim_seqs script.
1033 #####################################################################################################
1035 my ( $factor, @seqs ) = @_;
1037 # Keep the query sequence (we assume query is the 1st FASTA record)
1038 my $query = shift @seqs;
1041 # Find the length of the query sequence without any gaps
1042 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
1044 # Calculate the upper and lower allowed bounds
1045 my ( $upper, $lower );
1047 my $bounds = ( $length / 100 ) * $factor;
1048 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
1051 # Now try each sequence and see how long they are
1053 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
1054 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
1060 =head2 toupper ($PSISEQ)
1062 Converts sequence held in PSISEQ object to uppercase.
1066 #####################################################################################################
1068 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
1071 =head2 degap($PSISEQ_QUERY, @PSISEQS)
1073 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
1075 Equivilent to fasta2jnet script.
1079 #####################################################################################################
1083 # Find where the gaps in the query sequence are
1086 for ( @{ $seqs[0]->align } ) {
1087 push @gaps, $i if $_ !~ /$GAP/;
1091 return unless @gaps;
1093 # Remove the gaps in all the sequences.
1094 # Derefences the array reference and takes a slice inside the method call
1095 # arguments, then coerce back to an array ref.
1097 $_->align( [ @{ $_->align }[@gaps] ] );
1101 =head2 getfreq($filename, @PSISEQS)
1103 Creates a PSIBLAST like profile and writes it to the $filename.
1105 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
1106 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
1107 but it's *not* the same.
1111 #####################################################################################################
1113 my ( $fn, @seqs ) = @_;
1115 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
1117 # Create a PSIBLAST like profile from the alignment
1118 # This doesn't greate the same result as old Jpred, I think this is because
1119 # differences in the input between old and new
1122 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
1123 # my @profile = profile(
1124 # map { join "", @{ $_->seq } } $f->get_entries
1127 my @profile = profile( map { join "", @{ $_->align } } @seqs );
1129 open my $fh, ">$fn" or die "JPRED: $fn: $!";
1130 print $fh join( " ", @{$_} ), "\n" for @profile;
1133 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
1135 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
1137 Equivilent to gethmm script.
1141 #####################################################################################################
1145 # Temp files required
1146 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
1148 # Create the FASTA file
1149 psiseq2fasta( $tmp_fasta, @seqs );
1152 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
1153 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1155 # Read in the HMMER file
1156 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1158 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1159 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1160 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
1165 =head2 jnet($hmmer, $out, $pssm )
1167 Runs Jnet. Pass the paths of the filenames to do the prediction on
1171 #################################################################################################################################################
1173 =head2 get_hmm($alignFile, $name)
1175 Adapted from the hmmer() function in Jpred as written by JDB.
1177 Generates an HMMer profile output compatible with Jnet.
1179 Equivilent to gethmm script.
1184 my $fastafile = shift;
1186 # Temp files required
1187 print "hmm_for_align: fastafile = $fastafile\n";
1188 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
1190 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
1191 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
1193 # Read in the HMMER file
1194 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
1196 # Read in the HMMER file
1197 my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
1199 # Convert from HMMER::Profile to HMMER::Profile::Jnet
1200 my $hmmer_jnet = HMMER::Profile::Jnet->new;
1201 $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
1206 #####################################################################################################
1208 my ( $hmmer, $outfile, $pssm ) = @_;
1211 # run Jnet prediction with all the profile data
1212 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
1215 # run Jnet prediction with only HMMer and alignment data
1216 system("$jnet --concise --hmmer $hmmer > $outfile");
1218 check( "jnet", $? ) or exit 1;
1221 open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1223 if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
1230 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
1231 print $OUT "\n" . $res;
1236 =head1 psiseq2fasta($path, @PSISEQ)
1238 Writes a FASTA file given an array of PSISEQ objects.
1242 #####################################################################################################
1244 my ( $fn, @seqs ) = @_;
1246 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
1247 confess "JPRED: not passed PSISEQs" unless @seqs;
1249 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1251 # Changed from using FASTA::File object due to seg faulting on some large,
1252 # long alignments, presumably due to running out of memory.
1253 # my $fasta = FASTA::File->new;
1254 # $fasta->add_entries(
1256 # my $f = FASTA->new(id => $_->id);
1257 # $f->seq(@{$_->align});
1261 # $fasta->write_file($fn);
1265 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
1266 : ( open $fh, ">$fn" or die "JPRED: $!" );
1269 print $fh ">", $_->id, "\n";
1270 my $seq = join( "", @{ $_->align } );
1271 $seq =~ s/(.{72})/$1\n/g;
1273 print $fh $seq . "\n";
1278 =head1 @PSISEQ = fasta2psiseq($path)
1280 Reads in a FASTA file and returns an array of PSISEQ objects.
1284 #####################################################################################################
1290 ? FASTA::File->new( read_gzip_file => $fn )
1291 : FASTA::File->new( read_file => $fn );
1292 $fasta or die "JPRED: $!";
1295 for ( $fasta->get_entries ) {
1296 my $seq = PSISEQ->new;
1298 $seq->align( [ @{ $_->seq } ] );
1305 #####################################################################################################
1306 sub fasta2psiseq_lowmemory {
1307 my $filename = shift;
1311 $filename =~ /\.gz$/
1312 ? ( open $fh, "gzip -dc $filename|" or die $! )
1313 : ( open $fh, $filename or die $! );
1320 $seq =~ s/ //g if defined $seq;
1322 if ( $id and $seq ) {
1323 my $psi = PSISEQ->new( id => $id );
1324 $psi->align( [ split //, $seq ] );
1335 if ( $id and $seq ) {
1336 my $psi = PSISEQ->new( id => $id );
1344 #####################################################################################################
1346 =head1 extend($FASTA::File, @PSISEQ)
1348 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
1349 against the query, this function extends the alignment so that all of the query is
1350 present in the alignment. The other sequences are extended with gap characters.
1355 my ( $query, @seqs ) = @_;
1357 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
1358 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
1360 # Get the query sequence
1361 my $q_query = $query->get_entry(0);
1363 # Get the query sequence from the PSIBLAST alignment
1364 my $a_query = shift @seqs;
1366 # The gap size required to expand the alignment
1367 my ( $start_gap_length, $end_gap_length );
1370 # Make handling the sequence easier
1371 my $q_seq = join "", @{ [ $q_query->seq ] };
1372 my $a_seq = join "", @{ $a_query->align };
1377 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
1379 $start_gap_length = index( $q_seq, $a_seq );
1380 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1382 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1385 # Add the gaps to the end of the alignments
1387 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1390 # Put the query sequence back
1395 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1396 @{ $a_query->align },
1397 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1404 #####################################################################################################
1406 =head1 $int = fasta_seq_length($path)
1408 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1412 sub fasta_seq_length {
1414 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1417 local $_, $/ = "\n";