5 jpred - Secondary structure prediction program
9 ./jpred.pl -in/-sequence <FILE1> [-out/-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-verbose] [-debug] [-help] [-man]
13 This is a program which predicts the secondary structure of a protein sequence given a path to a
14 FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
16 The program is primarily design for use by the Jpred server, but it can be used directly by any user.
17 Some of the output may not be meaningful if used directly - it can be safely ignored.
23 =item -in/-sequence FILE
25 The path to the sequence file (in FASTA format) you want to predict.
27 =item -out/-output FILEPREFIX
29 A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
33 Path to the uniref database used for PSI-BLAST querying.
36 =item -dbname database
38 Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
43 Path to a PSIBLAST output file.
47 Number of CPU used by jpred.pl. Maximum value is 8
52 Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
56 Untoggle final step of Jpred prediction
60 Verbose mode. Print more information while running jpred.
64 Debug mode. Print debugging information while running jpred.
68 Gives help on the programs usage.
72 Displays this man page.
78 Can't cope with gaps in the query sequence.
80 Doesn't accept alignments.
82 If you find any others please contact authors.
86 Jonathan Barber <jon@compbio.dundee.ac.uk>
87 Chris Cole <christian@cole.name>
88 Alexander Sherstnev <a.sherstnev@dundee.ac.uk> (current maintainer)
92 ## TODO check for gaps in query sequence
93 ## check for disallowed chars in sequence
105 # path to Jpred modules
106 use FindBin qw($Bin);
109 # internal jpred modules
111 use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
113 use HMMER::Profile::Jnet;
122 use Utils qw(profile);
125 our $VERSION = '3.0.2';
126 my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed
127 my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy
129 # Gap characters in sequences
132 #####################################################################################################
133 # Light object to hold sequence information, light to prevent memory overload
134 # for big search results. Even so this is quite memory intensive, I can't see
135 # how to reduce the size further.
141 my ( $class, %args ) = @_;
144 for ( keys %args ) { $self->$_( $args{$_} ) }
161 if ( defined $_[1] ) {
162 ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
164 $_[0]->[2] = join( "", @{ $_[1] } );
166 return [ split( //, $_[0]->[2] ) ];
171 #####################################################################################################
175 my $ncpu = 1; # number of CPUs for psiblast calculations
176 my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
178 my $db_entry = "unknown";
181 my ( $help, $man, $DEBUG, $VERBOSE );
184 "in|sequence=s" => \$infile,
185 "out|output=s" => \$output,
186 "psi=s" => \$psiblast,
187 "dbname=s" => \$db_entry,
188 "dbpath=s" => \$db_path,
190 "pred-nohits" => \$predNoHits,
191 "no-final" => \$nofinal,
195 "verbose" => \$VERBOSE
197 pod2usage(1) if $help;
198 pod2usage( verbose => 2 ) if $man;
200 #####################################################################################################
201 # Key to database information and information for accessing them
204 ## default database used by Jpred
206 database => $db_path . "/uniref90.filt",
207 unfiltered => $db_path . "/uniref90",
210 ## database used during Jnet training
212 database => $db_path . "/training/uniref90.filt",
213 unfiltered => $db_path . "/training/uniref90",
216 ## database used for ported Jpred
218 database => $db_path . "/uniref90.filt",
219 unfiltered => $db_path . "/uniref90",
221 ## cluster-specific path for Jpred
223 database => "/homes/www-jpred/databases/uniref90.filt",
224 unfiltered => "/homes/www-jpred/databases/uniref90",
227 ## these other DBs are experimental ones used during development.
228 ## check they exist before using them.
231 # generic entry for use with validate_jnet.pl
232 # real db location defined by validate_jnet.pl
233 database => $db_path . "/swall/swall.filt",
234 unfiltered => $db_path . "/swall/swall",
239 # Path to PSIBLAST db
240 database => $db_path . "/3/swall.filt",
241 unfiltered => $db_path . "/3/swall",
245 database => $db_path . "/6/swall.filt",
246 unfiltered => $db_path . "/6/swall",
250 pod2usage(' -sequence argument not provided') unless $infile;
251 die "-db $db_entry not recognised" unless exists $database->{$db_entry};
252 $output = $infile . ".res" unless $output;
253 $ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
255 for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
256 my ( $key, $value ) = @{$_};
257 defined $value or next;
258 errlog( join( ": ", $key, $value ), "\n" );
261 #####################################################################################################
262 my $platform = check_OS();
263 print "JPRED: checking platiform... $platform\n" if $DEBUG;
265 #####################################################################################################
266 my $query = FASTA::File->new( read_file => $infile );
269 my $psi = PSIBLAST->new;
270 unless ( defined $psiblast && -e $psiblast ) {
271 ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
272 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
273 # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
274 # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
275 my $psi_run = PSIBLAST::Run->new(
277 args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
278 path => $psiblastbin,
280 output => "$output.blast",
281 matrix => "$output.profile",
282 database => $database->{$db_entry}{database},
285 # For reduced databases, get the size of the orginal and use this as
286 # the equivilent DB size
287 if ( $db_entry =~ /^sp_red_/ ) {
288 ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
290 $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
293 print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
294 print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
296 print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
297 $psi_run->run or die; # CC sub is from PSIBLAST::Run
299 $psi->read_file("$output.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
300 system("gzip -9f $output.blast");
302 if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
303 else { $psi->read_file($psiblast) } # CC ditto above
306 #####################################################################################################
307 # Convert the last itteration into a collection of PSISEQ objects
308 for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
309 my ( $id, $start, $align ) = @{$_};
314 align => [ split( //, $align ) ]
318 #####################################################################################################
319 ## When there are no PSI-BLAST hits generate an HMM from the query
320 ## and run Jnet against that only.
321 ## Accuracy won't be as good, but at least it's a prediction
323 if ( $predNoHits == 0 ) {
324 warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
325 print ">>100% complete\n";
328 print ">>50% complete\n";
329 warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
330 print "Running HMMer on query...\n" if $VERBOSE;
332 # copy input query to alignment
333 #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
334 open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
335 open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
336 while (<$ifh>) { print $ofh, $_ }
340 # Temp files required for HMMer
341 my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
343 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
344 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
346 # Read in the HMMER file
347 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
349 # Convert from HMMER::Profile to HMMER::Profile::Jnet
350 my $hmmer_jnet = HMMER::Profile::Jnet->new;
351 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
352 $hmmer_jnet->write_file("$output.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
353 print ">>70% complete\n";
355 # Run Jnet for the prediction
356 print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
357 jnet( map { "$output.$_" } qw(hmm jnet) ); # CC sub is below
360 psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG; # CC sub is below
361 print ">>40% complete\n";
363 #####################################################################################################
364 # Make PSIBLAST truncated alignments the right length
365 print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
366 @seqs = extend( $query, @seqs ); # CC sub si below
367 psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
369 #####################################################################################################
370 # Remove masking from PSIBLAST alignment
371 print "Unmasking the alignments...\n" if $VERBOSE;
372 my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
373 remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
374 psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
376 #####################################################################################################
377 # Convert the sequences to upper case
378 print "Converting sequences to the same case...\n" if $VERBOSE;
379 toupper($_) for @seqs; # CC sub is below
380 psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
382 #####################################################################################################
383 # Remove excessive sequences
384 print "Remove excessive sequences...\n" if $VERBOSE;
385 @seqs = reduce( $MAX_ALIGN, @seqs );
386 psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
388 #####################################################################################################
389 # Remove sequences that are too long or too short
390 print "Remove sequences which too long or short...\n" if $VERBOSE;
391 @seqs = del_long_seqs( 50, @seqs );
392 psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
394 #####################################################################################################
395 # Remove redundant sequences based upon pairwise identity and OC clustering
396 print "Remove redundant sequences...\n" if $VERBOSE;
397 @seqs = nonred( $NR_CUT, @seqs );
398 psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
400 #####################################################################################################
401 # Check that we haven't got rid of all of the sequences
403 warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
404 @seqs = fasta2psiseq("${output}_backup.fasta.gz");
406 unlink("${output}_backup.fasta.gz") unless $DEBUG;
408 # Remove gaps in the query sequence and same positions in the alignment
409 print "Removing gaps in the query sequence...\n" if $VERBOSE;
411 psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
413 # Output the alignment for the prediction
414 print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
415 psiseq2fasta( "$output.align", @seqs );
417 #####################################################################################################
418 # Equivilent to getpssm script
419 print "Output the PSSM matrix from the PSI-BLAST profile...\n";
420 my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
421 $pssm->write_file("$output.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
422 print ">>50% complete\n";
424 #####################################################################################################
425 # Run HMMER on the sequences
426 print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
427 my $hmmer = hmmer(@seqs); # CC sub is below
428 $hmmer->write_file("$output.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
429 print ">>70% complete\n";
431 #####################################################################################################
432 # Run Jnet for the prediction
433 print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
434 jnet( map { "$output.$_" } qw(hmm jnet pssm) ); # CC sub is below
437 print ">>100% complete\n";
439 unless ( defined $nofinal ) {
440 my $aligfile = $output.".align";
441 my $jnetfile = $output.".jnet";
442 my $jnetfastafile = $output.".jnet.fasta";
443 my $resufile = $output.".res";
444 concise2fasta($jnetfile, $jnetfastafile);
445 open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
446 open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
447 open( my $OUT, ">", $resufile ) or die "ERROR! unable to open '$resufile': ${!}\nDied";
448 while (<$IN2>) {print $OUT $_;}
449 while (<$IN1>) {print $OUT $_;}
455 print "Jpred Finished\n";
458 #####################################################################################################
460 #####################################################################################################
462 #####################################################################################################
467 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
468 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
470 my ( $seq, @seqs, @title );
490 if ( @title != @seqs ) { die("non matching number of titles and sequences!\n"); }
492 foreach ( 0 .. $#title ) {
493 print "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
499 #####################################################################################################
504 my ( @seqs, %seq, @pred, %pred );
507 "Lupas_21", "Lupas_14", "Lupas_28", "JNETPSSM", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETFREQ",
508 "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
511 open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
512 open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
516 my ( $id, $seq ) = split( ":", $_ );
517 if ( !$id || !$seq ) { next; } # Check we have proper values
520 if ( $id =~ /;/ ) { # Then it's an alignment
521 @_ = split( ";", $id );
523 $seq{ $_[1] } = $seq;
535 $seq{$_} =~ s/(.{72})/$1\n/g;
536 print $OUT ">$_\n$seq{$_}\n";
540 $pred{$_} =~ s/[TCYWXZSI\?_]/-/g;
541 $pred{$_} =~ s/B/E/g;
542 $pred{$_} =~ s/G/H/g;
544 if (/SOL/) { $pred{$_} =~ s/E/B/g; }
545 $pred{$_} =~ s/(.{72})/$1\n/g;
546 print $OUT ">$_\n$pred{$_}\n";
553 =head2 $PSISEQ = remove_seq_masks($PSISEQ)
555 Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database.
560 #####################################################################################################
561 sub remove_seq_masks {
565 # Only bother if we have a sequence which has been masked
566 return unless grep { uc $_ eq 'X' } @{ $seq->align };
568 my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } );
569 my $searchdb = $database->{$db_entry}{unfiltered};
570 $start--; # We need this to be zero based
572 my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return;
573 my @seqs = $f->get_entries;
574 my $db = shift @seqs;
577 confess "JPRED: remove_seq_masks: Accession number $id returned no sequences";
579 confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence";
582 my @db_seq = @{ $db->seq };
584 # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence
586 for ( 0 .. $#align ) {
587 next if $align[$_] =~ /$GAP/;
589 if ( $align[$_] eq 'X' ) {
590 unless ( defined $db_seq[ $i + $start ] ) {
591 croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!";
593 $align[$_] = $db_seq[ $i + $start ];
597 $seq->align( \@align );
600 =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
602 Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
604 Equivilent to reduce script.
608 #####################################################################################################
610 my ( $max, @seqs ) = @_;
612 my $num = int( @seqs / $max );
613 my @res = shift @seqs;
615 # Don't bother if we have less than the required sequences
616 if ( @seqs < $max or 0 == $num ) {
620 for ( 0 .. $#seqs ) {
621 push @res, $seqs[$_] if ( 0 == $_ % $num );
627 =head2 nonred($cutoff, @PSISEQS);
629 Removes redundant sequences at $cutoff level of sequence identity.
631 Equivilent to nonred script.
635 #####################################################################################################
637 my ( $cutoff, @seqs ) = @_;
639 # Run pairwise and then OC and remove similar sequences
640 unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" }
641 unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
643 warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
647 # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version).
648 my $fasta = FASTA::File->new;
650 my $f = FASTA->new( id => $_->id );
651 $f->seq( @{ $_->align } );
652 $fasta->add_entries($f);
656 my $pair = Pairwise->new( path => $pairwise );
657 my $foo = join "\n", $pair->run($fasta) or die $!;
660 my $ocres = OC->new( path => "$oc sim complete cut $cutoff" );
661 $ocres->read_string( join "", $ocres->run($foo) );
663 # Now found those entries that are unrelated
665 for ( $ocres->get_groups ) {
666 my ( $group, $score, $size, @labels ) = @{$_};
668 # Keep the QUERY from the cluster, otherwise just get onelt
669 if ( grep { /QUERY/ } @labels ) {
671 # Make sure the query stays at the start
672 unshift @keep, "QUERY";
675 push @keep, shift @labels;
678 push @keep, $ocres->get_unclustered;
680 # Return the matching entries.
681 # We use the temporay to mimic the selection of sequences in the nonred
682 # script, which took the sequences in the order they occured in the file.
686 if ( grep { $_ =~ /^$label$/ } @keep ) {
687 push @filtered_res, $_;
691 return @filtered_res;
694 =head2 @seqs = del_long_seqs($percentage, @PSISEQS)
696 Removes those sequences that are more than $percentage longer or shorter than the first
697 sequence in @seqs. @seqs is an array of PSISEQ objects.
699 Equivilent to trim_seqs script.
703 #####################################################################################################
705 my ( $factor, @seqs ) = @_;
707 # Keep the query sequence (we assume query is the 1st FASTA record)
708 my $query = shift @seqs;
711 # Find the length of the query sequence without any gaps
712 my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g );
714 # Calculate the upper and lower allowed bounds
715 my ( $upper, $lower );
717 my $bounds = ( $length / 100 ) * $factor;
718 ( $upper, $lower ) = ( $length + $bounds, $length - $bounds );
721 # Now try each sequence and see how long they are
723 my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g );
724 if ( $l >= $lower && $l <= $upper ) { push @res, $_ }
730 =head2 toupper ($PSISEQ)
732 Converts sequence held in PSISEQ object to uppercase.
736 #####################################################################################################
738 $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] );
741 =head2 degap($PSISEQ_QUERY, @PSISEQS)
743 Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
745 Equivilent to fasta2jnet script.
749 #####################################################################################################
753 # Find where the gaps in the query sequence are
756 for ( @{ $seqs[0]->align } ) {
757 push @gaps, $i if $_ !~ /$GAP/;
763 # Remove the gaps in all the sequences.
764 # Derefences the array reference and takes a slice inside the method call
765 # arguments, then coerce back to an array ref.
767 $_->align( [ @{ $_->align }[@gaps] ] );
771 =head2 getfreq($filename, @PSISEQS)
773 Creates a PSIBLAST like profile and writes it to the $filename.
775 Equivilant to profile or getfreq (older) script. This doesn't create the same answer as
776 the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output,
777 but it's *not* the same.
781 #####################################################################################################
783 my ( $fn, @seqs ) = @_;
785 #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs;
787 # Create a PSIBLAST like profile from the alignment
788 # This doesn't greate the same result as old Jpred, I think this is because
789 # differences in the input between old and new
792 # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
793 # my @profile = profile(
794 # map { join "", @{ $_->seq } } $f->get_entries
797 my @profile = profile( map { join "", @{ $_->align } } @seqs );
799 open my $fh, ">$fn" or die "JPRED: $fn: $!";
800 print $fh join( " ", @{$_} ), "\n" for @profile;
803 =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
805 Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
807 Equivilent to gethmm script.
811 #####################################################################################################
815 # Temp files required
816 my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3;
818 # Create the FASTA file
819 psiseq2fasta( $tmp_fasta, @seqs );
822 system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
823 system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
825 # Read in the HMMER file
826 my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
828 # Convert from HMMER::Profile to HMMER::Profile::Jnet
829 my $hmmer_jnet = HMMER::Profile::Jnet->new;
830 $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
835 =head2 jnet($hmmer, $out, $pssm )
837 Runs Jnet. Pass the paths of the filenames to do the prediction on
841 #####################################################################################################
843 my ( $hmmer, $outfile, $pssm ) = @_;
846 # run Jnet prediction with all the profile data
847 system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
850 # run Jnet prediction with only HMMer and alignment data
851 system("$jnet --concise --hmmer $hmmer > $outfile");
853 check( "jnet", $? ) or exit 1;
856 =head1 psiseq2fasta($path, @PSISEQ)
858 Writes a FASTA file given an array of PSISEQ objects.
862 #####################################################################################################
864 my ( $fn, @seqs ) = @_;
866 #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
867 confess "JPRED: not passed PSISEQs" unless @seqs;
869 #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
871 # Changed from using FASTA::File object due to seg faulting on some large,
872 # long alignments, presumably due to running out of memory.
873 # my $fasta = FASTA::File->new;
874 # $fasta->add_entries(
876 # my $f = FASTA->new(id => $_->id);
877 # $f->seq(@{$_->align});
881 # $fasta->write_file($fn);
885 ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" )
886 : ( open $fh, ">$fn" or die "JPRED: $!" );
889 print $fh ">", $_->id, "\n";
890 my $seq = join( "", @{ $_->align } );
891 $seq =~ s/(.{72})/$1\n/g;
893 print $fh $seq . "\n";
898 =head1 @PSISEQ = fasta2psiseq($path)
900 Reads in a FASTA file and returns an array of PSISEQ objects.
904 #####################################################################################################
910 ? FASTA::File->new( read_gzip_file => $fn )
911 : FASTA::File->new( read_file => $fn );
912 $fasta or die "JPRED: $!";
915 for ( $fasta->get_entries ) {
916 my $seq = PSISEQ->new;
918 $seq->align( [ @{ $_->seq } ] );
925 #####################################################################################################
926 sub fasta2psiseq_lowmemory {
927 my $filename = shift;
932 ? ( open $fh, "gzip -dc $filename|" or die $! )
933 : ( open $fh, $filename or die $! );
940 $seq =~ s/ //g if defined $seq;
942 if ( $id and $seq ) {
943 my $psi = PSISEQ->new( id => $id );
944 $psi->align( [ split //, $seq ] );
955 if ( $id and $seq ) {
956 my $psi = PSISEQ->new( id => $id );
964 =head1 extend($FASTA::File, @PSISEQ)
966 Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
967 against the query, this function extends the alignment so that all of the query is
968 present in the alignment. The other sequences are extended with gap characters.
972 #####################################################################################################
974 my ( $query, @seqs ) = @_;
976 #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
977 #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
979 # Get the query sequence
980 my $q_query = $query->get_entry(0);
982 # Get the query sequence from the PSIBLAST alignment
983 my $a_query = shift @seqs;
985 # The gap size required to expand the alignment
986 my ( $start_gap_length, $end_gap_length );
989 # Make handling the sequence easier
990 my $q_seq = join "", @{ [ $q_query->seq ] };
991 my $a_seq = join "", @{ $a_query->align };
996 ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq;
998 $start_gap_length = index( $q_seq, $a_seq );
999 croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
1001 $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
1004 # Add the gaps to the end of the alignments
1006 $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] );
1009 # Put the query sequence back
1014 ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ),
1015 @{ $a_query->align },
1016 ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ),
1023 =head1 $int = fasta_seq_length($path)
1025 Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
1029 #####################################################################################################
1030 sub fasta_seq_length {
1032 open my $fh, $fn or croak "Can't open file $fn: $!\n";
1035 local $_, $/ = "\n";
1045 =head1 log(@messages)
1047 Report messages to STDERR
1051 #####################################################################################################
1052 sub errlog { print STDERR @_ }