#!/usr/bin/perl =head1 NAME jpred - Secondary structure prediction program =head1 SYNOPSIS ./jpred.pl -in [-outfile ] [-logfile ] [-output ] [-dbname ] [-dbpath ] [-ncpu NNN] [-psi ] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man] =head1 DESCRIPTION This is a program for predicting the secondary structure of a multiple sequence alignment or a protein sequence. The input file can be stored in 3 formats: FASTA, MSF, or BLC. For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet. =head1 OPTIONS =over 5 =item -in The path to the sequence file (in FASTA, MSF, or BLC format) =item -output A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in. =item -outfile An output file, by default it is undefined and the -output option is working =item -logfile Logging file, by default it is undefined and logging switched off =item -dbpath PATH Path to the uniref database used for PSI-BLAST querying. default value: . =item -dbname database Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Default value: uniref90 =item -psi path Path to a PSIBLAST output file. =item -ncpu NNN Number of CPU used by jpred.pl. Maximum value is 8 Default: NNN = 1 =item -pred-nohits Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits. =item -no-final By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features The option untoggles this step and stops jpred at the concise file generation =item -jabaws Redefines some basic variables in order to use the script within JABAWS =item -verbose Verbose mode. Print more information while running jpred. =item -debug Debug mode. Print debugging information while running jpred. =item -help Gives help on the programs usage. =item -man Displays this man page. =back =head1 BUGS Can't cope with gaps in the query sequence. Doesn't accept alignments. If you find any others please contact authors. =head1 AUTHORS Jonathan Barber Chris Cole Alexander Sherstnev (current maintainer) =cut ## TODO check for gaps in query sequence ## check for disallowed chars in sequence use strict; use warnings; use Getopt::Long; use Pod::Usage; use Carp; use Data::Dumper; use File::Temp; # path to Jpred modules use FindBin qw($Bin); use lib "$Bin/lib"; # internal jpred modules use Jpred; use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env); use HMMER::Profile; use HMMER::Profile::Jnet; use PSIBLAST; use PSIBLAST::PSSM; use PSIBLAST::Run; use FASTA::File; use FASTA; use Index; use Pairwise; use OC; use Utils qw(profile); use Run qw(check); our $VERSION = '3.0.2'; my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy # Gap characters in sequences our $GAP = '-'; ##################################################################################################### # Light object to hold sequence information, light to prevent memory overload # for big search results. Even so this is quite memory intensive, I can't see # how to reduce the size further. { package PSISEQ; sub new { my ( $class, %args ) = @_; my $self = []; bless $self, $class; for ( keys %args ) { $self->$_( $args{$_} ) } return $self; } sub id { return defined $_[1] ? $_[0]->[0] = $_[1] : $_[0]->[0]; } sub start { return defined $_[1] ? $_[0]->[1] = $_[1] : $_[0]->[1]; } sub align { if ( defined $_[1] ) { ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref"; $_[0]->[2] = join( "", @{ $_[1] } ); } else { return [ split( //, $_[0]->[2] ) ]; } } } ##################################################################################################### my $infile; my $infastafile; my $format; my $goal = "align"; my $seqgoal; my $outfile; my $prefix; my $psiblast; my $jabaws; my $logfile; my $ncpu = 1; # number of CPUs for psiblast calculations my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found my $db_path = "/homes/www-jpred/databases"; my $db_entry = "cluster"; my $nofinal; my ( $help, $man, $DEBUG, $VERBOSE ); GetOptions( "in=s" => \$infile, "output=s" => \$prefix, "outfile=s" => \$outfile, "logfile=s" => \$logfile, "psi=s" => \$psiblast, "dbname=s" => \$db_entry, "dbpath=s" => \$db_path, "ncpu=s" => \$ncpu, "pred-nohits" => \$predNoHits, "no-final" => \$nofinal, "jabaws" => \$jabaws, "help" => \$help, "man" => \$man, "debug" => \$DEBUG, "verbose" => \$VERBOSE ) or pod2usage(2); pod2usage(1) if $help; pod2usage( verbose => 2 ) if $man; ##################################################################################################### # Key to database information and information for accessing them my $database = { ## default database used by Jpred uniref90 => { database => $db_path . "/uniref90.filt", unfiltered => $db_path . "/uniref90", }, ## database used during Jnet training training => { database => $db_path . "/training/uniref90.filt", unfiltered => $db_path . "/training/uniref90", }, ## database used for ported Jpred ported_db => { database => $db_path . "/uniref90.filt", unfiltered => $db_path . "/uniref90", }, ## cluster-specific path for Jpred cluster => { database => $db_path . "/uniref90.filt", unfiltered => $db_path . "/uniref90", }, ## these other DBs are experimental ones used during development. ## check they exist before using them. # generic entry for use with validate_jnet.pl # real db location defined by validate_jnet.pl swall => { database => $db_path . "/swall/swall.filt", unfiltered => $db_path . "/swall/swall", }, # Path to PSIBLAST db uniprot => { database => $db_path . "/3/swall.filt", unfiltered => $db_path . "/3/swall", }, uniref50 => { database => $db_path . "/6/swall.filt", unfiltered => $db_path . "/6/swall", }, }; my $dpf = $database->{$db_entry}{database}.'.pal'; my $dpu = $database->{$db_entry}{unfiltered}.'.pal'; pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile; pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry}; pod2usage("ERROR! UNIREF filtered database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf; pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu; ##################################################################################################### # lots of preparation steps if ( defined $prefix ) { unless ( defined $outfile ) { $outfile = $prefix . ".res.fasta"; } } else { if ( defined $outfile ) { print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n"; $prefix = $outfile . ".tmp"; } else { print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n"; $prefix = $infile . ".res"; $outfile = $prefix . ".jnet"; } } if ( 1 > $ncpu or 8 < $ncpu ) { print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n"; $ncpu = 1; } $VERBOSE = 1 if $DEBUG; my $LOG; if ( defined $logfile ) { open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied"; } for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) { my ( $key, $value ) = @{$_}; defined $value or next; my $par = join( ": ", $key, $value ); print "$par\n" if $DEBUG; print $LOG "$par\n" if $LOG; } if ( defined $jabaws ) { setup_jpred_env($Bin); setup_env($Bin); } my $platform = check_OS(); print "JPRED: checking platiform... $platform\n" if $DEBUG; print $LOG "JPRED: checking platiform... $platform\n" if $LOG; ##################################################################################################### # check input file format my $nseq = check_FASTA_format($infile); if ( 0 < $nseq ) { $format = "fasta"; if ( 1 == $nseq ) { # one FASTA record $goal = 'seq'; } else { unless ( 0 < check_FASTA_alignment($infile)) { die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n"; } } } elsif ( 0 < check_MSF_format($infile) ) { $format = "msf"; } elsif ( 0 < check_BLC_format($infile) ) { $format = "blc"; } else { die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n"; } $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format ); ##################################################################################################### if ( 'seq' eq $goal ) { my $query = FASTA::File->new( read_file => $infile ); my @seqs; my $psi = PSIBLAST->new; unless ( defined $psiblast && -e $psiblast ) { ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3' my $psi_run = PSIBLAST::Run->new( debug => $DEBUG, args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3", path => $psiblastbin, input => $infile, output => "$prefix.blast", matrix => "$prefix.profile", database => $database->{$db_entry}{database}, ); # For reduced databases, get the size of the orginal and use this as # the equivilent DB size if ( $db_entry =~ /^sp_red_/ ) { ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/; $psi_run->args( $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below ); } print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG; print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG; print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG; print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG; print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE; print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG; $psi_run->run or die; # CC sub is from PSIBLAST::Run $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does? system("gzip -9f $prefix.blast"); } else { if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does? else { $psi->read_file($psiblast) } # CC ditto above } ##################################################################################################### # Convert the last itteration into a collection of PSISEQ objects for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm my ( $id, $start, $align ) = @{$_}; push @seqs, PSISEQ->new( id => $id, start => $start, align => [ split( //, $align ) ] ); } ##################################################################################################### # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only. # Accuracy won't be as good, but at least it's a prediction if ( @seqs == 0 ) { if ( $predNoHits == 0 ) { warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n"; print ">>100% complete\n"; print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG; print $LOG ">>100% complete\n" if $LOG; close($LOG) if $LOG; exit; } else { print ">>50% complete\n"; warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n"; print "Running HMMer on query...\n" if $VERBOSE; print $LOG ">>50% complete\n" if $LOG; print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG; print $LOG "Running HMMer on query...\n" if $LOG; # copy input query to alignment #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n"; open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!"; open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!"; while (<$ifh>) { print $ofh, $_ } close $ifh; close $ofh; # Temp files required for HMMer my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2; system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile"); system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); unlink $hmmbuild_out; # Read in the HMMER file my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); # Convert from HMMER::Profile to HMMER::Profile::Jnet my $hmmer_jnet = HMMER::Profile::Jnet->new; $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line; $hmmer_jnet->write_file("$prefix.hmm"); print ">>70% complete\n"; print $LOG ">>70% complete\n" if $LOG; # Run Jnet for the prediction print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE; print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG; my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); print ">>90% complete\n"; print $LOG ">>90% complete\n" if $LOG; print $jnetlog if $VERBOSE; print $LOG $jnetlog if $LOG; } } else { psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG; print ">>40% complete\n"; print $LOG ">>40% complete\n" if $LOG; ##################################################################################################### # Make PSIBLAST truncated alignments the right length print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE; print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG; @seqs = extend( $query, @seqs ); psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG; ##################################################################################################### # Remove masking from PSIBLAST alignment print "Unmasking the alignments...\n" if $VERBOSE; print $LOG "Unmasking the alignments...\n" if $LOG; my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found."; remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ]; psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG; ##################################################################################################### # Convert the sequences to upper case print "Converting sequences to the same case...\n" if $VERBOSE; print $LOG "Converting sequences to the same case...\n" if $LOG; toupper($_) for @seqs; # CC sub is below psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs ); ##################################################################################################### # Remove excessive sequences print "Remove excessive sequences...\n" if $VERBOSE; print $LOG "Remove excessive sequences...\n" if $LOG; @seqs = reduce( $MAX_ALIGN, @seqs ); psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG; ##################################################################################################### # Remove sequences that are too long or too short print "Remove sequences which too long or short...\n" if $VERBOSE; print $LOG "Remove sequences which too long or short...\n" if $LOG; @seqs = del_long_seqs( 50, @seqs ); psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG; ##################################################################################################### # Remove redundant sequences based upon pairwise identity and OC clustering print "Remove redundant sequences...\n" if $VERBOSE; print $LOG "Remove redundant sequences...\n" if $LOG; @seqs = nonred( $NR_CUT, @seqs ); psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG; ##################################################################################################### # Check that we haven't got rid of all of the sequences if ( @seqs < 2 ) { warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n"; print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG; @seqs = fasta2psiseq("${prefix}_backup.fasta.gz"); } unlink("${prefix}_backup.fasta.gz") unless $DEBUG; # Remove gaps in the query sequence and same positions in the alignment print "Removing gaps in the query sequence...\n" if $VERBOSE; print $LOG "Removing gaps in the query sequence...\n" if $LOG; degap(@seqs); psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG; # Output the alignment for the prediction print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE; print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG; psiseq2fasta( "$prefix.align", @seqs ); ##################################################################################################### # Equivilent to getpssm script print "Output the PSSM matrix from the PSI-BLAST profile...\n"; print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG; my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" ); $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM print ">>50% complete\n"; print $LOG ">>50% complete\n" if $LOG; ##################################################################################################### # Run HMMER on the sequences print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE; print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG; my $hmmer = hmmer(@seqs); # CC sub is below $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module print ">>70% complete\n"; print $LOG ">>70% complete\n" if $LOG; ##################################################################################################### # Run Jnet for the prediction print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE; print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG; my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below print ">>90% complete\n"; print $LOG ">>90% complete\n" if $LOG; print $jnetlog if $VERBOSE; print $LOG $jnetlog if $LOG; } } else { if ( 'fasta' eq $format ) { print "Read FASTA file\n"; my $hmmer1 = hmm_for_align($infile); $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module } elsif ( 'msf' eq $format ) { msf2fasta( $infile, $infastafile ); my $hmmer2 = hmm_for_align($infastafile); $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module } elsif ( 'blc' eq $format ) { my $tmpfile = File::Temp->new->filename; blc2msf( $infile, $tmpfile ); msf2fasta( $infile, $infastafile ); my $hmmer3 = hmm_for_align($infastafile); $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module } else { die "ERROR! unknown input format '$format'. exit...\n"; } print ">>70% complete\n"; print $LOG ">>70% complete\n" if $LOG; ##################################################################################################### # Run Jnet for the prediction print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE; print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG; my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below print ">>90% complete\n"; print $LOG ">>90% complete\n" if $LOG; print $jnetlog if $VERBOSE; print $LOG $jnetlog if $LOG; } unless ( defined $nofinal ) { my $aligfile = $prefix . ".align"; my $jnetfile = $prefix . ".jnet"; my $jnetfastafile = $prefix . ".jnet.fasta"; $aligfile = $infile if ( 'fasta' eq $format ); $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format ); concise2fasta( $jnetfile, $jnetfastafile ); open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied"; open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied"; open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; while (<$IN2>) { print $OUT $_; } while (<$IN1>) { print $OUT $_; } close($IN1); close($IN2); close($OUT); unlink $jnetfastafile; my $seqs = FASTA::File->new( read_file => $outfile ); open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n"; $seqs->write($OUT2); close($OUT2); } else { if ( defined $outfile and $prefix . ".jnet" ne $outfile ) { rename $prefix . ".jnet", $outfile; } } print ">>100% complete\n"; print "Jpred Finished\n"; print $LOG ">>100% complete\n" if $LOG; print $LOG "Jpred Finished\n" if $LOG; close($LOG) if $LOG; exit; ##################################################################################################### # Functions ##################################################################################################### sub check_FASTA_format { my $infile = shift; open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; my $check_first_line = 1; my $nseq = 0; local $/ = "\n>"; while (<$IN>) { if ($check_first_line) { return 0 unless (/^>/); $check_first_line = 0; } s/^>//g; s/>$//g; my ( $id, @seqs ) = split /\n/, $_; return 0 unless ( defined $id or @seqs ); my $seq = join( "", @seqs ); return 0 unless ( $seq =~ /[a-zA-Z\.-]/ ); ++$nseq; } close($IN); return $nseq; } ##################################################################################################### sub check_FASTA_alignment { my $infile = shift; open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; my $check_first_line = 1; my $nseq = 0; my $seqlen = -1; local $/ = "\n>"; while (<$IN>) { if ($check_first_line) { return 0 unless (/^>/); $check_first_line = 0; } s/^>//g; s/>$//g; my ( $id, @seqs ) = split /\n/, $_; return 0 unless ( defined $id or @seqs ); my $seq = join( "", @seqs ); return 0 unless ( $seq =~ /[a-zA-Z\.-]/ ); if (-1 == $seqlen) { $seqlen = length ($seq); } else { return 0 if ($seqlen != length ($seq) ); } ++$nseq; } close($IN); return $nseq; } ##################################################################################################### sub check_MSF_format { my $infile = shift; $? = 0; system("$readseq -fMSF -a -p < $infile > /dev/null"); return 0 if ($?); return 1; } ##################################################################################################### sub check_BLC_format { my $infile = shift; my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2; open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n"; print $fh "silent_mode\nblock_file $infile\n"; print $fh "output_file /dev/null\nmax_nseq 2000\n"; print $fh "pir_save $tmpfile2\nsetup\n"; close $fh; $? = 0; system("$alscript -s -f $tmpfile1"); $? = 0; system("$readseq -f8 -a -p < $tmpfile2 > /dev/null"); unlink $tmpfile1; unlink $tmpfile2; return 0 if ($?); return 1; } ############################################################################################################################################ # convertor BLC -> MSF sub msf2fasta { my $infile = shift; my $outfile = shift; my $sequence = ""; open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; while (<$IN>) { $sequence .= $_; } close($IN); $sequence =~ s/~/\./g; ## check for non-unique sequence names - readseq has problems with these my @lines = split /\n/, $sequence; my %names; foreach my $line (@lines) { if ( $line =~ /Name:\s+(\S+)/ ) { die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 ); $names{$1}++; } } my $tmpfile1 = File::Temp->new->filename; my $tmpfile2 = File::Temp->new->filename; open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!"; print $FILE $sequence; close $FILE; $? = 0; system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2"); die "Reformatting of $infile failed. exit...\n" if ($?); unlink $tmpfile1; $? = 0; system("$readseq -f8 -a -p < $tmpfile2 > $outfile"); die "Reformatting of $infile failed. exit...\n" if ($?); unlink $tmpfile2; die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile ); } ############################################################################################################################################ # convertor BLC -> MSF sub blc2msf { my ( $in, $out ) = @_; my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2; open my $fh, ">$tmp" or die "$tmp: $!\n"; print $fh "silent_mode\nblock_file $infile\n"; print $fh "output_file /dev/null\nmax_nseq 2000\n"; print $fh "pir_save $tmp_pir\nsetup\n"; close $fh; $? = 0; system("$alscript -s -f $tmp"); die "Reformatting of $infile failed. exit...\n" if ($?); system("$readseq -f=MSF -o=$out -a $tmp_pir"); unlink $tmp; unlink $tmp_pir; } ##################################################################################################### =begin :private =head2 fasta2concise ($infile, $outfile) Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file =cut sub fasta2concise { my $infile = shift; my $outfile = shift; open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied"; my ( $seq, @seqs, @title ); while (<$IN>) { if (s/^>//) { if ($seq) { $seq =~ s/\n|\s//g; $seq =~ s/(.)/$1,/g; push @seqs, $seq; $seq = ""; } chomp; push @title, $_; } else { chomp; $seq .= $_; } } $seq =~ s/\n|\s//g; $seq =~ s/(.)/$1,/g; push @seqs, $seq; close($IN); if ( @title != @seqs ) { die("non matching number of titles and sequences!\n"); } open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; foreach ( 0 .. $#title ) { print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n"; } close($OUT); } =begin :private =head2 concise2fasta ($infile, $outfile) Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file =cut ##################################################################################################### sub concise2fasta { my $infile = shift; my $outfile = shift; my ( @seqs, %seq, @pred, %pred ); my @var = ( "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ", "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred" ); # parse input concise file open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied"; while (<$IN>) { unless (/^\n/) { my ( $id, $seq ) = split( ":", $_ ); if ( defined $id and defined $seq ) { # Check we have proper values $seq =~ s/,//g; chomp $seq; if ( $id =~ /;/ ) { # Then it's an alignment @_ = split( ";", $id ); push @seqs, $_[1]; $seq{ $_[1] } = $seq; } foreach my $v (@var) { if ( $v eq $id ) { push @pred, $v; $pred{$v} = $seq; } } } } } close($IN); open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; # print out sequences found with PSI-BLAST foreach (@seqs) { $seq{$_} =~ s/(.{72})/$1\n/g; print $OUT ">$_\n$seq{$_}\n"; } # print out predictions foreach my $p (@pred) { $pred{$p} =~ s/[TCYWXZSI\?_]/-/g; $pred{$p} =~ s/B/E/g; $pred{$p} =~ s/G/H/g; if ( $p =~ /SOL/ ) { $pred{$p} =~ s/E/B/g; } $pred{$p} =~ s/(.{72})/$1\n/g; print $OUT ">$p\n$pred{$p}\n"; } close($OUT); } =begin :private =head2 $PSISEQ = remove_seq_masks($PSISEQ) Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. Does it in place. =cut ##################################################################################################### sub remove_seq_masks { my $idx = shift; my $seq = shift; # Only bother if we have a sequence which has been masked return unless grep { uc $_ eq 'X' } @{ $seq->align }; my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } ); my $searchdb = $database->{$db_entry}{unfiltered}; $start--; # We need this to be zero based my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return; my @seqs = $f->get_entries; my $db = shift @seqs; unless ($db) { confess "JPRED: remove_seq_masks: Accession number $id returned no sequences"; } elsif (@seqs) { confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence"; } my @db_seq = @{ $db->seq }; # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence my $i = 0; for ( 0 .. $#align ) { next if $align[$_] =~ /$GAP/; if ( $align[$_] eq 'X' ) { unless ( defined $db_seq[ $i + $start ] ) { croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!"; } $align[$_] = $db_seq[ $i + $start ]; } $i++; } $seq->align( \@align ); } =head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN); Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN. Equivilent to reduce script. =cut ##################################################################################################### sub reduce { my ( $max, @seqs ) = @_; my $num = int( @seqs / $max ); my @res = shift @seqs; # Don't bother if we have less than the required sequences if ( @seqs < $max or 0 == $num ) { return @res, @seqs; } for ( 0 .. $#seqs ) { push @res, $seqs[$_] if ( 0 == $_ % $num ); } return @res; } =head2 nonred($cutoff, @PSISEQS); Removes redundant sequences at $cutoff level of sequence identity. Equivilent to nonred script. =cut ##################################################################################################### sub nonred { my ( $cutoff, @seqs ) = @_; # Run pairwise and then OC and remove similar sequences unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" } unless (@seqs) { croak "JPRED: No sequences passed to nonred()" } if ( @seqs == 1 ) { warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n"; return @seqs; } # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version). my $fasta = FASTA::File->new; foreach (@seqs) { my $f = FASTA->new( id => $_->id ); $f->seq( @{ $_->align } ); $fasta->add_entries($f); } # Run pairwise my $pair = Pairwise->new( path => $pairwise ); my $foo = join "\n", $pair->run($fasta) or die $!; # Run OC my $ocres = OC->new( path => "$oc sim complete cut $cutoff" ); $ocres->read_string( join "", $ocres->run($foo) ); # Now found those entries that are unrelated my @keep; for ( $ocres->get_groups ) { my ( $group, $score, $size, @labels ) = @{$_}; # Keep the QUERY from the cluster, otherwise just get onelt if ( grep { /QUERY/ } @labels ) { # Make sure the query stays at the start unshift @keep, "QUERY"; next; } else { push @keep, shift @labels; } } push @keep, $ocres->get_unclustered; # Return the matching entries. # We use the temporay to mimic the selection of sequences in the nonred # script, which took the sequences in the order they occured in the file. my @filtered_res; for (@seqs) { my $label = $_->id; if ( grep { $_ =~ /^$label$/ } @keep ) { push @filtered_res, $_; } } return @filtered_res; } =head2 @seqs = del_long_seqs($percentage, @PSISEQS) Removes those sequences that are more than $percentage longer or shorter than the first sequence in @seqs. @seqs is an array of PSISEQ objects. Equivilent to trim_seqs script. =cut ##################################################################################################### sub del_long_seqs { my ( $factor, @seqs ) = @_; # Keep the query sequence (we assume query is the 1st FASTA record) my $query = shift @seqs; my @res = $query; # Find the length of the query sequence without any gaps my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g ); # Calculate the upper and lower allowed bounds my ( $upper, $lower ); { my $bounds = ( $length / 100 ) * $factor; ( $upper, $lower ) = ( $length + $bounds, $length - $bounds ); } # Now try each sequence and see how long they are for (@seqs) { my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g ); if ( $l >= $lower && $l <= $upper ) { push @res, $_ } } return @res; } =head2 toupper ($PSISEQ) Converts sequence held in PSISEQ object to uppercase. =cut ##################################################################################################### sub toupper { $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] ); } =head2 degap($PSISEQ_QUERY, @PSISEQS) Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place. Equivilent to fasta2jnet script. =cut ##################################################################################################### sub degap { my (@seqs) = @_; # Find where the gaps in the query sequence are my @gaps; my $i = 0; for ( @{ $seqs[0]->align } ) { push @gaps, $i if $_ !~ /$GAP/; $i++; } return unless @gaps; # Remove the gaps in all the sequences. # Derefences the array reference and takes a slice inside the method call # arguments, then coerce back to an array ref. for (@seqs) { $_->align( [ @{ $_->align }[@gaps] ] ); } } =head2 getfreq($filename, @PSISEQS) Creates a PSIBLAST like profile and writes it to the $filename. Equivilant to profile or getfreq (older) script. This doesn't create the same answer as the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, but it's *not* the same. =cut ##################################################################################################### sub getfreq { my ( $fn, @seqs ) = @_; #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs; # Create a PSIBLAST like profile from the alignment # This doesn't greate the same result as old Jpred, I think this is because # differences in the input between old and new # For testing # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa"); # my @profile = profile( # map { join "", @{ $_->seq } } $f->get_entries # ); my @profile = profile( map { join "", @{ $_->align } } @seqs ); open my $fh, ">$fn" or die "JPRED: $fn: $!"; print $fh join( " ", @{$_} ), "\n" for @profile; } =head2 HMMER::Profile::Jnet = hmmer(@PSISEQS) Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object. Equivilent to gethmm script. =cut ##################################################################################################### sub hmmer { my (@seqs) = @_; # Temp files required my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3; # Create the FASTA file psiseq2fasta( $tmp_fasta, @seqs ); # Run HMMer system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta"); system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); # Read in the HMMER file my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); # Convert from HMMER::Profile to HMMER::Profile::Jnet my $hmmer_jnet = HMMER::Profile::Jnet->new; $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line; return $hmmer_jnet; } =head2 jnet($hmmer, $out, $pssm ) Runs Jnet. Pass the paths of the filenames to do the prediction on =cut ################################################################################################################################################# =head2 get_hmm($alignFile, $name) Adapted from the hmmer() function in Jpred as written by JDB. Generates an HMMer profile output compatible with Jnet. Equivilent to gethmm script. =cut sub hmm_for_align { my $fastafile = shift; # Temp files required print "hmm_for_align: fastafile = $fastafile\n"; my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2; system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile"); system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); # Read in the HMMER file my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); # Read in the HMMER file my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out ); # Convert from HMMER::Profile to HMMER::Profile::Jnet my $hmmer_jnet = HMMER::Profile::Jnet->new; $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line; return $hmmer_jnet; } ##################################################################################################### sub jnet { my ( $hmmer, $outfile, $pssm ) = @_; if ($pssm) { # run Jnet prediction with all the profile data system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile"); } else { # run Jnet prediction with only HMMer and alignment data system("$jnet --concise --hmmer $hmmer > $outfile"); } check( "jnet", $? ) or exit 1; my $res = ""; my $logging = ""; open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; while (<$IN>) { if (/^jnetpred:|^JNET[A-Z0-9]+:/) { $res .= $_; } else { $logging .= $_; } } close $IN; open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; print $OUT "\n" . $res; close $OUT; return $logging; } =head1 psiseq2fasta($path, @PSISEQ) Writes a FASTA file given an array of PSISEQ objects. =cut ##################################################################################################### sub psiseq2fasta { my ( $fn, @seqs ) = @_; #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ'; confess "JPRED: not passed PSISEQs" unless @seqs; #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs; # Changed from using FASTA::File object due to seg faulting on some large, # long alignments, presumably due to running out of memory. # my $fasta = FASTA::File->new; # $fasta->add_entries( # map { # my $f = FASTA->new(id => $_->id); # $f->seq(@{$_->align}); # $f; # } @seqs # ); # $fasta->write_file($fn); my $fh; $fn =~ /\.gz$/ ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" ) : ( open $fh, ">$fn" or die "JPRED: $!" ); for (@seqs) { print $fh ">", $_->id, "\n"; my $seq = join( "", @{ $_->align } ); $seq =~ s/(.{72})/$1\n/g; chomp $seq; print $fh $seq . "\n"; } close $fh; } =head1 @PSISEQ = fasta2psiseq($path) Reads in a FASTA file and returns an array of PSISEQ objects. =cut ##################################################################################################### sub fasta2psiseq { my ($fn) = @_; my $fasta = $fn =~ /\.gz$/ ? FASTA::File->new( read_gzip_file => $fn ) : FASTA::File->new( read_file => $fn ); $fasta or die "JPRED: $!"; my @seqs; for ( $fasta->get_entries ) { my $seq = PSISEQ->new; $seq->id( $_->id ); $seq->align( [ @{ $_->seq } ] ); push @seqs, $seq; } return @seqs; } ##################################################################################################### sub fasta2psiseq_lowmemory { my $filename = shift; local $/ = "\n"; my $fh; $filename =~ /\.gz$/ ? ( open $fh, "gzip -dc $filename|" or die $! ) : ( open $fh, $filename or die $! ); my @seqs; my ( $id, $seq ); while (<$fh>) { chomp; if (s/^>//) { $seq =~ s/ //g if defined $seq; if ( $id and $seq ) { my $psi = PSISEQ->new( id => $id ); $psi->align( [ split //, $seq ] ); push @seqs, $psi; undef $id; undef $seq; } else { $id = $_; } } else { $seq .= $_; } } if ( $id and $seq ) { my $psi = PSISEQ->new( id => $id ); $psi->seq($seq); push @seqs, $psi; } return @seqs; } ##################################################################################################### =head1 extend($FASTA::File, @PSISEQ) Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues against the query, this function extends the alignment so that all of the query is present in the alignment. The other sequences are extended with gap characters. =cut sub extend { my ( $query, @seqs ) = @_; #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query; #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs; # Get the query sequence my $q_query = $query->get_entry(0); # Get the query sequence from the PSIBLAST alignment my $a_query = shift @seqs; # The gap size required to expand the alignment my ( $start_gap_length, $end_gap_length ); { # Make handling the sequence easier my $q_seq = join "", @{ [ $q_query->seq ] }; my $a_seq = join "", @{ $a_query->align }; $q_seq =~ s/-//g; $a_seq =~ s/-//g; ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq; $start_gap_length = index( $q_seq, $a_seq ); croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1; $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length; } # Add the gaps to the end of the alignments for (@seqs) { $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] ); } # Put the query sequence back unshift @seqs, PSISEQ->new( id => $a_query->id, align => [ ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ), @{ $a_query->align }, ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ), ] ); return @seqs; } ##################################################################################################### =head1 $int = fasta_seq_length($path) Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue. =cut sub fasta_seq_length { my ($fn) = @_; open my $fh, $fn or croak "Can't open file $fn: $!\n"; my $length = 0; local $_, $/ = "\n"; while (<$fh>) { next if /^>/; chomp; $length += tr///c; } return $length; }