From: Sasha Sherstnev Date: Mon, 14 Oct 2013 14:26:30 +0000 (+0100) Subject: Add an old jpred.pl script X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=fbb05ca3bdf63d62c1b7a0161f8172838c55053c;p=jabaws.git Add an old jpred.pl script --- diff --git a/binaries/src/jpred/jpred_old.pl b/binaries/src/jpred/jpred_old.pl new file mode 100755 index 0000000..99d7fdc --- /dev/null +++ b/binaries/src/jpred/jpred_old.pl @@ -0,0 +1,1401 @@ +#!/usr/bin/perl + +=head1 NAME + +jpred - Secondary structure prediction program + +=head1 SYNOPSIS + +./jpred.pl -in [-outfile ] [-logfile ] [-output ] [-dbname ] [-dbpath ] [-ncpu NNN] [-psi ] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man] + +=head1 DESCRIPTION + +This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence +(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC. +For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the +secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet. + +=head1 OPTIONS + +=over 5 + +=item -in + +The path to the sequence file (in FASTA, MSF, or BLC format) + +=item -seq + +The input file is a FASTA file with one sequence only. + +=item -output + +A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in. + +=item -outfile + +An output file, by default it is undefined and the -output option is working + +=item -logfile + +Logging file, by default it is undefined and logging switched off + +=item -dbpath PATH + +Path to the uniref database used for PSI-BLAST querying. default value: . + +=item -dbname database + +Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. +Default value: uniref90 + +=item -psi path + +Path to a PSIBLAST output file. + +=item -ncpu NNN + +Number of CPU used by jpred.pl. Maximum value is 8 +Default: NNN = 1 + +=item -pred-nohits + +Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits. + +=item -no-final + +By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features +The option untoggles this step and stops jpred at the concise file generation + +=item -jabaws + +Redefines some basic variables in order to use the script within JABAWS + +=item -verbose + +Verbose mode. Print more information while running jpred. + +=item -debug + +Debug mode. Print debugging information while running jpred. + +=item -help + +Gives help on the programs usage. + +=item -man + +Displays this man page. + +=back + +=head1 BUGS + +Can't cope with gaps in the query sequence. + +Doesn't accept alignments. + +If you find any others please contact authors. + +=head1 AUTHORS + +Jonathan Barber +Chris Cole +Alexander Sherstnev (current maintainer) + +=cut + +## TODO check for gaps in query sequence +## check for disallowed chars in sequence + +use strict; +use warnings; +use Getopt::Long; +use Pod::Usage; +use Carp; +use Data::Dumper; +use File::Temp; + +# path to Jpred modules +use FindBin qw($Bin); +use lib "$Bin/lib"; + +# internal jpred modules +use Jpred; +use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env); +use HMMER::Profile; +use HMMER::Profile::Jnet; +use PSIBLAST; +use PSIBLAST::PSSM; +use PSIBLAST::Run; +use FASTA::File; +use FASTA; +use Index; +use Pairwise; +use OC; +use Utils qw(profile); +use Run qw(check); + +our $VERSION = '3.0.2'; +my $MAX_ALIGN = 10000; # maximum number of alignment sequences allowed +my $NR_CUT = 75; # cut-off seqeunce ID value for defining non-redundancy + +# Gap characters in sequences +our $GAP = '-'; + +##################################################################################################### +# Light object to hold sequence information, light to prevent memory overload +# for big search results. Even so this is quite memory intensive, I can't see +# how to reduce the size further. +{ + + package PSISEQ; + + sub new { + my ( $class, %args ) = @_; + my $self = []; + bless $self, $class; + for ( keys %args ) { $self->$_( $args{$_} ) } + return $self; + } + + sub id { + return defined $_[1] + ? $_[0]->[0] = $_[1] + : $_[0]->[0]; + } + + sub start { + return defined $_[1] + ? $_[0]->[1] = $_[1] + : $_[0]->[1]; + } + + sub align { + if ( defined $_[1] ) { + ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref"; + + $_[0]->[2] = join( "", @{ $_[1] } ); + } else { + return [ split( //, $_[0]->[2] ) ]; + } + } +} + +##################################################################################################### +my $infile; +my $infastafile; +my $format; +my $goal = "align"; +my $seqgoal; +my $outfile; +my $prefix; +my $psiblast; +my $jabaws; +my $logfile; +my $ncpu = 1; # number of CPUs for psiblast calculations +my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found +my $db_path = "/homes/www-jpred/databases"; +my $db_entry = "cluster"; +my $nofinal; + +my ( $help, $man, $DEBUG, $VERBOSE ); + +GetOptions( + "in=s" => \$infile, + "output=s" => \$prefix, + "outfile=s" => \$outfile, + "logfile=s" => \$logfile, + "psi=s" => \$psiblast, + "dbname=s" => \$db_entry, + "dbpath=s" => \$db_path, + "ncpu=s" => \$ncpu, + "pred-nohits" => \$predNoHits, + "no-final" => \$nofinal, + "seq" => \$seqgoal, + "jabaws" => \$jabaws, + + "help" => \$help, + "man" => \$man, + "debug" => \$DEBUG, + "verbose" => \$VERBOSE +) or pod2usage(2); +pod2usage(1) if $help; +pod2usage( verbose => 2 ) if $man; + +$goal = "seq" if ( defined $seqgoal ); + +##################################################################################################### +# Key to database information and information for accessing them +my $database = { + + ## default database used by Jpred + uniref90 => { + database => $db_path . "/uniref90.filt", + unfiltered => $db_path . "/uniref90", + }, + + ## database used during Jnet training + training => { + database => $db_path . "/training/uniref90.filt", + unfiltered => $db_path . "/training/uniref90", + }, + + ## database used for ported Jpred + ported_db => { + database => $db_path . "/uniref90.filt", + unfiltered => $db_path . "/uniref90", + }, + + ## cluster-specific path for Jpred + cluster => { + database => $db_path . "/uniref90.filt", + unfiltered => $db_path . "/uniref90", + }, + + ## these other DBs are experimental ones used during development. + ## check they exist before using them. + # generic entry for use with validate_jnet.pl + # real db location defined by validate_jnet.pl + swall => { + database => $db_path . "/swall/swall.filt", + unfiltered => $db_path . "/swall/swall", + }, + + # Path to PSIBLAST db + uniprot => { + database => $db_path . "/3/swall.filt", + unfiltered => $db_path . "/3/swall", + }, + + uniref50 => { + database => $db_path . "/6/swall.filt", + unfiltered => $db_path . "/6/swall", + }, +}; + +my $dpf = $database->{$db_entry}{database}.'.pal'; +my $dpu = $database->{$db_entry}{unfiltered}.'.pal'; +pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile; +pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry}; +pod2usage("ERROR! UNIREF filtered database is not available at $dpf Use -dbpath and -dbname for configuring the database") unless -f $dpf; +pod2usage("ERROR! UNIREF unfiltered database is not available at $dpu Use -dbpath and -dbname for configuring the database") unless -f $dpu; + +##################################################################################################### +# lots of preparation steps +if ( defined $prefix ) { + unless ( defined $outfile ) { + $outfile = $prefix . ".res.fasta"; + } +} else { + if ( defined $outfile ) { + print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n"; + $prefix = $outfile . ".tmp"; + } else { + print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n"; + $prefix = $infile . ".res"; + $outfile = $prefix . ".jnet"; + } +} + +if ( 1 > $ncpu or 8 < $ncpu ) { + print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n"; + $ncpu = 1; +} + +$VERBOSE = 1 if $DEBUG; + +my $LOG; +if ( defined $logfile ) { + open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied"; +} + +for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) { + my ( $key, $value ) = @{$_}; + defined $value or next; + my $par = join( ": ", $key, $value ); + print "$par\n" if $DEBUG; + print $LOG "$par\n" if $LOG; +} + +if ( defined $jabaws ) { + setup_jpred_env($Bin); + setup_env($Bin); +} +my $platform = check_OS(); +print "JPRED: checking platiform... $platform\n" if $DEBUG; +print $LOG "JPRED: checking platiform... $platform\n" if $LOG; + +##################################################################################################### +# check input file format +if ( 'seq' eq $goal ) { + $format = "seq"; + if ( 1 != check_FASTA_format($infile) ) { + die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n"; + } +} else { + my $nseq = check_FASTA_format($infile); + if ( 0 < $nseq ) { + $format = "fasta"; + if ( 1 == $nseq ) { + die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n"; + } + } elsif ( 0 < check_MSF_format($infile) ) { + $format = "msf"; + } elsif ( 0 < check_BLC_format($infile) ) { + $format = "blc"; + } else { + die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n"; + } +} +$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format ); + +##################################################################################################### +if ( 'seq' eq $goal ) { + my $query = FASTA::File->new( read_file => $infile ); + my @seqs; + my $psi = PSIBLAST->new; + unless ( defined $psiblast && -e $psiblast ) { + ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits + # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence + # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria + # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3' + my $psi_run = PSIBLAST::Run->new( + debug => $DEBUG, + args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3", + path => $psiblastbin, + input => $infile, + output => "$prefix.blast", + matrix => "$prefix.profile", + database => $database->{$db_entry}{database}, + ); + + # For reduced databases, get the size of the orginal and use this as + # the equivilent DB size + if ( $db_entry =~ /^sp_red_/ ) { + ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/; + $psi_run->args( + $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below + ); + } + print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG; + print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG; + print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG; + print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG; + + print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE; + print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG; + $psi_run->run or die; # CC sub is from PSIBLAST::Run + + $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does? + system("gzip -9f $prefix.blast"); + } else { + if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does? + else { $psi->read_file($psiblast) } # CC ditto above + } + +##################################################################################################### + # Convert the last itteration into a collection of PSISEQ objects + for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm + my ( $id, $start, $align ) = @{$_}; + push @seqs, + PSISEQ->new( + id => $id, + start => $start, + align => [ split( //, $align ) ] + ); + } + +##################################################################################################### + # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only. + # Accuracy won't be as good, but at least it's a prediction + if ( @seqs == 0 ) { + if ( $predNoHits == 0 ) { + warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n"; + print ">>100% complete\n"; + print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG; + print $LOG ">>100% complete\n" if $LOG; + close($LOG) if $LOG; + exit; + } else { + print ">>50% complete\n"; + warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n"; + print "Running HMMer on query...\n" if $VERBOSE; + print $LOG ">>50% complete\n" if $LOG; + print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG; + print $LOG "Running HMMer on query...\n" if $LOG; + + # copy input query to alignment + #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n"; + open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!"; + open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!"; + while (<$ifh>) { print $ofh, $_ } + close $ifh; + close $ofh; + + # Temp files required for HMMer + my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2; + system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile"); + system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); + unlink $hmmbuild_out; + + # Read in the HMMER file + my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); + + # Convert from HMMER::Profile to HMMER::Profile::Jnet + my $hmmer_jnet = HMMER::Profile::Jnet->new; + $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line; + $hmmer_jnet->write_file("$prefix.hmm"); + print ">>70% complete\n"; + print $LOG ">>70% complete\n" if $LOG; + + # Run Jnet for the prediction + print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE; + print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG; + my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); + print ">>90% complete\n"; + print $LOG ">>90% complete\n" if $LOG; + print $jnetlog if $VERBOSE; + print $LOG $jnetlog if $LOG; + } + } else { + psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG; + print ">>40% complete\n"; + print $LOG ">>40% complete\n" if $LOG; + + ##################################################################################################### + # Make PSIBLAST truncated alignments the right length + print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE; + print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG; + @seqs = extend( $query, @seqs ); + psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG; + + ##################################################################################################### + # Remove masking from PSIBLAST alignment + print "Unmasking the alignments...\n" if $VERBOSE; + print $LOG "Unmasking the alignments...\n" if $LOG; + my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found."; + remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ]; + psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG; + + ##################################################################################################### + # Convert the sequences to upper case + print "Converting sequences to the same case...\n" if $VERBOSE; + print $LOG "Converting sequences to the same case...\n" if $LOG; + toupper($_) for @seqs; # CC sub is below + psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs ); + + ##################################################################################################### + # Remove excessive sequences + print "Remove excessive sequences...\n" if $VERBOSE; + print $LOG "Remove excessive sequences...\n" if $LOG; + @seqs = reduce( $MAX_ALIGN, @seqs ); + psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG; + + ##################################################################################################### + # Remove sequences that are too long or too short + print "Remove sequences which too long or short...\n" if $VERBOSE; + print $LOG "Remove sequences which too long or short...\n" if $LOG; + @seqs = del_long_seqs( 50, @seqs ); + psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG; + + ##################################################################################################### + # Remove redundant sequences based upon pairwise identity and OC clustering + print "Remove redundant sequences...\n" if $VERBOSE; + print $LOG "Remove redundant sequences...\n" if $LOG; + @seqs = nonred( $NR_CUT, @seqs ); + psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG; + + ##################################################################################################### + # Check that we haven't got rid of all of the sequences + if ( @seqs < 2 ) { + warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n"; + print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG; + @seqs = fasta2psiseq("${prefix}_backup.fasta.gz"); + } + unlink("${prefix}_backup.fasta.gz") unless $DEBUG; + + # Remove gaps in the query sequence and same positions in the alignment + print "Removing gaps in the query sequence...\n" if $VERBOSE; + print $LOG "Removing gaps in the query sequence...\n" if $LOG; + degap(@seqs); + psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG; + + # Output the alignment for the prediction + print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE; + print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG; + psiseq2fasta( "$prefix.align", @seqs ); + + ##################################################################################################### + # Equivilent to getpssm script + print "Output the PSSM matrix from the PSI-BLAST profile...\n"; + print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG; + my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" ); + $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM + print ">>50% complete\n"; + print $LOG ">>50% complete\n" if $LOG; + + ##################################################################################################### + # Run HMMER on the sequences + print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE; + print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG; + my $hmmer = hmmer(@seqs); # CC sub is below + $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module + print ">>70% complete\n"; + print $LOG ">>70% complete\n" if $LOG; +##################################################################################################### + # Run Jnet for the prediction + print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE; + print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG; + my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below + print ">>90% complete\n"; + print $LOG ">>90% complete\n" if $LOG; + print $jnetlog if $VERBOSE; + print $LOG $jnetlog if $LOG; + } +} else { + if ( 'fasta' eq $format ) { + print "Read FASTA file\n"; + my $hmmer1 = hmm_for_align($infile); + $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module + } elsif ( 'msf' eq $format ) { + msf2fasta( $infile, $infastafile ); + my $hmmer2 = hmm_for_align($infastafile); + $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module + } elsif ( 'blc' eq $format ) { + my $tmpfile = File::Temp->new->filename; + blc2msf( $infile, $tmpfile ); + msf2fasta( $infile, $infastafile ); + my $hmmer3 = hmm_for_align($infastafile); + $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module + } else { + die "ERROR! unknown input format '$format'. exit...\n"; + } + print ">>70% complete\n"; + print $LOG ">>70% complete\n" if $LOG; +##################################################################################################### + # Run Jnet for the prediction + print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE; + print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG; + my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below + print ">>90% complete\n"; + print $LOG ">>90% complete\n" if $LOG; + print $jnetlog if $VERBOSE; + print $LOG $jnetlog if $LOG; +} + +unless ( defined $nofinal ) { + my $aligfile = $prefix . ".align"; + my $jnetfile = $prefix . ".jnet"; + my $jnetfastafile = $prefix . ".jnet.fasta"; + $aligfile = $infile if ( 'fasta' eq $format ); + $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format ); + concise2fasta( $jnetfile, $jnetfastafile ); + open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied"; + open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied"; + open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; + while (<$IN2>) { print $OUT $_; } + while (<$IN1>) { print $OUT $_; } + close($IN1); + close($IN2); + close($OUT); + unlink $jnetfastafile; + my $seqs = FASTA::File->new( read_file => $outfile ); + open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n"; + $seqs->write($OUT2); + close($OUT2); +} else { + if ( defined $outfile and $prefix . ".jnet" ne $outfile ) { + rename $prefix . ".jnet", $outfile; + } +} + +print ">>100% complete\n"; +print "Jpred Finished\n"; +print $LOG ">>100% complete\n" if $LOG; +print $LOG "Jpred Finished\n" if $LOG; +close($LOG) if $LOG; +exit; + +##################################################################################################### +# Functions +##################################################################################################### +sub check_FASTA_format { + my $infile = shift; + + open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; + my $check_first_line = 1; + my $nseq = 0; + local $/ = "\n>"; + while (<$IN>) { + if ($check_first_line) { + return 0 unless (/^>/); + $check_first_line = 0; + } + s/^>//g; + s/>$//g; + + my ( $id, @seqs ) = split /\n/, $_; + return 0 unless ( defined $id or @seqs ); + my $seq = join( "", @seqs ); + return 0 unless ( $seq =~ /[a-zA-Z\.-]/ ); + ++$nseq; + } + close($IN); + + return $nseq; +} +##################################################################################################### +sub check_MSF_format { + my $infile = shift; + $? = 0; + system("$readseq -fMSF -a -p < $infile > /dev/null"); + return 0 if ($?); + + return 1; +} +##################################################################################################### +sub check_BLC_format { + my $infile = shift; + + my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2; + open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n"; + print $fh "silent_mode\nblock_file $infile\n"; + print $fh "output_file /dev/null\nmax_nseq 2000\n"; + print $fh "pir_save $tmpfile2\nsetup\n"; + close $fh; + + $? = 0; + system("$alscript -s -f $tmpfile1"); + $? = 0; + system("$readseq -f8 -a -p < $tmpfile2 > /dev/null"); + unlink $tmpfile1; + unlink $tmpfile2; + return 0 if ($?); + + return 1; +} +############################################################################################################################################ +# convertor BLC -> MSF +sub msf2fasta { + my $infile = shift; + my $outfile = shift; + + my $sequence = ""; + open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; + while (<$IN>) { + $sequence .= $_; + } + close($IN); + + $sequence =~ s/~/\./g; + + ## check for non-unique sequence names - readseq has problems with these + my @lines = split /\n/, $sequence; + my %names; + foreach my $line (@lines) { + if ( $line =~ /Name:\s+(\S+)/ ) { + die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 ); + $names{$1}++; + } + } + + my $tmpfile1 = File::Temp->new->filename; + my $tmpfile2 = File::Temp->new->filename; + open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!"; + print $FILE $sequence; + close $FILE; + + $? = 0; + system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2"); + die "Reformatting of $infile failed. exit...\n" if ($?); + unlink $tmpfile1; + + $? = 0; + system("$readseq -f8 -a -p < $tmpfile2 > $outfile"); + die "Reformatting of $infile failed. exit...\n" if ($?); + unlink $tmpfile2; + + die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile ); +} + +############################################################################################################################################ +# convertor BLC -> MSF +sub blc2msf { + my ( $in, $out ) = @_; + + my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2; + + open my $fh, ">$tmp" or die "$tmp: $!\n"; + print $fh "silent_mode\nblock_file $infile\n"; + print $fh "output_file /dev/null\nmax_nseq 2000\n"; + print $fh "pir_save $tmp_pir\nsetup\n"; + close $fh; + + $? = 0; + system("$alscript -s -f $tmp"); + die "Reformatting of $infile failed. exit...\n" if ($?); + system("$readseq -f=MSF -o=$out -a $tmp_pir"); + + unlink $tmp; + unlink $tmp_pir; +} + +##################################################################################################### + +=begin :private + +=head2 fasta2concise ($infile, $outfile) + +Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file + +=cut + +sub fasta2concise { + my $infile = shift; + my $outfile = shift; + + open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied"; + my ( $seq, @seqs, @title ); + while (<$IN>) { + if (s/^>//) { + if ($seq) { + $seq =~ s/\n|\s//g; + $seq =~ s/(.)/$1,/g; + push @seqs, $seq; + $seq = ""; + } + chomp; + push @title, $_; + } else { + chomp; + $seq .= $_; + } + } + $seq =~ s/\n|\s//g; + $seq =~ s/(.)/$1,/g; + push @seqs, $seq; + close($IN); + + if ( @title != @seqs ) { + die("non matching number of titles and sequences!\n"); + } + + open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; + foreach ( 0 .. $#title ) { + print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n"; + } + close($OUT); +} + +=begin :private + +=head2 concise2fasta ($infile, $outfile) + +Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file + +=cut + +##################################################################################################### +sub concise2fasta { + my $infile = shift; + my $outfile = shift; + + my ( @seqs, %seq, @pred, %pred ); + + my @var = ( + "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ", + "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred" + ); + + # parse input concise file + open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied"; + while (<$IN>) { + unless (/^\n/) { + my ( $id, $seq ) = split( ":", $_ ); + if ( defined $id and defined $seq ) { # Check we have proper values + $seq =~ s/,//g; + chomp $seq; + if ( $id =~ /;/ ) { # Then it's an alignment + @_ = split( ";", $id ); + push @seqs, $_[1]; + $seq{ $_[1] } = $seq; + } + foreach my $v (@var) { + if ( $v eq $id ) { + push @pred, $v; + $pred{$v} = $seq; + } + } + } + } + } + close($IN); + + open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; + + # print out sequences found with PSI-BLAST + foreach (@seqs) { + $seq{$_} =~ s/(.{72})/$1\n/g; + print $OUT ">$_\n$seq{$_}\n"; + } + + # print out predictions + foreach my $p (@pred) { + $pred{$p} =~ s/[TCYWXZSI\?_]/-/g; + $pred{$p} =~ s/B/E/g; + $pred{$p} =~ s/G/H/g; + + if ( $p =~ /SOL/ ) { + $pred{$p} =~ s/E/B/g; + } + $pred{$p} =~ s/(.{72})/$1\n/g; + print $OUT ">$p\n$pred{$p}\n"; + } + close($OUT); +} + +=begin :private + +=head2 $PSISEQ = remove_seq_masks($PSISEQ) + +Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. +Does it in place. + +=cut + +##################################################################################################### +sub remove_seq_masks { + my $idx = shift; + my $seq = shift; + + # Only bother if we have a sequence which has been masked + return unless grep { uc $_ eq 'X' } @{ $seq->align }; + + my ( $id, $start, @align ) = ( $seq->id, $seq->start, @{ $seq->align } ); + my $searchdb = $database->{$db_entry}{unfiltered}; + $start--; # We need this to be zero based + + my $f = $idx->get_sequence( $id, $searchdb ) or croak "JPRED: No such entry for $id in $searchdb\n" and return; + my @seqs = $f->get_entries; + my $db = shift @seqs; + + unless ($db) { + confess "JPRED: remove_seq_masks: Accession number $id returned no sequences"; + } elsif (@seqs) { + confess "JPRED: remove_seq_masks: Accession number $id returned more than one sequence"; + } + + my @db_seq = @{ $db->seq }; + + # $i is a pointer into the original, ungapped, unmasked sequence. $_ is for the aligned sequence + my $i = 0; + for ( 0 .. $#align ) { + next if $align[$_] =~ /$GAP/; + + if ( $align[$_] eq 'X' ) { + unless ( defined $db_seq[ $i + $start ] ) { + croak "JPRED: remove_seq_masks: the database sequence is not as long as the query sequence!"; + } + $align[$_] = $db_seq[ $i + $start ]; + } + $i++; + } + $seq->align( \@align ); +} + +=head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN); + +Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN. + +Equivilent to reduce script. + +=cut + +##################################################################################################### +sub reduce { + my ( $max, @seqs ) = @_; + + my $num = int( @seqs / $max ); + my @res = shift @seqs; + + # Don't bother if we have less than the required sequences + if ( @seqs < $max or 0 == $num ) { + return @res, @seqs; + } + + for ( 0 .. $#seqs ) { + push @res, $seqs[$_] if ( 0 == $_ % $num ); + } + + return @res; +} + +=head2 nonred($cutoff, @PSISEQS); + +Removes redundant sequences at $cutoff level of sequence identity. + +Equivilent to nonred script. + +=cut + +##################################################################################################### +sub nonred { + my ( $cutoff, @seqs ) = @_; + + # Run pairwise and then OC and remove similar sequences + unless ( defined $cutoff ) { croak "JPRED: nonred() not passed a cutoff" } + unless (@seqs) { croak "JPRED: No sequences passed to nonred()" } + if ( @seqs == 1 ) { + warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n"; + return @seqs; + } + + # Convert the sequences into a FASTA object, which is what pairwise expects (simpler version). + my $fasta = FASTA::File->new; + foreach (@seqs) { + my $f = FASTA->new( id => $_->id ); + $f->seq( @{ $_->align } ); + $fasta->add_entries($f); + } + + # Run pairwise + my $pair = Pairwise->new( path => $pairwise ); + my $foo = join "\n", $pair->run($fasta) or die $!; + + # Run OC + my $ocres = OC->new( path => "$oc sim complete cut $cutoff" ); + $ocres->read_string( join "", $ocres->run($foo) ); + + # Now found those entries that are unrelated + my @keep; + for ( $ocres->get_groups ) { + my ( $group, $score, $size, @labels ) = @{$_}; + + # Keep the QUERY from the cluster, otherwise just get onelt + if ( grep { /QUERY/ } @labels ) { + + # Make sure the query stays at the start + unshift @keep, "QUERY"; + next; + } else { + push @keep, shift @labels; + } + } + push @keep, $ocres->get_unclustered; + + # Return the matching entries. + # We use the temporay to mimic the selection of sequences in the nonred + # script, which took the sequences in the order they occured in the file. + my @filtered_res; + for (@seqs) { + my $label = $_->id; + if ( grep { $_ =~ /^$label$/ } @keep ) { + push @filtered_res, $_; + } + } + + return @filtered_res; +} + +=head2 @seqs = del_long_seqs($percentage, @PSISEQS) + +Removes those sequences that are more than $percentage longer or shorter than the first +sequence in @seqs. @seqs is an array of PSISEQ objects. + +Equivilent to trim_seqs script. + +=cut + +##################################################################################################### +sub del_long_seqs { + my ( $factor, @seqs ) = @_; + + # Keep the query sequence (we assume query is the 1st FASTA record) + my $query = shift @seqs; + my @res = $query; + + # Find the length of the query sequence without any gaps + my $length = ( () = ( join '', @{ $query->align } ) =~ /[^$GAP]/g ); + + # Calculate the upper and lower allowed bounds + my ( $upper, $lower ); + { + my $bounds = ( $length / 100 ) * $factor; + ( $upper, $lower ) = ( $length + $bounds, $length - $bounds ); + } + + # Now try each sequence and see how long they are + for (@seqs) { + my $l = ( () = ( join '', @{ $_->align } ) =~ /[^$GAP]/g ); + if ( $l >= $lower && $l <= $upper ) { push @res, $_ } + } + + return @res; +} + +=head2 toupper ($PSISEQ) + +Converts sequence held in PSISEQ object to uppercase. + +=cut + +##################################################################################################### +sub toupper { + $_[0]->align( [ split //, uc join '', @{ $_[0]->align } ] ); +} + +=head2 degap($PSISEQ_QUERY, @PSISEQS) + +Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place. + +Equivilent to fasta2jnet script. + +=cut + +##################################################################################################### +sub degap { + my (@seqs) = @_; + + # Find where the gaps in the query sequence are + my @gaps; + my $i = 0; + for ( @{ $seqs[0]->align } ) { + push @gaps, $i if $_ !~ /$GAP/; + $i++; + } + + return unless @gaps; + + # Remove the gaps in all the sequences. + # Derefences the array reference and takes a slice inside the method call + # arguments, then coerce back to an array ref. + for (@seqs) { + $_->align( [ @{ $_->align }[@gaps] ] ); + } +} + +=head2 getfreq($filename, @PSISEQS) + +Creates a PSIBLAST like profile and writes it to the $filename. + +Equivilant to profile or getfreq (older) script. This doesn't create the same answer as +the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, +but it's *not* the same. + +=cut + +##################################################################################################### +sub getfreq { + my ( $fn, @seqs ) = @_; + + #croak "JPRED: Passed non PSISEQ object" if grep { !isa( $_, 'PSISEQ' ) } @seqs; + + # Create a PSIBLAST like profile from the alignment + # This doesn't greate the same result as old Jpred, I think this is because + # differences in the input between old and new + + # For testing + # my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa"); + # my @profile = profile( + # map { join "", @{ $_->seq } } $f->get_entries + # ); + + my @profile = profile( map { join "", @{ $_->align } } @seqs ); + + open my $fh, ">$fn" or die "JPRED: $fn: $!"; + print $fh join( " ", @{$_} ), "\n" for @profile; +} + +=head2 HMMER::Profile::Jnet = hmmer(@PSISEQS) + +Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object. + +Equivilent to gethmm script. + +=cut + +##################################################################################################### +sub hmmer { + my (@seqs) = @_; + + # Temp files required + my ( $hmmbuild_out, $hmmconvert_out, $tmp_fasta ) = map { File::Temp->new->filename } 1 .. 3; + + # Create the FASTA file + psiseq2fasta( $tmp_fasta, @seqs ); + + # Run HMMer + system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta"); + system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); + + # Read in the HMMER file + my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); + + # Convert from HMMER::Profile to HMMER::Profile::Jnet + my $hmmer_jnet = HMMER::Profile::Jnet->new; + $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line; + + return $hmmer_jnet; +} + +=head2 jnet($hmmer, $out, $pssm ) + +Runs Jnet. Pass the paths of the filenames to do the prediction on + +=cut + +################################################################################################################################################# + +=head2 get_hmm($alignFile, $name) + +Adapted from the hmmer() function in Jpred as written by JDB. + +Generates an HMMer profile output compatible with Jnet. + +Equivilent to gethmm script. + +=cut + +sub hmm_for_align { + my $fastafile = shift; + + # Temp files required + print "hmm_for_align: fastafile = $fastafile\n"; + my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2; + + system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile"); + system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out"); + + # Read in the HMMER file + my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out ); + + # Read in the HMMER file + my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out ); + + # Convert from HMMER::Profile to HMMER::Profile::Jnet + my $hmmer_jnet = HMMER::Profile::Jnet->new; + $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line; + + return $hmmer_jnet; +} + +##################################################################################################### +sub jnet { + my ( $hmmer, $outfile, $pssm ) = @_; + if ($pssm) { + + # run Jnet prediction with all the profile data + system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile"); + } else { + + # run Jnet prediction with only HMMer and alignment data + system("$jnet --concise --hmmer $hmmer > $outfile"); + } + check( "jnet", $? ) or exit 1; + my $res = ""; + my $logging = ""; + open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; + while (<$IN>) { + if (/^jnetpred:|^JNET[A-Z0-9]+:/) { + $res .= $_; + } else { + $logging .= $_; + } + } + close $IN; + open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied"; + print $OUT "\n" . $res; + close $OUT; + return $logging; +} + +=head1 psiseq2fasta($path, @PSISEQ) + +Writes a FASTA file given an array of PSISEQ objects. + +=cut + +##################################################################################################### +sub psiseq2fasta { + my ( $fn, @seqs ) = @_; + + #confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ'; + confess "JPRED: not passed PSISEQs" unless @seqs; + + #confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs; + + # Changed from using FASTA::File object due to seg faulting on some large, + # long alignments, presumably due to running out of memory. + # my $fasta = FASTA::File->new; + # $fasta->add_entries( + # map { + # my $f = FASTA->new(id => $_->id); + # $f->seq(@{$_->align}); + # $f; + # } @seqs + # ); + # $fasta->write_file($fn); + + my $fh; + $fn =~ /\.gz$/ + ? ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" ) + : ( open $fh, ">$fn" or die "JPRED: $!" ); + + for (@seqs) { + print $fh ">", $_->id, "\n"; + my $seq = join( "", @{ $_->align } ); + $seq =~ s/(.{72})/$1\n/g; + chomp $seq; + print $fh $seq . "\n"; + } + close $fh; +} + +=head1 @PSISEQ = fasta2psiseq($path) + +Reads in a FASTA file and returns an array of PSISEQ objects. + +=cut + +##################################################################################################### +sub fasta2psiseq { + my ($fn) = @_; + + my $fasta = + $fn =~ /\.gz$/ + ? FASTA::File->new( read_gzip_file => $fn ) + : FASTA::File->new( read_file => $fn ); + $fasta or die "JPRED: $!"; + + my @seqs; + for ( $fasta->get_entries ) { + my $seq = PSISEQ->new; + $seq->id( $_->id ); + $seq->align( [ @{ $_->seq } ] ); + push @seqs, $seq; + } + + return @seqs; +} + +##################################################################################################### +sub fasta2psiseq_lowmemory { + my $filename = shift; + + local $/ = "\n"; + my $fh; + $filename =~ /\.gz$/ + ? ( open $fh, "gzip -dc $filename|" or die $! ) + : ( open $fh, $filename or die $! ); + + my @seqs; + my ( $id, $seq ); + while (<$fh>) { + chomp; + if (s/^>//) { + $seq =~ s/ //g if defined $seq; + + if ( $id and $seq ) { + my $psi = PSISEQ->new( id => $id ); + $psi->align( [ split //, $seq ] ); + push @seqs, $psi; + undef $id; + undef $seq; + } else { + $id = $_; + } + } else { + $seq .= $_; + } + } + if ( $id and $seq ) { + my $psi = PSISEQ->new( id => $id ); + $psi->seq($seq); + push @seqs, $psi; + } + + return @seqs; +} + +##################################################################################################### + +=head1 extend($FASTA::File, @PSISEQ) + +Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues +against the query, this function extends the alignment so that all of the query is +present in the alignment. The other sequences are extended with gap characters. + +=cut + +sub extend { + my ( $query, @seqs ) = @_; + + #croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query; + #croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs; + + # Get the query sequence + my $q_query = $query->get_entry(0); + + # Get the query sequence from the PSIBLAST alignment + my $a_query = shift @seqs; + + # The gap size required to expand the alignment + my ( $start_gap_length, $end_gap_length ); + { + + # Make handling the sequence easier + my $q_seq = join "", @{ [ $q_query->seq ] }; + my $a_seq = join "", @{ $a_query->align }; + + $q_seq =~ s/-//g; + $a_seq =~ s/-//g; + + ( $q_seq, $a_seq ) = map { uc $_ } $q_seq, $a_seq; + + $start_gap_length = index( $q_seq, $a_seq ); + croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1; + + $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length; + } + + # Add the gaps to the end of the alignments + for (@seqs) { + $_->align( [ ($GAP) x $start_gap_length, @{ $_->align }, ($GAP) x $end_gap_length ] ); + } + + # Put the query sequence back + unshift @seqs, + PSISEQ->new( + id => $a_query->id, + align => [ + ( $start_gap_length ? @{ $q_query->seq }[ 0 .. ( $start_gap_length - 1 ) ] : () ), + @{ $a_query->align }, + ( $end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : () ), + ] + ); + + return @seqs; +} + +##################################################################################################### + +=head1 $int = fasta_seq_length($path) + +Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue. + +=cut + +sub fasta_seq_length { + my ($fn) = @_; + open my $fh, $fn or croak "Can't open file $fn: $!\n"; + + my $length = 0; + local $_, $/ = "\n"; + while (<$fh>) { + next if /^>/; + chomp; + $length += tr///c; + } + + return $length; +}