=head1 SYNOPSIS
-./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
+./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
=head1 DESCRIPTION
-This is a program which predicts the secondary structure of a protein sequence given a path to a
-FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
-
-The program is primarily design for use by the Jpred server, but it can be used directly by any user.
-Some of the output may not be meaningful if used directly - it can be safely ignored.
+This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
+(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
+For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
+secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
=head1 OPTIONS
=over 5
-=item -in/-sequence <FILE1>
+=item -in <FILE1>
+
+The path to the sequence file (in FASTA, MSF, or BLC format)
-The path to the sequence file (in FASTA format) you want to predict.
+=item -seq
+
+The input file is a FASTA file with one sequence only.
=item -output <FILEPREFIX>
use Data::Dumper;
use File::Temp;
-#use UNIVERSAL;
-
# path to Jpred modules
use FindBin qw($Bin);
use lib "$Bin/lib";
# internal jpred modules
use Jpred;
-use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin $alscript $readseq check_OS setup_env);
use HMMER::Profile;
use HMMER::Profile::Jnet;
use PSIBLAST;
#####################################################################################################
my $infile;
+my $infastafile;
+my $format;
+my $goal = "align";
+my $seqgoal;
my $outfile;
my $prefix;
my $psiblast;
my $jabaws;
my $logfile;
-my $ncpu = 1; # number of CPUs for psiblast calculations
-my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
+my $ncpu = 1; # number of CPUs for psiblast calculations
+my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
my $db_path = "/homes/www-jpred/databases";
my $db_entry = "cluster";
my $nofinal;
my ( $help, $man, $DEBUG, $VERBOSE );
GetOptions(
- "in|sequence=s" => \$infile,
- "output=s" => \$prefix,
- "outfile=s" => \$outfile,
- "logfile=s" => \$logfile,
- "psi=s" => \$psiblast,
- "dbname=s" => \$db_entry,
- "dbpath=s" => \$db_path,
- "ncpu=s" => \$ncpu,
- "pred-nohits" => \$predNoHits,
- "no-final" => \$nofinal,
- "jabaws" => \$jabaws,
- "help" => \$help,
- "man" => \$man,
- "debug" => \$DEBUG,
- "verbose" => \$VERBOSE
+ "in=s" => \$infile,
+ "output=s" => \$prefix,
+ "outfile=s" => \$outfile,
+ "logfile=s" => \$logfile,
+ "psi=s" => \$psiblast,
+ "dbname=s" => \$db_entry,
+ "dbpath=s" => \$db_path,
+ "ncpu=s" => \$ncpu,
+ "pred-nohits" => \$predNoHits,
+ "no-final" => \$nofinal,
+ "seq" => \$seqgoal,
+ "jabaws" => \$jabaws,
+
+ "help" => \$help,
+ "man" => \$man,
+ "debug" => \$DEBUG,
+ "verbose" => \$VERBOSE
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage( verbose => 2 ) if $man;
+$goal = "seq" if ( defined $seqgoal );
+
#####################################################################################################
# Key to database information and information for accessing them
-if (defined $jabaws) {
- $db_path = "/data/UNIREFdb";
- $db_entry = "cluster";
+if ( defined $jabaws ) {
+ $db_path = "/data/UNIREFdb";
+ $db_entry = "cluster";
}
my $database = {
};
my $dp = $database->{$db_entry}{database};
-pod2usage("ERROR! Input file with a sequence should be provided with -sequence/-in") unless $infile;
-pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database" ) unless exists $database->{$db_entry};
+pod2usage("ERROR! Input file should be provided with -sequence/-in") unless $infile;
+pod2usage("ERROR! Unknown database at $db_path. Use -dbpath and -dbname for configuring the database") unless exists $database->{$db_entry};
pod2usage("ERROR! UNIREF database is not available at $dp. Use -dbpath and -dbname for configuring the database") unless -f $dp;
+#####################################################################################################
+# lots of preparation steps
if ( defined $prefix ) {
- unless (defined $outfile) {
- $outfile = $prefix. ".res.fasta";
+ unless ( defined $outfile ) {
+ $outfile = $prefix . ".res.fasta";
}
} else {
- if (defined $outfile) {
+ if ( defined $outfile ) {
print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
$prefix = $outfile . ".tmp";
} else {
print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
- $prefix = $infile . ".res";
- $outfile = $prefix. ".jnet";
+ $prefix = $infile . ".res";
+ $outfile = $prefix . ".jnet";
}
}
$VERBOSE = 1 if $DEBUG;
my $LOG;
-if (defined $logfile) {
+if ( defined $logfile ) {
open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
}
print $LOG "$par\n" if $LOG;
}
-#####################################################################################################
-if (defined $jabaws) {
+if ( defined $jabaws ) {
setup_jpred_env($Bin);
setup_env($Bin);
}
print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
#####################################################################################################
-my $query = FASTA::File->new( read_file => $infile );
-
-my @seqs;
-my $psi = PSIBLAST->new;
-unless ( defined $psiblast && -e $psiblast ) {
- ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
- # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
- # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
- # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
- my $psi_run = PSIBLAST::Run->new(
- debug => $DEBUG,
- args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
- path => $psiblastbin,
- input => $infile,
- output => "$prefix.blast",
- matrix => "$prefix.profile",
- database => $database->{$db_entry}{database},
- );
-
- # For reduced databases, get the size of the orginal and use this as
- # the equivilent DB size
- if ( $db_entry =~ /^sp_red_/ ) {
- ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
- $psi_run->args(
- $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
- );
+# check input file format
+if ( 'seq' eq $goal ) {
+ $format = "seq";
+ if ( 1 != check_FASTA_format($infile) ) {
+ die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
}
- print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
- print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
- print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
- print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
-
- print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
- print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
- $psi_run->run or die; # CC sub is from PSIBLAST::Run
-
- $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
- system("gzip -9f $prefix.blast");
} else {
- if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
- else { $psi->read_file($psiblast) } # CC ditto above
+ my $nseq = check_FASTA_format($infile);
+ if ( 0 < $nseq ) {
+ $format = "fasta";
+ if ( 1 == $nseq ) {
+ die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n";
+ }
+ } elsif ( 0 < check_MSF_format($infile) ) {
+ $format = "msf";
+ } elsif ( 0 < check_BLC_format($infile) ) {
+ $format = "blc";
+ } else {
+ die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
+ }
}
+$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
#####################################################################################################
-# Convert the last itteration into a collection of PSISEQ objects
-for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
- my ( $id, $start, $align ) = @{$_};
- push @seqs,
- PSISEQ->new(
- id => $id,
- start => $start,
- align => [ split( //, $align ) ]
+if ( 'seq' eq $goal ) {
+ my $query = FASTA::File->new( read_file => $infile );
+ my @seqs;
+ my $psi = PSIBLAST->new;
+ unless ( defined $psiblast && -e $psiblast ) {
+ ## potentially a better set of parameters (esp. -b & -v) which avoid PSI-BLAST dying with large number of hits
+ # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"' # 'soft' filtering of the query sequence
+ # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
+ # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
+ my $psi_run = PSIBLAST::Run->new(
+ debug => $DEBUG,
+ args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
+ path => $psiblastbin,
+ input => $infile,
+ output => "$prefix.blast",
+ matrix => "$prefix.profile",
+ database => $database->{$db_entry}{database},
);
-}
+
+ # For reduced databases, get the size of the orginal and use this as
+ # the equivilent DB size
+ if ( $db_entry =~ /^sp_red_/ ) {
+ ( my $entry = $db_entry ) =~ s/^sp_red_/sp_/;
+ $psi_run->args(
+ $psi_run->args . " -z " . fasta_seq_length( $database->{$entry}{database} ) # CC sub is below
+ );
+ }
+ print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
+ print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
+ print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
+ print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
+
+ print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+ print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
+ $psi_run->run or die; # CC sub is from PSIBLAST::Run
+
+ $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+ system("gzip -9f $prefix.blast");
+ } else {
+ if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
+ else { $psi->read_file($psiblast) } # CC ditto above
+ }
+
+#####################################################################################################
+ # Convert the last itteration into a collection of PSISEQ objects
+ for ( $psi->get_all ) { # CC sub is from PSIBLAST.pm
+ my ( $id, $start, $align ) = @{$_};
+ push @seqs,
+ PSISEQ->new(
+ id => $id,
+ start => $start,
+ align => [ split( //, $align ) ]
+ );
+ }
#####################################################################################################
-## When there are no PSI-BLAST hits generate an HMM from the query
-## and run Jnet against that only.
-## Accuracy won't be as good, but at least it's a prediction
-if ( @seqs == 0 ) {
- if ( $predNoHits == 0 ) {
- warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
- print ">>100% complete\n";
- print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
- print $LOG ">>100% complete\n" if $LOG;
- close ($LOG) if $LOG;
- exit;
+ # When there are no PSI-BLAST hits generate an HMM from the query and run Jnet against that only.
+ # Accuracy won't be as good, but at least it's a prediction
+ if ( @seqs == 0 ) {
+ if ( $predNoHits == 0 ) {
+ warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
+ print ">>100% complete\n";
+ print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
+ print $LOG ">>100% complete\n" if $LOG;
+ close($LOG) if $LOG;
+ exit;
+ } else {
+ print ">>50% complete\n";
+ warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
+ print "Running HMMer on query...\n" if $VERBOSE;
+ print $LOG ">>50% complete\n" if $LOG;
+ print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
+ print $LOG "Running HMMer on query...\n" if $LOG;
+
+ # copy input query to alignment
+ #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
+ open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
+ open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
+ while (<$ifh>) { print $ofh, $_ }
+ close $ifh;
+ close $ofh;
+
+ # Temp files required for HMMer
+ my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+ system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
+ system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+ unlink $hmmbuild_out;
+
+ # Read in the HMMER file
+ my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+ # Convert from HMMER::Profile to HMMER::Profile::Jnet
+ my $hmmer_jnet = HMMER::Profile::Jnet->new;
+ $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
+ $hmmer_jnet->write_file("$prefix.hmm");
+ print ">>70% complete\n";
+ print $LOG ">>70% complete\n" if $LOG;
+
+ # Run Jnet for the prediction
+ print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+ print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+ my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+ print ">>90% complete\n";
+ print $LOG ">>90% complete\n" if $LOG;
+ print $jnetlog if $VERBOSE;
+ print $LOG $jnetlog if $LOG;
+ }
} else {
+ psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
+ print ">>40% complete\n";
+ print $LOG ">>40% complete\n" if $LOG;
+
+ #####################################################################################################
+ # Make PSIBLAST truncated alignments the right length
+ print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
+ print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
+ @seqs = extend( $query, @seqs );
+ psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
+
+ #####################################################################################################
+ # Remove masking from PSIBLAST alignment
+ print "Unmasking the alignments...\n" if $VERBOSE;
+ print $LOG "Unmasking the alignments...\n" if $LOG;
+ my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
+ remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
+ psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
+
+ #####################################################################################################
+ # Convert the sequences to upper case
+ print "Converting sequences to the same case...\n" if $VERBOSE;
+ print $LOG "Converting sequences to the same case...\n" if $LOG;
+ toupper($_) for @seqs; # CC sub is below
+ psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
+
+ #####################################################################################################
+ # Remove excessive sequences
+ print "Remove excessive sequences...\n" if $VERBOSE;
+ print $LOG "Remove excessive sequences...\n" if $LOG;
+ @seqs = reduce( $MAX_ALIGN, @seqs );
+ psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
+
+ #####################################################################################################
+ # Remove sequences that are too long or too short
+ print "Remove sequences which too long or short...\n" if $VERBOSE;
+ print $LOG "Remove sequences which too long or short...\n" if $LOG;
+ @seqs = del_long_seqs( 50, @seqs );
+ psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
+
+ #####################################################################################################
+ # Remove redundant sequences based upon pairwise identity and OC clustering
+ print "Remove redundant sequences...\n" if $VERBOSE;
+ print $LOG "Remove redundant sequences...\n" if $LOG;
+ @seqs = nonred( $NR_CUT, @seqs );
+ psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
+
+ #####################################################################################################
+ # Check that we haven't got rid of all of the sequences
+ if ( @seqs < 2 ) {
+ warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
+ print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
+ @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+ }
+ unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
+
+ # Remove gaps in the query sequence and same positions in the alignment
+ print "Removing gaps in the query sequence...\n" if $VERBOSE;
+ print $LOG "Removing gaps in the query sequence...\n" if $LOG;
+ degap(@seqs);
+ psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
+
+ # Output the alignment for the prediction
+ print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
+ print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
+ psiseq2fasta( "$prefix.align", @seqs );
+
+ #####################################################################################################
+ # Equivilent to getpssm script
+ print "Output the PSSM matrix from the PSI-BLAST profile...\n";
+ print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
+ my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
+ $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
print ">>50% complete\n";
- warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
- print "Running HMMer on query...\n" if $VERBOSE;
print $LOG ">>50% complete\n" if $LOG;
- print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
- print $LOG "Running HMMer on query...\n" if $LOG;
-
- # copy input query to alignment
- #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
- open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
- open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
- while (<$ifh>) { print $ofh, $_ }
- close $ifh;
- close $ofh;
-
- # Temp files required for HMMer
- my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
-
- system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $infile");
- system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
-
- # Read in the HMMER file
- my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
-
- # Convert from HMMER::Profile to HMMER::Profile::Jnet
- my $hmmer_jnet = HMMER::Profile::Jnet->new;
- $hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
- $hmmer_jnet->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+
+ #####################################################################################################
+ # Run HMMER on the sequences
+ print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+ print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
+ my $hmmer = hmmer(@seqs); # CC sub is below
+ $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
print ">>70% complete\n";
print $LOG ">>70% complete\n" if $LOG;
-
+#####################################################################################################
# Run Jnet for the prediction
- print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
- print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
- my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+ print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
+ print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
+ my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
print ">>90% complete\n";
print $LOG ">>90% complete\n" if $LOG;
- print $jnetlog if $VERBOSE;
- print $LOG $jnetlog if $LOG;
+ print $jnetlog if $VERBOSE;
+ print $LOG $jnetlog if $LOG;
}
} else {
- psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
- print ">>40% complete\n";
- print $LOG ">>40% complete\n" if $LOG;
-
- #####################################################################################################
- # Make PSIBLAST truncated alignments the right length
- print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
- print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
- @seqs = extend( $query, @seqs );
- psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
-
- #####################################################################################################
- # Remove masking from PSIBLAST alignment
- print "Unmasking the alignments...\n" if $VERBOSE;
- print $LOG "Unmasking the alignments...\n" if $LOG;
- my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
- remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
- psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
-
- #####################################################################################################
- # Convert the sequences to upper case
- print "Converting sequences to the same case...\n" if $VERBOSE;
- print $LOG "Converting sequences to the same case...\n" if $LOG;
- toupper($_) for @seqs; # CC sub is below
- psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
-
- #####################################################################################################
- # Remove excessive sequences
- print "Remove excessive sequences...\n" if $VERBOSE;
- print $LOG "Remove excessive sequences...\n" if $LOG;
- @seqs = reduce( $MAX_ALIGN, @seqs );
- psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
-
- #####################################################################################################
- # Remove sequences that are too long or too short
- print "Remove sequences which too long or short...\n" if $VERBOSE;
- print $LOG "Remove sequences which too long or short...\n" if $LOG;
- @seqs = del_long_seqs( 50, @seqs );
- psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
-
- #####################################################################################################
- # Remove redundant sequences based upon pairwise identity and OC clustering
- print "Remove redundant sequences...\n" if $VERBOSE;
- print $LOG "Remove redundant sequences...\n" if $LOG;
- @seqs = nonred( $NR_CUT, @seqs );
- psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
-
- #####################################################################################################
- # Check that we haven't got rid of all of the sequences
- if ( @seqs < 2 ) {
- warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
- print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
- @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
+ if ( 'fasta' eq $format ) {
+ print "Read FASTA file\n";
+ my $hmmer1 = hmm_for_align($infile);
+ $hmmer1->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+ } elsif ( 'msf' eq $format ) {
+ msf2fasta( $infile, $infastafile );
+ my $hmmer2 = hmm_for_align($infastafile);
+ $hmmer2->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+ } elsif ( 'blc' eq $format ) {
+ my $tmpfile = File::Temp->new->filename;
+ blc2msf( $infile, $tmpfile );
+ msf2fasta( $infile, $infastafile );
+ my $hmmer3 = hmm_for_align($infastafile);
+ $hmmer3->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+ } else {
+ die "ERROR! unknown input format '$format'. exit...\n";
}
- unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
-
- # Remove gaps in the query sequence and same positions in the alignment
- print "Removing gaps in the query sequence...\n" if $VERBOSE;
- print $LOG "Removing gaps in the query sequence...\n" if $LOG;
- degap(@seqs);
- psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
-
- # Output the alignment for the prediction
- print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
- print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
- psiseq2fasta( "$prefix.align", @seqs );
-
- #####################################################################################################
- # Equivilent to getpssm script
- print "Output the PSSM matrix from the PSI-BLAST profile...\n";
- print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
- my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
- $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
- print ">>50% complete\n";
- print $LOG ">>50% complete\n" if $LOG;
-
- #####################################################################################################
- # Run HMMER on the sequences
- print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
- print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
- my $hmmer = hmmer(@seqs); # CC sub is below
- $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
print ">>70% complete\n";
print $LOG ">>70% complete\n" if $LOG;
-
- #####################################################################################################
+#####################################################################################################
# Run Jnet for the prediction
- print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
- print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
- my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
+ print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+ print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+ my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) ); # CC sub is below
print ">>90% complete\n";
print $LOG ">>90% complete\n" if $LOG;
- print $jnetlog if $VERBOSE;
- print $LOG $jnetlog if $LOG;
+ print $jnetlog if $VERBOSE;
+ print $LOG $jnetlog if $LOG;
}
unless ( defined $nofinal ) {
my $aligfile = $prefix . ".align";
my $jnetfile = $prefix . ".jnet";
my $jnetfastafile = $prefix . ".jnet.fasta";
+ $aligfile = $infile if ( 'fasta' eq $format );
+ $aligfile = $infastafile if ( 'msf' eq $format or 'blc' eq $format );
concise2fasta( $jnetfile, $jnetfastafile );
open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
close($IN2);
close($OUT);
unlink $jnetfastafile;
+ my $seqs = FASTA::File->new( read_file => $outfile );
+ open( my $OUT2, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\n";
+ $seqs->write($OUT2);
+ close($OUT2);
} else {
- if (defined $outfile and $prefix.".jnet" ne $outfile) {
- rename $prefix.".jnet", $outfile;
+ if ( defined $outfile and $prefix . ".jnet" ne $outfile ) {
+ rename $prefix . ".jnet", $outfile;
}
}
print ">>100% complete\n";
print "Jpred Finished\n";
print $LOG ">>100% complete\n" if $LOG;
-print $LOG "Jpred Finished\n" if $LOG;
-close ($LOG) if $LOG;
+print $LOG "Jpred Finished\n" if $LOG;
+close($LOG) if $LOG;
exit;
#####################################################################################################
# Functions
#####################################################################################################
+sub check_FASTA_format {
+ my $infile = shift;
+
+ open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+ my $check_first_line = 1;
+ my $nseq = 0;
+ local $/ = "\n>";
+ while (<$IN>) {
+ if ($check_first_line) {
+ return 0 unless (/^>/);
+ $check_first_line = 0;
+ }
+ s/^>//g;
+ s/>$//g;
+
+ my ( $id, @seqs ) = split /\n/, $_;
+ return 0 unless ( defined $id or @seqs );
+ my $seq = join( "", @seqs );
+ return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+ ++$nseq;
+ }
+ close($IN);
+
+ return $nseq;
+}
+#####################################################################################################
+sub check_MSF_format {
+ my $infile = shift;
+ $? = 0;
+ system("$readseq -fMSF -a -p < $infile > /dev/null");
+ return 0 if ($?);
+
+ return 1;
+}
+#####################################################################################################
+sub check_BLC_format {
+ my $infile = shift;
+
+ my ( $tmpfile1, $tmpfile2 ) = map { File::Temp->new->filename } 1 .. 2;
+ open my $fh, ">$tmpfile1" or die "$tmpfile1: $!\n";
+ print $fh "silent_mode\nblock_file $infile\n";
+ print $fh "output_file /dev/null\nmax_nseq 2000\n";
+ print $fh "pir_save $tmpfile2\nsetup\n";
+ close $fh;
+
+ $? = 0;
+ system("$alscript -s -f $tmpfile1");
+ $? = 0;
+ system("$readseq -f8 -a -p < $tmpfile2 > /dev/null");
+ unlink $tmpfile1;
+ unlink $tmpfile2;
+ return 0 if ($?);
+
+ return 1;
+}
+############################################################################################################################################
+# convertor BLC -> MSF
+sub msf2fasta {
+ my $infile = shift;
+ my $outfile = shift;
+
+ my $sequence = "";
+ open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+ while (<$IN>) {
+ $sequence .= $_;
+ }
+ close($IN);
+
+ $sequence =~ s/~/\./g;
+
+ ## check for non-unique sequence names - readseq has problems with these
+ my @lines = split /\n/, $sequence;
+ my %names;
+ foreach my $line (@lines) {
+ if ( $line =~ /Name:\s+(\S+)/ ) {
+ die "ERROR! Alignment has non-unique sequence ids. exit...\n" if ( defined( $names{$1} ) && $names{$1} >= 1 );
+ $names{$1}++;
+ }
+ }
+
+ my $tmpfile1 = File::Temp->new->filename;
+ my $tmpfile2 = File::Temp->new->filename;
+ open my $FILE, ">$tmpfile1" or die "open $tmpfile1 failed: $!";
+ print $FILE $sequence;
+ close $FILE;
+
+ $? = 0;
+ system("$readseq -fMSF -a -p < $tmpfile1 > $tmpfile2");
+ die "Reformatting of $infile failed. exit...\n" if ($?);
+ unlink $tmpfile1;
+
+ $? = 0;
+ system("$readseq -f8 -a -p < $tmpfile2 > $outfile");
+ die "Reformatting of $infile failed. exit...\n" if ($?);
+ unlink $tmpfile2;
+
+ die "Reformatting the input alignment has failed. exit...\n" unless ( -e $outfile );
+}
+
+############################################################################################################################################
+# convertor BLC -> MSF
+sub blc2msf {
+ my ( $in, $out ) = @_;
+
+ my ( $tmp, $tmp_pir ) = map { File::Temp->new->filename } 1 .. 2;
+
+ open my $fh, ">$tmp" or die "$tmp: $!\n";
+ print $fh "silent_mode\nblock_file $infile\n";
+ print $fh "output_file /dev/null\nmax_nseq 2000\n";
+ print $fh "pir_save $tmp_pir\nsetup\n";
+ close $fh;
+
+ $? = 0;
+ system("$alscript -s -f $tmp");
+ die "Reformatting of $infile failed. exit...\n" if ($?);
+ system("$readseq -f=MSF -o=$out -a $tmp_pir");
+
+ unlink $tmp;
+ unlink $tmp_pir;
+}
#####################################################################################################
=cut
+#################################################################################################################################################
+
+=head2 get_hmm($alignFile, $name)
+
+Adapted from the hmmer() function in Jpred as written by JDB.
+
+Generates an HMMer profile output compatible with Jnet.
+
+Equivilent to gethmm script.
+
+=cut
+
+sub hmm_for_align {
+ my $fastafile = shift;
+
+ # Temp files required
+ print "hmm_for_align: fastafile = $fastafile\n";
+ my ( $hmmbuild_out, $hmmconvert_out ) = map { File::Temp->new->filename } 1 .. 2;
+
+ system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $fastafile");
+ system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+ # Read in the HMMER file
+ my $hmmer = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+ # Read in the HMMER file
+ my $hmmer_tmp = HMMER::Profile->new( read_file => $hmmconvert_out );
+
+ # Convert from HMMER::Profile to HMMER::Profile::Jnet
+ my $hmmer_jnet = HMMER::Profile::Jnet->new;
+ $hmmer_jnet->add_line( @{$_} ) for $hmmer_tmp->get_line;
+
+ return $hmmer_jnet;
+}
+
#####################################################################################################
sub jnet {
my ( $hmmer, $outfile, $pssm ) = @_;
system("$jnet --concise --hmmer $hmmer > $outfile");
}
check( "jnet", $? ) or exit 1;
- my $res = "";
+ my $res = "";
my $logging = "";
open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
while (<$IN>) {
}
close $IN;
open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
- print $OUT "\n".$res;
+ print $OUT "\n" . $res;
close $OUT;
return $logging;
}
return @seqs;
}
+#####################################################################################################
+
=head1 extend($FASTA::File, @PSISEQ)
Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues
=cut
-#####################################################################################################
sub extend {
my ( $query, @seqs ) = @_;
return @seqs;
}
+#####################################################################################################
+
=head1 $int = fasta_seq_length($path)
Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
=cut
-#####################################################################################################
sub fasta_seq_length {
my ($fn) = @_;
open my $fh, $fn or croak "Can't open file $fn: $!\n";
return $length;
}
-
-=head1 log(@messages)
-
-Report messages to STDERR
-
-=cut