=head1 SYNOPSIS
-./jpred.pl -in/-sequence <FILE1> [-out/-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-verbose] [-debug] [-help] [-man]
+./jpred.pl -in/-sequence <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
=head1 DESCRIPTION
=over 5
-=item -in/-sequence FILE
+=item -in/-sequence <FILE1>
The path to the sequence file (in FASTA format) you want to predict.
-=item -out/-output FILEPREFIX
+=item -output <FILEPREFIX>
A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
+=item -outfile <FILE2>
+
+An output file, by default it is undefined and the -output option is working
+
+=item -logfile <FILE3>
+
+Logging file, by default it is undefined and logging switched off
+
=item -dbpath PATH
-Path to the uniref database used for PSI-BLAST querying.
-Default: .
+Path to the uniref database used for PSI-BLAST querying. default value: .
=item -dbname database
Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code.
-Default: uniref90
+Default value: uniref90
=item -psi path
=item -no-final
-Untoggle final step of Jpred prediction
+By default jpred generates a fasts file with sequences found with PSI-BLAST and predicted sequence features
+The option untoggles this step and stops jpred at the concise file generation
+
+=item -jabaws
+
+Redefined basic variable in order to use the script within JABAWS
=item -verbose
# internal jpred modules
use Jpred;
-use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS);
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin check_OS setup_env);
use HMMER::Profile;
use HMMER::Profile::Jnet;
use PSIBLAST;
#####################################################################################################
my $infile;
-my $output;
+my $outfile;
+my $prefix;
my $psiblast;
+my $jabaws;
+my $logfile;
my $ncpu = 1; # number of CPUs for psiblast calculations
my $predNoHits = 0; # define whether to make predictions when no PSI-BLAST hits are found
-my $db_path = ".";
-my $db_entry = "unknown";
+my $db_path = "/homes/www-jpred/databases";
+my $db_entry = "cluster";
my $nofinal;
my ( $help, $man, $DEBUG, $VERBOSE );
GetOptions(
"in|sequence=s" => \$infile,
- "out|output=s" => \$output,
+ "output=s" => \$prefix,
+ "outfile=s" => \$outfile,
+ "logfile=s" => \$logfile,
"psi=s" => \$psiblast,
"dbname=s" => \$db_entry,
"dbpath=s" => \$db_path,
"ncpu=s" => \$ncpu,
"pred-nohits" => \$predNoHits,
"no-final" => \$nofinal,
+ "jabaws" => \$jabaws,
"help" => \$help,
"man" => \$man,
"debug" => \$DEBUG,
database => $db_path . "/uniref90.filt",
unfiltered => $db_path . "/uniref90",
},
+
## cluster-specific path for Jpred
cluster => {
- database => "/homes/www-jpred/databases/uniref90.filt",
- unfiltered => "/homes/www-jpred/databases/uniref90",
+ database => $db_path . "uniref90.filt",
+ unfiltered => $db_path . "uniref90",
},
## these other DBs are experimental ones used during development.
## check they exist before using them.
+ # generic entry for use with validate_jnet.pl
+ # real db location defined by validate_jnet.pl
swall => {
-
- # generic entry for use with validate_jnet.pl
- # real db location defined by validate_jnet.pl
database => $db_path . "/swall/swall.filt",
unfiltered => $db_path . "/swall/swall",
},
+ # Path to PSIBLAST db
uniprot => {
-
- # Path to PSIBLAST db
database => $db_path . "/3/swall.filt",
unfiltered => $db_path . "/3/swall",
},
},
};
-pod2usage(' -sequence argument not provided') unless $infile;
-die "-db $db_entry not recognised" unless exists $database->{$db_entry};
-$output = $infile . ".res" unless $output;
-$ncpu = 1 if ( 1 > $ncpu or 8 < $ncpu );
+pod2usage(' ERROR! Input file with a sequence should be provided with -sequence/-in') unless $infile;
+pod2usage(' ERROR! Unknown database at ' . $db_path . '. Use -dbpath and -dbname for configuring the database' ) unless exists $database->{$db_entry};
+pod2usage(' ERROR! UNIREF database doesn\'t exist. Use -dbpath and -dbname for configuring the database') unless -f $database->{$db_entry}{database};
+
+if ( defined $prefix ) {
+ unless (defined $outfile) {
+ $outfile = $prefix. ".res.fasta";
+ }
+} else {
+ if (defined $outfile) {
+ print "WARNING! file prefix is not defined. Jpred will use $outfile.tmp as the prefix\n";
+ $prefix = $outfile . ".tmp";
+ } else {
+ print "WARNING! file prefix is not defined. Jpred will use $infile.res as the prefix\n";
+ $prefix = $infile . ".res";
+ $outfile = $prefix. ".jnet";
+ }
+}
-for ( [ "input", $infile ], [ "output", $output ], [ "psi", $psiblast ], [ "db", $db_entry ] ) {
+if ( 1 > $ncpu or 8 < $ncpu ) {
+ print "WARNING! the nuber of CPUs should be between 1 and 8. Jpred will use 1 CPU\n";
+ $ncpu = 1;
+}
+
+$VERBOSE = 1 if $DEBUG;
+
+my $LOG;
+if (defined $logfile) {
+ open( $LOG, ">", $logfile ) or die "ERROR! unable to open '$logfile': ${!}\nDied";
+}
+
+for ( [ "input file", $infile ], [ "out file", $outfile ], [ "outprefix", $prefix ], [ "psi", $psiblast ], [ "db name", $db_entry ], [ "db path", $db_path ] ) {
my ( $key, $value ) = @{$_};
defined $value or next;
- errlog( join( ": ", $key, $value ), "\n" );
+ my $par = join( ": ", $key, $value );
+ print "$par\n" if $DEBUG;
+ print $LOG "$par\n" if $LOG;
}
#####################################################################################################
+if (defined $jabaws) {
+ setup_jpred_env($Bin);
+ setup_env($Bin);
+}
my $platform = check_OS();
print "JPRED: checking platiform... $platform\n" if $DEBUG;
+print $LOG "JPRED: checking platiform... $platform\n" if $LOG;
#####################################################################################################
my $query = FASTA::File->new( read_file => $infile );
args => "-a" . $ncpu . " -e0.05 -h0.01 -m6 -b10000 -v10000 -j3",
path => $psiblastbin,
input => $infile,
- output => "$output.blast",
- matrix => "$output.profile",
+ output => "$prefix.blast",
+ matrix => "$prefix.profile",
database => $database->{$db_entry}{database},
);
}
print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
+ print $LOG "BLAST matrix: $ENV{BLASTMAT}\n" if $LOG;
+ print $LOG "BLASTDB path: $ENV{BLASTDB}\n" if $LOG;
print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+ print $LOG "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $LOG;
$psi_run->run or die; # CC sub is from PSIBLAST::Run
- $psi->read_file("$output.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
- system("gzip -9f $output.blast");
+ $psi->read_file("$prefix.blast"); # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+ system("gzip -9f $prefix.blast");
} else {
if ( $psiblast =~ /.gz$/ ) { $psi->read_gzip_file($psiblast) } # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
else { $psi->read_file($psiblast) } # CC ditto above
if ( $predNoHits == 0 ) {
warn "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n";
print ">>100% complete\n";
+ print $LOG "\nJPRED: Warning! no PSI-BLAST hits found and '-pred-nohits' option not set. Exiting...\n" if $LOG;
+ print $LOG ">>100% complete\n" if $LOG;
+ close ($LOG) if $LOG;
exit;
} else {
print ">>50% complete\n";
warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
print "Running HMMer on query...\n" if $VERBOSE;
+ print $LOG ">>50% complete\n" if $LOG;
+ print $LOG "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n" if $LOG;
+ print $LOG "Running HMMer on query...\n" if $LOG;
# copy input query to alignment
- #system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
+ #system("cp $path $prefix.align") == 0 or croak "Error: unable to copy '$path'\n";
open( my $ifh, "<", $infile ) or die "JPRED: cannot open file $infile: $!";
- open( my $ofh, ">", $output . ".align" ) or die "JPRED: cannot open file $output\.align: $!";
+ open( my $ofh, ">", $prefix . ".align" ) or die "JPRED: cannot open file $prefix\.align: $!";
while (<$ifh>) { print $ofh, $_ }
close $ifh;
close $ofh;
# Convert from HMMER::Profile to HMMER::Profile::Jnet
my $hmmer_jnet = HMMER::Profile::Jnet->new;
$hmmer_jnet->add_line( @{$_} ) for $hmmer->get_line;
- $hmmer_jnet->write_file("$output.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+ $hmmer_jnet->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
print ">>70% complete\n";
+ print $LOG ">>70% complete\n" if $LOG;
# Run Jnet for the prediction
print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
- jnet( map { "$output.$_" } qw(hmm jnet) ); # CC sub is below
+ print $LOG "Running JNet using the generated inputs from HMM only...\n" if $LOG;
+ my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet) );
+ print ">>90% complete\n";
+ print $LOG ">>90% complete\n" if $LOG;
+ print $jnetlog if $VERBOSE;
+ print $LOG $jnetlog if $LOG;
}
} else {
- psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG; # CC sub is below
+ psiseq2fasta( "0.fasta.gz", @seqs ) if $DEBUG;
print ">>40% complete\n";
+ print $LOG ">>40% complete\n" if $LOG;
#####################################################################################################
# Make PSIBLAST truncated alignments the right length
print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
- @seqs = extend( $query, @seqs ); # CC sub si below
+ print $LOG "Untruncating the PSIBLAST alignments...\n" if $LOG;
+ @seqs = extend( $query, @seqs );
psiseq2fasta( "1.fasta.gz", @seqs ) if $DEBUG;
#####################################################################################################
# Remove masking from PSIBLAST alignment
print "Unmasking the alignments...\n" if $VERBOSE;
+ print $LOG "Unmasking the alignments...\n" if $LOG;
my $idx = Index->new( type => "Index::FastaCMD" ) or die "Index type cannot be found.";
remove_seq_masks( $idx, $_ ) for @seqs[ 1 .. $#seqs ];
psiseq2fasta( "2.fasta.gz", @seqs ) if $DEBUG;
#####################################################################################################
# Convert the sequences to upper case
print "Converting sequences to the same case...\n" if $VERBOSE;
+ print $LOG "Converting sequences to the same case...\n" if $LOG;
toupper($_) for @seqs; # CC sub is below
- psiseq2fasta( "${output}_backup.fasta.gz", @seqs );
+ psiseq2fasta( "${prefix}_backup.fasta.gz", @seqs );
#####################################################################################################
# Remove excessive sequences
print "Remove excessive sequences...\n" if $VERBOSE;
+ print $LOG "Remove excessive sequences...\n" if $LOG;
@seqs = reduce( $MAX_ALIGN, @seqs );
psiseq2fasta( "3.fasta.gz", @seqs ) if $DEBUG;
#####################################################################################################
# Remove sequences that are too long or too short
print "Remove sequences which too long or short...\n" if $VERBOSE;
+ print $LOG "Remove sequences which too long or short...\n" if $LOG;
@seqs = del_long_seqs( 50, @seqs );
psiseq2fasta( "4.fasta.gz", @seqs ) if $DEBUG;
#####################################################################################################
# Remove redundant sequences based upon pairwise identity and OC clustering
print "Remove redundant sequences...\n" if $VERBOSE;
+ print $LOG "Remove redundant sequences...\n" if $LOG;
@seqs = nonred( $NR_CUT, @seqs );
psiseq2fasta( "5.fasta.gz", @seqs ) if $DEBUG;
# Check that we haven't got rid of all of the sequences
if ( @seqs < 2 ) {
warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
- @seqs = fasta2psiseq("${output}_backup.fasta.gz");
+ print $LOG "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n" if $LOG;
+ @seqs = fasta2psiseq("${prefix}_backup.fasta.gz");
}
- unlink("${output}_backup.fasta.gz") unless $DEBUG;
+ unlink("${prefix}_backup.fasta.gz") unless $DEBUG;
# Remove gaps in the query sequence and same positions in the alignment
print "Removing gaps in the query sequence...\n" if $VERBOSE;
+ print $LOG "Removing gaps in the query sequence...\n" if $LOG;
degap(@seqs);
psiseq2fasta( "6.fasta.gz", @seqs ) if $DEBUG;
# Output the alignment for the prediction
print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
- psiseq2fasta( "$output.align", @seqs );
+ print $LOG "Outputting cleaned-up PSI_BLAST alignment...\n" if $LOG;
+ psiseq2fasta( "$prefix.align", @seqs );
#####################################################################################################
# Equivilent to getpssm script
print "Output the PSSM matrix from the PSI-BLAST profile...\n";
- my $pssm = PSIBLAST::PSSM->new( read_file => "$output.profile" );
- $pssm->write_file("$output.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
+ print $LOG "Output the PSSM matrix from the PSI-BLAST profile...\n" if $LOG;
+ my $pssm = PSIBLAST::PSSM->new( read_file => "$prefix.profile" );
+ $pssm->write_file("$prefix.pssm"); # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
print ">>50% complete\n";
+ print $LOG ">>50% complete\n" if $LOG;
#####################################################################################################
# Run HMMER on the sequences
print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+ print $LOG "Running HMMer on sequences found by PSI-BLAST...\n" if $LOG;
my $hmmer = hmmer(@seqs); # CC sub is below
- $hmmer->write_file("$output.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
+ $hmmer->write_file("$prefix.hmm"); # write_file is in the Read.pm called from the HMMER::Profile module
print ">>70% complete\n";
+ print $LOG ">>70% complete\n" if $LOG;
#####################################################################################################
# Run Jnet for the prediction
print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
- jnet( map { "$output.$_" } qw(hmm jnet pssm) ); # CC sub is below
+ print $LOG "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $LOG;
+ my $jnetlog = jnet( map { "$prefix.$_" } qw(hmm jnet pssm) ); # CC sub is below
+ print ">>90% complete\n";
+ print $LOG ">>90% complete\n" if $LOG;
+ print $jnetlog if $VERBOSE;
+ print $LOG $jnetlog if $LOG;
}
-print ">>100% complete\n";
-
unless ( defined $nofinal ) {
- my $aligfile = $output.".align";
- my $jnetfile = $output.".jnet";
- my $jnetfastafile = $output.".jnet.fasta";
- my $resufile = $output.".res";
- concise2fasta($jnetfile, $jnetfastafile);
- open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
+ my $aligfile = $prefix . ".align";
+ my $jnetfile = $prefix . ".jnet";
+ my $jnetfastafile = $prefix . ".jnet.fasta";
+ concise2fasta( $jnetfile, $jnetfastafile );
+ open( my $IN1, "<", $aligfile ) or die "ERROR! unable to open '$aligfile': ${!}\nDied";
open( my $IN2, "<", $jnetfastafile ) or die "ERROR! unable to open '$jnetfastafile': ${!}\nDied";
- open( my $OUT, ">", $resufile ) or die "ERROR! unable to open '$resufile': ${!}\nDied";
- while (<$IN2>) {print $OUT $_;}
- while (<$IN1>) {print $OUT $_;}
+ open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+ while (<$IN2>) { print $OUT $_; }
+ while (<$IN1>) { print $OUT $_; }
close($IN1);
close($IN2);
close($OUT);
+ unlink $jnetfastafile;
+} else {
+ if (defined $outfile and $prefix.".jnet" ne $outfile) {
+ rename $prefix.".jnet", $outfile;
+ }
}
+print ">>100% complete\n";
print "Jpred Finished\n";
+print $LOG ">>100% complete\n" if $LOG;
+print $LOG "Jpred Finished\n" if $LOG;
+close ($LOG) if $LOG;
exit;
#####################################################################################################
#####################################################################################################
#####################################################################################################
+
+=begin :private
+
+=head2 fasta2concise ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the fasta format into concise file
+
+=cut
+
sub fasta2concise {
my $infile = shift;
my $outfile = shift;
- open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
- open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
-
+ open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
my ( $seq, @seqs, @title );
while (<$IN>) {
if (s/^>//) {
$seq =~ s/\n|\s//g;
$seq =~ s/(.)/$1,/g;
push @seqs, $seq;
+ close($IN);
- if ( @title != @seqs ) { die("non matching number of titles and sequences!\n"); }
+ if ( @title != @seqs ) {
+ die("non matching number of titles and sequences!\n");
+ }
+ open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
foreach ( 0 .. $#title ) {
- print "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
+ print $OUT "align" . ( $_ + 1 ) . ";$title[$_]:$seqs[$_]\n";
}
- close($IN);
close($OUT);
}
+=begin :private
+
+=head2 concise2fasta ($infile, $outfile)
+
+Convert a file with PSI-BLAST sequences and prediction in the concise format into fasta file
+
+=cut
+
#####################################################################################################
sub concise2fasta {
my $infile = shift;
my ( @seqs, %seq, @pred, %pred );
my @var = (
- "Lupas_21", "Lupas_14", "Lupas_28", "JNETPSSM", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETFREQ",
- "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
+ "Lupas_21", "Lupas_14", "Lupas_28", "MULTCOIL", "MULTCOIL_TRIMER", "MULTCOIL_DIMER", "JNETPSSM", "JNETFREQ",
+ "JNETALIGN", "JNETHMM", "JNETSOL5", "JNETSOL25", "JNETSOL0", "jnetpred", "jpred"
);
- open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
- open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
-
+ # parse input concise file
+ open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\nDied";
while (<$IN>) {
- if (/^\n/) { next; }
- my ( $id, $seq ) = split( ":", $_ );
- if ( !$id || !$seq ) { next; } # Check we have proper values
- $seq =~ s/,//g;
- chomp($seq);
- if ( $id =~ /;/ ) { # Then it's an alignment
- @_ = split( ";", $id );
- push @seqs, $_[1];
- $seq{ $_[1] } = $seq;
- }
- foreach (@var) {
- if ( $_ eq $id ) {
- push @pred, $_;
- $pred{$_} = $seq;
+ unless (/^\n/) {
+ my ( $id, $seq ) = split( ":", $_ );
+ if ( defined $id and defined $seq ) { # Check we have proper values
+ $seq =~ s/,//g;
+ chomp $seq;
+ if ( $id =~ /;/ ) { # Then it's an alignment
+ @_ = split( ";", $id );
+ push @seqs, $_[1];
+ $seq{ $_[1] } = $seq;
+ }
+ foreach my $v (@var) {
+ if ( $v eq $id ) {
+ push @pred, $v;
+ $pred{$v} = $seq;
+ }
+ }
}
}
}
close($IN);
+ open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+
+ # print out sequences found with PSI-BLAST
foreach (@seqs) {
$seq{$_} =~ s/(.{72})/$1\n/g;
print $OUT ">$_\n$seq{$_}\n";
}
- foreach (@pred) {
- $pred{$_} =~ s/[TCYWXZSI\?_]/-/g;
- $pred{$_} =~ s/B/E/g;
- $pred{$_} =~ s/G/H/g;
+ # print out predictions
+ foreach my $p (@pred) {
+ $pred{$p} =~ s/[TCYWXZSI\?_]/-/g;
+ $pred{$p} =~ s/B/E/g;
+ $pred{$p} =~ s/G/H/g;
- if (/SOL/) { $pred{$_} =~ s/E/B/g; }
- $pred{$_} =~ s/(.{72})/$1\n/g;
- print $OUT ">$_\n$pred{$_}\n";
+ if ( $p =~ /SOL/ ) {
+ $pred{$p} =~ s/E/B/g;
+ }
+ $pred{$p} =~ s/(.{72})/$1\n/g;
+ print $OUT ">$p\n$pred{$p}\n";
}
close($OUT);
}
system("$jnet --concise --hmmer $hmmer > $outfile");
}
check( "jnet", $? ) or exit 1;
+ my $res = "";
+ my $logging = "";
+ open( my $IN, "<", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+ while (<$IN>) {
+ if (/^jnetpred:|^JNET[A-Z0-9]+:/) {
+ $res .= $_;
+ } else {
+ $logging .= $_;
+ }
+ }
+ close $IN;
+ open( my $OUT, ">", $outfile ) or die "ERROR! unable to open '$outfile': ${!}\nDied";
+ print $OUT "\n".$res;
+ close $OUT;
+ return $logging;
}
=head1 psiseq2fasta($path, @PSISEQ)
Report messages to STDERR
=cut
-
-#####################################################################################################
-sub errlog { print STDERR @_ }