From 2a4439190b9b6cf0f881d5d07531c25b057117d6 Mon Sep 17 00:00:00 2001 From: Sasha Sherstnev Date: Mon, 14 Oct 2013 15:20:41 +0100 Subject: [PATCH] Re-design jpred.pl input options: no -seq for a file with 1 FASTA record only. --- binaries/src/jpred/jpred.pl | 82 +++++++++++++++++---------- conf/settings/JpredParameters.xml | 31 +--------- conf/settings/JpredPresets.xml | 35 +----------- runner/compbio/runner/predictors/Jpred.java | 1 + 4 files changed, 60 insertions(+), 89 deletions(-) diff --git a/binaries/src/jpred/jpred.pl b/binaries/src/jpred/jpred.pl index 99d7fdc..bd23814 100755 --- a/binaries/src/jpred/jpred.pl +++ b/binaries/src/jpred/jpred.pl @@ -6,14 +6,15 @@ jpred - Secondary structure prediction program =head1 SYNOPSIS -./jpred.pl -in [-outfile ] [-logfile ] [-output ] [-dbname ] [-dbpath ] [-ncpu NNN] [-psi ] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man] +./jpred.pl -in [-outfile ] [-logfile ] [-output ] [-dbname ] [-dbpath ] [-ncpu NNN] [-psi ] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man] =head1 DESCRIPTION -This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence -(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC. -For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the -secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet. +This is a program for predicting the secondary structure of a multiple sequence alignment or a protein sequence. +The input file can be stored in 3 formats: FASTA, MSF, or BLC. +For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and +predicting the secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, +created from the alignment, is used in Jnet. =head1 OPTIONS @@ -23,10 +24,6 @@ secondary structure with Jnet. For the multiple sequence alignment only the HMM The path to the sequence file (in FASTA, MSF, or BLC format) -=item -seq - -The input file is a FASTA file with one sequence only. - =item -output A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in. @@ -211,7 +208,6 @@ GetOptions( "ncpu=s" => \$ncpu, "pred-nohits" => \$predNoHits, "no-final" => \$nofinal, - "seq" => \$seqgoal, "jabaws" => \$jabaws, "help" => \$help, @@ -222,8 +218,6 @@ GetOptions( pod2usage(1) if $help; pod2usage( verbose => 2 ) if $man; -$goal = "seq" if ( defined $seqgoal ); - ##################################################################################################### # Key to database information and information for accessing them my $database = { @@ -327,25 +321,23 @@ print $LOG "JPRED: checking platiform... $platform\n" if $LOG; ##################################################################################################### # check input file format -if ( 'seq' eq $goal ) { - $format = "seq"; - if ( 1 != check_FASTA_format($infile) ) { - die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n"; - } -} else { - my $nseq = check_FASTA_format($infile); - if ( 0 < $nseq ) { - $format = "fasta"; - if ( 1 == $nseq ) { - die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n"; - } - } elsif ( 0 < check_MSF_format($infile) ) { - $format = "msf"; - } elsif ( 0 < check_BLC_format($infile) ) { - $format = "blc"; +my $nseq = check_FASTA_format($infile); +if ( 0 < $nseq ) { + $format = "fasta"; + if ( 1 == $nseq ) { + # one FASTA record + $goal = 'seq'; } else { - die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n"; + unless ( 0 < check_FASTA_alignment($infile)) { + die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n"; + } } +} elsif ( 0 < check_MSF_format($infile) ) { + $format = "msf"; +} elsif ( 0 < check_BLC_format($infile) ) { + $format = "blc"; +} else { + die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n"; } $infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format ); @@ -645,6 +637,38 @@ sub check_FASTA_format { return $nseq; } ##################################################################################################### +sub check_FASTA_alignment { + my $infile = shift; + + open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n"; + my $check_first_line = 1; + my $nseq = 0; + my $seqlen = -1; + local $/ = "\n>"; + while (<$IN>) { + if ($check_first_line) { + return 0 unless (/^>/); + $check_first_line = 0; + } + s/^>//g; + s/>$//g; + + my ( $id, @seqs ) = split /\n/, $_; + return 0 unless ( defined $id or @seqs ); + my $seq = join( "", @seqs ); + return 0 unless ( $seq =~ /[a-zA-Z\.-]/ ); + if (-1 == $seqlen) { + $seqlen = length ($seq); + } else { + return 0 if ($seqlen != length ($seq) ); + } + ++$nseq; + } + close($IN); + + return $nseq; +} +##################################################################################################### sub check_MSF_format { my $infile = shift; $? = 0; diff --git a/conf/settings/JpredParameters.xml b/conf/settings/JpredParameters.xml index 3697e70..365ffcd 100644 --- a/conf/settings/JpredParameters.xml +++ b/conf/settings/JpredParameters.xml @@ -15,21 +15,12 @@ JABAWS configuration - Configure Jpred to worik within JABAWS + Configure Jpred to work within JABAWS -jabaws prog_docs/jpred.txt - - Single sequence prediction - - Configure Jpred to worik within JABAWS - - -seq - prog_docs/jpred.txt - - @@ -39,7 +30,7 @@ -dbpath prog_docs/jpred.txt - . + /data/UNIREFdb @@ -53,19 +44,12 @@ uniref90 ported_db cluster - Number of CPUs - Number of CPU used by jpred.pl. Maximum value is 8 + Number of CPU used by Jpred. Maximum value is 8 -ncpu prog_docs/jpred.txt @@ -77,13 +61,4 @@ experimental development databases: - - PSI-BLAST output file - - Path to a PSI-BLAST output file - - -psi - prog_docs/jpred.txt - - diff --git a/conf/settings/JpredPresets.xml b/conf/settings/JpredPresets.xml index 5407708..c70e709 100644 --- a/conf/settings/JpredPresets.xml +++ b/conf/settings/JpredPresets.xml @@ -3,26 +3,12 @@ compbio.runner.predictors.Jpred - cluser-single + cluser-configuration Dundee cluser configuration for single sequence prediction - - - - - - - - cluster-alignment - - Dundee cluser configuration for multiple sequence alignment prediction - - - - @@ -30,26 +16,11 @@ single - 4-core laptop configuration for single sequence prediction - - - - - - - - - - - - alignemnt - - 4-core laptop configuration for multiple sequence alignment prediction + 4-core computer configuration for single sequence prediction - - + diff --git a/runner/compbio/runner/predictors/Jpred.java b/runner/compbio/runner/predictors/Jpred.java index 0e08e0a..4d94620 100644 --- a/runner/compbio/runner/predictors/Jpred.java +++ b/runner/compbio/runner/predictors/Jpred.java @@ -65,6 +65,7 @@ public class Jpred extends SkeletalExecutable { addParameters(Arrays.asList("-logfile " + STAT_FILE)); addParameters(Arrays.asList("-dbpath " + dbpath)); addParameters(Arrays.asList("-dbname " + dbname)); + addParameters(Arrays.asList("-jabaws")); } // HashMap -- 1.7.10.2