=head1 SYNOPSIS
-./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-seq] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
+./jpred.pl -in <FILE1> [-outfile <FILE2>] [-logfile <FILE3>] [-output <FILEPREFIX>] [-dbname <DBNAME>] [-dbpath <PATH>] [-ncpu NNN] [-psi <psiblast output>] [-pred-nohits] [-no-final] [-jabaws] [-verbose] [-debug] [-help] [-man]
=head1 DESCRIPTION
-This is a program for predicting the secondary structure of a multiple sequence alignment (by default) or a protein sequence
-(with the -seq option). The input file can be stored in 3 formats: FASTA, MSF, or BLC.
-For the single sequence the program does all the PSI-BLAST searching, preparing PSSM and HMM profiles and predicting the
-secondary structure with Jnet. For the multiple sequence alignment only the HMM profile, created from the alignment, is used in Jnet.
+This program predicts the secondary structure of either a multiple sequence alignment or a single protein sequence.
+The input file can be in one of three formats: FASTA, MSF, or BLC.
+A FASTA file that contains exactly one record is treated as a single sequence.
+For a single sequence the program runs the PSI-BLAST search, prepares the PSSM and HMM profiles, and
+predicts the secondary structure with Jnet. For a multiple sequence alignment, only the HMM profile
+created from the alignment is used by Jnet.
=head1 OPTIONS
The path to the sequence file (in FASTA, MSF, or BLC format)
-=item -seq
-
-The input file is a FASTA file with one sequence only.
-
=item -output <FILEPREFIX>
A prefix to the filenames created by Jpred, defaults to the value set by -sequence/-in.
"ncpu=s" => \$ncpu,
"pred-nohits" => \$predNoHits,
"no-final" => \$nofinal,
- "seq" => \$seqgoal,
"jabaws" => \$jabaws,
"help" => \$help,
pod2usage(1) if $help;
pod2usage( verbose => 2 ) if $man;
-$goal = "seq" if ( defined $seqgoal );
-
#####################################################################################################
# Key to database information and information for accessing them
my $database = {
#####################################################################################################
# check input file format
-if ( 'seq' eq $goal ) {
- $format = "seq";
- if ( 1 != check_FASTA_format($infile) ) {
- die "\nERROR! jpred requires 1 sequence in the FASTA file if the option -seq used. exit\n";
- }
-} else {
- my $nseq = check_FASTA_format($infile);
- if ( 0 < $nseq ) {
- $format = "fasta";
- if ( 1 == $nseq ) {
- die "\nERROR! jpred requires alignment with more than 1 sequence\n if you provide only one sequence use the -seq option.\n";
- }
- } elsif ( 0 < check_MSF_format($infile) ) {
- $format = "msf";
- } elsif ( 0 < check_BLC_format($infile) ) {
- $format = "blc";
+# Work out the input format (and, for FASTA, the prediction goal) from the file itself.
+# The checks are tried in a fixed order: FASTA first, then MSF, then BLC.
+# $nseq is presumably the number of FASTA records found - TODO confirm against check_FASTA_format.
+my $nseq = check_FASTA_format($infile);
+if ( 0 < $nseq ) {
+ $format = "fasta";
+ if ( 1 == $nseq ) {
+ # one FASTA record
+ # NOTE(review): the removed -seq branch set $format to "seq"; here it stays "fasta".
+ # Confirm that later code selects the single-sequence path from $goal and not from $format.
+ $goal = 'seq';
} else {
- die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
+ # Any other positive count: the file is accepted only when check_FASTA_alignment
+ # returns a positive value; otherwise the run is aborted.
+ unless ( 0 < check_FASTA_alignment($infile)) {
+ die "\nERROR! jpred requires either FASTA alignment or 1 sequence in the FASTA, MSF, or BLC formats\n";
+ }
}
+} elsif ( 0 < check_MSF_format($infile) ) {
+ $format = "msf";
+} elsif ( 0 < check_BLC_format($infile) ) {
+ $format = "blc";
+} else {
+ # None of the three format checks returned a positive value.
+ die "ERROR! unknown input file format for multiple sequence alignment (can be FASTA, MSF, or BLC). exit...\n";
}
+# MSF and BLC inputs get a derived path, "<input file>.fasta", stored in $infastafile.
$infastafile = $infile . ".fasta" if ( 'msf' eq $format or 'blc' eq $format );
return $nseq;
}
#####################################################################################################
+# Check that a FASTA file holds an alignment, i.e. that every record carries a
+# sequence of the same length.
+#
+# Argument: path of the file to inspect.
+# Returns:  the number of records when the file starts with '>' and all
+#           sequences have the same length (whitespace is ignored);
+#           0 as soon as any record fails a check, or when the file is empty.
+# Dies:     when the file cannot be opened.
+sub check_FASTA_alignment {
+    my $infile = shift;
+
+    open( my $IN, "<", $infile ) or die "ERROR! unable to open '$infile': ${!}\n";
+
+    # Read one FASTA record per iteration: records are separated by a newline
+    # followed by the '>' that opens the next header.
+    local $/ = "\n>";
+
+    my $nseq = 0;
+    my $seqlen;
+
+    # Early returns below rely on the lexical handle being closed when it
+    # goes out of scope.
+    while ( my $record = <$IN> ) {
+
+        # Only the first record still carries its leading '>'; a file that
+        # does not start with one is not FASTA.
+        return 0 if ( 0 == $nseq and $record !~ /^>/ );
+
+        # Drop the leading '>' and the '>' left over from the record separator.
+        $record =~ s/^>//;
+        $record =~ s/>$//;
+
+        # First line is the header, everything after it is sequence.
+        my ( $id, @seqs ) = split /\n/, $record;
+        return 0 unless ( defined $id );
+
+        my $seq = join( "", @seqs );
+
+        # Carriage returns (CRLF files) and stray blanks are not residues:
+        # remove them so they cannot produce a spurious length mismatch
+        # between sequences that are merely wrapped differently.
+        $seq =~ s/\s+//g;
+
+        # The record must contain at least one residue or gap character.
+        return 0 unless ( $seq =~ /[a-zA-Z\.-]/ );
+
+        # The first sequence fixes the alignment length; all others must match it.
+        $seqlen = length($seq) unless ( defined $seqlen );
+        return 0 if ( $seqlen != length($seq) );
+
+        ++$nseq;
+    }
+    close($IN);
+
+    return $nseq;
+}
+#####################################################################################################
sub check_MSF_format {
my $infile = shift;
$? = 0;