JPRED-2 Move Jpred 3.0.1 to public Git

author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>

Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)

committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>

Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)
author Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)
committer Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)
diff --git a/jpred/bin/blastpgp b/jpred/bin/blastpgp

new file mode 100755 (executable)

index 0000000..3e6f188

Binary files /dev/null and b/jpred/bin/blastpgp differ
diff --git a/jpred/bin/fastacmd b/jpred/bin/fastacmd

new file mode 100755 (executable)

index 0000000..0311d7f

Binary files /dev/null and b/jpred/bin/fastacmd differ
diff --git a/jpred/bin/hmmbuild b/jpred/bin/hmmbuild

new file mode 100755 (executable)

index 0000000..7badafe

Binary files /dev/null and b/jpred/bin/hmmbuild differ
diff --git a/jpred/bin/hmmconvert b/jpred/bin/hmmconvert

new file mode 100755 (executable)

index 0000000..65a3a14

Binary files /dev/null and b/jpred/bin/hmmconvert differ
diff --git a/jpred/bin/jnet b/jpred/bin/jnet

new file mode 100755 (executable)

index 0000000..de1c8b0

Binary files /dev/null and b/jpred/bin/jnet differ
diff --git a/jpred/bin/oc b/jpred/bin/oc

new file mode 100755 (executable)

index 0000000..91d1e9c

Binary files /dev/null and b/jpred/bin/oc differ
diff --git a/jpred/bin/pairwise b/jpred/bin/pairwise

new file mode 100755 (executable)

index 0000000..e42d939

Binary files /dev/null and b/jpred/bin/pairwise differ
diff --git a/jpred/jpred b/jpred/jpred

new file mode 100755 (executable)

index 0000000..6ed4104
--- /dev/null
+++ b/jpred/jpred
@@ -0,0 +1,931 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+jpred - Secondary structure prediction program
+
+=head1 SYNOPSIS
+
+  jpred --sequence <fasta file> [--output <output prefix>] [--db <database>] [--psi <psiblast output>] [--pred-nohits] [--verbose] [--debug] [--help] [--man]
+
+=head1 DESCRIPTION
+
+This is a program which predicts the secondary structure of a sequence given a path to a FASTA sequence. It does all the PSI-BLAST searching and alignment 'meddling' as required by Jnet.
+
+The program is primarily design for use by the Jpred server, but it can be used directly by any user. Some of the output may not be meaningful if used directly - it can be safely ignored.
+
+=head1 OPTIONS
+
+=over 5
+
+=item --sequence path
+
+The path to the sequence (in FASTA format) you want to predict.
+
+=item --output file
+
+A prefix to the filenames created by Jpred, defaults to the value set by --sequence.
+
+=item --db database
+
+Database to use for PSI-BLAST querying. This only accepts databases from a list which is pre-defined in the code. Crap I know, but that's the way it is at the moment. It's probably best to stick the default for the time being.
+
+Default: uniref90
+
+=item --psi path
+
+Path to a PSIBLAST output file.
+
+=item --pred-nohits
+
+Toggle allowing Jpred to make predictions even when there are no PSI-BLAST hits.
+
+=item --verbose
+
+Verbose mode. Print more information while running jpred.
+
+=item --debug
+
+Debug mode. Print debugging information while running jpred. 
+
+=item --help
+
+Gives help on the programs usage.
+
+=item --man
+
+Displays this man page.
+
+=back
+
+=head1 BUGS
+
+Can't cope with gaps in the query sequence.
+
+Doesn't accept alignments.
+
+If you find any others please contact me.
+
+=head1 AUTHORS
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+Chris Cole <christian@cole.name> (current maintainer)
+
+=cut
+
+## TODO check for gaps in query sequence
+##      check for disallowed chars in sequence
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+use Carp;
+use UNIVERSAL qw(isa);
+use Data::Dumper;
+use File::Temp;
+
+use Jpred;  # needed to define BLAST environment variables
+
+# my modules
+use HMMER::Profile;
+use HMMER::Profile::Jnet;
+
+use PSIBLAST;
+use PSIBLAST::PSSM;
+use PSIBLAST::Run;
+
+use FASTA::File;
+use FASTA;
+
+use Index;
+use OC;
+use Utils qw(profile);
+use Run qw(check);
+use Paths qw($pairwise $oc $jnet $hmmbuild $hmmconvert $psiblastbin);
+
+my $VERSION = '3.0.1';
+my $MAX_ALIGN = 1000;  # maximum number of alignment sequences allowed
+my $NR_CUT = 75;       # cut-off seqeunce ID value for defining non-redundancy
+
+# Light object to hold sequence information, light to prevent memory overload
+# for big search results. Even so this is quite memory intensive, I can't see
+# how to reduce the size further.
+{
+   package PSISEQ;
+   #use Compress::LZF qw(compress decompress);
+
+   sub new {
+      my ($class, %args) = @_;
+      my $self = [];
+      bless $self, $class;
+      for (keys %args) { $self->$_($args{$_}) }
+      return $self;
+   }
+
+   sub id {
+      return defined $_[1] ?
+         $_[0]->[0] = $_[1] :
+         $_[0]->[0];
+   }
+
+   sub start {
+      return defined $_[1] ?
+         $_[0]->[1] = $_[1] :
+         $_[0]->[1];
+   }
+
+   sub align {
+      if (defined $_[1]) {
+         ref $_[1] eq 'ARRAY' or die "Not passed ARRAY ref";
+         #$_[0]->[2] = compress(join "", @{ $_[1] });
+         $_[0]->[2] = join("", @{ $_[1] });
+      }
+      else {
+         #return [ split //, decompress($_[0]->[2]) ];
+         return [ split(//, $_[0]->[2]) ];
+      }
+   }
+}
+
+# Key to database information and information for accessing them
+my $dbPATH = '/homes/chris/projects/Jnet/db';
+my $db_entry = "uniref90";
+my $database = {
+   
+   ## default database used by Jpred
+   uniref90 => {
+      database => 'uniref90.filt',
+      unfiltered => 'uniref90',
+   },
+   
+   ## database used during Jnet training
+   training => {
+      database => 'training/uniref90.filt',
+      unfiltered => 'training/uniref90',
+   },
+
+   ## cluster-specific path for Jpred
+   cluster => {
+      database => '/local/gjb_lab/www-jpred/uniref90.filt',
+      unfiltered => '/local/gjb_lab/www-jpred/uniref90',
+   },
+
+   ## these other DBs are experimental ones used during development. 
+   ## check they exist before using them.
+   swall => {
+      # generic entry for use with validate_jnet.pl
+      # real db location defined by validate_jnet.pl
+      database => "$dbPATH/swall/swall.filt",  # CC new
+      unfiltered => "$dbPATH/swall/swall",
+   },
+   uniprot => {
+      # Path to PSIBLAST db
+      database => "$dbPATH/3/swall.filt",  # CC new
+      unfiltered => "$dbPATH/3/swall",
+   },
+   uniref50 => {
+      database => "$dbPATH/6/swall.filt",
+      unfiltered => "$dbPATH/6/swall",
+   },
+};
+
+# Gap characters in sequences
+our $GAP = '-';
+
+my ($help, $man, $path, $output, $psiblast, $DEBUG, $VERBOSE);
+my $predNoHits = 0;  # define whether to make predictions when no PSI-BLAST hits are found
+GetOptions(
+   "help" => \$help,
+   "man" => \$man,
+   "verbose" => \$VERBOSE, 
+   "sequence=s" => \$path,
+   "output=s" => \$output,
+   "psi=s" => \$psiblast,
+   "db=s" => \$db_entry,
+   "pred-nohits" => \$predNoHits,
+   "debug" => \$DEBUG
+) or pod2usage(2);
+pod2usage(1) if $help;
+pod2usage(verbose => 2) if $man;
+
+pod2usage(' --sequence argument not provided') unless $path;
+die "--db $db_entry not recognised" unless exists $database->{$db_entry};
+$output = $path unless $output;
+
+for (
+   [ "path", $path ],
+   [ "output", $output ],
+   [ "psi", $psiblast ],
+   [ "db", $db_entry]
+   ) {
+   my ($key, $value) = @{$_};
+   defined $value or next;
+   errlog(join(": ", $key, $value), "\n");
+}
+
+my $query = FASTA::File->new(read_file => $path);
+
+my @seqs;
+if (1) {
+   my $psi = PSIBLAST->new;        # CC PSIBLAST.pm doesn't have a new() class??
+   unless (defined $psiblast && -e $psiblast) {
+      my $psi_run = PSIBLAST::Run->new(
+         debug => $DEBUG,    
+#         path => "/software/jpred_bin/blastpgp",
+         path => $psiblastbin,  # CC new path
+         input => $path,
+         output => "$output.blast",
+         matrix => "$output.profile",
+         database => $database->{$db_entry}{database},
+         ## potentially a better set of parameters (esp. -b & -v)
+         ## which avoid PSI-BLAST dying with large number of hits
+         # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3 -F "m S"'  # 'soft' filtering of the query sequence
+         # args => '-e0.001 -h0.0001 -m6 -b10000 -v10000 -j3' # stricter searching criteria
+         # args => '-e0.05 -h0.01 -m6 -b10000 -v10000 -j3'
+      );
+
+      # For reduced databases, get the size of the orginal and use this as
+      # the equivilent DB size
+      #print $db_entry, "\n";
+      if ($db_entry =~ /^sp_red_/) {
+         (my $entry = $db_entry) =~ s/^sp_red_/sp_/;
+         $psi_run->args( $psi_run->args . " -z " . 
+            fasta_seq_length($database->{$entry}{database})   # CC sub is below
+         );
+      }
+      print "BLAST matrix: $ENV{BLASTMAT}\n" if $DEBUG;
+      print "BLASTDB path: $ENV{BLASTDB}\n" if $DEBUG;
+      
+      print "Running PSI-BLAST on query against \'$database->{$db_entry}{database}\'...\n" if $VERBOSE;
+      $psi_run->run or die;    # CC sub is from PSIBLAST::Run
+
+      $psi->read_file("$output.blast");    # CC PSIBLAST.pm doesn't have a read_file(), but Read.pm does?
+      system("gzip -9f $output.blast");
+   }
+   else {
+      if ($psiblast =~ /.gz$/) { $psi->read_gzip_file($psiblast) }    # CC PSIBALST.pm doesn't have a read_gzip_file() object, but Read.pm does?
+      else { $psi->read_file($psiblast) }  # CC ditto above
+   }
+
+   # Convert the last itteration into a collection of PSISEQ objects
+   for ($psi->get_all) {     # CC sub is from PSIBLAST.pm
+      my ($id, $start, $align) = @{$_};
+      push @seqs, PSISEQ->new(
+         id => $id,
+         start => $start,
+         align => [ split(//, $align) ]
+      );
+   }
+}
+
+## When there are no PSI-BLAST hits generate an HMM from the query 
+## and run Jnet against that only.
+## Accuracy won't be as good, but at least it's a prediction
+if (@seqs == 0) {
+   if ($predNoHits == 0) {
+      warn "\nJPRED: Warning - no PSI-BLAST hits found and '--pred-nohits' option not set. Exiting...\n";
+      print ">>100% complete\n";
+      exit;
+   }
+   print ">>50% complete\n";
+   warn "\nJPRED: no PSI-BLAST hits found. Continuing with single query sequence only.\n\n";
+   print "Running HMMer on query...\n" if $VERBOSE;
+   
+   # copy input query to alignment 
+   system("cp $path $output.align") == 0 or croak "Error: unable to copy '$path'\n";
+
+   # Temp files required for HMMer
+   my ($hmmbuild_out, $hmmconvert_out) = map {
+      File::Temp->new->filename
+   } 1..2;
+
+   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $path");
+   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+   # Read in the HMMER file
+   my $hmmer = HMMER::Profile->new(
+      read_file => $hmmconvert_out
+   );
+   
+   # Convert from HMMER::Profile to HMMER::Profile::Jnet
+   my $hmmer_jnet = HMMER::Profile::Jnet->new;
+   $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
+   $hmmer_jnet->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
+   print ">>70% complete\n";
+   
+   # Run Jnet for the prediction
+   print "Running JNet using the generated inputs from HMM only...\n" if $VERBOSE;
+   jnet(map { "$output.$_" } qw(hmm jnet));   # CC sub is below
+
+   print "Jpred Finished\n";
+   exit;
+}
+
+psiseq2fasta("0.fasta.gz", @seqs) if $DEBUG;     # CC sub is below
+print ">>40% complete\n";
+
+# Make PSIBLAST truncated alignments the right length
+print "Untruncating the PSIBLAST alignments...\n" if $VERBOSE;
+@seqs = extend($query, @seqs);         # CC sub si below
+psiseq2fasta("1.fasta.gz", @seqs) if $DEBUG;
+
+# Remove masking from PSIBLAST alignment
+print "Unmasking the alignments...\n" if $VERBOSE;
+dex($_) for @seqs[ 1..$#seqs ];        ## <-- CC dies here in dex() - found below
+psiseq2fasta("2.fasta.gz", @seqs) if $DEBUG;
+
+# Convert the sequences to upper case
+print "Converting sequences to the same case...\n" if $VERBOSE;
+toupper($_) for @seqs;                 # CC sub is below
+psiseq2fasta("${output}_backup.fasta.gz", @seqs);
+
+# Remove excessive sequences
+print "Remove excessive sequences...\n" if $VERBOSE;
+@seqs = reduce($MAX_ALIGN, @seqs);           # CC sub is below
+psiseq2fasta("3.fasta.gz", @seqs) if $DEBUG;
+
+# Remove sequences that are too long or too short
+print "Remove sequences which too long or short...\n" if $VERBOSE;
+@seqs = del_long_seqs(50, @seqs);      # CC sub is below
+psiseq2fasta("4.fasta.gz", @seqs) if $DEBUG;
+
+# Remove redundant sequences based upon pairwise identity and OC clustering
+print "Remove redundant sequences...\n" if $VERBOSE;
+@seqs = nonred($NR_CUT, @seqs);             # CC sub is below
+psiseq2fasta("5.fasta.gz", @seqs) if $DEBUG;
+
+# Check that we haven't got rid of all of the sequences
+if (@seqs < 2) {
+   warn "JPRED: All the sequences found by PSIBLAST were removed during filtering, reverting to prefiltered sequences\n";
+   @seqs = fasta2psiseq("${output}_backup.fasta.gz");
+}
+unlink("${output}_backup.fasta.gz") unless $DEBUG;
+
+# Remove gaps in the query sequence and same positions in the alignment
+print "Removing gaps in the query sequence...\n" if $VERBOSE;
+degap(@seqs);                          # CC sub is below
+psiseq2fasta("6.fasta.gz", @seqs) if $DEBUG;
+
+# Output the alignment for the prediction
+print "Outputting cleaned-up PSI_BLAST alignment...\n" if $VERBOSE;
+psiseq2fasta("$output.align", @seqs);
+
+{
+   # Equivilent to getpssm script
+   print "Output the PSSM matrix from the PSI-BLAST profile...\n";
+   my $pssm = PSIBLAST::PSSM->new(read_file => "$output.profile");
+   $pssm->write_file("$output.pssm");    # CC write_file() sub is in Write.pm loaded via PSIBLAST::PSSM
+}
+print ">>50% complete\n";
+
+# Run HMMER on the sequences
+{
+   print "Running HMMer on sequences found by PSI-BLAST...\n" if $VERBOSE;
+   my $hmmer = hmmer(@seqs);          # CC sub is below
+   $hmmer->write_file("$output.hmm");   # write_file is in the Read.pm called from the HMMER::Profile module
+}
+print ">>70% complete\n";
+
+# Run Jnet for the prediction
+print "Running JNet using the generated inputs from HMM and PSI-BLAST...\n" if $VERBOSE;
+jnet(map { "$output.$_" } qw(hmm jnet pssm));   # CC sub is below
+
+print "Jpred Finished\n";
+
+# Then do an accuracy prediction
+
+############################################################
+# Functions
+############################################################
+
+=begin :private
+
+=head2 $PSISEQ = dex($PSISEQ)
+
+Returns a PSISEQ object which has any masked residues replaced by their entry from the unfiltered database. Does it in place.
+
+=cut
+
+
+BEGIN {
+   my $idx = Index->new(type => "Index::FastaCMD") or die "Index type cannot be found.";
+
+   sub dex {
+      my ($seq) = @_;
+
+      croak "JPRED: dex: can only take a PSISEQ object as an argument" unless isa $seq, 'PSISEQ';
+
+      # Only bother if we have a sequence which has been masked
+      return unless grep { uc $_ eq 'X' } @{ $seq->align };
+
+      my ($id, $start, @align) = ($seq->id, $seq->start, @{$seq->align});
+
+      #$id = "$id"; # Add the EMBOSS USA
+      #$id = "sprot_37:$id"; # Add the EMBOSS USA
+      #$id = "sprot_43:$id"; # Add the EMBOSS USA
+#      $id = $database->{$db_entry}{emboss} . ":$id";  # commented out to try FastaCMD module
+      my $searchdb = $database->{$db_entry}{unfiltered};
+      $start--; # We need this to be zero based
+
+      my $f = $idx->get_sequence($id, $searchdb) or croak "JPRED: No such entry for $id in $searchdb\n" and return;   # CC sub is from Index::EMBOSS
+      my @seqs = $f->get_entries;       ## Sub is from Sequence::File
+      my $db = shift @seqs;
+
+      unless ($db) {
+         confess "JPRED: dex: Accession number $id returned no sequences";
+      }
+      elsif (@seqs) {
+         confess "JPRED: dex: Accession number $id returned more than one sequence";
+      }
+
+      my @db_seq = @{ $db->seq };
+
+      # $i is a pointer into the original, ungapped, unmasked sequence. $_ is
+      # for the aligned sequence
+      my $i = 0;
+      for (0..$#align) {
+         next if $align[$_] =~ /$GAP/;
+
+         if ($align[$_] eq 'X') {
+            unless (defined $db_seq[$i + $start]) {
+               croak "JPRED: dex: the database sequence is not as long as the query sequence!"
+            }
+            $align[$_] = $db_seq[$i + $start];
+         }
+         $i++;
+      }
+
+      $seq->align(\@align);
+   }
+}
+
+=head2 @PSISEQS_OUT = reduce ($max, @PSISEQS_IN);
+
+Reduces @PSISEQS_IN to at most $max sequences by taking every n sequences in @PSISEQ_IN.
+
+Equivilent to reduce script.
+
+=cut
+
+sub reduce {
+   my ($max, @seqs) = @_;
+
+   my $num = int(@seqs / $max);
+
+   my @res = shift @seqs;
+
+   # Don't bother if we have less than the required sequences
+   if (@seqs < $max) { return @res, @seqs }
+   if ($num == 0) { return @res, @seqs }
+
+   for (0..$#seqs) { push @res, $seqs[$_] if $_ % $num == 0 }
+
+   return @res;
+}
+
+=head2 nonred($cutoff, @PSISEQS);
+
+Removes redundant sequences at $cutoff level of sequence identity.
+
+Equivilent to nonred script.
+
+=cut
+
+sub nonred {
+   my ($cutoff, @seqs) = @_;
+   # Run pairwise and then OC
+   # Remove similar sequences
+
+   unless (defined $cutoff) { croak "JPRED: nonred() not passed a cutoff" }
+   unless (@seqs) { croak "JPRED: No sequences passed to nonred()" }
+   if (@seqs == 1) {
+      warn "JPRED: Only one sequence passed to nonred(). Not reducing sequence redundancy\n";
+      return @seqs;
+   }
+
+   my $fasta = FASTA::File->new;
+
+   # Convert the sequences into a FASTA object, which is what
+   # pairwise expects.
+   $fasta->add_entries(
+      map {
+         my $f = FASTA->new(id => $_->id);
+         $f->seq(@{$_->align});
+         $f;
+      } @seqs
+   );
+
+   # Run pairwise via wrappers
+   use Pairwise;
+   my $pair = Pairwise->new(
+      path => $pairwise
+   );
+   my $foo = join "\n", $pair->run($fasta) or die $!;
+
+   # Run OC
+   my $ocres = OC->new(
+      path => "$oc sim complete cut $cutoff"
+   );
+   $ocres->read_string(join "", $ocres->run($foo) );
+
+   # Now found those entries that aren't related
+   my @keep;
+   {
+      for ($ocres->get_groups) {
+         my ($group, $score, $size, @labels) = @{ $_ };
+
+         # Keep the QUERY from the cluster, otherwise just get one
+         if (grep { /QUERY/ } @labels) {
+            # Make sure the query stays at the start
+            unshift @keep, "QUERY";
+            next;
+         }
+         else { push @keep, shift @labels }
+      }
+
+      push @keep, $ocres->get_unclustered;
+   }
+
+   # Return the matching entries.
+   my @return;
+
+   # We use the temporay to mimic the selection of sequences in the nonred
+   # script, which took the sequences in the order they occured in the file.
+   for (@seqs) {
+      my $label = $_->id;
+      if (grep { $_ =~ /^$label$/ } @keep) {
+         push @return, $_;
+      }
+   }
+
+   return @return;
+}
+
+=head2 @seqs = del_long_seqs($percentage, @PSISEQS)
+
+Removes those sequences that are more than $percentage longer or shorter than the first sequence in @seqs. @seqs is an array of PSISEQ objects.
+
+Equivilent to trim_seqs script.
+
+=cut
+
+sub del_long_seqs {
+   my ($factor, @seqs) = @_;
+
+   # Keep the query sequence
+   my $query = shift @seqs;
+   my @res = $query;
+
+   # Find the length of the query sequence without any gaps
+   my $length = (() = (join '', @{$query->align}) =~ /[^$GAP]/g);
+
+   # Calculate the upper and lower allowed bounds
+   my ($upper, $lower);
+   {
+      my $bounds = ($length / 100) * $factor;
+      ($upper, $lower) = ($length + $bounds, $length - $bounds);
+   }
+
+   # Now try each sequence and see how long they are
+   for (@seqs) {
+      my $l = (() = (join '', @{$_->align}) =~ /[^$GAP]/g);
+      if ($l >= $lower && $l <= $upper) { push @res, $_ }
+   }
+
+   return @res;
+}
+
+=head2 toupper ($PSISEQ)
+
+Converts sequence held in PSISEQ object to uppercase.
+
+=cut
+
+sub toupper {
+   croak "JPRED: toupper: argument not a PSISEQ object" unless isa $_[0], 'PSISEQ';
+   $_[0]->align(
+      [ split //, uc join '', @{ $_[0]->align } ]
+   );
+}
+
+=head2 degap($PSISEQ_QUERY, @PSISEQS)
+
+Removes gaps in the query sequence ($PSISEQ_QUERY) and corresponding positions in @PSISEQS. Change made in place.
+
+Equivilent to fasta2jnet script.
+
+=cut
+
+sub degap {
+   my (@seqs) = @_;
+
+   # Find where the gaps in the query sequence are
+   my @gaps;
+   {
+      my $i = 0;
+      for (@{$seqs[0]->align}) {
+         push @gaps, $i if $_ !~ /$GAP/;
+         $i++;
+      }
+   }
+
+   return unless @gaps;
+
+   # Remove the gaps in all the sequences.
+   # Derefences the array reference and takes a slice inside the method call
+   # arguments, then coerce back to an array ref.
+   for (@seqs) {
+      $_->align([ @{$_->align}[@gaps] ]);
+   }
+}
+
+=head2 getfreq($filename, @PSISEQS)
+
+Creates a PSIBLAST like profile and writes it to the $filename.
+
+Equivilant to profile or getfreq (older) script. This doesn't create the same answer as the old jpred, as it got the numbers from PSIBLAST (v2.0.3), whereas this a *similar* output, but it's *not* the same.
+
+=cut
+
+sub getfreq {
+   my ($fn, @seqs) = @_;
+
+   croak "JPRED: Passed non PSISEQ object" if grep { ! isa($_, 'PSISEQ') } @seqs;
+
+   # Create a PSIBLAST like profile from the alignment
+   # This doesn't greate the same result as old Jpred, I think this is because
+   # differences in the input between old and new
+
+   # For testing
+#   my $f = FASTA::File->new(read_file => "/homes/jon/jnet/test/1bk4/1bk4.align.fa");
+#   my @profile = profile(
+#      map { join "", @{ $_->seq } } $f->get_entries
+#   );
+
+   my @profile = profile(
+      map { join "", @{ $_->align } } @seqs
+   );
+
+   open my $fh, ">$fn" or die "JPRED: $fn: $!";
+   print $fh join(" ", @{$_}), "\n" for @profile;
+}
+
+=head2 HMMER::Profile::Jnet = hmmer(@PSISEQS)
+
+Creates a HMMER profile from the aligned sequences in @PSISEQS. Returns a HMMER::Profile::Jnet object.
+
+Equivilent to gethmm script.
+
+=cut
+
+sub hmmer {
+   my (@seqs) = @_;
+
+   # Temp files required
+   my ($hmmbuild_out, $hmmconvert_out, $tmp_fasta) = map {
+      File::Temp->new->filename
+   } 1..3;
+
+   # Create the FASTA file
+   psiseq2fasta($tmp_fasta, @seqs);
+
+   system("$hmmbuild -F --fast --amino --gapmax 1 --wblosum $hmmbuild_out $tmp_fasta");
+   system("$hmmconvert -F -p $hmmbuild_out $hmmconvert_out");
+
+   # Read in the HMMER file
+   my $hmmer = HMMER::Profile->new(
+      read_file => $hmmconvert_out
+   );
+
+   # Convert from HMMER::Profile to HMMER::Profile::Jnet
+   my $hmmer_jnet = HMMER::Profile::Jnet->new;
+   $hmmer_jnet->add_line(@{$_}) for $hmmer->get_line;
+
+   return $hmmer_jnet;
+}
+
+=head2 jnet($hmmer, $out, $pssm )
+
+Runs Jnet. Pass the paths of the filenames to do the prediction on
+
+=cut
+
+sub jnet {
+   my ($hmmer, $outfile, $pssm) = @_;
+   if ($pssm) {
+      # run Jnet prediction with all the profile data
+      system("$jnet --concise --hmmer $hmmer --pssm $pssm > $outfile");
+   } else {
+      # run Jnet prediction with only HMMer and alignment data
+      system("$jnet --concise --hmmer $hmmer > $outfile");
+   }
+   check("jnet", $?) or exit 1;
+}
+
+=head1 psiseq2fasta($path, @PSISEQ)
+
+Writes a FASTA file given an array of PSISEQ objects.
+
+=cut
+
+sub psiseq2fasta {
+   my ($fn, @seqs) = @_;
+
+   confess "JPRED: Not passed filename" if isa $fn, 'PSISEQ';
+   @seqs or confess "JPRED: not passed PSISEQs";
+
+   confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
+
+   # Changed from using FASTA::File object due to seg faulting on some large,
+   # long alignments, presumably due to running out of memory.
+#   my $fasta = FASTA::File->new;
+#   $fasta->add_entries(
+#      map {
+#         my $f = FASTA->new(id => $_->id);
+#         $f->seq(@{$_->align});
+#         $f;
+#      } @seqs
+#   );
+#   $fasta->write_file($fn);
+
+   my $fh;
+   $fn =~ /\.gz$/ ?
+      ( open $fh, "| gzip -9 > $fn" or die "JPRED: $!" ):
+      ( open $fh, ">$fn" or die "JPRED: $!" );
+
+   for (@seqs) {
+      print $fh ">", $_->id, "\n";
+      my $seq = join("", @{ $_->align });
+      $seq =~ s/(.{72})/$1\n/g;
+      chomp $seq;
+      print $fh $seq."\n";
+   }
+   close $fh;
+}
+
+=head1 @PSISEQ = fasta2psiseq($path)
+
+Reads in a FASTA file and returns an array of PSISEQ objects.
+
+=cut
+
+sub fasta2psiseq {
+   my ($fn) = @_;
+
+   my $fasta = $fn =~ /\.gz$/ ?
+      FASTA::File->new(read_gzip_file => $fn) :
+      FASTA::File->new(read_file => $fn);
+   $fasta or die "JPRED: $!";
+
+   my @seqs;
+   for ($fasta->get_entries) {
+      my $seq = PSISEQ->new;
+      $seq->id($_->id);
+      $seq->align( [ @{ $_->seq } ] );
+      push @seqs, $seq;
+   }
+
+   return @seqs;
+}
+
+sub fasta2psiseq_lowmem {
+   my ($fn) = @_;
+
+   local $/ = "\n";
+   my $fh;
+   $fn =~ /\.gz$/ ?
+      (open $fh, "gzip -dc $fn|" or die $!) :
+      (open $fh, $fn or die $!);
+
+   my @seqs;
+
+   my ($id, $seq);
+   while (<$fh>) {
+      chomp;
+      if (s/^>//) {
+         $seq =~ s/ //g if defined $seq;
+
+         if ($id and $seq) {
+            my $psi = PSISEQ->new(id => $id);
+            $psi->align([ split //, $seq ]);
+            push @seqs, $psi;
+            undef $id; undef $seq;
+         }
+         else { $id = $_ }
+      }
+      else {
+         $seq .= $_;
+      }
+   }
+   if ($id and $seq) {
+      my $psi = PSISEQ->new(id => $id);
+      $psi->seq($seq);
+      push @seqs, $psi;
+   }
+
+   return @seqs;
+}
+
+=head1 die_if_no_seqs($size, $message, @seqs)
+
+Exits with error message if @seqs is not at least $size.
+
+=cut
+
+sub die_if_no_seqs {
+   my ($size, $message, @seqs) = @_;
+
+   confess "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
+
+   chomp $message;
+   die "JPRED: $message\n" unless @seqs > $size;
+}
+
+=head1 extend($FASTA::File, @PSISEQ)
+
+Sometimes PSIBLAST truncates the outputed alignment where it can't matched residues against the query, this function extends the alignment so that all of the query is present in the alignment. The other sequences are extended with gap characters.
+
+=cut
+
+sub extend {
+   my ($query, @seqs) = @_;
+
+   croak "JPRED: Not a Sequence::File" if grep { not isa $_, 'Sequence::File' } $query;
+   croak "JPRED: Not PSISEQ" if grep { not isa $_, 'PSISEQ' } @seqs;
+
+   # Get the query sequence
+   my $q_query = $query->get_entry(0);
+
+   # Get the query sequence from the PSIBLAST alignment
+   my $a_query = shift @seqs;
+
+   # The gap size required to expand the alignment
+   my ($start_gap_length, $end_gap_length);
+   {
+      # Make handling the sequence easier
+      my $q_seq = join "", @{[ $q_query->seq ]};
+      my $a_seq = join "", @{ $a_query->align };
+
+      $q_seq =~ s/-//g;
+      $a_seq =~ s/-//g;
+
+      ($q_seq, $a_seq) = map { uc $_ } $q_seq, $a_seq;
+      
+      $start_gap_length = index($q_seq, $a_seq);
+      croak "JPRED: query sequence from alignment not found in original query sequence" if $start_gap_length == -1;
+
+      $end_gap_length = length($q_seq) - length($a_seq) - $start_gap_length;
+   }
+
+   # Add the gaps to the end of the alignments
+   for (@seqs) {
+      $_->align([
+         ($GAP) x $start_gap_length,
+         @{ $_->align },
+         ($GAP) x $end_gap_length
+      ]);
+   }
+
+   # Put the query sequence back
+   unshift @seqs, PSISEQ->new(
+      id => $a_query->id,
+      align => [
+         ($start_gap_length ? @{ $q_query->seq }[ 0..($start_gap_length-1) ] : ()),
+         @{ $a_query->align },
+         ($end_gap_length ? @{ $q_query->seq }[ -$end_gap_length .. -1 ] : ()),
+      ]
+   );
+
+   return @seqs;
+}
+
+=head1 $int = fasta_seq_length($path)
+
+Given the path to a FASTA database, find the number of residues/bases in it. Counts whitespace as a residue.
+
+=cut
+
+sub fasta_seq_length {
+   my ($fn) = @_;
+   open my $fh, $fn or croak "Can't open file $fn: $!\n";
+
+   my $length = 0;
+   local $_, $/ = "\n";
+   while (<$fh>) {
+      next if /^>/;
+      chomp;
+      $length += tr///c;
+   }
+
+   return $length;
+}
+
+=head1 log(@messages)
+
+Report messages to STDERR
+
+=cut
+
+sub errlog { print STDERR @_ }
diff --git a/jpred/lib/BLC.pm b/jpred/lib/BLC.pm

new file mode 100755 (executable)

index 0000000..618bb43
--- /dev/null
+++ b/jpred/lib/BLC.pm
@@ -0,0 +1,264 @@
+package BLC;
+
+use strict;
+use warnings;
+use Carp;
+
+use constant {
+       ID => 0,
+       TITLE => 1,
+       SEQ => 2
+};
+
+=head2 my $blc = new BLC;
+
+=cut
+
+sub new {
+       my $proto = shift;
+       my $class = ref($proto) || $proto;
+       my $self = { 
+               _itteration => 0,
+               _max_itteration => 0,
+       };
+       return bless $self, $class;
+}
+
+=head2 my $blc->read_file($file)
+
+Read in a BLC file.
+
+=cut
+
+sub read_file {
+       my @seqs;
+       my $self = shift;
+       my ($fn) = @_;
+
+       open BLC, $fn or croak "$fn: $!\n";
+
+       while (<BLC>) {
+               chomp;
+               if (s/^>//) {
+                       s/\s*$//g;
+                       my ($id, $title) = split /\s+/, $_, 2;
+                       $title = '' unless $title;
+                       push @seqs, [ $id, $title ];
+               }
+
+               # This regex copes with odd variations in the start of
+               # the iterations line that marks the start of sequences
+               if (/^\s*\*\s*[Ii]teration:?\s*(\d+)\s*$/) {
+                       # The start position of the sequences is the same
+                       # as the offset of the asterix from the start of
+                       # the line
+                       my $start = index $_, '*', 0;
+                       my $iter = $1;
+                       croak "Iteration not greater than 0 in BLC file $fn" unless $iter > 0;
+
+                       while (<BLC>) {
+                               chomp;
+                               last if /^\s{$start}\*\s*$/;
+                               my $line = substr $_, $start, @seqs;
+
+                               foreach (0..$#seqs) {
+                                       # Not just $iter, as we need to
+                                       # leave room for the title
+                                       # and header
+                                       $seqs[$_]->[SEQ + $iter - 1] .= substr($line, $_, 1);
+                               }
+                       }
+               }
+       }
+       close BLC;
+       croak "No sequences found in BLC file $fn" unless @seqs;
+
+       foreach (@seqs) {
+               my ($id, $title, @seqs) = @{$_};
+               $title = '' unless defined $title;
+               $id = '' unless defined $id;
+               $self->set_sequence($id, $title, @seqs);
+       }
+}
+
+=head2 $blc->set_sequence($id, $title, @seq)
+
+Add a BLC file to the object of $id, $title and @sequence, one sequence per iteration.
+
+=cut
+
+sub set_sequence {
+       my $self = shift;
+       my ($id, $title, @data) = @_;
+       push @{$self->{_seqs}}, [ $id, $title, @data ];
+}
+
+=head2 $blc->get_sequence($number)
+
+Returns a list of the ($id, $title, @sequences) where each member of @sequences is from the itterations, from first to final. Defaults to the first sequences.
+
+=cut
+
+sub get_sequence {
+       my ($self, $number) = @_;
+       if (defined $number and $number > $self->get_num_seqs) {
+               croak "You're trying to retrive a sequence past than the end of the BLC file in get_sequence($number)";
+       }
+       elsif (defined $number and $self->get_num_seqs + $number < 0) {
+               croak "You're trying to retrive a sequence before the begining of the BLC file in get_sequence($number)";
+       }
+       $number = 0 unless defined $number;
+
+       return @{${$self->{_seqs}}[$number]};
+}
+
+=head2 $blc->get_num_seqs()
+
+Returns the number of sequences in the BLC file.
+
+=cut
+
+sub get_num_seqs {
+       my ($self) = @_;
+       return $#{$self->{_seqs}};
+}
+
+=head2 $blc->get_sequences($iteration)
+
+Returns all of the sequences in a block file for a particular itteration, in the same order as they occured in the block file. If left undefined, it will return the sequences from the first itteration.
+
+=cut
+
+sub get_sequences {
+       my ($self, $iteration) = @_;
+       $iteration = 0 unless $iteration and $iteration > 0;
+       return map { $_->[SEQ + $iteration] } @{$self->{_seqs}};
+}
+
+=head2 $blc->get_seq_ids($number)
+
+Returns the ID for the $number sequence to occur in the file, if left undefined it'll return all of the IDs.
+
+=cut
+
+sub get_seq_ids {
+       my ($self, $number) = @_;
+       if (defined $number) {
+               return ${$self->{_seqs}[$number]}[ID];
+       }
+       else {
+               return map { $_->[ID] } @{$self->{_seqs}};
+       }
+}
+
+=head2 $blc->get_seq_titles($number)
+
+Returns the titles for the number sequence to occur in the file, if left undefined it'll return all of the titles.
+
+=cut
+
+sub get_seq_titles {
+       my ($self, $number) = @_;
+       if (defined $number) {
+               return ${$self->{_seqs}[$number]}[TITLE];
+       }
+       else {
+               return map { $_->[TITLE] } @{$self->{_seqs}};
+       }
+}
+
+=head2 $blc->print_blc($fh);
+
+This will print the BLC file object to the filehandle if given, otherwise to STDOUT.
+
+=cut
+
+sub print_blc {
+       my ($self, $fh) = @_;
+
+       if ($fh) { *OUT = $fh }
+       else { *OUT = *STDOUT }
+
+       # Print the IDs
+       print OUT ">$_\n" foreach $self->get_seq_ids;
+
+       # Get the sequences
+       #my @sequences = $self->get_sequences
+       my $i = 0;
+       while (1) {
+               my @sequences = $self->get_sequences($i);
+               last unless defined $sequences[0];
+               print OUT "* iteration ".($i + 1)."\n";
+               foreach my $j (0..length($sequences[0]) -1) {
+                       foreach (@sequences) {
+                               print OUT substr $_, $j, 1;
+                       }
+                       print OUT "\n";
+               }
+               print OUT "*\n";
+               $i++;
+       } 
+}
+
+=head2 $blc->print_fasta($fh)
+
+Prints the BLC file out in FASTA format, each sequence is 72 characters wide.
+
+=cut
+
+sub print_fasta {
+       my ($self, $fh) = @_;
+
+       if ($fh) { *OUT = $fh }
+       else { *OUT = *STDOUT }
+
+       my @ids = $self->get_seq_ids;
+       my @sequences = $self->get_sequences;
+
+       croak "Different number of sequences and IDs\n" unless @ids == @sequences;
+       foreach (0..$#ids) {
+               print OUT ">$ids[$_]\n";
+               $sequences[$_] =~ s/(.{72})/$1\n/g;
+               print OUT "$sequences[$_]\n";
+       }
+}
+
+=head2 $blc->next_itteration;
+
+=cut
+
+sub next_itteration {
+       my ($self) = @_;
+       $self->itteration($self->itteration + 1);
+}
+
+=head2 $blc->next_itteration;
+
+=cut
+
+sub max_itteration {
+       my ($self) = @_;
+       return $self->{_max_itteration};
+}
+
+=head2 $blc->itteration($number)
+
+If $number is defined, sets the itteration to $number, otherwise returns the number of itterations.
+
+=cut
+
+sub itteration {
+       my ($self, $itteration) = @_;
+       if (defined $itteration) {
+               if ($itteration < 1) {
+                       return undef;
+               }
+               #if ($itteration > $self->{_max_itteration};
+               #$self->{_itteration} = $itteration;
+       }
+       else {
+               return $self->{_itteration};
+       }
+}
+
+1;
diff --git a/jpred/lib/Common.pm b/jpred/lib/Common.pm

new file mode 100644 (file)

index 0000000..c3fcf65
--- /dev/null
+++ b/jpred/lib/Common.pm
@@ -0,0 +1,32 @@
+package Common;
+use strict;
+use warnings;
+use Carp;
+
+sub path {
+       my ($self, $path) = @_;
+       if (defined $path) { $self->{__PACKAGE__."path"} = $path }
+       else {
+               if (defined $self->{__PACKAGE__."path"}) {
+                       return $self->{__PACKAGE__."path"}
+               }
+               else { croak "path() is not defined" }
+       }
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Common
+
+=head1 DESCRIPTION
+
+Methods that might be useful to any other object, but not nessecery.
+
+=head1 METHODS
+
+=head2 $obj->path($path)
+
+Get/Setter for the path of a filename.
diff --git a/jpred/lib/Concise.pm b/jpred/lib/Concise.pm

new file mode 100644 (file)

index 0000000..c11b513
--- /dev/null
+++ b/jpred/lib/Concise.pm
@@ -0,0 +1,29 @@
+package Concise;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root Sequence);
+
+sub id {
+       my ($self, $id) = @_;
+
+       if (defined $id) {
+               warn "'$1' detected in ID, changing to ';'\n" if $id =~ s/([:#])/;/
+       }
+
+       $self->SUPER::id($id);
+}
+
+sub seq {
+       my ($self, @entries) = @_;
+
+       for (@entries) {
+               warn "'$1' detected in entry, changing to ';'\n" if s/([:#,])/;/
+       }
+
+       $self->SUPER::seq(@entries);
+}
+
+1;
diff --git a/jpred/lib/Concise/File.pm b/jpred/lib/Concise/File.pm

new file mode 100644 (file)

index 0000000..b9835e2
--- /dev/null
+++ b/jpred/lib/Concise/File.pm
@@ -0,0 +1,93 @@
+package Concise::File;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root Read Write Sequence::File);
+use Concise;
+
+# Pass a filehandle
+sub read {
+       my ($self, $fh) = @_;
+
+       while (<$fh>) {
+               chomp;
+
+               next if /^\s*$/;        # Skip blank lines
+               next if /^#/;           # skip comment lines
+#              s/#.*//g;                       # Clear comments
+               
+               my ($head, $field) = split /:/, $_, 2;
+
+               unless (defined $field and defined $head) {
+                       carp "Line $. doesn't match concise file format, skipping";
+                       next;
+               }
+
+               $field =~ s/,$//g;
+               my @fields = split /,/, $field;
+
+               my $new = Concise->new(id => $head);
+               $new->seq(@fields);
+               $self->add_entries($new);
+       }
+}
+
+sub write {
+       my ($self, $fh) = @_;
+
+       for ($self->get_entries) {
+               my $id = $_->id;
+               my @seq = $_->seq;
+
+               my $seq = join ',', @seq;
+               $seq =~ s/,$//;
+
+               print $fh "$id:$seq\n";
+       }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Concise::File - Module to read Concise file
+
+=head1 SYNOPSYS
+
+  # Read a concise file and print the id's in the file
+  $concise = Concise::File->new(read_file => "path_to_file");
+  for ($concise->get_entries) {
+    print $_->id, "\n";
+  }
+
+=head1 DESCRIPTION
+
+This module allows you to read concise files and then get all the information held in the file or to select those entries you want by their IDs.
+
+=head1 METHODS
+
+This module inherits from the Root, Read and Write modules. See these for default methods.
+
+=over
+
+=item $concise->add_entries(@Concise)
+
+Add a list of Concise objects to the Concise::File object.
+
+=item $concise->get_entries
+
+Returns a list of the entries held in the object. These are held as Concise objects.
+
+=item $concise->get_entry_by_id( qr/foo/ )
+
+Get entries dependant upon their IDs and a regex. The argument can either be a regex object or a string to be interpreted as a regex. A list of Concise objects is returned.
+
+=back
+
+=head1 AUTHOR
+
+Jonathan Barber (jon@compbio.dundee.ac.uk)
diff --git a/jpred/lib/DSSP.pm b/jpred/lib/DSSP.pm

new file mode 100644 (file)

index 0000000..d82fc89
--- /dev/null
+++ b/jpred/lib/DSSP.pm
@@ -0,0 +1,260 @@
+package DSSP;
+
+use strict;
+use warnings;
+use Carp;
+use IO::String;
+use UNIVERSAL qw(isa);
+
+use base qw(Root Read);
+
+=head1 DSSP
+
+Object to hold some of the information found in a DSSP file.
+
+=head2 my $dssp = DSSP->new;
+
+Creates new object. Any accessor method is valid to use as an argument in a hash key/value set of pairs.
+
+=head2 $dssp->read($string);
+
+Reads in the DSSP file contained within the $string. Returns true if it there is information in the file, otherwise false.
+
+=cut
+
+sub read {
+       my ($self, $data) = @_;
+
+       defined $data or croak "No filehandle passed\n";
+       local $/ = "\n";
+       my $flag = 0; # Test to see if there's any information in the file
+
+       while (<$data>) {
+               if (/^  #  RESIDUE AA STRUCTURE/) {
+                       $flag = 1;
+                       my $chain;
+
+                       while (<$data>) {
+                               chomp;
+                               my $aa_pri = substr($_, 13, 1);
+
+                               # Chain break, has to come here in order to remember what the
+                               # chain was
+                               if ($aa_pri eq '!') {
+                                       $self->add_position(
+                                               DSSP::Position->new(
+                                                       chain => $chain,
+                                                       break => 1
+                                               )
+                                       );
+                                       next;
+                               }
+
+                               # If there isn't a chain break, continue as normal
+                               my $aa_sec = substr($_, 16, 1);
+                               my $aa_acc = substr($_, 35, 3);
+                               $chain = substr($_, 11, 1);
+
+                               # Field is six wide to include alternatives, but I'm ignoring
+                               # those
+                               my $pos = substr($_, 5, 6);
+                               my $seq_pos = substr($_, 0, 5);
+
+                               # Replace empty chain defn with NULL
+                               $chain = 'NULL' if $chain eq ' ';
+
+                               # Replace lower case letters (cystine
+                               # bridge pairs) with 'c'
+                               $aa_pri =~ tr/[a-z]/c/;
+
+                               $aa_sec =~ tr/ /-/;
+
+                               # Remove any padding whitespace
+                               for ($aa_pri, $aa_sec, $aa_acc, $chain, $pos, $seq_pos) {
+                                       s/ //g
+                               }
+
+                               $self->add_position(
+                                       DSSP::Position->new(
+                                               chain => $chain,
+                                               pri => $aa_pri,
+                                               sec => $aa_sec,
+                                               acc => $aa_acc,
+                                               off => $pos,
+                                               seq_off => $seq_pos,
+                                               break => undef
+                                       )
+                               );
+                       }
+               }
+       }
+       return $flag;
+}
+
+=head2 $dssp->add_position($DSSP::Position);
+
+Adds a position object.
+
+=cut
+
+sub add_position {
+       my ($self, $pos) = @_;
+
+       unless (defined $pos and isa $pos, 'DSSP::Position') {
+               croak "No object passed"
+       }
+
+       my ($chain, $offset);
+
+       defined($chain = $pos->chain) or croak "Chain not set\n"; 
+       unless ($pos->break) {
+               defined($offset = $pos->off) or croak "Offset not set\n";
+       }
+
+       # Use a stack based system now
+       push @{ $self->{__PACKAGE__."pos"}{$chain} }, $pos;
+
+       # This is for backwards compatibility, shudder, also required changes in
+       # get_position() and position()
+       unless ($pos->break) {
+               $self->{__PACKAGE__."offset_compat"}{offset}{$chain}{$offset} = \$self->{__PACKAGE__."pos"}{$chain}->[ -1 ];
+
+               if (exists $self->{__PACKAGE__."pos"}{offset}{$chain}{$offset}) {
+                       my $out = "Overwriting DSSP position $offset in chain $chain";
+                       $out .= $self->path ? " for file ".$self->path."\n" : "\n";
+                       warn $out;
+               }
+       }
+
+       # Add the chain if not already present
+       my @chains = grep { defined } $self->chains;
+       unless (grep { $chain eq $_ } @chains) {
+               $self->chains(@chains, $chain);
+       }
+}
+
+=head2 $DSSP::Position = $dssp->get_chain_defs($chain)
+
+New method to replace get_position() and positions(). Returns all of the DSSP::Position objects for a particular chain in the order they are seen in the DSSP file. Then you have to decide what to do with them.
+
+=cut
+
+sub get_chain_defs {
+       my ($self, $chain) = @_;
+
+       croak "Chain not recognised" unless grep { $_ eq $chain } $self->chains;
+
+       return @{ $self->{__PACKAGE__."pos"}{$chain} };
+}
+
+=head2 $DSSP::Position = $dssp->get_position($chain, $offset)
+
+Depricated! Don't use this methods as it's broken, it's available for backwards compatibility.
+
+Returns the position object for a particular location.
+
+=cut
+
+sub get_position {
+       my ($self, $chain, $offset) = @_;
+
+       unless (defined $chain) { croak "No chain given" }
+       unless (defined $offset) { croak "No offset given" }
+
+       confess("No such chain '$chain'") and return unless exists $self->{__PACKAGE__."pos"}{$chain};
+
+       confess("No such offset $offset in chain $chain") and return
+               #unless exists $self->{__PACKAGE__."pos"}{$chain};
+               unless exists $self->{__PACKAGE__."offset_compat"}{offset}{$chain}{$offset};
+
+       return ${ $self->{__PACKAGE__."offset_compat"}{offset}{$chain}{$offset} };
+}
+
+=head2 $DSSP::Position = $dssp->positions($chain);
+
+Depricated! Don't use this methods as it's broken, it's available for backwards compatibility.
+
+Returns all the offsets that are available for a particular chain, sorted by offset.
+
+=cut
+
+sub positions {
+       my ($self, $chain) = @_;
+
+       unless (defined $chain) { croak "No chain given" }
+
+       croak "No such chain $chain" unless exists $self->{__PACKAGE__."pos"}{$chain};
+
+       no warnings;
+       return
+               map { join "", @{$_} }
+               sort { $a->[0] <=> $b->[0] || $a->[1] cmp $b->[1] }
+               map { [ /^(-?\d+)(\D+)?$/ ] }
+               #keys %{$self->{__PACKAGE__."pos"}{$chain}};
+               keys %{$self->{__PACKAGE__."offset_compat"}{offset}{$chain}};
+}
+
+=head2 @chains = $dssp->chains;
+
+Finds the chains in the structure, or alters them, but don't do that. Returns an empty list if no chains have been found.
+
+=cut
+
+sub chains {
+       my ($self, @chains) = @_;
+       if (@chains) { @{$self->{__PACKAGE__."chains"}} = @chains }
+       elsif ($self->{__PACKAGE__."chains"}) {
+               return @{ $self->{__PACKAGE__."chains"} };
+       }
+       else {
+               return ();
+       }
+}
+
+=head1 DSSP::Position
+
+Object to hold information on each position of a DSSP output.
+
+=head2 my $dssp_pos = DSSP::Position->new;
+
+Create a new object.
+
+=head2 Accessors
+
+=over
+
+=item * chain
+
+Holds the chain ID.
+
+=item * pri
+
+Holds the primary structre.
+
+=item * sec
+
+Holds the secondary structre.
+
+=item * acc
+
+Holds the solvent structre.
+
+=item * off
+
+Holds the position in the PDB structure.
+
+=back
+
+=cut
+
+use Class::Struct "DSSP::Position" => {
+       chain => '$',
+       pri => '$',
+       sec => '$',
+       acc => '$',
+       off => '$',
+       seq_off => '$',
+       break => '$'
+};
+
+1;
diff --git a/jpred/lib/DSSP/SCOP.pm b/jpred/lib/DSSP/SCOP.pm

new file mode 100644 (file)

index 0000000..5975c33
--- /dev/null
+++ b/jpred/lib/DSSP/SCOP.pm
@@ -0,0 +1,134 @@
+package DSSP::SCOP;
+
+use strict;
+use warnings;
+use Carp;
+use Scalar::Util qw(looks_like_number);
+
+use base qw(DSSP);
+use Utils qw($split_resid sort_resid);
+
+=head1 NAME
+
+DSSP::SCOP - Module to select DSSP records from SCOP description
+
+=head1 SYNOPSIS
+
+  my $dssp = DSSP::SCOP->new(read_file => "foo.dssp");
+  my @range = $dssp->get_range("A:33-416");
+  
+=head1 DESCRIPTION
+
+A module that provides a method of selecting a range of residues from a DSSP file. The description of the range is in the format used by SCOP version 1.65 in the dir.des file.
+
+=head1 METHODS
+
+The module inherits all the methods from the DSSP module.
+
+=head2 $dssp->get_range("A:1-200,B:1-30")
+
+Returns the DSSP::Position objects from within the range given. These are in the same order as they are presented in the DSSP file. They are returned as an array of array's, with each range having it's own array.
+
+=cut
+
+sub get_range {
+       my ($self, $defn) = @_;
+
+       $defn =~ / / and croak "get_range('$defn') has a space in it, which isn't allowed";
+       my @ranges = split /,/, $defn;
+
+       my @data;
+       for (@ranges) {
+               my @range;
+               # A:1-200 or A:
+               if (/:/) {
+                       my ($chain, $range) = split /:/, $_;
+
+                       # A:1-200
+                       if ($range =~ /([[:alnum:]]+)-([[:alnum:]]+)/) {
+                               push @data, [ $self->select_positions($chain, $1, $2) ];
+                       }
+                       # Hmmmmm.
+                       elsif ($range) {
+                               confess "You shouldn't be here";
+                       }
+                       # A:
+                       else {
+                               push @data, [ $self->get_chain_defs($chain) ];
+                       }
+               }
+               else {
+                       # 297-368
+                       if (/([[:alnum:]]+)-([[:alnum:]]+)/) {
+                               my @chains = $self->chains;
+                               if (@chains > 1) {
+                                       die "Don't know which chain to select"
+                               }
+                               my ($chain) = @chains;
+
+                               push @data, [ $self->select_positions($chain, $1, $2) ];
+                       }
+                       # -
+                       elsif (/^-$/) {
+                               for my $chain ($self->chains) {
+                                       push @data, [ $self->get_chain_defs($chain) ];
+                               }
+                       }
+                       # Otherwise we don't know what it is
+                       else {
+                               confess "Missed particular '$_'";
+                       }
+               }
+       }
+
+       return @data;
+}
+
+=head2 $self->select_positions($chain, $start, $end);
+
+Passed a chain
+
+# Select those, $chain, $start, $end;
+
+=cut
+
+sub select_positions {
+       my ($self, $chain, $start, $end) = @_;
+
+       confess "Not passed all arguments" if grep { not defined } $chain, $start, $end;
+       confess "Not passed a valid chain" unless grep { $_ eq $chain } $self->chains;
+       
+       my ($flag, @positions) = 0;
+       my $test = sub {
+               my ($off, $start, $end) = @_;
+               if (3 == grep { looks_like_number $_ } $off, $start, $end) {
+                       return 1 if $off >= $start and $off <= $end;
+               }
+               else {
+                       return undef;
+               }
+       };
+
+       for my $pos ($self->get_chain_defs($chain)) {
+               if ($pos->break) {
+                       push @positions, $pos if $flag;
+                       next;
+               }
+
+               my $off = $pos->off;
+               if (
+                       ($off eq $start or $test->($off, $start, $end))
+                       ..
+                       ($off eq $end or $test->($off, $start, $end))
+       #       $test($off, $start, $end) $off >= $start and $off <= $end)
+       #($off eq $start or )
+               ) {
+                       push @positions, $pos;
+                       $flag = 1;
+               }
+               else { $flag = 0 }
+       }
+       return @positions;
+}
+
+1;
diff --git a/jpred/lib/FASTA.pm b/jpred/lib/FASTA.pm

new file mode 100644 (file)

index 0000000..e0a81c7
--- /dev/null
+++ b/jpred/lib/FASTA.pm
@@ -0,0 +1,29 @@
+package FASTA;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root Sequence);
+
+sub id {
+       my ($self, $id) = @_;
+
+       if (defined $id) {
+               #warn "'$1' detected in ID, changing to ';'\n" if $id =~ s/([:#])/;/
+       }
+
+       $self->SUPER::id($id);
+}
+
+sub seq {
+       my ($self, @entries) = @_;
+
+       for (@entries) {
+               warn "'$1' detected in entry, changing to ';'\n" if s/([:#,])/;/
+       }
+
+       $self->SUPER::seq(@entries);
+}
+
+1;
diff --git a/jpred/lib/FASTA/File.pm b/jpred/lib/FASTA/File.pm

new file mode 100644 (file)

index 0000000..a4356d0
--- /dev/null
+++ b/jpred/lib/FASTA/File.pm
@@ -0,0 +1,70 @@
+package FASTA::File;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Sequence::File);
+use FASTA;
+
+sub old_read {
+       my ($self, $fh) = @_;
+
+       my ($id, $seq, @seqs);
+       while (<$fh>) {
+               chomp;
+               next if /^\s+$/;
+
+               if (s/^>//) {
+                       push @seqs, [ $id, $seq ] if $id and $seq;
+                       $seq = undef;
+                       $id = $_;
+               }
+               else { $seq .= $_ }
+       }
+       push @seqs, [ $id, $seq ] if $id and $seq;
+
+       for (@seqs) {
+               my $new = FASTA->new(id => ${$_}[0]);
+               $new->seq(split //, ${$_}[1]);
+               $self->add_entries($new);
+       }
+
+       1;
+}
+
+sub read {
+       my ($self, $fh) = @_;
+       local $/ = "\n>";
+       while (<$fh>) {
+               s/^>//g;
+               s/>$//g;
+
+               my ($id, @data) = split /\n/, $_;
+               my $entry = FASTA->new(id => $id);
+               $entry->seq( split //, join("", @data) );
+
+               $self->add_entries($entry);
+       }
+
+       1;
+}
+
+sub write {
+       my ($self, $fh) = @_;
+
+       local $| = 1;
+       
+       for ($self->get_entries) {
+               my $id = $_->id;
+               my @seq = $_->seq;
+
+               my $seq = join '', @seq;
+               $seq =~ s/\s*//g;
+               $seq =~ s/(.{72})/$1\n/g;
+
+               print $fh ">$id\n$seq\n";
+       }
+}
+
+1;
diff --git a/jpred/lib/Filelist.pm b/jpred/lib/Filelist.pm

new file mode 100644 (file)

index 0000000..b37adce
--- /dev/null
+++ b/jpred/lib/Filelist.pm
@@ -0,0 +1,75 @@
+package Filelist;
+use strict;
+use warnings;
+use base qw(Root Read Write);
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+       while (<$fh>) {
+               chomp;
+               my @files = split /\s+/, $_;
+               $self->add_files(@files);
+       }
+}
+
+sub add_files {
+       my ($self, @files) = @_;
+       push @{ $self->{__PACKAGE__."files"} }, \@files;
+}
+
+sub get_files {
+       my ($self, @list) = @_;
+       if (@list) { return @{ $self->{__PACKAGE__."files"} }[@list] }
+       else { return @{ $self->{__PACKAGE__."files"} } }
+}
+
+sub all_files {
+       my ($self) = @_;
+       $self->{__PACKAGE__."offset"} ||= 0;
+
+       my $files = $self->get_files( $self->{__PACKAGE__."offset"}++ );
+       defined $files or undef $self->{__PACKAGE__."offset"}, return;
+       return @{$files};
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Filelist - Reads in a list of files.
+
+=head1 SYNOPSIS
+
+Reads in a file containing a list of files per line and allows you to retrieve them easily.
+
+=head1 FILE EXAMPLE
+
+  foo0 bar0
+  foo1 bar1
+  foo2 bar2 moo2
+
+=head1 INHERITS
+
+Root, Read and Write.
+
+=head1 METHODS
+
+=head2 $list->read($filehandle)
+
+Reads in the filelist. Splits the lines in the file on whitespace to find the files.
+
+=head2 $list->add_files(@paths);
+
+Add your own files to an existing Filelist object.
+
+=head2 $list->get_files(@offsets)
+
+If @offsets is not an empty list, returns an AoA of the files. Otherwise returns all off the files as an AoA.
+
+=head2 $list->all_files()
+
+Itterator for retrieving all the files held by the object. Every call returns the files as from get_files() until all of the lines have been gone through, when it returns undef.
diff --git a/jpred/lib/HMMER/Profile.pm b/jpred/lib/HMMER/Profile.pm

new file mode 100644 (file)

index 0000000..19e7085
--- /dev/null
+++ b/jpred/lib/HMMER/Profile.pm
@@ -0,0 +1,102 @@
+package HMMER::Profile;
+
+use strict;
+use warnings;
+use Carp;
+use base qw(Root Read Write);
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+
+       my ($flag, @data) = 0;
+       while (my $line = <$fh>) {
+               $flag = 1, next if $line =~ /^Cons/;
+               next unless $flag;
+               next if $line =~ /^! \d+/;
+               last if $line =~ /^ \*/;
+
+               chomp $line;
+               $line =~ s/^\s*//g;
+               #$self->seq($self->seq, [ (split / +/, $line)[1..24] ]);
+
+               $self->add_line((split /\s+/, $line)[1..24]);
+       }
+}
+
+sub write {
+       my ($self, $fh) = @_;
+       for ($self->get_line) {
+               print join(" ", @{ $_}) , "\n";
+               exit;
+               #print join(" ", map { @{ $_ }), "\n";
+               print $fh join(" ", map { sprintf "%.5f", $_ } @{ $_ }), "\n";
+       }
+}
+
+sub add_line {
+       my $self = shift;
+       @_ or croak "No data passed to HMMER::Profile::add_line";
+       push @{ $self->{__PACKAGE__."lines"} }, pack "d*", @_;
+}
+
+sub get_line {
+       my ($self, $line) = @_;
+
+       if (defined $line) { return [ unpack "d*", ${$self->{__PACKAGE__."lines"}}[$line] ] }
+       else {
+               return map { [ unpack "d*", $_ ] } @{$self->{__PACKAGE__."lines"}}
+       }
+}
+
+1;
+__END__
+
+=head1 NAME
+
+HMMER::Profile - Convert the output of hmmconvert to a profile for Jnet
+
+=head1 EXAMPLE
+
+system("hmmconvert -p hmmer.model hmmer.prf");
+
+my $prof = HMMER::Profile->new;
+$prof->read_file("hmmer.prf");
+
+my @data = $prof->jnet
+print join(" ", @{$_}), "\n" for @data;
+
+=head1 DESCRIPTION
+
+Takes the output of the HMMER program hmmconvert with the -p option and calculates the profile used by Jpred to calculate secondary structure.
+
+=head1 METHODS
+
+=head2 my $prof = HMMER::Profile->new;
+
+Create a new object.
+
+=head2 $prof->read_file("path_to_hmmer_model_converted_to_prf");
+
+Read a HMMER model that's been converted to PRF format via the HMMER hmmconvert program with the -p option.
+
+=head2 my @data = $prof->seq
+
+Returns an array of array refs of the HMMER numbers.
+
+=head2 my @data = $prof->jnet
+
+Returns an array of array refs of frequences for use in Jnet.
+
+=head1 SEE ALSO
+
+Packages Root, Read and Sequence.
+
+=head1 BUGS
+
+The whole set of profile modules is a bit of an abortion now, as there is no standard interface and the methods do different things for different objects. Converting the different representations is a nightmare because of this.
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
diff --git a/jpred/lib/HMMER/Profile/Jnet.pm b/jpred/lib/HMMER/Profile/Jnet.pm

new file mode 100644 (file)

index 0000000..febc73d
--- /dev/null
+++ b/jpred/lib/HMMER/Profile/Jnet.pm
@@ -0,0 +1,80 @@
+package HMMER::Profile::Jnet;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(HMMER::Profile Write);
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+
+       while (my $line = <$fh>) {
+               chomp $line;
+               my @data = split / /, $line;
+               croak "HMMER::Profile::Jnet: Wrong number of data items in ".$self->path." at line $." if @data != 24;
+               $self->add_line(@data);
+       }
+}
+
+sub jnet {
+       my ($self) = @_;
+       my @data;
+       for ($self->get_line) {
+               # This provides the same output as for hmm2profile
+               push @data, [
+                       map {
+                               sprintf "%2.5f", 1 / (1 + (exp(-($_ / 100))));
+                       } @{$_}
+               ];
+       }
+       @data;
+}
+
+sub write {
+       my ($self, $fh) = @_;
+
+       #die "Sorry, I don't think you mean to use this function. If you do, remove this line";
+
+       for ($self->jnet) {
+               print $fh join(" ", @{$_}), "\n";
+       }
+}
+
+*seq = *jnet;
+
+1;
+
+__END__
+
+=head1 NAME
+
+HMMER::Profile::Jnet - Read in the data output from HMMER::Profile::jnet()
+
+=head1 DESCRIPTION
+
+Reads the output of the HMMER::Profile::jnet() function.
+
+=head1 METHODS
+
+=head2 my $prof = HMMER::Profile::Jnet->new;
+
+Create a new object.
+
+=head2 $prof->read_file($path_to_file);
+
+Read in the data.
+
+=head2 my @data = $prof->seq
+
+Returns an AoA of the numbers.
+
+=head1 SEE ALSO
+
+Packages Root, Read, Sequence and HMMER::Profile.
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
diff --git a/jpred/lib/HPM.pm b/jpred/lib/HPM.pm

new file mode 100644 (file)

index 0000000..02e1f81
--- /dev/null
+++ b/jpred/lib/HPM.pm
@@ -0,0 +1,94 @@
+package HPM;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Exporter);
+
+our @EXPORT_OK = qw(hydrophobic_moment);
+
+=head1 NAME
+
+HPM - module to calculate hydrophobic moments of a peptide sequence
+
+=head1 $moment = hydrophobic_moment($angle, @sequence)
+
+Calculate the hydrophobic moment (Eisenberg et al.) of a series of amino acids in @sequence for angle $angle in degrees. This is 100 for alpha helices and between 160 and 180.
+
+An example helix is: RIIRRIIRRIRRIIRRII which has a maximum at 100 degrees.
+An example sheet is: ALALALALAL, with a maximum at between 160-180 degrees.
+
+See Eisenberg et al., PNAS, 1984, v81, p140-144 for more details.
+
+=cut
+
+# Hydrophobic moments from Eisenberg et al., PNAS, 1984, v81, p140.
+my %moments = (
+       I => 0.73,
+       F => 0.61,
+       V => 0.54,
+       L => 0.53,
+       W => 0.37,
+       M => 0.26,
+       A => 0.25,
+       G => 0.16,
+       C => 0.04,
+       Y => 0.02,
+       P => -0.07,
+       T => -0.18,
+       S => -0.26,
+       H => -0.40,
+       E => -0.62,
+       N => -0.64,
+       Q => -0.69,
+       D => -0.72,
+       K => -1.10,
+       R => -1.76
+);
+
+# Hydrophobic moments from EMBOSS 2.9.0 hmoment program. Gives similar graphs
+# to the above moments.
+my %e_moments = (
+       A => 0.62,
+       B => -100,
+       C => 0.29,
+       D => -0.90,
+       E => -0.74,
+       F => 1.19,
+       G => 0.48,
+       H => -0.4,
+       I => 1.38,
+       J => -100,
+       K => -1.5,
+       L => 1.06,
+       M => 0.64,
+       N => -0.78,
+       O => -100,
+       P => 0.12,
+       Q => -0.85,
+       R => -2.53,
+       S => -0.18,
+       T => -0.05,
+       U => -100,
+       V => 1.08,
+       W => 0.81,
+       X => -100,
+       Y => 0.26,
+       Z => -100,
+);
+
+sub deg2rad { $_[0] and return $_[0] * (3.141593 / 180) }
+
+sub hydrophobic_moment {
+       my ($angle, @seq) = @_;
+       my $tangle = $angle;
+
+       my ($sum_sin, $sum_cos) = (0, 0);
+       for (@seq) {
+               $sum_sin += ($moments{ $_ } * sin(deg2rad $tangle));
+               $sum_cos += ($moments{ $_ } * cos(deg2rad $tangle));
+               $tangle = ($tangle + $angle) % 360;
+       }
+       return sqrt($sum_sin * $sum_sin + $sum_cos * $sum_cos) / @seq;
+}
diff --git a/jpred/lib/IO/String.pm b/jpred/lib/IO/String.pm

new file mode 100644 (file)

index 0000000..c44bce4
--- /dev/null
+++ b/jpred/lib/IO/String.pm
@@ -0,0 +1,551 @@
+package IO::String;
+
+# Copyright 1998-2005 Gisle Aas.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+
+require 5.005_03;
+use strict;
+use vars qw($VERSION $DEBUG $IO_CONSTANTS);
+$VERSION = "1.08";  # $Date: 2007/02/05 17:08:37 $
+
+use Symbol ();
+
+sub new
+{
+    my $class = shift;
+    my $self = bless Symbol::gensym(), ref($class) || $class;
+    tie *$self, $self;
+    $self->open(@_);
+    return $self;
+}
+
+sub open
+{
+    my $self = shift;
+    return $self->new(@_) unless ref($self);
+
+    if (@_) {
+       my $bufref = ref($_[0]) ? $_[0] : \$_[0];
+       $$bufref = "" unless defined $$bufref;
+       *$self->{buf} = $bufref;
+    }
+    else {
+       my $buf = "";
+       *$self->{buf} = \$buf;
+    }
+    *$self->{pos} = 0;
+    *$self->{lno} = 0;
+    return $self;
+}
+
+sub pad
+{
+    my $self = shift;
+    my $old = *$self->{pad};
+    *$self->{pad} = substr($_[0], 0, 1) if @_;
+    return "\0" unless defined($old) && length($old);
+    return $old;
+}
+
+sub dump
+{
+    require Data::Dumper;
+    my $self = shift;
+    print Data::Dumper->Dump([$self], ['*self']);
+    print Data::Dumper->Dump([*$self{HASH}], ['$self{HASH}']);
+    return;
+}
+
+sub TIEHANDLE
+{
+    print "TIEHANDLE @_\n" if $DEBUG;
+    return $_[0] if ref($_[0]);
+    my $class = shift;
+    my $self = bless Symbol::gensym(), $class;
+    $self->open(@_);
+    return $self;
+}
+
+sub DESTROY
+{
+    print "DESTROY @_\n" if $DEBUG;
+}
+
+sub close
+{
+    my $self = shift;
+    delete *$self->{buf};
+    delete *$self->{pos};
+    delete *$self->{lno};
+    undef *$self if $] eq "5.008";  # workaround for some bug
+    return 1;
+}
+
+sub opened
+{
+    my $self = shift;
+    return defined *$self->{buf};
+}
+
+sub binmode
+{
+    my $self = shift;
+    return 1 unless @_;
+    # XXX don't know much about layers yet :-(
+    return 0;
+}
+
+sub getc
+{
+    my $self = shift;
+    my $buf;
+    return $buf if $self->read($buf, 1);
+    return undef;
+}
+
+sub ungetc
+{
+    my $self = shift;
+    $self->setpos($self->getpos() - 1);
+    return 1;
+}
+
+sub eof
+{
+    my $self = shift;
+    return length(${*$self->{buf}}) <= *$self->{pos};
+}
+
+sub print
+{
+    my $self = shift;
+    if (defined $\) {
+       if (defined $,) {
+           $self->write(join($,, @_).$\);
+       }
+       else {
+           $self->write(join("",@_).$\);
+       }
+    }
+    else {
+       if (defined $,) {
+           $self->write(join($,, @_));
+       }
+       else {
+           $self->write(join("",@_));
+       }
+    }
+    return 1;
+}
+*printflush = \*print;
+
+sub printf
+{
+    my $self = shift;
+    print "PRINTF(@_)\n" if $DEBUG;
+    my $fmt = shift;
+    $self->write(sprintf($fmt, @_));
+    return 1;
+}
+
+
+my($SEEK_SET, $SEEK_CUR, $SEEK_END);
+
+sub _init_seek_constants
+{
+    if ($IO_CONSTANTS) {
+       require IO::Handle;
+       $SEEK_SET = &IO::Handle::SEEK_SET;
+       $SEEK_CUR = &IO::Handle::SEEK_CUR;
+       $SEEK_END = &IO::Handle::SEEK_END;
+    }
+    else {
+       $SEEK_SET = 0;
+       $SEEK_CUR = 1;
+       $SEEK_END = 2;
+    }
+}
+
+
+sub seek
+{
+    my($self,$off,$whence) = @_;
+    my $buf = *$self->{buf} || return 0;
+    my $len = length($$buf);
+    my $pos = *$self->{pos};
+
+    _init_seek_constants() unless defined $SEEK_SET;
+
+    if    ($whence == $SEEK_SET) { $pos = $off }
+    elsif ($whence == $SEEK_CUR) { $pos += $off }
+    elsif ($whence == $SEEK_END) { $pos = $len + $off }
+    else                         { die "Bad whence ($whence)" }
+    print "SEEK(POS=$pos,OFF=$off,LEN=$len)\n" if $DEBUG;
+
+    $pos = 0 if $pos < 0;
+    $self->truncate($pos) if $pos > $len;  # extend file
+    *$self->{pos} = $pos;
+    return 1;
+}
+
+sub pos
+{
+    my $self = shift;
+    my $old = *$self->{pos};
+    if (@_) {
+       my $pos = shift || 0;
+       my $buf = *$self->{buf};
+       my $len = $buf ? length($$buf) : 0;
+       $pos = $len if $pos > $len;
+       *$self->{pos} = $pos;
+    }
+    return $old;
+}
+
+sub getpos { shift->pos; }
+
+*sysseek = \&seek;
+*setpos  = \&pos;
+*tell    = \&getpos;
+
+
+
+sub getline
+{
+    my $self = shift;
+    my $buf  = *$self->{buf} || return;
+    my $len  = length($$buf);
+    my $pos  = *$self->{pos};
+    return if $pos >= $len;
+
+    unless (defined $/) {  # slurp
+       *$self->{pos} = $len;
+       return substr($$buf, $pos);
+    }
+
+    unless (length $/) {  # paragraph mode
+       # XXX slow&lazy implementation using getc()
+       my $para = "";
+       my $eol = 0;
+       my $c;
+       while (defined($c = $self->getc)) {
+           if ($c eq "\n") {
+               $eol++;
+               next if $eol > 2;
+           }
+           elsif ($eol > 1) {
+               $self->ungetc($c);
+               last;
+           }
+           else {
+               $eol = 0;
+           }
+           $para .= $c;
+       }
+       return $para;   # XXX wantarray
+    }
+
+    my $idx = index($$buf,$/,$pos);
+    if ($idx < 0) {
+       # return rest of it
+       *$self->{pos} = $len;
+       $. = ++ *$self->{lno};
+       return substr($$buf, $pos);
+    }
+    $len = $idx - $pos + length($/);
+    *$self->{pos} += $len;
+    $. = ++ *$self->{lno};
+    return substr($$buf, $pos, $len);
+}
+
+sub getlines
+{
+    die "getlines() called in scalar context\n" unless wantarray;
+    my $self = shift;
+    my($line, @lines);
+    push(@lines, $line) while defined($line = $self->getline);
+    return @lines;
+}
+
+sub READLINE
+{
+    goto &getlines if wantarray;
+    goto &getline;
+}
+
+sub input_line_number
+{
+    my $self = shift;
+    my $old = *$self->{lno};
+    *$self->{lno} = shift if @_;
+    return $old;
+}
+
+sub truncate
+{
+    my $self = shift;
+    my $len = shift || 0;
+    my $buf = *$self->{buf};
+    if (length($$buf) >= $len) {
+       substr($$buf, $len) = '';
+       *$self->{pos} = $len if $len < *$self->{pos};
+    }
+    else {
+       $$buf .= ($self->pad x ($len - length($$buf)));
+    }
+    return 1;
+}
+
+sub read
+{
+    my $self = shift;
+    my $buf = *$self->{buf};
+    return undef unless $buf;
+
+    my $pos = *$self->{pos};
+    my $rem = length($$buf) - $pos;
+    my $len = $_[1];
+    $len = $rem if $len > $rem;
+    return undef if $len < 0;
+    if (@_ > 2) { # read offset
+       substr($_[0],$_[2]) = substr($$buf, $pos, $len);
+    }
+    else {
+       $_[0] = substr($$buf, $pos, $len);
+    }
+    *$self->{pos} += $len;
+    return $len;
+}
+
+sub write
+{
+    my $self = shift;
+    my $buf = *$self->{buf};
+    return unless $buf;
+
+    my $pos = *$self->{pos};
+    my $slen = length($_[0]);
+    my $len = $slen;
+    my $off = 0;
+    if (@_ > 1) {
+       $len = $_[1] if $_[1] < $len;
+       if (@_ > 2) {
+           $off = $_[2] || 0;
+           die "Offset outside string" if $off > $slen;
+           if ($off < 0) {
+               $off += $slen;
+               die "Offset outside string" if $off < 0;
+           }
+           my $rem = $slen - $off;
+           $len = $rem if $rem < $len;
+       }
+    }
+    substr($$buf, $pos, $len) = substr($_[0], $off, $len);
+    *$self->{pos} += $len;
+    return $len;
+}
+
+*sysread = \&read;
+*syswrite = \&write;
+
+sub stat
+{
+    my $self = shift;
+    return unless $self->opened;
+    return 1 unless wantarray;
+    my $len = length ${*$self->{buf}};
+
+    return (
+     undef, undef,  # dev, ino
+     0666,          # filemode
+     1,             # links
+     $>,            # user id
+     $),            # group id
+     undef,         # device id
+     $len,          # size
+     undef,         # atime
+     undef,         # mtime
+     undef,         # ctime
+     512,           # blksize
+     int(($len+511)/512)  # blocks
+    );
+}
+
+sub FILENO {
+    return undef;   # XXX perlfunc says this means the file is closed
+}
+
+sub blocking {
+    my $self = shift;
+    my $old = *$self->{blocking} || 0;
+    *$self->{blocking} = shift if @_;
+    return $old;
+}
+
+my $notmuch = sub { return };
+
+*fileno    = $notmuch;
+*error     = $notmuch;
+*clearerr  = $notmuch; 
+*sync      = $notmuch;
+*flush     = $notmuch;
+*setbuf    = $notmuch;
+*setvbuf   = $notmuch;
+
+*untaint   = $notmuch;
+*autoflush = $notmuch;
+*fcntl     = $notmuch;
+*ioctl     = $notmuch;
+
+*GETC   = \&getc;
+*PRINT  = \&print;
+*PRINTF = \&printf;
+*READ   = \&read;
+*WRITE  = \&write;
+*SEEK   = \&seek;
+*TELL   = \&getpos;
+*EOF    = \&eof;
+*CLOSE  = \&close;
+*BINMODE = \&binmode;
+
+
+sub string_ref
+{
+    my $self = shift;
+    return *$self->{buf};
+}
+*sref = \&string_ref;
+
+1;
+
+__END__
+
+=head1 NAME
+
+IO::String - Emulate file interface for in-core strings
+
+=head1 SYNOPSIS
+
+ use IO::String;
+ $io = IO::String->new;
+ $io = IO::String->new($var);
+ tie *IO, 'IO::String';
+
+ # read data
+ <$io>;
+ $io->getline;
+ read($io, $buf, 100);
+
+ # write data
+ print $io "string\n";
+ $io->print(@data);
+ syswrite($io, $buf, 100);
+
+ select $io;
+ printf "Some text %s\n", $str;
+
+ # seek
+ $pos = $io->getpos;
+ $io->setpos(0);        # rewind
+ $io->seek(-30, -1);
+ seek($io, 0, 0);
+
+=head1 DESCRIPTION
+
+The C<IO::String> module provides the C<IO::File> interface for in-core
+strings.  An C<IO::String> object can be attached to a string, and
+makes it possible to use the normal file operations for reading or
+writing data, as well as for seeking to various locations of the string.
+This is useful when you want to use a library module that only
+provides an interface to file handles on data that you have in a string
+variable.
+
+Note that perl-5.8 and better has built-in support for "in memory"
+files, which are set up by passing a reference instead of a filename
+to the open() call. The reason for using this module is that it
+makes the code backwards compatible with older versions of Perl.
+
+The C<IO::String> module provides an interface compatible with
+C<IO::File> as distributed with F<IO-1.20>, but the following methods
+are not available: new_from_fd, fdopen, format_write,
+format_page_number, format_lines_per_page, format_lines_left,
+format_name, format_top_name.
+
+The following methods are specific to the C<IO::String> class:
+
+=over 4
+
+=item $io = IO::String->new
+
+=item $io = IO::String->new( $string )
+
+The constructor returns a newly-created C<IO::String> object.  It
+takes an optional argument, which is the string to read from or write
+into.  If no $string argument is given, then an internal buffer
+(initially empty) is allocated.
+
+The C<IO::String> object returned is tied to itself.  This means
+that you can use most Perl I/O built-ins on it too: readline, <>, getc,
+print, printf, syswrite, sysread, close.
+
+=item $io->open
+
+=item $io->open( $string )
+
+Attaches an existing IO::String object to some other $string, or
+allocates a new internal buffer (if no argument is given).  The
+position is reset to 0.
+
+=item $io->string_ref
+
+Returns a reference to the string that is attached to
+the C<IO::String> object.  Most useful when you let the C<IO::String>
+create an internal buffer to write into.
+
+=item $io->pad
+
+=item $io->pad( $char )
+
+Specifies the padding to use if
+the string is extended by either the seek() or truncate() methods.  It
+is a single character and defaults to "\0".
+
+=item $io->pos
+
+=item $io->pos( $newpos )
+
+Yet another interface for reading and setting the current read/write
+position within the string (the normal getpos/setpos/tell/seek
+methods are also available).  The pos() method always returns the
+old position, and if you pass it an argument it sets the new
+position.
+
+There is (deliberately) a difference between the setpos() and seek()
+methods in that seek() extends the string (with the specified
+padding) if you go to a location past the end, whereas setpos()
+just snaps back to the end.  If truncate() is used to extend the string,
+then it works as seek().
+
+=back
+
+=head1 BUGS
+
+In Perl versions < 5.6, the TIEHANDLE interface was incomplete.
+If you use such a Perl, then seek(), tell(), eof(), fileno(), binmode() will
+not do anything on an C<IO::String> handle.  See L<perltie> for
+details.
+
+=head1 SEE ALSO
+
+L<IO::File>, L<IO::Stringy>, L<perlfunc/open>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2005 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
diff --git a/jpred/lib/Index.pm b/jpred/lib/Index.pm

new file mode 100644 (file)

index 0000000..2920c61
--- /dev/null
+++ b/jpred/lib/Index.pm
@@ -0,0 +1,56 @@
+package Index;
+
+use strict;
+use warnings;
+use Carp;
+
+our @ISA;
+
+=head1 NAME
+
+Index.pm - Module for creating indexed databases and retrieving the sequences from them
+
+=head1 SYNOPSIS
+
+  my $index = Index->new("Index::EMBOSS")
+
+=head1 DESCRIPTION
+
+This module provides an interface into retriving sequences from an indexed database, or given a database creates an index. First you have to choose what the type of index you want to use. Then there are a set of common functions for do these things.
+
+=head1 EXAMPLE
+
+First run:
+  my $index = Index->new("Index::EMBOSS")
+
+Where Index::EMBOSS is a type of database/indexing system you want to use and there is a module available for. This makes a load of methods available which can be called. These are generic but should work on specific databases.
+
+=head1 AVAILABLE METHODS
+
+You should then be able to call various methods on the Index object.
+
+=cut
+
+sub new {
+       my ($class, %args) = @_;
+
+       my $type = $args{type};
+       eval "use $type";
+       push @ISA, $type;
+
+       my $self = {};
+       bless $self, ref($class) || $class;
+       return $self;
+}
+
+=head1 SEE ALSO
+
+Look at the various perldoc pages for Index::*
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+=cut
+
+1;
diff --git a/jpred/lib/Index/EMBOSS.pm b/jpred/lib/Index/EMBOSS.pm

new file mode 100644 (file)

index 0000000..4d2d06c
--- /dev/null
+++ b/jpred/lib/Index/EMBOSS.pm
@@ -0,0 +1,51 @@
+package Index::EMBOSS;
+
+use strict;
+use warnings;
+use Carp;
+
+use POSIX qw(WIFEXITED WEXITSTATUS WIFSIGNALED WTERMSIG);
+use IPC::Open3;
+
+use base qw(Root);
+
+use FASTA::File;
+
+sub read_index { 1 }
+
+sub get_sequence {
+       my ($self, $key) = @_;
+       croak "No value for get_sequence" unless $key;
+
+       my @cmd = split / /, "seqret -filter $key";
+
+       my $pid = open3(undef, \*RD, \*ERR, @cmd);
+
+       my $seqs = FASTA::File->new(read => \*RD);
+       my @err = <ERR>;
+
+       pop @err;
+       pop @err;
+
+       close RD; close ERR;
+       waitpid $pid, 0;
+       
+       # Everything was okay...
+       if (WIFEXITED($?) and not WEXITSTATUS($?)) { return $seqs }
+       # Non-zero exit
+       elsif (WIFEXITED($?) and WEXITSTATUS($?)) {
+               carp "seqret had a problem: $?; $!";
+               carp @err;
+       }
+       # Was it stopped by an external program
+       elsif (WIFSIGNALED($?)) {
+               carp "seqret halted by external signal ".WTERMSIG($?)
+       }
+       else {
+               carp "seqret suffered from a random pantwetting event"
+       }
+
+       return undef;
+}
+
+1;
diff --git a/jpred/lib/Index/FastaCMD.pm b/jpred/lib/Index/FastaCMD.pm

new file mode 100644 (file)

index 0000000..0d428b9
--- /dev/null
+++ b/jpred/lib/Index/FastaCMD.pm
@@ -0,0 +1,82 @@
+package Index::FastaCMD;
+
+our $VERSION = '0.2';
+
+=head1 NAME
+
+Index::FastaCMD - module to use the fastacmd program to read BLAST formatted databases
+
+=head1 DESCRIPTION
+
+This module was written as a replacement for Index::EMBOSS as EMBOSS needed to be statically linked to libgd, whcih wasn't a very sysadmin-friendly way of installing the software. As BLAST is pretty ubiquitous the fastacmd was deemed to be a better option. Copied heavily from Index::EMBOSS.
+
+=head1 SYNOPSIS
+
+=head1 CHANGES
+
+=over 8
+
+=item 0.2
+
+Added 'use Path' module to ensure that the correct fastacmd binary is used. Previously on the cluster the wrong fastacmd was used, which resulted in a failure. Now is more robust although the path to fastacmd is hardcoded in Paths.pm
+
+=head1 AUTHOR
+
+Chris Cole <christian@cole.name>
+
+=cut
+
+
+use strict;
+use warnings;
+
+
+use Carp;
+use POSIX qw(WIFEXITED WEXITSTATUS WIFSIGNALED WTERMSIG);
+use IPC::Open3;
+
+use base qw(Root);
+
+use FASTA::File;
+use Paths qw($fastacmd);
+
+sub get_sequence {
+   my ($self, $key, $db) = @_;
+   
+   my $appName = $fastacmd;
+   
+   croak "No sequence code given for get_sequence" unless $key;
+   croak "No database given for get_sequence" unless $db;
+   
+   my @cmd = split / /, "$appName -s $key -d $db";
+
+   my $pid = open3(undef, \*RD, \*ERR, @cmd);
+
+   my $seq = FASTA::File->new(read => \*RD);
+   my @err = <ERR>;
+
+   pop @err;
+   pop @err;
+
+   close RD; close ERR;
+   waitpid $pid, 0;
+   
+   # Everything was okay...
+   if (WIFEXITED($?) and not WEXITSTATUS($?)) { return $seq }
+   # Non-zero exit
+   elsif (WIFEXITED($?) and WEXITSTATUS($?)) {
+      carp "Command: '@cmd' had a problem: $?; $!";
+      carp @err;
+   }
+   # Was it stopped by an external program
+   elsif (WIFSIGNALED($?)) {
+      carp "$appName halted by external signal ".WTERMSIG($?)
+   }
+   else {
+      carp "$appName suffered from a random pantwetting event"
+   }
+
+   return undef;
+}
+
+1;
diff --git a/jpred/lib/Jpred.pm b/jpred/lib/Jpred.pm

new file mode 100755 (executable)

index 0000000..6209621
--- /dev/null
+++ b/jpred/lib/Jpred.pm
@@ -0,0 +1,154 @@
+package Jpred;
+
+=head1 NAME
+
+jpred.pm -- Jpred module to define all necessary global and environment variables
+
+=cut
+
+use warnings;
+use strict;
+
+BEGIN {
+       use Exporter;
+       our @ISA = ('Exporter');
+       our @EXPORT = qw($WEBSERVER $WEBSERVERCGI $SERVERROOT $SRSSERVER $CHKLOG $RESULTS $PDBLNK $CSS $IMAGES $JNET $TIMEOUT $BATCHLIM $DUNDEE $JPREDUSER $JPREDEMAIL $MAILHOST $JPREDROOT $BINDIR $LIBDIR $JOBDIR $PREFIX $RESOURCE $BLASTDB $SWALL $SWALLFILT $PDB $PDB_DAT $JPREDHEAD $JPREDFOOT &xsystem); 
+       our @EXPORT_OK = @EXPORT;
+}
+
+# URIs
+our $WEBSERVER = 'http://www.compbio.dundee.ac.uk/www-jpred';
+#our $WEBSERVER = 'http://webservices.compbio.dundee.ac.uk:3209';
+our $WEBSERVERCGI = "$WEBSERVER/cgi-bin";
+#$SERVERROOT = "$WEBSERVER/~www-jpred";
+our $SERVERROOT = "$WEBSERVER";
+our $SRSSERVER = 'http://srs.ebi.ac.uk/srs6bin/cgi-bin';
+our $CHKLOG = "$WEBSERVERCGI/chklog?";
+our $RESULTS = "$SERVERROOT/results";
+our $PDBLNK = "http://www.ebi.ac.uk/pdbsum/";
+our $CSS = "$SERVERROOT/jpred.css";
+our $IMAGES = "$SERVERROOT/images";
+our $JNET = "$SERVERROOT/about.html#jnet";
+
+# This is the time (in seconds) the job is allowed to take
+our $TIMEOUT = 60 * 60; # now is 60mins
+our $BATCHLIM = 200;
+
+our $DUNDEE = "http://www.dundee.ac.uk";
+
+# e-mail details
+our $JPREDUSER = 'www-jpred';
+our $JPREDEMAIL = 'www-jpred@compbio.dundee.ac.uk';
+our $MAILHOST = 'smtp.lifesci.dundee.ac.uk';   # CC 19/05/06 - updated to current smtp host from weevil
+
+
+# Server paths
+our $JPREDROOT = '/homes/www-jpred/live';
+
+# Directory for binaries either on the cluster or on the www server
+our $BINDIR = "$JPREDROOT/bin";
+
+# path to perl modules
+our $LIBDIR = "$JPREDROOT/lib";
+
+# Cluster paths
+our $JOBDIR = "$JPREDROOT/public_html/results";        # directory for output
+
+# Cluster names
+our $PREFIX = "jp_";           # Prefix for job submissions (qstat will only display 10 chars of job name)
+our $RESOURCE = "www_service2";        # Resource for the submission to use
+
+# Variables for external programs
+# psiblast
+$ENV{BLASTMAT} = "$JPREDROOT/data/blast";
+our $BLASTDB = $JPREDROOT."/databases";
+$ENV{BLASTDB} = $BLASTDB;
+our $SWALL = "$BLASTDB/uniref90";
+our $SWALLFILT = "$SWALL.filt";
+our $PDB = '/db/blastdb/pdb';
+our $PDB_DAT = '/db/blastdb/DB.dat';
+
+# ncoils matrix location
+$ENV{COILSDIR} = "$JPREDROOT/data/coils";
+
+# Error checking system call
+sub xsystem {
+       my ($command) = @_;
+       my $rc = 0xffff & system $command;
+       if ($rc == 0) {
+               return;
+       }
+       elsif ($rc == 0xff00) {
+               print "'$command' failed!\n";
+               die "Jpred failed\n";
+       }
+       elsif (($rc & 0xff) == 0) {
+               $rc >>= 8;
+               die "'$command' did something else! (exited $rc)\n";
+       }
+       else {
+               if ($rc & 0x80) {
+                       $rc &= ~0x80;
+                       print "'$command' dumped core!\n";
+                       die "Jpred failed\n";
+               }
+       }
+}
+
+# The header and footer for any HTML produced dynamically
+our $JPREDHEAD = "      <div id=\"header\">
+
+         <a href=\"$DUNDEE\">
+         <img class=\"uod\" src=\"$IMAGES/logo_white_small.gif\" height=\"100\" width=\"108\" title=\"University of Dundee\" alt=\"University of Dundee Logo\" />
+         </a>
+        <a href=\"$SERVERROOT/index.html\"> 
+       <img src=\"$IMAGES/jpred3.png\" border=\"0\" height=\"102\" width=\"520\" title=\"Jpred Logo\" alt=\"Jpred Logo\" />
+       </a>
+         <a href=\"http://www.compbio.dundee.ac.uk\">
+            <img src=\"$IMAGES/group.png\" border=\"0\" height=\"102\" width=\"70\" title=\"Group Link\" alt=\"The Barton Group\" />
+         </a>
+
+         <div id=\"navigation\">
+            <table border=\"0\" cellpadding=\"0\" width=\"100%\">
+               <tr>
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/index.html\">Home</a></td>
+
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/about.html\">About</a></td>
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/new.html\">News</a></td>
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/faq.html\">FAQ</a></td>
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/help.html\">Help</a></td>
+                  <td class=\"nav_table\"><a class=\"nav\" href=\"$SERVERROOT/contact.html\">Contact</a></td>
+               </tr>
+
+             </table>
+         </div> <!-- End #navigation -->
+      </div> <!-- End #header -->
+";
+
+our $JPREDFOOT = '
+      <script type="text/javascript">
+         var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+         document.write(unescape("%3Cscript src=\'" + gaJsHost + "google-analytics.com/ga.js\' type=\'text/javascript\'%3E%3C/script%3E"));
+      </script>
+      <script type="text/javascript">
+         try{
+            var pageTracker = _gat._getTracker("UA-5356328-1");
+            pageTracker._trackPageview();
+         } catch(err) {}
+      </script>
+';
+1;
+
+=head1 DESCRIPTION
+
+This module defines all the global and envirnmental variables required by the Jpred server scripts and binaries.
+
+=head1 AUTHOR
+
+I'm not sure who originally wrote this module, but I guess all of the following contributed:
+
+James Cuff
+Jon Barber
+Chris Cole <christian@cole.name> (current maintainer)
+
+=cut
diff --git a/jpred/lib/OC.pm b/jpred/lib/OC.pm

new file mode 100644 (file)

index 0000000..436285c
--- /dev/null
+++ b/jpred/lib/OC.pm
@@ -0,0 +1,163 @@
+package OC;
+
+use strict;
+use warnings;
+use Carp;
+use IPC::Open3;
+
+use base qw(Root Read Common);
+use Run qw(check);
+
+=head1 NAME
+
+OC - Read OC output 
+
+=head1 SYNOPSIS
+
+  $oc = OC->new;
+  $oc->read_file($path);
+
+  $oc->get_group(1); # Returns information about cluster 1
+  $oc->get_groups; # Returns all the groups
+
+=head1 DESCRIPTION
+
+This is a module to read OC output.
+
+=head1 $oc->read($fh);
+
+Reads an OC file from a filehandle.
+
+=cut
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+
+       while (<$fh>) {
+               chomp;
+               next if /^\s*$/;
+
+               if (s/^##\s*//) {
+                       my ($num, $score, $size) = split;
+                       my $nl = <$fh>;
+                       $nl =~ s/^\s+//;
+                       my (@labels) = map { chomp; $_ } split / /, $nl;
+
+                       # Add to self
+                       $self->add_group($num, $score, $size, @labels);
+               }
+               elsif (/^UNCLUSTERED ENTITIES$/) {
+                       my (@labels) = map { chomp; $_ } <$fh>;
+                       $self->add_unclustered(@labels);
+               }
+       }
+}
+
+=head $oc->add_group($number, $score, $size, @labels)
+
+Add information about a group.
+
+=cut
+
+sub add_group {
+       my ($self, $num, $score, $size, @labels) = @_;
+
+       for (qw(num score size)) {
+               croak "No value for $_ passed to add_group" unless eval "defined \$$_"
+       }
+
+       croak "No labels passed to add_group" unless @labels;
+
+       $self->{__PACKAGE__."score"}{$num} = $score;
+       $self->{__PACKAGE__."size"}{$num} = $size;
+       $self->{__PACKAGE__."labels"}{$num} = [ @labels ];
+}
+
+=head2 $oc->add_unclustered(@labels)
+
+Adds those entities that are unclustered.
+
+=cut
+
+sub add_unclustered {
+       my ($self, @labels) = @_;
+       $self->{__PACKAGE__."unclust"} = \@labels;
+}
+
+=head2 $oc->get_unclustered
+
+Returns those unclustered entities as a list.
+
+=cut
+
+sub get_unclustered {
+       my ($self) = @_;
+       return @{$self->{__PACKAGE__."unclust"}} if defined $self->{__PACKAGE__."unclust"};
+}
+
+=head2 ($score, $size, @labels) = $oc->get_group($n);
+
+Returns information about a particular group. If the group doesn't exist you get a warning message and undef.
+
+=cut
+
+sub get_group {
+       my ($self, $num) = @_;
+       if (defined $num) {
+               if (exists $self->{__PACKAGE__."score"}{$num}) {
+                       return 
+                               $self->{__PACKAGE__."score"}{$num},
+                               $self->{__PACKAGE__."size"}{$num},
+                               @{ $self->{__PACKAGE__."labels"}{$num} };
+               }
+               else {
+                       carp "No such group '$num'";
+                       return undef;
+               }
+       }
+       else {
+               croak "No value passed to get_group";
+       }
+}
+
+=head1 @info = $oc->get_groups;
+
+Returns information about all of the groups. This is an array of arrays, the second array holds group id and then the information returned by the get_group method.
+
+=cut
+
+sub get_groups {
+       my ($self) = @_;
+
+       map {
+               [ $_, $self->get_group($_) ]
+       } sort keys %{ $self->{__PACKAGE__."score"} };
+}
+
+=head1 $oc->run();
+
+=cut
+
+sub run {
+       my ($self, $file) = @_;
+       
+       local $/ = undef;
+       local $| = 1;
+
+       my $pid = open3(\*WRT, \*RD, \*ERR, $self->path);
+
+       print WRT $file;
+       close WRT;
+
+       my @output = join "\n", split "\n", <RD>;
+       close RD;
+
+       waitpid $pid, 0;
+       check($self->path, $?) or die "OC was naughty";
+
+       return @output;
+}
+
+1;
diff --git a/jpred/lib/PSIBLAST.pm b/jpred/lib/PSIBLAST.pm

new file mode 100644 (file)

index 0000000..04818b4
--- /dev/null
+++ b/jpred/lib/PSIBLAST.pm
@@ -0,0 +1,239 @@
+package PSIBLAST;
+
+use strict;
+use warnings;
+use Carp;
+use IO::String;
+
+use base qw(Root Read);
+
+=head1 NAME
+
+PSIBLAST - Yet Another PSIBLAST Parser
+
+=head1 SYNOPSIS
+
+  use PSIBLAST;
+
+  $psi = PSIBLAST->new;
+  $psi->read_file("psiblast.output");
+  $psi->get_all;
+
+=head1 DESCRIPTION
+
+A module to parse PSIBLAST output. It only grabs the last itteration, and there are simple methods to retrieve the ids of the sequences, the aligments and the start positions of the alignments.
+
+Now gets the scores for the last iteration as well.
+
+=head1 METHODS
+
+=head2 $psi->read(\*FH)
+
+Reads a PSIBLAST file from a filehandle.
+
+=cut
+
+sub read {
+       my ($self, $fh) = @_;
+
+       croak "No filehandle passed to method read" unless ref $fh eq 'GLOB';
+
+       # Make sure that $/ is a defined value
+       local $/ = "\n";
+
+       $self->_blastpgp_parse($fh);
+}
+
+sub _blastpgp_parse {
+       my ($self, $fh) = @_;
+
+       local $_;
+       my $version = <$fh>;
+       my $pos;
+
+       #die "Wrong version of BLASTP" if $version !~ /BLASTP 2.2.6 \[Apr-09-2003\]/;
+
+       # Find the position of the last itteration
+       while (<$fh>) {
+               <$fh>, $pos = tell $fh if /^Sequences used in model and found again:/
+       }
+
+       # and make sure we found it...
+       unless ($pos) { $self->{psi} = undef, return }
+       seek $fh, $pos, 0;
+
+       # Now read in the last itteration
+       # @alignments holds the lines containing the aligment
+       # @order is a list of the IDs
+       # $flag indicates that the ID's have been collected
+       #
+       # All of this has to be list based as there may be multiple local
+       # alignments against the same sequence
+       my (@alignments, @order, $flag);
+
+       while (<$fh>) {
+               # Have we reached the end of the alignment
+               last if /^\s*Database: /;
+
+               # Skip babble
+               next if /^Sequences not found previously or not previously below threshold:/;
+               #next if /^\s*$/;
+
+               chomp;
+               # Capture E scores and check to see if we've reached the start of the alignment
+               unless (/^QUERY\s*/ or @alignments) {
+                       next if /^$/;
+                       my ($id, $bit, $e) = unpack "A70A3A*", $_;
+                       ($id) = (split /\s+/, $id)[0];
+                       s/\s*//g for $bit, $e;
+
+                       # Store the scores
+                       $self->{$id}{bit} = $bit;
+                       $self->{$id}{evalue} = $e;
+                       next;
+               }
+
+               # Have we reached the end of the first block of the
+               # alignment
+               $flag = 1, next if /^$/;
+
+               # Add IDs if we're still in the first block
+               push @order, (split)[0] unless $flag;
+               # Add alignment lines
+               push @alignments, $_;
+       }
+       
+       $self->size(scalar @order);
+
+       # Add sequences and start positions to self 
+       for (0..$#alignments) {
+               my $i = $_ % @order;
+               my @fields = split ' ', $alignments[$_], 4;
+
+               # blastpgp output allways has a ID at the begining of the line, but can
+               # have a variable number of fields after that. e.g.:
+# $seqid $start $seq                                                         $end
+# O24727        ------------------------------------------------------------
+# O24727   0    ------------------------------------------------------------
+# O24727   26   -Y--A---A-----L-----R----S-------L---------G-------P----V--- 35
+
+               if (@fields == 4 or @fields == 3) {
+                       $self->{psi}{$i}{seq} .= $fields[2];
+                       $self->{psi}{$i}{start} = $fields[1]
+                               unless defined $self->{psi}{$i}{start};
+               }
+               elsif (@fields == 2) { $self->{psi}{$i}{seq} .= $fields[1] }
+               else {
+                       croak "Fatal error! Wrong number of fields in BLAST alignment:\n$alignments[$_]\n"
+               }
+       }
+       
+       # Add IDs
+       for (0..$#order) {
+               $self->{psi}{$_}{id} = $order[$_];
+       }
+
+       1;
+}
+
+=head2 @ids = $psi->get_ids
+
+The IDs that were in the last iteration in the order they appeared.
+
+=cut
+
+sub get_ids {
+       my ($self) = @_;
+       #return map { $self->{psi}{$_}{id} } sort { $a <=> $b } keys %{$self->{psi}}
+       return map { $self->{psi}{$_}{id} } 0..$self->size - 1;
+}
+
+=head2 @alignments = $psi->get_alignments
+
+The alignment from the last iteration in the same order as they appeared.
+
+=cut
+
+sub get_alignments {
+       my ($self) = @_;
+       #return map { $self->{psi}{$_}{seq} } sort { $a <=> $b } keys %{$self->{psi}};
+       return map { $self->{psi}{$_}{seq} } 0..$self->size - 1;
+}
+
+=head2 @starts = $psi->get_starts
+
+The start position of the first residue from the alignments, returned in the same order as they appeared.
+
+=cut
+
+sub get_starts {
+       my ($self) = @_;
+       return map { $self->{psi}{$_}{start} } sort { $a <=> $b } keys %{$self->{psi}};
+}
+
+=head2 @all = $psi->get_all
+
+Returns an array of arrays containing the ID, start position and alignment from the PSIBLAST output in the order they appeared.
+
+=cut
+
+sub get_all {
+       my ($self) = @_;
+
+       my @ids = $self->get_ids;
+       my @starts = $self->get_starts;
+       my @alignments = $self->get_alignments;
+
+       return map { [ $ids[$_], $starts[$_], $alignments[$_] ] } 0..$#ids
+}
+
+=head2 $psi->evalue($label)
+
+=cut
+
+sub evalue {
+       my ($self, $label) = @_;
+       defined $label or confess "No label passed\n";
+       exists $self->{evalue}{$label} or warn "No such label '$label'\n" and return undef;
+       return $self->{evalue}{$label};
+}
+
+=head2 $psi->bit($label)
+
+=cut
+
+sub bit {
+       my ($self, $label) = @_;
+       defined $label or confess "No label passed\n";
+       exists $self->{bit}{$label} or warn "No such label '$label'\n" and return undef;
+       return $self->{bit}{$label};
+}
+
+=head2 $psi->size
+
+Returns the number of sequences in the alignments.
+
+=cut
+
+sub size {
+       my ($self, $size) = @_;
+
+       unless (defined $size) {
+               exists $self->{size} and return $self->{size};
+       }
+       else {
+               $self->{size} = $size;
+       }
+}
+
+=head2 BUGS
+
+Doesn't aquire any other information.
+
+=head2 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+=cut
+
+1;
diff --git a/jpred/lib/PSIBLAST/PSSM.pm b/jpred/lib/PSIBLAST/PSSM.pm

new file mode 100644 (file)

index 0000000..6093d20
--- /dev/null
+++ b/jpred/lib/PSIBLAST/PSSM.pm
@@ -0,0 +1,149 @@
+package PSIBLAST::PSSM;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root Read Write);
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+
+       my $xsum = 0;
+       while (my $line = <$fh>) {
+               if ($line =~ /^\s{11}A  R  N  D  C  Q  E/) {
+                       while ($line = <$fh>) {
+                               chomp;
+                               last if $line =~ /^\s*$/;
+
+                               my @pssm = split /\s+/, $line;
+
+                               my @data;
+                               for (@pssm[3..22]) {
+                                       push @data, sprintf "%2.8f", 1 / (1 + (exp(- $_)));
+                                       $xsum += $_;
+                               }
+
+                               $self->add_pos(@data);
+                       }
+               }
+       }
+
+}
+
+sub add_pos {
+       my ($self, @data) = @_;
+
+       @data == 20 or croak "add_pos passed wrong length array";
+
+       push @{ $self->{__PACKAGE__."pos"} }, \@data;
+}
+
+{
+       my $i = 0;
+
+       sub get_pos {
+               my ($self) = @_;
+
+               if (defined (my $foo = $self->get_num_pos($i++))) {
+                       return $foo
+               }
+               else {
+                       $i = 0;
+                       return undef;
+               }
+       }
+}
+
+sub seq {
+       my ($self, $data) = @_;
+
+       if ($data) {
+               confess "Not passed ARRAY" unless ref $data eq 'ARRAY';
+               $self->add_pos(@{ $data });
+       }
+       else {
+               my $size = $self->get_pos_size - 1;
+               $size or return ();
+               return map { $self->get_num_pos($_) } 0..$size;
+       }
+}
+
+sub get_num_pos {
+       my ($self, $pos) = @_;
+       confess "No positions passed to get_num_pos" unless defined $pos;
+       return undef unless exists $self->{__PACKAGE__."pos"};
+       return ${ $self->{__PACKAGE__."pos"} }[$pos];
+}
+
+sub get_pos_size {
+       my ($self) = @_;
+       exists $self->{__PACKAGE__."pos"} ?
+               return scalar @{ $self->{__PACKAGE__."pos"} } :
+               0;
+}
+
+sub write {
+       my ($self, $fh) = @_;
+       while (my $pos = $self->get_pos) {
+               print $fh join(" ", @{$pos}), "\n"
+       }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+PSIBLAST::PSSM - Read PSIBLAST PSSM files
+
+=head1 EXAMPLE
+
+ my $pssm = PSIBLAST::PSSM->new;
+ $pssm->read_file("data.pssm");
+
+ for ($pssm->seq) {
+       print join(" ", @{$_}), "\n";
+ }
+
+=head1 DESCRIPTION
+
+Class for reading PSIBLAST PSSM files, subclasses the Root, Read and Write classes.
+
+=head1 METHODS
+
+=head2 my $pssm = PSIBLAST::PSSM->new()
+
+Create the object. See Root module for details.
+
+=head2 $pssm->read_file("path_to_file");
+
+Read in a PSIBLAST PSSM file. See Read module for other methods.
+
+=head2 $pssm->add_pos(@data);
+
+Adds data from PSIBLAST PSSM.
+
+=head2 $pssm->get_pos;
+
+Itterator for returning the data for each position loaded in the object.
+
+=head2 $pssm->write;
+
+Prints the PSSM file for use by Jnet.
+
+=head2 $pssm->seq([ @data ] );
+  $data = $pssm->seq;
+
+Gah! Deprecated.
+
+=head1 SEE ALSO
+
+Root, Read and Write modules.
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
diff --git a/jpred/lib/PSIBLAST/PSSM/Jnet.pm b/jpred/lib/PSIBLAST/PSSM/Jnet.pm

new file mode 100644 (file)

index 0000000..d5db271
--- /dev/null
+++ b/jpred/lib/PSIBLAST/PSSM/Jnet.pm
@@ -0,0 +1,19 @@
+package PSIBLAST::PSSM::Jnet;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(PSIBLAST::PSSM);
+
+sub read {
+       my ($self, $fh) = @_;
+       local $/ = "\n";
+
+       while (<$fh>) {
+               chomp;
+               $self->add_pos(split);
+       }
+}
+
+1;
diff --git a/jpred/lib/PSIBLAST/Run.pm b/jpred/lib/PSIBLAST/Run.pm

new file mode 100644 (file)

index 0000000..99188e1
--- /dev/null
+++ b/jpred/lib/PSIBLAST/Run.pm
@@ -0,0 +1,72 @@
+package PSIBLAST::Run;
+
+use Carp;
+use base qw(Root);
+use Run qw(check);
+
+sub new {
+       my ($class, %args) = @_;
+       my $self = {};
+       bless $self, ref($class) || $class;
+
+       # -a number of CPUs used (default 1)
+       # -e e-value threshold for reporting sequences (default 10)
+       # -h e-value threshold for including sequences in PSSM (default 0.002)
+       # -m type of output
+       # -b max number of alignments reported (default 250)
+       # -v max number of sequences described (default 500)
+       # -j max number of itterations (default 1)
+       $self->args("-e0.05 -h0.01 -m6 -b10000 -v10000 -j3");
+       #$self->args("-a2 -e0.05 -h0.01 -m6 -b20000 -v20000 -j15");
+#      $self->BLASTMAT("/software/jpred_bin/blast");
+#      $self->BLASTDB("/software/jpred_uniprot_all_filt");
+
+       # Automatically run any arguments passed to the constructor
+       for (keys %args) {
+               croak "No such method '$_' of object $class" unless $self->can($_);
+               $self->$_($args{$_});
+       }
+
+       return $self;
+}
+
+sub path { defined $_[1] ? $_[0]->{path} = $_[1] : $_[0]->{path} }
+sub args { defined $_[1] ? $_[0]->{args} = $_[1] : $_[0]->{args} }
+sub database { defined $_[1] ? $_[0]->{data} = "-d $_[1]" : $_[0]->{data} }
+sub input { defined $_[1] ? $_[0]->{input} = "-i $_[1]" : $_[0]->{input} }
+sub output { defined $_[1] ? $_[0]->{output} = "-o $_[1]" : $_[0]->{output} }
+sub matrix { defined $_[1] ? $_[0]->{matrix} = "-Q $_[1]" : $_[0]->{matrix} }
+sub debug { defined $_[1] ? $_[0]->{debug} = $_[1] : $_[0]->{debug} }      # CC corrected - now works as expected. Before, debug was always on.
+sub BLASTMAT { $ENV{BLASTMAT} = $_[1] }
+sub BLASTDB { $ENV{BLASTDB} = $_[1] }
+
+sub run {
+       my ($self) = @_;
+
+       # Required arguments
+       for (qw(path output database input)) {
+               croak "Method '$_' needs to be set" unless defined $self->$_;
+       }
+
+       # Construct the command line
+       my @cmd;
+       for (qw(path args database input matrix output)) {
+               next unless $self->$_;
+               push @cmd, $self->$_;
+       }
+
+       my $cmd = join " ", @cmd;
+       # Execute PSIBLAST and check it ran okay
+       if ($self->debug) {
+               #for (keys %ENV) { warn "$_=$ENV{$_}\n" }
+               warn "$cmd\n"
+       }
+
+       system($cmd) == 0 or check("blastpgp", $?) and die "blastpgp was naughty";
+
+       # Clean up the error.log file it produces
+       if (-z "error.log") { unlink "error.log" }
+       else { warn "blastpgp error.log file was not empty\n" }
+}
+
+1;
diff --git a/jpred/lib/Pairwise.pm b/jpred/lib/Pairwise.pm

new file mode 100644 (file)

index 0000000..d6d32bd
--- /dev/null
+++ b/jpred/lib/Pairwise.pm
@@ -0,0 +1,42 @@
+package Pairwise;
+
+use strict;
+use warnings;
+use Carp;
+use IPC::Open3;
+use UNIVERSAL qw(isa);
+use File::Temp;
+use base qw(Root Common);
+
+use Run qw(check);
+
+sub path {
+       my ($self, $path) = @_;
+       if (defined $path) { $self->{path}= $path }
+       else {
+               if (defined $self->{path}) { return $self->{path} }
+               else { croak "Path not defined" }
+       }
+}
+
+sub run {
+       my ($self, $fasta) = @_;
+
+       croak "Non FASTA::File object passed to Pairwise::run" unless isa $fasta, 'FASTA::File';
+
+       local ($/, $?) = (undef, 0);
+
+       my $f = File::Temp->new->filename;
+       $fasta->write_file($f);
+
+       my $pid = open my $fh, $self->path." $f |" or die $!;
+
+       my @output = join "\n", split "\n", <$fh>;
+
+       waitpid $pid, 0;
+       check($self->path, $?) or die "Pairwise was naughty\n";
+
+       return @output;
+}
+
+1;
diff --git a/jpred/lib/Paths.pm b/jpred/lib/Paths.pm

new file mode 100644 (file)

index 0000000..04fab10
--- /dev/null
+++ b/jpred/lib/Paths.pm
@@ -0,0 +1,53 @@
+package Paths;
+
+use strict;
+use warnings;
+use base qw(Exporter);
+use Sys::Hostname ();
+
+=head1 NAME
+
+Paths - Sets paths for executable programs
+
+=head1 DESCRIPTION
+
+This module gathers together all of the little pieces of information that would other wise be floating around in all of the other modules that run external programs. Namely, the path to the executable and nessecery environment variables. 
+
+Putting it all here should mean that this is the only file that needs updating when their location changes, or it's redeployed. Plus some degree of automagic can be used to try and detect changes.
+
+=cut
+
+our @EXPORT_OK = qw($ff_bignet $analyze $batchman $sov $pairwise $oc $jnet $fastacmd $hmmbuild $hmmconvert $psiblastbin);
+
+my $HOME = $ENV{HOME};
+my $BINDIR = '/homes/www-jpred/live/jpred/bin';
+
+our $pairwise  = "$BINDIR/pairwise";  # CC modified for correct path
+our $oc        = "$BINDIR/oc";
+our $jnet      = "$BINDIR/jnet";
+our $fastacmd  = "$BINDIR/fastacmd";              # CC added to avoid failure on cluster
+our $hmmbuild  = "$BINDIR/hmmbuild";
+our $hmmconvert = "$BINDIR/hmmconvert";
+our $psiblastbin  = "$BINDIR/blastpgp";
+
+# required for training with SNNS, but unused currently
+our $ff_bignet = "$HOME/projects/Jnet/bin/snns/ff_bignet";  # CC modified for new path (SNNS app)
+our $analyze   = "$HOME/projects/Jnet/bin/snns/analyze";    # CC modified for new path (SNNS app)
+our $batchman  = "$HOME/projects/Jnet/bin/snns/batchman";   # CC modified for new path (SNNS app)
+our $sov       = "$HOME/projects/Jnet/bin/sov";
+
+
+=head1 AUTOMATED CHANGES
+
+Currently the paths are altered on the basis of per host rules.
+
+=cut
+
+#my $host = Sys::Hostname::hostname();
+#if (grep { $host =~ $_ } qr/^(anshar|kinshar)/o, qr/^cluster-64-/o) {
+#      for my $temp (@EXPORT_OK) {
+#              eval "$temp =~ s#$HOME\/tmp#$HOME#g";
+#      }
+#}
+
+1;
diff --git a/jpred/lib/Profile.pm b/jpred/lib/Profile.pm

new file mode 100644 (file)

index 0000000..6990921
--- /dev/null
+++ b/jpred/lib/Profile.pm
@@ -0,0 +1,110 @@
+package Profile;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root Read Write);
+
+=head1 NAME
+
+Profile - Base object for various types of profile
+
+=head1 SYNOPSIS
+
+  use Profile;
+
+  my $prof = Profile->new;
+
+=head1 DESCRIPTION
+
+Basic methods for storing information sequence profiles. The model of this object is for each sequence position to have an array of data associated with it.
+
+=head1 METHODS
+
+Inherits from Root, Read and Write.
+
+=head2 $prof->add_pos(@data)
+
+Add data to the end position of the profile.
+
+=cut
+
+sub add_pos {
+       my $self = shift;
+       @_ or croak "No data passed to ".__PACKAGE__."\n";
+       push @{ $self->{__PACKAGE__."lines"} }, pack "d*", @_;
+}
+
+sub get_aoa {
+       my ($self) = @_;
+       map { [ $self->get_pos ] } 0.. $self->get_size - 1;
+}
+
+=head2 $prof->add_line(@data)
+
+Alias for add_pos().
+
+=cut
+
+*add_line = *add_pos;
+
+=head2 @data $prof->get_pos
+
+Itterator for getting all the positions in a profile. Returns the same information as get_num_pos, or undef when it reaches the end of the profile. When this happens, a new call starts at the begining of the profile.
+
+=cut
+
+sub get_pos {
+       my ($self) = @_;
+       $self->{__PACKAGE__."counter"} ||= 0;
+
+       if ((my @foo = $self->get_num_pos( $self->{__PACKAGE__."counter"}++ )) > 0) {
+               return @foo;
+       }
+       else {
+               $self->{__PACKAGE__."counter"} = 0;
+               return undef;
+       }
+}
+
+=head2 @data = $prof->get_num_pos($position)
+
+Access the profile information for a particular position in the profile.
+
+=cut
+
+sub get_num_pos {
+       my ($self, $pos) = @_;
+
+       confess "get_num_pos didn't receive position" unless defined $pos;
+
+       #use Data::Dumper;
+       #print $pos, "\n";
+       #print Dumper unpack "d*", @{ $self->{__PACKAGE__."lines"}->[$pos]};
+
+       confess "No positions present" unless exists $self->{__PACKAGE__."lines"};
+       return undef unless defined $self->{__PACKAGE__."lines"}->[$pos];
+       return unpack "d*", $self->{__PACKAGE__."lines"}->[$pos];
+}
+
+=head2 $prof->get_size
+
+Returns the number of positions in the profile.
+
+=cut
+
+sub get_size {
+       my ($self) = @_;
+       return @{ $self->{__PACKAGE__."lines"} };
+}
+
+=head2 $prof->get_line($position)
+
+Alias for get_num_pos().
+
+=cut
+
+*get_line = *get_num_pos;
+
+1;
diff --git a/jpred/lib/Profile/Generic.pm b/jpred/lib/Profile/Generic.pm

new file mode 100644 (file)

index 0000000..cb8cf7b
--- /dev/null
+++ b/jpred/lib/Profile/Generic.pm
@@ -0,0 +1,49 @@
+package Profile::Generic;
+
+use strict;
+use warnings;
+use base qw(Profile);
+
+=head1 NAME
+
+Profile::Generic
+
+=head1 DESCRIPTION
+
+Reads in a generic profile, where each position is represented by a line, and each piece of information is white space deliminated.
+
+=head1 INHERITS
+
+Profile.
+
+=head1 METHODS
+
+=head2 $obj->read($filehandle)
+
+Reads in the profile contained in the file.
+
+=cut
+
+sub read {
+       my ($self, $fh) = @_;
+       while (<$fh>) {
+               chomp;
+               $self->add_pos(split);
+       }
+}
+
+=head2 $obj->write($filehandle);
+
+Writes WS seperated profile, with each position on each line.
+
+=cut
+
+sub write {
+       my ($self, $fh) = @_;
+
+       while ((my @data = $self->get_pos) > 1) {
+               print $fh " ".join("  ", @data), "\n";
+       }
+}
+
+1;
diff --git a/jpred/lib/Profile/Jnet.pm b/jpred/lib/Profile/Jnet.pm

new file mode 100644 (file)

index 0000000..bf91431
--- /dev/null
+++ b/jpred/lib/Profile/Jnet.pm
@@ -0,0 +1,64 @@
+package Profile::Jnet;
+
+use strict;
+use warnings;
+use Carp;
+use UNIVERSAL qw(isa);
+
+use base qw(Profile);
+use Utils qw(profile);
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+
+       while (<$fh>) {
+               chomp;
+               $self->add_pos(split);
+       }
+}
+
+=head2 $prof->jpred_profile(@Sequence) 
+
+Creates a PSIBLAST like profile for Jpred.
+
+=cut
+
+sub jpred_profile {
+       my ($self, @seqs) = @_;
+       croak "Not passed Sequence objects" if grep { not isa $_, 'Sequence' } @seqs;
+       $self->_check_seq_length(@seqs) or croak "Not passed sequences of equal length\n";
+
+       my @profile = profile( map { join "", $_->seqs } @seqs );
+
+       for (@profile) {
+               $self->add_pos(@{$_});
+       }
+}
+
+=for private
+
+=head2 _check_seq_length(@Sequence);
+
+Checks that we've been passed PSISEQ objects and that they are the same length. Returns undef if their not and warns, otherwise returns the length of the sequence.
+
+=cut
+
+sub _check_seq_length {
+       undef = shift @_;
+
+       my %lengths;
+       for (@_) {
+               $lengths{ $_->seq } = 1;
+       }
+
+       if (keys %lengths != 1) {
+               warn "The sequences are of different lengths";
+               return undef;
+       }
+
+       return (keys %lengths)[0];
+}
+
+1;
diff --git a/jpred/lib/Profile/Scanps.pm b/jpred/lib/Profile/Scanps.pm

new file mode 100644 (file)

index 0000000..963b3c5
--- /dev/null
+++ b/jpred/lib/Profile/Scanps.pm
@@ -0,0 +1,9 @@
+package Profile::Scanps;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Profile::Generic);
+
+1;
diff --git a/jpred/lib/Profile/Scanps/Result.pm b/jpred/lib/Profile/Scanps/Result.pm

new file mode 100644 (file)

index 0000000..90be61d
--- /dev/null
+++ b/jpred/lib/Profile/Scanps/Result.pm
@@ -0,0 +1,54 @@
+package Profile::Scanps::Result;
+
+use strict;
+use warnings;
+use Carp;
+use base qw(Profile::Generic);
+use Scanps::Result;
+
+sub read {
+       my ($self, $fh) = @_;
+       my $scanps = Scanps::Result->new($fh);
+
+       # Get third, or last iteration, which ever is smaller
+       my $it = $scanps->iterations - 1;
+       $it > 2 and $it = 2; # iterations is zero based
+       my @positions = $scanps->profile($it);
+
+       for (@positions) {
+               my (undef, @data) = split /\s+/, $_;
+               $self->add_pos(@data[0..22]);
+       }
+}
+
+sub write {
+       my ($self, $fh) = @_;
+
+       while ((my @data = $self->get_pos) > 1) {
+               print $fh map({ sprintf "%2.5f ", 1 / (1 + exp(-$_/100)) } @data), "\n";
+               #print $fh join(" ", map  $_ / 100 } @data), "\n";
+               #print $fh join(" ", @data), "\n";
+       }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Profile::Scanps::Result - Reads a profile from a Scanps result file
+
+=head1 DESCRIPTION
+
+Reads the profile that can be provided at the end of a Scanps file.
+
+=head1 INHERITANCE
+
+Inherits from Profile and Generic modules.
+
+=head1 METHODS
+
+=head2 read()
+
+=cut
diff --git a/jpred/lib/Quality.pm b/jpred/lib/Quality.pm

new file mode 100644 (file)

index 0000000..d88e810
--- /dev/null
+++ b/jpred/lib/Quality.pm
@@ -0,0 +1,242 @@
+package Quality;
+
+use strict;
+use warnings;
+use Carp;
+#use Hash::Util qw(lock_keys);
+use POSIX qw(floor);
+use File::Temp qw(tempfile);
+
+use Paths qw($sov);
+use SOV;
+
+use base qw(Exporter);
+our @EXPORT_OK = qw(q3 sov sov_wrapper matthews);
+our %EXPORT_TAGS = (all => [ @EXPORT_OK ]);
+
+=head1 NAME 
+
+Quality.pm - Module for assessing the quality of a secondary structure prediction.
+
+=head1 SYNOPSIS
+
+ use Quality.pm qw(:all)
+
+ my $pred_seq = "HHHHEEEELLLL";
+ my $obs_seq  = "HEHHEEEELELL";
+
+ print q3($pred_seq, $obs_seq);  # Should print 0.8333
+
+ print sov($pred_seq, $obs_seq); # Should print 
+
+ print sov_wrapper($pred_seq, $obs_seq);
+
+=head1 DESCRIPTION
+
+A collection of functions for calculating the accuracy of a particular secondary structure predicion. Provided are functions for calculating Q3, Segment overlap and the Matthews correlation.
+
+=head2 $q3_score = q3($predicted_sequence, $observed_sequence)
+
+Both sequences should be scalars of equal length.
+
+Passing an optional third argument (a single character) indicates that you don't want the Q3 score but the Q-index score for a particular state.
+
+=cut
+
+sub q3 ($$;$) {
+       my ($pred, $obs, $i) = @_;
+
+       croak "Undefined parameter" if grep { not defined $_ } $pred, $obs;
+
+       my @pred = split //, $pred;
+       my @obs = split //, $obs;
+
+       croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
+       croak "Sequences are of zero length" unless @pred;
+
+       if ($i) {
+               my $correct = grep { $pred[$_] eq $i and $pred[$_] eq $obs[$_]  } 0..$#pred;
+               my $number = grep { $_ eq $i } @obs;
+               return $number ? $correct / $number : 1;
+       }
+       else {
+               my $correct = grep { $pred[$_] eq $obs[$_] } 0..$#pred;
+               return $correct / @pred;
+       }
+}
+
+=head1 $sov = sov($pred, $obs);
+
+A pure perl implementation of the SOV algorithim as found in the C SOV program
+from CASP. Not finished.
+
+=cut
+
+# Used in sov()
+sub max_min {
+       my ($one, $two) = @_;
+       return $one > $two ? ($one, $two) : ($two, $one);
+}
+
+sub sov ($$;$) {
+       my ($pred, $obs, $state) = @_;
+
+       die "This function has not been error checked yet";
+
+       croak "Undefined parameter" if grep { not defined $_ } $pred, $obs;
+
+       my @pred = split //, $pred;
+       my @obs = split //, $obs;
+
+       croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
+       croak "Sequences are of zero length" unless @pred;
+
+       my (%seg_one, %seg_two);
+       #lock_keys %seg_one, qw(start state end length);
+       #lock_keys %seg_two, qw(start state end length);
+       my ($sov_method, $sov_delta, $sov_delta_s) = (1, 1, 0.5);
+
+       my $number = $state ? grep { $state eq $_ } 0..$#obs : @obs;
+       return 1 unless $number;
+
+       my ($s, $n, $i) = (0, 0, 0);
+       while ($i < $#pred) {
+
+               ($seg_one{start}, $seg_one{state}) = ($i, $obs[$i]);
+               $i++ while $obs[$i] eq $seg_one{state} and $i < $#pred;
+               ($seg_one{end}, $seg_one{length}) = ($i - 1, $i - $seg_one{start});
+
+               my ($multiple, $k) = (0, 0);
+               while ($k < $#pred) {
+
+                       ($seg_two{start}, $seg_two{state}) = ($k, $pred[$k]);
+                       $k++ while $pred[$k] eq $seg_two{state} and $k < $#pred;
+                       ($seg_two{end}, $seg_two{length}) = ($k - 1, $k - $seg_two{start});
+
+                       if ($state ? $seg_one{state} eq $state : 1) {
+                               if (
+                                       $seg_one{state} eq $seg_two{state} and
+                                       $seg_two{end}   >= $seg_one{start} and
+                                       $seg_two{start} <= $seg_one{start}
+                               ) {
+                                       $n += $seg_one{length} if $multiple > 0 and $sov_method == 1;
+
+                                       my ($j1, $j2) = max_min $seg_one{start}, $seg_two{start};
+                                       my ($k1, $k2) = reverse max_min $seg_one{end}, $seg_two{end};
+
+                                       my ($minov, $maxov) = ($k1 - $j1 + 1, $k2 - $j2 + 1);
+                                       my ($d1, $d2) = map { floor $_ * $sov_delta_s } $seg_one{length}, $seg_two{length};
+
+                                       my $d;
+                                       if ($d1 > $d2) { $d = $d2 }
+                                       elsif ($d1 <= $d2 or $sov_method == 0) { $d = $d1 }
+
+                                       if ($d > $minov) { $d = $minov }
+                                       if ($d > $maxov - $minov) { $d = $maxov - $minov }
+
+                                       my $x = ($minov + $sov_delta * $d) * $seg_one{length};
+
+                                       if ($maxov > 0) { $s += $x / $maxov }
+                                       else { carp "Negative maxov\n" }
+                               }
+                       }
+               }
+       }
+
+       return $s / $number;
+}
+
+=head1 $sov = sov_wrapper($aa, $pred, $obs);
+
+This is a wrapper to the CASP SOV C program.
+
+All three arguments are required, the first is the amino acid sequence, the
+second is the predicted sequence, the third is the observed sequence.
+
+A object of class SOV will be returned on success.
+
+=cut
+
+sub sov_wrapper ($$$) {
+       my ($seq, $pred, $obs) = @_;
+
+       croak "Sequences of different length" if grep { length $_ != length $seq } $obs, $pred;
+
+       map { y/-/C/ } $obs, $pred;
+
+       my $file = "AA  OSEC PSEC\n";
+       $file .= join "\n", map {
+               substr($obs, $_, 1).'   '.
+               substr($obs, $_, 1).'    '.
+               substr($pred, $_, 1)
+       } 0..length $obs;
+
+       my $sovres = SOV->new;
+       {
+               my ($fh, $fn) = tempfile;
+               print $fh $file;
+               close $fh;
+
+               local $/ = undef;
+               my $pid = open my $rdr, "$sov $fn|" or die $!;
+
+               $sovres->read($rdr);
+               close $rdr;
+               unlink $fn;
+       }
+
+       return $sovres;
+}
+
+=head1 $matthews = matthews($pred, $obs)
+
+Calculates the Matthews coffeicent of a predicted secondary structure.
+
+=cut
+
+sub matthews ($$;$) {
+       my ($pred, $obs, $i) = @_;
+
+       croak "Undefined parameter" if grep { not defined $_ } $pred, $obs, $i;
+
+       my @pred = split //, $pred;
+       my @obs = split //, $obs;
+
+       croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
+       croak "Sequences are of zero length" unless @pred;
+
+       my ($p, $n, $o, $u) = (0, 0, 0, 0);
+       for (0..$#pred) {
+
+               # Correct prediction
+               if ($pred[$_] eq $obs[$_]) {
+                       # True positive
+                       if ($pred[$_] eq $i) { $p++ }
+                       # True negative
+                       else { $n++ }
+               }
+
+               # Incorrect prediction
+               else {
+                       # False positive
+                       if ($pred[$_] eq $i) { $o++ }
+                       # False negative
+                       else { $u++ }
+               }
+       }
+
+       return ( ($p * $n) - ($u * $o) ) /
+               sqrt( ($n + $u) * ($n + $o) * ($p + $u) * ($p + $o) );
+}
+
+=head1 SEE ALSO
+
+http://predictioncenter.llnl.gov/casp3/results/ab-measures.html
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+=cut
+
+1;
diff --git a/jpred/lib/Read.pm b/jpred/lib/Read.pm

new file mode 100644 (file)

index 0000000..72e333b
--- /dev/null
+++ b/jpred/lib/Read.pm
@@ -0,0 +1,79 @@
+package Read;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Common);
+
+use IO::String;
+use File::Temp;
+
+=head1 NAME
+
+Read - Two base methods for reading information.
+
+=head1 DESCRIPTION
+
+This module contains three methods for reading information into a program. They allow the reading of information from a string or filename and expect the method read() to be defind by the class that inherits this module.
+
+=head1 METHODS
+
+=head2 read_file($path)
+
+Opens a file at the given path and then calls the read method.
+
+=head2 read_file_gzip($path)
+
+Calls read_file after decompressing a gzip compressed file using the system gzip command.
+
+=head2 path($path)
+
+Accessor for finding where a file was located.
+
+=head2 read_string($scalar)
+
+Reads the data in the scalar and passes it to the read method.
+
+=head2 read($filehandle)
+
+This method will cause a fatal error unless it's overidden by the class that inherits this module.
+
+=cut
+
+sub read {
+       confess "The inheriting package hasn't defined the read() method\n"
+}
+
+sub read_file {
+       my ($self, $fn) = @_;
+
+       open my $fh, $fn or confess "Can't open file $fn: $!";
+       $self->path($fn);
+       $self->read($fh);
+}
+
+=head2 read_gzip_file($scalar);
+
+Like read_file($scalar), but ungzip's the file first.
+
+=cut
+
+sub read_gzip_file {
+       my ($self, $fn) = @_;
+
+       my $gzipd_fn = File::Temp->new->filename;
+
+       system("gzip -dc $fn > $gzipd_fn");
+
+       $self->read_file($gzipd_fn);
+}
+
+sub read_string {
+       my ($self, $string) = @_;
+
+       my $fh = IO::String->new($string);
+       $self->read($fh);
+}
+
+1;
diff --git a/jpred/lib/Root.pm b/jpred/lib/Root.pm

new file mode 100644 (file)

index 0000000..e25c776
--- /dev/null
+++ b/jpred/lib/Root.pm
@@ -0,0 +1,56 @@
+package Root;
+
+use strict;
+use warnings;
+use Carp;
+
+# CC This lib path is unnecessary - commented out as supercedes the 'lib' use in jpred itself.
+#use lib qw(/homes/jon/cvs/jon/jpred/src /homes/jon/usr/lib/perl5);
+
+=head1 NAME 
+
+Root - Base module for new classes to inherit from
+
+=head1 DESCRIPTION
+
+This modules provides a new method for other classes to use if they so wish.
+
+=head1 METHODS
+
+=head2 new(foo => "bar")
+
+This constructs an object of the right class and returns it. Any arugments passed to the constructer will be parsed as a hash with the first argument from the pair acting as the method that should be called and the second being the argument for that method.
+
+=cut
+
+sub new {
+       my ($class, %args) = @_;
+       my $self = bless {}, ref($class) || $class;
+
+       for (keys %args) {
+               croak "No such method '$_'" unless $self->can($_);
+               $self->$_($args{$_});
+       }
+
+       return $self;
+}
+
+sub trace {
+       my ($self) = @_;
+
+       my $i = 1;
+
+       while (my @caller = caller $i) {
+               #print join("#", map { defined $_ ? $_ : ""} @caller), "\n";
+               print $caller[0], ":", $caller[2], " ", $caller[3], "\n";
+               $i++;
+       }
+}
+
+sub warn {
+       my ($self, @message) = @_;
+       print STDERR join("\n", @message), "\n";
+       $self->trace;
+}
+
+1;
diff --git a/jpred/lib/Run.pm b/jpred/lib/Run.pm

new file mode 100644 (file)

index 0000000..6bc6a78
--- /dev/null
+++ b/jpred/lib/Run.pm
@@ -0,0 +1,33 @@
+package Run;
+
+use strict;
+use warnings;
+use Carp;
+
+use Exporter;
+use POSIX qw(WIFEXITED WEXITSTATUS WIFSIGNALED WTERMSIG);
+
+our @ISA = qw(Exporter);
+our @EXPORT_OK = qw(check);
+
+sub check {
+       my ($prog, $status) = @_;
+
+       if ($status == 0) {
+               return 1;
+       }
+       elsif ($status == -1) {
+               croak "$prog executable not found\n";
+       }
+       elsif (WIFEXITED($status) and WEXITSTATUS($status)) {
+               croak "$prog exited with status ".WEXITSTATUS($status)."\n";
+       }
+       elsif (WIFSIGNALED($status)) {
+               croak "$prog halted by external signal ".WTERMSIG($status)."\n";
+       }
+       else {
+               croak "$prog suffered from a random pantwetting event";
+       }
+}
+
+1;
diff --git a/jpred/lib/SOV.pm b/jpred/lib/SOV.pm

new file mode 100644 (file)

index 0000000..40e0383
--- /dev/null
+++ b/jpred/lib/SOV.pm
@@ -0,0 +1,111 @@
+package SOV;
+
+=head1 NAME
+
+SOV - Parses output for CASP SOV program.
+
+=head1 DESCRIPTION
+
+Class for parsing SOV output.
+
+=head1 METHODS
+
+Inherits from Read and Root.
+
+=cut
+
+use base qw(Root Read);
+
+=head2 q3()
+
+Access q3 information. Returns SOV::Types object.
+
+=head2 sov(), sov_0(), sov_50()
+
+As for q3()
+
+=cut
+
+sub q3 {
+       if ($_[1]) { $_[0]->{q3} = $_[1] }
+       else { return defined $_[0]->{q3} ? $_[0]->{q3} : undef }
+}
+
+sub sov_0 {
+       if ($_[1]) { $_[0]->{sov_0} = $_[1] }
+       else { return defined $_[0]->{sov_0} ? $_[0]->{sov_0} : undef }
+}
+
+sub sov_50 {
+       if ($_[1]) { $_[0]->{sov_50} = $_[1] }
+       else { return defined $_[0]->{sov_50} ? $_[0]->{sov_50} : undef }
+}
+
+sub sov {
+       if ($_[1]) { $_[0]->{sov} = @_[1] }
+       else { return defined $_[0]->{sov} ? $_[0]->{sov} : undef }
+}
+
+sub read {
+       my ($self, $fh) = @_;
+
+       local $/ = "\n";
+       local $_;
+
+       # Get to the SOV and Q3 results
+       while (<$fh>) { last if /^\s-{22}/ }
+
+       while (<$fh>) {
+               chomp;
+               if (s/^\s*Q3\s*:\s*//) {
+                       $self->q3(to_struct(split / +/, $_))
+               }
+               elsif (s/^\s*SOV\s*:\s*//) {
+                       $self->sov(to_struct(split / +/, $_))
+               }
+               elsif (s/^\s*SOV.*0].*:\s*//) {
+                       $self->sov_0(to_struct(split / +/, $_))
+               }
+               elsif (s/^\s*SOV.*50%.*:\s*//) {
+                       $self->sov_50(to_struct(split / +/, $_))
+               }
+       }
+
+}
+
+sub to_struct ($$$$) {
+       map { $_ > 1 ? $_ /= 100 : $_ } @_;
+       my $new = SOV::Types->new(
+               all => +shift,
+               helix => +shift,
+               sheet => +shift,
+               coil => +shift
+       );
+}
+
+=head1 SOV::Types
+
+Object that stores prediction information
+
+=head1 METHODS
+
+=head2 all(), helix(), sheet(), coil()
+
+Accessors for accuracy of different types of secondary structure.
+
+=cut
+
+use Class::Struct SOV::Types => [
+       all => '$', 
+       helix => '$',
+       sheet => '$',
+       coil => '$'
+];
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+=cut
+
+1;
diff --git a/jpred/lib/Sequence.pm b/jpred/lib/Sequence.pm

new file mode 100644 (file)

index 0000000..6a53af1
--- /dev/null
+++ b/jpred/lib/Sequence.pm
@@ -0,0 +1,135 @@
+package Sequence;
+
+use strict;
+use warnings;
+use Carp;
+use base qw(Root);
+
+sub id {
+       my ($self, $id) = @_;
+       defined $id ?
+               $self->{__PACKAGE__."id"} = $id :
+               return $self->{__PACKAGE__."id"};
+}
+
+sub seq {
+       my ($self, @entries) = @_;
+       if (@entries) { $self->{__PACKAGE__."seq"} = [ @entries ] }
+       else {
+               if (wantarray) {
+                       return defined $self->{__PACKAGE__."seq"} ?
+                               @{ $self->{__PACKAGE__."seq"} } :
+                               ();
+               }
+               else { return $self->{__PACKAGE__."seq"} }
+       }
+}
+
+sub length {
+       my ($self) = @_;
+       return scalar @{ [ $self->seq ] };
+}
+
+#sub numeric {
+#      my ($self, $set) = @_;
+#      return $set ? 
+#              $self->{__PACKAGE__."numeric"} = $set :
+#              $self->{__PACKAGE__."numeric"};
+#}
+#
+#use Scalar::Util qw(looks_like_number);
+#sub seq_compress {
+#      my ($self, @entries) = @_;
+#      if (@entries) {
+#              my $numeric = grep { looks_like_number $_ } @entries;
+#              if ($numeric != 0 && $numeric != @entries) {
+#                      die "Can't compress both numeric and non-numeric data";
+#              }
+#
+#              if ($numeric) {
+#                      $self->numeric(1);
+#                      $self->seq(pack "d*", @entries)
+#              }
+#              else { $self->seq(join '', @entries) }
+#      }
+#      else {
+#              return $self->seq
+#      }
+#}
+
+sub sub_seq {
+       my ($self, @segments) = @_;
+
+       my $package = ref $self;
+
+       return $self->seq unless @segments;
+       confess "Passed an uneven number of segments" unless @segments % 2 == 0;
+
+       {
+               my %segments = @segments;
+               # Find out if there are overlapping segments
+               for my $pri (sort keys %segments) {
+                       for my $test (sort keys %segments) {
+                               next if $pri == $test;
+                               croak "Overlapping segments" if $test < $segments{$pri};
+                       }
+               }
+       }
+       
+       my @sections;
+       my ($i, @seq) = (0, $self->seq);
+       do {
+               my ($start, $end) = @segments[$i, $i + 1];
+
+               # Create a new sequence object in the correct package, and put the
+               # segment into it
+               my $new_seq = $package->new(id => $self->id);
+               $new_seq->seq( @seq[ $start .. $end ] );
+               push @sections, $new_seq;
+               $i += 2;
+       } while ($i < @segments);
+
+       return @sections;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Sequence - Holds information about a sequence
+
+=head1 EXAMPLE
+
+  my $seq = Sequence->new;
+
+  $seq->id("Big wobbly molecule");
+  my @residues = split //, "ASEQENCE";
+  $seq->seq(@residues);
+
+=head1 INHERITANCE
+
+Inherits from the Root class.
+
+=head1 METHODS
+
+=head2 id()
+
+Accessor for the sequence ID.
+
+=head2 seq(@sequence_data)
+
+Accessor for the sequence. Pass the information in as an array, one piece per position.
+
+Returns an array reference if called in a scalar context, otherwise an array of the information.
+
+=head2 length()
+
+Returns the length of the sequence.
+
+=head1 AUTHOR
+
+Jonathan Barber <jon@compbio.dundee.ac.uk>
+
+=cut
diff --git a/jpred/lib/Sequence/File.pm b/jpred/lib/Sequence/File.pm

new file mode 100644 (file)

index 0000000..77b64da
--- /dev/null
+++ b/jpred/lib/Sequence/File.pm
@@ -0,0 +1,142 @@
+package Sequence::File;
+
+use strict;
+use warnings;
+use Carp;
+
+use UNIVERSAL qw(isa);
+use base qw(Root Read Write);
+
+=head2 $file->get_entries
+
+Returns all records so far found in the file.
+
+=cut
+
+sub get_entries {
+       my ($self) = @_;
+
+       if (exists $self->{__PACKAGE__."entries"}) {
+               return @{ $self->{__PACKAGE__."entries"} };
+       }
+       else { return undef; }
+}
+
+=head2 $file->get_entry(@positions);
+
+Retrieves the @position's'th record found in the file or undef if there is no such sequence.
+
+=cut
+
+sub get_entry {
+       my ($self, @offsets) = @_;
+
+       if (exists $self->{__PACKAGE__."entries"} and @offsets) {
+               return @{ $self->{__PACKAGE__."entries"} }[@offsets];
+       }
+       else { return undef }
+}
+
+=head2 $self->add_entries(@entries)
+
+Adds more entries to the object. Returns the number of entries added to the object. This may be less than those passed if set_max_entries() has been called.
+
+=cut
+
+sub add_entries {
+       my ($self, @entries) = @_;
+       return unless @entries;
+
+       for (@entries) {
+               croak "Adding non Sequence object" unless isa $_, "Sequence";
+               $self->_ids($_->id);
+               push @{ $self->{__PACKAGE__."entries"} }, $_;
+       }
+
+#      my $max = $self->get_max_entries;
+#      if (defined $max) {
+#              my $exist_size = @{ $self->{__PACKAGE__."entries"} };
+#              if ($exist_size > $max) { return 0 }
+#              elsif ($exist_size + @entries > $max) {
+#                      return push @{ $self->{__PACKAGE__."entries"} }, @entries[0..$max - $exist_size];
+#              }
+#              else {
+#                      return push @{ $self->{__PACKAGE__."entries"} }, @entries;
+#              }
+#      }
+#      else {
+#              return push @{ $self->{__PACKAGE__."entries"} }, @entries;
+#      }
+}
+
+=head2 $file->get_entry_by_id(/regex/);
+
+Returns all of those entries which have id's that match the regex.
+
+=cut
+
+# Cache of sequence IDs for fast grepping
+sub _ids {
+       my ($self, $id) = @_;
+       if ($id) { push @{ $self->{__PACKAGE__."ids"} }, $id }
+       else { return @{ $self->{__PACKAGE__."ids"} } }
+}
+
+sub get_entry_by_id {
+       my ($self, $id) = @_;
+       croak "No id passed" unless defined $id;
+
+       #return grep { $_->id =~ /$id/ } $self->get_entries;
+
+       {
+               my @ids = $self->_ids;
+               my @indices = grep { $ids[$_] =~ /$id/ } 0..$#ids;
+               return $self->get_entry(@indices);
+       }
+}
+
+=head2 $file->set_max_entries($size);
+
+Limits the storing of $size records. Will prevent the addition of more records, but won't delete existing records in the object if there are already more than $size entries.
+
+=cut
+
+sub set_max_entries {
+       my ($self, $size) = @_;
+
+       $self->{__PACKAGE__."max_size"} = $size;
+       return $size;
+}
+
+=head2 $file->get_max_entries
+
+Accessor for set_max_entries().
+
+=cut
+
+sub get_max_entries {
+       my ($self) = @_;
+       return $self->{__PACKAGE__."max_size"};
+}
+
+=head2 
+
+=cut
+
+sub sub_seq {
+       my ($self, $start, $end) = @_;
+
+       croak "Not passed start and end arguments" unless 2 == grep { defined } $start, $end;
+
+       # Produce a new version of myself, in the right namespace
+       my ($new_self) = (ref $self)->new;
+
+       for ($self->get_entries) {
+               my ($seq) = $_->sub_seq($start, $end);
+               $new_self->add_entries($seq);
+       }
+
+       return $new_self;
+}
+
+1;
diff --git a/jpred/lib/Utils.pm b/jpred/lib/Utils.pm

new file mode 100644 (file)

index 0000000..76b8c57
--- /dev/null
+++ b/jpred/lib/Utils.pm
@@ -0,0 +1,430 @@
+package Utils;
+
+use strict;
+use warnings;
+use Carp;
+use File::Temp qw(tempfile);
+use File::Find qw(find);
+
+use base qw(Exporter);
+
+our @EXPORT_OK = qw(xsystem profile reduce_dssp conf jury find_jury read_file_list concise_record seq2int int2seq abs_solv_acc rel_solv_acc array_shuffle array_shuffle_in_place array_split $split_resid sort_resid find_files);
+push @EXPORT_OK, map { "reduce_dssp_$_" } qw(a b c jpred);
+
+# Reduce eight state DSSP definition to three states
+sub reduce_dssp ($;&) {
+       my ($seq, $code_ref) = @_;
+
+       $code_ref ||= \&reduce_dssp_jpred;
+       return $code_ref->($seq);
+}
+
+# reduce_dssp_[abc] are the methods mentioned in the Jpred
+# paper
+sub reduce_dssp_a ($) {
+       my ($sec_string) = @_;
+
+       $sec_string =~ tr/EBGH/EEHH/;
+
+       $sec_string =~ tr/EH/-/c;
+       return $sec_string;
+}
+
+sub reduce_dssp_b ($) {
+       my ($sec_string) = @_;
+
+       $sec_string =~ tr/EH/-/c;
+       $sec_string =~ s/(?<!E)EE(?!E)/--/g;
+       $sec_string =~ s/(?<!H)HHHH(?!H)/----/g;
+
+       $sec_string =~ tr/EH/-/c;
+       return $sec_string;
+}
+
+sub reduce_dssp_c ($) {
+       my ($sec_string) = @_;
+
+       $sec_string =~ s/GGGHHHH/HHHHHHH/g;
+       $sec_string =~ tr/B/-/;
+       $sec_string =~ s/GGG/---/g;
+
+       $sec_string =~ tr/EH/-/c;
+       return $sec_string;
+}
+
+=head2 reduce_dssp_jpred()
+
+# This is the method actually used in the q3anal.version2
+# program.
+# q3anal{,_dev} convert G to H.
+
+=cut
+
+sub reduce_dssp_jpred ($) {
+       my ($sec_string) = @_;
+
+       $sec_string =~ tr/ST_bGIC? /-/;
+       $sec_string =~ tr/B/E/;
+       $sec_string =~ tr/EH/-/c;
+
+       return $sec_string;
+}
+
+=head2 conf()
+
+# Select the subset of a prediction where the confidence is greater than some
+# limit.
+# Pass at least three scalars:
+# First is a string containing a list of integers between 0 and 9 which
+# correspond to the confidence at that position.
+# Second is a lower limit for the confidence.
+# Other arguments are strings of the sequence to be selected.
+
+=cut
+
+sub conf ($$$;@) {
+       my ($conf, $limit, @seqs) = @_;
+
+       croak "Sequences of different length" if grep { length $_ ne length $conf } @seqs;
+
+       for my $pos (reverse 0..length($conf) - 1) {
+               if (substr($conf, $pos, 1) >= $limit) {
+                       for (@seqs, $conf) {
+                               $_ = substr($_, 0, $pos).substr($_, $pos)
+                       }
+               }
+       }
+
+       return @seqs;
+}
+
+=head2 jury();
+
+Find the positions where there is jury agreement
+First argument is the jury sequence, the rest are sequences to be subselected.
+
+=cut
+
+sub jury ($$;@) {
+       my ($jury, @seqs) = @_;
+
+       croak "Sequences of different length" if grep { length $_ != length $jury } @seqs;
+
+       # All positions are jury
+       if ($jury  =~ y/*// == length $jury) {
+               return @seqs;
+       }
+       # Goes through sequences and removes those positions that are not jury positions.
+       for my $pos (reverse 0..length($jury)- 1) {
+               if (substr($jury, $pos, 1) eq '*') {
+                       $_ = substr($_, 0, $pos).substr($_, $pos + 1) for @seqs;
+                       $_ = substr($_, 0, $pos).substr($_, $pos + 1) for $jury;
+               }
+       }
+
+       return @seqs;
+}
+
+=head2 find_jury(@seqs)
+
+Find those positions in a prediction that are in a jury position (those positions where the predictions agree).
+
+Pass an array of sequences (represented as strings). Returns a string with the jury position represented by a space and non-jury positions as "*".
+
+=cut
+
+sub find_jury (@) {
+       my (@seqs) = @_;
+       
+       my $length = length $seqs[0];
+       croak "Sequences of different length" if grep { $length != length $_ } @seqs;
+
+       my $jury;
+       for my $pos (0..$length - 1) {
+               # Are all of the positions the same?
+               if (keys %{ { map { substr($_, $pos, 1), undef } @seqs } } == 1) {
+                       $jury .= " "
+               }
+               else {
+                       $jury .= "*"
+               }
+       }
+
+       return $jury;
+}
+
+=head2 @AoA = read_file_list($path)
+
+Reads in a file from the $path of the format:
+
+  file1a file2a
+  file1b file2b
+  file1c file2c
+  file1d file2d
+  file1e file2e
+
+Returns an array of arrays of the list of files. For the above file, this would be:
+
+  [
+    [ "file1a", "file2a" ],
+    [ "file1b", "file2b" ],
+    [ "file1c", "file2c" ],
+    [ "file1d", "file2d" ],
+    [ "file1e", "file2e" ]
+  ]
+
+=cut
+
+sub read_file_list ($) {
+       my ($file) = @_;
+       
+       local ($/, $_) = "\n";
+
+       open my $fh, $file or croak "Can't open file $file";
+
+       my @files;
+       while (<$fh>) {
+               chomp;
+               push @files, [ split ];
+       }
+       return @files
+}
+
+=head2 @AoA = profile(qw(PROTEIN MSA----))
+
+Pass an array of proteins sequences represented as strings of one letter residues codes, returns an array of arrays of the frequences of each residue.
+
+This is calculated using all of the sequences in the alignment, and doesn't count gaps, 'X' or 'Z'.
+
+=cut
+
+# Does a profile on a sequence
+sub profile (@) {
+       my (@seqs) = @_;
+       my @results;
+
+       # Check that all the sequences are the same length
+       my $length = length $seqs[0];
+       croak "Not all sequences are the same length" if grep { length $_ != $length } @seqs;
+
+       # Convert residues to integers
+       my @ar = map { [ seq2int(split //, $_) ] } @seqs;
+
+       # Find out how many of each type of residue occur at a position and create
+       # the profile
+       for my $i (0..$length - 1) { # for each position
+               # The number of elements should be one larger than the max size in
+               # seq2int
+               my @countup = (0) x 20;
+
+               $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ for 0..$#ar; # for each sequence
+
+               for (0..$#ar) {
+                       $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++
+               }
+
+               my $total = 0;
+               $total += $_ for @countup;
+               #$total += $countup[$_] for 0..$#countup;
+
+               push @results, [
+                       map {
+                               # Check $total for div/0
+#                              sprintf "%2.f", ($total ? $countup[$_] / $total * 10 : 0)
+#                      } 0..$#countup
+                               sprintf "%2.f", ($total ? $_ / $total * 10 : 0)
+                       } @countup
+               ];
+       }
+
+       return @results;
+}
+
+=head2 @integers = seq2int(@residues)
+
+Converts a sequence into integer values. @residues should be an array of characters of the one letter amino acids codes.
+
+=head2 @residues = int2seq(@integers)
+
+Converts an array of integers into one letter amino acids codes produced by seq2int().
+
+=cut
+
+{
+       # This values should be as seq2int() in jnet.c
+       my (%seq2int, %int2seq);
+       @seq2int{split //, 'ARNDCQEGHILKMFPSTWYVBZX.-'} = (0..23, 23);
+       $seq2int{U} = $seq2int{C};
+       %int2seq = reverse %seq2int;
+
+       sub seq2int (@) {
+               map {
+                       exists $seq2int{uc $_} ?
+                               $seq2int{uc $_} :
+                               croak "Residue '$_' not recognised"
+               } @_
+       }
+
+       sub int2seq (@) {
+               map {
+                       exists $int2seq{$_} ?
+                               $int2seq{$_} :
+                               croak "Residue '$_' not recognised"
+               } @_
+       }
+}
+
+=head2 xsystem($system)
+
+Wraps the perl system() function. Returns undef if the command in $system doesn't work, perhaps, look at the code.
+
+=cut
+
+sub xsystem ($) {
+       my $status = system(@_);
+       if ($status == -1) {
+               carp "Failed to run command '@_': $!";
+               return;
+       }
+       elsif ($? & 127) {
+               printf "child died with signal %d, %s coredump\n", ($? & 127),  ($? & 128) ? 'with' : 'without';
+               return;
+       }
+       else {
+               #printf "child exited with value %d\n", $? >> 8
+               return $? >> 8;
+       }
+}
+
+=head2 $concise = concise_record($title, @data)
+
+Simple function to give a line in a concise file. Pass the title of the concise file, and the data that you want in the record. Returns a string in the concise format with a newline.
+
+=cut
+
+sub concise_record ($@) {
+       my ($title, @items) = @_;
+       return "$title:".join(",", @items)."\n";
+}
+
+{
+       my %data = (
+               'A' => '118', 'B' => '162', 'C' => '146', 'D' => '159',
+               'E' => '186', 'F' => '223', 'G' => '88', 'H' => '203',                          'I' => '181', 'K' => '226', 'L' => '193', 'M' => '204',
+               'N' => '166', 'P' => '147', 'Q' => '193', 'R' => '256',
+               'S' => '130', 'T' => '153', 'V' => '165', 'W' => '266',
+               'X' => '200', 'Y' => '237', 'Z' => '189', 'c' => '146'
+       );
+
+=head2 $solv = rel_solv_acc($residue, $accesibility);
+
+Pass the residue and the accessiblilty of the residue, and the relative solvent accesibility of the residue will be returned.
+
+=cut
+
+       sub rel_solv_acc ($$) {
+               my ($residue, $acc) = @_;
+               return unless $data{uc $residue};
+               return $acc / $data{uc $residue};
+       }
+
+=head2 $solv = abs_solv_acc($residue, $accesibility);
+
+Pass the residue and the accessiblilty of the residue, and the absolute solvent accesibility of the residue will be returned.
+
+=cut
+
+       sub abs_solv_acc ($$) {
+               my ($residue, $acc) = @_;
+               return unless $data{uc $residue};
+               return $data{uc $residue} * $acc;
+       }
+}
+
+=head2 @shuffled_data = array_shuffle(@data)
+
+Randomly shuffles an array.
+
+=cut
+
+sub array_shuffle (@) {
+       my (@data) = @_;
+       for (0..$#data) { 
+               my ($foo, $bar) = (int(rand $#data), int(rand $#data));
+               @data[$foo, $bar] = @data[$bar, $foo];
+       }
+       return @data;
+}
+
+=head2 array_shuffle_in_place(@data)
+
+Randomly shuffle an array in place.
+
+=cut
+
+sub array_shuffle_in_place (@) {
+       for (0..$#_) {
+               my ($foo, $bar) = (int(rand $#_), int(rand $#_));
+               @_[$foo, $bar] = @_[$bar, $foo];
+       }
+}
+
+=head2 @AoA = array_split($size, @data)
+
+Splits an array into $size sets of data returned in an AoA.
+
+=cut
+
+sub array_split ($@) {
+       my ($size, @array) = @_;
+       my ($i, @temp) = 0;
+       while (@array) { push @{ $temp[$i++ % $size] }, pop @array }
+       return @temp;
+}
+
+
+=head2 $split_resid = qr/(-?\d+)([[:alpha:]])/;
+
+Regular expression for dividing PDB residue offsets into the offset and letter components.
+
+=cut
+
+our $split_resid = qr/(-?\d+)([[:alpha:]])?/;
+
+=head2 sort_resid
+
+A subroutine for the sorting of PDB residue offsets, allows for the correct sorting of IDs such as "1A".
+
+=cut
+
+{
+       no warnings;
+       sub sort_resid {
+               (my ($a_off, $a_id) = $a =~ $split_resid);
+               (my ($b_off, $b_id) = $b =~ $split_resid);
+
+               $a_off <=> $b_off || $a_id cmp $b_id;
+       }
+}
+
+=head2 @files = find_files($dir)
+
+Find all files under $dir.
+
+=cut
+
+sub find_files ($) {
+    my ($dir) = @_;
+    my @files;
+
+    find(
+        sub {
+            push @files, $File::Find::dir."/".$_ if -e $_;
+        },
+        $dir
+    );
+
+    return @files;
+}
+
+1;
diff --git a/jpred/lib/Write.pm b/jpred/lib/Write.pm

new file mode 100644 (file)

index 0000000..af86eed
--- /dev/null
+++ b/jpred/lib/Write.pm
@@ -0,0 +1,70 @@
+package Write;
+
+use strict;
+use warnings;
+use Carp;
+
+use base qw(Root);
+
+use IO::String;
+use File::Temp;
+
+=head1 NAME
+
+Write - Two base methods for writing information.
+
+=head1 DESCRIPTION
+
+This module contains three methods for writing information into a program. They allow the writing of information from a string or filename and expect the method write() to be defind by the class that inherits this module.
+
+=head1 METHODS
+
+=head2 write_file($path)
+
+Opens a file at the given path and then calls the write method.
+
+=head2 write_gzip_file($path)
+
+Like write_file, but compresses the output using the system gzip command.
+
+=head2 write_string($scalar)
+
+Writes the data in the scalar and passes it to the write method.
+
+=head2 write($filehandle)
+
+This method will cause a fatal error unless it's overidden by the class that inherits this module.
+
+=cut
+
+sub write {
+       confess "The inheriting package hasn't defined the write() method\n"
+}
+
+sub write_file {
+       my ($self, $fn) = @_;
+
+       open my $fh, ">$fn" or confess "Can't open file $fn: $!";
+       $self->write($fh);
+       close $fh;
+}
+
+sub write_gzip_file {
+               my ($self, $fn) = @_;
+
+               my $gzipd_fn = File::Temp->new->filename;
+
+               $self->write_file($gzipd_fn);
+
+               system "gzip -c $gzipd_fn > $fn";
+}
+
+sub write_string {
+       my ($self) = @_;
+
+       my $string;
+       $self->write(IO::String->new($string));
+       return $string;
+}
+
+1;
author	Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
	Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)
committer	Sasha Sherstnev <a.sherstnev@dundee.ac.uk>
	Mon, 18 Nov 2013 14:18:19 +0000 (14:18 +0000)
jpred/bin/blastpgp	[new file with mode: 0755]	patch \| blob
jpred/bin/fastacmd	[new file with mode: 0755]	patch \| blob
jpred/bin/hmmbuild	[new file with mode: 0755]	patch \| blob
jpred/bin/hmmconvert	[new file with mode: 0755]	patch \| blob
jpred/bin/jnet	[new file with mode: 0755]	patch \| blob
jpred/bin/oc	[new file with mode: 0755]	patch \| blob
jpred/bin/pairwise	[new file with mode: 0755]	patch \| blob
jpred/jpred	[new file with mode: 0755]	patch \| blob
jpred/lib/BLC.pm	[new file with mode: 0755]	patch \| blob
jpred/lib/Common.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Concise.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Concise/File.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/DSSP.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/DSSP/SCOP.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/FASTA.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/FASTA/File.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Filelist.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/HMMER/Profile.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/HMMER/Profile/Jnet.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/HPM.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/IO/String.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Index.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Index/EMBOSS.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Index/FastaCMD.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Jpred.pm	[new file with mode: 0755]	patch \| blob
jpred/lib/OC.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/PSIBLAST.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/PSIBLAST/PSSM.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/PSIBLAST/PSSM/Jnet.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/PSIBLAST/Run.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Pairwise.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Paths.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Profile.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Profile/Generic.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Profile/Jnet.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Profile/Scanps.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Profile/Scanps/Result.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Quality.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Read.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Root.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Run.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/SOV.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Sequence.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Sequence/File.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Utils.pm	[new file with mode: 0644]	patch \| blob
jpred/lib/Write.pm	[new file with mode: 0644]	patch \| blob