--- /dev/null
+#!/usr/bin/perl
+
+#
+# Program to query SWALL with a sequence and PSIBLAST and return the
+# profile, the PSIBLAST output and the alignment used by PSIBLAST.
+# The alignment is placed on STDOUT by default in FASTA format.
+#
+# --query <path> path to query in FASTA format
+# --profile <path> path to location for profile output
+# --blast <path> path to location for PSIBLAST output
+# --out <path> location for alignment output, defaults to STDOUT
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+use lib '/homes/www-jpred/new_site/bin64';
+
+use jpred;
+
+# Modified function which uses fastacmd rather the indexing
+# scheme used the db_dbindex db creation script. This is useful
+# as the previous method was 32- bit dependent.
+# - CC 30/08/06
+sub getseq {
+ my ($hash_ref) = shift @_;
+
+ foreach my $seqID (keys %{$hash_ref}) {
+
+ next if ($seqID eq 'QUERY'); # skip query sequence
+
+ my $seq;
+
+ open(FCMD, "$BINDIR/fastacmd -d $SWALL -s $seqID |") or die "unable to retrieve sequence $seqID from database $SWALL: $!";
+ while (<FCMD>) {
+ chomp;
+ next if /^>/; # skip desc line
+ $seq .= $_; #
+ }
+ close(FCMD);
+ if (!$seq) {
+ die "ERROR - $seqID has no sequence in SWALL database.\n";
+ }
+ ${$hash_ref}{$seqID} = $seq;
+ }
+}
+
+my ($out, $query, $profile, $blast) = qw(-);
+GetOptions(
+ "query=s" => \$query,
+ "profile=s" => \$profile,
+ "blast=s" => \$blast,
+ "out:s" => \$out
+);
+
+die "no query sequence path given\n" unless $query;
+die "no profile output path given\n" unless $profile;
+die "no BLAST output path given\n" unless $blast;
+
+xsystem("$BINDIR/blastpgp -e0.05 -h0.01 -m6 -b20000 -j3 -d $SWALLFILT -i $query -Q $profile -o $blast");
+
+# Find the number of passes and get the position at which the sequences
+# start, plus find the length of the query to check if it has been shortened
+# in the alignment later on.
+my $position;
+my $queryLength = 0;
+open BLAST, "<$blast" or die "$blast: $!\n";
+while (<BLAST>) {
+
+ if (/\((\d+) letters\)/) {
+ $queryLength = $1;
+ }
+
+ if (/^Sequences used in model and found/) {
+ <BLAST>;
+ $position = tell BLAST;
+ }
+}
+
+# Check we found some sequences (FIXME: may need to be more robust)
+unless ($position) {
+ warn "PSIBLAST found no homologous sequences and couldn't construct an alignment. As Jpred requires a multiple alignment it couldn't continue.\n";
+ die "Jpred failed\n";
+}
+
+if (!$queryLength) {
+ warn "unable to match query length\n";
+}
+
+# Get the sequences used and their alignments
+seek BLAST, $position, 0;
+my (@alignments, @order, $flag, %seqid);
+while (<BLAST>) {
+ # Get the alignments
+ if (/^QUERY/) {
+ my $id = (split ' ', $_)[0];
+ push @order, $id;
+ $seqid{$id} = undef;
+
+ chomp;
+ push @alignments, $_;
+
+ while (<BLAST>) {
+ last if /^ Database: /;
+ last if /^ \*{5} No hits found \*{5}/;
+ last if /^Searching/;
+ # Only bother getting the sequence ID's
+ # from the first block of alignments...
+ if (/^\n/) { $flag = 1, next }
+ unless ($flag) { # Get the sequence ID
+ my $id = (split " ", $_)[0];
+ push @order, $id;
+ $seqid{$id} = undef;
+ }
+ # ...but continue adding the sequences from the alignments
+ chomp;
+ push @alignments, $_;
+ }
+ }
+}
+close BLAST;
+
+# Get the ID and sequence for each of the alignment sequence
+&getseq(\%seqid);
+
+# Replace the X's in the alignment with their proper characters
+# Add these onto @sequences
+my @sequences;
+my $padStart;
+my $padEnd;
+foreach (0..$#alignments) {
+ my $i = $_ % @order;
+ my ($seqid, $start, $seq, $end) = split ' ', $alignments[$_], 4;
+
+ # blastpgp output allways has a ID at the begining of the line,
+ # but can have a variable number of fields after that, but it
+ # means the sequence contains no characters of interest
+ # e.g.:
+# $seqid $start $seq $end
+# O24727 ------------------------------------------------------------
+# O24727 0 ------------------------------------------------------------
+# O24727 26 -Y--A---A-----L-----R----S-------L---------G-------P----V--- 35
+ unless ($end) {
+ unless ($seq) {
+ $sequences[$i] .= $start;
+ next;
+ }
+ else {
+ $sequences[$i] .= $seq;
+ next;
+ }
+ }
+
+ # Don't do anything for the query sequence, but check whether the ends have been chopped off
+ if ($i == 0) {
+ if ( ($_ == 0) && ($start != 1) ) {
+ #warn "QUERY doesn't start at the beginning! ($start)\n";
+ $padStart = '-'x($start-1);
+ }
+ if ((scalar @alignments/scalar @order == ($_/scalar @order) +1 ) && ($queryLength != $end)) {
+ my $diff = $queryLength - $end;
+ #warn "end is missing from QUERY! ($diff)\n";
+ $padEnd = '-'x$diff;
+ }
+ $sequences[$i] .= $seq;
+ next;
+ }
+
+ my @seq = split '', $seq;
+ my $j = 0;
+ foreach (@seq) {
+ # Skip gaps and blanks, don't add one to the counter ($j)
+ # expect for proper characters.
+ next if $_ eq '-' or $_ eq ' ';# /[ -]/;
+
+ if (/X/) {
+ unless ($seqid{$seqid}) {
+ die "Internal conflict between SWISS-PROT and BLAST databases (were they built from the same data?): $seqid has was not found.\n";
+ }
+ # If the length of the sequence is less than
+ # where we're accessing it from there's a problem.
+ if (length $seqid{$seqid} < $start + $j) {
+ warn length $seqid{$seqid}, ", $start, $j, $seqid, $seqid{$seqid}, $i\n";
+ }
+
+ # - 1 as start is indexed from 1...
+ $_ = substr($seqid{$seqid}, $start + $j - 1, 1);
+ }
+ $j++;
+ }
+ $sequences[$i] .= join '', @seq;
+}
+
+## If BLAST has shortened the query sequence replace the missing residues
+## in the QUERY and pad the other sequences with gaps
+if ($padEnd or $padStart) {
+ warn "BLAST has shortened the query sequence. Getting original query and padding aligned matches.\n";
+
+ ## get query sequence
+ my $querySeq;
+ open(my $FH, $query) or die "$query: $!";
+ while(<$FH>) {
+ next if (/>/);
+ chomp;
+ $querySeq .= $_;
+ }
+ close($FH);
+ die "Unable to find a sequence in file: $query" if (!$querySeq);
+
+ ## correct sequences
+ if ($padEnd) {
+ my $len = length($padEnd);
+ my $pad = substr($querySeq, -$len); # grab right no. of residues from end of query
+ $sequences[0] .= $pad; # add missing residues to end of query
+ $sequences[$_] .= $padEnd for (1..$#order); # add equivalent no. of gaps in hit seqs
+ }
+ if ($padStart) {
+ my $len = length($padStart);
+ my $pad = substr($querySeq, 0, $len); # grab right no. of residues from start of query
+ $sequences[0] = $pad.$sequences[0]; # add missing residues to start of query
+ $sequences[$_] = $padStart.$sequences[$_] for (1..$#order); # add equivalent no. of gaps in hit seqs
+ }
+}
+
+# Print the results in FASTA, replace QUERY with filename of query,
+# prints them in order in the profile occurance, if two (or more) local
+# alignments occur in one sequence, print them as seperate results
+open OUT, ">$out" or die "$out: $!\n";
+foreach (0..$#order) {
+ print OUT ">$order[$_]\n";
+ $sequences[$_] =~ s/(.{72})/$1\n/g;
+ print OUT $sequences[$_], "\n";
+}
+close OUT;
+
+# Remove zero sized error logs that blastpgp helpfully creates
+if (-e 'error.log' && ! -z 'error.log') {
+ warn "blastpgp wrote error log file\n";
+}
+else {
+ unlink "error.log";
+}