#!/usr/bin/perl # # Program to query SWALL with a sequence and PSIBLAST and return the # profile, the PSIBLAST output and the alignment used by PSIBLAST. # The alignment is placed on STDOUT by default in FASTA format. # # --query path to query in FASTA format # --profile path to location for profile output # --blast path to location for PSIBLAST output # --out location for alignment output, defaults to STDOUT # use strict; use warnings; use Getopt::Long; use lib '/homes/www-jpred/new_site/bin64'; use jpred; # Modified function which uses fastacmd rather the indexing # scheme used the db_dbindex db creation script. This is useful # as the previous method was 32- bit dependent. # - CC 30/08/06 sub getseq { my ($hash_ref) = shift @_; foreach my $seqID (keys %{$hash_ref}) { next if ($seqID eq 'QUERY'); # skip query sequence my $seq; open(FCMD, "$BINDIR/fastacmd -d $SWALL -s $seqID |") or die "unable to retrieve sequence $seqID from database $SWALL: $!"; while () { chomp; next if /^>/; # skip desc line $seq .= $_; # } close(FCMD); if (!$seq) { die "ERROR - $seqID has no sequence in SWALL database.\n"; } ${$hash_ref}{$seqID} = $seq; } } my ($out, $query, $profile, $blast) = qw(-); GetOptions( "query=s" => \$query, "profile=s" => \$profile, "blast=s" => \$blast, "out:s" => \$out ); die "no query sequence path given\n" unless $query; die "no profile output path given\n" unless $profile; die "no BLAST output path given\n" unless $blast; xsystem("$BINDIR/blastpgp -e0.05 -h0.01 -m6 -b20000 -j3 -d $SWALLFILT -i $query -Q $profile -o $blast"); # Find the number of passes and get the position at which the sequences # start, plus find the length of the query to check if it has been shortened # in the alignment later on. my $position; my $queryLength = 0; open BLAST, "<$blast" or die "$blast: $!\n"; while () { if (/\((\d+) letters\)/) { $queryLength = $1; } if (/^Sequences used in model and found/) { ; $position = tell BLAST; } } # Check we found some sequences (FIXME: may need to be more robust) unless ($position) { warn "PSIBLAST found no homologous sequences and couldn't construct an alignment. As Jpred requires a multiple alignment it couldn't continue.\n"; die "Jpred failed\n"; } if (!$queryLength) { warn "unable to match query length\n"; } # Get the sequences used and their alignments seek BLAST, $position, 0; my (@alignments, @order, $flag, %seqid); while () { # Get the alignments if (/^QUERY/) { my $id = (split ' ', $_)[0]; push @order, $id; $seqid{$id} = undef; chomp; push @alignments, $_; while () { last if /^ Database: /; last if /^ \*{5} No hits found \*{5}/; last if /^Searching/; # Only bother getting the sequence ID's # from the first block of alignments... if (/^\n/) { $flag = 1, next } unless ($flag) { # Get the sequence ID my $id = (split " ", $_)[0]; push @order, $id; $seqid{$id} = undef; } # ...but continue adding the sequences from the alignments chomp; push @alignments, $_; } } } close BLAST; # Get the ID and sequence for each of the alignment sequence &getseq(\%seqid); # Replace the X's in the alignment with their proper characters # Add these onto @sequences my @sequences; my $padStart; my $padEnd; foreach (0..$#alignments) { my $i = $_ % @order; my ($seqid, $start, $seq, $end) = split ' ', $alignments[$_], 4; # blastpgp output allways has a ID at the begining of the line, # but can have a variable number of fields after that, but it # means the sequence contains no characters of interest # e.g.: # $seqid $start $seq $end # O24727 ------------------------------------------------------------ # O24727 0 ------------------------------------------------------------ # O24727 26 -Y--A---A-----L-----R----S-------L---------G-------P----V--- 35 unless ($end) { unless ($seq) { $sequences[$i] .= $start; next; } else { $sequences[$i] .= $seq; next; } } # Don't do anything for the query sequence, but check whether the ends have been chopped off if ($i == 0) { if ( ($_ == 0) && ($start != 1) ) { #warn "QUERY doesn't start at the beginning! ($start)\n"; $padStart = '-'x($start-1); } if ((scalar @alignments/scalar @order == ($_/scalar @order) +1 ) && ($queryLength != $end)) { my $diff = $queryLength - $end; #warn "end is missing from QUERY! ($diff)\n"; $padEnd = '-'x$diff; } $sequences[$i] .= $seq; next; } my @seq = split '', $seq; my $j = 0; foreach (@seq) { # Skip gaps and blanks, don't add one to the counter ($j) # expect for proper characters. next if $_ eq '-' or $_ eq ' ';# /[ -]/; if (/X/) { unless ($seqid{$seqid}) { die "Internal conflict between SWISS-PROT and BLAST databases (were they built from the same data?): $seqid has was not found.\n"; } # If the length of the sequence is less than # where we're accessing it from there's a problem. if (length $seqid{$seqid} < $start + $j) { warn length $seqid{$seqid}, ", $start, $j, $seqid, $seqid{$seqid}, $i\n"; } # - 1 as start is indexed from 1... $_ = substr($seqid{$seqid}, $start + $j - 1, 1); } $j++; } $sequences[$i] .= join '', @seq; } ## If BLAST has shortened the query sequence replace the missing residues ## in the QUERY and pad the other sequences with gaps if ($padEnd or $padStart) { warn "BLAST has shortened the query sequence. Getting original query and padding aligned matches.\n"; ## get query sequence my $querySeq; open(my $FH, $query) or die "$query: $!"; while(<$FH>) { next if (/>/); chomp; $querySeq .= $_; } close($FH); die "Unable to find a sequence in file: $query" if (!$querySeq); ## correct sequences if ($padEnd) { my $len = length($padEnd); my $pad = substr($querySeq, -$len); # grab right no. of residues from end of query $sequences[0] .= $pad; # add missing residues to end of query $sequences[$_] .= $padEnd for (1..$#order); # add equivalent no. of gaps in hit seqs } if ($padStart) { my $len = length($padStart); my $pad = substr($querySeq, 0, $len); # grab right no. of residues from start of query $sequences[0] = $pad.$sequences[0]; # add missing residues to start of query $sequences[$_] = $padStart.$sequences[$_] for (1..$#order); # add equivalent no. of gaps in hit seqs } } # Print the results in FASTA, replace QUERY with filename of query, # prints them in order in the profile occurance, if two (or more) local # alignments occur in one sequence, print them as seperate results open OUT, ">$out" or die "$out: $!\n"; foreach (0..$#order) { print OUT ">$order[$_]\n"; $sequences[$_] =~ s/(.{72})/$1\n/g; print OUT $sequences[$_], "\n"; } close OUT; # Remove zero sized error logs that blastpgp helpfully creates if (-e 'error.log' && ! -z 'error.log') { warn "blastpgp wrote error log file\n"; } else { unlink "error.log"; }