4 # Program to query SWALL with a sequence and PSIBLAST and return the
5 # profile, the PSIBLAST output and the alignment used by PSIBLAST.
6 # The alignment is placed on STDOUT by default in FASTA format.
8 # --query <path> path to query in FASTA format
9 # --profile <path> path to location for profile output
10 # --blast <path> path to location for PSIBLAST output
11 # --out <path> location for alignment output, defaults to STDOUT
18 use lib '/homes/www-jpred/new_site/bin64';
22 # Modified function which uses fastacmd rather the indexing
23 # scheme used the db_dbindex db creation script. This is useful
24 # as the previous method was 32- bit dependent.
27 my ($hash_ref) = shift @_;
29 foreach my $seqID (keys %{$hash_ref}) {
31 next if ($seqID eq 'QUERY'); # skip query sequence
35 open(FCMD, "$BINDIR/fastacmd -d $SWALL -s $seqID |") or die "unable to retrieve sequence $seqID from database $SWALL: $!";
38 next if /^>/; # skip desc line
43 die "ERROR - $seqID has no sequence in SWALL database.\n";
45 ${$hash_ref}{$seqID} = $seq;
49 my ($out, $query, $profile, $blast) = qw(-);
52 "profile=s" => \$profile,
57 die "no query sequence path given\n" unless $query;
58 die "no profile output path given\n" unless $profile;
59 die "no BLAST output path given\n" unless $blast;
61 xsystem("$BINDIR/blastpgp -e0.05 -h0.01 -m6 -b20000 -j3 -d $SWALLFILT -i $query -Q $profile -o $blast");
63 # Find the number of passes and get the position at which the sequences
64 # start, plus find the length of the query to check if it has been shortened
65 # in the alignment later on.
68 open BLAST, "<$blast" or die "$blast: $!\n";
71 if (/\((\d+) letters\)/) {
75 if (/^Sequences used in model and found/) {
77 $position = tell BLAST;
81 # Check we found some sequences (FIXME: may need to be more robust)
83 warn "PSIBLAST found no homologous sequences and couldn't construct an alignment. As Jpred requires a multiple alignment it couldn't continue.\n";
88 warn "unable to match query length\n";
91 # Get the sequences used and their alignments
92 seek BLAST, $position, 0;
93 my (@alignments, @order, $flag, %seqid);
97 my $id = (split ' ', $_)[0];
102 push @alignments, $_;
105 last if /^ Database: /;
106 last if /^ \*{5} No hits found \*{5}/;
107 last if /^Searching/;
108 # Only bother getting the sequence ID's
109 # from the first block of alignments...
110 if (/^\n/) { $flag = 1, next }
111 unless ($flag) { # Get the sequence ID
112 my $id = (split " ", $_)[0];
116 # ...but continue adding the sequences from the alignments
118 push @alignments, $_;
124 # Get the ID and sequence for each of the alignment sequence
127 # Replace the X's in the alignment with their proper characters
128 # Add these onto @sequences
132 foreach (0..$#alignments) {
134 my ($seqid, $start, $seq, $end) = split ' ', $alignments[$_], 4;
136 # blastpgp output allways has a ID at the begining of the line,
137 # but can have a variable number of fields after that, but it
138 # means the sequence contains no characters of interest
140 # $seqid $start $seq $end
141 # O24727 ------------------------------------------------------------
142 # O24727 0 ------------------------------------------------------------
143 # O24727 26 -Y--A---A-----L-----R----S-------L---------G-------P----V--- 35
146 $sequences[$i] .= $start;
150 $sequences[$i] .= $seq;
155 # Don't do anything for the query sequence, but check whether the ends have been chopped off
157 if ( ($_ == 0) && ($start != 1) ) {
158 #warn "QUERY doesn't start at the beginning! ($start)\n";
159 $padStart = '-'x($start-1);
161 if ((scalar @alignments/scalar @order == ($_/scalar @order) +1 ) && ($queryLength != $end)) {
162 my $diff = $queryLength - $end;
163 #warn "end is missing from QUERY! ($diff)\n";
166 $sequences[$i] .= $seq;
170 my @seq = split '', $seq;
173 # Skip gaps and blanks, don't add one to the counter ($j)
174 # expect for proper characters.
175 next if $_ eq '-' or $_ eq ' ';# /[ -]/;
178 unless ($seqid{$seqid}) {
179 die "Internal conflict between SWISS-PROT and BLAST databases (were they built from the same data?): $seqid has was not found.\n";
181 # If the length of the sequence is less than
182 # where we're accessing it from there's a problem.
183 if (length $seqid{$seqid} < $start + $j) {
184 warn length $seqid{$seqid}, ", $start, $j, $seqid, $seqid{$seqid}, $i\n";
187 # - 1 as start is indexed from 1...
188 $_ = substr($seqid{$seqid}, $start + $j - 1, 1);
192 $sequences[$i] .= join '', @seq;
195 ## If BLAST has shortened the query sequence replace the missing residues
196 ## in the QUERY and pad the other sequences with gaps
197 if ($padEnd or $padStart) {
198 warn "BLAST has shortened the query sequence. Getting original query and padding aligned matches.\n";
200 ## get query sequence
202 open(my $FH, $query) or die "$query: $!";
209 die "Unable to find a sequence in file: $query" if (!$querySeq);
213 my $len = length($padEnd);
214 my $pad = substr($querySeq, -$len); # grab right no. of residues from end of query
215 $sequences[0] .= $pad; # add missing residues to end of query
216 $sequences[$_] .= $padEnd for (1..$#order); # add equivalent no. of gaps in hit seqs
219 my $len = length($padStart);
220 my $pad = substr($querySeq, 0, $len); # grab right no. of residues from start of query
221 $sequences[0] = $pad.$sequences[0]; # add missing residues to start of query
222 $sequences[$_] = $padStart.$sequences[$_] for (1..$#order); # add equivalent no. of gaps in hit seqs
226 # Print the results in FASTA, replace QUERY with filename of query,
227 # prints them in order in the profile occurance, if two (or more) local
228 # alignments occur in one sequence, print them as seperate results
229 open OUT, ">$out" or die "$out: $!\n";
230 foreach (0..$#order) {
231 print OUT ">$order[$_]\n";
232 $sequences[$_] =~ s/(.{72})/$1\n/g;
233 print OUT $sequences[$_], "\n";
237 # Remove zero sized error logs that blastpgp helpfully creates
238 if (-e 'error.log' && ! -z 'error.log') {
239 warn "blastpgp wrote error log file\n";