--- /dev/null
+
+/* Implementation of the Wozniak "anti-diagonal" vectorization
+ strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
+ Appl. Biosci. 13:145-150
+
+ November, 2004
+*/
+
+/*
+ Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
+ Please send bug reports and/or suggestions to lindahl@sbc.su.se.
+*/
+
+#include <stdio.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw.h"
+
+#ifdef SW_ALTIVEC
+
+int
+smith_waterman_altivec_word(unsigned char * query_sequence,
+ unsigned short * query_profile_word,
+ int query_length,
+ unsigned char * db_sequence,
+ int db_length,
+ unsigned short bias,
+ unsigned short gap_open,
+ unsigned short gap_extend,
+ struct f_struct * f_str)
+{
+ int i,j,k;
+ unsigned short * p;
+ unsigned short score;
+ unsigned char * p_dbseq;
+ int alphabet_size = f_str->alphabet_size;
+ unsigned short * workspace = (unsigned short *)f_str->workspace;
+
+ vector unsigned short Fup,Hup1,Hup2,E,F,H,tmp;
+ vector unsigned char perm;
+ vector unsigned short v_maxscore;
+ vector unsigned short v_bias,v_gapopen,v_gapextend;
+ vector unsigned short v_score;
+ vector unsigned short v_score_q1;
+ vector unsigned short v_score_q2;
+ vector unsigned short v_score_q3;
+ vector unsigned short v_score_load;
+ vector unsigned char queue1_to_score = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+ vector unsigned char queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
+ vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
+ vector unsigned char queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
+
+ /* Load the bias to all elements of a constant */
+ v_bias = vec_lde(0,&bias);
+ perm = vec_lvsl(0,&bias);
+ v_bias = vec_perm(v_bias,v_bias,perm);
+ v_bias = vec_splat(v_bias,0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ v_gapopen = vec_lde(0,&gap_open);
+ perm = vec_lvsl(0,&gap_open);
+ v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
+ v_gapopen = vec_splat(v_gapopen,0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ v_gapextend = vec_lde(0,&gap_extend);
+ perm = vec_lvsl(0,&gap_extend);
+ v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
+ v_gapextend = vec_splat(v_gapextend,0);
+
+ v_maxscore = vec_xor(v_maxscore,v_maxscore);
+
+ // Zero out the storage vector
+ k = 2*(db_length+7);
+
+ for(i=0,j=0;i<k;i++,j+=16)
+ {
+ // borrow the zero value in v_maxscore to have something to store
+ vec_st(v_maxscore,j,workspace);
+ }
+
+ for(i=0;i<query_length;i+=8)
+ {
+ // fetch first data asap.
+ p_dbseq = db_sequence;
+ k = *p_dbseq++;
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // zero lots of stuff.
+ // We use both the VPERM and VSIU unit to knock off some cycles.
+
+ E = vec_splat_u16(0);
+ F = vec_xor(F,F);
+ H = vec_splat_u16(0);
+ Hup2 = vec_xor(Hup2,Hup2);
+ v_score_q1 = vec_splat_u16(0);
+ v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+ v_score_q3 = vec_splat_u16(0);
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ // PROLOGUE 1
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 2
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 3
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 4
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 5
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 6
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 7
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 8
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ for(j=8;j<db_length;j+=8)
+ {
+ // STEP 1
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 2
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 3
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 4
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 5
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 6
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 7
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 8
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+
+ v_score_load = vec_splat_u16(0);
+
+ for(;j<db_length+7;j++)
+ {
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+ //
+ // This could of course be done with only vec_perm or vec_sel,
+ // but since they use different execution units we have found
+ // it to be slightly faster to mix them.
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // v_score_load contains all zeros
+ Fup = vec_sld(v_score_load,F,14);
+ Hup1 = vec_sld(v_score_load,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+
+ query_profile_word += 8*alphabet_size;
+ }
+
+ // find largest score in the v_maxscore vector
+ tmp = vec_sld(v_maxscore,v_maxscore,8);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,4);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,2);
+ v_maxscore = vec_max(v_maxscore,tmp);
+
+ // store in temporary variable
+ vec_ste(v_maxscore,0,&score);
+
+ // return largest score
+ return score;
+}
+
+int
+smith_waterman_altivec_byte(unsigned char * query_sequence,
+ unsigned char * query_profile_byte,
+ int query_length,
+ unsigned char * db_sequence,
+ int db_length,
+ unsigned char bias,
+ unsigned char gap_open,
+ unsigned char gap_extend,
+ struct f_struct * f_str)
+{
+ int i,j,k,k8;
+ int overflow;
+ unsigned char * p;
+ unsigned char score;
+ int alphabet_size = f_str->alphabet_size;
+ unsigned char * workspace = (unsigned char *)f_str->workspace;
+
+ vector unsigned char Fup,Hup1,Hup2,E,F,H,tmp;
+ vector unsigned char perm;
+ vector unsigned char v_maxscore;
+ vector unsigned char v_bias,v_gapopen,v_gapextend;
+ vector unsigned char v_score;
+ vector unsigned char v_score_q1;
+ vector unsigned char v_score_q2;
+ vector unsigned char v_score_q3;
+ vector unsigned char v_score_q4;
+ vector unsigned char v_score_q5;
+ vector unsigned char v_score_load1;
+ vector unsigned char v_score_load2;
+ vector unsigned char v_zero;
+
+ vector unsigned char queue1_to_score = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
+ vector unsigned char queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
+ vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
+ vector unsigned char queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
+ vector unsigned char queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
+ vector unsigned char queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
+ vector unsigned char merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+
+ v_zero = vec_splat_u8(0);
+
+ /* Load the bias to all elements of a constant */
+ v_bias = vec_lde(0,&bias);
+ perm = vec_lvsl(0,&bias);
+ v_bias = vec_perm(v_bias,v_bias,perm);
+ v_bias = vec_splat(v_bias,0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ v_gapopen = vec_lde(0,&gap_open);
+ perm = vec_lvsl(0,&gap_open);
+ v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
+ v_gapopen = vec_splat(v_gapopen,0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ v_gapextend = vec_lde(0,&gap_extend);
+ perm = vec_lvsl(0,&gap_extend);
+ v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
+ v_gapextend = vec_splat(v_gapextend,0);
+
+ v_maxscore = vec_xor(v_maxscore,v_maxscore);
+
+ // Zero out the storage vector
+ k = (db_length+15);
+ for(i=0,j=0;i<k;i++,j+=32)
+ {
+ // borrow the zero value in v_maxscore to have something to store
+ vec_st(v_maxscore,j,workspace);
+ vec_st(v_maxscore,j+16,workspace);
+ }
+
+ for(i=0;i<query_length;i+=16)
+ {
+ // zero lots of stuff.
+ // We use both the VPERM and VSIU unit to knock off some cycles.
+
+ E = vec_splat_u8(0);
+ F = vec_xor(F,F);
+ H = vec_splat_u8(0);
+ Hup2 = vec_xor(Hup2,Hup2);
+ v_score_q1 = vec_splat_u8(0);
+ v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+ v_score_q3 = vec_splat_u8(0);
+ v_score_q4 = vec_xor(v_score_q4,v_score_q4);
+ v_score_q5 = vec_splat_u8(0);
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ // start directly and prefetch score column
+ k = db_sequence[0];
+ k8 = k;
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = v_score_load1;
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // PROLOGUE 1
+ // prefetch next residue
+ k = db_sequence[1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 2
+ // prefetch next residue
+ k = db_sequence[2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 3
+ // prefetch next residue
+ k = db_sequence[3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 4
+ // prefetch next residue
+ k = db_sequence[4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 5
+ // prefetch next residue
+ k = db_sequence[5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 6
+ // prefetch next residue
+ k = db_sequence[6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 7
+ // prefetch next residue
+ k = db_sequence[7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 8
+ // prefetch next residue
+ k = db_sequence[8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 9
+ // prefetch next residue
+ k = db_sequence[9];
+ k8 = db_sequence[1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 10
+ // prefetch next residue
+ k = db_sequence[10];
+ k8 = db_sequence[2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 11
+ // prefetch next residue
+ k = db_sequence[11];
+ k8 = db_sequence[3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 12
+ // prefetch next residue
+ k = db_sequence[12];
+ k8 = db_sequence[4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 13
+ // prefetch next residue
+ k = db_sequence[13];
+ k8 = db_sequence[5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 14
+ // prefetch next residue
+ k = db_sequence[14];
+ k8 = db_sequence[6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 15
+ // prefetch next residue
+ k = db_sequence[15];
+ k8 = db_sequence[7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 16
+ // prefetch next residue
+ k = db_sequence[16];
+ k8 = db_sequence[8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ p = workspace;
+
+ for(j=16;j<db_length;j+=16)
+ {
+ // STEP 1
+
+ // prefetch next residue
+ k = db_sequence[j+1];
+ k8 = db_sequence[j-7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+ // STEP 2
+
+ // prefetch next residue
+ k = db_sequence[j+2];
+ k8 = db_sequence[j-6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 3
+
+ // prefetch next residue
+ k = db_sequence[j+3];
+ k8 = db_sequence[j-5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 4
+
+ // prefetch next residue
+ k = db_sequence[j+4];
+ k8 = db_sequence[j-4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 5
+
+ // prefetch next residue
+ k = db_sequence[j+5];
+ k8 = db_sequence[j-3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 6
+
+ // prefetch next residue
+ k = db_sequence[j+6];
+ k8 = db_sequence[j-2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 7
+
+ // prefetch next residue
+ k = db_sequence[j+7];
+ k8 = db_sequence[j-1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 8
+
+ // prefetch next residue
+ k = db_sequence[j+8];
+ k8 = db_sequence[j];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 9
+
+ // prefetch next residue
+ k = db_sequence[j+9];
+ k8 = db_sequence[j+1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 10
+
+ // prefetch next residue
+ k = db_sequence[j+10];
+ k8 = db_sequence[j+2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 11
+
+ // prefetch next residue
+ k = db_sequence[j+11];
+ k8 = db_sequence[j+3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 12
+
+ // prefetch next residue
+ k = db_sequence[j+12];
+ k8 = db_sequence[j+4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 13
+
+ // prefetch next residue
+ k = db_sequence[j+13];
+ k8 = db_sequence[j+5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 14
+
+ // prefetch next residue
+ k = db_sequence[j+14];
+ k8 = db_sequence[j+6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 15
+
+ // prefetch next residue
+ k = db_sequence[j+15];
+ k8 = db_sequence[j+7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 16
+
+ // prefetch next residue
+ k = db_sequence[j+16];
+ k8 = db_sequence[j+8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ }
+
+ for(;j<db_length+15;j++)
+ {
+ k8 = db_sequence[j-7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+ v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32; // move ahead 32 bytes
+
+ Fup = vec_sld(v_zero,F,15);
+ Hup1 = vec_sld(v_zero,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+ vec_st(F, 512, p);
+ vec_st(H, 528, p);
+
+ query_profile_byte += 16*alphabet_size;
+
+ // End of this row (actually 16 rows due to SIMD).
+ // Before we continue, check for overflow.
+ tmp = vec_subs(vec_splat_u8(-1),v_bias);
+ overflow = vec_any_ge(v_maxscore,tmp);
+
+
+ }
+
+ if(overflow)
+ {
+ return 255;
+ }
+ else
+ {
+ // find largest score in the v_maxscore vector
+ tmp = vec_sld(v_maxscore,v_maxscore,8);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,4);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,2);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,1);
+ v_maxscore = vec_max(v_maxscore,tmp);
+
+ // store in temporary variable
+ vec_ste(v_maxscore,0,&score);
+
+ // return largest score
+ return score;
+ }}
+
+
+#else
+
+/* No Altivec support. Avoid compiler complaints about empty object */
+
+int sw_dummy;
+
+#endif