Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / fasta34 / smith_waterman_altivec.c
diff --git a/website/archive/binaries/mac/src/fasta34/smith_waterman_altivec.c b/website/archive/binaries/mac/src/fasta34/smith_waterman_altivec.c
new file mode 100644 (file)
index 0000000..122aab2
--- /dev/null
@@ -0,0 +1,3086 @@
+
+/* Implementation of the Wozniak "anti-diagonal" vectorization
+   strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
+   Appl. Biosci. 13:145-150
+
+   November, 2004
+*/
+
+/*
+  Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
+  Please send bug reports and/or suggestions to lindahl@sbc.su.se.
+*/
+
+#include <stdio.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw.h"
+
+#ifdef SW_ALTIVEC
+
+int
+smith_waterman_altivec_word(unsigned char *     query_sequence,
+                            unsigned short *    query_profile_word,
+                            int                 query_length,
+                            unsigned char *     db_sequence,
+                            int                 db_length,
+                            unsigned short      bias,
+                            unsigned short      gap_open,
+                            unsigned short      gap_extend,
+                            struct f_struct *   f_str)
+{
+    int                     i,j,k;
+    unsigned short *        p;
+    unsigned short          score;   
+    unsigned char *         p_dbseq;
+    int                     alphabet_size = f_str->alphabet_size;
+    unsigned short *        workspace     = (unsigned short *)f_str->workspace;
+
+    vector unsigned short   Fup,Hup1,Hup2,E,F,H,tmp;
+    vector unsigned char    perm;
+    vector unsigned short   v_maxscore;
+    vector unsigned short   v_bias,v_gapopen,v_gapextend;
+    vector unsigned short   v_score;
+    vector unsigned short   v_score_q1;
+    vector unsigned short   v_score_q2;
+    vector unsigned short   v_score_q3;
+    vector unsigned short   v_score_load; 
+    vector unsigned char    queue1_to_score  = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+    vector unsigned char    queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
+    vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
+    vector unsigned char    queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
+        
+    /* Load the bias to all elements of a constant */
+    v_bias           = vec_lde(0,&bias);
+    perm             = vec_lvsl(0,&bias);
+    v_bias           = vec_perm(v_bias,v_bias,perm);
+    v_bias           = vec_splat(v_bias,0);
+    
+    /* Load gap opening penalty to all elements of a constant */
+    v_gapopen        = vec_lde(0,&gap_open);
+    perm             = vec_lvsl(0,&gap_open);
+    v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
+    v_gapopen        = vec_splat(v_gapopen,0);
+
+    /* Load gap extension penalty to all elements of a constant */
+    v_gapextend      = vec_lde(0,&gap_extend);  
+    perm             = vec_lvsl(0,&gap_extend);
+    v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
+    v_gapextend      = vec_splat(v_gapextend,0);
+    
+    v_maxscore = vec_xor(v_maxscore,v_maxscore);
+   
+    // Zero out the storage vector 
+    k = 2*(db_length+7);
+        
+    for(i=0,j=0;i<k;i++,j+=16)
+    {
+        // borrow the zero value in v_maxscore to have something to store
+        vec_st(v_maxscore,j,workspace);
+    }
+    
+    for(i=0;i<query_length;i+=8)
+    {
+        // fetch first data asap.
+        p_dbseq    = db_sequence;
+        k          = *p_dbseq++;
+        v_score_load = vec_ld(16*k,query_profile_word);
+
+        // zero lots of stuff. 
+        // We use both the VPERM and VSIU unit to knock off some cycles.
+        
+        E          = vec_splat_u16(0);
+        F          = vec_xor(F,F);
+        H          = vec_splat_u16(0);
+        Hup2       = vec_xor(Hup2,Hup2);
+        v_score_q1 = vec_splat_u16(0);
+        v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+        v_score_q3 = vec_splat_u16(0);
+
+        // reset pointers to the start of the saved data from the last row
+        p = workspace;
+                
+        // PROLOGUE 1
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        // PROLOGUE 2
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+
+        // PROLOGUE 3
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+
+        // PROLOGUE 4
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+
+        // PROLOGUE 5
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+
+        // PROLOGUE 6
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+
+        
+        // PROLOGUE 7
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+
+        // PROLOGUE 8
+        // prefetch next residue
+        k          = *p_dbseq++;
+        
+        // Create the actual diagonal score vector
+        // and update the queue of incomplete score vectors
+        
+        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+        
+        // prefetch score for next step 
+        v_score_load = vec_ld(16*k,query_profile_word);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 16; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,14);
+        Hup1   = vec_sld(Hup1,H,14);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Save value to use for next diagonal H 
+        Hup2 = Hup1;
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+    
+
+        // reset pointers to the start of the saved data from the last row
+        p = workspace;
+
+        for(j=8;j<db_length;j+=8)
+        {           
+            // STEP 1
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup1   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup1   = vec_sld(Hup1,H,14);            
+
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 2
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup2   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup2   = vec_sld(Hup2,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+
+            // STEP 3
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup1   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup1   = vec_sld(Hup1,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+            
+            // STEP 4
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup2   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup2   = vec_sld(Hup2,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+
+            // STEP 5
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup1   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup1   = vec_sld(Hup1,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+
+            // STEP 6
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup2   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup2   = vec_sld(Hup2,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+            
+            // STEP 7
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup1   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup1   = vec_sld(Hup1,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+
+
+            // STEP 8
+            
+            // prefetch next residue
+            k          = *p_dbseq++;
+            
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // prefetch score for next step
+            v_score_load = vec_ld(16*k,query_profile_word);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(256, p);
+            Hup2   = vec_ld(272, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,14);
+            Hup2   = vec_sld(Hup2,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F); 
+            
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+        }
+        
+        v_score_load = vec_splat_u16(0);
+        
+        for(;j<db_length+7;j++)
+        {
+            // Create the actual diagonal score vector
+            // and update the queue of incomplete score vectors
+            //
+            // This could of course be done with only vec_perm or vec_sel,
+            // but since they use different execution units we have found
+            // it to be slightly faster to mix them.
+            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 16; // move ahead 32 bytes
+            
+            // v_score_load contains all zeros
+            Fup    = vec_sld(v_score_load,F,14);
+            Hup1   = vec_sld(v_score_load,H,14);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Save value to use for next diagonal H 
+            Hup2 = Hup1;
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+        }
+        vec_st(F, 0,  p);
+        vec_st(H, 16, p);
+
+        query_profile_word += 8*alphabet_size;
+    }
+
+    // find largest score in the v_maxscore vector
+    tmp = vec_sld(v_maxscore,v_maxscore,8);
+    v_maxscore = vec_max(v_maxscore,tmp);
+    tmp = vec_sld(v_maxscore,v_maxscore,4);
+    v_maxscore = vec_max(v_maxscore,tmp);
+    tmp = vec_sld(v_maxscore,v_maxscore,2);
+    v_maxscore = vec_max(v_maxscore,tmp);
+
+    // store in temporary variable
+    vec_ste(v_maxscore,0,&score);
+    
+    // return largest score
+    return score;
+}
+
+int
+smith_waterman_altivec_byte(unsigned char *     query_sequence,
+                            unsigned char *     query_profile_byte,
+                            int                 query_length,
+                            unsigned char *     db_sequence,
+                            int                 db_length,
+                            unsigned char       bias,
+                            unsigned char       gap_open,
+                            unsigned char       gap_extend,
+                            struct f_struct *   f_str)
+{
+    int                     i,j,k,k8;
+    int                     overflow;
+    unsigned char *         p;
+    unsigned char           score;   
+    int                     alphabet_size = f_str->alphabet_size;
+    unsigned char *         workspace     = (unsigned char *)f_str->workspace;
+    
+    vector unsigned char    Fup,Hup1,Hup2,E,F,H,tmp;
+    vector unsigned char    perm;
+    vector unsigned char    v_maxscore;
+    vector unsigned char    v_bias,v_gapopen,v_gapextend;
+    vector unsigned char    v_score;
+    vector unsigned char    v_score_q1;
+    vector unsigned char    v_score_q2;
+    vector unsigned char    v_score_q3;
+    vector unsigned char    v_score_q4;
+    vector unsigned char    v_score_q5;
+    vector unsigned char    v_score_load1;
+    vector unsigned char    v_score_load2;  
+    vector unsigned char    v_zero;  
+
+    vector unsigned char    queue1_to_score  = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
+    vector unsigned char    queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
+    vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
+    vector unsigned char    queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
+    vector unsigned char    queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
+    vector unsigned char    queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
+    vector unsigned char    merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+
+    v_zero           = vec_splat_u8(0);
+        
+    /* Load the bias to all elements of a constant */
+    v_bias           = vec_lde(0,&bias);
+    perm             = vec_lvsl(0,&bias);
+    v_bias           = vec_perm(v_bias,v_bias,perm);
+    v_bias           = vec_splat(v_bias,0);
+    
+    /* Load gap opening penalty to all elements of a constant */
+    v_gapopen        = vec_lde(0,&gap_open);
+    perm             = vec_lvsl(0,&gap_open);
+    v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
+    v_gapopen        = vec_splat(v_gapopen,0);
+
+    /* Load gap extension penalty to all elements of a constant */
+    v_gapextend      = vec_lde(0,&gap_extend);  
+    perm             = vec_lvsl(0,&gap_extend);
+    v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
+    v_gapextend      = vec_splat(v_gapextend,0);
+    
+    v_maxscore = vec_xor(v_maxscore,v_maxscore);
+   
+    // Zero out the storage vector 
+    k = (db_length+15);
+    for(i=0,j=0;i<k;i++,j+=32)
+    {
+        // borrow the zero value in v_maxscore to have something to store
+        vec_st(v_maxscore,j,workspace);
+        vec_st(v_maxscore,j+16,workspace);
+    }
+    
+    for(i=0;i<query_length;i+=16)
+    {
+        // zero lots of stuff. 
+        // We use both the VPERM and VSIU unit to knock off some cycles.
+        
+        E          = vec_splat_u8(0);
+        F          = vec_xor(F,F);
+        H          = vec_splat_u8(0);
+        Hup2      = vec_xor(Hup2,Hup2);
+        v_score_q1 = vec_splat_u8(0);
+        v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+        v_score_q3 = vec_splat_u8(0);
+        v_score_q4 = vec_xor(v_score_q4,v_score_q4);
+        v_score_q5 = vec_splat_u8(0);
+
+        // reset pointers to the start of the saved data from the last row
+        p = workspace;
+        
+        // start directly and prefetch score column
+        k             = db_sequence[0];
+        k8            = k;
+        v_score_load1 = vec_ld(16*k,query_profile_byte);
+        v_score_load2 = v_score_load1;
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+        // PROLOGUE 1
+        // prefetch next residue
+        k                = db_sequence[1];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        
+        // PROLOGUE 2
+        // prefetch next residue
+        k                = db_sequence[2];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+  
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+     
+        
+        // PROLOGUE 3
+        // prefetch next residue
+        k                = db_sequence[3];
+  
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        // PROLOGUE 4
+        // prefetch next residue
+        k                = db_sequence[4];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        // PROLOGUE 5
+        // prefetch next residue
+        k                = db_sequence[5];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+     
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        // PROLOGUE 6
+        // prefetch next residue
+        k                = db_sequence[6];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 7
+        // prefetch next residue
+        k                = db_sequence[7];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 8
+        // prefetch next residue
+        k                = db_sequence[8];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        
+        // PROLOGUE 9
+        // prefetch next residue
+        k                = db_sequence[9];
+        k8               = db_sequence[1];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1    = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 10
+        // prefetch next residue
+        k                = db_sequence[10];
+        k8               = db_sequence[2];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered this far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        
+        // PROLOGUE 11
+        // prefetch next residue
+        k                = db_sequence[11];
+        k8               = db_sequence[3];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1    = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 12
+        // prefetch next residue
+        k                = db_sequence[12];
+        k8               = db_sequence[4];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        
+        // PROLOGUE 13
+        // prefetch next residue
+        k                = db_sequence[13];
+        k8               = db_sequence[5];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1    = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 14
+        // prefetch next residue
+        k                = db_sequence[14];
+        k8               = db_sequence[6];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 15
+        // prefetch next residue
+        k                = db_sequence[15];
+        k8               = db_sequence[7];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup1    = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup1    = vec_sld(Hup1,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup1,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup2,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        
+        
+        // PROLOGUE 16
+        // prefetch next residue
+        k                = db_sequence[16];
+        k8               = db_sequence[8];
+        
+        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+        
+
+        // prefetch score for next step 
+        v_score_load1 = vec_ld(16*k,query_profile_byte);            
+        v_score_load2 = vec_ld(16*k8,query_profile_byte);
+        
+        // load values of F and H from previous row (one unit up)
+        Fup    = vec_ld(0,  p);
+        Hup2   = vec_ld(16, p);
+        p += 32; // move ahead 32 bytes
+        
+        // shift into place so we have complete F and H vectors
+        // that refer to the values one unit up from each cell
+        // that we are currently working on.
+        Fup    = vec_sld(Fup,F,15);
+        Hup2   = vec_sld(Hup2,H,15);            
+        
+        // do the dynamic programming 
+        
+        // update E value
+        E   = vec_subs(E,v_gapextend);
+        tmp = vec_subs(H,v_gapopen);
+        E   = vec_max(E,tmp);
+        
+        // update F value
+        F   = vec_subs(Fup,v_gapextend);
+        tmp = vec_subs(Hup2,v_gapopen);
+        F   = vec_max(F,tmp);
+        
+        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+        
+        // add score to H
+        H   = vec_adds(Hup1,v_score);
+        H   = vec_subs(H,v_bias);
+        
+        // set H to max of H,E,F
+        H   = vec_max(H,E);
+        H   = vec_max(H,F);
+        
+        // Update highest score encountered so far
+        v_maxscore = vec_max(v_maxscore,H);
+        
+        p = workspace;
+        
+        for(j=16;j<db_length;j+=16)
+        { 
+            // STEP 1
+            
+            // prefetch next residue 
+            k                = db_sequence[j+1];
+            k8               = db_sequence[j-7];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+       
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1    = vec_sld(Hup1,H,15);            
+
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+          
+
+            
+            
+            
+            // STEP 2
+            
+            // prefetch next residue
+            k                = db_sequence[j+2];
+            k8               = db_sequence[j-6];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+
+            
+            
+            
+            // STEP 3
+            
+            // prefetch next residue
+            k                = db_sequence[j+3];
+            k8               = db_sequence[j-5];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1    = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+      
+            
+
+            
+            
+            // STEP 4
+            
+            // prefetch next residue
+            k                = db_sequence[j+4];
+            k8               = db_sequence[j-4];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+            
+
+            
+            
+            // STEP 5
+            
+            // prefetch next residue
+            k                = db_sequence[j+5];
+            k8               = db_sequence[j-3];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1    = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1   = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+
+            
+            
+            
+            // STEP 6
+            
+            // prefetch next residue
+            k                = db_sequence[j+6];
+            k8               = db_sequence[j-2];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+
+            
+            
+            
+            // STEP 7
+            
+            // prefetch next residue
+            k                = db_sequence[j+7];
+            k8               = db_sequence[j-1];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1    = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1    = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+            
+
+            
+            
+            // STEP 8
+            
+            // prefetch next residue
+            k                = db_sequence[j+8];
+            k8               = db_sequence[j];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered so far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            
+            
+
+            
+            
+            // STEP 9
+            
+            // prefetch next residue
+            k                = db_sequence[j+9];
+            k8               = db_sequence[j+1];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1   = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 10
+            
+            // prefetch next residue
+            k                = db_sequence[j+10];
+            k8               = db_sequence[j+2];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+        
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 11
+            
+            // prefetch next residue
+            k                = db_sequence[j+11];
+            k8               = db_sequence[j+3];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1   = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 12
+            
+            // prefetch next residue
+            k                = db_sequence[j+12];
+            k8               = db_sequence[j+4];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 13
+            
+            // prefetch next residue
+            k                = db_sequence[j+13];
+            k8               = db_sequence[j+5];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1   = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 14
+            
+            // prefetch next residue
+            k                = db_sequence[j+14];
+            k8               = db_sequence[j+6];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 15
+            
+            // prefetch next residue
+            k                = db_sequence[j+15];
+            k8               = db_sequence[j+7];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+                        
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup1   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup1   = vec_sld(Hup1,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+            // STEP 16
+            
+            // prefetch next residue
+            k                = db_sequence[j+16];
+            k8               = db_sequence[j+8];
+            
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load1 = vec_ld(16*k,query_profile_byte);
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            
+            // load values of F and H from previous row (one unit up)
+            Fup    = vec_ld(512, p);
+            Hup2   = vec_ld(528, p);
+            
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32;
+            
+            // shift into place so we have complete F and H vectors
+            // that refer to the values one unit up from each cell
+            // that we are currently working on.
+            Fup    = vec_sld(Fup,F,15);
+            Hup2   = vec_sld(Hup2,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup2,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+            
+            // add score to H
+            H   = vec_adds(Hup1,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+            
+        }
+        
+        for(;j<db_length+15;j++)
+        {
+            k8               = db_sequence[j-7];
+
+            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
+            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
+            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
+            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
+            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
+            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
+            
+            
+            // prefetch scores for next step
+            v_score_load2 = vec_ld(16*k8,query_profile_byte);
+            v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
+
+            // save old values of F and H to use on next row
+            vec_st(F, 0,  p);
+            vec_st(H, 16, p);
+            p += 32; // move ahead 32 bytes
+            
+            Fup    = vec_sld(v_zero,F,15);
+            Hup1   = vec_sld(v_zero,H,15);            
+            
+            // do the dynamic programming 
+            
+            // update E value
+            E   = vec_subs(E,v_gapextend);
+            tmp = vec_subs(H,v_gapopen);
+            E   = vec_max(E,tmp);
+            
+            // update F value
+            F   = vec_subs(Fup,v_gapextend);
+            tmp = vec_subs(Hup1,v_gapopen);
+            F   = vec_max(F,tmp);
+            
+            // add score to H
+            H   = vec_adds(Hup2,v_score);
+            H   = vec_subs(H,v_bias);
+            
+            // set H to max of H,E,F
+            H   = vec_max(H,E);
+            H   = vec_max(H,F);
+            
+            // Save value to use for next diagonal H 
+            Hup2 = Hup1;
+
+            // Update highest score encountered this far
+            v_maxscore = vec_max(v_maxscore,H);
+        }
+        vec_st(F, 512, p);
+        vec_st(H, 528, p);
+
+        query_profile_byte += 16*alphabet_size;
+
+        // End of this row (actually 16 rows due to SIMD).
+        // Before we continue, check for overflow.
+        tmp      = vec_subs(vec_splat_u8(-1),v_bias);
+        overflow = vec_any_ge(v_maxscore,tmp);
+        
+
+    }
+
+    if(overflow)
+    {
+        return 255;
+    }
+    else
+    {
+        // find largest score in the v_maxscore vector
+        tmp = vec_sld(v_maxscore,v_maxscore,8);
+        v_maxscore = vec_max(v_maxscore,tmp);
+        tmp = vec_sld(v_maxscore,v_maxscore,4);
+        v_maxscore = vec_max(v_maxscore,tmp);
+        tmp = vec_sld(v_maxscore,v_maxscore,2);
+        v_maxscore = vec_max(v_maxscore,tmp);
+        tmp = vec_sld(v_maxscore,v_maxscore,1);
+        v_maxscore = vec_max(v_maxscore,tmp);
+        
+        // store in temporary variable
+        vec_ste(v_maxscore,0,&score);
+        
+        // return largest score
+        return score;
+    }}
+
+
+#else
+
+/* No Altivec support: this is the #else branch, compiled when the Altivec
+   implementation is left out. The dummy global below exists only so the
+   object is not empty, which avoids compiler complaints about an empty object. */
+
+int sw_dummy;
+
+#endif