Delete unneeded directory
[jabaws.git] / website / archive / binaries / mac / src / fasta34 / smith_waterman_altivec.c
diff --git a/website/archive/binaries/mac/src/fasta34/smith_waterman_altivec.c b/website/archive/binaries/mac/src/fasta34/smith_waterman_altivec.c
deleted file mode 100644 (file)
index 122aab2..0000000
+++ /dev/null
@@ -1,3086 +0,0 @@
-
-/* Implementation of the Wozniak "anti-diagonal" vectorization
-   strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
-   Appl. Biosci. 13:145-150
-
-   November, 2004
-*/
-
-/*
-  Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
-  Please send bug reports and/or suggestions to lindahl@sbc.su.se.
-*/
-
-#include <stdio.h>
-
-#include "defs.h"
-#include "param.h"
-#include "dropgsw.h"
-
-#ifdef SW_ALTIVEC
-
-int
-smith_waterman_altivec_word(unsigned char *     query_sequence,
-                            unsigned short *    query_profile_word,
-                            int                 query_length,
-                            unsigned char *     db_sequence,
-                            int                 db_length,
-                            unsigned short      bias,
-                            unsigned short      gap_open,
-                            unsigned short      gap_extend,
-                            struct f_struct *   f_str)
-{
-    int                     i,j,k;
-    unsigned short *        p;
-    unsigned short          score;   
-    unsigned char *         p_dbseq;
-    int                     alphabet_size = f_str->alphabet_size;
-    unsigned short *        workspace     = (unsigned short *)f_str->workspace;
-
-    vector unsigned short   Fup,Hup1,Hup2,E,F,H,tmp;
-    vector unsigned char    perm;
-    vector unsigned short   v_maxscore;
-    vector unsigned short   v_bias,v_gapopen,v_gapextend;
-    vector unsigned short   v_score;
-    vector unsigned short   v_score_q1;
-    vector unsigned short   v_score_q2;
-    vector unsigned short   v_score_q3;
-    vector unsigned short   v_score_load; 
-    vector unsigned char    queue1_to_score  = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
-    vector unsigned char    queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
-    vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
-    vector unsigned char    queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
-        
-    /* Load the bias to all elements of a constant */
-    v_bias           = vec_lde(0,&bias);
-    perm             = vec_lvsl(0,&bias);
-    v_bias           = vec_perm(v_bias,v_bias,perm);
-    v_bias           = vec_splat(v_bias,0);
-    
-    /* Load gap opening penalty to all elements of a constant */
-    v_gapopen        = vec_lde(0,&gap_open);
-    perm             = vec_lvsl(0,&gap_open);
-    v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
-    v_gapopen        = vec_splat(v_gapopen,0);
-
-    /* Load gap extension penalty to all elements of a constant */
-    v_gapextend      = vec_lde(0,&gap_extend);  
-    perm             = vec_lvsl(0,&gap_extend);
-    v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
-    v_gapextend      = vec_splat(v_gapextend,0);
-    
-    v_maxscore = vec_xor(v_maxscore,v_maxscore);
-   
-    // Zero out the storage vector 
-    k = 2*(db_length+7);
-        
-    for(i=0,j=0;i<k;i++,j+=16)
-    {
-        // borrow the zero value in v_maxscore to have something to store
-        vec_st(v_maxscore,j,workspace);
-    }
-    
-    for(i=0;i<query_length;i+=8)
-    {
-        // fetch first data asap.
-        p_dbseq    = db_sequence;
-        k          = *p_dbseq++;
-        v_score_load = vec_ld(16*k,query_profile_word);
-
-        // zero lots of stuff. 
-        // We use both the VPERM and VSIU unit to knock off some cycles.
-        
-        E          = vec_splat_u16(0);
-        F          = vec_xor(F,F);
-        H          = vec_splat_u16(0);
-        Hup2       = vec_xor(Hup2,Hup2);
-        v_score_q1 = vec_splat_u16(0);
-        v_score_q2 = vec_xor(v_score_q2,v_score_q2);
-        v_score_q3 = vec_splat_u16(0);
-
-        // reset pointers to the start of the saved data from the last row
-        p = workspace;
-                
-        // PROLOGUE 1
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        // PROLOGUE 2
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-
-        // PROLOGUE 3
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-
-        // PROLOGUE 4
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-
-        // PROLOGUE 5
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-
-        // PROLOGUE 6
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-
-        
-        // PROLOGUE 7
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-
-        // PROLOGUE 8
-        // prefetch next residue
-        k          = *p_dbseq++;
-        
-        // Create the actual diagonal score vector
-        // and update the queue of incomplete score vectors
-        
-        v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-        v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-        v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-        v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-        
-        // prefetch score for next step 
-        v_score_load = vec_ld(16*k,query_profile_word);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 16; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,14);
-        Hup1   = vec_sld(Hup1,H,14);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Save value to use for next diagonal H 
-        Hup2 = Hup1;
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-    
-
-        // reset pointers to the start of the saved data from the last row
-        p = workspace;
-
-        for(j=8;j<db_length;j+=8)
-        {           
-            // STEP 1
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup1   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup1   = vec_sld(Hup1,H,14);            
-
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 2
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup2   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup2   = vec_sld(Hup2,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-
-            // STEP 3
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup1   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup1   = vec_sld(Hup1,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-            
-            // STEP 4
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup2   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup2   = vec_sld(Hup2,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-
-            // STEP 5
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup1   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup1   = vec_sld(Hup1,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-
-            // STEP 6
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup2   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup2   = vec_sld(Hup2,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-
-            
-            // Update highest score encountered so far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-            
-            // STEP 7
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup1   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup1   = vec_sld(Hup1,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-
-            
-            // Update highest score encountered so far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-
-
-            // STEP 8
-            
-            // prefetch next residue
-            k          = *p_dbseq++;
-            
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors
-            
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // prefetch score for next step
-            v_score_load = vec_ld(16*k,query_profile_word);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(256, p);
-            Hup2   = vec_ld(272, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,14);
-            Hup2   = vec_sld(Hup2,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F); 
-            
-            
-            // Update highest score encountered so far
-            v_maxscore = vec_max(v_maxscore,H);
-        }
-        
-        v_score_load = vec_splat_u16(0);
-        
-        for(;j<db_length+7;j++)
-        {
-            // Create the actual diagonal score vector
-            // and update the queue of incomplete score vectors.
-            //
-            // This could of course be done with either vec_perm or vec_sel.
-            // Mixing the two was reported to be slightly faster because they
-            // use different execution units, but the four updates below
-            // use vec_perm only.
-            v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
-            v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
-            v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
-            v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 16; // move ahead 32 bytes
-            
-            // v_score_load contains all zeros
-            Fup    = vec_sld(v_score_load,F,14);
-            Hup1   = vec_sld(v_score_load,H,14);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Save value to use for next diagonal H 
-            Hup2 = Hup1;
-            
-            // Update highest score encountered so far
-            v_maxscore = vec_max(v_maxscore,H);
-        }
-        vec_st(F, 0,  p);
-        vec_st(H, 16, p);
-
-        query_profile_word += 8*alphabet_size;
-    }
-
-    // find largest score in the v_maxscore vector
-    tmp = vec_sld(v_maxscore,v_maxscore,8);
-    v_maxscore = vec_max(v_maxscore,tmp);
-    tmp = vec_sld(v_maxscore,v_maxscore,4);
-    v_maxscore = vec_max(v_maxscore,tmp);
-    tmp = vec_sld(v_maxscore,v_maxscore,2);
-    v_maxscore = vec_max(v_maxscore,tmp);
-
-    // store in temporary variable
-    vec_ste(v_maxscore,0,&score);
-    
-    // return largest score
-    return score;
-}
-
-int
-smith_waterman_altivec_byte(unsigned char *     query_sequence,
-                            unsigned char *     query_profile_byte,
-                            int                 query_length,
-                            unsigned char *     db_sequence,
-                            int                 db_length,
-                            unsigned char       bias,
-                            unsigned char       gap_open,
-                            unsigned char       gap_extend,
-                            struct f_struct *   f_str)
-{
-    int                     i,j,k,k8;
-    int                     overflow;
-    unsigned char *         p;
-    unsigned char           score;   
-    int                     alphabet_size = f_str->alphabet_size;
-    unsigned char *         workspace     = (unsigned char *)f_str->workspace;
-    
-    vector unsigned char    Fup,Hup1,Hup2,E,F,H,tmp;
-    vector unsigned char    perm;
-    vector unsigned char    v_maxscore;
-    vector unsigned char    v_bias,v_gapopen,v_gapextend;
-    vector unsigned char    v_score;
-    vector unsigned char    v_score_q1;
-    vector unsigned char    v_score_q2;
-    vector unsigned char    v_score_q3;
-    vector unsigned char    v_score_q4;
-    vector unsigned char    v_score_q5;
-    vector unsigned char    v_score_load1;
-    vector unsigned char    v_score_load2;  
-    vector unsigned char    v_zero;  
-
-    vector unsigned char    queue1_to_score  = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
-    vector unsigned char    queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
-    vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
-    vector unsigned char    queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
-    vector unsigned char    queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
-    vector unsigned char    queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
-    vector unsigned char    merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
-
-    v_zero           = vec_splat_u8(0);
-        
-    /* Load the bias to all elements of a constant */
-    v_bias           = vec_lde(0,&bias);
-    perm             = vec_lvsl(0,&bias);
-    v_bias           = vec_perm(v_bias,v_bias,perm);
-    v_bias           = vec_splat(v_bias,0);
-    
-    /* Load gap opening penalty to all elements of a constant */
-    v_gapopen        = vec_lde(0,&gap_open);
-    perm             = vec_lvsl(0,&gap_open);
-    v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
-    v_gapopen        = vec_splat(v_gapopen,0);
-
-    /* Load gap extension penalty to all elements of a constant */
-    v_gapextend      = vec_lde(0,&gap_extend);  
-    perm             = vec_lvsl(0,&gap_extend);
-    v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
-    v_gapextend      = vec_splat(v_gapextend,0);
-    
-    v_maxscore = vec_xor(v_maxscore,v_maxscore);
-   
-    // Zero out the storage vector 
-    k = (db_length+15);
-    for(i=0,j=0;i<k;i++,j+=32)
-    {
-        // borrow the zero value in v_maxscore to have something to store
-        vec_st(v_maxscore,j,workspace);
-        vec_st(v_maxscore,j+16,workspace);
-    }
-    
-    for(i=0;i<query_length;i+=16)
-    {
-        // zero lots of stuff. 
-        // We use both the VPERM and VSIU unit to knock off some cycles.
-        
-        E          = vec_splat_u8(0);
-        F          = vec_xor(F,F);
-        H          = vec_splat_u8(0);
-        Hup2      = vec_xor(Hup2,Hup2);
-        v_score_q1 = vec_splat_u8(0);
-        v_score_q2 = vec_xor(v_score_q2,v_score_q2);
-        v_score_q3 = vec_splat_u8(0);
-        v_score_q4 = vec_xor(v_score_q4,v_score_q4);
-        v_score_q5 = vec_splat_u8(0);
-
-        // reset pointers to the start of the saved data from the last row
-        p = workspace;
-        
-        // start directly and prefetch score column
-        k             = db_sequence[0];
-        k8            = k;
-        v_score_load1 = vec_ld(16*k,query_profile_byte);
-        v_score_load2 = v_score_load1;
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-
-        // PROLOGUE 1
-        // prefetch next residue
-        k                = db_sequence[1];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        
-        // PROLOGUE 2
-        // prefetch next residue
-        k                = db_sequence[2];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-  
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-     
-        
-        // PROLOGUE 3
-        // prefetch next residue
-        k                = db_sequence[3];
-  
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        // PROLOGUE 4
-        // prefetch next residue
-        k                = db_sequence[4];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        // PROLOGUE 5
-        // prefetch next residue
-        k                = db_sequence[5];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-     
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        // PROLOGUE 6
-        // prefetch next residue
-        k                = db_sequence[6];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 7
-        // prefetch next residue
-        k                = db_sequence[7];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 8
-        // prefetch next residue
-        k                = db_sequence[8];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        
-        // PROLOGUE 9
-        // prefetch next residue
-        k                = db_sequence[9];
-        k8               = db_sequence[1];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1    = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 10
-        // prefetch next residue
-        k                = db_sequence[10];
-        k8               = db_sequence[2];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered so far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        
-        // PROLOGUE 11
-        // prefetch next residue
-        k                = db_sequence[11];
-        k8               = db_sequence[3];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1    = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 12
-        // prefetch next residue
-        k                = db_sequence[12];
-        k8               = db_sequence[4];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        
-        // PROLOGUE 13
-        // prefetch next residue
-        k                = db_sequence[13];
-        k8               = db_sequence[5];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1    = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 14
-        // prefetch next residue
-        k                = db_sequence[14];
-        k8               = db_sequence[6];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 15
-        // prefetch next residue
-        k                = db_sequence[15];
-        k8               = db_sequence[7];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup1    = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup1    = vec_sld(Hup1,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup1,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup2,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        
-        
-        // PROLOGUE 16
-        // prefetch next residue
-        k                = db_sequence[16];
-        k8               = db_sequence[8];
-        
-        v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-        v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-        v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-        v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-        v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-        v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-        
-
-        // prefetch score for next step 
-        v_score_load1 = vec_ld(16*k,query_profile_byte);            
-        v_score_load2 = vec_ld(16*k8,query_profile_byte);
-        
-        // load values of F and H from previous row (one unit up)
-        Fup    = vec_ld(0,  p);
-        Hup2   = vec_ld(16, p);
-        p += 32; // move ahead 32 bytes
-        
-        // shift into place so we have complete F and H vectors
-        // that refer to the values one unit up from each cell
-        // that we are currently working on.
-        Fup    = vec_sld(Fup,F,15);
-        Hup2   = vec_sld(Hup2,H,15);            
-        
-        // do the dynamic programming 
-        
-        // update E value
-        E   = vec_subs(E,v_gapextend);
-        tmp = vec_subs(H,v_gapopen);
-        E   = vec_max(E,tmp);
-        
-        // update F value
-        F   = vec_subs(Fup,v_gapextend);
-        tmp = vec_subs(Hup2,v_gapopen);
-        F   = vec_max(F,tmp);
-        
-        v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-        
-        // add score to H
-        H   = vec_adds(Hup1,v_score);
-        H   = vec_subs(H,v_bias);
-        
-        // set H to max of H,E,F
-        H   = vec_max(H,E);
-        H   = vec_max(H,F);
-        
-        // Update highest score encountered this far
-        v_maxscore = vec_max(v_maxscore,H);
-        
-        p = workspace;
-        
-        for(j=16;j<db_length;j+=16)
-        { 
-            // STEP 1
-            
-            // prefetch next residue 
-            k                = db_sequence[j+1];
-            k8               = db_sequence[j-7];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-       
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1    = vec_sld(Hup1,H,15);            
-
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-          
-
-            
-            
-            
-            // STEP 2
-            
-            // prefetch next residue
-            k                = db_sequence[j+2];
-            k8               = db_sequence[j-6];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-
-            
-            
-            
-            // STEP 3
-            
-            // prefetch next residue
-            k                = db_sequence[j+3];
-            k8               = db_sequence[j-5];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1    = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-      
-            
-
-            
-            
-            // STEP 4
-            
-            // prefetch next residue
-            k                = db_sequence[j+4];
-            k8               = db_sequence[j-4];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-            
-
-            
-            
-            // STEP 5
-            
-            // prefetch next residue
-            k                = db_sequence[j+5];
-            k8               = db_sequence[j-3];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1    = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1   = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-
-            
-            
-            
-            // STEP 6
-            
-            // prefetch next residue
-            k                = db_sequence[j+6];
-            k8               = db_sequence[j-2];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-
-            
-            
-            
-            // STEP 7
-            
-            // prefetch next residue
-            k                = db_sequence[j+7];
-            k8               = db_sequence[j-1];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1    = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1    = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-            
-
-            
-            
-            // STEP 8
-            
-            // prefetch next residue
-            k                = db_sequence[j+8];
-            k8               = db_sequence[j];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            
-            
-
-            
-            
-            // STEP 9
-            
-            // prefetch next residue
-            k                = db_sequence[j+9];
-            k8               = db_sequence[j+1];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1   = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 10
-            
-            // prefetch next residue
-            k                = db_sequence[j+10];
-            k8               = db_sequence[j+2];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-        
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 11
-            
-            // prefetch next residue
-            k                = db_sequence[j+11];
-            k8               = db_sequence[j+3];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1   = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 12
-            
-            // prefetch next residue
-            k                = db_sequence[j+12];
-            k8               = db_sequence[j+4];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 13
-            
-            // prefetch next residue
-            k                = db_sequence[j+13];
-            k8               = db_sequence[j+5];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1   = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 14
-            
-            // prefetch next residue
-            k                = db_sequence[j+14];
-            k8               = db_sequence[j+6];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 15
-            
-            // prefetch next residue
-            k                = db_sequence[j+15];
-            k8               = db_sequence[j+7];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-                        
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup1   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup1   = vec_sld(Hup1,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-            // STEP 16
-            
-            // prefetch next residue
-            k                = db_sequence[j+16];
-            k8               = db_sequence[j+8];
-            
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load1 = vec_ld(16*k,query_profile_byte);
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            
-            // load values of F and H from previous row (one unit up)
-            Fup    = vec_ld(512, p);
-            Hup2   = vec_ld(528, p);
-            
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32;
-            
-            // shift into place so we have complete F and H vectors
-            // that refer to the values one unit up from each cell
-            // that we are currently working on.
-            Fup    = vec_sld(Fup,F,15);
-            Hup2   = vec_sld(Hup2,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup2,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
-            
-            // add score to H
-            H   = vec_adds(Hup1,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-            
-        }
-        
-        for(;j<db_length+15;j++)
-        {
-            k8               = db_sequence[j-7];
-
-            v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
-            v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
-            v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
-            v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
-            v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
-            v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
-            
-            
-            // prefetch scores for next step
-            v_score_load2 = vec_ld(16*k8,query_profile_byte);
-            v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
-
-            // save old values of F and H to use on next row
-            vec_st(F, 0,  p);
-            vec_st(H, 16, p);
-            p += 32; // move ahead 32 bytes
-            
-            Fup    = vec_sld(v_zero,F,15);
-            Hup1   = vec_sld(v_zero,H,15);            
-            
-            // do the dynamic programming 
-            
-            // update E value
-            E   = vec_subs(E,v_gapextend);
-            tmp = vec_subs(H,v_gapopen);
-            E   = vec_max(E,tmp);
-            
-            // update F value
-            F   = vec_subs(Fup,v_gapextend);
-            tmp = vec_subs(Hup1,v_gapopen);
-            F   = vec_max(F,tmp);
-            
-            // add score to H
-            H   = vec_adds(Hup2,v_score);
-            H   = vec_subs(H,v_bias);
-            
-            // set H to max of H,E,F
-            H   = vec_max(H,E);
-            H   = vec_max(H,F);
-            
-            // Save value to use for next diagonal H 
-            Hup2 = Hup1;
-
-            // Update highest score encountered this far
-            v_maxscore = vec_max(v_maxscore,H);
-        }
-        vec_st(F, 512, p);
-        vec_st(H, 528, p);
-
-        query_profile_byte += 16*alphabet_size;
-
-        // End of this row (actually 16 rows due to SIMD).
-        // Before we continue, check for overflow.
-        tmp      = vec_subs(vec_splat_u8(-1),v_bias);
-        overflow = vec_any_ge(v_maxscore,tmp);
-        
-
-    }
-
-    if(overflow)
-    {
-        return 255;
-    }
-    else
-    {
-        // find largest score in the v_maxscore vector
-        tmp = vec_sld(v_maxscore,v_maxscore,8);
-        v_maxscore = vec_max(v_maxscore,tmp);
-        tmp = vec_sld(v_maxscore,v_maxscore,4);
-        v_maxscore = vec_max(v_maxscore,tmp);
-        tmp = vec_sld(v_maxscore,v_maxscore,2);
-        v_maxscore = vec_max(v_maxscore,tmp);
-        tmp = vec_sld(v_maxscore,v_maxscore,1);
-        v_maxscore = vec_max(v_maxscore,tmp);
-        
-        // store in temporary variable
-        vec_ste(v_maxscore,0,&score);
-        
-        // return largest score
-        return score;
-    }}
-
-
-#else
-
-/* No Altivec support (SW_ALTIVEC not defined): define a dummy symbol so this file is not an empty object, which avoids compiler complaints. */
-
-int sw_dummy;
-
-#endif