Next version of JABA
[jabaws.git] / binaries / src / fasta34 / smith_waterman_altivec.c
1
2 /* Implementation of the Wozniak "anti-diagonal" vectorization
3    strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
4    Appl. Biosci. 13:145-150
5
6    November, 2004
7 */
8
9 /*
10   Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
11   Please send bug reports and/or suggestions to lindahl@sbc.su.se.
12 */
13
14 #include <stdio.h>
15
16 #include "defs.h"
17 #include "param.h"
18 #include "dropgsw.h"
19
20 #ifdef SW_ALTIVEC
21
22 int
23 smith_waterman_altivec_word(unsigned char *     query_sequence,
24                             unsigned short *    query_profile_word,
25                             int                 query_length,
26                             unsigned char *     db_sequence,
27                             int                 db_length,
28                             unsigned short      bias,
29                             unsigned short      gap_open,
30                             unsigned short      gap_extend,
31                             struct f_struct *   f_str)
32 {
33     int                     i,j,k;
34     unsigned short *        p;
35     unsigned short          score;   
36     unsigned char *         p_dbseq;
37     int                     alphabet_size = f_str->alphabet_size;
38     unsigned short *        workspace     = (unsigned short *)f_str->workspace;
39
40     vector unsigned short   Fup,Hup1,Hup2,E,F,H,tmp;
41     vector unsigned char    perm;
42     vector unsigned short   v_maxscore;
43     vector unsigned short   v_bias,v_gapopen,v_gapextend;
44     vector unsigned short   v_score;
45     vector unsigned short   v_score_q1;
46     vector unsigned short   v_score_q2;
47     vector unsigned short   v_score_q3;
48     vector unsigned short   v_score_load; 
49     vector unsigned char    queue1_to_score  = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
50     vector unsigned char    queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
51     vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
52     vector unsigned char    queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
53         
54     /* Load the bias to all elements of a constant */
55     v_bias           = vec_lde(0,&bias);
56     perm             = vec_lvsl(0,&bias);
57     v_bias           = vec_perm(v_bias,v_bias,perm);
58     v_bias           = vec_splat(v_bias,0);
59     
60     /* Load gap opening penalty to all elements of a constant */
61     v_gapopen        = vec_lde(0,&gap_open);
62     perm             = vec_lvsl(0,&gap_open);
63     v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
64     v_gapopen        = vec_splat(v_gapopen,0);
65
66     /* Load gap extension penalty to all elements of a constant */
67     v_gapextend      = vec_lde(0,&gap_extend);  
68     perm             = vec_lvsl(0,&gap_extend);
69     v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
70     v_gapextend      = vec_splat(v_gapextend,0);
71     
72     v_maxscore = vec_xor(v_maxscore,v_maxscore);
73    
74     // Zero out the storage vector 
75     k = 2*(db_length+7);
76         
77     for(i=0,j=0;i<k;i++,j+=16)
78     {
79         // borrow the zero value in v_maxscore to have something to store
80         vec_st(v_maxscore,j,workspace);
81     }
82     
83     for(i=0;i<query_length;i+=8)
84     {
85         // fetch first data asap.
86         p_dbseq    = db_sequence;
87         k          = *p_dbseq++;
88         v_score_load = vec_ld(16*k,query_profile_word);
89
90         // zero lots of stuff. 
91         // We use both the VPERM and VSIU unit to knock off some cycles.
92         
93         E          = vec_splat_u16(0);
94         F          = vec_xor(F,F);
95         H          = vec_splat_u16(0);
96         Hup2       = vec_xor(Hup2,Hup2);
97         v_score_q1 = vec_splat_u16(0);
98         v_score_q2 = vec_xor(v_score_q2,v_score_q2);
99         v_score_q3 = vec_splat_u16(0);
100
101         // reset pointers to the start of the saved data from the last row
102         p = workspace;
103                 
104         // PROLOGUE 1
105         // prefetch next residue
106         k          = *p_dbseq++;
107         
108         // Create the actual diagonal score vector
109         // and update the queue of incomplete score vectors
110         
111         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
112         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
113         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
114         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
115         
116         // prefetch score for next step 
117         v_score_load = vec_ld(16*k,query_profile_word);            
118         
119         // load values of F and H from previous row (one unit up)
120         Fup    = vec_ld(0,  p);
121         Hup1   = vec_ld(16, p);
122         p += 16; // move ahead 32 bytes
123         
124         // shift into place so we have complete F and H vectors
125         // that refer to the values one unit up from each cell
126         // that we are currently working on.
127         Fup    = vec_sld(Fup,F,14);
128         Hup1   = vec_sld(Hup1,H,14);            
129         
130         // do the dynamic programming 
131
132         // update E value
133         E   = vec_subs(E,v_gapextend);
134         tmp = vec_subs(H,v_gapopen);
135         E   = vec_max(E,tmp);
136         
137         // update F value
138         F   = vec_subs(Fup,v_gapextend);
139         tmp = vec_subs(Hup1,v_gapopen);
140         F   = vec_max(F,tmp);
141         
142         // add score to H
143         H   = vec_adds(Hup2,v_score);
144         H   = vec_subs(H,v_bias);
145         
146         // set H to max of H,E,F
147         H   = vec_max(H,E);
148         H   = vec_max(H,F);
149         
150         // Save value to use for next diagonal H 
151         Hup2 = Hup1;
152         
153         // Update highest score encountered thus far
154         v_maxscore = vec_max(v_maxscore,H);
155         
156         
157         // PROLOGUE 2
158         // prefetch next residue
159         k          = *p_dbseq++;
160         
161         // Create the actual diagonal score vector
162         // and update the queue of incomplete score vectors
163         
164         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
165         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
166         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
167         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
168         
169         // prefetch score for next step 
170         v_score_load = vec_ld(16*k,query_profile_word);            
171         
172         // load values of F and H from previous row (one unit up)
173         Fup    = vec_ld(0,  p);
174         Hup1   = vec_ld(16, p);
175         p += 16; // move ahead 32 bytes
176         
177         // shift into place so we have complete F and H vectors
178         // that refer to the values one unit up from each cell
179         // that we are currently working on.
180         Fup    = vec_sld(Fup,F,14);
181         Hup1   = vec_sld(Hup1,H,14);            
182         
183         // do the dynamic programming 
184
185         // update E value
186         E   = vec_subs(E,v_gapextend);
187         tmp = vec_subs(H,v_gapopen);
188         E   = vec_max(E,tmp);
189         
190         // update F value
191         F   = vec_subs(Fup,v_gapextend);
192         tmp = vec_subs(Hup1,v_gapopen);
193         F   = vec_max(F,tmp);
194         
195         // add score to H
196         H   = vec_adds(Hup2,v_score);
197         H   = vec_subs(H,v_bias);
198         
199         // set H to max of H,E,F
200         H   = vec_max(H,E);
201         H   = vec_max(H,F);
202         
203         // Save value to use for next diagonal H 
204         Hup2 = Hup1;
205         
206         // Update highest score encountered thus far
207         v_maxscore = vec_max(v_maxscore,H);
208         
209
210         // PROLOGUE 3
211         // prefetch next residue
212         k          = *p_dbseq++;
213         
214         // Create the actual diagonal score vector
215         // and update the queue of incomplete score vectors
216         
217         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
218         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
219         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
220         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
221
222         // prefetch score for next step 
223         v_score_load = vec_ld(16*k,query_profile_word);            
224         
225         // load values of F and H from previous row (one unit up)
226         Fup    = vec_ld(0,  p);
227         Hup1   = vec_ld(16, p);
228         p += 16; // move ahead 32 bytes
229         
230         // shift into place so we have complete F and H vectors
231         // that refer to the values one unit up from each cell
232         // that we are currently working on.
233         Fup    = vec_sld(Fup,F,14);
234         Hup1   = vec_sld(Hup1,H,14);            
235         
236         // do the dynamic programming 
237
238         // update E value
239         E   = vec_subs(E,v_gapextend);
240         tmp = vec_subs(H,v_gapopen);
241         E   = vec_max(E,tmp);
242         
243         // update F value
244         F   = vec_subs(Fup,v_gapextend);
245         tmp = vec_subs(Hup1,v_gapopen);
246         F   = vec_max(F,tmp);
247         
248         // add score to H
249         H   = vec_adds(Hup2,v_score);
250         H   = vec_subs(H,v_bias);
251         
252         // set H to max of H,E,F
253         H   = vec_max(H,E);
254         H   = vec_max(H,F);
255         
256         // Save value to use for next diagonal H 
257         Hup2 = Hup1;
258         
259         // Update highest score encountered thus far
260         v_maxscore = vec_max(v_maxscore,H);
261         
262
263         // PROLOGUE 4
264         // prefetch next residue
265         k          = *p_dbseq++;
266         
267         // Create the actual diagonal score vector
268         // and update the queue of incomplete score vectors
269         
270         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
271         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
272         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
273         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
274         
275         // prefetch score for next step 
276         v_score_load = vec_ld(16*k,query_profile_word);            
277         
278         // load values of F and H from previous row (one unit up)
279         Fup    = vec_ld(0,  p);
280         Hup1   = vec_ld(16, p);
281         p += 16; // move ahead 32 bytes
282         
283         // shift into place so we have complete F and H vectors
284         // that refer to the values one unit up from each cell
285         // that we are currently working on.
286         Fup    = vec_sld(Fup,F,14);
287         Hup1   = vec_sld(Hup1,H,14);            
288         
289         // do the dynamic programming 
290         
291         // update E value
292         E   = vec_subs(E,v_gapextend);
293         tmp = vec_subs(H,v_gapopen);
294         E   = vec_max(E,tmp);
295         
296         // update F value
297         F   = vec_subs(Fup,v_gapextend);
298         tmp = vec_subs(Hup1,v_gapopen);
299         F   = vec_max(F,tmp);
300         
301         // add score to H
302         H   = vec_adds(Hup2,v_score);
303         H   = vec_subs(H,v_bias);
304         
305         // set H to max of H,E,F
306         H   = vec_max(H,E);
307         H   = vec_max(H,F);
308         
309         // Save value to use for next diagonal H 
310         Hup2 = Hup1;
311         
312         // Update highest score encountered thus far
313         v_maxscore = vec_max(v_maxscore,H);
314         
315
316         // PROLOGUE 5
317         // prefetch next residue
318         k          = *p_dbseq++;
319         
320         // Create the actual diagonal score vector
321         // and update the queue of incomplete score vectors
322         
323         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
324         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
325         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
326         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
327         
328         // prefetch score for next step 
329         v_score_load = vec_ld(16*k,query_profile_word);            
330         
331         // load values of F and H from previous row (one unit up)
332         Fup    = vec_ld(0,  p);
333         Hup1   = vec_ld(16, p);
334         p += 16; // move ahead 32 bytes
335         
336         // shift into place so we have complete F and H vectors
337         // that refer to the values one unit up from each cell
338         // that we are currently working on.
339         Fup    = vec_sld(Fup,F,14);
340         Hup1   = vec_sld(Hup1,H,14);            
341         
342         // do the dynamic programming 
343         
344         // update E value
345         E   = vec_subs(E,v_gapextend);
346         tmp = vec_subs(H,v_gapopen);
347         E   = vec_max(E,tmp);
348         
349         // update F value
350         F   = vec_subs(Fup,v_gapextend);
351         tmp = vec_subs(Hup1,v_gapopen);
352         F   = vec_max(F,tmp);
353         
354         // add score to H
355         H   = vec_adds(Hup2,v_score);
356         H   = vec_subs(H,v_bias);
357         
358         // set H to max of H,E,F
359         H   = vec_max(H,E);
360         H   = vec_max(H,F);
361         
362         // Save value to use for next diagonal H 
363         Hup2 = Hup1;
364         
365         // Update highest score encountered thus far
366         v_maxscore = vec_max(v_maxscore,H);
367         
368
369         // PROLOGUE 6
370         // prefetch next residue
371         k          = *p_dbseq++;
372         
373         // Create the actual diagonal score vector
374         // and update the queue of incomplete score vectors
375         
376         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
377         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
378         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
379         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
380         
381         // prefetch score for next step 
382         v_score_load = vec_ld(16*k,query_profile_word);            
383         
384         // load values of F and H from previous row (one unit up)
385         Fup    = vec_ld(0,  p);
386         Hup1   = vec_ld(16, p);
387         p += 16; // move ahead 32 bytes
388         
389         // shift into place so we have complete F and H vectors
390         // that refer to the values one unit up from each cell
391         // that we are currently working on.
392         Fup    = vec_sld(Fup,F,14);
393         Hup1   = vec_sld(Hup1,H,14);            
394         
395         // do the dynamic programming 
396         
397         // update E value
398         E   = vec_subs(E,v_gapextend);
399         tmp = vec_subs(H,v_gapopen);
400         E   = vec_max(E,tmp);
401         
402         // update F value
403         F   = vec_subs(Fup,v_gapextend);
404         tmp = vec_subs(Hup1,v_gapopen);
405         F   = vec_max(F,tmp);
406         
407         // add score to H
408         H   = vec_adds(Hup2,v_score);
409         H   = vec_subs(H,v_bias);
410         
411         // set H to max of H,E,F
412         H   = vec_max(H,E);
413         H   = vec_max(H,F);
414         
415         // Save value to use for next diagonal H 
416         Hup2 = Hup1;
417         
418         // Update highest score encountered thus far
419         v_maxscore = vec_max(v_maxscore,H);
420
421         
422         // PROLOGUE 7
423         // prefetch next residue
424         k          = *p_dbseq++;
425         
426         // Create the actual diagonal score vector
427         // and update the queue of incomplete score vectors
428         
429         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
430         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
431         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
432         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
433         
434         // prefetch score for next step 
435         v_score_load = vec_ld(16*k,query_profile_word);            
436         
437         // load values of F and H from previous row (one unit up)
438         Fup    = vec_ld(0,  p);
439         Hup1   = vec_ld(16, p);
440         p += 16; // move ahead 32 bytes
441         
442         // shift into place so we have complete F and H vectors
443         // that refer to the values one unit up from each cell
444         // that we are currently working on.
445         Fup    = vec_sld(Fup,F,14);
446         Hup1   = vec_sld(Hup1,H,14);            
447         
448         // do the dynamic programming 
449         
450         // update E value
451         E   = vec_subs(E,v_gapextend);
452         tmp = vec_subs(H,v_gapopen);
453         E   = vec_max(E,tmp);
454         
455         // update F value
456         F   = vec_subs(Fup,v_gapextend);
457         tmp = vec_subs(Hup1,v_gapopen);
458         F   = vec_max(F,tmp);
459         
460         // add score to H
461         H   = vec_adds(Hup2,v_score);
462         H   = vec_subs(H,v_bias);
463         
464         // set H to max of H,E,F
465         H   = vec_max(H,E);
466         H   = vec_max(H,F);
467         
468         // Save value to use for next diagonal H 
469         Hup2 = Hup1;
470         
471         // Update highest score encountered thus far
472         v_maxscore = vec_max(v_maxscore,H);
473         
474
475         // PROLOGUE 8
476         // prefetch next residue
477         k          = *p_dbseq++;
478         
479         // Create the actual diagonal score vector
480         // and update the queue of incomplete score vectors
481         
482         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
483         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
484         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
485         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
486         
487         // prefetch score for next step 
488         v_score_load = vec_ld(16*k,query_profile_word);            
489         
490         // load values of F and H from previous row (one unit up)
491         Fup    = vec_ld(0,  p);
492         Hup1   = vec_ld(16, p);
493         p += 16; // move ahead 32 bytes
494         
495         // shift into place so we have complete F and H vectors
496         // that refer to the values one unit up from each cell
497         // that we are currently working on.
498         Fup    = vec_sld(Fup,F,14);
499         Hup1   = vec_sld(Hup1,H,14);            
500         
501         // do the dynamic programming 
502         
503         // update E value
504         E   = vec_subs(E,v_gapextend);
505         tmp = vec_subs(H,v_gapopen);
506         E   = vec_max(E,tmp);
507         
508         // update F value
509         F   = vec_subs(Fup,v_gapextend);
510         tmp = vec_subs(Hup1,v_gapopen);
511         F   = vec_max(F,tmp);
512         
513         // add score to H
514         H   = vec_adds(Hup2,v_score);
515         H   = vec_subs(H,v_bias);
516         
517         // set H to max of H,E,F
518         H   = vec_max(H,E);
519         H   = vec_max(H,F);
520         
521         // Save value to use for next diagonal H 
522         Hup2 = Hup1;
523         
524         // Update highest score encountered thus far
525         v_maxscore = vec_max(v_maxscore,H);
526     
527
528         // reset pointers to the start of the saved data from the last row
529         p = workspace;
530
531         for(j=8;j<db_length;j+=8)
532         {           
533             // STEP 1
534             
535             // prefetch next residue
536             k          = *p_dbseq++;
537             
538             // Create the actual diagonal score vector
539             // and update the queue of incomplete score vectors
540
541             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
542             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
543             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
544             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
545             
546             // prefetch score for next step
547             v_score_load = vec_ld(16*k,query_profile_word);
548             
549             // load values of F and H from previous row (one unit up)
550             Fup    = vec_ld(256, p);
551             Hup1   = vec_ld(272, p);
552             
553             // save old values of F and H to use on next row
554             vec_st(F, 0,  p);
555             vec_st(H, 16, p);
556             p += 16; // move ahead 32 bytes
557             
558             // shift into place so we have complete F and H vectors
559             // that refer to the values one unit up from each cell
560             // that we are currently working on.
561             Fup    = vec_sld(Fup,F,14);
562             Hup1   = vec_sld(Hup1,H,14);            
563
564             // do the dynamic programming 
565             
566             // update E value
567             E   = vec_subs(E,v_gapextend);
568             tmp = vec_subs(H,v_gapopen);
569             E   = vec_max(E,tmp);
570             
571             // update F value
572             F   = vec_subs(Fup,v_gapextend);
573             tmp = vec_subs(Hup1,v_gapopen);
574             F   = vec_max(F,tmp);
575
576             // add score to H
577             H   = vec_adds(Hup2,v_score);
578             H   = vec_subs(H,v_bias);
579             
580             // set H to max of H,E,F
581             H   = vec_max(H,E);
582             H   = vec_max(H,F); 
583             
584             
585             // Update highest score encountered thus far
586             v_maxscore = vec_max(v_maxscore,H);
587             
588  
589  
590             // STEP 2
591             
592             // prefetch next residue
593             k          = *p_dbseq++;
594             
595             // Create the actual diagonal score vector
596             // and update the queue of incomplete score vectors
597             
598             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
599             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
600             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
601             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
602             
603             // prefetch score for next step
604             v_score_load = vec_ld(16*k,query_profile_word);
605             
606             // load values of F and H from previous row (one unit up)
607             Fup    = vec_ld(256, p);
608             Hup2   = vec_ld(272, p);
609             
610             // save old values of F and H to use on next row
611             vec_st(F, 0,  p);
612             vec_st(H, 16, p);
613             p += 16; // move ahead 32 bytes
614             
615             // shift into place so we have complete F and H vectors
616             // that refer to the values one unit up from each cell
617             // that we are currently working on.
618             Fup    = vec_sld(Fup,F,14);
619             Hup2   = vec_sld(Hup2,H,14);            
620             
621             // do the dynamic programming 
622             
623             // update E value
624             E   = vec_subs(E,v_gapextend);
625             tmp = vec_subs(H,v_gapopen);
626             E   = vec_max(E,tmp);
627             
628             // update F value
629             F   = vec_subs(Fup,v_gapextend);
630             tmp = vec_subs(Hup2,v_gapopen);
631             F   = vec_max(F,tmp);
632             
633             // add score to H
634             H   = vec_adds(Hup1,v_score);
635             H   = vec_subs(H,v_bias);
636             
637             // set H to max of H,E,F
638             H   = vec_max(H,E);
639             H   = vec_max(H,F); 
640             
641             
642             // Update highest score encountered thus far
643             v_maxscore = vec_max(v_maxscore,H);
644             
645
646
647             // STEP 3
648             
649             // prefetch next residue
650             k          = *p_dbseq++;
651             
652             // Create the actual diagonal score vector
653             // and update the queue of incomplete score vectors
654             
655             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
656             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
657             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
658             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
659             
660             // prefetch score for next step
661             v_score_load = vec_ld(16*k,query_profile_word);
662             
663             // load values of F and H from previous row (one unit up)
664             Fup    = vec_ld(256, p);
665             Hup1   = vec_ld(272, p);
666             
667             // save old values of F and H to use on next row
668             vec_st(F, 0,  p);
669             vec_st(H, 16, p);
670             p += 16; // move ahead 32 bytes
671             
672             // shift into place so we have complete F and H vectors
673             // that refer to the values one unit up from each cell
674             // that we are currently working on.
675             Fup    = vec_sld(Fup,F,14);
676             Hup1   = vec_sld(Hup1,H,14);            
677             
678             // do the dynamic programming 
679             
680             // update E value
681             E   = vec_subs(E,v_gapextend);
682             tmp = vec_subs(H,v_gapopen);
683             E   = vec_max(E,tmp);
684             
685             // update F value
686             F   = vec_subs(Fup,v_gapextend);
687             tmp = vec_subs(Hup1,v_gapopen);
688             F   = vec_max(F,tmp);
689             
690             // add score to H
691             H   = vec_adds(Hup2,v_score);
692             H   = vec_subs(H,v_bias);
693             
694             // set H to max of H,E,F
695             H   = vec_max(H,E);
696             H   = vec_max(H,F); 
697             
698
699             
700             // Update highest score encountered thus far
701             v_maxscore = vec_max(v_maxscore,H);
702             
703
704             
705             // STEP 4
706             
707             // prefetch next residue
708             k          = *p_dbseq++;
709             
710             // Create the actual diagonal score vector
711             // and update the queue of incomplete score vectors
712             
713             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
714             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
715             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
716             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
717             
718             // prefetch score for next step
719             v_score_load = vec_ld(16*k,query_profile_word);
720             
721             // load values of F and H from previous row (one unit up)
722             Fup    = vec_ld(256, p);
723             Hup2   = vec_ld(272, p);
724             
725             // save old values of F and H to use on next row
726             vec_st(F, 0,  p);
727             vec_st(H, 16, p);
728             p += 16; // move ahead 32 bytes
729             
730             // shift into place so we have complete F and H vectors
731             // that refer to the values one unit up from each cell
732             // that we are currently working on.
733             Fup    = vec_sld(Fup,F,14);
734             Hup2   = vec_sld(Hup2,H,14);            
735             
736             // do the dynamic programming 
737             
738             // update E value
739             E   = vec_subs(E,v_gapextend);
740             tmp = vec_subs(H,v_gapopen);
741             E   = vec_max(E,tmp);
742             
743             // update F value
744             F   = vec_subs(Fup,v_gapextend);
745             tmp = vec_subs(Hup2,v_gapopen);
746             F   = vec_max(F,tmp);
747             
748             // add score to H
749             H   = vec_adds(Hup1,v_score);
750             H   = vec_subs(H,v_bias);
751             
752             // set H to max of H,E,F
753             H   = vec_max(H,E);
754             H   = vec_max(H,F); 
755
756             
757             // Update highest score encountered thus far
758             v_maxscore = vec_max(v_maxscore,H);
759             
760
761
762             // STEP 5
763             
764             // prefetch next residue
765             k          = *p_dbseq++;
766             
767             // Create the actual diagonal score vector
768             // and update the queue of incomplete score vectors
769             
770             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
771             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
772             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
773             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
774             
775             // prefetch score for next step
776             v_score_load = vec_ld(16*k,query_profile_word);
777             
778             // load values of F and H from previous row (one unit up)
779             Fup    = vec_ld(256, p);
780             Hup1   = vec_ld(272, p);
781             
782             // save old values of F and H to use on next row
783             vec_st(F, 0,  p);
784             vec_st(H, 16, p);
785             p += 16; // move ahead 32 bytes
786             
787             // shift into place so we have complete F and H vectors
788             // that refer to the values one unit up from each cell
789             // that we are currently working on.
790             Fup    = vec_sld(Fup,F,14);
791             Hup1   = vec_sld(Hup1,H,14);            
792             
793             // do the dynamic programming 
794             
795             // update E value
796             E   = vec_subs(E,v_gapextend);
797             tmp = vec_subs(H,v_gapopen);
798             E   = vec_max(E,tmp);
799             
800             // update F value
801             F   = vec_subs(Fup,v_gapextend);
802             tmp = vec_subs(Hup1,v_gapopen);
803             F   = vec_max(F,tmp);
804             
805             // add score to H
806             H   = vec_adds(Hup2,v_score);
807             H   = vec_subs(H,v_bias);
808             
809             // set H to max of H,E,F
810             H   = vec_max(H,E);
811             H   = vec_max(H,F); 
812             
813             
814             // Update highest score encountered thus far
815             v_maxscore = vec_max(v_maxscore,H);
816             
817
818
819             // STEP 6
820             
821             // prefetch next residue
822             k          = *p_dbseq++;
823             
824             // Create the actual diagonal score vector
825             // and update the queue of incomplete score vectors
826             
827             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
828             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
829             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
830             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
831             
832             // prefetch score for next step
833             v_score_load = vec_ld(16*k,query_profile_word);
834             
835             // load values of F and H from previous row (one unit up)
836             Fup    = vec_ld(256, p);
837             Hup2   = vec_ld(272, p);
838             
839             // save old values of F and H to use on next row
840             vec_st(F, 0,  p);
841             vec_st(H, 16, p);
842             p += 16; // move ahead 32 bytes
843             
844             // shift into place so we have complete F and H vectors
845             // that refer to the values one unit up from each cell
846             // that we are currently working on.
847             Fup    = vec_sld(Fup,F,14);
848             Hup2   = vec_sld(Hup2,H,14);            
849             
850             // do the dynamic programming 
851             
852             // update E value
853             E   = vec_subs(E,v_gapextend);
854             tmp = vec_subs(H,v_gapopen);
855             E   = vec_max(E,tmp);
856             
857             // update F value
858             F   = vec_subs(Fup,v_gapextend);
859             tmp = vec_subs(Hup2,v_gapopen);
860             F   = vec_max(F,tmp);
861             
862             // add score to H
863             H   = vec_adds(Hup1,v_score);
864             H   = vec_subs(H,v_bias);
865             
866             // set H to max of H,E,F
867             H   = vec_max(H,E);
868             H   = vec_max(H,F); 
869             
870
871             
872             // Update highest score encountered this far
873             v_maxscore = vec_max(v_maxscore,H);
874             
875
876             
877             // STEP 7
878             
879             // prefetch next residue
880             k          = *p_dbseq++;
881             
882             // Create the actual diagonal score vector
883             // and update the queue of incomplete score vectors
884             
885             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
886             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
887             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
888             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
889             
890             // prefetch score for next step
891             v_score_load = vec_ld(16*k,query_profile_word);
892             
893             // load values of F and H from previous row (one unit up)
894             Fup    = vec_ld(256, p);
895             Hup1   = vec_ld(272, p);
896             
897             // save old values of F and H to use on next row
898             vec_st(F, 0,  p);
899             vec_st(H, 16, p);
900             p += 16; // move ahead 32 bytes
901             
902             // shift into place so we have complete F and H vectors
903             // that refer to the values one unit up from each cell
904             // that we are currently working on.
905             Fup    = vec_sld(Fup,F,14);
906             Hup1   = vec_sld(Hup1,H,14);            
907             
908             // do the dynamic programming 
909             
910             // update E value
911             E   = vec_subs(E,v_gapextend);
912             tmp = vec_subs(H,v_gapopen);
913             E   = vec_max(E,tmp);
914             
915             // update F value
916             F   = vec_subs(Fup,v_gapextend);
917             tmp = vec_subs(Hup1,v_gapopen);
918             F   = vec_max(F,tmp);
919             
920             // add score to H
921             H   = vec_adds(Hup2,v_score);
922             H   = vec_subs(H,v_bias);
923             
924             // set H to max of H,E,F
925             H   = vec_max(H,E);
926             H   = vec_max(H,F); 
927             
928
929             
930             // Update highest score encountered this far
931             v_maxscore = vec_max(v_maxscore,H);
932             
933
934
935             // STEP 8
936             
937             // prefetch next residue
938             k          = *p_dbseq++;
939             
940             // Create the actual diagonal score vector
941             // and update the queue of incomplete score vectors
942             
943             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
944             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
945             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
946             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
947             
948             // prefetch score for next step
949             v_score_load = vec_ld(16*k,query_profile_word);
950             
951             // load values of F and H from previous row (one unit up)
952             Fup    = vec_ld(256, p);
953             Hup2   = vec_ld(272, p);
954             
955             // save old values of F and H to use on next row
956             vec_st(F, 0,  p);
957             vec_st(H, 16, p);
958             p += 16; // move ahead 32 bytes
959             
960             // shift into place so we have complete F and H vectors
961             // that refer to the values one unit up from each cell
962             // that we are currently working on.
963             Fup    = vec_sld(Fup,F,14);
964             Hup2   = vec_sld(Hup2,H,14);            
965             
966             // do the dynamic programming 
967             
968             // update E value
969             E   = vec_subs(E,v_gapextend);
970             tmp = vec_subs(H,v_gapopen);
971             E   = vec_max(E,tmp);
972             
973             // update F value
974             F   = vec_subs(Fup,v_gapextend);
975             tmp = vec_subs(Hup2,v_gapopen);
976             F   = vec_max(F,tmp);
977             
978             // add score to H
979             H   = vec_adds(Hup1,v_score);
980             H   = vec_subs(H,v_bias);
981             
982             // set H to max of H,E,F
983             H   = vec_max(H,E);
984             H   = vec_max(H,F); 
985             
986             
987             // Update highest score encountered this far
988             v_maxscore = vec_max(v_maxscore,H);
989         }
990         
991         v_score_load = vec_splat_u16(0);
992         
993         for(;j<db_length+7;j++)
994         {
995             // Create the actual diagonal score vector
996             // and update the queue of incomplete score vectors
997             //
998             // This could of course be done with only vec_perm or only vec_sel;
999             // since they use different execution units, mixing them was found
1000             // to be slightly faster. The four updates below all use vec_perm.
1001             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
1002             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
1003             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
1004             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
1005             
1006             // save old values of F and H to use on next row
1007             vec_st(F, 0,  p);
1008             vec_st(H, 16, p);
1009             p += 16; // move ahead 32 bytes
1010             
1011             // v_score_load contains all zeros
1012             Fup    = vec_sld(v_score_load,F,14);
1013             Hup1   = vec_sld(v_score_load,H,14);            
1014             
1015             // do the dynamic programming 
1016             
1017             // update E value
1018             E   = vec_subs(E,v_gapextend);
1019             tmp = vec_subs(H,v_gapopen);
1020             E   = vec_max(E,tmp);
1021             
1022             // update F value
1023             F   = vec_subs(Fup,v_gapextend);
1024             tmp = vec_subs(Hup1,v_gapopen);
1025             F   = vec_max(F,tmp);
1026             
1027             // add score to H
1028             H   = vec_adds(Hup2,v_score);
1029             H   = vec_subs(H,v_bias);
1030             
1031             // set H to max of H,E,F
1032             H   = vec_max(H,E);
1033             H   = vec_max(H,F);
1034             
1035             // Save value to use for next diagonal H 
1036             Hup2 = Hup1;
1037             
1038             // Update highest score encountered this far
1039             v_maxscore = vec_max(v_maxscore,H);
1040         }
1041         vec_st(F, 0,  p);
1042         vec_st(H, 16, p);
1043
1044         query_profile_word += 8*alphabet_size;
1045     }
1046
1047     // find largest score in the v_maxscore vector
1048     tmp = vec_sld(v_maxscore,v_maxscore,8);
1049     v_maxscore = vec_max(v_maxscore,tmp);
1050     tmp = vec_sld(v_maxscore,v_maxscore,4);
1051     v_maxscore = vec_max(v_maxscore,tmp);
1052     tmp = vec_sld(v_maxscore,v_maxscore,2);
1053     v_maxscore = vec_max(v_maxscore,tmp);
1054
1055     // store in temporary variable
1056     vec_ste(v_maxscore,0,&score);
1057     
1058     // return largest score
1059     return score;
1060 }
1061
1062 int
1063 smith_waterman_altivec_byte(unsigned char *     query_sequence,
1064                             unsigned char *     query_profile_byte,
1065                             int                 query_length,
1066                             unsigned char *     db_sequence,
1067                             int                 db_length,
1068                             unsigned char       bias,
1069                             unsigned char       gap_open,
1070                             unsigned char       gap_extend,
1071                             struct f_struct *   f_str)
1072 {
1073     int                     i,j,k,k8;
1074     int                     overflow;
1075     unsigned char *         p;
1076     unsigned char           score;   
1077     int                     alphabet_size = f_str->alphabet_size;
1078     unsigned char *         workspace     = (unsigned char *)f_str->workspace;
1079     
1080     vector unsigned char    Fup,Hup1,Hup2,E,F,H,tmp;
1081     vector unsigned char    perm;
1082     vector unsigned char    v_maxscore;
1083     vector unsigned char    v_bias,v_gapopen,v_gapextend;
1084     vector unsigned char    v_score;
1085     vector unsigned char    v_score_q1;
1086     vector unsigned char    v_score_q2;
1087     vector unsigned char    v_score_q3;
1088     vector unsigned char    v_score_q4;
1089     vector unsigned char    v_score_q5;
1090     vector unsigned char    v_score_load1;
1091     vector unsigned char    v_score_load2;  
1092     vector unsigned char    v_zero;  
1093
1094     vector unsigned char    queue1_to_score  = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
1095     vector unsigned char    queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
1096     vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
1097     vector unsigned char    queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
1098     vector unsigned char    queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
1099     vector unsigned char    queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
1100     vector unsigned char    merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
1101
1102     v_zero           = vec_splat_u8(0);
1103         
1104     /* Load the bias to all elements of a constant */
1105     v_bias           = vec_lde(0,&bias);
1106     perm             = vec_lvsl(0,&bias);
1107     v_bias           = vec_perm(v_bias,v_bias,perm);
1108     v_bias           = vec_splat(v_bias,0);
1109     
1110     /* Load gap opening penalty to all elements of a constant */
1111     v_gapopen        = vec_lde(0,&gap_open);
1112     perm             = vec_lvsl(0,&gap_open);
1113     v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
1114     v_gapopen        = vec_splat(v_gapopen,0);
1115
1116     /* Load gap extension penalty to all elements of a constant */
1117     v_gapextend      = vec_lde(0,&gap_extend);  
1118     perm             = vec_lvsl(0,&gap_extend);
1119     v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
1120     v_gapextend      = vec_splat(v_gapextend,0);
1121     
1122     v_maxscore = vec_xor(v_maxscore,v_maxscore);
1123    
1124     // Zero out the storage vector 
1125     k = (db_length+15);
1126     for(i=0,j=0;i<k;i++,j+=32)
1127     {
1128         // borrow the zero value in v_maxscore to have something to store
1129         vec_st(v_maxscore,j,workspace);
1130         vec_st(v_maxscore,j+16,workspace);
1131     }
1132     
1133     for(i=0;i<query_length;i+=16)
1134     {
1135         // zero lots of stuff. 
1136         // We use both the VPERM and VSIU unit to knock off some cycles.
1137         
1138         E          = vec_splat_u8(0);
1139         F          = vec_xor(F,F);
1140         H          = vec_splat_u8(0);
1141         Hup2      = vec_xor(Hup2,Hup2);
1142         v_score_q1 = vec_splat_u8(0);
1143         v_score_q2 = vec_xor(v_score_q2,v_score_q2);
1144         v_score_q3 = vec_splat_u8(0);
1145         v_score_q4 = vec_xor(v_score_q4,v_score_q4);
1146         v_score_q5 = vec_splat_u8(0);
1147
1148         // reset pointers to the start of the saved data from the last row
1149         p = workspace;
1150         
1151         // start directly and prefetch score column
1152         k             = db_sequence[0];
1153         k8            = k;
1154         v_score_load1 = vec_ld(16*k,query_profile_byte);
1155         v_score_load2 = v_score_load1;
1156         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1157
1158         // PROLOGUE 1
1159         // prefetch next residue
1160         k                = db_sequence[1];
1161         
1162         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1163         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1164         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1165         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1166         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1167         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1168         
1169         // prefetch score for next step 
1170         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1171         
1172         // load values of F and H from previous row (one unit up)
1173         Fup    = vec_ld(0,  p);
1174         Hup1   = vec_ld(16, p);
1175         p += 32; // move ahead 32 bytes
1176         
1177         // shift into place so we have complete F and H vectors
1178         // that refer to the values one unit up from each cell
1179         // that we are currently working on.
1180         Fup    = vec_sld(Fup,F,15);
1181         Hup1    = vec_sld(Hup1,H,15);            
1182         
1183         // do the dynamic programming 
1184         
1185         // update E value
1186         E   = vec_subs(E,v_gapextend);
1187         tmp = vec_subs(H,v_gapopen);
1188         E   = vec_max(E,tmp);
1189         
1190         // update F value
1191         F   = vec_subs(Fup,v_gapextend);
1192         tmp = vec_subs(Hup1,v_gapopen);
1193         F   = vec_max(F,tmp);
1194         
1195         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1196         
1197         // add score to H
1198         H   = vec_adds(Hup2,v_score);
1199         H   = vec_subs(H,v_bias);
1200         
1201         // set H to max of H,E,F
1202         H   = vec_max(H,E);
1203         H   = vec_max(H,F);
1204         
1205         // Update highest score encountered this far
1206         v_maxscore = vec_max(v_maxscore,H);
1207         
1208         
1209         
1210         
1211         // PROLOGUE 2
1212         // prefetch next residue
1213         k                = db_sequence[2];
1214         
1215         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1216         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1217         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1218         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1219         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1220         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1221         
1222   
1223         // prefetch score for next step 
1224         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1225         
1226         // load values of F and H from previous row (one unit up)
1227         Fup    = vec_ld(0,  p);
1228         Hup2   = vec_ld(16, p);
1229         p += 32; // move ahead 32 bytes
1230         
1231         // shift into place so we have complete F and H vectors
1232         // that refer to the values one unit up from each cell
1233         // that we are currently working on.
1234         Fup    = vec_sld(Fup,F,15);
1235         Hup2   = vec_sld(Hup2,H,15);            
1236         
1237         // do the dynamic programming 
1238         
1239         // update E value
1240         E   = vec_subs(E,v_gapextend);
1241         tmp = vec_subs(H,v_gapopen);
1242         E   = vec_max(E,tmp);
1243         
1244         // update F value
1245         F   = vec_subs(Fup,v_gapextend);
1246         tmp = vec_subs(Hup2,v_gapopen);
1247         F   = vec_max(F,tmp);
1248         
1249         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1250         
1251         // add score to H
1252         H   = vec_adds(Hup1,v_score);
1253         H   = vec_subs(H,v_bias);
1254         
1255         // set H to max of H,E,F
1256         H   = vec_max(H,E);
1257         H   = vec_max(H,F);
1258         
1259         // Update highest score encountered this far
1260         v_maxscore = vec_max(v_maxscore,H);
1261      
1262         
1263         // PROLOGUE 3
1264         // prefetch next residue
1265         k                = db_sequence[3];
1266   
1267         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1268         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1269         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1270         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1271         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1272         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1273         
1274
1275         // prefetch score for next step 
1276         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1277         
1278         // load values of F and H from previous row (one unit up)
1279         Fup    = vec_ld(0,  p);
1280         Hup1   = vec_ld(16, p);
1281         p += 32; // move ahead 32 bytes
1282         
1283         // shift into place so we have complete F and H vectors
1284         // that refer to the values one unit up from each cell
1285         // that we are currently working on.
1286         Fup    = vec_sld(Fup,F,15);
1287         Hup1    = vec_sld(Hup1,H,15);            
1288         
1289         // do the dynamic programming 
1290         
1291         // update E value
1292         E   = vec_subs(E,v_gapextend);
1293         tmp = vec_subs(H,v_gapopen);
1294         E   = vec_max(E,tmp);
1295         
1296         // update F value
1297         F   = vec_subs(Fup,v_gapextend);
1298         tmp = vec_subs(Hup1,v_gapopen);
1299         F   = vec_max(F,tmp);
1300         
1301         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1302         
1303         // add score to H
1304         H   = vec_adds(Hup2,v_score);
1305         H   = vec_subs(H,v_bias);
1306         
1307         // set H to max of H,E,F
1308         H   = vec_max(H,E);
1309         H   = vec_max(H,F);
1310         
1311         // Update highest score encountered this far
1312         v_maxscore = vec_max(v_maxscore,H);
1313         
1314         
1315         // PROLOGUE 4
1316         // prefetch next residue
1317         k                = db_sequence[4];
1318         
1319         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1320         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1321         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1322         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1323         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1324         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1325         
1326
1327         // prefetch score for next step 
1328         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1329         
1330         // load values of F and H from previous row (one unit up)
1331         Fup    = vec_ld(0,  p);
1332         Hup2   = vec_ld(16, p);
1333         p += 32; // move ahead 32 bytes
1334         
1335         // shift into place so we have complete F and H vectors
1336         // that refer to the values one unit up from each cell
1337         // that we are currently working on.
1338         Fup    = vec_sld(Fup,F,15);
1339         Hup2   = vec_sld(Hup2,H,15);            
1340         
1341         // do the dynamic programming 
1342         
1343         // update E value
1344         E   = vec_subs(E,v_gapextend);
1345         tmp = vec_subs(H,v_gapopen);
1346         E   = vec_max(E,tmp);
1347         
1348         // update F value
1349         F   = vec_subs(Fup,v_gapextend);
1350         tmp = vec_subs(Hup2,v_gapopen);
1351         F   = vec_max(F,tmp);
1352         
1353         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1354         
1355         // add score to H
1356         H   = vec_adds(Hup1,v_score);
1357         H   = vec_subs(H,v_bias);
1358         
1359         // set H to max of H,E,F
1360         H   = vec_max(H,E);
1361         H   = vec_max(H,F);
1362         
1363         // Update highest score encountered this far
1364         v_maxscore = vec_max(v_maxscore,H);
1365         
1366         
1367         // PROLOGUE 5
1368         // prefetch next residue
1369         k                = db_sequence[5];
1370         
1371         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1372         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1373         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1374         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1375         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1376         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1377      
1378
1379         // prefetch score for next step 
1380         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1381         
1382         // load values of F and H from previous row (one unit up)
1383         Fup    = vec_ld(0,  p);
1384         Hup1   = vec_ld(16, p);
1385         p += 32; // move ahead 32 bytes
1386         
1387         // shift into place so we have complete F and H vectors
1388         // that refer to the values one unit up from each cell
1389         // that we are currently working on.
1390         Fup    = vec_sld(Fup,F,15);
1391         Hup1    = vec_sld(Hup1,H,15);            
1392         
1393         // do the dynamic programming 
1394         
1395         // update E value
1396         E   = vec_subs(E,v_gapextend);
1397         tmp = vec_subs(H,v_gapopen);
1398         E   = vec_max(E,tmp);
1399         
1400         // update F value
1401         F   = vec_subs(Fup,v_gapextend);
1402         tmp = vec_subs(Hup1,v_gapopen);
1403         F   = vec_max(F,tmp);
1404         
1405         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1406         
1407         // add score to H
1408         H   = vec_adds(Hup2,v_score);
1409         H   = vec_subs(H,v_bias);
1410         
1411         // set H to max of H,E,F
1412         H   = vec_max(H,E);
1413         H   = vec_max(H,F);
1414         
1415         // Update highest score encountered this far
1416         v_maxscore = vec_max(v_maxscore,H);
1417         
1418         
1419         // PROLOGUE 6
1420         // prefetch next residue
1421         k                = db_sequence[6];
1422         
1423         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1424         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1425         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1426         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1427         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1428         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1429         
1430
1431         // prefetch score for next step 
1432         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1433         
1434         // load values of F and H from previous row (one unit up)
1435         Fup    = vec_ld(0,  p);
1436         Hup2   = vec_ld(16, p);
1437         p += 32; // move ahead 32 bytes
1438         
1439         // shift into place so we have complete F and H vectors
1440         // that refer to the values one unit up from each cell
1441         // that we are currently working on.
1442         Fup    = vec_sld(Fup,F,15);
1443         Hup2   = vec_sld(Hup2,H,15);            
1444         
1445         // do the dynamic programming 
1446         
1447         // update E value
1448         E   = vec_subs(E,v_gapextend);
1449         tmp = vec_subs(H,v_gapopen);
1450         E   = vec_max(E,tmp);
1451         
1452         // update F value
1453         F   = vec_subs(Fup,v_gapextend);
1454         tmp = vec_subs(Hup2,v_gapopen);
1455         F   = vec_max(F,tmp);
1456         
1457         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1458         
1459         // add score to H
1460         H   = vec_adds(Hup1,v_score);
1461         H   = vec_subs(H,v_bias);
1462         
1463         // set H to max of H,E,F
1464         H   = vec_max(H,E);
1465         H   = vec_max(H,F);
1466         
1467         // Update highest score encountered this far
1468         v_maxscore = vec_max(v_maxscore,H);
1469         
1470         
1471         
1472         // PROLOGUE 7
1473         // prefetch next residue
1474         k                = db_sequence[7];
1475         
1476         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1477         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1478         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1479         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1480         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1481         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1482         
1483
1484         // prefetch score for next step 
1485         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1486         
1487         // load values of F and H from previous row (one unit up)
1488         Fup    = vec_ld(0,  p);
1489         Hup1   = vec_ld(16, p);
1490         p += 32; // move ahead 32 bytes
1491         
1492         // shift into place so we have complete F and H vectors
1493         // that refer to the values one unit up from each cell
1494         // that we are currently working on.
1495         Fup    = vec_sld(Fup,F,15);
1496         Hup1    = vec_sld(Hup1,H,15);            
1497         
1498         // do the dynamic programming 
1499         
1500         // update E value
1501         E   = vec_subs(E,v_gapextend);
1502         tmp = vec_subs(H,v_gapopen);
1503         E   = vec_max(E,tmp);
1504         
1505         // update F value
1506         F   = vec_subs(Fup,v_gapextend);
1507         tmp = vec_subs(Hup1,v_gapopen);
1508         F   = vec_max(F,tmp);
1509         
1510         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1511         
1512         // add score to H
1513         H   = vec_adds(Hup2,v_score);
1514         H   = vec_subs(H,v_bias);
1515         
1516         // set H to max of H,E,F
1517         H   = vec_max(H,E);
1518         H   = vec_max(H,F);
1519         
1520         // Update highest score encountered this far
1521         v_maxscore = vec_max(v_maxscore,H);
1522         
1523         
1524         
1525         // PROLOGUE 8
1526         // prefetch next residue
1527         k                = db_sequence[8];
1528         
1529         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1530         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1531         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1532         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1533         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1534         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1535         
1536
1537         // prefetch score for next step 
1538         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1539         
1540         // load values of F and H from previous row (one unit up)
1541         Fup    = vec_ld(0,  p);
1542         Hup2   = vec_ld(16, p);
1543         p += 32; // move ahead 32 bytes
1544         
1545         // shift into place so we have complete F and H vectors
1546         // that refer to the values one unit up from each cell
1547         // that we are currently working on.
1548         Fup    = vec_sld(Fup,F,15);
1549         Hup2   = vec_sld(Hup2,H,15);            
1550         
1551         // do the dynamic programming 
1552         
1553         // update E value
1554         E   = vec_subs(E,v_gapextend);
1555         tmp = vec_subs(H,v_gapopen);
1556         E   = vec_max(E,tmp);
1557         
1558         // update F value
1559         F   = vec_subs(Fup,v_gapextend);
1560         tmp = vec_subs(Hup2,v_gapopen);
1561         F   = vec_max(F,tmp);
1562         
1563         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1564         
1565         // add score to H
1566         H   = vec_adds(Hup1,v_score);
1567         H   = vec_subs(H,v_bias);
1568         
1569         // set H to max of H,E,F
1570         H   = vec_max(H,E);
1571         H   = vec_max(H,F);
1572         
1573         // Update highest score encountered this far
1574         v_maxscore = vec_max(v_maxscore,H);
1575         
1576         
1577         
1578         
1579         // PROLOGUE 9
1580         // prefetch next residue
1581         k                = db_sequence[9];
1582         k8               = db_sequence[1];
1583         
1584         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1585         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1586         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1587         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1588         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1589         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1590         
1591
1592         // prefetch score for next step 
1593         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1594         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1595         
1596         // load values of F and H from previous row (one unit up)
1597         Fup    = vec_ld(0,  p);
1598         Hup1    = vec_ld(16, p);
1599         p += 32; // move ahead 32 bytes
1600         
1601         // shift into place so we have complete F and H vectors
1602         // that refer to the values one unit up from each cell
1603         // that we are currently working on.
1604         Fup    = vec_sld(Fup,F,15);
1605         Hup1    = vec_sld(Hup1,H,15);            
1606         
1607         // do the dynamic programming 
1608         
1609         // update E value
1610         E   = vec_subs(E,v_gapextend);
1611         tmp = vec_subs(H,v_gapopen);
1612         E   = vec_max(E,tmp);
1613         
1614         // update F value
1615         F   = vec_subs(Fup,v_gapextend);
1616         tmp = vec_subs(Hup1,v_gapopen);
1617         F   = vec_max(F,tmp);
1618         
1619         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1620         
1621         // add score to H
1622         H   = vec_adds(Hup2,v_score);
1623         H   = vec_subs(H,v_bias);
1624         
1625         // set H to max of H,E,F
1626         H   = vec_max(H,E);
1627         H   = vec_max(H,F);
1628         
1629         // Update highest score encountered this far
1630         v_maxscore = vec_max(v_maxscore,H);
1631         
1632         
1633         
1634         // PROLOGUE 10
1635         // prefetch next residue
1636         k                = db_sequence[10];
1637         k8               = db_sequence[2];
1638         
1639         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1640         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1641         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1642         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1643         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1644         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1645         
1646
1647         // prefetch score for next step 
1648         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1649         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1650         
1651         // load values of F and H from previous row (one unit up)
1652         Fup    = vec_ld(0,  p);
1653         Hup2   = vec_ld(16, p);
1654         p += 32; // move ahead 32 bytes
1655         
1656         // shift into place so we have complete F and H vectors
1657         // that refer to the values one unit up from each cell
1658         // that we are currently working on.
1659         Fup    = vec_sld(Fup,F,15);
1660         Hup2   = vec_sld(Hup2,H,15);            
1661         
1662         // do the dynamic programming 
1663         
1664         // update E value
1665         E   = vec_subs(E,v_gapextend);
1666         tmp = vec_subs(H,v_gapopen);
1667         E   = vec_max(E,tmp);
1668         
1669         // update F value
1670         F   = vec_subs(Fup,v_gapextend);
1671         tmp = vec_subs(Hup2,v_gapopen);
1672         F   = vec_max(F,tmp);
1673         
1674         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1675         
1676         // add score to H
1677         H   = vec_adds(Hup1,v_score);
1678         H   = vec_subs(H,v_bias);
1679         
1680         // set H to max of H,E,F
1681         H   = vec_max(H,E);
1682         H   = vec_max(H,F);
1683         
1684         // Update highest score encountered this far
1685         v_maxscore = vec_max(v_maxscore,H);
1686         
1687         
1688         
1689         
1690         // PROLOGUE 11
1691         // prefetch next residue
1692         k                = db_sequence[11];
1693         k8               = db_sequence[3];
1694         
1695         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1696         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1697         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1698         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1699         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1700         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1701         
1702
1703         // prefetch score for next step 
1704         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1705         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1706         
1707         // load values of F and H from previous row (one unit up)
1708         Fup    = vec_ld(0,  p);
1709         Hup1    = vec_ld(16, p);
1710         p += 32; // move ahead 32 bytes
1711         
1712         // shift into place so we have complete F and H vectors
1713         // that refer to the values one unit up from each cell
1714         // that we are currently working on.
1715         Fup    = vec_sld(Fup,F,15);
1716         Hup1    = vec_sld(Hup1,H,15);            
1717         
1718         // do the dynamic programming 
1719         
1720         // update E value
1721         E   = vec_subs(E,v_gapextend);
1722         tmp = vec_subs(H,v_gapopen);
1723         E   = vec_max(E,tmp);
1724         
1725         // update F value
1726         F   = vec_subs(Fup,v_gapextend);
1727         tmp = vec_subs(Hup1,v_gapopen);
1728         F   = vec_max(F,tmp);
1729         
1730         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1731         
1732         // add score to H
1733         H   = vec_adds(Hup2,v_score);
1734         H   = vec_subs(H,v_bias);
1735         
1736         // set H to max of H,E,F
1737         H   = vec_max(H,E);
1738         H   = vec_max(H,F);
1739         
1740         // Update highest score encountered this far
1741         v_maxscore = vec_max(v_maxscore,H);
1742         
1743         
1744         
1745         // PROLOGUE 12
1746         // prefetch next residue
1747         k                = db_sequence[12];
1748         k8               = db_sequence[4];
1749         
1750         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1751         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1752         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1753         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1754         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1755         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1756         
1757
1758         // prefetch score for next step 
1759         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1760         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1761         
1762         // load values of F and H from previous row (one unit up)
1763         Fup    = vec_ld(0,  p);
1764         Hup2   = vec_ld(16, p);
1765         p += 32; // move ahead 32 bytes
1766         
1767         // shift into place so we have complete F and H vectors
1768         // that refer to the values one unit up from each cell
1769         // that we are currently working on.
1770         Fup    = vec_sld(Fup,F,15);
1771         Hup2   = vec_sld(Hup2,H,15);            
1772         
1773         // do the dynamic programming 
1774         
1775         // update E value
1776         E   = vec_subs(E,v_gapextend);
1777         tmp = vec_subs(H,v_gapopen);
1778         E   = vec_max(E,tmp);
1779         
1780         // update F value
1781         F   = vec_subs(Fup,v_gapextend);
1782         tmp = vec_subs(Hup2,v_gapopen);
1783         F   = vec_max(F,tmp);
1784         
1785         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1786         
1787         // add score to H
1788         H   = vec_adds(Hup1,v_score);
1789         H   = vec_subs(H,v_bias);
1790         
1791         // set H to max of H,E,F
1792         H   = vec_max(H,E);
1793         H   = vec_max(H,F);
1794         
1795         // Update highest score encountered this far
1796         v_maxscore = vec_max(v_maxscore,H);
1797         
1798         
1799         
1800         
1801         // PROLOGUE 13
1802         // prefetch next residue
1803         k                = db_sequence[13];
1804         k8               = db_sequence[5];
1805         
1806         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1807         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1808         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1809         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1810         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1811         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1812         
1813
1814         // prefetch score for next step 
1815         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1816         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1817         
1818         // load values of F and H from previous row (one unit up)
1819         Fup    = vec_ld(0,  p);
1820         Hup1    = vec_ld(16, p);
1821         p += 32; // move ahead 32 bytes
1822         
1823         // shift into place so we have complete F and H vectors
1824         // that refer to the values one unit up from each cell
1825         // that we are currently working on.
1826         Fup    = vec_sld(Fup,F,15);
1827         Hup1    = vec_sld(Hup1,H,15);            
1828         
1829         // do the dynamic programming 
1830         
1831         // update E value
1832         E   = vec_subs(E,v_gapextend);
1833         tmp = vec_subs(H,v_gapopen);
1834         E   = vec_max(E,tmp);
1835         
1836         // update F value
1837         F   = vec_subs(Fup,v_gapextend);
1838         tmp = vec_subs(Hup1,v_gapopen);
1839         F   = vec_max(F,tmp);
1840         
1841         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1842         
1843         // add score to H
1844         H   = vec_adds(Hup2,v_score);
1845         H   = vec_subs(H,v_bias);
1846         
1847         // set H to max of H,E,F
1848         H   = vec_max(H,E);
1849         H   = vec_max(H,F);
1850         
1851         // Update highest score encountered this far
1852         v_maxscore = vec_max(v_maxscore,H);
1853         
1854         
1855         
1856         // PROLOGUE 14
1857         // prefetch next residue
1858         k                = db_sequence[14];
1859         k8               = db_sequence[6];
1860         
1861         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1862         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1863         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1864         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1865         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1866         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1867         
1868
1869         // prefetch score for next step 
1870         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1871         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1872         
1873         // load values of F and H from previous row (one unit up)
1874         Fup    = vec_ld(0,  p);
1875         Hup2   = vec_ld(16, p);
1876         p += 32; // move ahead 32 bytes
1877         
1878         // shift into place so we have complete F and H vectors
1879         // that refer to the values one unit up from each cell
1880         // that we are currently working on.
1881         Fup    = vec_sld(Fup,F,15);
1882         Hup2   = vec_sld(Hup2,H,15);            
1883         
1884         // do the dynamic programming 
1885         
1886         // update E value
1887         E   = vec_subs(E,v_gapextend);
1888         tmp = vec_subs(H,v_gapopen);
1889         E   = vec_max(E,tmp);
1890         
1891         // update F value
1892         F   = vec_subs(Fup,v_gapextend);
1893         tmp = vec_subs(Hup2,v_gapopen);
1894         F   = vec_max(F,tmp);
1895         
1896         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1897         
1898         // add score to H
1899         H   = vec_adds(Hup1,v_score);
1900         H   = vec_subs(H,v_bias);
1901         
1902         // set H to max of H,E,F
1903         H   = vec_max(H,E);
1904         H   = vec_max(H,F);
1905         
1906         // Update highest score encountered this far
1907         v_maxscore = vec_max(v_maxscore,H);
1908         
1909         
1910         
1911         // PROLOGUE 15
1912         // prefetch next residue
1913         k                = db_sequence[15];
1914         k8               = db_sequence[7];
1915         
1916         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1917         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1918         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1919         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1920         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1921         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1922         
1923
1924         // prefetch score for next step 
1925         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1926         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1927         
1928         // load values of F and H from previous row (one unit up)
1929         Fup    = vec_ld(0,  p);
1930         Hup1    = vec_ld(16, p);
1931         p += 32; // move ahead 32 bytes
1932         
1933         // shift into place so we have complete F and H vectors
1934         // that refer to the values one unit up from each cell
1935         // that we are currently working on.
1936         Fup    = vec_sld(Fup,F,15);
1937         Hup1    = vec_sld(Hup1,H,15);            
1938         
1939         // do the dynamic programming 
1940         
1941         // update E value
1942         E   = vec_subs(E,v_gapextend);
1943         tmp = vec_subs(H,v_gapopen);
1944         E   = vec_max(E,tmp);
1945         
1946         // update F value
1947         F   = vec_subs(Fup,v_gapextend);
1948         tmp = vec_subs(Hup1,v_gapopen);
1949         F   = vec_max(F,tmp);
1950         
1951         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1952         
1953         // add score to H
1954         H   = vec_adds(Hup2,v_score);
1955         H   = vec_subs(H,v_bias);
1956         
1957         // set H to max of H,E,F
1958         H   = vec_max(H,E);
1959         H   = vec_max(H,F);
1960         
1961         // Update highest score encountered this far
1962         v_maxscore = vec_max(v_maxscore,H);
1963         
1964         
1965         
1966         // PROLOGUE 16
1967         // prefetch next residue
1968         k                = db_sequence[16];
1969         k8               = db_sequence[8];
1970         
1971         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1972         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1973         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1974         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1975         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1976         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1977         
1978
1979         // prefetch score for next step 
1980         v_score_load1 = vec_ld(16*k,query_profile_byte);            
1981         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1982         
1983         // load values of F and H from previous row (one unit up)
1984         Fup    = vec_ld(0,  p);
1985         Hup2   = vec_ld(16, p);
1986         p += 32; // move ahead 32 bytes
1987         
1988         // shift into place so we have complete F and H vectors
1989         // that refer to the values one unit up from each cell
1990         // that we are currently working on.
1991         Fup    = vec_sld(Fup,F,15);
1992         Hup2   = vec_sld(Hup2,H,15);            
1993         
1994         // do the dynamic programming 
1995         
1996         // update E value
1997         E   = vec_subs(E,v_gapextend);
1998         tmp = vec_subs(H,v_gapopen);
1999         E   = vec_max(E,tmp);
2000         
2001         // update F value
2002         F   = vec_subs(Fup,v_gapextend);
2003         tmp = vec_subs(Hup2,v_gapopen);
2004         F   = vec_max(F,tmp);
2005         
2006         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2007         
2008         // add score to H
2009         H   = vec_adds(Hup1,v_score);
2010         H   = vec_subs(H,v_bias);
2011         
2012         // set H to max of H,E,F
2013         H   = vec_max(H,E);
2014         H   = vec_max(H,F);
2015         
2016         // Update highest score encountered this far
2017         v_maxscore = vec_max(v_maxscore,H);
2018         
2019         p = workspace;
2020         
2021         for(j=16;j<db_length;j+=16)
2022         { 
2023             // STEP 1
2024             
2025             // prefetch next residue 
2026             k                = db_sequence[j+1];
2027             k8               = db_sequence[j-7];
2028             
2029             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2030             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2031             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2032             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2033             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2034             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2035             
2036             // prefetch scores for next step
2037             v_score_load1 = vec_ld(16*k,query_profile_byte);
2038             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2039        
2040             // load values of F and H from previous row (one unit up)
2041             Fup    = vec_ld(512, p);
2042             Hup1   = vec_ld(528, p);
2043             
2044             // save old values of F and H to use on next row
2045             vec_st(F, 0,  p);
2046             vec_st(H, 16, p);
2047             p += 32;
2048             
2049             // shift into place so we have complete F and H vectors
2050             // that refer to the values one unit up from each cell
2051             // that we are currently working on.
2052             Fup    = vec_sld(Fup,F,15);
2053             Hup1    = vec_sld(Hup1,H,15);            
2054
2055             // do the dynamic programming 
2056             
2057             // update E value
2058             E   = vec_subs(E,v_gapextend);
2059             tmp = vec_subs(H,v_gapopen);
2060             E   = vec_max(E,tmp);
2061             
2062             // update F value
2063             F   = vec_subs(Fup,v_gapextend);
2064             tmp = vec_subs(Hup1,v_gapopen);
2065             F   = vec_max(F,tmp);
2066
2067             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2068             
2069             // add score to H
2070             H   = vec_adds(Hup2,v_score);
2071             H   = vec_subs(H,v_bias);
2072             
2073             // set H to max of H,E,F
2074             H   = vec_max(H,E);
2075             H   = vec_max(H,F);
2076             
2077
2078             
2079             // Update highest score encountered this far
2080             v_maxscore = vec_max(v_maxscore,H);
2081           
2082
2083             
2084             
2085             
2086             // STEP 2
2087             
2088             // prefetch next residue
2089             k                = db_sequence[j+2];
2090             k8               = db_sequence[j-6];
2091             
2092             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2093             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2094             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2095             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2096             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2097             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2098             
2099             
2100             // prefetch scores for next step
2101             v_score_load1 = vec_ld(16*k,query_profile_byte);
2102             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2103             
2104             // load values of F and H from previous row (one unit up)
2105             Fup    = vec_ld(512, p);
2106             Hup2   = vec_ld(528, p);
2107             
2108             // save old values of F and H to use on next row
2109             vec_st(F, 0,  p);
2110             vec_st(H, 16, p);
2111             p += 32;
2112             
2113             // shift into place so we have complete F and H vectors
2114             // that refer to the values one unit up from each cell
2115             // that we are currently working on.
2116             Fup    = vec_sld(Fup,F,15);
2117             Hup2   = vec_sld(Hup2,H,15);            
2118             
2119             // do the dynamic programming 
2120             
2121             // update E value
2122             E   = vec_subs(E,v_gapextend);
2123             tmp = vec_subs(H,v_gapopen);
2124             E   = vec_max(E,tmp);
2125             
2126             // update F value
2127             F   = vec_subs(Fup,v_gapextend);
2128             tmp = vec_subs(Hup2,v_gapopen);
2129             F   = vec_max(F,tmp);
2130             
2131             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2132             
2133             // add score to H
2134             H   = vec_adds(Hup1,v_score);
2135             H   = vec_subs(H,v_bias);
2136             
2137             // set H to max of H,E,F
2138             H   = vec_max(H,E);
2139             H   = vec_max(H,F);
2140             
2141             
2142             // Update highest score encountered this far
2143             v_maxscore = vec_max(v_maxscore,H);
2144             
2145             
2146
2147             
2148             
2149             
2150             // STEP 3
2151             
2152             // prefetch next residue
2153             k                = db_sequence[j+3];
2154             k8               = db_sequence[j-5];
2155             
2156             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2157             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2158             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2159             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2160             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2161             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2162             
2163             
2164             // prefetch scores for next step
2165             v_score_load1 = vec_ld(16*k,query_profile_byte);
2166             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2167             
2168             // load values of F and H from previous row (one unit up)
2169             Fup    = vec_ld(512, p);
2170             Hup1   = vec_ld(528, p);
2171             
2172             // save old values of F and H to use on next row
2173             vec_st(F, 0,  p);
2174             vec_st(H, 16, p);
2175             p += 32;
2176             
2177             // shift into place so we have complete F and H vectors
2178             // that refer to the values one unit up from each cell
2179             // that we are currently working on.
2180             Fup    = vec_sld(Fup,F,15);
2181             Hup1    = vec_sld(Hup1,H,15);            
2182             
2183             // do the dynamic programming 
2184             
2185             // update E value
2186             E   = vec_subs(E,v_gapextend);
2187             tmp = vec_subs(H,v_gapopen);
2188             E   = vec_max(E,tmp);
2189             
2190             // update F value
2191             F   = vec_subs(Fup,v_gapextend);
2192             tmp = vec_subs(Hup1,v_gapopen);
2193             F   = vec_max(F,tmp);
2194             
2195             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2196             
2197             // add score to H
2198             H   = vec_adds(Hup2,v_score);
2199             H   = vec_subs(H,v_bias);
2200             
2201             // set H to max of H,E,F
2202             H   = vec_max(H,E);
2203             H   = vec_max(H,F);
2204             
2205             // Update highest score encountered this far
2206             v_maxscore = vec_max(v_maxscore,H);
2207             
2208       
2209             
2210
2211             
2212             
2213             // STEP 4
2214             
2215             // prefetch next residue
2216             k                = db_sequence[j+4];
2217             k8               = db_sequence[j-4];
2218             
2219             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2220             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2221             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2222             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2223             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2224             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2225             
2226             
2227             // prefetch scores for next step
2228             v_score_load1 = vec_ld(16*k,query_profile_byte);
2229             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2230             
2231             // load values of F and H from previous row (one unit up)
2232             Fup    = vec_ld(512, p);
2233             Hup2   = vec_ld(528, p);
2234             
2235             // save old values of F and H to use on next row
2236             vec_st(F, 0,  p);
2237             vec_st(H, 16, p);
2238             p += 32;
2239             
2240             // shift into place so we have complete F and H vectors
2241             // that refer to the values one unit up from each cell
2242             // that we are currently working on.
2243             Fup    = vec_sld(Fup,F,15);
2244             Hup2   = vec_sld(Hup2,H,15);            
2245             
2246             // do the dynamic programming 
2247             
2248             // update E value
2249             E   = vec_subs(E,v_gapextend);
2250             tmp = vec_subs(H,v_gapopen);
2251             E   = vec_max(E,tmp);
2252             
2253             // update F value
2254             F   = vec_subs(Fup,v_gapextend);
2255             tmp = vec_subs(Hup2,v_gapopen);
2256             F   = vec_max(F,tmp);
2257             
2258             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2259             
2260             // add score to H
2261             H   = vec_adds(Hup1,v_score);
2262             H   = vec_subs(H,v_bias);
2263             
2264             // set H to max of H,E,F
2265             H   = vec_max(H,E);
2266             H   = vec_max(H,F);
2267             
2268             // Update highest score encountered this far
2269             v_maxscore = vec_max(v_maxscore,H);
2270             
2271             
2272             
2273
2274             
2275             
2276             // STEP 5
2277             
2278             // prefetch next residue
2279             k                = db_sequence[j+5];
2280             k8               = db_sequence[j-3];
2281             
2282             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2283             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2284             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2285             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2286             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2287             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2288             
2289             
2290             // prefetch scores for next step
2291             v_score_load1 = vec_ld(16*k,query_profile_byte);
2292             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2293             
2294             // load values of F and H from previous row (one unit up)
2295             Fup    = vec_ld(512, p);
2296             Hup1    = vec_ld(528, p);
2297             
2298             // save old values of F and H to use on next row
2299             vec_st(F, 0,  p);
2300             vec_st(H, 16, p);
2301             p += 32;
2302             
2303             // shift into place so we have complete F and H vectors
2304             // that refer to the values one unit up from each cell
2305             // that we are currently working on.
2306             Fup    = vec_sld(Fup,F,15);
2307             Hup1   = vec_sld(Hup1,H,15);            
2308             
2309             // do the dynamic programming 
2310             
2311             // update E value
2312             E   = vec_subs(E,v_gapextend);
2313             tmp = vec_subs(H,v_gapopen);
2314             E   = vec_max(E,tmp);
2315             
2316             // update F value
2317             F   = vec_subs(Fup,v_gapextend);
2318             tmp = vec_subs(Hup1,v_gapopen);
2319             F   = vec_max(F,tmp);
2320             
2321             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2322             
2323             // add score to H
2324             H   = vec_adds(Hup2,v_score);
2325             H   = vec_subs(H,v_bias);
2326             
2327             // set H to max of H,E,F
2328             H   = vec_max(H,E);
2329             H   = vec_max(H,F);
2330             
2331             // Update highest score encountered this far
2332             v_maxscore = vec_max(v_maxscore,H);
2333             
2334             
2335
2336             
2337             
2338             
2339             // STEP 6
2340             
2341             // prefetch next residue
2342             k                = db_sequence[j+6];
2343             k8               = db_sequence[j-2];
2344             
2345             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2346             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2347             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2348             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2349             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2350             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2351             
2352             
2353             // prefetch scores for next step
2354             v_score_load1 = vec_ld(16*k,query_profile_byte);
2355             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2356             
2357             // load values of F and H from previous row (one unit up)
2358             Fup    = vec_ld(512, p);
2359             Hup2   = vec_ld(528, p);
2360             
2361             // save old values of F and H to use on next row
2362             vec_st(F, 0,  p);
2363             vec_st(H, 16, p);
2364             p += 32;
2365             
2366             // shift into place so we have complete F and H vectors
2367             // that refer to the values one unit up from each cell
2368             // that we are currently working on.
2369             Fup    = vec_sld(Fup,F,15);
2370             Hup2   = vec_sld(Hup2,H,15);            
2371             
2372             // do the dynamic programming 
2373             
2374             // update E value
2375             E   = vec_subs(E,v_gapextend);
2376             tmp = vec_subs(H,v_gapopen);
2377             E   = vec_max(E,tmp);
2378             
2379             // update F value
2380             F   = vec_subs(Fup,v_gapextend);
2381             tmp = vec_subs(Hup2,v_gapopen);
2382             F   = vec_max(F,tmp);
2383             
2384             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2385             
2386             // add score to H
2387             H   = vec_adds(Hup1,v_score);
2388             H   = vec_subs(H,v_bias);
2389             
2390             // set H to max of H,E,F
2391             H   = vec_max(H,E);
2392             H   = vec_max(H,F);
2393             
2394             // Update highest score encountered this far
2395             v_maxscore = vec_max(v_maxscore,H);
2396             
2397             
2398
2399             
2400             
2401             
2402             // STEP 7
2403             
2404             // prefetch next residue
2405             k                = db_sequence[j+7];
2406             k8               = db_sequence[j-1];
2407             
2408             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2409             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2410             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2411             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2412             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2413             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2414             
2415             
2416             // prefetch scores for next step
2417             v_score_load1 = vec_ld(16*k,query_profile_byte);
2418             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2419             
2420             // load values of F and H from previous row (one unit up)
2421             Fup    = vec_ld(512, p);
2422             Hup1    = vec_ld(528, p);
2423             
2424             // save old values of F and H to use on next row
2425             vec_st(F, 0,  p);
2426             vec_st(H, 16, p);
2427             p += 32;
2428             
2429             // shift into place so we have complete F and H vectors
2430             // that refer to the values one unit up from each cell
2431             // that we are currently working on.
2432             Fup    = vec_sld(Fup,F,15);
2433             Hup1    = vec_sld(Hup1,H,15);            
2434             
2435             // do the dynamic programming 
2436             
2437             // update E value
2438             E   = vec_subs(E,v_gapextend);
2439             tmp = vec_subs(H,v_gapopen);
2440             E   = vec_max(E,tmp);
2441             
2442             // update F value
2443             F   = vec_subs(Fup,v_gapextend);
2444             tmp = vec_subs(Hup1,v_gapopen);
2445             F   = vec_max(F,tmp);
2446             
2447             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2448             
2449             // add score to H
2450             H   = vec_adds(Hup2,v_score);
2451             H   = vec_subs(H,v_bias);
2452             
2453             // set H to max of H,E,F
2454             H   = vec_max(H,E);
2455             H   = vec_max(H,F);
2456             
2457             // Update highest score encountered this far
2458             v_maxscore = vec_max(v_maxscore,H);
2459             
2460             
2461             
2462
2463             
2464             
2465             // STEP 8
2466             
2467             // prefetch next residue
2468             k                = db_sequence[j+8];
2469             k8               = db_sequence[j];
2470             
2471             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2472             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2473             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2474             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2475             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2476             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2477             
2478             
2479             // prefetch scores for next step
2480             v_score_load1 = vec_ld(16*k,query_profile_byte);
2481             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2482             
2483             // load values of F and H from previous row (one unit up)
2484             Fup    = vec_ld(512, p);
2485             Hup2   = vec_ld(528, p);
2486             
2487             // save old values of F and H to use on next row
2488             vec_st(F, 0,  p);
2489             vec_st(H, 16, p);
2490             p += 32;
2491             
2492             // shift into place so we have complete F and H vectors
2493             // that refer to the values one unit up from each cell
2494             // that we are currently working on.
2495             Fup    = vec_sld(Fup,F,15);
2496             Hup2   = vec_sld(Hup2,H,15);            
2497             
2498             // do the dynamic programming 
2499             
2500             // update E value
2501             E   = vec_subs(E,v_gapextend);
2502             tmp = vec_subs(H,v_gapopen);
2503             E   = vec_max(E,tmp);
2504             
2505             // update F value
2506             F   = vec_subs(Fup,v_gapextend);
2507             tmp = vec_subs(Hup2,v_gapopen);
2508             F   = vec_max(F,tmp);
2509             
2510             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2511             
2512             // add score to H
2513             H   = vec_adds(Hup1,v_score);
2514             H   = vec_subs(H,v_bias);
2515             
2516             // set H to max of H,E,F
2517             H   = vec_max(H,E);
2518             H   = vec_max(H,F);
2519             
2520             // Update highest score encountered this far
2521             v_maxscore = vec_max(v_maxscore,H);
2522             
2523             
2524             
2525
2526             
2527             
2528             // STEP 9
2529             
2530             // prefetch next residue
2531             k                = db_sequence[j+9];
2532             k8               = db_sequence[j+1];
2533             
2534             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2535             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2536             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2537             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2538             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2539             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2540             
2541             
2542             // prefetch scores for next step
2543             v_score_load1 = vec_ld(16*k,query_profile_byte);
2544             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2545             
2546             // load values of F and H from previous row (one unit up)
2547             Fup    = vec_ld(512, p);
2548             Hup1   = vec_ld(528, p);
2549             
2550             // save old values of F and H to use on next row
2551             vec_st(F, 0,  p);
2552             vec_st(H, 16, p);
2553             p += 32;
2554             
2555             // shift into place so we have complete F and H vectors
2556             // that refer to the values one unit up from each cell
2557             // that we are currently working on.
2558             Fup    = vec_sld(Fup,F,15);
2559             Hup1   = vec_sld(Hup1,H,15);            
2560             
2561             // do the dynamic programming 
2562             
2563             // update E value
2564             E   = vec_subs(E,v_gapextend);
2565             tmp = vec_subs(H,v_gapopen);
2566             E   = vec_max(E,tmp);
2567             
2568             // update F value
2569             F   = vec_subs(Fup,v_gapextend);
2570             tmp = vec_subs(Hup1,v_gapopen);
2571             F   = vec_max(F,tmp);
2572             
2573             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2574             
2575             // add score to H
2576             H   = vec_adds(Hup2,v_score);
2577             H   = vec_subs(H,v_bias);
2578             
2579             // set H to max of H,E,F
2580             H   = vec_max(H,E);
2581             H   = vec_max(H,F);
2582             
2583             // Update highest score encountered this far
2584             v_maxscore = vec_max(v_maxscore,H);
2585             
2586             // STEP 10
2587             
2588             // prefetch next residue
2589             k                = db_sequence[j+10];
2590             k8               = db_sequence[j+2];
2591             
2592             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2593             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2594             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2595             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2596             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2597             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2598             
2599             
2600             // prefetch scores for next step
2601             v_score_load1 = vec_ld(16*k,query_profile_byte);
2602             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2603             
2604             // load values of F and H from previous row (one unit up)
2605             Fup    = vec_ld(512, p);
2606             Hup2   = vec_ld(528, p);
2607             
2608             // save old values of F and H to use on next row
2609             vec_st(F, 0,  p);
2610             vec_st(H, 16, p);
2611             p += 32;
2612             
2613             // shift into place so we have complete F and H vectors
2614             // that refer to the values one unit up from each cell
2615             // that we are currently working on.
2616             Fup    = vec_sld(Fup,F,15);
2617             Hup2   = vec_sld(Hup2,H,15);            
2618             
2619             // do the dynamic programming 
2620             
2621             // update E value
2622             E   = vec_subs(E,v_gapextend);
2623             tmp = vec_subs(H,v_gapopen);
2624             E   = vec_max(E,tmp);
2625             
2626             // update F value
2627             F   = vec_subs(Fup,v_gapextend);
2628             tmp = vec_subs(Hup2,v_gapopen);
2629             F   = vec_max(F,tmp);
2630             
2631             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2632             
2633             // add score to H
2634             H   = vec_adds(Hup1,v_score);
2635             H   = vec_subs(H,v_bias);
2636             
2637             // set H to max of H,E,F
2638             H   = vec_max(H,E);
2639             H   = vec_max(H,F);
2640         
2641             // Update highest score encountered this far
2642             v_maxscore = vec_max(v_maxscore,H);
2643             
2644             // STEP 11
2645             
2646             // prefetch next residue
2647             k                = db_sequence[j+11];
2648             k8               = db_sequence[j+3];
2649             
2650             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2651             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2652             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2653             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2654             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2655             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2656             
2657             
2658             // prefetch scores for next step
2659             v_score_load1 = vec_ld(16*k,query_profile_byte);
2660             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2661             
2662             // load values of F and H from previous row (one unit up)
2663             Fup    = vec_ld(512, p);
2664             Hup1   = vec_ld(528, p);
2665             
2666             // save old values of F and H to use on next row
2667             vec_st(F, 0,  p);
2668             vec_st(H, 16, p);
2669             p += 32;
2670             
2671             // shift into place so we have complete F and H vectors
2672             // that refer to the values one unit up from each cell
2673             // that we are currently working on.
2674             Fup    = vec_sld(Fup,F,15);
2675             Hup1   = vec_sld(Hup1,H,15);            
2676             
2677             // do the dynamic programming 
2678             
2679             // update E value
2680             E   = vec_subs(E,v_gapextend);
2681             tmp = vec_subs(H,v_gapopen);
2682             E   = vec_max(E,tmp);
2683             
2684             // update F value
2685             F   = vec_subs(Fup,v_gapextend);
2686             tmp = vec_subs(Hup1,v_gapopen);
2687             F   = vec_max(F,tmp);
2688             
2689             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2690             
2691             // add score to H
2692             H   = vec_adds(Hup2,v_score);
2693             H   = vec_subs(H,v_bias);
2694             
2695             // set H to max of H,E,F
2696             H   = vec_max(H,E);
2697             H   = vec_max(H,F);
2698             
2699             // Update highest score encountered this far
2700             v_maxscore = vec_max(v_maxscore,H);
2701             
2702             // STEP 12
2703             
2704             // prefetch next residue
2705             k                = db_sequence[j+12];
2706             k8               = db_sequence[j+4];
2707             
2708             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2709             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2710             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2711             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2712             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2713             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2714             
2715             
2716             // prefetch scores for next step
2717             v_score_load1 = vec_ld(16*k,query_profile_byte);
2718             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2719             
2720             // load values of F and H from previous row (one unit up)
2721             Fup    = vec_ld(512, p);
2722             Hup2   = vec_ld(528, p);
2723             
2724             // save old values of F and H to use on next row
2725             vec_st(F, 0,  p);
2726             vec_st(H, 16, p);
2727             p += 32;
2728             
2729             // shift into place so we have complete F and H vectors
2730             // that refer to the values one unit up from each cell
2731             // that we are currently working on.
2732             Fup    = vec_sld(Fup,F,15);
2733             Hup2   = vec_sld(Hup2,H,15);            
2734             
2735             // do the dynamic programming 
2736             
2737             // update E value
2738             E   = vec_subs(E,v_gapextend);
2739             tmp = vec_subs(H,v_gapopen);
2740             E   = vec_max(E,tmp);
2741             
2742             // update F value
2743             F   = vec_subs(Fup,v_gapextend);
2744             tmp = vec_subs(Hup2,v_gapopen);
2745             F   = vec_max(F,tmp);
2746             
2747             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2748             
2749             // add score to H
2750             H   = vec_adds(Hup1,v_score);
2751             H   = vec_subs(H,v_bias);
2752             
2753             // set H to max of H,E,F
2754             H   = vec_max(H,E);
2755             H   = vec_max(H,F);
2756             
2757             // Update highest score encountered this far
2758             v_maxscore = vec_max(v_maxscore,H);
2759             
2760             // STEP 13
2761             
2762             // prefetch next residue
2763             k                = db_sequence[j+13];
2764             k8               = db_sequence[j+5];
2765             
2766             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2767             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2768             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2769             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2770             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2771             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2772             
2773             
2774             // prefetch scores for next step
2775             v_score_load1 = vec_ld(16*k,query_profile_byte);
2776             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2777             
2778             // load values of F and H from previous row (one unit up)
2779             Fup    = vec_ld(512, p);
2780             Hup1   = vec_ld(528, p);
2781             
2782             // save old values of F and H to use on next row
2783             vec_st(F, 0,  p);
2784             vec_st(H, 16, p);
2785             p += 32;
2786             
2787             // shift into place so we have complete F and H vectors
2788             // that refer to the values one unit up from each cell
2789             // that we are currently working on.
2790             Fup    = vec_sld(Fup,F,15);
2791             Hup1   = vec_sld(Hup1,H,15);            
2792             
2793             // do the dynamic programming 
2794             
2795             // update E value
2796             E   = vec_subs(E,v_gapextend);
2797             tmp = vec_subs(H,v_gapopen);
2798             E   = vec_max(E,tmp);
2799             
2800             // update F value
2801             F   = vec_subs(Fup,v_gapextend);
2802             tmp = vec_subs(Hup1,v_gapopen);
2803             F   = vec_max(F,tmp);
2804             
2805             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2806             
2807             // add score to H
2808             H   = vec_adds(Hup2,v_score);
2809             H   = vec_subs(H,v_bias);
2810             
2811             // set H to max of H,E,F
2812             H   = vec_max(H,E);
2813             H   = vec_max(H,F);
2814             
2815             // Update highest score encountered this far
2816             v_maxscore = vec_max(v_maxscore,H);
2817             
2818             // STEP 14
2819             
2820             // prefetch next residue
2821             k                = db_sequence[j+14];
2822             k8               = db_sequence[j+6];
2823             
2824             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2825             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2826             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2827             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2828             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2829             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2830             
2831             
2832             // prefetch scores for next step
2833             v_score_load1 = vec_ld(16*k,query_profile_byte);
2834             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2835             
2836             // load values of F and H from previous row (one unit up)
2837             Fup    = vec_ld(512, p);
2838             Hup2   = vec_ld(528, p);
2839             
2840             // save old values of F and H to use on next row
2841             vec_st(F, 0,  p);
2842             vec_st(H, 16, p);
2843             p += 32;
2844             
2845             // shift into place so we have complete F and H vectors
2846             // that refer to the values one unit up from each cell
2847             // that we are currently working on.
2848             Fup    = vec_sld(Fup,F,15);
2849             Hup2   = vec_sld(Hup2,H,15);            
2850             
2851             // do the dynamic programming 
2852             
2853             // update E value
2854             E   = vec_subs(E,v_gapextend);
2855             tmp = vec_subs(H,v_gapopen);
2856             E   = vec_max(E,tmp);
2857             
2858             // update F value
2859             F   = vec_subs(Fup,v_gapextend);
2860             tmp = vec_subs(Hup2,v_gapopen);
2861             F   = vec_max(F,tmp);
2862             
2863             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2864             
2865             // add score to H
2866             H   = vec_adds(Hup1,v_score);
2867             H   = vec_subs(H,v_bias);
2868             
2869             // set H to max of H,E,F
2870             H   = vec_max(H,E);
2871             H   = vec_max(H,F);
2872             
2873             // Update highest score encountered this far
2874             v_maxscore = vec_max(v_maxscore,H);
2875             
2876             // STEP 15
2877             
2878             // prefetch next residue
2879             k                = db_sequence[j+15];
2880             k8               = db_sequence[j+7];
2881             
2882             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2883             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2884             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2885             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2886             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2887             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2888                         
2889             // prefetch scores for next step
2890             v_score_load1 = vec_ld(16*k,query_profile_byte);
2891             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2892             
2893             // load values of F and H from previous row (one unit up)
2894             Fup    = vec_ld(512, p);
2895             Hup1   = vec_ld(528, p);
2896             
2897             // save old values of F and H to use on next row
2898             vec_st(F, 0,  p);
2899             vec_st(H, 16, p);
2900             p += 32;
2901             
2902             // shift into place so we have complete F and H vectors
2903             // that refer to the values one unit up from each cell
2904             // that we are currently working on.
2905             Fup    = vec_sld(Fup,F,15);
2906             Hup1   = vec_sld(Hup1,H,15);            
2907             
2908             // do the dynamic programming 
2909             
2910             // update E value
2911             E   = vec_subs(E,v_gapextend);
2912             tmp = vec_subs(H,v_gapopen);
2913             E   = vec_max(E,tmp);
2914             
2915             // update F value
2916             F   = vec_subs(Fup,v_gapextend);
2917             tmp = vec_subs(Hup1,v_gapopen);
2918             F   = vec_max(F,tmp);
2919             
2920             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2921             
2922             // add score to H
2923             H   = vec_adds(Hup2,v_score);
2924             H   = vec_subs(H,v_bias);
2925             
2926             // set H to max of H,E,F
2927             H   = vec_max(H,E);
2928             H   = vec_max(H,F);
2929             
2930             // Update highest score encountered this far
2931             v_maxscore = vec_max(v_maxscore,H);
2932             
2933             // STEP 16
2934             
2935             // prefetch next residue
2936             k                = db_sequence[j+16];
2937             k8               = db_sequence[j+8];
2938             
2939             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2940             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2941             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2942             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2943             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2944             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2945             
2946             
2947             // prefetch scores for next step
2948             v_score_load1 = vec_ld(16*k,query_profile_byte);
2949             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2950             
2951             // load values of F and H from previous row (one unit up)
2952             Fup    = vec_ld(512, p);
2953             Hup2   = vec_ld(528, p);
2954             
2955             // save old values of F and H to use on next row
2956             vec_st(F, 0,  p);
2957             vec_st(H, 16, p);
2958             p += 32;
2959             
2960             // shift into place so we have complete F and H vectors
2961             // that refer to the values one unit up from each cell
2962             // that we are currently working on.
2963             Fup    = vec_sld(Fup,F,15);
2964             Hup2   = vec_sld(Hup2,H,15);            
2965             
2966             // do the dynamic programming 
2967             
2968             // update E value
2969             E   = vec_subs(E,v_gapextend);
2970             tmp = vec_subs(H,v_gapopen);
2971             E   = vec_max(E,tmp);
2972             
2973             // update F value
2974             F   = vec_subs(Fup,v_gapextend);
2975             tmp = vec_subs(Hup2,v_gapopen);
2976             F   = vec_max(F,tmp);
2977             
2978             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2979             
2980             // add score to H
2981             H   = vec_adds(Hup1,v_score);
2982             H   = vec_subs(H,v_bias);
2983             
2984             // set H to max of H,E,F
2985             H   = vec_max(H,E);
2986             H   = vec_max(H,F);
2987             
2988             // Update highest score encountered this far
2989             v_maxscore = vec_max(v_maxscore,H);
2990             
2991         }
2992         
2993         for(;j<db_length+15;j++)
2994         {
2995             k8               = db_sequence[j-7];
2996
2997             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2998             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2999             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
3000             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
3001             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
3002             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
3003             
3004             
3005             // prefetch scores for next step
3006             v_score_load2 = vec_ld(16*k8,query_profile_byte);
3007             v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
3008
3009             // save old values of F and H to use on next row
3010             vec_st(F, 0,  p);
3011             vec_st(H, 16, p);
3012             p += 32; // move ahead 32 bytes
3013             
3014             Fup    = vec_sld(v_zero,F,15);
3015             Hup1   = vec_sld(v_zero,H,15);            
3016             
3017             // do the dynamic programming 
3018             
3019             // update E value
3020             E   = vec_subs(E,v_gapextend);
3021             tmp = vec_subs(H,v_gapopen);
3022             E   = vec_max(E,tmp);
3023             
3024             // update F value
3025             F   = vec_subs(Fup,v_gapextend);
3026             tmp = vec_subs(Hup1,v_gapopen);
3027             F   = vec_max(F,tmp);
3028             
3029             // add score to H
3030             H   = vec_adds(Hup2,v_score);
3031             H   = vec_subs(H,v_bias);
3032             
3033             // set H to max of H,E,F
3034             H   = vec_max(H,E);
3035             H   = vec_max(H,F);
3036             
3037             // Save value to use for next diagonal H 
3038             Hup2 = Hup1;
3039
3040             // Update highest score encountered this far
3041             v_maxscore = vec_max(v_maxscore,H);
3042         }
3043         vec_st(F, 512, p);
3044         vec_st(H, 528, p);
3045
3046         query_profile_byte += 16*alphabet_size;
3047
3048         // End of this row (actually 16 rows due to SIMD).
3049         // Before we continue, check for overflow.
3050         tmp      = vec_subs(vec_splat_u8(-1),v_bias);
3051         overflow = vec_any_ge(v_maxscore,tmp);
3052         
3053
3054     }
3055
3056     if(overflow)
3057     {
3058         return 255;
3059     }
3060     else
3061     {
3062         // find largest score in the v_maxscore vector
3063         tmp = vec_sld(v_maxscore,v_maxscore,8);
3064         v_maxscore = vec_max(v_maxscore,tmp);
3065         tmp = vec_sld(v_maxscore,v_maxscore,4);
3066         v_maxscore = vec_max(v_maxscore,tmp);
3067         tmp = vec_sld(v_maxscore,v_maxscore,2);
3068         v_maxscore = vec_max(v_maxscore,tmp);
3069         tmp = vec_sld(v_maxscore,v_maxscore,1);
3070         v_maxscore = vec_max(v_maxscore,tmp);
3071         
3072         // store in temporary variable
3073         vec_ste(v_maxscore,0,&score);
3074         
3075         // return largest score
3076         return score;
3077     }}
3078
3079
3080 #else
3081
3082 /* No Altivec support: nothing above was compiled in this configuration.
        Define a dummy global so that this translation unit is not empty,
        which avoids compiler complaints about an empty object.
        NOTE(review): sw_dummy is not referenced in the visible code and
        appears to exist only as a placeholder -- confirm before removing. */
3083
3084 int sw_dummy;
3085
3086 #endif