2 /* Implementation of the Wozniak "anti-diagonal" vectorization
3 strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
4 Appl. Biosci. 13:145-150
10 Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
11 Please send bug reports and/or suggestions to lindahl@sbc.su.se.
/*
 * smith_waterman_altivec_word: Smith-Waterman scoring with the anti-diagonal
 * AltiVec scheme, working on vectors of eight unsigned 16-bit lanes.
 *
 * Visible parameters:
 *   query_sequence      - declared here; not referenced in the visible lines
 *   query_profile_word  - score profile, read 16 bytes at a time with vec_ld
 *                         and advanced by 8*alphabet_size (apparently once
 *                         per 8-residue query strip)
 *   db_sequence         - database sequence; p_dbseq is reset to it
 *   gap_open/gap_extend - penalties, splatted into v_gapopen / v_gapextend
 *   f_str               - supplies alphabet_size and the workspace buffer
 *
 * NOTE(review): this listing omits source lines (the numeric prefixes skip).
 * query_length, db_length, bias, i, j, k, p and score are used below but
 * their declarations are not shown, and the statements that should follow
 * several comments (e.g. "set H to max of H,E,F") are missing.  The last
 * comment says the largest score is returned; the return statement itself
 * is not visible here.
 */
23 smith_waterman_altivec_word(unsigned char * query_sequence,
24 unsigned short * query_profile_word,
26 unsigned char * db_sequence,
29 unsigned short gap_open,
30 unsigned short gap_extend,
31 struct f_struct * f_str)
36 unsigned char * p_dbseq;
37 int alphabet_size = f_str->alphabet_size;
38 unsigned short * workspace = (unsigned short *)f_str->workspace;
40 vector unsigned short Fup,Hup1,Hup2,E,F,H,tmp;
41 vector unsigned char perm;
42 vector unsigned short v_maxscore;
43 vector unsigned short v_bias,v_gapopen,v_gapextend;
44 vector unsigned short v_score;
45 vector unsigned short v_score_q1;
46 vector unsigned short v_score_q2;
47 vector unsigned short v_score_q3;
48 vector unsigned short v_score_load;
// Control vectors for vec_perm(queue, v_score_load, ctl): byte indices 0-15
// select from the first operand (a queue), 16-31 from the second (the freshly
// loaded profile vector).
//  - queue1_to_score : lane 0 from the load (bytes 16-17), lanes 1-7 from queue 1
//  - queue2_to_queue1: lane 1 from the load (bytes 18-19), the rest from queue 2
//  - queue3_to_queue2 / queue3_with_load shuffle at byte granularity; their
//    exact packing is not decoded here (NOTE(review): confirm against the
//    original design notes before changing any index).
49 vector unsigned char queue1_to_score = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
50 vector unsigned char queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
51 vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
52 vector unsigned char queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
// Scalar-to-vector broadcast idiom, used three times below:
//   vec_lde   loads the scalar into the lane selected by its address,
//   vec_lvsl  builds the permute control that rotates that lane to lane 0,
//   vec_perm  applies the rotation, and
//   vec_splat copies lane 0 into every lane.
54 /* Load the bias to all elements of a constant */
55 v_bias = vec_lde(0,&bias);
56 perm = vec_lvsl(0,&bias);
57 v_bias = vec_perm(v_bias,v_bias,perm);
58 v_bias = vec_splat(v_bias,0);
60 /* Load gap opening penalty to all elements of a constant */
61 v_gapopen = vec_lde(0,&gap_open);
62 perm = vec_lvsl(0,&gap_open);
63 v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
64 v_gapopen = vec_splat(v_gapopen,0);
66 /* Load gap extension penalty to all elements of a constant */
67 v_gapextend = vec_lde(0,&gap_extend);
68 perm = vec_lvsl(0,&gap_extend);
69 v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
70 v_gapextend = vec_splat(v_gapextend,0);
// x XOR x is zero whatever x held, so this clears the running maximum.
// NOTE(review): it reads v_maxscore before it has been assigned;
// vec_splat_u16(0), as used further down, gives the same zero without that.
72 v_maxscore = vec_xor(v_maxscore,v_maxscore);
74 // Zero out the storage vector
// Stores k zero vectors at byte offsets 0,16,32,... of workspace.
// NOTE(review): the line that sets k is not in this listing.
77 for(i=0,j=0;i<k;i++,j+=16)
79 // borrow the zero value in v_maxscore to have something to store
80 vec_st(v_maxscore,j,workspace);
// Outer loop: the query is consumed 8 residues at a time (one 16-bit lane each).
83 for(i=0;i<query_length;i+=8)
85 // fetch first data asap.
86 p_dbseq = db_sequence;
// Loads the 16-byte profile vector at byte offset 16*k.
// NOTE(review): k is presumably derived from the current database residue on
// a line omitted from this listing - confirm.
88 v_score_load = vec_ld(16*k,query_profile_word);
90 // zero lots of stuff.
91 // We use both the VPERM and VSIU unit to knock off some cycles.
96 Hup2 = vec_xor(Hup2,Hup2);
97 v_score_q1 = vec_splat_u16(0);
98 v_score_q2 = vec_xor(v_score_q2,v_score_q2);
99 v_score_q3 = vec_splat_u16(0);
101 // reset pointers to the start of the saved data from the last row
// ---- start-up step 1 of 8 ----
// Eight unrolled steps follow; presumably they cover the first 8 database
// positions, since the loop after them starts at j=8.
105 // prefetch next residue
108 // Create the actual diagonal score vector
109 // and update the queue of incomplete score vectors
111 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
112 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
113 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
114 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
116 // prefetch score for next step
117 v_score_load = vec_ld(16*k,query_profile_word);
119 // load values of F and H from previous row (one unit up)
// NOTE(review): only the H load is visible; the matching F load is on an
// omitted line.  The "32 bytes" remark below fits p being a pointer to
// unsigned short (its declaration is not shown).
121 Hup1 = vec_ld(16, p);
122 p += 16; // move ahead 32 bytes
124 // shift into place so we have complete F and H vectors
125 // that refer to the values one unit up from each cell
126 // that we are currently working on.
// vec_sld(a,b,14) yields bytes 14..29 of a:b, i.e. the last lane of a
// followed by lanes 0..6 of b.
127 Fup = vec_sld(Fup,F,14);
128 Hup1 = vec_sld(Hup1,H,14);
130 // do the dynamic programming
// vec_subs / vec_adds saturate, so on these unsigned lanes a subtraction can
// never drop below zero.  The max() steps that combine tmp with E and F are
// on omitted lines.
133 E = vec_subs(E,v_gapextend);
134 tmp = vec_subs(H,v_gapopen);
138 F = vec_subs(Fup,v_gapextend);
139 tmp = vec_subs(Hup1,v_gapopen);
// Diagonal term: previous diagonal H plus the (biased) profile score, with
// the bias then removed.
143 H = vec_adds(Hup2,v_score);
144 H = vec_subs(H,v_bias);
146 // set H to max of H,E,F
150 // Save value to use for next diagonal H
153 // Update highest score encountered this far
154 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 2 of 8 (same sequence as step 1) ----
158 // prefetch next residue
161 // Create the actual diagonal score vector
162 // and update the queue of incomplete score vectors
164 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
165 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
166 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
167 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
169 // prefetch score for next step
170 v_score_load = vec_ld(16*k,query_profile_word);
172 // load values of F and H from previous row (one unit up)
174 Hup1 = vec_ld(16, p);
175 p += 16; // move ahead 32 bytes
177 // shift into place so we have complete F and H vectors
178 // that refer to the values one unit up from each cell
179 // that we are currently working on.
180 Fup = vec_sld(Fup,F,14);
181 Hup1 = vec_sld(Hup1,H,14);
183 // do the dynamic programming
186 E = vec_subs(E,v_gapextend);
187 tmp = vec_subs(H,v_gapopen);
191 F = vec_subs(Fup,v_gapextend);
192 tmp = vec_subs(Hup1,v_gapopen);
196 H = vec_adds(Hup2,v_score);
197 H = vec_subs(H,v_bias);
199 // set H to max of H,E,F
203 // Save value to use for next diagonal H
206 // Update highest score encountered this far
207 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 3 of 8 ----
211 // prefetch next residue
214 // Create the actual diagonal score vector
215 // and update the queue of incomplete score vectors
217 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
218 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
219 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
220 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
222 // prefetch score for next step
223 v_score_load = vec_ld(16*k,query_profile_word);
225 // load values of F and H from previous row (one unit up)
227 Hup1 = vec_ld(16, p);
228 p += 16; // move ahead 32 bytes
230 // shift into place so we have complete F and H vectors
231 // that refer to the values one unit up from each cell
232 // that we are currently working on.
233 Fup = vec_sld(Fup,F,14);
234 Hup1 = vec_sld(Hup1,H,14);
236 // do the dynamic programming
239 E = vec_subs(E,v_gapextend);
240 tmp = vec_subs(H,v_gapopen);
244 F = vec_subs(Fup,v_gapextend);
245 tmp = vec_subs(Hup1,v_gapopen);
249 H = vec_adds(Hup2,v_score);
250 H = vec_subs(H,v_bias);
252 // set H to max of H,E,F
256 // Save value to use for next diagonal H
259 // Update highest score encountered this far
260 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 4 of 8 ----
264 // prefetch next residue
267 // Create the actual diagonal score vector
268 // and update the queue of incomplete score vectors
270 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
271 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
272 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
273 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
275 // prefetch score for next step
276 v_score_load = vec_ld(16*k,query_profile_word);
278 // load values of F and H from previous row (one unit up)
280 Hup1 = vec_ld(16, p);
281 p += 16; // move ahead 32 bytes
283 // shift into place so we have complete F and H vectors
284 // that refer to the values one unit up from each cell
285 // that we are currently working on.
286 Fup = vec_sld(Fup,F,14);
287 Hup1 = vec_sld(Hup1,H,14);
289 // do the dynamic programming
292 E = vec_subs(E,v_gapextend);
293 tmp = vec_subs(H,v_gapopen);
297 F = vec_subs(Fup,v_gapextend);
298 tmp = vec_subs(Hup1,v_gapopen);
302 H = vec_adds(Hup2,v_score);
303 H = vec_subs(H,v_bias);
305 // set H to max of H,E,F
309 // Save value to use for next diagonal H
312 // Update highest score encountered this far
313 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 5 of 8 ----
317 // prefetch next residue
320 // Create the actual diagonal score vector
321 // and update the queue of incomplete score vectors
323 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
324 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
325 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
326 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
328 // prefetch score for next step
329 v_score_load = vec_ld(16*k,query_profile_word);
331 // load values of F and H from previous row (one unit up)
333 Hup1 = vec_ld(16, p);
334 p += 16; // move ahead 32 bytes
336 // shift into place so we have complete F and H vectors
337 // that refer to the values one unit up from each cell
338 // that we are currently working on.
339 Fup = vec_sld(Fup,F,14);
340 Hup1 = vec_sld(Hup1,H,14);
342 // do the dynamic programming
345 E = vec_subs(E,v_gapextend);
346 tmp = vec_subs(H,v_gapopen);
350 F = vec_subs(Fup,v_gapextend);
351 tmp = vec_subs(Hup1,v_gapopen);
355 H = vec_adds(Hup2,v_score);
356 H = vec_subs(H,v_bias);
358 // set H to max of H,E,F
362 // Save value to use for next diagonal H
365 // Update highest score encountered this far
366 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 6 of 8 ----
370 // prefetch next residue
373 // Create the actual diagonal score vector
374 // and update the queue of incomplete score vectors
376 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
377 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
378 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
379 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
381 // prefetch score for next step
382 v_score_load = vec_ld(16*k,query_profile_word);
384 // load values of F and H from previous row (one unit up)
386 Hup1 = vec_ld(16, p);
387 p += 16; // move ahead 32 bytes
389 // shift into place so we have complete F and H vectors
390 // that refer to the values one unit up from each cell
391 // that we are currently working on.
392 Fup = vec_sld(Fup,F,14);
393 Hup1 = vec_sld(Hup1,H,14);
395 // do the dynamic programming
398 E = vec_subs(E,v_gapextend);
399 tmp = vec_subs(H,v_gapopen);
403 F = vec_subs(Fup,v_gapextend);
404 tmp = vec_subs(Hup1,v_gapopen);
408 H = vec_adds(Hup2,v_score);
409 H = vec_subs(H,v_bias);
411 // set H to max of H,E,F
415 // Save value to use for next diagonal H
418 // Update highest score encountered this far
419 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 7 of 8 ----
423 // prefetch next residue
426 // Create the actual diagonal score vector
427 // and update the queue of incomplete score vectors
429 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
430 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
431 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
432 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
434 // prefetch score for next step
435 v_score_load = vec_ld(16*k,query_profile_word);
437 // load values of F and H from previous row (one unit up)
439 Hup1 = vec_ld(16, p);
440 p += 16; // move ahead 32 bytes
442 // shift into place so we have complete F and H vectors
443 // that refer to the values one unit up from each cell
444 // that we are currently working on.
445 Fup = vec_sld(Fup,F,14);
446 Hup1 = vec_sld(Hup1,H,14);
448 // do the dynamic programming
451 E = vec_subs(E,v_gapextend);
452 tmp = vec_subs(H,v_gapopen);
456 F = vec_subs(Fup,v_gapextend);
457 tmp = vec_subs(Hup1,v_gapopen);
461 H = vec_adds(Hup2,v_score);
462 H = vec_subs(H,v_bias);
464 // set H to max of H,E,F
468 // Save value to use for next diagonal H
471 // Update highest score encountered this far
472 v_maxscore = vec_max(v_maxscore,H);
// ---- start-up step 8 of 8 ----
476 // prefetch next residue
479 // Create the actual diagonal score vector
480 // and update the queue of incomplete score vectors
482 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
483 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
484 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
485 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
487 // prefetch score for next step
488 v_score_load = vec_ld(16*k,query_profile_word);
490 // load values of F and H from previous row (one unit up)
492 Hup1 = vec_ld(16, p);
493 p += 16; // move ahead 32 bytes
495 // shift into place so we have complete F and H vectors
496 // that refer to the values one unit up from each cell
497 // that we are currently working on.
498 Fup = vec_sld(Fup,F,14);
499 Hup1 = vec_sld(Hup1,H,14);
501 // do the dynamic programming
504 E = vec_subs(E,v_gapextend);
505 tmp = vec_subs(H,v_gapopen);
509 F = vec_subs(Fup,v_gapextend);
510 tmp = vec_subs(Hup1,v_gapopen);
514 H = vec_adds(Hup2,v_score);
515 H = vec_subs(H,v_bias);
517 // set H to max of H,E,F
521 // Save value to use for next diagonal H
524 // Update highest score encountered this far
525 v_maxscore = vec_max(v_maxscore,H);
528 // reset pointers to the start of the saved data from the last row
// Steady-state loop, unrolled 8 times per iteration (j advances by 8).
// Differences from the start-up steps that are visible here:
//  - F/H of the previous row are read 256 and 272 bytes ahead of p, i.e.
//    8 steps of 32 bytes ahead; the stores announced by the "save old values"
//    comments are on omitted lines.
//  - Hup1 and Hup2 swap roles every step (one receives the freshly loaded H
//    row, the other supplies the diagonal term), so no copy between them is
//    needed.
531 for(j=8;j<db_length;j+=8)
// ---- loop step 1 of 8: load into Hup1, diagonal from Hup2 ----
535 // prefetch next residue
538 // Create the actual diagonal score vector
539 // and update the queue of incomplete score vectors
541 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
542 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
543 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
544 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
546 // prefetch score for next step
547 v_score_load = vec_ld(16*k,query_profile_word);
549 // load values of F and H from previous row (one unit up)
550 Fup = vec_ld(256, p);
551 Hup1 = vec_ld(272, p);
553 // save old values of F and H to use on next row
556 p += 16; // move ahead 32 bytes
558 // shift into place so we have complete F and H vectors
559 // that refer to the values one unit up from each cell
560 // that we are currently working on.
561 Fup = vec_sld(Fup,F,14);
562 Hup1 = vec_sld(Hup1,H,14);
564 // do the dynamic programming
567 E = vec_subs(E,v_gapextend);
568 tmp = vec_subs(H,v_gapopen);
572 F = vec_subs(Fup,v_gapextend);
573 tmp = vec_subs(Hup1,v_gapopen);
577 H = vec_adds(Hup2,v_score);
578 H = vec_subs(H,v_bias);
580 // set H to max of H,E,F
585 // Update highest score encountered this far
586 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 2 of 8: load into Hup2, diagonal from Hup1 ----
592 // prefetch next residue
595 // Create the actual diagonal score vector
596 // and update the queue of incomplete score vectors
598 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
599 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
600 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
601 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
603 // prefetch score for next step
604 v_score_load = vec_ld(16*k,query_profile_word);
606 // load values of F and H from previous row (one unit up)
607 Fup = vec_ld(256, p);
608 Hup2 = vec_ld(272, p);
610 // save old values of F and H to use on next row
613 p += 16; // move ahead 32 bytes
615 // shift into place so we have complete F and H vectors
616 // that refer to the values one unit up from each cell
617 // that we are currently working on.
618 Fup = vec_sld(Fup,F,14);
619 Hup2 = vec_sld(Hup2,H,14);
621 // do the dynamic programming
624 E = vec_subs(E,v_gapextend);
625 tmp = vec_subs(H,v_gapopen);
629 F = vec_subs(Fup,v_gapextend);
630 tmp = vec_subs(Hup2,v_gapopen);
634 H = vec_adds(Hup1,v_score);
635 H = vec_subs(H,v_bias);
637 // set H to max of H,E,F
642 // Update highest score encountered this far
643 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 3 of 8: load into Hup1, diagonal from Hup2 ----
649 // prefetch next residue
652 // Create the actual diagonal score vector
653 // and update the queue of incomplete score vectors
655 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
656 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
657 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
658 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
660 // prefetch score for next step
661 v_score_load = vec_ld(16*k,query_profile_word);
663 // load values of F and H from previous row (one unit up)
664 Fup = vec_ld(256, p);
665 Hup1 = vec_ld(272, p);
667 // save old values of F and H to use on next row
670 p += 16; // move ahead 32 bytes
672 // shift into place so we have complete F and H vectors
673 // that refer to the values one unit up from each cell
674 // that we are currently working on.
675 Fup = vec_sld(Fup,F,14);
676 Hup1 = vec_sld(Hup1,H,14);
678 // do the dynamic programming
681 E = vec_subs(E,v_gapextend);
682 tmp = vec_subs(H,v_gapopen);
686 F = vec_subs(Fup,v_gapextend);
687 tmp = vec_subs(Hup1,v_gapopen);
691 H = vec_adds(Hup2,v_score);
692 H = vec_subs(H,v_bias);
694 // set H to max of H,E,F
700 // Update highest score encountered this far
701 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 4 of 8: load into Hup2, diagonal from Hup1 ----
707 // prefetch next residue
710 // Create the actual diagonal score vector
711 // and update the queue of incomplete score vectors
713 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
714 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
715 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
716 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
718 // prefetch score for next step
719 v_score_load = vec_ld(16*k,query_profile_word);
721 // load values of F and H from previous row (one unit up)
722 Fup = vec_ld(256, p);
723 Hup2 = vec_ld(272, p);
725 // save old values of F and H to use on next row
728 p += 16; // move ahead 32 bytes
730 // shift into place so we have complete F and H vectors
731 // that refer to the values one unit up from each cell
732 // that we are currently working on.
733 Fup = vec_sld(Fup,F,14);
734 Hup2 = vec_sld(Hup2,H,14);
736 // do the dynamic programming
739 E = vec_subs(E,v_gapextend);
740 tmp = vec_subs(H,v_gapopen);
744 F = vec_subs(Fup,v_gapextend);
745 tmp = vec_subs(Hup2,v_gapopen);
749 H = vec_adds(Hup1,v_score);
750 H = vec_subs(H,v_bias);
752 // set H to max of H,E,F
757 // Update highest score encountered this far
758 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 5 of 8: load into Hup1, diagonal from Hup2 ----
764 // prefetch next residue
767 // Create the actual diagonal score vector
768 // and update the queue of incomplete score vectors
770 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
771 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
772 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
773 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
775 // prefetch score for next step
776 v_score_load = vec_ld(16*k,query_profile_word);
778 // load values of F and H from previous row (one unit up)
779 Fup = vec_ld(256, p);
780 Hup1 = vec_ld(272, p);
782 // save old values of F and H to use on next row
785 p += 16; // move ahead 32 bytes
787 // shift into place so we have complete F and H vectors
788 // that refer to the values one unit up from each cell
789 // that we are currently working on.
790 Fup = vec_sld(Fup,F,14);
791 Hup1 = vec_sld(Hup1,H,14);
793 // do the dynamic programming
796 E = vec_subs(E,v_gapextend);
797 tmp = vec_subs(H,v_gapopen);
801 F = vec_subs(Fup,v_gapextend);
802 tmp = vec_subs(Hup1,v_gapopen);
806 H = vec_adds(Hup2,v_score);
807 H = vec_subs(H,v_bias);
809 // set H to max of H,E,F
814 // Update highest score encountered this far
815 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 6 of 8: load into Hup2, diagonal from Hup1 ----
821 // prefetch next residue
824 // Create the actual diagonal score vector
825 // and update the queue of incomplete score vectors
827 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
828 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
829 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
830 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
832 // prefetch score for next step
833 v_score_load = vec_ld(16*k,query_profile_word);
835 // load values of F and H from previous row (one unit up)
836 Fup = vec_ld(256, p);
837 Hup2 = vec_ld(272, p);
839 // save old values of F and H to use on next row
842 p += 16; // move ahead 32 bytes
844 // shift into place so we have complete F and H vectors
845 // that refer to the values one unit up from each cell
846 // that we are currently working on.
847 Fup = vec_sld(Fup,F,14);
848 Hup2 = vec_sld(Hup2,H,14);
850 // do the dynamic programming
853 E = vec_subs(E,v_gapextend);
854 tmp = vec_subs(H,v_gapopen);
858 F = vec_subs(Fup,v_gapextend);
859 tmp = vec_subs(Hup2,v_gapopen);
863 H = vec_adds(Hup1,v_score);
864 H = vec_subs(H,v_bias);
866 // set H to max of H,E,F
872 // Update highest score encountered this far
873 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 7 of 8: load into Hup1, diagonal from Hup2 ----
879 // prefetch next residue
882 // Create the actual diagonal score vector
883 // and update the queue of incomplete score vectors
885 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
886 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
887 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
888 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
890 // prefetch score for next step
891 v_score_load = vec_ld(16*k,query_profile_word);
893 // load values of F and H from previous row (one unit up)
894 Fup = vec_ld(256, p);
895 Hup1 = vec_ld(272, p);
897 // save old values of F and H to use on next row
900 p += 16; // move ahead 32 bytes
902 // shift into place so we have complete F and H vectors
903 // that refer to the values one unit up from each cell
904 // that we are currently working on.
905 Fup = vec_sld(Fup,F,14);
906 Hup1 = vec_sld(Hup1,H,14);
908 // do the dynamic programming
911 E = vec_subs(E,v_gapextend);
912 tmp = vec_subs(H,v_gapopen);
916 F = vec_subs(Fup,v_gapextend);
917 tmp = vec_subs(Hup1,v_gapopen);
921 H = vec_adds(Hup2,v_score);
922 H = vec_subs(H,v_bias);
924 // set H to max of H,E,F
930 // Update highest score encountered this far
931 v_maxscore = vec_max(v_maxscore,H);
// ---- loop step 8 of 8: load into Hup2, diagonal from Hup1 ----
937 // prefetch next residue
940 // Create the actual diagonal score vector
941 // and update the queue of incomplete score vectors
943 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
944 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
945 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
946 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
948 // prefetch score for next step
949 v_score_load = vec_ld(16*k,query_profile_word);
951 // load values of F and H from previous row (one unit up)
952 Fup = vec_ld(256, p);
953 Hup2 = vec_ld(272, p);
955 // save old values of F and H to use on next row
958 p += 16; // move ahead 32 bytes
960 // shift into place so we have complete F and H vectors
961 // that refer to the values one unit up from each cell
962 // that we are currently working on.
963 Fup = vec_sld(Fup,F,14);
964 Hup2 = vec_sld(Hup2,H,14);
966 // do the dynamic programming
969 E = vec_subs(E,v_gapextend);
970 tmp = vec_subs(H,v_gapopen);
974 F = vec_subs(Fup,v_gapextend);
975 tmp = vec_subs(Hup2,v_gapopen);
979 H = vec_adds(Hup1,v_score);
980 H = vec_subs(H,v_bias);
982 // set H to max of H,E,F
987 // Update highest score encountered this far
988 v_maxscore = vec_max(v_maxscore,H);
// Tail: no more profile data is loaded, so v_score_load is zeroed and the
// loop runs on to db_length+7 to finish the anti-diagonals still in flight.
991 v_score_load = vec_splat_u16(0);
993 for(;j<db_length+7;j++)
995 // Create the actual diagonal score vector
996 // and update the queue of incomplete score vectors
998 // This could of course be done with only vec_perm or vec_sel,
999 // but since they use different execution units we have found
1000 // it to be slightly faster to mix them.
// NOTE(review): the four visible statements below all use vec_perm; no
// vec_sel appears in this listing, so the remark above may be stale.
1001 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
1002 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
1003 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
1004 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
1006 // save old values of F and H to use on next row
1009 p += 16; // move ahead 32 bytes
1011 // v_score_load contains all zeros
// So lane 0 of Fup/Hup1 becomes zero and lanes 1..7 come from F/H lanes 0..6.
1012 Fup = vec_sld(v_score_load,F,14);
1013 Hup1 = vec_sld(v_score_load,H,14);
1015 // do the dynamic programming
1018 E = vec_subs(E,v_gapextend);
1019 tmp = vec_subs(H,v_gapopen);
1023 F = vec_subs(Fup,v_gapextend);
1024 tmp = vec_subs(Hup1,v_gapopen);
1028 H = vec_adds(Hup2,v_score);
1029 H = vec_subs(H,v_bias);
1031 // set H to max of H,E,F
1035 // Save value to use for next diagonal H
1038 // Update highest score encountered this far
1039 v_maxscore = vec_max(v_maxscore,H);
// Advance the profile pointer by 8*alphabet_size elements, i.e. to the
// profile rows for the next 8 query residues.
1044 query_profile_word += 8*alphabet_size;
1047 // find largest score in the v_maxscore vector
// Horizontal max: rotate the vector onto itself by 8, 4 and then 2 bytes,
// taking the lane-wise max each time, so every lane ends up holding the
// overall maximum.
1048 tmp = vec_sld(v_maxscore,v_maxscore,8);
1049 v_maxscore = vec_max(v_maxscore,tmp);
1050 tmp = vec_sld(v_maxscore,v_maxscore,4);
1051 v_maxscore = vec_max(v_maxscore,tmp);
1052 tmp = vec_sld(v_maxscore,v_maxscore,2);
1053 v_maxscore = vec_max(v_maxscore,tmp);
1055 // store in temporary variable
// vec_ste writes a single lane to &score; any lane will do because all lanes
// are equal after the reduction above.
1056 vec_ste(v_maxscore,0,&score);
1058 // return largest score
1063 smith_waterman_altivec_byte(unsigned char * query_sequence,
1064 unsigned char * query_profile_byte,
1066 unsigned char * db_sequence,
1069 unsigned char gap_open,
1070 unsigned char gap_extend,
1071 struct f_struct * f_str)
1076 unsigned char score;
1077 int alphabet_size = f_str->alphabet_size;
1078 unsigned char * workspace = (unsigned char *)f_str->workspace;
1080 vector unsigned char Fup,Hup1,Hup2,E,F,H,tmp;
1081 vector unsigned char perm;
1082 vector unsigned char v_maxscore;
1083 vector unsigned char v_bias,v_gapopen,v_gapextend;
1084 vector unsigned char v_score;
1085 vector unsigned char v_score_q1;
1086 vector unsigned char v_score_q2;
1087 vector unsigned char v_score_q3;
1088 vector unsigned char v_score_q4;
1089 vector unsigned char v_score_q5;
1090 vector unsigned char v_score_load1;
1091 vector unsigned char v_score_load2;
1092 vector unsigned char v_zero;
1094 vector unsigned char queue1_to_score = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
1095 vector unsigned char queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
1096 vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
1097 vector unsigned char queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
1098 vector unsigned char queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
1099 vector unsigned char queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
1100 vector unsigned char merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
1102 v_zero = vec_splat_u8(0);
1104 /* Load the bias to all elements of a constant */
1105 v_bias = vec_lde(0,&bias);
1106 perm = vec_lvsl(0,&bias);
1107 v_bias = vec_perm(v_bias,v_bias,perm);
1108 v_bias = vec_splat(v_bias,0);
1110 /* Load gap opening penalty to all elements of a constant */
1111 v_gapopen = vec_lde(0,&gap_open);
1112 perm = vec_lvsl(0,&gap_open);
1113 v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
1114 v_gapopen = vec_splat(v_gapopen,0);
1116 /* Load gap extension penalty to all elements of a constant */
1117 v_gapextend = vec_lde(0,&gap_extend);
1118 perm = vec_lvsl(0,&gap_extend);
1119 v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
1120 v_gapextend = vec_splat(v_gapextend,0);
1122 v_maxscore = vec_xor(v_maxscore,v_maxscore);
1124 // Zero out the storage vector
1126 for(i=0,j=0;i<k;i++,j+=32)
1128 // borrow the zero value in v_maxscore to have something to store
1129 vec_st(v_maxscore,j,workspace);
1130 vec_st(v_maxscore,j+16,workspace);
1133 for(i=0;i<query_length;i+=16)
1135 // zero lots of stuff.
1136 // We use both the VPERM and VSIU unit to knock off some cycles.
1138 E = vec_splat_u8(0);
1140 H = vec_splat_u8(0);
1141 Hup2 = vec_xor(Hup2,Hup2);
1142 v_score_q1 = vec_splat_u8(0);
1143 v_score_q2 = vec_xor(v_score_q2,v_score_q2);
1144 v_score_q3 = vec_splat_u8(0);
1145 v_score_q4 = vec_xor(v_score_q4,v_score_q4);
1146 v_score_q5 = vec_splat_u8(0);
1148 // reset pointers to the start of the saved data from the last row
1151 // start directly and prefetch score column
1154 v_score_load1 = vec_ld(16*k,query_profile_byte);
1155 v_score_load2 = v_score_load1;
1156 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1159 // prefetch next residue
1162 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1163 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1164 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1165 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1166 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1167 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1169 // prefetch score for next step
1170 v_score_load1 = vec_ld(16*k,query_profile_byte);
1172 // load values of F and H from previous row (one unit up)
1174 Hup1 = vec_ld(16, p);
1175 p += 32; // move ahead 32 bytes
1177 // shift into place so we have complete F and H vectors
1178 // that refer to the values one unit up from each cell
1179 // that we are currently working on.
1180 Fup = vec_sld(Fup,F,15);
1181 Hup1 = vec_sld(Hup1,H,15);
1183 // do the dynamic programming
1186 E = vec_subs(E,v_gapextend);
1187 tmp = vec_subs(H,v_gapopen);
1191 F = vec_subs(Fup,v_gapextend);
1192 tmp = vec_subs(Hup1,v_gapopen);
1195 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1198 H = vec_adds(Hup2,v_score);
1199 H = vec_subs(H,v_bias);
1201 // set H to max of H,E,F
1205 // Update highest score encountered this far
1206 v_maxscore = vec_max(v_maxscore,H);
1212 // prefetch next residue
1215 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1216 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1217 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1218 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1219 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1220 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1223 // prefetch score for next step
1224 v_score_load1 = vec_ld(16*k,query_profile_byte);
1226 // load values of F and H from previous row (one unit up)
1228 Hup2 = vec_ld(16, p);
1229 p += 32; // move ahead 32 bytes
1231 // shift into place so we have complete F and H vectors
1232 // that refer to the values one unit up from each cell
1233 // that we are currently working on.
1234 Fup = vec_sld(Fup,F,15);
1235 Hup2 = vec_sld(Hup2,H,15);
1237 // do the dynamic programming
1240 E = vec_subs(E,v_gapextend);
1241 tmp = vec_subs(H,v_gapopen);
1245 F = vec_subs(Fup,v_gapextend);
1246 tmp = vec_subs(Hup2,v_gapopen);
1249 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1252 H = vec_adds(Hup1,v_score);
1253 H = vec_subs(H,v_bias);
1255 // set H to max of H,E,F
1259 // Update highest score encountered this far
1260 v_maxscore = vec_max(v_maxscore,H);
1264 // prefetch next residue
1267 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1268 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1269 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1270 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1271 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1272 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1275 // prefetch score for next step
1276 v_score_load1 = vec_ld(16*k,query_profile_byte);
1278 // load values of F and H from previous row (one unit up)
1280 Hup1 = vec_ld(16, p);
1281 p += 32; // move ahead 32 bytes
1283 // shift into place so we have complete F and H vectors
1284 // that refer to the values one unit up from each cell
1285 // that we are currently working on.
1286 Fup = vec_sld(Fup,F,15);
1287 Hup1 = vec_sld(Hup1,H,15);
1289 // do the dynamic programming
1292 E = vec_subs(E,v_gapextend);
1293 tmp = vec_subs(H,v_gapopen);
1297 F = vec_subs(Fup,v_gapextend);
1298 tmp = vec_subs(Hup1,v_gapopen);
1301 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1304 H = vec_adds(Hup2,v_score);
1305 H = vec_subs(H,v_bias);
1307 // set H to max of H,E,F
1311 // Update highest score encountered this far
1312 v_maxscore = vec_max(v_maxscore,H);
1316 // prefetch next residue
1319 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1320 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1321 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1322 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1323 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1324 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1327 // prefetch score for next step
1328 v_score_load1 = vec_ld(16*k,query_profile_byte);
1330 // load values of F and H from previous row (one unit up)
1332 Hup2 = vec_ld(16, p);
1333 p += 32; // move ahead 32 bytes
1335 // shift into place so we have complete F and H vectors
1336 // that refer to the values one unit up from each cell
1337 // that we are currently working on.
1338 Fup = vec_sld(Fup,F,15);
1339 Hup2 = vec_sld(Hup2,H,15);
1341 // do the dynamic programming
1344 E = vec_subs(E,v_gapextend);
1345 tmp = vec_subs(H,v_gapopen);
1349 F = vec_subs(Fup,v_gapextend);
1350 tmp = vec_subs(Hup2,v_gapopen);
1353 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1356 H = vec_adds(Hup1,v_score);
1357 H = vec_subs(H,v_bias);
1359 // set H to max of H,E,F
1363 // Update highest score encountered this far
1364 v_maxscore = vec_max(v_maxscore,H);
1368 // prefetch next residue
1371 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1372 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1373 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1374 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1375 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1376 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1379 // prefetch score for next step
1380 v_score_load1 = vec_ld(16*k,query_profile_byte);
1382 // load values of F and H from previous row (one unit up)
1384 Hup1 = vec_ld(16, p);
1385 p += 32; // move ahead 32 bytes
1387 // shift into place so we have complete F and H vectors
1388 // that refer to the values one unit up from each cell
1389 // that we are currently working on.
1390 Fup = vec_sld(Fup,F,15);
1391 Hup1 = vec_sld(Hup1,H,15);
1393 // do the dynamic programming
1396 E = vec_subs(E,v_gapextend);
1397 tmp = vec_subs(H,v_gapopen);
1401 F = vec_subs(Fup,v_gapextend);
1402 tmp = vec_subs(Hup1,v_gapopen);
1405 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1408 H = vec_adds(Hup2,v_score);
1409 H = vec_subs(H,v_bias);
1411 // set H to max of H,E,F
1415 // Update highest score encountered this far
1416 v_maxscore = vec_max(v_maxscore,H);
1420 // prefetch next residue
1423 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1424 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1425 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1426 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1427 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1428 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1431 // prefetch score for next step
1432 v_score_load1 = vec_ld(16*k,query_profile_byte);
1434 // load values of F and H from previous row (one unit up)
1436 Hup2 = vec_ld(16, p);
1437 p += 32; // move ahead 32 bytes
1439 // shift into place so we have complete F and H vectors
1440 // that refer to the values one unit up from each cell
1441 // that we are currently working on.
1442 Fup = vec_sld(Fup,F,15);
1443 Hup2 = vec_sld(Hup2,H,15);
1445 // do the dynamic programming
1448 E = vec_subs(E,v_gapextend);
1449 tmp = vec_subs(H,v_gapopen);
1453 F = vec_subs(Fup,v_gapextend);
1454 tmp = vec_subs(Hup2,v_gapopen);
1457 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1460 H = vec_adds(Hup1,v_score);
1461 H = vec_subs(H,v_bias);
1463 // set H to max of H,E,F
1467 // Update highest score encountered this far
1468 v_maxscore = vec_max(v_maxscore,H);
1473 // prefetch next residue
1476 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1477 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1478 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1479 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1480 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1481 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1484 // prefetch score for next step
1485 v_score_load1 = vec_ld(16*k,query_profile_byte);
1487 // load values of F and H from previous row (one unit up)
1489 Hup1 = vec_ld(16, p);
1490 p += 32; // move ahead 32 bytes
1492 // shift into place so we have complete F and H vectors
1493 // that refer to the values one unit up from each cell
1494 // that we are currently working on.
1495 Fup = vec_sld(Fup,F,15);
1496 Hup1 = vec_sld(Hup1,H,15);
1498 // do the dynamic programming
1501 E = vec_subs(E,v_gapextend);
1502 tmp = vec_subs(H,v_gapopen);
1506 F = vec_subs(Fup,v_gapextend);
1507 tmp = vec_subs(Hup1,v_gapopen);
1510 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1513 H = vec_adds(Hup2,v_score);
1514 H = vec_subs(H,v_bias);
1516 // set H to max of H,E,F
1520 // Update highest score encountered this far
1521 v_maxscore = vec_max(v_maxscore,H);
1526 // prefetch next residue
1529 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1530 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1531 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1532 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1533 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1534 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1537 // prefetch score for next step
1538 v_score_load1 = vec_ld(16*k,query_profile_byte);
1540 // load values of F and H from previous row (one unit up)
1542 Hup2 = vec_ld(16, p);
1543 p += 32; // move ahead 32 bytes
1545 // shift into place so we have complete F and H vectors
1546 // that refer to the values one unit up from each cell
1547 // that we are currently working on.
1548 Fup = vec_sld(Fup,F,15);
1549 Hup2 = vec_sld(Hup2,H,15);
1551 // do the dynamic programming
1554 E = vec_subs(E,v_gapextend);
1555 tmp = vec_subs(H,v_gapopen);
1559 F = vec_subs(Fup,v_gapextend);
1560 tmp = vec_subs(Hup2,v_gapopen);
1563 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1566 H = vec_adds(Hup1,v_score);
1567 H = vec_subs(H,v_bias);
1569 // set H to max of H,E,F
1573 // Update highest score encountered this far
1574 v_maxscore = vec_max(v_maxscore,H);
1580 // prefetch next residue
1582 k8 = db_sequence[1];
1584 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1585 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1586 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1587 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1588 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1589 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1592 // prefetch score for next step
1593 v_score_load1 = vec_ld(16*k,query_profile_byte);
1594 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1596 // load values of F and H from previous row (one unit up)
1598 Hup1 = vec_ld(16, p);
1599 p += 32; // move ahead 32 bytes
1601 // shift into place so we have complete F and H vectors
1602 // that refer to the values one unit up from each cell
1603 // that we are currently working on.
1604 Fup = vec_sld(Fup,F,15);
1605 Hup1 = vec_sld(Hup1,H,15);
1607 // do the dynamic programming
1610 E = vec_subs(E,v_gapextend);
1611 tmp = vec_subs(H,v_gapopen);
1615 F = vec_subs(Fup,v_gapextend);
1616 tmp = vec_subs(Hup1,v_gapopen);
1619 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1622 H = vec_adds(Hup2,v_score);
1623 H = vec_subs(H,v_bias);
1625 // set H to max of H,E,F
1629 // Update highest score encountered this far
1630 v_maxscore = vec_max(v_maxscore,H);
1635 // prefetch next residue
1636 k = db_sequence[10];
1637 k8 = db_sequence[2];
1639 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1640 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1641 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1642 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1643 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1644 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1647 // prefetch score for next step
1648 v_score_load1 = vec_ld(16*k,query_profile_byte);
1649 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1651 // load values of F and H from previous row (one unit up)
1653 Hup2 = vec_ld(16, p);
1654 p += 32; // move ahead 32 bytes
1656 // shift into place so we have complete F and H vectors
1657 // that refer to the values one unit up from each cell
1658 // that we are currently working on.
1659 Fup = vec_sld(Fup,F,15);
1660 Hup2 = vec_sld(Hup2,H,15);
1662 // do the dynamic programming
1665 E = vec_subs(E,v_gapextend);
1666 tmp = vec_subs(H,v_gapopen);
1670 F = vec_subs(Fup,v_gapextend);
1671 tmp = vec_subs(Hup2,v_gapopen);
1674 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1677 H = vec_adds(Hup1,v_score);
1678 H = vec_subs(H,v_bias);
1680 // set H to max of H,E,F
1684 // Update highest score encountered this far
1685 v_maxscore = vec_max(v_maxscore,H);
1691 // prefetch next residue
1692 k = db_sequence[11];
1693 k8 = db_sequence[3];
1695 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1696 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1697 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1698 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1699 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1700 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1703 // prefetch score for next step
1704 v_score_load1 = vec_ld(16*k,query_profile_byte);
1705 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1707 // load values of F and H from previous row (one unit up)
1709 Hup1 = vec_ld(16, p);
1710 p += 32; // move ahead 32 bytes
1712 // shift into place so we have complete F and H vectors
1713 // that refer to the values one unit up from each cell
1714 // that we are currently working on.
1715 Fup = vec_sld(Fup,F,15);
1716 Hup1 = vec_sld(Hup1,H,15);
1718 // do the dynamic programming
1721 E = vec_subs(E,v_gapextend);
1722 tmp = vec_subs(H,v_gapopen);
1726 F = vec_subs(Fup,v_gapextend);
1727 tmp = vec_subs(Hup1,v_gapopen);
1730 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1733 H = vec_adds(Hup2,v_score);
1734 H = vec_subs(H,v_bias);
1736 // set H to max of H,E,F
1740 // Update highest score encountered this far
1741 v_maxscore = vec_max(v_maxscore,H);
1746 // prefetch next residue
1747 k = db_sequence[12];
1748 k8 = db_sequence[4];
1750 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1751 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1752 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1753 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1754 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1755 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1758 // prefetch score for next step
1759 v_score_load1 = vec_ld(16*k,query_profile_byte);
1760 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1762 // load values of F and H from previous row (one unit up)
1764 Hup2 = vec_ld(16, p);
1765 p += 32; // move ahead 32 bytes
1767 // shift into place so we have complete F and H vectors
1768 // that refer to the values one unit up from each cell
1769 // that we are currently working on.
1770 Fup = vec_sld(Fup,F,15);
1771 Hup2 = vec_sld(Hup2,H,15);
1773 // do the dynamic programming
1776 E = vec_subs(E,v_gapextend);
1777 tmp = vec_subs(H,v_gapopen);
1781 F = vec_subs(Fup,v_gapextend);
1782 tmp = vec_subs(Hup2,v_gapopen);
1785 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1788 H = vec_adds(Hup1,v_score);
1789 H = vec_subs(H,v_bias);
1791 // set H to max of H,E,F
1795 // Update highest score encountered this far
1796 v_maxscore = vec_max(v_maxscore,H);
1802 // prefetch next residue
1803 k = db_sequence[13];
1804 k8 = db_sequence[5];
1806 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1807 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1808 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1809 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1810 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1811 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1814 // prefetch score for next step
1815 v_score_load1 = vec_ld(16*k,query_profile_byte);
1816 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1818 // load values of F and H from previous row (one unit up)
1820 Hup1 = vec_ld(16, p);
1821 p += 32; // move ahead 32 bytes
1823 // shift into place so we have complete F and H vectors
1824 // that refer to the values one unit up from each cell
1825 // that we are currently working on.
1826 Fup = vec_sld(Fup,F,15);
1827 Hup1 = vec_sld(Hup1,H,15);
1829 // do the dynamic programming
1832 E = vec_subs(E,v_gapextend);
1833 tmp = vec_subs(H,v_gapopen);
1837 F = vec_subs(Fup,v_gapextend);
1838 tmp = vec_subs(Hup1,v_gapopen);
1841 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1844 H = vec_adds(Hup2,v_score);
1845 H = vec_subs(H,v_bias);
1847 // set H to max of H,E,F
1851 // Update highest score encountered this far
1852 v_maxscore = vec_max(v_maxscore,H);
1857 // prefetch next residue
1858 k = db_sequence[14];
1859 k8 = db_sequence[6];
1861 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1862 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1863 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1864 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1865 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1866 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1869 // prefetch score for next step
1870 v_score_load1 = vec_ld(16*k,query_profile_byte);
1871 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1873 // load values of F and H from previous row (one unit up)
1875 Hup2 = vec_ld(16, p);
1876 p += 32; // move ahead 32 bytes
1878 // shift into place so we have complete F and H vectors
1879 // that refer to the values one unit up from each cell
1880 // that we are currently working on.
1881 Fup = vec_sld(Fup,F,15);
1882 Hup2 = vec_sld(Hup2,H,15);
1884 // do the dynamic programming
1887 E = vec_subs(E,v_gapextend);
1888 tmp = vec_subs(H,v_gapopen);
1892 F = vec_subs(Fup,v_gapextend);
1893 tmp = vec_subs(Hup2,v_gapopen);
1896 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1899 H = vec_adds(Hup1,v_score);
1900 H = vec_subs(H,v_bias);
1902 // set H to max of H,E,F
1906 // Update highest score encountered this far
1907 v_maxscore = vec_max(v_maxscore,H);
1912 // prefetch next residue
1913 k = db_sequence[15];
1914 k8 = db_sequence[7];
1916 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1917 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1918 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1919 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1920 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1921 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1924 // prefetch score for next step
1925 v_score_load1 = vec_ld(16*k,query_profile_byte);
1926 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1928 // load values of F and H from previous row (one unit up)
1930 Hup1 = vec_ld(16, p);
1931 p += 32; // move ahead 32 bytes
1933 // shift into place so we have complete F and H vectors
1934 // that refer to the values one unit up from each cell
1935 // that we are currently working on.
1936 Fup = vec_sld(Fup,F,15);
1937 Hup1 = vec_sld(Hup1,H,15);
1939 // do the dynamic programming
1942 E = vec_subs(E,v_gapextend);
1943 tmp = vec_subs(H,v_gapopen);
1947 F = vec_subs(Fup,v_gapextend);
1948 tmp = vec_subs(Hup1,v_gapopen);
1951 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1954 H = vec_adds(Hup2,v_score);
1955 H = vec_subs(H,v_bias);
1957 // set H to max of H,E,F
1961 // Update highest score encountered this far
1962 v_maxscore = vec_max(v_maxscore,H);
1967 // prefetch next residue
1968 k = db_sequence[16];
1969 k8 = db_sequence[8];
1971 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1972 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1973 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1974 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1975 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1976 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1979 // prefetch score for next step
1980 v_score_load1 = vec_ld(16*k,query_profile_byte);
1981 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1983 // load values of F and H from previous row (one unit up)
1985 Hup2 = vec_ld(16, p);
1986 p += 32; // move ahead 32 bytes
1988 // shift into place so we have complete F and H vectors
1989 // that refer to the values one unit up from each cell
1990 // that we are currently working on.
1991 Fup = vec_sld(Fup,F,15);
1992 Hup2 = vec_sld(Hup2,H,15);
1994 // do the dynamic programming
1997 E = vec_subs(E,v_gapextend);
1998 tmp = vec_subs(H,v_gapopen);
2002 F = vec_subs(Fup,v_gapextend);
2003 tmp = vec_subs(Hup2,v_gapopen);
2006 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2009 H = vec_adds(Hup1,v_score);
2010 H = vec_subs(H,v_bias);
2012 // set H to max of H,E,F
2016 // Update highest score encountered this far
2017 v_maxscore = vec_max(v_maxscore,H);
2021 for(j=16;j<db_length;j+=16)
2025 // prefetch next residue
2026 k = db_sequence[j+1];
2027 k8 = db_sequence[j-7];
2029 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2030 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2031 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2032 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2033 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2034 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2036 // prefetch scores for next step
2037 v_score_load1 = vec_ld(16*k,query_profile_byte);
2038 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2040 // load values of F and H from previous row (one unit up)
2041 Fup = vec_ld(512, p);
2042 Hup1 = vec_ld(528, p);
2044 // save old values of F and H to use on next row
2049 // shift into place so we have complete F and H vectors
2050 // that refer to the values one unit up from each cell
2051 // that we are currently working on.
2052 Fup = vec_sld(Fup,F,15);
2053 Hup1 = vec_sld(Hup1,H,15);
2055 // do the dynamic programming
2058 E = vec_subs(E,v_gapextend);
2059 tmp = vec_subs(H,v_gapopen);
2063 F = vec_subs(Fup,v_gapextend);
2064 tmp = vec_subs(Hup1,v_gapopen);
2067 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2070 H = vec_adds(Hup2,v_score);
2071 H = vec_subs(H,v_bias);
2073 // set H to max of H,E,F
2079 // Update highest score encountered this far
2080 v_maxscore = vec_max(v_maxscore,H);
2088 // prefetch next residue
2089 k = db_sequence[j+2];
2090 k8 = db_sequence[j-6];
2092 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2093 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2094 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2095 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2096 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2097 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2100 // prefetch scores for next step
2101 v_score_load1 = vec_ld(16*k,query_profile_byte);
2102 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2104 // load values of F and H from previous row (one unit up)
2105 Fup = vec_ld(512, p);
2106 Hup2 = vec_ld(528, p);
2108 // save old values of F and H to use on next row
2113 // shift into place so we have complete F and H vectors
2114 // that refer to the values one unit up from each cell
2115 // that we are currently working on.
2116 Fup = vec_sld(Fup,F,15);
2117 Hup2 = vec_sld(Hup2,H,15);
2119 // do the dynamic programming
2122 E = vec_subs(E,v_gapextend);
2123 tmp = vec_subs(H,v_gapopen);
2127 F = vec_subs(Fup,v_gapextend);
2128 tmp = vec_subs(Hup2,v_gapopen);
2131 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2134 H = vec_adds(Hup1,v_score);
2135 H = vec_subs(H,v_bias);
2137 // set H to max of H,E,F
2142 // Update highest score encountered this far
2143 v_maxscore = vec_max(v_maxscore,H);
2152 // prefetch next residue
2153 k = db_sequence[j+3];
2154 k8 = db_sequence[j-5];
2156 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2157 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2158 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2159 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2160 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2161 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2164 // prefetch scores for next step
2165 v_score_load1 = vec_ld(16*k,query_profile_byte);
2166 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2168 // load values of F and H from previous row (one unit up)
2169 Fup = vec_ld(512, p);
2170 Hup1 = vec_ld(528, p);
2172 // save old values of F and H to use on next row
2177 // shift into place so we have complete F and H vectors
2178 // that refer to the values one unit up from each cell
2179 // that we are currently working on.
2180 Fup = vec_sld(Fup,F,15);
2181 Hup1 = vec_sld(Hup1,H,15);
2183 // do the dynamic programming
2186 E = vec_subs(E,v_gapextend);
2187 tmp = vec_subs(H,v_gapopen);
2191 F = vec_subs(Fup,v_gapextend);
2192 tmp = vec_subs(Hup1,v_gapopen);
2195 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2198 H = vec_adds(Hup2,v_score);
2199 H = vec_subs(H,v_bias);
2201 // set H to max of H,E,F
2205 // Update highest score encountered this far
2206 v_maxscore = vec_max(v_maxscore,H);
2215 // prefetch next residue
2216 k = db_sequence[j+4];
2217 k8 = db_sequence[j-4];
2219 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2220 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2221 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2222 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2223 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2224 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2227 // prefetch scores for next step
2228 v_score_load1 = vec_ld(16*k,query_profile_byte);
2229 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2231 // load values of F and H from previous row (one unit up)
2232 Fup = vec_ld(512, p);
2233 Hup2 = vec_ld(528, p);
2235 // save old values of F and H to use on next row
2240 // shift into place so we have complete F and H vectors
2241 // that refer to the values one unit up from each cell
2242 // that we are currently working on.
2243 Fup = vec_sld(Fup,F,15);
2244 Hup2 = vec_sld(Hup2,H,15);
2246 // do the dynamic programming
2249 E = vec_subs(E,v_gapextend);
2250 tmp = vec_subs(H,v_gapopen);
2254 F = vec_subs(Fup,v_gapextend);
2255 tmp = vec_subs(Hup2,v_gapopen);
2258 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2261 H = vec_adds(Hup1,v_score);
2262 H = vec_subs(H,v_bias);
2264 // set H to max of H,E,F
2268 // Update highest score encountered this far
2269 v_maxscore = vec_max(v_maxscore,H);
2278 // prefetch next residue
2279 k = db_sequence[j+5];
2280 k8 = db_sequence[j-3];
2282 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2283 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2284 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2285 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2286 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2287 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2290 // prefetch scores for next step
2291 v_score_load1 = vec_ld(16*k,query_profile_byte);
2292 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2294 // load values of F and H from previous row (one unit up)
2295 Fup = vec_ld(512, p);
2296 Hup1 = vec_ld(528, p);
2298 // save old values of F and H to use on next row
2303 // shift into place so we have complete F and H vectors
2304 // that refer to the values one unit up from each cell
2305 // that we are currently working on.
2306 Fup = vec_sld(Fup,F,15);
2307 Hup1 = vec_sld(Hup1,H,15);
2309 // do the dynamic programming
2312 E = vec_subs(E,v_gapextend);
2313 tmp = vec_subs(H,v_gapopen);
2317 F = vec_subs(Fup,v_gapextend);
2318 tmp = vec_subs(Hup1,v_gapopen);
2321 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2324 H = vec_adds(Hup2,v_score);
2325 H = vec_subs(H,v_bias);
2327 // set H to max of H,E,F
2331 // Update highest score encountered this far
2332 v_maxscore = vec_max(v_maxscore,H);
2341 // prefetch next residue
2342 k = db_sequence[j+6];
2343 k8 = db_sequence[j-2];
2345 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2346 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2347 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2348 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2349 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2350 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2353 // prefetch scores for next step
2354 v_score_load1 = vec_ld(16*k,query_profile_byte);
2355 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2357 // load values of F and H from previous row (one unit up)
2358 Fup = vec_ld(512, p);
2359 Hup2 = vec_ld(528, p);
2361 // save old values of F and H to use on next row
2366 // shift into place so we have complete F and H vectors
2367 // that refer to the values one unit up from each cell
2368 // that we are currently working on.
2369 Fup = vec_sld(Fup,F,15);
2370 Hup2 = vec_sld(Hup2,H,15);
2372 // do the dynamic programming
2375 E = vec_subs(E,v_gapextend);
2376 tmp = vec_subs(H,v_gapopen);
2380 F = vec_subs(Fup,v_gapextend);
2381 tmp = vec_subs(Hup2,v_gapopen);
2384 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2387 H = vec_adds(Hup1,v_score);
2388 H = vec_subs(H,v_bias);
2390 // set H to max of H,E,F
2394 // Update highest score encountered this far
2395 v_maxscore = vec_max(v_maxscore,H);
2404 // prefetch next residue
2405 k = db_sequence[j+7];
2406 k8 = db_sequence[j-1];
2408 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2409 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2410 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2411 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2412 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2413 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2416 // prefetch scores for next step
2417 v_score_load1 = vec_ld(16*k,query_profile_byte);
2418 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2420 // load values of F and H from previous row (one unit up)
2421 Fup = vec_ld(512, p);
2422 Hup1 = vec_ld(528, p);
2424 // save old values of F and H to use on next row
2429 // shift into place so we have complete F and H vectors
2430 // that refer to the values one unit up from each cell
2431 // that we are currently working on.
2432 Fup = vec_sld(Fup,F,15);
2433 Hup1 = vec_sld(Hup1,H,15);
2435 // do the dynamic programming
2438 E = vec_subs(E,v_gapextend);
2439 tmp = vec_subs(H,v_gapopen);
2443 F = vec_subs(Fup,v_gapextend);
2444 tmp = vec_subs(Hup1,v_gapopen);
2447 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2450 H = vec_adds(Hup2,v_score);
2451 H = vec_subs(H,v_bias);
2453 // set H to max of H,E,F
2457 // Update highest score encountered this far
2458 v_maxscore = vec_max(v_maxscore,H);
2467 // prefetch next residue
2468 k = db_sequence[j+8];
2469 k8 = db_sequence[j];
2471 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2472 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2473 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2474 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2475 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2476 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2479 // prefetch scores for next step
2480 v_score_load1 = vec_ld(16*k,query_profile_byte);
2481 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2483 // load values of F and H from previous row (one unit up)
2484 Fup = vec_ld(512, p);
2485 Hup2 = vec_ld(528, p);
2487 // save old values of F and H to use on next row
2492 // shift into place so we have complete F and H vectors
2493 // that refer to the values one unit up from each cell
2494 // that we are currently working on.
2495 Fup = vec_sld(Fup,F,15);
2496 Hup2 = vec_sld(Hup2,H,15);
2498 // do the dynamic programming
2501 E = vec_subs(E,v_gapextend);
2502 tmp = vec_subs(H,v_gapopen);
2506 F = vec_subs(Fup,v_gapextend);
2507 tmp = vec_subs(Hup2,v_gapopen);
2510 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2513 H = vec_adds(Hup1,v_score);
2514 H = vec_subs(H,v_bias);
2516 // set H to max of H,E,F
2520 // Update highest score encountered this far
2521 v_maxscore = vec_max(v_maxscore,H);
2530 // prefetch next residue
2531 k = db_sequence[j+9];
2532 k8 = db_sequence[j+1];
2534 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2535 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2536 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2537 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2538 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2539 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2542 // prefetch scores for next step
2543 v_score_load1 = vec_ld(16*k,query_profile_byte);
2544 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2546 // load values of F and H from previous row (one unit up)
2547 Fup = vec_ld(512, p);
2548 Hup1 = vec_ld(528, p);
2550 // save old values of F and H to use on next row
2555 // shift into place so we have complete F and H vectors
2556 // that refer to the values one unit up from each cell
2557 // that we are currently working on.
2558 Fup = vec_sld(Fup,F,15);
2559 Hup1 = vec_sld(Hup1,H,15);
2561 // do the dynamic programming
2564 E = vec_subs(E,v_gapextend);
2565 tmp = vec_subs(H,v_gapopen);
2569 F = vec_subs(Fup,v_gapextend);
2570 tmp = vec_subs(Hup1,v_gapopen);
2573 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2576 H = vec_adds(Hup2,v_score);
2577 H = vec_subs(H,v_bias);
2579 // set H to max of H,E,F
2583 // Update highest score encountered this far
2584 v_maxscore = vec_max(v_maxscore,H);
2588 // prefetch next residue
2589 k = db_sequence[j+10];
2590 k8 = db_sequence[j+2];
2592 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2593 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2594 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2595 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2596 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2597 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2600 // prefetch scores for next step
2601 v_score_load1 = vec_ld(16*k,query_profile_byte);
2602 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2604 // load values of F and H from previous row (one unit up)
2605 Fup = vec_ld(512, p);
2606 Hup2 = vec_ld(528, p);
2608 // save old values of F and H to use on next row
2613 // shift into place so we have complete F and H vectors
2614 // that refer to the values one unit up from each cell
2615 // that we are currently working on.
2616 Fup = vec_sld(Fup,F,15);
2617 Hup2 = vec_sld(Hup2,H,15);
2619 // do the dynamic programming
2622 E = vec_subs(E,v_gapextend);
2623 tmp = vec_subs(H,v_gapopen);
2627 F = vec_subs(Fup,v_gapextend);
2628 tmp = vec_subs(Hup2,v_gapopen);
2631 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2634 H = vec_adds(Hup1,v_score);
2635 H = vec_subs(H,v_bias);
2637 // set H to max of H,E,F
2641 // Update highest score encountered this far
2642 v_maxscore = vec_max(v_maxscore,H);
2646 // prefetch next residue
2647 k = db_sequence[j+11];
2648 k8 = db_sequence[j+3];
2650 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2651 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2652 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2653 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2654 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2655 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2658 // prefetch scores for next step
2659 v_score_load1 = vec_ld(16*k,query_profile_byte);
2660 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2662 // load values of F and H from previous row (one unit up)
2663 Fup = vec_ld(512, p);
2664 Hup1 = vec_ld(528, p);
2666 // save old values of F and H to use on next row
2671 // shift into place so we have complete F and H vectors
2672 // that refer to the values one unit up from each cell
2673 // that we are currently working on.
2674 Fup = vec_sld(Fup,F,15);
2675 Hup1 = vec_sld(Hup1,H,15);
2677 // do the dynamic programming
2680 E = vec_subs(E,v_gapextend);
2681 tmp = vec_subs(H,v_gapopen);
2685 F = vec_subs(Fup,v_gapextend);
2686 tmp = vec_subs(Hup1,v_gapopen);
2689 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2692 H = vec_adds(Hup2,v_score);
2693 H = vec_subs(H,v_bias);
2695 // set H to max of H,E,F
2699 // Update highest score encountered this far
2700 v_maxscore = vec_max(v_maxscore,H);
2704 // prefetch next residue
2705 k = db_sequence[j+12];
2706 k8 = db_sequence[j+4];
2708 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2709 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2710 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2711 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2712 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2713 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2716 // prefetch scores for next step
2717 v_score_load1 = vec_ld(16*k,query_profile_byte);
2718 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2720 // load values of F and H from previous row (one unit up)
2721 Fup = vec_ld(512, p);
2722 Hup2 = vec_ld(528, p);
2724 // save old values of F and H to use on next row
2729 // shift into place so we have complete F and H vectors
2730 // that refer to the values one unit up from each cell
2731 // that we are currently working on.
2732 Fup = vec_sld(Fup,F,15);
2733 Hup2 = vec_sld(Hup2,H,15);
2735 // do the dynamic programming
2738 E = vec_subs(E,v_gapextend);
2739 tmp = vec_subs(H,v_gapopen);
2743 F = vec_subs(Fup,v_gapextend);
2744 tmp = vec_subs(Hup2,v_gapopen);
2747 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2750 H = vec_adds(Hup1,v_score);
2751 H = vec_subs(H,v_bias);
2753 // set H to max of H,E,F
2757 // Update highest score encountered this far
2758 v_maxscore = vec_max(v_maxscore,H);
2762 // prefetch next residue
2763 k = db_sequence[j+13];
2764 k8 = db_sequence[j+5];
2766 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2767 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2768 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2769 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2770 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2771 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2774 // prefetch scores for next step
2775 v_score_load1 = vec_ld(16*k,query_profile_byte);
2776 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2778 // load values of F and H from previous row (one unit up)
2779 Fup = vec_ld(512, p);
2780 Hup1 = vec_ld(528, p);
2782 // save old values of F and H to use on next row
2787 // shift into place so we have complete F and H vectors
2788 // that refer to the values one unit up from each cell
2789 // that we are currently working on.
2790 Fup = vec_sld(Fup,F,15);
2791 Hup1 = vec_sld(Hup1,H,15);
2793 // do the dynamic programming
2796 E = vec_subs(E,v_gapextend);
2797 tmp = vec_subs(H,v_gapopen);
2801 F = vec_subs(Fup,v_gapextend);
2802 tmp = vec_subs(Hup1,v_gapopen);
2805 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2808 H = vec_adds(Hup2,v_score);
2809 H = vec_subs(H,v_bias);
2811 // set H to max of H,E,F
2815 // Update highest score encountered this far
2816 v_maxscore = vec_max(v_maxscore,H);
2820 // prefetch next residue
2821 k = db_sequence[j+14];
2822 k8 = db_sequence[j+6];
2824 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2825 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2826 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2827 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2828 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2829 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2832 // prefetch scores for next step
2833 v_score_load1 = vec_ld(16*k,query_profile_byte);
2834 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2836 // load values of F and H from previous row (one unit up)
2837 Fup = vec_ld(512, p);
2838 Hup2 = vec_ld(528, p);
2840 // save old values of F and H to use on next row
2845 // shift into place so we have complete F and H vectors
2846 // that refer to the values one unit up from each cell
2847 // that we are currently working on.
2848 Fup = vec_sld(Fup,F,15);
2849 Hup2 = vec_sld(Hup2,H,15);
2851 // do the dynamic programming
2854 E = vec_subs(E,v_gapextend);
2855 tmp = vec_subs(H,v_gapopen);
2859 F = vec_subs(Fup,v_gapextend);
2860 tmp = vec_subs(Hup2,v_gapopen);
2863 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2866 H = vec_adds(Hup1,v_score);
2867 H = vec_subs(H,v_bias);
2869 // set H to max of H,E,F
2873 // Update highest score encountered this far
2874 v_maxscore = vec_max(v_maxscore,H);
2878 // prefetch next residue
2879 k = db_sequence[j+15];
2880 k8 = db_sequence[j+7];
2882 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2883 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2884 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2885 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2886 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2887 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2889 // prefetch scores for next step
2890 v_score_load1 = vec_ld(16*k,query_profile_byte);
2891 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2893 // load values of F and H from previous row (one unit up)
2894 Fup = vec_ld(512, p);
2895 Hup1 = vec_ld(528, p);
2897 // save old values of F and H to use on next row
2902 // shift into place so we have complete F and H vectors
2903 // that refer to the values one unit up from each cell
2904 // that we are currently working on.
2905 Fup = vec_sld(Fup,F,15);
2906 Hup1 = vec_sld(Hup1,H,15);
2908 // do the dynamic programming
2911 E = vec_subs(E,v_gapextend);
2912 tmp = vec_subs(H,v_gapopen);
2916 F = vec_subs(Fup,v_gapextend);
2917 tmp = vec_subs(Hup1,v_gapopen);
2920 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2923 H = vec_adds(Hup2,v_score);
2924 H = vec_subs(H,v_bias);
2926 // set H to max of H,E,F
2930 // Update highest score encountered this far
2931 v_maxscore = vec_max(v_maxscore,H);
2935 // prefetch next residue
2936 k = db_sequence[j+16];
2937 k8 = db_sequence[j+8];
2939 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2940 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2941 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2942 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2943 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2944 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2947 // prefetch scores for next step
2948 v_score_load1 = vec_ld(16*k,query_profile_byte);
2949 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2951 // load values of F and H from previous row (one unit up)
2952 Fup = vec_ld(512, p);
2953 Hup2 = vec_ld(528, p);
2955 // save old values of F and H to use on next row
2960 // shift into place so we have complete F and H vectors
2961 // that refer to the values one unit up from each cell
2962 // that we are currently working on.
2963 Fup = vec_sld(Fup,F,15);
2964 Hup2 = vec_sld(Hup2,H,15);
2966 // do the dynamic programming
2969 E = vec_subs(E,v_gapextend);
2970 tmp = vec_subs(H,v_gapopen);
2974 F = vec_subs(Fup,v_gapextend);
2975 tmp = vec_subs(Hup2,v_gapopen);
2978 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2981 H = vec_adds(Hup1,v_score);
2982 H = vec_subs(H,v_bias);
2984 // set H to max of H,E,F
2988 // Update highest score encountered this far
2989 v_maxscore = vec_max(v_maxscore,H);
2993 for(;j<db_length+15;j++)
2995 k8 = db_sequence[j-7];
2997 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2998 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2999 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
3000 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
3001 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
3002 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
3005 // prefetch scores for next step
3006 v_score_load2 = vec_ld(16*k8,query_profile_byte);
3007 v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
3009 // save old values of F and H to use on next row
3012 p += 32; // move ahead 32 bytes
3014 Fup = vec_sld(v_zero,F,15);
3015 Hup1 = vec_sld(v_zero,H,15);
3017 // do the dynamic programming
3020 E = vec_subs(E,v_gapextend);
3021 tmp = vec_subs(H,v_gapopen);
3025 F = vec_subs(Fup,v_gapextend);
3026 tmp = vec_subs(Hup1,v_gapopen);
3030 H = vec_adds(Hup2,v_score);
3031 H = vec_subs(H,v_bias);
3033 // set H to max of H,E,F
3037 // Save value to use for next diagonal H
3040 // Update highest score encountered this far
3041 v_maxscore = vec_max(v_maxscore,H);
3046 query_profile_byte += 16*alphabet_size;
3048 // End of this row (actually 16 rows due to SIMD).
3049 // Before we continue, check for overflow.
3050 tmp = vec_subs(vec_splat_u8(-1),v_bias);
3051 overflow = vec_any_ge(v_maxscore,tmp);
3062 // find largest score in the v_maxscore vector
3063 tmp = vec_sld(v_maxscore,v_maxscore,8);
3064 v_maxscore = vec_max(v_maxscore,tmp);
3065 tmp = vec_sld(v_maxscore,v_maxscore,4);
3066 v_maxscore = vec_max(v_maxscore,tmp);
3067 tmp = vec_sld(v_maxscore,v_maxscore,2);
3068 v_maxscore = vec_max(v_maxscore,tmp);
3069 tmp = vec_sld(v_maxscore,v_maxscore,1);
3070 v_maxscore = vec_max(v_maxscore,tmp);
3072 // store in temporary variable
3073 vec_ste(v_maxscore,0,&score);
3075 // return largest score
3082 /* No Altivec support. Avoid compiler complaints about empty object */