1 /******************************************************************
2 Copyright 2006 by Michael Farrar. All rights reserved.
3 This program may not be sold or incorporated into a commercial product,
4 in whole or in part, without written consent of Michael Farrar. For
5 further information regarding permission for use or reproduction, please
6 contact: Michael Farrar at farrar.michael@gmail.com.
7 *******************************************************************/
10 Written by Michael Farrar, 2006.
11 Please send bug reports and/or suggestions to farrar.michael@gmail.com.
19 #include "smith_waterman_sse2.h"
22 #include <sunmedia_intrin.h>
24 #include <emmintrin.h>
30 smith_waterman_sse2_word(const unsigned char * query_sequence,
31 unsigned short * query_profile_word,
32 const int query_length,
33 const unsigned char * db_sequence,
35 unsigned short gap_open,
36 unsigned short gap_extend,
37 struct f_struct * f_str)
43 int iter = (query_length + 7) / 8;
46 __m128i *workspace = (__m128i *) f_str->workspace;
58 __m128i *pHLoad, *pHStore;
63 /* Load gap opening penalty to all elements of a constant */
64 v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
65 v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
66 v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
68 /* Load gap extension penalty to all elements of a constant */
69 v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
70 v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
71 v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
73 /* load v_maxscore with the zeros. since we are using signed */
74 /* math, we will bias the maxscore to -32768 so we have the */
75 /* full range of the short. */
76 v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
77 v_maxscore = _mm_slli_epi16 (v_maxscore, 15);
79 v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);
81 v_min = _mm_shuffle_epi32 (v_maxscore, 0);
82 v_min = _mm_srli_si128 (v_min, 14);
84 /* Zero out the storage vector */
88 for (i = 0; i < k; i++)
90 _mm_store_si128 (p++, v_maxscore);
95 pHLoad = pHStore + iter;
97 for (i = 0; i < db_length; ++i)
99 /* fetch first data asap. */
100 pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;
102 /* bias all elements in F to -32768 */
103 F = _mm_cmpeq_epi16 (F, F);
104 F = _mm_slli_epi16 (F, 15);
106 /* load the next h value */
107 H = _mm_load_si128 (pHStore + iter - 1);
108 H = _mm_slli_si128 (H, 2);
109 H = _mm_or_si128 (H, v_min);
115 for (j = 0; j < iter; j++)
118 E = _mm_load_si128 (pE + j);
121 H = _mm_adds_epi16 (H, *pScore++);
123 /* Update highest score encountered this far */
124 v_maxscore = _mm_max_epi16 (v_maxscore, H);
126 /* get max from H, E and F */
127 H = _mm_max_epi16 (H, E);
128 H = _mm_max_epi16 (H, F);
131 _mm_store_si128 (pHStore + j, H);
133 /* subtract the gap open penalty from H */
134 H = _mm_subs_epi16 (H, v_gapopen);
137 E = _mm_subs_epi16 (E, v_gapextend);
138 E = _mm_max_epi16 (E, H);
141 F = _mm_subs_epi16 (F, v_gapextend);
142 F = _mm_max_epi16 (F, H);
145 _mm_store_si128 (pE + j, E);
147 /* load the next h value */
148 H = _mm_load_si128 (pHLoad + j);
151 /* reset pointers to the start of the saved data */
153 H = _mm_load_si128 (pHStore + j);
155 /* the computed F value is for the given column. since */
156 /* we are at the end, we need to shift the F value over */
157 /* to the next column. */
158 F = _mm_slli_si128 (F, 2);
159 F = _mm_or_si128 (F, v_min);
160 v_temp = _mm_subs_epi16 (H, v_gapopen);
161 v_temp = _mm_cmpgt_epi16 (F, v_temp);
162 cmp = _mm_movemask_epi8 (v_temp);
164 while (cmp != 0x0000)
166 E = _mm_load_si128 (pE + j);
168 H = _mm_max_epi16 (H, F);
171 _mm_store_si128 (pHStore + j, H);
173 /* update E in case the new H value would change it */
174 H = _mm_subs_epi16 (H, v_gapopen);
175 E = _mm_max_epi16 (E, H);
176 _mm_store_si128 (pE + j, E);
179 F = _mm_subs_epi16 (F, v_gapextend);
185 F = _mm_slli_si128 (F, 2);
186 F = _mm_or_si128 (F, v_min);
188 H = _mm_load_si128 (pHStore + j);
190 v_temp = _mm_subs_epi16 (H, v_gapopen);
191 v_temp = _mm_cmpgt_epi16 (F, v_temp);
192 cmp = _mm_movemask_epi8 (v_temp);
196 /* find largest score in the v_maxscore vector */
197 v_temp = _mm_srli_si128 (v_maxscore, 8);
198 v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
199 v_temp = _mm_srli_si128 (v_maxscore, 4);
200 v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
201 v_temp = _mm_srli_si128 (v_maxscore, 2);
202 v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
204 /* extract the largest score */
205 score = _mm_extract_epi16 (v_maxscore, 0);
207 /* return largest score biased by 32768 */
208 return score + 32768;
215 smith_waterman_sse2_byte(const unsigned char * query_sequence,
216 unsigned char * query_profile_byte,
217 const int query_length,
218 const unsigned char * db_sequence,
221 unsigned char gap_open,
222 unsigned char gap_extend,
223 struct f_struct * f_str)
230 int iter = (query_length + 15) / 16;
233 __m128i *workspace = (__m128i *) f_str->workspace;
245 __m128i *pHLoad, *pHStore;
250 /* Load the bias to all elements of a constant */
251 dup = ((short) bias << 8) | bias;
252 v_bias = _mm_insert_epi16 (v_bias, dup, 0);
253 v_bias = _mm_shufflelo_epi16 (v_bias, 0);
254 v_bias = _mm_shuffle_epi32 (v_bias, 0);
256 /* Load gap opening penalty to all elements of a constant */
257 dup = ((short) gap_open << 8) | gap_open;
258 v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
259 v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
260 v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
262 /* Load gap extension penalty to all elements of a constant */
263 dup = ((short) gap_extend << 8) | gap_extend;
264 v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
265 v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
266 v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
268 /* initialize the max score */
269 v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore);
271 /* create a constant of all zeros for comparison */
272 v_zero = _mm_xor_si128 (v_zero, v_zero);
274 /* Zero out the storage vector */
278 for (i = 0; i < k; i++)
280 _mm_store_si128 (p++, v_maxscore);
285 pHLoad = pHStore + iter;
287 for (i = 0; i < db_length; ++i)
289 /* fetch first data asap. */
290 pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;
292 /* zero out F value. */
293 F = _mm_xor_si128 (F, F);
295 /* load the next h value */
296 H = _mm_load_si128 (pHStore + iter - 1);
297 H = _mm_slli_si128 (H, 1);
303 for (j = 0; j < iter; j++)
306 E = _mm_load_si128 (pE + j);
309 H = _mm_adds_epu8 (H, *pScore++);
310 H = _mm_subs_epu8 (H, v_bias);
312 /* Update highest score encountered this far */
313 v_maxscore = _mm_max_epu8 (v_maxscore, H);
315 /* get max from H, E and F */
316 H = _mm_max_epu8 (H, E);
317 H = _mm_max_epu8 (H, F);
320 _mm_store_si128 (pHStore + j, H);
322 /* subtract the gap open penalty from H */
323 H = _mm_subs_epu8 (H, v_gapopen);
326 E = _mm_subs_epu8 (E, v_gapextend);
327 E = _mm_max_epu8 (E, H);
330 F = _mm_subs_epu8 (F, v_gapextend);
331 F = _mm_max_epu8 (F, H);
334 _mm_store_si128 (pE + j, E);
336 /* load the next h value */
337 H = _mm_load_si128 (pHLoad + j);
340 /* reset pointers to the start of the saved data */
342 H = _mm_load_si128 (pHStore + j);
344 /* the computed F value is for the given column. since */
345 /* we are at the end, we need to shift the F value over */
346 /* to the next column. */
347 F = _mm_slli_si128 (F, 1);
348 v_temp = _mm_subs_epu8 (H, v_gapopen);
349 v_temp = _mm_subs_epu8 (F, v_temp);
350 v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
351 cmp = _mm_movemask_epi8 (v_temp);
353 while (cmp != 0xffff)
355 E = _mm_load_si128 (pE + j);
357 H = _mm_max_epu8 (H, F);
360 _mm_store_si128 (pHStore + j, H);
362 /* update E in case the new H value would change it */
363 H = _mm_subs_epu8 (H, v_gapopen);
364 E = _mm_max_epu8 (E, H);
365 _mm_store_si128 (pE + j, E);
368 F = _mm_subs_epu8 (F, v_gapextend);
374 F = _mm_slli_si128 (F, 1);
376 H = _mm_load_si128 (pHStore + j);
378 v_temp = _mm_subs_epu8 (H, v_gapopen);
379 v_temp = _mm_subs_epu8 (F, v_temp);
380 v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
381 cmp = _mm_movemask_epi8 (v_temp);
385 /* find largest score in the v_maxscore vector */
386 v_temp = _mm_srli_si128 (v_maxscore, 8);
387 v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
388 v_temp = _mm_srli_si128 (v_maxscore, 4);
389 v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
390 v_temp = _mm_srli_si128 (v_maxscore, 2);
391 v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
392 v_temp = _mm_srli_si128 (v_maxscore, 1);
393 v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
395 /* store in temporary variable */
396 score = _mm_extract_epi16 (v_maxscore, 0);
397 score = score & 0x00ff;
399 /* check if we might have overflowed */
400 if (score + bias >= 255)
405 /* return largest score */
410 /* No SSE2 support. Avoid compiler complaints about empty object */