+++ /dev/null
-/******************************************************************
- Copyright 2006 by Michael Farrar. All rights reserved.
- This program may not be sold or incorporated into a commercial product,
- in whole or in part, without written consent of Michael Farrar. For
- further information regarding permission for use or reproduction, please
- contact: Michael Farrar at farrar.michael@gmail.com.
-*******************************************************************/
-
-/*
- Written by Michael Farrar, 2006.
- Please send bug reports and/or suggestions to farrar.michael@gmail.com.
-*/
-
-#include <stdio.h>
-
-#include "defs.h"
-#include "param.h"
-#include "dropgsw.h"
-#include "smith_waterman_sse2.h"
-
-#ifdef __SUNPRO_C
-#include <sunmedia_intrin.h>
-#else
-#include <emmintrin.h>
-#endif
-
-#ifdef SW_SSE2
-
-int
-smith_waterman_sse2_word(const unsigned char * query_sequence,
- unsigned short * query_profile_word,
- const int query_length,
- const unsigned char * db_sequence,
- const int db_length,
- unsigned short gap_open,
- unsigned short gap_extend,
- struct f_struct * f_str)
-{
- int i, j, k;
- short score;
-
- int cmp;
- int iter = (query_length + 7) / 8;
-
- __m128i *p;
- __m128i *workspace = (__m128i *) f_str->workspace;
-
- __m128i E, F, H;
-
- __m128i v_maxscore;
- __m128i v_gapopen;
- __m128i v_gapextend;
-
- __m128i v_min;
- __m128i v_minimums;
- __m128i v_temp;
-
- __m128i *pHLoad, *pHStore;
- __m128i *pE;
-
- __m128i *pScore;
-
- /* Load gap opening penalty to all elements of a constant */
- v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
- v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
- v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
-
- /* Load gap extension penalty to all elements of a constant */
- v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
- v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
- v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
-
- /* load v_maxscore with the zeros. since we are using signed */
- /* math, we will bias the maxscore to -32768 so we have the */
- /* full range of the short. */
- v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
- v_maxscore = _mm_slli_epi16 (v_maxscore, 15);
-
- v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);
-
- v_min = _mm_shuffle_epi32 (v_maxscore, 0);
- v_min = _mm_srli_si128 (v_min, 14);
-
- /* Zero out the storage vector */
- k = 2 * iter;
-
- p = workspace;
- for (i = 0; i < k; i++)
- {
- _mm_store_si128 (p++, v_maxscore);
- }
-
- pE = workspace;
- pHStore = pE + iter;
- pHLoad = pHStore + iter;
-
- for (i = 0; i < db_length; ++i)
- {
- /* fetch first data asap. */
- pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;
-
- /* bias all elements in F to -32768 */
- F = _mm_cmpeq_epi16 (F, F);
- F = _mm_slli_epi16 (F, 15);
-
- /* load the next h value */
- H = _mm_load_si128 (pHStore + iter - 1);
- H = _mm_slli_si128 (H, 2);
- H = _mm_or_si128 (H, v_min);
-
- p = pHLoad;
- pHLoad = pHStore;
- pHStore = p;
-
- for (j = 0; j < iter; j++)
- {
- /* load E values */
- E = _mm_load_si128 (pE + j);
-
- /* add score to H */
- H = _mm_adds_epi16 (H, *pScore++);
-
- /* Update highest score encountered this far */
- v_maxscore = _mm_max_epi16 (v_maxscore, H);
-
- /* get max from H, E and F */
- H = _mm_max_epi16 (H, E);
- H = _mm_max_epi16 (H, F);
-
- /* save H values */
- _mm_store_si128 (pHStore + j, H);
-
- /* subtract the gap open penalty from H */
- H = _mm_subs_epi16 (H, v_gapopen);
-
- /* update E value */
- E = _mm_subs_epi16 (E, v_gapextend);
- E = _mm_max_epi16 (E, H);
-
- /* update F value */
- F = _mm_subs_epi16 (F, v_gapextend);
- F = _mm_max_epi16 (F, H);
-
- /* save E values */
- _mm_store_si128 (pE + j, E);
-
- /* load the next h value */
- H = _mm_load_si128 (pHLoad + j);
- }
-
- /* reset pointers to the start of the saved data */
- j = 0;
- H = _mm_load_si128 (pHStore + j);
-
- /* the computed F value is for the given column. since */
- /* we are at the end, we need to shift the F value over */
- /* to the next column. */
- F = _mm_slli_si128 (F, 2);
- F = _mm_or_si128 (F, v_min);
- v_temp = _mm_subs_epi16 (H, v_gapopen);
- v_temp = _mm_cmpgt_epi16 (F, v_temp);
- cmp = _mm_movemask_epi8 (v_temp);
-
- while (cmp != 0x0000)
- {
- E = _mm_load_si128 (pE + j);
-
- H = _mm_max_epi16 (H, F);
-
- /* save H values */
- _mm_store_si128 (pHStore + j, H);
-
- /* update E in case the new H value would change it */
- H = _mm_subs_epi16 (H, v_gapopen);
- E = _mm_max_epi16 (E, H);
- _mm_store_si128 (pE + j, E);
-
- /* update F value */
- F = _mm_subs_epi16 (F, v_gapextend);
-
- j++;
- if (j >= iter)
- {
- j = 0;
- F = _mm_slli_si128 (F, 2);
- F = _mm_or_si128 (F, v_min);
- }
- H = _mm_load_si128 (pHStore + j);
-
- v_temp = _mm_subs_epi16 (H, v_gapopen);
- v_temp = _mm_cmpgt_epi16 (F, v_temp);
- cmp = _mm_movemask_epi8 (v_temp);
- }
- }
-
- /* find largest score in the v_maxscore vector */
- v_temp = _mm_srli_si128 (v_maxscore, 8);
- v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
- v_temp = _mm_srli_si128 (v_maxscore, 4);
- v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
- v_temp = _mm_srli_si128 (v_maxscore, 2);
- v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
-
- /* extract the largest score */
- score = _mm_extract_epi16 (v_maxscore, 0);
-
- /* return largest score biased by 32768 */
- return score + 32768;
-}
-
-
-
-
-int
-smith_waterman_sse2_byte(const unsigned char * query_sequence,
- unsigned char * query_profile_byte,
- const int query_length,
- const unsigned char * db_sequence,
- const int db_length,
- unsigned char bias,
- unsigned char gap_open,
- unsigned char gap_extend,
- struct f_struct * f_str)
-{
- int i, j, k;
- int score;
-
- int dup;
- int cmp;
- int iter = (query_length + 15) / 16;
-
- __m128i *p;
- __m128i *workspace = (__m128i *) f_str->workspace;
-
- __m128i E, F, H;
-
- __m128i v_maxscore;
- __m128i v_bias;
- __m128i v_gapopen;
- __m128i v_gapextend;
-
- __m128i v_temp;
- __m128i v_zero;
-
- __m128i *pHLoad, *pHStore;
- __m128i *pE;
-
- __m128i *pScore;
-
- /* Load the bias to all elements of a constant */
- dup = ((short) bias << 8) | bias;
- v_bias = _mm_insert_epi16 (v_bias, dup, 0);
- v_bias = _mm_shufflelo_epi16 (v_bias, 0);
- v_bias = _mm_shuffle_epi32 (v_bias, 0);
-
- /* Load gap opening penalty to all elements of a constant */
- dup = ((short) gap_open << 8) | gap_open;
- v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
- v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
- v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
-
- /* Load gap extension penalty to all elements of a constant */
- dup = ((short) gap_extend << 8) | gap_extend;
- v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
- v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
- v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
-
- /* initialize the max score */
- v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore);
-
- /* create a constant of all zeros for comparison */
- v_zero = _mm_xor_si128 (v_zero, v_zero);
-
- /* Zero out the storage vector */
- k = iter * 2;
-
- p = workspace;
- for (i = 0; i < k; i++)
- {
- _mm_store_si128 (p++, v_maxscore);
- }
-
- pE = workspace;
- pHStore = pE + iter;
- pHLoad = pHStore + iter;
-
- for (i = 0; i < db_length; ++i)
- {
- /* fetch first data asap. */
- pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;
-
- /* zero out F value. */
- F = _mm_xor_si128 (F, F);
-
- /* load the next h value */
- H = _mm_load_si128 (pHStore + iter - 1);
- H = _mm_slli_si128 (H, 1);
-
- p = pHLoad;
- pHLoad = pHStore;
- pHStore = p;
-
- for (j = 0; j < iter; j++)
- {
- /* load values E. */
- E = _mm_load_si128 (pE + j);
-
- /* add score to H */
- H = _mm_adds_epu8 (H, *pScore++);
- H = _mm_subs_epu8 (H, v_bias);
-
- /* Update highest score encountered this far */
- v_maxscore = _mm_max_epu8 (v_maxscore, H);
-
- /* get max from H, E and F */
- H = _mm_max_epu8 (H, E);
- H = _mm_max_epu8 (H, F);
-
- /* save H values */
- _mm_store_si128 (pHStore + j, H);
-
- /* subtract the gap open penalty from H */
- H = _mm_subs_epu8 (H, v_gapopen);
-
- /* update E value */
- E = _mm_subs_epu8 (E, v_gapextend);
- E = _mm_max_epu8 (E, H);
-
- /* update F value */
- F = _mm_subs_epu8 (F, v_gapextend);
- F = _mm_max_epu8 (F, H);
-
- /* save E values */
- _mm_store_si128 (pE + j, E);
-
- /* load the next h value */
- H = _mm_load_si128 (pHLoad + j);
- }
-
- /* reset pointers to the start of the saved data */
- j = 0;
- H = _mm_load_si128 (pHStore + j);
-
- /* the computed F value is for the given column. since */
- /* we are at the end, we need to shift the F value over */
- /* to the next column. */
- F = _mm_slli_si128 (F, 1);
- v_temp = _mm_subs_epu8 (H, v_gapopen);
- v_temp = _mm_subs_epu8 (F, v_temp);
- v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
- cmp = _mm_movemask_epi8 (v_temp);
-
- while (cmp != 0xffff)
- {
- E = _mm_load_si128 (pE + j);
-
- H = _mm_max_epu8 (H, F);
-
- /* save H values */
- _mm_store_si128 (pHStore + j, H);
-
- /* update E in case the new H value would change it */
- H = _mm_subs_epu8 (H, v_gapopen);
- E = _mm_max_epu8 (E, H);
- _mm_store_si128 (pE + j, E);
-
- /* update F value */
- F = _mm_subs_epu8 (F, v_gapextend);
-
- j++;
- if (j >= iter)
- {
- j = 0;
- F = _mm_slli_si128 (F, 1);
- }
- H = _mm_load_si128 (pHStore + j);
-
- v_temp = _mm_subs_epu8 (H, v_gapopen);
- v_temp = _mm_subs_epu8 (F, v_temp);
- v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
- cmp = _mm_movemask_epi8 (v_temp);
- }
- }
-
- /* find largest score in the v_maxscore vector */
- v_temp = _mm_srli_si128 (v_maxscore, 8);
- v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
- v_temp = _mm_srli_si128 (v_maxscore, 4);
- v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
- v_temp = _mm_srli_si128 (v_maxscore, 2);
- v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
- v_temp = _mm_srli_si128 (v_maxscore, 1);
- v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
-
- /* store in temporary variable */
- score = _mm_extract_epi16 (v_maxscore, 0);
- score = score & 0x00ff;
-
- /* check if we might have overflowed */
- if (score + bias >= 255)
- {
- score = 255;
- }
-
- /* return largest score */
- return score;
-}
-#else
-
-/* No SSE2 support. Avoid compiler complaints about empty object */
-
-int sw_dummy;
-
-#endif