binaries/src/fasta34/smith_waterman_sse2.c

   1 /******************************************************************
   2   Copyright 2006 by Michael Farrar.  All rights reserved.
   3   This program may not be sold or incorporated into a commercial product,
   4   in whole or in part, without written consent of Michael Farrar.  For
   5   further information regarding permission for use or reproduction, please
   6   contact: Michael Farrar at farrar.michael@gmail.com.
   7 *******************************************************************/
   8
   9 /*
  10   Written by Michael Farrar, 2006.
  11   Please send bug reports and/or suggestions to farrar.michael@gmail.com.
  12 */
  13
  14 #include <stdio.h>
  15
  16 #include "defs.h"
  17 #include "param.h"
  18 #include "dropgsw.h"
  19 #include "smith_waterman_sse2.h"
  20
  21 #ifdef __SUNPRO_C
  22 #include <sunmedia_intrin.h>
  23 #else
  24 #include <emmintrin.h>
  25 #endif
  26
  27 #ifdef SW_SSE2
  28
  29 int
  30 smith_waterman_sse2_word(const unsigned char *     query_sequence,
  31                          unsigned short *    query_profile_word,
  32                          const int                 query_length,
  33                          const unsigned char *     db_sequence,
  34                          const int                 db_length,
  35                          unsigned short      gap_open,
  36                          unsigned short      gap_extend,
  37                          struct f_struct *   f_str)
  38 {
  39     int     i, j, k;
  40     short   score;
  41
  42     int     cmp;
  43     int     iter = (query_length + 7) / 8;
  44
  45     __m128i *p;
  46     __m128i *workspace = (__m128i *) f_str->workspace;
  47
  48     __m128i E, F, H;
  49
  50     __m128i v_maxscore;
  51     __m128i v_gapopen;
  52     __m128i v_gapextend;
  53
  54     __m128i v_min;
  55     __m128i v_minimums;
  56     __m128i v_temp;
  57
  58     __m128i *pHLoad, *pHStore;
  59     __m128i *pE;
  60
  61     __m128i *pScore;
  62
  63     /* Load gap opening penalty to all elements of a constant */
  64     v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
  65     v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
  66     v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
  67
  68     /* Load gap extension penalty to all elements of a constant */
  69     v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
  70     v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
  71     v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
  72
  73     /* load v_maxscore with the zeros.  since we are using signed */
  74     /*  math, we will bias the maxscore to -32768 so we have the */
  75     /*  full range of the short. */
  76     v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
  77     v_maxscore = _mm_slli_epi16 (v_maxscore, 15);
  78
  79     v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);
  80
  81     v_min = _mm_shuffle_epi32 (v_maxscore, 0);
  82     v_min = _mm_srli_si128 (v_min, 14);
  83
  84     /* Zero out the storage vector */
  85     k = 2 * iter;
  86
  87     p = workspace;
  88     for (i = 0; i < k; i++)
  89     {
  90         _mm_store_si128 (p++, v_maxscore);
  91     }
  92
  93     pE = workspace;
  94     pHStore = pE + iter;
  95     pHLoad = pHStore + iter;
  96
  97     for (i = 0; i < db_length; ++i)
  98     {
  99         /* fetch first data asap. */
 100         pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;
 101
 102         /* bias all elements in F to -32768 */
 103         F = _mm_cmpeq_epi16 (F, F);
 104         F = _mm_slli_epi16 (F, 15);
 105
 106         /* load the next h value */
 107         H = _mm_load_si128 (pHStore + iter - 1);
 108         H = _mm_slli_si128 (H, 2);
 109         H = _mm_or_si128 (H, v_min);
 110
 111         p = pHLoad;
 112         pHLoad = pHStore;
 113         pHStore = p;
 114
 115         for (j = 0; j < iter; j++)
 116         {
 117             /* load E values */
 118             E = _mm_load_si128 (pE + j);
 119
 120             /* add score to H */
 121             H = _mm_adds_epi16 (H, *pScore++);
 122
 123             /* Update highest score encountered this far */
 124             v_maxscore = _mm_max_epi16 (v_maxscore, H);
 125
 126             /* get max from H, E and F */
 127             H = _mm_max_epi16 (H, E);
 128             H = _mm_max_epi16 (H, F);
 129
 130             /* save H values */
 131             _mm_store_si128 (pHStore + j, H);
 132
 133             /* subtract the gap open penalty from H */
 134             H = _mm_subs_epi16 (H, v_gapopen);
 135
 136             /* update E value */
 137             E = _mm_subs_epi16 (E, v_gapextend);
 138             E = _mm_max_epi16 (E, H);
 139
 140             /* update F value */
 141             F = _mm_subs_epi16 (F, v_gapextend);
 142             F = _mm_max_epi16 (F, H);
 143
 144             /* save E values */
 145             _mm_store_si128 (pE + j, E);
 146
 147             /* load the next h value */
 148             H = _mm_load_si128 (pHLoad + j);
 149         }
 150
 151         /* reset pointers to the start of the saved data */
 152         j = 0;
 153         H = _mm_load_si128 (pHStore + j);
 154
 155         /*  the computed F value is for the given column.  since */
 156         /*  we are at the end, we need to shift the F value over */
 157         /*  to the next column. */
 158         F = _mm_slli_si128 (F, 2);
 159         F = _mm_or_si128 (F, v_min);
 160         v_temp = _mm_subs_epi16 (H, v_gapopen);
 161         v_temp = _mm_cmpgt_epi16 (F, v_temp);
 162         cmp  = _mm_movemask_epi8 (v_temp);
 163
 164         while (cmp != 0x0000)
 165         {
 166             E = _mm_load_si128 (pE + j);
 167
 168             H = _mm_max_epi16 (H, F);
 169
 170             /* save H values */
 171             _mm_store_si128 (pHStore + j, H);
 172
 173             /* update E in case the new H value would change it */
 174             H = _mm_subs_epi16 (H, v_gapopen);
 175             E = _mm_max_epi16 (E, H);
 176             _mm_store_si128 (pE + j, E);
 177
 178             /* update F value */
 179             F = _mm_subs_epi16 (F, v_gapextend);
 180
 181             j++;
 182             if (j >= iter)
 183             {
 184                 j = 0;
 185                 F = _mm_slli_si128 (F, 2);
 186                 F = _mm_or_si128 (F, v_min);
 187             }
 188             H = _mm_load_si128 (pHStore + j);
 189
 190             v_temp = _mm_subs_epi16 (H, v_gapopen);
 191             v_temp = _mm_cmpgt_epi16 (F, v_temp);
 192             cmp  = _mm_movemask_epi8 (v_temp);
 193         }
 194     }
 195
 196     /* find largest score in the v_maxscore vector */
 197     v_temp = _mm_srli_si128 (v_maxscore, 8);
 198     v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
 199     v_temp = _mm_srli_si128 (v_maxscore, 4);
 200     v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
 201     v_temp = _mm_srli_si128 (v_maxscore, 2);
 202     v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
 203
 204     /* extract the largest score */
 205     score = _mm_extract_epi16 (v_maxscore, 0);
 206
 207     /* return largest score biased by 32768 */
 208     return score + 32768;
 209 }
 210
 211
 212
 213
 214 int
 215 smith_waterman_sse2_byte(const unsigned char *     query_sequence,
 216                          unsigned char *     query_profile_byte,
 217                          const int                 query_length,
 218                          const unsigned char *     db_sequence,
 219                          const int                 db_length,
 220                          unsigned char       bias,
 221                          unsigned char       gap_open,
 222                          unsigned char       gap_extend,
 223                          struct f_struct *   f_str)
 224 {
 225     int     i, j, k;
 226     int     score;
 227
 228     int     dup;
 229     int     cmp;
 230     int     iter = (query_length + 15) / 16;
 231
 232     __m128i *p;
 233     __m128i *workspace = (__m128i *) f_str->workspace;
 234
 235     __m128i E, F, H;
 236
 237     __m128i v_maxscore;
 238     __m128i v_bias;
 239     __m128i v_gapopen;
 240     __m128i v_gapextend;
 241
 242     __m128i v_temp;
 243     __m128i v_zero;
 244
 245     __m128i *pHLoad, *pHStore;
 246     __m128i *pE;
 247
 248     __m128i *pScore;
 249
 250     /* Load the bias to all elements of a constant */
 251     dup    = ((short) bias << 8) | bias;
 252     v_bias = _mm_insert_epi16 (v_bias, dup, 0);
 253     v_bias = _mm_shufflelo_epi16 (v_bias, 0);
 254     v_bias = _mm_shuffle_epi32 (v_bias, 0);
 255
 256     /* Load gap opening penalty to all elements of a constant */
 257     dup  = ((short) gap_open << 8) | gap_open;
 258     v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
 259     v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
 260     v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
 261
 262     /* Load gap extension penalty to all elements of a constant */
 263     dup  = ((short) gap_extend << 8) | gap_extend;
 264     v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
 265     v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
 266     v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
 267
 268     /* initialize the max score */
 269     v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore);
 270
 271     /* create a constant of all zeros for comparison */
 272     v_zero = _mm_xor_si128 (v_zero, v_zero);
 273
 274     /* Zero out the storage vector */
 275     k = iter * 2;
 276
 277     p = workspace;
 278     for (i = 0; i < k; i++)
 279     {
 280         _mm_store_si128 (p++, v_maxscore);
 281     }
 282
 283     pE = workspace;
 284     pHStore = pE + iter;
 285     pHLoad = pHStore + iter;
 286
 287     for (i = 0; i < db_length; ++i)
 288     {
 289         /* fetch first data asap. */
 290         pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;
 291
 292         /* zero out F value. */
 293         F = _mm_xor_si128 (F, F);
 294
 295         /* load the next h value */
 296         H = _mm_load_si128 (pHStore + iter - 1);
 297         H = _mm_slli_si128 (H, 1);
 298
 299         p = pHLoad;
 300         pHLoad = pHStore;
 301         pHStore = p;
 302
 303         for (j = 0; j < iter; j++)
 304         {
 305             /* load values E. */
 306             E = _mm_load_si128 (pE + j);
 307
 308             /* add score to H */
 309             H = _mm_adds_epu8 (H, *pScore++);
 310             H = _mm_subs_epu8 (H, v_bias);
 311
 312             /* Update highest score encountered this far */
 313             v_maxscore = _mm_max_epu8 (v_maxscore, H);
 314
 315             /* get max from H, E and F */
 316             H = _mm_max_epu8 (H, E);
 317             H = _mm_max_epu8 (H, F);
 318
 319             /* save H values */
 320             _mm_store_si128 (pHStore + j, H);
 321
 322             /* subtract the gap open penalty from H */
 323             H = _mm_subs_epu8 (H, v_gapopen);
 324
 325             /* update E value */
 326             E = _mm_subs_epu8 (E, v_gapextend);
 327             E = _mm_max_epu8 (E, H);
 328
 329             /* update F value */
 330             F = _mm_subs_epu8 (F, v_gapextend);
 331             F = _mm_max_epu8 (F, H);
 332
 333             /* save E values */
 334             _mm_store_si128 (pE + j, E);
 335
 336             /* load the next h value */
 337             H = _mm_load_si128 (pHLoad + j);
 338         }
 339
 340         /* reset pointers to the start of the saved data */
 341         j = 0;
 342         H = _mm_load_si128 (pHStore + j);
 343
 344         /*  the computed F value is for the given column.  since */
 345         /*  we are at the end, we need to shift the F value over */
 346         /*  to the next column. */
 347         F = _mm_slli_si128 (F, 1);
 348         v_temp = _mm_subs_epu8 (H, v_gapopen);
 349         v_temp = _mm_subs_epu8 (F, v_temp);
 350         v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
 351         cmp  = _mm_movemask_epi8 (v_temp);
 352
 353         while (cmp != 0xffff)
 354         {
 355             E = _mm_load_si128 (pE + j);
 356
 357             H = _mm_max_epu8 (H, F);
 358
 359             /* save H values */
 360             _mm_store_si128 (pHStore + j, H);
 361
 362             /* update E in case the new H value would change it */
 363             H = _mm_subs_epu8 (H, v_gapopen);
 364             E = _mm_max_epu8 (E, H);
 365             _mm_store_si128 (pE + j, E);
 366
 367             /* update F value */
 368             F = _mm_subs_epu8 (F, v_gapextend);
 369
 370             j++;
 371             if (j >= iter)
 372             {
 373                 j = 0;
 374                 F = _mm_slli_si128 (F, 1);
 375             }
 376             H = _mm_load_si128 (pHStore + j);
 377
 378             v_temp = _mm_subs_epu8 (H, v_gapopen);
 379             v_temp = _mm_subs_epu8 (F, v_temp);
 380             v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
 381             cmp  = _mm_movemask_epi8 (v_temp);
 382         }
 383     }
 384
 385     /* find largest score in the v_maxscore vector */
 386     v_temp = _mm_srli_si128 (v_maxscore, 8);
 387     v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
 388     v_temp = _mm_srli_si128 (v_maxscore, 4);
 389     v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
 390     v_temp = _mm_srli_si128 (v_maxscore, 2);
 391     v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
 392     v_temp = _mm_srli_si128 (v_maxscore, 1);
 393     v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
 394
 395     /* store in temporary variable */
 396     score = _mm_extract_epi16 (v_maxscore, 0);
 397     score = score & 0x00ff;
 398
 399     /*  check if we might have overflowed */
 400     if (score + bias >= 255)
 401     {
 402         score = 255;
 403     }
 404
 405     /* return largest score */
 406     return score;
 407 }
 408 #else
 409
 410 /* No SSE2 support. Avoid compiler complaints about empty object */
 411
 412 int sw_dummy;
 413
 414 #endif