#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse2.h
    \brief Compute functions for SSE2 SIMD instruction set (internal)
*/

#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"
#include "bmutil.h"


#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

namespace bm
{

/*!
    SSE2 optimized bitcounting function; implements the parallel
    bitcounting algorithm below independently in each 32-bit lane:

<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1);       // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);    // tmp1 = (b >> 1) & 0x55555555
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);    // tmp2 = b & 0x55555555
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);    // (b >> 2) & 0x33333333
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);    // b & 0x33333333
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);    // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);    // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);    // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);   // b >> 16
        b    = _mm_add_epi32(b, tmp1);  // b + (b >> 16)
        b    = _mm_and_si128(b, m4);    // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);  // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
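
/*
    Usage sketch (illustrative only; assumes bm::set_block_size from
    bmconst.h as the block length in 32-bit words, and 16-byte alignment
    via the BM_ALIGN16 macros used throughout this file):

    bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0,};
    blk[0] = 0xFF; // 8 bits set
    bm::id_t cnt = bm::sse2_bit_count(
                        (const __m128i*) blk,
                        (const __m128i*) (blk + bm::set_block_size)); // == 8
*/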
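/*!
    SSE2 optimized bitcount of a logical operation result: applies the
    binary functor sse2_func (e.g. sse2_and, sse2_or, sse2_xor, sse2_sub,
    see the VECT_BITCOUNT_* macros below) to each pair of 128-bit words
    from block and mask_block, then accumulates the population count of
    the result using the same parallel algorithm as sse2_bit_count.

    @ingroup SSE2
*/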
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1);       // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);         // apply the logical operation

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);    // tmp1 = (b >> 1) & 0x55555555
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);    // tmp2 = b & 0x55555555
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);    // (b >> 2) & 0x33333333
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);    // b & 0x33333333
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);    // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);    // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);    // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);   // b >> 16
        b    = _mm_add_epi32(b, tmp1);  // b + (b >> 16)
        b    = _mm_and_si128(b, m4);    // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);  // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
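
/*
    Usage sketch (illustrative): count bits of the intersection of two
    aligned blocks without materializing the AND result. sse2_and is the
    functor passed by the VECT_BITCOUNT_AND macro below; blk1/blk2 are
    assumed to be 16-byte aligned blocks of bm::set_block_size words.

    bm::id_t cnt = bm::sse2_bit_count_op(
                        (const __m128i*) blk1,
                        (const __m128i*) (blk1 + bm::set_block_size),
                        (const __m128i*) blk2,
                        bm::sse2_and);
*/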
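/*!
    SSE2 optimized calculation of two block statistics in one pass:
    the total bit count (stored into *bit_count) and the returned number
    of bit-value changes (0-1 / 1-0 transitions, derived per word from
    popcount(w ^ (w >> 1)) with corrections at word boundaries), which
    can be used to estimate GAP compressibility of the block.

    @ingroup SSE2
*/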
inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1);       // bit_cnt = 0

    __m128i tmp1, tmp2;

    // start with one change per 32-bit word; corrected at word boundaries below
    int count = (int)(block_end - block) * 4;

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^ (w >> 1)
            tmp1 = _mm_srli_epi32(b, 1);        // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1);      // tmp2 = tmp1 ^ b
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // compare with zero (SSE4 could use _mm_test_all_zeros())
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                tmp1 = _mm_and_si128(tmp1, m1); // tmp1 still holds b >> 1
                tmp2 = _mm_and_si128(b, m1);    // tmp2 = b & 0x55555555
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);    // (b >> 2) & 0x33333333
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);    // b & 0x33333333
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);    // tmp1 = b >> 4
                b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 4)
                b    = _mm_and_si128(b, m3);    // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32(b, 8);    // tmp1 = b >> 8
                b    = _mm_add_epi32(b, tmp1);  // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32(b, 16);   // b >> 16
                b    = _mm_add_epi32(b, tmp1);  // b + (b >> 16)
                b    = _mm_and_si128(b, m4);    // (b + (b >> 16)) & 0x0000003F

                mcnt = _mm_add_epi32(mcnt, b);  // mcnt += b
            }
        }
        // ---------------------------------------------------------------------
        {
            // TODO: with SSE4 the words could be read via _mm_extract_epi32(b, i)
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false;
            }
            else
            {
                if (0 != (w0 = blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if (0 != (w0 = blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0 != (w0 = blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return unsigned(count);
}
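
/*
    Usage sketch (illustrative): both statistics come back from a single
    scan over an aligned block of bm::set_block_size words.

    unsigned bc;
    bm::id_t change_count = bm::sse2_bit_block_calc_count_change(
                        (const __m128i*) blk,
                        (const __m128i*) (blk + bm::set_block_size),
                        &bc);
    // bc == total set bits; change_count == number of bit-value changes
*/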

#ifdef __GNUG__
// necessary measure to silence a false GCC warning about negative pointer arithmetic
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

/*!
    SSE2 check of one to two (variable length) 128-bit SSE lines
    for GAP search results (8 elements each)
    \internal
*/
inline
unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
                       const bm::gap_word_t pos, const unsigned size)
{
    BM_ASSERT(size <= 16);
    BM_ASSERT(size);

    const unsigned unroll_factor = 8;
    if (size < 4) // for a very short vector use a conventional scan
    {
        unsigned j;
        for (j = 0; j < size; ++j)
        {
            if (pbuf[j] >= pos)
                break;
        }
        return j;
    }

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf));     // load first 8 elements

    maskF = _mm_cmpeq_epi32(mz, mz);            // set all FF
    maskFL = _mm_slli_si128(maskF, 4 * 2);      // byte shift to make [0000 FFFF]
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL);    // additional bit shift to [0000 00FF]

    m1 = _mm_andnot_si128(maskFL, m1);          // m1 = (~mask) & m1
    m1 = _mm_or_si128(m1, maskFL);              // fill the unused tail with all-1s sentinels

    __m128i mp = _mm_set1_epi16(pos);           // broadcast pos into all elements of a SIMD vector
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
    int mi = _mm_movemask_epi8(mge_mask);       // collect flag bits
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return bsr_i;                           // index of the first element >= pos (target)
    }
    if (size == 8)
        return size;

    // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
    // GCC gives a false warning for "- unroll_factor" here
    const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
    BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure the GCC warning is indeed false

    m1 = _mm_loadu_si128((__m128i*)(pbuf2));    // load the next elements (with possible overlap)
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
    mi = _mm_movemask_epi8(mge_mask);
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return size - (unroll_factor - bsr_i);
    }
    return size;
}
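
/*
    Note on the unsigned comparison above: SSE2 has no unsigned 16-bit
    compare, so "m1 >= mp" is derived from saturating subtraction:
    _mm_subs_epu16(mp, m1) yields 0 exactly in the lanes where m1 >= mp,
    and _mm_cmpeq_epi16 against zero turns those lanes into 0xFFFF flags
    for _mm_movemask_epi8.

    Worked example (illustrative): pbuf = {2, 5, 9, 14, ...}, pos = 9.
    Lanes 0..1 stay 0 in mge_mask; lane 2 (value 9) is the first flagged
    lane, so the lowest set bit of mi is at byte 4; bit_scan_fwd(mi) == 4
    and 4 >> 1 == 2, the index of the first element >= pos.
*/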
#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)(mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)(mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)(first))

#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) (dst), (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) (dst), (value))
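
/*
    The VECT_* macros above plug the SSE2 kernels into the library's
    generic block algorithms; they take effect when the SSE2 code path
    is selected at compile time (commonly by defining BMSSE2OPT before
    including the library headers).
*/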


} // namespace bm


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#endif // BMSSE2__H__INCLUDED__