1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
50 #pragma warning( push )
51 #pragma warning( disable : 4146)
86 count += unsigned( _mm_popcnt_u64(b[0]) +
87 _mm_popcnt_u64(b[1]));
93 const unsigned* b = (unsigned*) block;
94 count += _mm_popcnt_u32(b[0]) +
95 _mm_popcnt_u32(b[1]) +
96 _mm_popcnt_u32(b[2]) +
97 _mm_popcnt_u32(b[3]);
98 }
while (++block < block_end);
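For reference, a scalar sketch of what this loop computes: the SSE4.2 path simply feeds 64-bit (or 32-bit) slices of the block to the POPCNT instruction and sums the results. The helper name below is illustrative, not part of this header.

// Hedged scalar equivalent of sse4_bit_count: sum POPCNT over the
// 32-bit words of [first, last).
static unsigned bit_count_scalar(const unsigned* first, const unsigned* last)
{
    unsigned count = 0;
    for (; first < last; ++first)
        count += unsigned(_mm_popcnt_u32(*first));
    return count;
}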
109 unsigned ret = (a ^ b);
117 unsigned op_or(unsigned a, unsigned b)
142 __m128i tmp0 = _mm_load_si128(block);
143 __m128i tmp1 = _mm_load_si128(mask_block);
144 __m128i b = sse2_func(tmp0, tmp1);
146 count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147 count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
149 ++block; ++mask_block;
150 }
while (block < block_end);
154 __m128i tmp0 = _mm_load_si128(block);
155 __m128i tmp1 = _mm_load_si128(mask_block);
156 __m128i b = sse2_func(tmp0, tmp1);
158 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
163 ++block; ++mask_block;
164 }
while (block < block_end);
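sse4_bit_count_op popcounts sse2_func(*block, *mask_block) for every pair of 128-bit words, so the functor argument has the same shape as the op_* helpers above. A minimal illustrative functor (the name is an assumption, not from this listing):

// Example functor: AND-combine two vectors; the caller popcounts the
// result, giving the cardinality of the intersection.
inline __m128i sse2_and_example(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}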
178 __m128i maskz = _mm_setzero_si128();
184 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
187 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
191 }
while (block < block_end);
202 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204 wA = _mm_or_si128(wA, wB);
205 bool z1 = _mm_test_all_zeros(wA, wA);
207 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209 wA = _mm_or_si128(wA, wB);
210 bool z2 = _mm_test_all_zeros(wA, wA);
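A digest stride is a short, fixed-size slice of the bit-block; the eight loads above cover 8 x 128 bits, i.e. 32 words, which is assumed in the scalar sketch below (helper name illustrative).

// Hedged scalar equivalent of sse4_is_digest_zero: OR the whole stride
// together and test the accumulator.
static bool is_digest_zero_scalar(const unsigned* stride)
{
    unsigned acc = 0;
    for (unsigned i = 0; i < 32; ++i)  // 32 words = 8 x __m128i
        acc |= stride[i];
    return acc == 0;
}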
221 __m128i mV = _mm_set1_epi32(int(value));
222 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
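The eight stores fill one digest stride (32 words under the same assumption) with either 0 or ~0u. Scalar sketch:

// Hedged scalar equivalent of sse4_block_set_digest.
static void block_set_digest_scalar(unsigned* stride, unsigned value)
{
    for (unsigned i = 0; i < 32; ++i)  // 32 words = 8 x __m128i
        stride[i] = value;
}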
240 __m128i m1A, m1B, m1C, m1D;
241 __m128i accA, accB, accC, accD;
246 accA = accB = accC = accD = _mm_setzero_si128();
250 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
255 _mm_store_si128(dst+0, m1A);
256 _mm_store_si128(dst+1, m1B);
257 _mm_store_si128(dst+2, m1C);
258 _mm_store_si128(dst+3, m1D);
260 accA = _mm_or_si128(accA, m1A);
261 accB = _mm_or_si128(accB, m1B);
262 accC = _mm_or_si128(accC, m1C);
263 accD = _mm_or_si128(accD, m1D);
266 }
while (src < src_end);
268 accA = _mm_or_si128(accA, accB);
269 accC = _mm_or_si128(accC, accD);
270 accA = _mm_or_si128(accA, accC);
272 return !_mm_testz_si128(accA, accA);
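The acc* registers only exist to answer whether anything survived the AND; the return value means "destination block is not empty". Scalar sketch (the word count is a parameter here because the full loop bounds are not shown in this listing):

// Hedged scalar equivalent of sse4_and_block: dst &= src, report
// whether any non-zero word remains.
static bool and_block_scalar(unsigned* dst, const unsigned* src, unsigned n_words)
{
    unsigned acc = 0;
    for (unsigned i = 0; i < n_words; ++i)
        acc |= (dst[i] &= src[i]);
    return acc != 0;
}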
287 __m128i m1A, m1B, m1C, m1D;
289 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
294 _mm_store_si128(dst+0, m1A);
295 _mm_store_si128(dst+1, m1B);
296 _mm_store_si128(dst+2, m1C);
297 _mm_store_si128(dst+3, m1D);
299 m1A = _mm_or_si128(m1A, m1B);
300 m1C = _mm_or_si128(m1C, m1D);
301 m1A = _mm_or_si128(m1A, m1C);
303 bool z1 = _mm_testz_si128(m1A, m1A);
305 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
310 _mm_store_si128(dst+4, m1A);
311 _mm_store_si128(dst+5, m1B);
312 _mm_store_si128(dst+6, m1C);
313 _mm_store_si128(dst+7, m1D);
315 m1A = _mm_or_si128(m1A, m1B);
316 m1C = _mm_or_si128(m1C, m1D);
317 m1A = _mm_or_si128(m1A, m1C);
319 bool z2 = _mm_testz_si128(m1A, m1A);
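Each half of the stride yields its own zero flag (z1, z2); the return statement is not shown here, but the contract is to report whether the whole stride became zero so the caller can clear the matching digest bit. Scalar sketch under the 32-word stride assumption:

// Hedged scalar equivalent of sse4_and_digest: AND one stride in
// place, return true if the stride is now all zero.
static bool and_digest_scalar(unsigned* dst, const unsigned* src)
{
    unsigned acc = 0;
    for (unsigned i = 0; i < 32; ++i)
        acc |= (dst[i] &= src[i]);
    return acc == 0;
}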
336 __m128i m1A, m1B, m1C, m1D;
338 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
343 _mm_store_si128(dst+0, m1A);
344 _mm_store_si128(dst+1, m1B);
345 _mm_store_si128(dst+2, m1C);
346 _mm_store_si128(dst+3, m1D);
348 m1A = _mm_or_si128(m1A, m1B);
349 m1C = _mm_or_si128(m1C, m1D);
350 m1A = _mm_or_si128(m1A, m1C);
352 bool z1 = _mm_testz_si128(m1A, m1A);
354 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
359 _mm_store_si128(dst+4, m1A);
360 _mm_store_si128(dst+5, m1B);
361 _mm_store_si128(dst+6, m1C);
362 _mm_store_si128(dst+7, m1D);
364 m1A = _mm_or_si128(m1A, m1B);
365 m1C = _mm_or_si128(m1C, m1D);
366 m1A = _mm_or_si128(m1A, m1C);
368 bool z2 = _mm_testz_si128(m1A, m1A);
385 __m128i m1A, m1B, m1C, m1D;
386 __m128i m1E, m1F, m1G, m1H;
388 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
393 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
398 m1A = _mm_and_si128(m1A, m1E);
399 m1B = _mm_and_si128(m1B, m1F);
400 m1C = _mm_and_si128(m1C, m1G);
401 m1D = _mm_and_si128(m1D, m1H);
403 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
408 _mm_store_si128(dst+0, m1A);
409 _mm_store_si128(dst+1, m1B);
410 _mm_store_si128(dst+2, m1C);
411 _mm_store_si128(dst+3, m1D);
413 m1A = _mm_or_si128(m1A, m1B);
414 m1C = _mm_or_si128(m1C, m1D);
415 m1A = _mm_or_si128(m1A, m1C);
417 bool z1 = _mm_testz_si128(m1A, m1A);
419 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
424 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
429 m1A = _mm_and_si128(m1A, m1E);
430 m1B = _mm_and_si128(m1B, m1F);
431 m1C = _mm_and_si128(m1C, m1G);
432 m1D = _mm_and_si128(m1D, m1H);
434 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
439 _mm_store_si128(dst+4, m1A);
440 _mm_store_si128(dst+5, m1B);
441 _mm_store_si128(dst+6, m1C);
442 _mm_store_si128(dst+7, m1D);
444 m1A = _mm_or_si128(m1A, m1B);
445 m1C = _mm_or_si128(m1C, m1D);
446 m1A = _mm_or_si128(m1A, m1C);
448 bool z2 = _mm_testz_si128(m1A, m1A);
465 __m128i m1A, m1B, m1C, m1D;
467 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
472 _mm_store_si128(dst+0, m1A);
473 _mm_store_si128(dst+1, m1B);
474 _mm_store_si128(dst+2, m1C);
475 _mm_store_si128(dst+3, m1D);
477 m1A = _mm_or_si128(m1A, m1B);
478 m1C = _mm_or_si128(m1C, m1D);
479 m1A = _mm_or_si128(m1A, m1C);
481 bool z1 = _mm_testz_si128(m1A, m1A);
483 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
488 _mm_store_si128(dst+4, m1A);
489 _mm_store_si128(dst+5, m1B);
490 _mm_store_si128(dst+6, m1C);
491 _mm_store_si128(dst+7, m1D);
493 m1A = _mm_or_si128(m1A, m1B);
494 m1C = _mm_or_si128(m1C, m1D);
495 m1A = _mm_or_si128(m1A, m1C);
497 bool z2 = _mm_testz_si128(m1A, m1A);
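Same structure with AND-NOT; note that _mm_andnot_si128(a, b) computes (~a) & b, which is why the source operand is loaded first and the destination second. Scalar sketch:

// Hedged scalar equivalent of sse4_sub_digest: dst &= ~src over one
// stride, return true if the stride became all zero.
static bool sub_digest_scalar(unsigned* dst, const unsigned* src)
{
    unsigned acc = 0;
    for (unsigned i = 0; i < 32; ++i)
        acc |= (dst[i] &= ~src[i]);
    return acc == 0;
}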
515 __m128i m1A, m1B, m1C, m1D;
517 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
522 _mm_store_si128(dst+0, m1A);
523 _mm_store_si128(dst+1, m1B);
524 _mm_store_si128(dst+2, m1C);
525 _mm_store_si128(dst+3, m1D);
527 m1A = _mm_or_si128(m1A, m1B);
528 m1C = _mm_or_si128(m1C, m1D);
529 m1A = _mm_or_si128(m1A, m1C);
531 bool z1 = _mm_testz_si128(m1A, m1A);
533 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
538 _mm_store_si128(dst+4, m1A);
539 _mm_store_si128(dst+5, m1B);
540 _mm_store_si128(dst+6, m1C);
541 _mm_store_si128(dst+7, m1D);
543 m1A = _mm_or_si128(m1A, m1B);
544 m1C = _mm_or_si128(m1C, m1D);
545 m1A = _mm_or_si128(m1A, m1C);
547 bool z2 = _mm_testz_si128(m1A, m1A);
567 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568 if (!_mm_test_all_ones(w))
570 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571 if (!_mm_test_all_ones(w))
575 }
while (block < block_end);
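Here pairs of 128-bit words are ANDed and checked with _mm_test_all_ones, so a single cleared bit anywhere makes the test fail. Scalar sketch:

// Hedged scalar equivalent of sse4_is_all_one: every word must be ~0u.
static bool is_all_one_scalar(const unsigned* block, const unsigned* block_end)
{
    for (; block < block_end; ++block)
        if (*block != ~0u)
            return false;
    return true;
}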
586 __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
587 return _mm_testz_si128(w0, w0);
597 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
598 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
599 w0 = _mm_or_si128(w0, w1);
600 return _mm_testz_si128(w0, w0);
610 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
611 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
612 w0 = _mm_xor_si128(w0, w1);
613 return _mm_testz_si128(w0, w0);
624 const __m128i* block_end =
626 __m128i m1COshft, m2COshft;
631 unsigned co2, co1 = 0;
632 for (;block < block_end; block += 2)
634 __m128i m1A = _mm_load_si128(block);
635 __m128i m2A = _mm_load_si128(block+1);
637 __m128i m1CO = _mm_srli_epi32(m1A, 31);
638 __m128i m2CO = _mm_srli_epi32(m2A, 31);
640 co2 = _mm_extract_epi32(m1CO, 3);
642 __m128i m1As = _mm_slli_epi32(m1A, 1);
643 __m128i m2As = _mm_slli_epi32(m2A, 1);
645 m1COshft = _mm_slli_si128 (m1CO, 4);
646 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
650 co2 = _mm_extract_epi32(m2CO, 3);
652 m2COshft = _mm_slli_si128 (m2CO, 4);
653 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
655 m1As = _mm_or_si128(m1As, m1COshft);
656 m2As = _mm_or_si128(m2As, m2COshft);
661 m1A = _mm_xor_si128(m1A, m1As);
662 m2A = _mm_xor_si128(m2A, m2As);
667 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
669 m0 = _mm_extract_epi64(m2A, 0);
670 m1 = _mm_extract_epi64(m2A, 1);
671 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
673 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
674 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
675 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
676 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
677 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
678 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
680 m0 = _mm_extract_epi32(m2A, 0);
681 m1 = _mm_extract_epi32(m2A, 1);
682 m2 = _mm_extract_epi32(m2A, 2);
683 m3 = _mm_extract_epi32(m2A, 3);
684 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
685 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
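The shift/insert sequence aligns every bit with its left neighbour (carrying the previous word's top bit in at bit 0), XORs, and popcounts, which yields the number of adjacent positions whose bits differ. A per-word scalar sketch (initialization and block-boundary handling follow the full implementation, which is not fully shown here):

// Hedged per-word step of the change count; `co` carries the previous
// word's most significant bit.
static void change_count_step(unsigned w, unsigned& co, unsigned& count)
{
    unsigned shifted = (w << 1) | co;                 // align neighbour bits
    count += unsigned(_mm_popcnt_u32(w ^ shifted));   // count differing pairs
    co = w >> 31;                                     // carry out: old MSB
}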
696 #pragma GCC diagnostic push
697 #pragma GCC diagnostic ignored "-Warray-bounds"
711 const unsigned unroll_factor = 8;
715 for (j = 0; j < size; ++j)
723 __m128i m1, mz, maskF, maskFL;
725 mz = _mm_setzero_si128();
726 m1 = _mm_loadu_si128((__m128i*)(pbuf));
728 maskF = _mm_cmpeq_epi64(mz, mz);
729 maskFL = _mm_slli_si128(maskF, 4 * 2);
730 int shiftL= (64 - (unroll_factor - size) * 16);
731 maskFL = _mm_slli_epi64(maskFL, shiftL);
733 m1 = _mm_andnot_si128(maskFL, m1);
734 m1 = _mm_or_si128(m1, maskFL);
736 __m128i mp = _mm_set1_epi16(pos);
737 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
738 __m128i c_mask = _mm_slli_epi16(mge_mask, 15);
739 int mi = _mm_movemask_epi8(c_mask);
743 unsigned bc = _mm_popcnt_u32(mi);
744 return unroll_factor - bc;
751 m1 = _mm_loadu_si128((__m128i*)(pbuf2));
752 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
753 mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
754 unsigned bc = _mm_popcnt_u32(mi);
772 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
773 __m128i mm_val = _mm_set1_epi32(value);
775 __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8);
776 __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8);
778 __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
779 __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
781 __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
782 int mask = _mm_movemask_epi8(cmp_mask_ge);
785 int bsf = bm::bsf_asm32(mask);
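SSE4.2 has no unsigned 32-bit compare, so both sides are biased by 0x80000000 and compared as signed integers; equality on the unbiased values is ORed in, and the byte mask plus BSF locates the first lane that is >= value. The underlying identity as a scalar sketch (two's complement assumed, helper name illustrative):

// unsigned a >= b  <=>  signed compare after subtracting 0x80000000
static bool uge_via_signed_bias(unsigned a, unsigned b)
{
    return int(a - 0x80000000u) >= int(b - 0x80000000u);
}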
807 const unsigned* BMRESTRICT arr_base = &arr[from];
809 unsigned unroll_factor = 8;
810 unsigned len = to - from + 1;
811 unsigned len_unr = len - (len % unroll_factor);
813 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
814 __m128i vect_target = _mm_set1_epi32(target);
815 __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8);
818 __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
821 for (; k < len_unr; k+=unroll_factor)
823 vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k]));
824 norm_vect40 = _mm_sub_epi32(vect40, mask0x8);
826 cmp_mask_ge = _mm_or_si128(
827 _mm_cmpgt_epi32 (norm_vect40, norm_target),
828 _mm_cmpeq_epi32 (vect40, vect_target)
830 mask = _mm_movemask_epi8(cmp_mask_ge);
833 int bsf = bm::bsf_asm32(mask);
834 return from + k + (bsf / 4);
836 vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
837 norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
839 cmp_mask_ge = _mm_or_si128(
840 _mm_cmpgt_epi32 (norm_vect41, norm_target),
841 _mm_cmpeq_epi32 (vect41, vect_target)
843 mask = _mm_movemask_epi8(cmp_mask_ge);
846 int bsf = bm::bsf_asm32(mask);
847 return 4 + from + k + (bsf / 4);
853 if (arr_base[k] >= target)
854 return from + k;
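A hypothetical usage sketch (array contents are illustrative): the function returns the first index in [from..to] whose element is >= target, assuming arr is sorted in ascending order.

unsigned arr[] = { 1, 5, 7, 12, 25, 25, 40, 51, 60, 70, 80, 90 };
unsigned pos = bm::sse4_lower_bound_scan_u32(arr, 25, 0, 11);
// pos == 4: arr[4] is the first element >= 25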
867 unsigned nb, unsigned start)
869 const unsigned unroll_factor = 8;
870 const unsigned len = (size - start);
871 const unsigned len_unr = len - (len % unroll_factor);
876 __m128i nbM = _mm_set1_epi32(nb);
878 for (k = 0; k < len_unr; k+=unroll_factor)
880 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
881 __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
885 if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
886 !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
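The SIMD loop bails out as soon as any of the eight indexes addresses a block other than nb; the scalar tail (not shown in full here) finishes element by element. The overall contract, as a hedged scalar sketch:

// Walk idx[] from `start` while the values still fall into block `nb`;
// return the position where that stops.
static unsigned idx_block_lookup_scalar(const unsigned* idx, unsigned size,
                                        unsigned nb, unsigned start)
{
    for (; start < size; ++start)
        if ((idx[start] >> bm::set_block_shift) != nb)
            break;
    return start;
}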
905 unsigned start, unsigned stop )
907 const unsigned unroll_factor = 4;
908 const unsigned len = (stop - start);
909 const unsigned len_unr = len - (len % unroll_factor);
920 for (; k < len_unr; k+=unroll_factor)
922 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
923 __m128i nbitA = _mm_and_si128 (idxA, sb_mask);
927 nbitA = _mm_and_si128 (nbitA, sw_mask);
928 _mm_store_si128 ((__m128i*)mshift_v, nbitA);
932 __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0);
933 __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA);
934 if (_mm_test_all_ones(cmpA))
936 unsigned nword = _mm_extract_epi32(nwordA, 0);
937 block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
938 |(1u << mshift_v[2]) | (1u << mshift_v[3]);
942 _mm_store_si128 ((__m128i*)mword_v, nwordA);
944 block[mword_v[0]] |= (1u << mshift_v[0]);
945 block[mword_v[1]] |= (1u << mshift_v[1]);
946 block[mword_v[2]] |= (1u << mshift_v[2]);
947 block[mword_v[3]] |= (1u << mshift_v[3]);
958 block[nword] |= (1u << nbit);
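Whether the all-same-word fast path or the per-word scatter path is taken, each index sets exactly one bit, using the standard block/word/bit split built from the constants listed at the end of this file. Scalar sketch for a single index value:

// Hedged scalar equivalent of one sse42_set_block_bits element.
static void set_block_bit_scalar(bm::word_t* block, unsigned idx_value)
{
    unsigned nbit  = idx_value & bm::set_block_mask;  // bit offset in block
    unsigned nword = nbit >> bm::set_word_shift;      // 32-bit word index
    nbit &= bm::set_word_mask;                        // bit offset in word
    block[nword] |= (1u << nbit);
}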
994 const unsigned unroll_factor = 4;
995 const unsigned len = (size - start);
996 const unsigned len_unr = len - (len % unroll_factor);
1000 __m128i maskFF = _mm_set1_epi32(~0u);
1001 __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1003 __m128i mask_tmp, mask_0;
1009 unsigned base = start + k;
1010 __m128i* idx_ptr = (__m128i*)(idx + base);
1011 __m128i* target_ptr = (__m128i*)(arr + base);
1012 for (; k < len_unr; k+=unroll_factor)
1014 __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask);
1017 _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1018 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1026 __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1027 __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1028 mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1029 mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1030 mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1032 __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1033 _mm_slli_si128 (am_0, 8));
1034 mask_tmp = _mm_and_si128 (
1035 _mm_slli_epi32(mask1, mshift_v[3]),
1036 _mm_slli_si128 (am_0, 12)
1039 mask_0 = _mm_or_si128 (mask_0,
1040 _mm_or_si128 (mask_2, mask_tmp));
1043 mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1048 mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1049 blk[mword_v[1]], blk[mword_v[0]]),
1054 mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ);
1055 mask_tmp = _mm_xor_si128 (mask_tmp, maskFF);
1056 mask_tmp = _mm_srli_epi32 (mask_tmp, 31);
1058 mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx);
1060 _mm_storeu_si128 (target_ptr,
1061 _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1063 ++idx_ptr; ++target_ptr;
1064 _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1067 for (; k < len; ++k)
1083 __m128i* block_end =
1085 __m128i mAcc = _mm_set1_epi32(0);
1086 __m128i mMask1 = _mm_set1_epi32(1);
1089 for (--block_end; block_end >= block; block_end -= 2)
1091 __m128i m1A = _mm_load_si128(block_end);
1092 __m128i m2A = _mm_load_si128(block_end-1);
1094 __m128i m1CO = _mm_and_si128(m1A, mMask1);
1095 __m128i m2CO = _mm_and_si128(m2A, mMask1);
1097 co2 = _mm_extract_epi32(m1CO, 0);
1099 m1A = _mm_srli_epi32(m1A, 1);
1100 m2A = _mm_srli_epi32(m2A, 1);
1102 __m128i m1COshft = _mm_srli_si128 (m1CO, 4);
1103 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1104 m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1105 m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1106 m1COshft = _mm_slli_epi32(m1COshft, 31);
1107 m2COshft = _mm_slli_epi32(m2COshft, 31);
1109 m1A = _mm_or_si128(m1A, m1COshft);
1110 m2A = _mm_or_si128(m2A, m2COshft);
1112 co1 = _mm_extract_epi32(m2CO, 0);
1114 _mm_store_si128(block_end, m1A);
1115 _mm_store_si128(block_end-1, m2A);
1117 mAcc = _mm_or_si128(mAcc, m1A);
1118 mAcc = _mm_or_si128(mAcc, m2A);
1121 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1133 __m128i* block_end =
1135 __m128i m1COshft, m2COshft;
1136 __m128i mAcc = _mm_set1_epi32(0);
1139 for (;block < block_end; block += 2)
1141 __m128i m1A = _mm_load_si128(block);
1142 __m128i m2A = _mm_load_si128(block+1);
1144 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1145 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1147 co2 = _mm_extract_epi32(m1CO, 3);
1149 m1A = _mm_slli_epi32(m1A, 1);
1150 m2A = _mm_slli_epi32(m2A, 1);
1152 m1COshft = _mm_slli_si128 (m1CO, 4);
1153 m2COshft = _mm_slli_si128 (m2CO, 4);
1154 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1155 m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1157 m1A = _mm_or_si128(m1A, m1COshft);
1158 m2A = _mm_or_si128(m2A, m2COshft);
1160 co1 = _mm_extract_epi32(m2CO, 3);
1162 _mm_store_si128(block, m1A);
1163 _mm_store_si128(block+1, m2A);
1165 mAcc = _mm_or_si128(mAcc, m1A);
1166 mAcc = _mm_or_si128(mAcc, m2A);
1168 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
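Per 32-bit word, shifting the bit-block right by one position is a left shift with carry: the previous word's bit 31 becomes the new word's bit 0, which is exactly what the byte-shift of the carry vector plus _mm_insert_epi32 stitches together across lanes. Scalar sketch of one word step:

// Hedged scalar step for sse42_shift_r1 (bit i moves to bit i+1).
// `co` is the carry coming from the previous (lower) word; the return
// value is the carry handed to the next word.
static unsigned shift_r1_word(unsigned& w, unsigned co)
{
    unsigned co_out = w >> 31;   // bit leaving this word
    w = (w << 1) | co;           // shift and inject the incoming carry
    return co_out;
}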
1189 __m128i m1COshft, m2COshft;
1190 __m128i mAcc = _mm_set1_epi32(0);
1201 di = unsigned(_mm_popcnt_u64(t - 1));
1208 di += unsigned(_mm_popcnt_u32(t32 - 1));
1212 for (; di < 64 ; ++di)
1218 block = (__m128i*) &wblock[d_base];
1219 mask_block = (__m128i*) &mblock[d_base];
1220 mAcc = _mm_xor_si128(mAcc, mAcc);
1221 for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1223 __m128i m1A = _mm_load_si128(block);
1224 __m128i m2A = _mm_load_si128(block+1);
1226 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1227 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1229 co2 = _mm_extract_epi32(m1CO, 3);
1231 m1A = _mm_slli_epi32(m1A, 1);
1232 m2A = _mm_slli_epi32(m2A, 1);
1234 m1COshft = _mm_slli_si128 (m1CO, 4);
1235 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1239 co2 = _mm_extract_epi32(m2CO, 3);
1241 m2COshft = _mm_slli_si128 (m2CO, 4);
1242 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1244 m1A = _mm_or_si128(m1A, m1COshft);
1245 m2A = _mm_or_si128(m2A, m2COshft);
1247 m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block));
1248 m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1));
1250 mAcc = _mm_or_si128(mAcc, m1A);
1251 mAcc = _mm_or_si128(mAcc, m2A);
1253 _mm_store_si128(block, m1A);
1254 _mm_store_si128(block+1, m2A);
1260 if (_mm_testz_si128(mAcc, mAcc))
1271 bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1272 d |= (dmask & (w0 << di));
1285 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1286 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1288 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1289 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1291 #define VECT_BITCOUNT(first, last) \
1292 sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1294 #define VECT_BITCOUNT_AND(first, last, mask) \
1295 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1297 #define VECT_BITCOUNT_OR(first, last, mask) \
1298 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1300 #define VECT_BITCOUNT_XOR(first, last, mask) \
1301 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1303 #define VECT_BITCOUNT_SUB(first, last, mask) \
1304 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1306 #define VECT_INVERT_BLOCK(first) \
1307 sse2_invert_block((__m128i*)first);
1309 #define VECT_AND_BLOCK(dst, src) \
1310 sse4_and_block((__m128i*) dst, (__m128i*) (src))
1312 #define VECT_AND_DIGEST(dst, src) \
1313 sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1315 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1316 sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1318 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1319 sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1321 #define VECT_OR_BLOCK(dst, src) \
1322 sse2_or_block((__m128i*) dst, (__m128i*) (src))
1324 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1325 sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1327 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1328 sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1330 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1331 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1333 #define VECT_SUB_BLOCK(dst, src) \
1334 sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1336 #define VECT_SUB_DIGEST(dst, src) \
1337 sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1339 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1340 sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1342 #define VECT_XOR_BLOCK(dst, src) \
1343 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1345 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1346 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1348 #define VECT_COPY_BLOCK(dst, src) \
1349 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1351 #define VECT_STREAM_BLOCK(dst, src) \
1352 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1354 #define VECT_SET_BLOCK(dst, value) \
1355 sse2_set_block((__m128i*) dst, value)
1357 #define VECT_IS_ZERO_BLOCK(dst) \
1358 sse4_is_all_zero((__m128i*) dst)
1360 #define VECT_IS_ONE_BLOCK(dst) \
1361 sse4_is_all_one((__m128i*) dst)
1363 #define VECT_IS_DIGEST_ZERO(start) \
1364 sse4_is_digest_zero((__m128i*)start)
1366 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1367 sse4_block_set_digest((__m128i*)dst, val)
1369 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1370 sse4_lower_bound_scan_u32(arr, target, from, to)
1372 #define VECT_SHIFT_L1(b, acc, co) \
1373 sse42_shift_l1((__m128i*)b, acc, co)
1375 #define VECT_SHIFT_R1(b, acc, co) \
1376 sse42_shift_r1((__m128i*)b, acc, co)
1378 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1379 sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1381 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1382 sse42_idx_arr_block_lookup(idx, size, nb, start)
1384 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1385 sse42_set_block_bits(block, idx, start, stop)
1387 #define VECT_BLOCK_CHANGE(block) \
1388 sse42_bit_block_calc_change((__m128i*)block)
1392 #pragma GCC diagnostic pop
1396 #pragma warning( pop )
BMFORCEINLINE bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
unsigned sse4_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
const unsigned set_block_size
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end)
const unsigned set_word_shift
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift right by 1
bool sse4_is_all_zero(const __m128i *BMRESTRICT block)
check if block is all zero bits
unsigned long long int id64_t
bool sse4_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
AND block digest stride dst = *src1 & *src2.
unsigned sse42_bit_block_calc_change(const __m128i *BMRESTRICT block)
bm::id_t sse4_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func)
BMFORCEINLINE bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if a wave of 2 pointers is the same (NULL or FULL)
bool sse4_is_all_one(const __m128i *BMRESTRICT block)
check if block is all one bits
Compute functions for SSE SIMD instruction set (internal)
void sse42_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
BMFORCEINLINE unsigned op_and(unsigned a, unsigned b)
BMFORCEINLINE bool sse42_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
int sse42_cmpge_u32(__m128i vect4, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array...
bool sse4_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
unsigned sse4_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
BMFORCEINLINE unsigned op_or(unsigned a, unsigned b)
unsigned short gap_word_t
bool sse4_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND block digest stride dst &= *src.
bool sse4_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4)
AND block digest stride: dst &= *src1 & *src2 & *src3 & *src4.
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift left by 1
void sse4_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
bool sse4_is_digest_zero(const __m128i *BMRESTRICT block)
check if digest stride is all zero bits
bool sse4_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
SUB (AND NOT) block digest stride dst &= ~*src.
const unsigned set_block_mask
void sse4_block_set_digest(__m128i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
const unsigned set_word_mask
BMFORCEINLINE unsigned op_xor(unsigned a, unsigned b)
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *BMRESTRICT mask_block, bm::id64_t *digest)
block shift right by 1 plus AND
const unsigned set_block_shift
unsigned sse4_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND blocks, dst &= *src.
Bit manipulation primitives (internal)
const unsigned set_block_digest_wave_size