BitMagic-C++
bmsse_util.h
#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse_util.h
    \brief Compute functions for SSE SIMD instruction set (internal)
*/

namespace bm
{

/** @defgroup SSE2 SSE2 functions
    Processor specific optimizations for SSE2 instructions (internals)
    @internal
    @ingroup bvector
 */

#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

/*!
    @brief SSE2 reinitialization guard class

    SSE2 requires a call to _mm_empty() when MMX integer instructions
    are intermixed with floating-point arithmetic. This class guards
    critical code fragments where SSE2 integer code is used.

    As of 2015, _mm_empty() is considered deprecated and is not even
    recognized by some compilers (like MSVC) in 64-bit mode.
    As MMX instructions age out, the use of _mm_empty() here is
    deprecated and commented out.

    @ingroup SSE2
*/
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard()
    {
        //_mm_empty();
    }

    BMFORCEINLINE ~sse_empty_guard()
    {
        //_mm_empty();
    }
};
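
// Illustrative usage sketch (not part of the library API): the guard is
// scoped around SSE2 integer work so that, in builds where _mm_empty()
// was still active, MMX state would be cleared on entry and exit.
//
//   {
//       bm::sse_empty_guard guard;  // would call _mm_empty() in old builds
//       bm::sse2_invert_block(blk); // SSE2 integer work on a bit block
//   }                               // guard destructor runs here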


/*!
    @brief XOR array elements against the specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
inline
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}
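
// Scalar equivalent for reference (illustrative only): every 32-bit word
// of the source range is XORed with the same broadcast mask.
//
//   const bm::word_t* s = (const bm::word_t*)src;
//   bm::word_t*       d = (bm::word_t*)dst;
//   for ( ; s < (const bm::word_t*)src_end; ++s, ++d)
//       *d = *s ^ mask;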


/*!
    @brief Invert array elements and AND them against the specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
inline
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM)); // (~src) & xM
        _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}
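
// Note on operand order (illustrative): _mm_andnot_si128(a, b) computes
// (~a) & b, so the vector to be inverted goes first and the mask second:
//
//   __m128i r = _mm_andnot_si128(v, xM); // r = (~v) & xM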

/*!
    @brief AND blocks
    *dst &= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
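
// Illustrative usage sketch (pointer names are hypothetical): the return
// value is the OR of all result words, so zero means the AND produced an
// empty block.
//
//   unsigned any = bm::sse2_and_block((__m128i*)dst_blk,
//                                     (const __m128i*)src_blk);
//   if (!any)
//   {
//       // result block is all zero; the caller may release or flag it
//   }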

/*!
    @brief AND array elements against another array (unaligned)
    *dst &= *src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_and_arr_unal(__m128i* BMRESTRICT dst,
                           const __m128i* BMRESTRICT src,
                           const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_loadu_si128(src+0);
        m2A = _mm_load_si128(dst+0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst+0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_loadu_si128(src+1);
        m2B = _mm_load_si128(dst+1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst+1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_loadu_si128(src+2);
        m2C = _mm_load_si128(dst+2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst+2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_loadu_si128(src+3);
        m2D = _mm_load_si128(dst+3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst+3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

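/*!
    @brief AND array elements against another array (aligned)
    *dst &= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/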
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src,
                        const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief OR array elements against another array
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_block(__m128i* BMRESTRICT dst,
                   const __m128i* BMRESTRICT src)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));

    return (maskA == 0xFFFFu);
}
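
// Illustrative usage sketch (pointer names are hypothetical): a true return
// means every word of the result is all-ones, i.e. the block became full.
//
//   bool all_set = bm::sse2_or_block((__m128i*)dst_blk,
//                                    (const __m128i*)src_blk);
//   if (all_set)
//   {
//       // block is full; the caller may swap in a FULL-block marker
//   }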

/*!
    @brief OR array elements against another array (unaligned)
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_arr_unal(__m128i* BMRESTRICT dst,
                      const __m128i* BMRESTRICT src,
                      const __m128i* BMRESTRICT src_end)
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    do
    {
        m1A = _mm_loadu_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_loadu_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_loadu_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_loadu_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR 2 blocks and copy the result to the destination
    *dst = *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_2way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR array elements against 2 other arrays
    *dst |= *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_3way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR array elements against 4 other arrays
    *dst |= *src1 | *src2 | *src3 | *src4
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_5way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2,
                        const __m128i* BMRESTRICT src3,
                        const __m128i* BMRESTRICT src4)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF

    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));

        _mm_stream_si128(dst + 0, m1A);
        _mm_stream_si128(dst + 1, m1B);
        _mm_stream_si128(dst + 2, m1C);
        _mm_stream_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;

        _mm_prefetch ((const char*)src3, _MM_HINT_T0);
        _mm_prefetch ((const char*)src4, _MM_HINT_T0);

        dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
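
// Design note (illustrative): unlike the other OR variants, the 5-way
// version writes with _mm_stream_si128 (non-temporal stores that bypass
// the cache) and prefetches the later sources. Non-temporal stores are
// weakly ordered, so code publishing the block to another thread would
// typically fence first:
//
//   _mm_sfence(); // assumed to be issued by the caller before publication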



/*!
    @brief XOR block against another
    *dst ^= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

/*!
    @brief 3-operand XOR
    *dst = *src1 ^ *src2
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block_2way(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src1,
                             const __m128i* BMRESTRICT src2)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src1_end =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_sub_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src)
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value)
{
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i xmm0 = _mm_set1_epi32((int)value);
    do
    {
        _mm_store_si128(dst, xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
    } while (dst < dst_end);
}
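
// Illustrative usage sketch: clear or fill a whole block by broadcasting
// the word value.
//
//   bm::sse2_set_block((__m128i*)blk, 0u);  // all bits to 0
//   bm::sse2_set_block((__m128i*)blk, ~0u); // all bits to 1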

/*!
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}

/*!
    @brief SSE2 block copy (non-temporal stream stores)
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_stream_block(__m128i* BMRESTRICT dst,
                       const __m128i* BMRESTRICT src)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}
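
// Illustrative note: sse2_copy_block keeps the destination lines in cache,
// while sse2_stream_block writes around the cache; streaming is the better
// fit when the copied block will not be read again soon.
//
//   bm::sse2_stream_block((__m128i*)dst_blk, (const __m128i*)src_blk);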


/*!
    @brief Invert bit block
    *dst = ~*dst

    @ingroup SSE2
*/
inline
void sse2_invert_block(__m128i* dst)
{
    __m128i maskF = _mm_set1_epi32(~0u);
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i mA, mB, mC, mD;
    do
    {
        mA = _mm_load_si128(dst + 0);
        mA = _mm_xor_si128(mA, maskF);
        _mm_store_si128(dst + 0, mA);

        mB = _mm_load_si128(dst + 1);
        mB = _mm_xor_si128(mB, maskF);
        _mm_store_si128(dst + 1, mB);

        mC = _mm_load_si128(dst + 2);
        mC = _mm_xor_si128(mC, maskF);
        _mm_store_si128(dst + 2, mC);

        mD = _mm_load_si128(dst + 3);
        mD = _mm_xor_si128(mD, maskF);
        _mm_store_si128(dst + 3, mD);

        dst += 4;

    } while (dst < dst_end);
}

BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}


BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a);
}
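
// Note (illustrative): sse2_sub computes a & ~b (set difference "a minus b")
// by swapping the operands into _mm_andnot_si128, which evaluates
// (~first) & second:
//
//   __m128i r = bm::sse2_sub(a, b); // r = a & ~b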


/*!
    @brief Gap block population count (array sum) utility
    @param pbuf - unrolled, aligned to 1-start GAP buffer
    @param sse_vect_waves - number of SSE vector lines to process
    @param sum - result accumulator
    @return tail pointer

    @internal
    @ingroup SSE2
*/
inline
const bm::gap_word_t* sse2_gap_sum_arr(
                            const bm::gap_word_t* BMRESTRICT pbuf,
                            unsigned sse_vect_waves,
                            unsigned* sum)
{
    __m128i xcnt = _mm_setzero_si128();

    for (unsigned i = 0; i < sse_vect_waves; ++i)
    {
        __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
        __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
        __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
        xcnt = _mm_add_epi16(xcnt, mm_s2);
        pbuf += 16;
    }
    xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);

    unsigned short* cnt8 = (unsigned short*)&xcnt;
    *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
    return pbuf;
}
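
// Rough scalar picture (illustrative, not library code): with xcnt viewed
// as eight 16-bit lanes x0..x7, the shift-and-subtract leaves (x1-x0),
// (x3-x2), (x5-x4), (x7-x6) in the low halves of the four 32-bit lanes,
// and the final line adds those four differences:
//
//   sum += (x1 - x0) + (x3 - x2) + (x5 - x4) + (x7 - x6); // each mod 2^16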

#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


} // namespace



#endif