#include "mdds/global.hpp"

#include "../types.hpp"

#include <cstdint>

#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX2__)
#include <immintrin.h>
#endif
19namespace mdds {
namespace mtv {
namespace soa {
namespace detail {
21template<
typename Blks, lu_factor_t F>
24 void operator()(Blks& , int64_t , int64_t )
const
27 mdds::detail::invalid_static_int<F>,
"The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
31template<
typename Blks>
34 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
36 int64_t n = block_store.positions.size();
38 if (start_block_index >= n)
42#pragma omp parallel for
44 for (int64_t i = start_block_index; i < n; ++i)
45 block_store.positions[i] += delta;
49template<
typename Blks>
52 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
54 int64_t n = block_store.positions.size();
56 if (start_block_index >= n)
60 int64_t len = n - start_block_index;
61 int64_t rem = len & 3;
63 len += start_block_index;
65#pragma omp parallel for
67 for (int64_t i = start_block_index; i < len; i += 4)
69 block_store.positions[i + 0] += delta;
70 block_store.positions[i + 1] += delta;
71 block_store.positions[i + 2] += delta;
72 block_store.positions[i + 3] += delta;
76 for (int64_t i = len; i < rem; ++i)
77 block_store.positions[i] += delta;
81template<
typename Blks>
84 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
86 int64_t n = block_store.positions.size();
88 if (start_block_index >= n)
92 int64_t len = n - start_block_index;
93 int64_t rem = len & 7;
95 len += start_block_index;
97#pragma omp parallel for
99 for (int64_t i = start_block_index; i < len; i += 8)
101 block_store.positions[i + 0] += delta;
102 block_store.positions[i + 1] += delta;
103 block_store.positions[i + 2] += delta;
104 block_store.positions[i + 3] += delta;
105 block_store.positions[i + 4] += delta;
106 block_store.positions[i + 5] += delta;
107 block_store.positions[i + 6] += delta;
108 block_store.positions[i + 7] += delta;
112 for (int64_t i = len; i < rem; ++i)
113 block_store.positions[i] += delta;
117template<
typename Blks>
120 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
122 int64_t n = block_store.positions.size();
124 if (start_block_index >= n)
128 int64_t len = n - start_block_index;
129 int64_t rem = len & 15;
131 len += start_block_index;
133#pragma omp parallel for
135 for (int64_t i = start_block_index; i < len; i += 16)
137 block_store.positions[i + 0] += delta;
138 block_store.positions[i + 1] += delta;
139 block_store.positions[i + 2] += delta;
140 block_store.positions[i + 3] += delta;
141 block_store.positions[i + 4] += delta;
142 block_store.positions[i + 5] += delta;
143 block_store.positions[i + 6] += delta;
144 block_store.positions[i + 7] += delta;
145 block_store.positions[i + 8] += delta;
146 block_store.positions[i + 9] += delta;
147 block_store.positions[i + 10] += delta;
148 block_store.positions[i + 11] += delta;
149 block_store.positions[i + 12] += delta;
150 block_store.positions[i + 13] += delta;
151 block_store.positions[i + 14] += delta;
152 block_store.positions[i + 15] += delta;
156 for (int64_t i = len; i < rem; ++i)
157 block_store.positions[i] += delta;
161template<
typename Blks>
164 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
166 int64_t n = block_store.positions.size();
168 if (start_block_index >= n)
172 int64_t len = n - start_block_index;
173 int64_t rem = len & 31;
175 len += start_block_index;
177#pragma omp parallel for
179 for (int64_t i = start_block_index; i < len; i += 32)
181 block_store.positions[i + 0] += delta;
182 block_store.positions[i + 1] += delta;
183 block_store.positions[i + 2] += delta;
184 block_store.positions[i + 3] += delta;
185 block_store.positions[i + 4] += delta;
186 block_store.positions[i + 5] += delta;
187 block_store.positions[i + 6] += delta;
188 block_store.positions[i + 7] += delta;
189 block_store.positions[i + 8] += delta;
190 block_store.positions[i + 9] += delta;
191 block_store.positions[i + 10] += delta;
192 block_store.positions[i + 11] += delta;
193 block_store.positions[i + 12] += delta;
194 block_store.positions[i + 13] += delta;
195 block_store.positions[i + 14] += delta;
196 block_store.positions[i + 15] += delta;
197 block_store.positions[i + 16] += delta;
198 block_store.positions[i + 17] += delta;
199 block_store.positions[i + 18] += delta;
200 block_store.positions[i + 19] += delta;
201 block_store.positions[i + 20] += delta;
202 block_store.positions[i + 21] += delta;
203 block_store.positions[i + 22] += delta;
204 block_store.positions[i + 23] += delta;
205 block_store.positions[i + 24] += delta;
206 block_store.positions[i + 25] += delta;
207 block_store.positions[i + 26] += delta;
208 block_store.positions[i + 27] += delta;
209 block_store.positions[i + 28] += delta;
210 block_store.positions[i + 29] += delta;
211 block_store.positions[i + 30] += delta;
212 block_store.positions[i + 31] += delta;
216 for (int64_t i = len; i < rem; ++i)
217 block_store.positions[i] += delta;
223template<
typename Blks>
226 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
229 sizeof(
typename decltype(block_store.positions)
::value_type) == 8,
230 "This code works only when the position values are 64-bit wide.");
232 int64_t n = block_store.positions.size();
234 if (start_block_index >= n)
238 int64_t len = n - start_block_index;
243 len += start_block_index;
245 __m128i right = _mm_set_epi64x(delta, delta);
248#pragma omp parallel for
250 for (int64_t i = start_block_index; i < len; i += 2)
252 __m128i* dst = (__m128i*)&block_store.positions[i];
253 _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
257 block_store.positions[len] += delta;
261template<
typename Blks>
262struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
264 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
267 sizeof(
typename decltype(block_store.positions)
::value_type) == 8,
268 "This code works only when the position values are 64-bit wide.");
270 int64_t n = block_store.positions.size();
272 if (start_block_index >= n)
276 int64_t len = n - start_block_index;
277 int64_t rem = len & 7;
279 len += start_block_index;
281 __m128i right = _mm_set_epi64x(delta, delta);
284#pragma omp parallel for
286 for (int64_t i = start_block_index; i < len; i += 8)
288 __m128i* dst0 = (__m128i*)&block_store.positions[i];
289 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
291 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
292 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
294 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
295 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
297 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
298 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
302 for (int64_t i = len; i < rem; ++i)
303 block_store.positions[i] += delta;
307template<
typename Blks>
310 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
313 sizeof(
typename decltype(block_store.positions)::value_type) == 8,
314 "This code works only when the position values are 64-bit wide.");
316 int64_t n = block_store.positions.size();
318 if (start_block_index >= n)
322 int64_t len = n - start_block_index;
323 int64_t rem = len & 15;
325 len += start_block_index;
327 __m128i right = _mm_set_epi64x(delta, delta);
330#pragma omp parallel for
332 for (int64_t i = start_block_index; i < len; i += 16)
334 __m128i* dst0 = (__m128i*)&block_store.positions[i];
335 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
337 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
338 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
340 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
341 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
343 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
344 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
346 __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
347 _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));
349 __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
350 _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));
352 __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
353 _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));
355 __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
356 _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
360 for (int64_t i = len; i < rem; ++i)
361 block_store.positions[i] += delta;
365template<
typename Blks>
368 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
371 sizeof(
typename decltype(block_store.positions)::value_type) == 8,
372 "This code works only when the position values are 64-bit wide.");
374 int64_t n = block_store.positions.size();
376 if (start_block_index >= n)
380 int64_t len = n - start_block_index;
381 int64_t rem = len & 31;
383 len += start_block_index;
385 __m128i right = _mm_set_epi64x(delta, delta);
388#pragma omp parallel for
390 for (int64_t i = start_block_index; i < len; i += 32)
392 __m128i* dst0 = (__m128i*)&block_store.positions[i];
393 _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));
395 __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
396 _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));
398 __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
399 _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));
401 __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
402 _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
404 __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
405 _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));
407 __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
408 _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));
410 __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
411 _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));
413 __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
414 _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
416 __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
417 _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));
419 __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
420 _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));
422 __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
423 _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));
425 __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
426 _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));
428 __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
429 _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));
431 __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
432 _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));
434 __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
435 _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));
437 __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
438 _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
442 for (int64_t i = len; i < rem; ++i)
443 block_store.positions[i] += delta;
451template<
typename Blks>
454 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
457 sizeof(
typename decltype(block_store.positions)::value_type) == 8,
458 "This code works only when the position values are 64-bit wide.");
460 int64_t n = block_store.positions.size();
462 if (start_block_index >= n)
466 int64_t len = n - start_block_index;
467 int64_t rem = len & 3;
469 len += start_block_index;
471 __m256i right = _mm256_set1_epi64x(delta);
474#pragma omp parallel for
476 for (int64_t i = start_block_index; i < len; i += 4)
478 __m256i* dst = (__m256i*)&block_store.positions[i];
479 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
483 for (int64_t i = len; i < rem; ++i)
484 block_store.positions[i] += delta;
488template<
typename Blks>
491 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
494 sizeof(
typename decltype(block_store.positions)::value_type) == 8,
495 "This code works only when the position values are 64-bit wide.");
497 int64_t n = block_store.positions.size();
499 if (start_block_index >= n)
503 int64_t len = n - start_block_index;
504 int64_t rem = len & 15;
506 len += start_block_index;
508 __m256i right = _mm256_set1_epi64x(delta);
511#pragma omp parallel for
513 for (int64_t i = start_block_index; i < len; i += 16)
515 __m256i* dst = (__m256i*)&block_store.positions[i];
516 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
518 __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
519 _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));
521 __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
522 _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));
524 __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
525 _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
529 for (int64_t i = len; i < rem; ++i)
530 block_store.positions[i] += delta;
534template<
typename Blks>
537 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta)
const
540 sizeof(
typename decltype(block_store.positions)::value_type) == 8,
541 "This code works only when the position values are 64-bit wide.");
543 int64_t n = block_store.positions.size();
545 if (start_block_index >= n)
549 int64_t len = n - start_block_index;
550 int64_t rem = len & 31;
552 len += start_block_index;
554 __m256i right = _mm256_set1_epi64x(delta);
557#pragma omp parallel for
559 for (int64_t i = start_block_index; i < len; i += 32)
561 __m256i* dst = (__m256i*)&block_store.positions[i];
562 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
564 __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
565 _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));
567 __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
568 _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));
570 __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
571 _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
573 __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
574 _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));
576 __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
577 _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));
579 __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
580 _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));
582 __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
583 _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
587 for (int64_t i = len; i < rem; ++i)
588 block_store.positions[i] += delta;
// NOTE: documentation-extraction artifact ("Definition soa/block_util.hpp:23"), not code.