ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
electrokinetics/generated_kernels/philox_rand.h
// kernel generated with pystencils v1.0+12.g54b91e2, lbmpy
// v1.0+9.g19115d4.dirty, lbmpy_walberla/pystencils_walberla from commit
// e1fe2ad1dcbe8f31ea79d95e8a5a5cc0ee3691f3

#include <cstdint>

#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER)
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif

#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && \
    !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include <altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif

#if defined(__CUDA_ARCH__) || defined(__clang__) && defined(__CUDA__)
#if defined(__clang__) && defined(QUALIFIERS)
#undef QUALIFIERS
#endif
#define QUALIFIERS static __forceinline__ __device__
#else
#if defined(__clang__) && defined(QUALIFIERS)
#undef QUALIFIERS
#endif
#define QUALIFIERS inline
#include "myintrin.h"
#endif

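// Philox-4x32 round constants: PHILOX_W32_* are the Weyl-sequence key
// increments and PHILOX_M4x32_* the round multipliers from Salmon et al.,
// "Parallel random numbers: as easy as 1, 2, 3" (SC'11). TWOPOW53_INV_DOUBLE
// and TWOPOW32_INV_FLOAT are 2^-53 and 2^-32, used below to map integer
// output into the open interval (0, 1).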
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)

typedef std::uint32_t uint32;
typedef std::uint64_t uint64;

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && \
    __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#endif

QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32 *hip) {
#ifndef __CUDA_ARCH__
  // host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
  *hip = __mulhwu(a, b);
  return a * b;
#else
  uint64 product = ((uint64)a) * ((uint64)b);
  *hip = product >> 32;
  return (uint32)product;
#endif
#else
  // device code
  *hip = __umulhi(a, b);
  return a * b;
#endif
}

QUALIFIERS void _philox4x32round(uint32 *ctr, uint32 *key) {
  uint32 hi0;
  uint32 hi1;
  uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
  uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);

  ctr[0] = hi1 ^ ctr[1] ^ key[0];
  ctr[1] = lo1;
  ctr[2] = hi0 ^ ctr[3] ^ key[1];
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(uint32 *key) {
  key[0] += PHILOX_W32_0;
  key[1] += PHILOX_W32_1;
}

QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y) {
  // x ^ (y << 21) is below 2^53; scaling by 2^-53 and offsetting by 2^-54
  // maps it into the open interval (0, 1)
  double z = (double)((uint64)x ^ ((uint64)y << (53 - 32)));
  return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE / 2.0);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               double &rnd1, double &rnd2) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
  rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float &rnd1, float &rnd2, float &rnd3,
                              float &rnd4) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1 = (float)(ctr[0]) * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd2 = (float)(ctr[1]) * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd3 = (float)(ctr[2]) * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd4 = (float)(ctr[3]) * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
}
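
// Usage sketch (illustrative, not part of the generated kernel): Philox is a
// counter-based RNG, so each (ctr0..ctr3, key0, key1) tuple deterministically
// selects an independent sample; kernels typically put loop indices in the
// counter words and the user seed in the key. All names below are
// hypothetical.
#if 0
void example_scalar() {
  uint32 seed = 42u; // hypothetical key word
  double d1, d2;
  philox_double2(/*time_step=*/0u, /*cell_index=*/7u, 0u, 0u, seed, 0u, d1, d2);
  float f1, f2, f3, f4;
  philox_float4(/*time_step=*/0u, /*cell_index=*/7u, 0u, 0u, seed, 0u, f1, f2,
                f3, f4);
}
#endif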

#ifndef __CUDA_ARCH__
#if defined(__SSE4_1__) || defined(_MSC_VER)
QUALIFIERS void _philox4x32round(__m128i *ctr, __m128i *key) {
  __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi0b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32), _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
  __m128i lohi1b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32), _mm_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);

  __m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
  __m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
  __m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
  __m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m128i *key) {
  key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
  key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
    y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
  } else {
    x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
    y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
  }

  // calculate z = x ^ (y << (53 - 32))
  __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm_xor_si128(x, z);

  // convert uint64 to double
  __m128d rs = _my_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
  rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE),
                    _mm_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#else
  rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
  rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#endif

  return rs;
}

QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                              __m128i ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _my_cvtepu32_ps(ctr[0]);
  rnd2 = _my_cvtepu32_ps(ctr[1]);
  rnd3 = _my_cvtepu32_ps(ctr[2]);
  rnd4 = _my_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
  rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
#else
  rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#endif
}

QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                               __m128i ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1, __m128d &rnd2) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  __m128d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
#endif
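
// Illustrative sketch (assumes SSE4.1; not part of the generated kernel): the
// __m128i overload above broadcasts the scalar counter words and takes
// lane-distinct values in ctr1, so one call yields 16 independent floats.
// Variable names are hypothetical.
#if 0
void example_sse() {
  __m128i lane_ctr = _mm_setr_epi32(0, 1, 2, 3); // per-lane counter word
  __m128 r1, r2, r3, r4;
  philox_float4(0u, lane_ctr, 0u, 0u, /*seed=*/42u, 0u, r1, r2, r3, r4);
}
#endif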

#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector unsigned int *ctr,
                                 __vector unsigned int *key) {
#ifndef _ARCH_PWR8
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
  __vector unsigned int lohi0a =
      (__vector unsigned int)vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi0b =
      (__vector unsigned int)vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi1a =
      (__vector unsigned int)vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int lohi1b =
      (__vector unsigned int)vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));

#ifdef __LITTLE_ENDIAN__
  __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
  __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif

  ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__vector unsigned int *key) {
  key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
  key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}

#ifdef __VSX__
template <bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x,
                                              __vector unsigned int y) {
  // convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
  if (high) {
    x = vec_mergel(x, vec_splats(0U));
    y = vec_mergel(y, vec_splats(0U));
  } else {
    x = vec_mergeh(x, vec_splats(0U));
    y = vec_mergeh(y, vec_splats(0U));
  }
#else
  if (high) {
    x = vec_mergel(vec_splats(0U), x);
    y = vec_mergel(vec_splats(0U), y);
  } else {
    x = vec_mergeh(vec_splats(0U), x);
    y = vec_mergeh(vec_splats(0U), y);
  }
#endif

  // calculate z = x ^ (y << (53 - 32))
#ifdef _ARCH_PWR8
  __vector unsigned long long z =
      vec_sl((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#else
  __vector unsigned long long z =
      vec_vsld((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#endif
  z = vec_xor((__vector unsigned long long)x, z);

  // convert uint64 to double
#ifdef __xlC__
  __vector double rs = vec_ctd(z, 0);
#else
  __vector double rs = vec_ctf(z, 0);
#endif
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE),
                vec_splats(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}
#endif

QUALIFIERS void philox_float4(__vector unsigned int ctr0,
                              __vector unsigned int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = vec_ctf(ctr[0], 0);
  rnd2 = vec_ctf(ctr[1], 0);
  rnd3 = vec_ctf(ctr[2], 0);
  rnd4 = vec_ctf(ctr[3], 0);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
}

#ifdef __VSX__
QUALIFIERS void philox_double2(__vector unsigned int ctr0,
                               __vector unsigned int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif

QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1,
                              uint32 ctr2, uint32 ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __vector float &rnd1, __vector float &rnd2,
                              __vector float &rnd3, __vector float &rnd4) {
  philox_float4(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}

#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  __vector double ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __vector double &rnd1, __vector double &rnd2) {
  philox_double2(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif
#endif

#if defined(__ARM_NEON)
QUALIFIERS void _philox4x32round(uint32x4_t *ctr, uint32x4_t *key) {
  uint32x4_t lohi0a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi0b = vreinterpretq_u32_u64(
      vmull_high_u32(ctr[0], vdupq_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi1a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
  uint32x4_t lohi1b = vreinterpretq_u32_u64(
      vmull_high_u32(ctr[2], vdupq_n_u32(PHILOX_M4x32_1)));

  uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
  uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
  uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
  uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);

  ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(uint32x4_t *key) {
  key[0] = vaddq_u32(key[0], vdupq_n_u32(PHILOX_W32_0));
  key[1] = vaddq_u32(key[1], vdupq_n_u32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS float64x2_t _uniform_double_hq(uint32x4_t x, uint32x4_t y) {
  // convert 32 to 64 bit
  if (high) {
    x = vzip2q_u32(x, vdupq_n_u32(0));
    y = vzip2q_u32(y, vdupq_n_u32(0));
  } else {
    x = vzip1q_u32(x, vdupq_n_u32(0));
    y = vzip1q_u32(y, vdupq_n_u32(0));
  }

  // calculate z = x ^ (y << (53 - 32))
  uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
  z = veorq_u64(vreinterpretq_u64_u32(x), z);

  // convert uint64 to double
  float64x2_t rs = vcvtq_f64_u64(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE / 2.0),
                 vdupq_n_f64(TWOPOW53_INV_DOUBLE), rs);

  return rs;
}

QUALIFIERS void philox_float4(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2,
                              uint32x4_t ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = vcvtq_f32_u32(ctr[0]);
  rnd2 = vcvtq_f32_u32(ctr[1]);
  rnd3 = vcvtq_f32_u32(ctr[2]);
  rnd4 = vcvtq_f32_u32(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0),
                   vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd1);
  rnd2 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0),
                   vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd2);
  rnd3 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0),
                   vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd3);
  rnd4 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0),
                   vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd4);
}

QUALIFIERS void philox_double2(uint32x4_t ctr0, uint32x4_t ctr1,
                               uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                               uint32 key1, float64x2_t &rnd1lo,
                               float64x2_t &rnd1hi, float64x2_t &rnd2lo,
                               float64x2_t &rnd2hi) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1lo, float64x2_t &rnd1hi,
                               float64x2_t &rnd2lo, float64x2_t &rnd2hi) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  float64x2_t ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif

#if defined(__ARM_FEATURE_SVE)
QUALIFIERS void _philox4x32round(svuint32x4_t &ctr, svuint32x2_t &key) {
  svuint32_t lo0 =
      svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
  svuint32_t lo1 =
      svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
  svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                                svdup_u32(PHILOX_M4x32_0));
  svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                                svdup_u32(PHILOX_M4x32_1));

  ctr = svset4_u32(
      ctr, 0,
      sveor_u32_x(svptrue_b32(),
                  sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)),
                  svget2_u32(key, 0)));
  ctr = svset4_u32(ctr, 1, lo1);
  ctr = svset4_u32(
      ctr, 2,
      sveor_u32_x(svptrue_b32(),
                  sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)),
                  svget2_u32(key, 1)));
  ctr = svset4_u32(ctr, 3, lo0);
}

QUALIFIERS void _philox4x32bumpkey(svuint32x2_t &key) {
  key = svset2_u32(
      key, 0,
      svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
  key = svset2_u32(
      key, 1,
      svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
}

template <bool high>
QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) {
  // convert 32 to 64 bit
  if (high) {
    x = svzip2_u32(x, svdup_u32(0));
    y = svzip2_u32(y, svdup_u32(0));
  } else {
    x = svzip1_u32(x, svdup_u32(0));
    y = svzip1_u32(y, svdup_u32(0));
  }

  // calculate z = x ^ (y << (53 - 32))
  svuint64_t z =
      svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
  z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);

  // convert uint64 to double
  svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE),
                   svdup_f64(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}

QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
                              svuint32_t ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
  rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
  rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
  rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0));
  rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0));
  rnd3 = svmad_f32_x(svptrue_b32(), rnd3, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0));
  rnd4 = svmad_f32_x(svptrue_b32(), rnd4, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0));
}

QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1,
                               svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                               uint32 key1, svfloat64_st &rnd1lo,
                               svfloat64_st &rnd1hi, svfloat64_st &rnd2lo,
                               svfloat64_st &rnd2hi) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
  rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
}

QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1lo, svfloat64_st &rnd1hi,
                               svfloat64_st &rnd2lo, svfloat64_st &rnd2hi) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1, svfloat64_st &rnd2) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  svfloat64_st ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1, svfloat64_st &rnd2) {
  philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif

#ifdef __AVX2__
QUALIFIERS void _philox4x32round(__m256i *ctr, __m256i *key) {
  __m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
  __m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);

  __m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
  __m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
  __m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
  __m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m256i *key) {
  key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
  key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
  } else {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
  }

  // calculate z = x ^ (y << (53 - 32))
  __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm256_xor_si256(x, z);

  // convert uint64 to double
  __m256d rs = _my256_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
  rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE),
                       _mm256_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#else
  rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
  rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#endif

  return rs;
}

QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                              __m256i ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _my256_cvtepu32_ps(ctr[0]);
  rnd2 = _my256_cvtepu32_ps(ctr[1]);
  rnd3 = _my256_cvtepu32_ps(ctr[2]);
  rnd4 = _my256_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
  rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
#else
  rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#endif
}

QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                               __m256i ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1, __m256d &rnd2) {
#if 0
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  __m256d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
  rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif

#ifdef __AVX512F__
QUALIFIERS void _philox4x32round(__m512i *ctr, __m512i *key) {
  __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
  __m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
  lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
  lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
  lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);

  __m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
  __m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
  __m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
  __m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m512i *key) {
  key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
  key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
  } else {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
  }

  // calculate z = x ^ (y << (53 - 32))
  __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm512_xor_si512(x, z);

  // convert uint64 to double
  __m512d rs = _mm512_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE),
                       _mm512_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}

QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                              __m512i ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _mm512_cvtepu32_ps(ctr[0]);
  rnd2 = _mm512_cvtepu32_ps(ctr[1]);
  rnd3 = _mm512_cvtepu32_ps(ctr[2]);
  rnd4 = _mm512_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
  rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0));
}

QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                               __m512i ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1, __m512d &rnd2) {
#if 0
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  __m512d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
  rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
#endif