#if defined(__SSE2__) || defined(_MSC_VER)
#elif defined(__SSE4_1__) || defined(_MSC_VER)
#ifdef __ARM_FEATURE_SVE
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) &&       \
    !defined(__xlC__)
#include <ppu_intrinsics.h>
#include <pveclib/vec_int64_ppc.h>
#if defined(__CUDA_ARCH__) || (defined(__clang__) && defined(__CUDA__))
#if defined(__clang__) && defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#if defined(__clang__) && defined(QUALIFIERS)
#define QUALIFIERS inline
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
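// PHILOX_W32_* are the Weyl key increments and PHILOX_M4x32_* the round
// multipliers of the Philox-4x32 generator (Salmon et al., "Parallel Random
// Numbers: As Easy as 1, 2, 3"). TWOPOW53_INV_DOUBLE = 2^-53 and
// TWOPOW32_INV_FLOAT = 2^-32 rescale 53-bit and 32-bit random integers to the
// unit interval.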
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) &&          \
    __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
  *hip = __mulhwu(a, b); // upper 32 bits of the 32x32-bit product (PowerPC builtin)
  *hip = __umulhi(a, b); // upper 32 bits of the 32x32-bit product (CUDA builtin)
  ctr[0] = hi1 ^ ctr[1] ^ key[0];
  ctr[2] = hi0 ^ ctr[3] ^ key[1];
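  // One Philox round: the high halves of the two 32x32-bit products are XORed
  // with the other counter words and the round keys; the low halves become the
  // new odd counter words ctr[1] and ctr[3].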
  double z = (double)((uint64)x ^ ((uint64)y << (53 - 32)));
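  // x supplies 32 random bits and y, shifted up by 21, the remaining bits of a
  // 53-bit integer, which a double represents exactly; scaled by
  // TWOPOW53_INV_DOUBLE this becomes a uniform value in the unit interval.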
                               double &rnd1, double &rnd2) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
                              float &rnd1, float &rnd2, float &rnd3,
                              float &rnd4) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
#if defined(__SSE4_1__) || defined(_MSC_VER)
  __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi0b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32), _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
  __m128i lohi1b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32), _mm_set1_epi32(PHILOX_M4x32_1));
  // 0xD8 swaps the middle lanes so each register holds the two low product
  // halves followed by the two high halves; the unpacks then separate lo/hi.
  lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);
  __m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
  __m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
  __m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
  __m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
  ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
  // _philox4x32bumpkey: advance both key words by the Weyl constants.
  key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
  key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
    x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
    y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
    x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
    y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
  __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm_xor_si128(x, z);
  __m128d rs = _my_cvtepu64_pd(z);
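  // On x86, only AVX-512 provides a native unsigned 64-bit to double
  // conversion (_mm512_cvtepu64_pd); _my_cvtepu64_pd and _my256_cvtepu64_pd
  // are presumably helper emulations used on the SSE and AVX paths.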
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _my_cvtepu32_ps(ctr[0]);
  rnd2 = _my_cvtepu32_ps(ctr[1]);
  rnd3 = _my_cvtepu32_ps(ctr[2]);
  rnd4 = _my_cvtepu32_ps(ctr[3]);
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
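  // Each pair of 32-bit counter words provides the 53 random bits for one set
  // of doubles; the boolean template argument of _uniform_double_hq selects
  // whether the lower or upper half of the vector lanes is widened, so the
  // four counter words yield four __m128d results.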
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               __m128d &rnd1, __m128d &rnd2) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
                                 __vector unsigned int *key) {
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int lohi0a =
      (__vector unsigned int)vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi0b =
      (__vector unsigned int)vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi1a =
      (__vector unsigned int)vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int lohi1b =
      (__vector unsigned int)vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
  __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
  __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
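  // vec_mule/vec_mulo form 64-bit products of the even/odd 32-bit lanes; the
  // even/odd merges above reassemble them into separate vectors of low and
  // high product halves, with the merge choice flipped by endianness.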
  ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
  ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
                                              __vector unsigned int y) {
#ifdef __LITTLE_ENDIAN__
    x = vec_mergel(x, vec_splats(0U));
    y = vec_mergel(y, vec_splats(0U));
    x = vec_mergeh(x, vec_splats(0U));
    y = vec_mergeh(y, vec_splats(0U));
    x = vec_mergel(vec_splats(0U), x);
    y = vec_mergel(vec_splats(0U), y);
    x = vec_mergeh(vec_splats(0U), x);
    y = vec_mergeh(vec_splats(0U), y);
  __vector unsigned long long z =
      vec_sl((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
  __vector unsigned long long z =
      vec_vsld((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
  z = vec_xor((__vector unsigned long long)x, z);
  __vector double rs = vec_ctd(z, 0);
  __vector double rs = vec_ctf(z, 0);
                              __vector unsigned int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = vec_ctf(ctr[0], 0);
  rnd2 = vec_ctf(ctr[1], 0);
  rnd3 = vec_ctf(ctr[2], 0);
  rnd4 = vec_ctf(ctr[3], 0);
                               __vector unsigned int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                              __vector float &rnd1, __vector float &rnd2,
                              __vector float &rnd3, __vector float &rnd4) {
  philox_float4(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);
  __vector double ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
                               __vector double &rnd1, __vector double &rnd2) {
  philox_double2(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
#if defined(__ARM_NEON)
  uint32x4_t lohi0a = vreinterpretq_u32_u64(
  uint32x4_t lohi0b = vreinterpretq_u32_u64(
  uint32x4_t lohi1a = vreinterpretq_u32_u64(
  uint32x4_t lohi1b = vreinterpretq_u32_u64(
  uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
  uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
  uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
  uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);
  ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
  ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
    x = vzip2q_u32(x, vdupq_n_u32(0));
    y = vzip2q_u32(y, vdupq_n_u32(0));
    x = vzip1q_u32(x, vdupq_n_u32(0));
    y = vzip1q_u32(y, vdupq_n_u32(0));
  uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
  z = veorq_u64(vreinterpretq_u64_u32(x), z);
  float64x2_t rs = vcvtq_f64_u64(z);
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = vcvtq_f32_u32(ctr[0]);
  rnd2 = vcvtq_f32_u32(ctr[1]);
  rnd3 = vcvtq_f32_u32(ctr[2]);
  rnd4 = vcvtq_f32_u32(ctr[3]);
                               uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                               uint32 key1, float64x2_t &rnd1lo,
                               float64x2_t &rnd1hi, float64x2_t &rnd2lo,
                               float64x2_t &rnd2hi) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
                               float64x2_t &rnd1lo, float64x2_t &rnd1hi,
                               float64x2_t &rnd2lo, float64x2_t &rnd2hi) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
#if defined(__ARM_FEATURE_SVE)
  svuint32_t lo0 =
      svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
  svuint32_t lo1 =
      svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
  svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                                svdup_u32(PHILOX_M4x32_0));
  svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                                svdup_u32(PHILOX_M4x32_1));
  ctr = svset4_u32(ctr, 0,
                   sveor_u32_x(svptrue_b32(),
                               sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)),
                               svget2_u32(key, 0)));
  ctr = svset4_u32(ctr, 1, lo1);
  ctr = svset4_u32(ctr, 2,
                   sveor_u32_x(svptrue_b32(),
                               sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)),
                               svget2_u32(key, 1)));
  ctr = svset4_u32(ctr, 3, lo0);
  key = svset2_u32(key, 0,
                   svadd_u32_x(svptrue_b32(), svget2_u32(key, 0),
                               svdup_u32(PHILOX_W32_0)));
  key = svset2_u32(key, 1,
                   svadd_u32_x(svptrue_b32(), svget2_u32(key, 1),
                               svdup_u32(PHILOX_W32_1)));
    x = svzip2_u32(x, svdup_u32(0));
    y = svzip2_u32(y, svdup_u32(0));
    x = svzip1_u32(x, svdup_u32(0));
    y = svzip1_u32(y, svdup_u32(0));
  svuint64_t z =
      svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
  z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);
  svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
  rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
  rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
  rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
                               svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                               uint32 key1, svfloat64_st &rnd1lo,
                               svfloat64_st &rnd1hi, svfloat64_st &rnd2lo,
                               svfloat64_st &rnd2hi) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
  rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
                               svfloat64_st &rnd1lo, svfloat64_st &rnd1hi,
                               svfloat64_st &rnd2lo, svfloat64_st &rnd2hi) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               svfloat64_st &rnd1, svfloat64_st &rnd2) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
                               svfloat64_st &rnd1, svfloat64_st &rnd2) {
  philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
  __m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
  __m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_1));
  lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);
  __m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
  __m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
  __m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
  __m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
  ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
  key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
  key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
  __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm256_xor_si256(x, z);
  __m256d rs = _my256_cvtepu64_pd(z);
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _my256_cvtepu32_ps(ctr[0]);
  rnd2 = _my256_cvtepu32_ps(ctr[1]);
  rnd3 = _my256_cvtepu32_ps(ctr[2]);
  rnd4 = _my256_cvtepu32_ps(ctr[3]);
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               __m256d &rnd1, __m256d &rnd2) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
  __m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
  rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
  __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
  __m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_1));
  lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
  lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
  lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
  lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);
  __m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
  __m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
  __m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
  __m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
  ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
  key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
  key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
  __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm512_xor_si512(x, z);
  __m512d rs = _mm512_cvtepu64_pd(z);
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _mm512_cvtepu32_ps(ctr[0]);
  rnd2 = _mm512_cvtepu32_ps(ctr[1]);
  rnd3 = _mm512_cvtepu32_ps(ctr[2]);
  rnd4 = _mm512_cvtepu32_ps(ctr[3]);
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
                               __m512d &rnd1, __m512d &rnd2) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
  __m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
  rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
// Scalar reference interface; the SIMD overloads above mirror these signatures.
QUALIFIERS void _philox4x32bumpkey(uint32 *key);
QUALIFIERS void _philox4x32round(uint32 *ctr, uint32 *key);
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32 *hip);
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y);
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               double &rnd1, double &rnd2);
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float &rnd1, float &rnd2, float &rnd3,
                              float &rnd4);
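// Usage sketch (illustration only, not part of this header): Philox is
// counter-based, so every result is a pure function of the four counter words
// and the two key words; loop indices typically serve as counters and a user
// seed as the key. The function and variable names below are hypothetical.
#if 0
#include <cstdio>

void print_philox_samples(uint32 seed) {
  for (uint32 i = 0; i < 4; ++i) {
    float r1, r2, r3, r4;
    // Identical (counter, key) input always reproduces the same four floats.
    philox_float4(i, 0u, 0u, 0u, seed, 0u, r1, r2, r3, r4);
    std::printf("%f %f %f %f\n", r1, r2, r3, r4);
  }
}
#endif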