#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4
#endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
#include <arm_sve.h>
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && \
    !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#include <pveclib/vec_int64_ppc.h>
#include <riscv_vector.h>
#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
#define SVE_QUALIFIERS __arm_streaming_compatible
#elif defined(__ARM_FEATURE_SME)
#define SVE_QUALIFIERS __arm_streaming
#else
#define SVE_QUALIFIERS
#endif
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
    defined(__clang__) && defined(__CUDA__)
#define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline
#else
#define QUALIFIERS inline
#endif
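
// Philox-4x32 constants: the two Weyl key increments (PHILOX_W32_*) and the
// two round multipliers (PHILOX_M4x32_*) from Salmon et al., "Parallel Random
// Numbers: As Easy as 1, 2, 3" (SC 2011); 2^-53 and 2^-32 scale integers to
// uniform floating-point values.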
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#ifdef __OPENCL_VERSION__
#include "opencl_stdint.h"
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && \
    __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#endif
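
// mulhilo32 returns the low 32 bits of the product a*b and stores the high
// 32 bits through hip; host, OpenCL, and CUDA/HIP device variants follow.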
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32 *hip) {
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && \
    (!defined(__clang__) || !defined(__CUDA__))
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
  *hip = __mulhwu(a, b);
  return a * b;
#elif defined(__OPENCL_VERSION__)
  uint64 product = ((uint64)a) * ((uint64)b);
  *hip = product >> 32;
  return (uint32)product;
#else
  uint64 product = ((uint64)a) * ((uint64)b);
  *hip = (uint32)(product >> 32);
  return (uint32)product;
#endif
#else
  *hip = __umulhi(a, b);
  return a * b;
#endif
}
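
// _philox4x32round: one Philox-4x32 round. hi0/lo0 and hi1/lo1 hold the
// high and low product halves of ctr[0] and ctr[2] with the two multipliers;
// the key words are folded in via XOR.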
  ctr[0] = hi1 ^ ctr[1] ^ key[0];
  ctr[1] = lo1;
  ctr[2] = hi0 ^ ctr[3] ^ key[1];
  ctr[3] = lo0;
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
                               double *rnd1, double *rnd2)
#else
                               double &rnd1, double &rnd2)
#endif
{
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
#ifdef __OPENCL_VERSION__
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
                              float *rnd1, float *rnd2, float *rnd3,
                              float *rnd4)
#else
                              float &rnd1, float &rnd2, float &rnd3,
                              float &rnd4)
#endif
{
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
#ifdef __OPENCL_VERSION__
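
// Usage sketch (illustrative only; time_step, cell_x/y/z, and seed are
// hypothetical caller variables): each distinct (counter, key) tuple yields
// an independent, reproducible draw of four floats:
//
//   float r1, r2, r3, r4;
//   philox_float4(time_step, cell_x, cell_y, cell_z, seed, 0u,
//                 r1, r2, r3, r4);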
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && \
    !defined(__HIP_DEVICE_COMPILE__) && \
    (!defined(__clang__) || !defined(__CUDA__))
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
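// SSE4.1 code path: the four counter words live in four __m128i registers,
// processing four Philox streams in parallel.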
  __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi0b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32), _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
  __m128i lohi1b =
      _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32), _mm_set1_epi32(PHILOX_M4x32_1));
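  // 0xD8 = _MM_SHUFFLE(3, 1, 2, 0): regroup each 64-bit product so that the
  // unpacks below gather all low halves into lo* and all high halves into hi*.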
  lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);

  __m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
  __m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
  __m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
  __m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}
  key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
  key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
}
  if (high) {
    x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
    y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
  } else {
    x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
    y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
  }
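  // Pack both 32-bit words into a 53-bit integer z = x ^ (y << (53 - 32));
  // converted to double and scaled by TWOPOW53_INV_DOUBLE, this yields a
  // uniformly distributed value.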
  __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm_xor_si128(x, z);

  __m128d rs = _my_cvtepu64_pd(z);
QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                              __m128i ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _my_cvtepu32_ps(ctr[0]);
  rnd2 = _my_cvtepu32_ps(ctr[1]);
  rnd3 = _my_cvtepu32_ps(ctr[2]);
  rnd4 = _my_cvtepu32_ps(ctr[3]);
QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                               __m128i ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1, __m128d &rnd2) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  __m128d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
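
// PowerPC VSX/AltiVec code path: counters travel in __vector unsigned int;
// the 32x32-bit multiplies use vec_mulhuw (xlC), vec_mulh (POWER10), or
// vec_mule/vec_mulo pairs as the generic fallback.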
QUALIFIERS void _philox4x32round(__vector unsigned int *ctr,
                                 __vector unsigned int *key) {
#ifdef __xlC__
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
  __vector unsigned int lohi0a =
      (__vector unsigned int)vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi0b =
      (__vector unsigned int)vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi1a =
      (__vector unsigned int)vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int lohi1b =
      (__vector unsigned int)vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
  __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
  __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
  ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}
template <bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x,
                                              __vector unsigned int y) {
#ifdef __LITTLE_ENDIAN__
  if (high) {
    x = vec_mergel(x, vec_splats(0U));
    y = vec_mergel(y, vec_splats(0U));
  } else {
    x = vec_mergeh(x, vec_splats(0U));
    y = vec_mergeh(y, vec_splats(0U));
  }
#else
  if (high) {
    x = vec_mergel(vec_splats(0U), x);
    y = vec_mergel(vec_splats(0U), y);
  } else {
    x = vec_mergeh(vec_splats(0U), x);
    y = vec_mergeh(vec_splats(0U), y);
  }
#endif
#ifdef __xlC__
  __vector unsigned long long z =
      vec_sl((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#else
  __vector unsigned long long z =
      vec_vsld((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#endif
  z = vec_xor((__vector unsigned long long)x, z);
#ifdef __xlC__
  __vector double rs = vec_ctd(z, 0);
#else
  __vector double rs = vec_ctf(z, 0);
#endif
QUALIFIERS void philox_float4(__vector unsigned int ctr0,
                              __vector unsigned int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = vec_ctf(ctr[0], 0);
  rnd2 = vec_ctf(ctr[1], 0);
  rnd3 = vec_ctf(ctr[2], 0);
  rnd4 = vec_ctf(ctr[3], 0);
QUALIFIERS void philox_double2(__vector unsigned int ctr0,
                               __vector unsigned int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1,
                              uint32 ctr2, uint32 ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  philox_float4(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  __vector double ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  philox_double2(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
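
// ARM NEON code path (AArch64): 64-bit products via widening multiplies,
// lane regrouping via vuzp1q_u32/vuzp2q_u32.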
#if defined(__ARM_NEON)
  uint32x4_t lohi0a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi0b = vreinterpretq_u32_u64(
      vmull_u32(vget_high_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi1a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
  uint32x4_t lohi1b = vreinterpretq_u32_u64(
      vmull_u32(vget_high_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
  uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
  uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
  uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
  uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);
  ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}
  if (high) {
    x = vzip2q_u32(x, vdupq_n_u32(0));
    y = vzip2q_u32(y, vdupq_n_u32(0));
  } else {
    x = vzip1q_u32(x, vdupq_n_u32(0));
    y = vzip1q_u32(y, vdupq_n_u32(0));
  }
  uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
  z = veorq_u64(vreinterpretq_u64_u32(x), z);

  float64x2_t rs = vcvtq_f64_u64(z);
QUALIFIERS void philox_float4(uint32x4_t ctr0, uint32x4_t ctr1,
                              uint32x4_t ctr2, uint32x4_t ctr3,
                              uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = vcvtq_f32_u32(ctr[0]);
  rnd2 = vcvtq_f32_u32(ctr[1]);
  rnd3 = vcvtq_f32_u32(ctr[2]);
  rnd4 = vcvtq_f32_u32(ctr[3]);
QUALIFIERS void philox_double2(uint32x4_t ctr0, uint32x4_t ctr1,
                               uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                               uint32 key1, float64x2_t &rnd1lo,
                               float64x2_t &rnd1hi, float64x2_t &rnd2lo,
                               float64x2_t &rnd2hi) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32x4_t ctr0, int32x4_t ctr1, uint32x4_t ctr2,
                              uint32x4_t ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1lo, float64x2_t &rnd1hi,
                               float64x2_t &rnd2lo, float64x2_t &rnd2hi) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  float64x2_t ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
QUALIFIERS void philox_double2(uint32x4_t ctr0, int32x4_t ctr1, uint32x4_t ctr2,
                               uint32x4_t ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
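
// ARM SVE/SME code path: the four counter words travel as an svuint32x4_t
// tuple; SVE_QUALIFIERS (defined above) marks functions streaming-compatible
// where SME is available.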
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
  svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                               svdup_u32(PHILOX_M4x32_0));
  svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                                svdup_u32(PHILOX_M4x32_0));
  svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                               svdup_u32(PHILOX_M4x32_1));
  svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                                svdup_u32(PHILOX_M4x32_1));

  ctr = svset4_u32(ctr, 0,
                   sveor_u32_x(svptrue_b32(),
                               sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)),
                               svget2_u32(key, 0)));
  ctr = svset4_u32(ctr, 1, lo1);
  ctr = svset4_u32(ctr, 2,
                   sveor_u32_x(svptrue_b32(),
                               sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)),
                               svget2_u32(key, 1)));
  ctr = svset4_u32(ctr, 3, lo0);
}
  key = svset2_u32(key, 0,
                   svadd_u32_x(svptrue_b32(), svget2_u32(key, 0),
                               svdup_u32(PHILOX_W32_0)));
  key = svset2_u32(key, 1,
                   svadd_u32_x(svptrue_b32(), svget2_u32(key, 1),
                               svdup_u32(PHILOX_W32_1)));
}
  if (high) {
    x = svzip2_u32(x, svdup_u32(0));
    y = svzip2_u32(y, svdup_u32(0));
  } else {
    x = svzip1_u32(x, svdup_u32(0));
    y = svzip1_u32(y, svdup_u32(0));
  }
  svuint64_t z =
      svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
  z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);

  svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
                              svuint32_t ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4)
    SVE_QUALIFIERS {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
  rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
  rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
  rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1,
                               svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                               uint32 key1, svfloat64_st &rnd1lo,
                               svfloat64_st &rnd1hi, svfloat64_st &rnd2lo,
                               svfloat64_st &rnd2hi) SVE_QUALIFIERS {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
  rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
}
QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4)
    SVE_QUALIFIERS {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);
  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(svuint32_t ctr0, svint32_t ctr1, svuint32_t ctr2,
                              svuint32_t ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4)
    SVE_QUALIFIERS {
  philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1,
                rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1lo, svfloat64_st &rnd1hi,
                               svfloat64_st &rnd2lo,
                               svfloat64_st &rnd2hi) SVE_QUALIFIERS {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1,
                               svfloat64_st &rnd2) SVE_QUALIFIERS {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  svfloat64_st ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
  philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
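
// RISC-V Vector (RVV) code path: every intrinsic takes an explicit vector
// length, fixed here to __riscv_vsetvlmax_* for the LMUL=1 types.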
#if defined(__riscv_v)
QUALIFIERS void _philox4x32round(vuint32m1_t &ctr0, vuint32m1_t &ctr1,
                                 vuint32m1_t &ctr2, vuint32m1_t &ctr3,
                                 vuint32m1_t key0, vuint32m1_t key1) {
  vuint32m1_t lo0 = __riscv_vmul_vv_u32m1(
      ctr0, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_0, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  vuint32m1_t hi0 = __riscv_vmulhu_vv_u32m1(
      ctr0, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_0, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  vuint32m1_t lo1 = __riscv_vmul_vv_u32m1(
      ctr2, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_1, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  vuint32m1_t hi1 = __riscv_vmulhu_vv_u32m1(
      ctr2, __riscv_vmv_v_x_u32m1(PHILOX_M4x32_1, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  ctr0 = __riscv_vxor_vv_u32m1(
      __riscv_vxor_vv_u32m1(hi1, ctr1, __riscv_vsetvlmax_e32m1()), key0,
      __riscv_vsetvlmax_e32m1());
  ctr1 = lo1;
  ctr2 = __riscv_vxor_vv_u32m1(
      __riscv_vxor_vv_u32m1(hi0, ctr3, __riscv_vsetvlmax_e32m1()), key1,
      __riscv_vsetvlmax_e32m1());
  ctr3 = lo0;
}
  key0 = __riscv_vadd_vv_u32m1(
      key0, __riscv_vmv_v_x_u32m1(PHILOX_W32_0, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  key1 = __riscv_vadd_vv_u32m1(
      key1, __riscv_vmv_v_x_u32m1(PHILOX_W32_1, __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
}
  if (high) {
    size_t s = __riscv_vsetvlmax_e32m1();
    x = __riscv_vslidedown_vx_u32m1(x, s / 2, s);
    y = __riscv_vslidedown_vx_u32m1(y, s / 2, s);
  }
  vuint64m1_t x64 = __riscv_vwcvtu_x_x_v_u64m1(
      __riscv_vlmul_trunc_v_u32m1_u32mf2(x), __riscv_vsetvlmax_e64m1());
  vuint64m1_t y64 = __riscv_vwcvtu_x_x_v_u64m1(
      __riscv_vlmul_trunc_v_u32m1_u32mf2(y), __riscv_vsetvlmax_e64m1());
  vuint64m1_t z =
      __riscv_vsll_vx_u64m1(y64, 53 - 32, __riscv_vsetvlmax_e64m1());
  z = __riscv_vxor_vv_u64m1(x64, z, __riscv_vsetvlmax_e64m1());
  vfloat64m1_t rs = __riscv_vfcvt_f_xu_v_f64m1(z, __riscv_vsetvlmax_e64m1());
  rs = __riscv_vfmadd_vv_f64m1(
      rs,
      __riscv_vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE, __riscv_vsetvlmax_e64m1()),
      __riscv_vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE / 2.0,
                             __riscv_vsetvlmax_e64m1()),
      __riscv_vsetvlmax_e64m1());
  return rs;
}
QUALIFIERS void philox_float4(vuint32m1_t ctr0, vuint32m1_t ctr1,
                              vuint32m1_t ctr2, vuint32m1_t ctr3, uint32 key0,
                              uint32 key1, vfloat32m1_t &rnd1,
                              vfloat32m1_t &rnd2, vfloat32m1_t &rnd3,
                              vfloat32m1_t &rnd4) {
  vuint32m1_t key0v = __riscv_vmv_v_x_u32m1(key0, __riscv_vsetvlmax_e32m1());
  vuint32m1_t key1v = __riscv_vmv_v_x_u32m1(key1, __riscv_vsetvlmax_e32m1());
  rnd1 = __riscv_vfcvt_f_xu_v_f32m1(ctr0, __riscv_vsetvlmax_e32m1());
  rnd2 = __riscv_vfcvt_f_xu_v_f32m1(ctr1, __riscv_vsetvlmax_e32m1());
  rnd3 = __riscv_vfcvt_f_xu_v_f32m1(ctr2, __riscv_vsetvlmax_e32m1());
  rnd4 = __riscv_vfcvt_f_xu_v_f32m1(ctr3, __riscv_vsetvlmax_e32m1());
  rnd1 = __riscv_vfmadd_vv_f32m1(
      rnd1,
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()),
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT / 2.0f,
                             __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  rnd2 = __riscv_vfmadd_vv_f32m1(
      rnd2,
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()),
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT / 2.0f,
                             __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  rnd3 = __riscv_vfmadd_vv_f32m1(
      rnd3,
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()),
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT / 2.0f,
                             __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
  rnd4 = __riscv_vfmadd_vv_f32m1(
      rnd4,
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, __riscv_vsetvlmax_e32m1()),
      __riscv_vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT / 2.0f,
                             __riscv_vsetvlmax_e32m1()),
      __riscv_vsetvlmax_e32m1());
}
QUALIFIERS void philox_double2(vuint32m1_t ctr0, vuint32m1_t ctr1,
                               vuint32m1_t ctr2, vuint32m1_t ctr3, uint32 key0,
                               uint32 key1, vfloat64m1_t &rnd1lo,
                               vfloat64m1_t &rnd1hi, vfloat64m1_t &rnd2lo,
                               vfloat64m1_t &rnd2hi) {
  vuint32m1_t key0v = __riscv_vmv_v_x_u32m1(key0, __riscv_vsetvlmax_e32m1());
  vuint32m1_t key1v = __riscv_vmv_v_x_u32m1(key1, __riscv_vsetvlmax_e32m1());
  rnd1lo = _uniform_double_hq<false>(ctr0, ctr1);
  rnd1hi = _uniform_double_hq<true>(ctr0, ctr1);
  rnd2lo = _uniform_double_hq<false>(ctr2, ctr3);
  rnd2hi = _uniform_double_hq<true>(ctr2, ctr3);
}
QUALIFIERS void philox_float4(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              vfloat32m1_t &rnd1, vfloat32m1_t &rnd2,
                              vfloat32m1_t &rnd3, vfloat32m1_t &rnd4) {
  vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(vuint32m1_t ctr0, vint32m1_t ctr1,
                              vuint32m1_t ctr2, vuint32m1_t ctr3, uint32 key0,
                              uint32 key1, vfloat32m1_t &rnd1,
                              vfloat32m1_t &rnd2, vfloat32m1_t &rnd3,
                              vfloat32m1_t &rnd4) {
  philox_float4(ctr0, __riscv_vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3,
                key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               vfloat64m1_t &rnd1lo, vfloat64m1_t &rnd1hi,
                               vfloat64m1_t &rnd2lo, vfloat64m1_t &rnd2hi) {
  vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               vfloat64m1_t &rnd1, vfloat64m1_t &rnd2) {
  vuint32m1_t ctr0v = __riscv_vmv_v_x_u32m1(ctr0, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr2v = __riscv_vmv_v_x_u32m1(ctr2, __riscv_vsetvlmax_e32m1());
  vuint32m1_t ctr3v = __riscv_vmv_v_x_u32m1(ctr3, __riscv_vsetvlmax_e32m1());

  vfloat64m1_t ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
QUALIFIERS void philox_double2(vuint32m1_t ctr0, vint32m1_t ctr1,
                               vuint32m1_t ctr2, vuint32m1_t ctr3, uint32 key0,
                               uint32 key1, vfloat64m1_t &rnd1,
                               vfloat64m1_t &rnd2) {
  philox_double2(ctr0, __riscv_vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3,
                 key0, key1, rnd1, rnd2);
}
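
// AVX2 code path: the SSE algorithm widened to __m256i (eight 32-bit lanes).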
  __m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
  __m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_1));
  lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);

  __m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
  __m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
  __m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
  __m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}
  key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
  key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
}
  if (high) {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
  } else {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
  }
  __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm256_xor_si256(x, z);

  __m256d rs = _my256_cvtepu64_pd(z);
QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                              __m256i ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _my256_cvtepu32_ps(ctr[0]);
  rnd2 = _my256_cvtepu32_ps(ctr[1]);
  rnd3 = _my256_cvtepu32_ps(ctr[2]);
  rnd4 = _my256_cvtepu32_ps(ctr[3]);
QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                               __m256i ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1, __m256d &rnd2) {
#if 0
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  __m256d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
  rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
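
// AVX-512 code path: __m512i counters, sixteen 32-bit lanes per register.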
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
  __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
  __m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_1));
  lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
  lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
  lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
  lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);

  __m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
  __m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
  __m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
  __m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);
  ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}
  key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
  key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
}
  if (high) {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
  } else {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
  }
  __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm512_xor_si512(x, z);

  __m512d rs = _mm512_cvtepu64_pd(z);
QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                              __m512i ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1 = _mm512_cvtepu32_ps(ctr[0]);
  rnd2 = _mm512_cvtepu32_ps(ctr[1]);
  rnd3 = _mm512_cvtepu32_ps(ctr[2]);
  rnd4 = _mm512_cvtepu32_ps(ctr[3]);
QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                               __m512i ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1, __m512d &rnd2) {
#if 0
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  __m512d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
  rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#undef SVE_QUALIFIERS
#undef PHILOX_W32_0
#undef PHILOX_W32_1
#undef PHILOX_M4x32_0
#undef PHILOX_M4x32_1
#undef TWOPOW53_INV_DOUBLE
#undef TWOPOW32_INV_FLOAT
// philox_rand.h: Philox counter-based RNG utility functions.