ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
lattice_boltzmann/generated_kernels/philox_rand.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
3Copyright 2019-2024, Michael Kuron.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of the copyright holder nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32
33// kernel generated with pystencils v1.4, lbmpy v1.4, sympy v1.12.1,
34// lbmpy_walberla/pystencils_walberla from waLBerla commit
35// b0376cce95f6817e924611cc2d9f9e2213610de6
36
37/**
38 * @file
39 * Philox counter-based RNG from @cite salmon11a.
40 * Adapted from the pystencils source file
41 * https://i10git.cs.fau.de/pycodegen/pystencils/-/blob/b4d7ef7cb5b499f3fa55ebfcd598ac7d6e11a3db/src/pystencils/include/philox_rand.h
42 */
43
44#pragma once
45
46#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
47#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
48#include <emmintrin.h> // SSE2
49#endif
50#ifdef __AVX2__
51#include <immintrin.h> // AVX*
52#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
53#include <smmintrin.h> // SSE4
54#ifdef __FMA__
55#include <immintrin.h> // FMA
56#endif
57#endif
58
59#if defined(_MSC_VER) && defined(_M_ARM64)
60#define __ARM_NEON
61#endif
62
63#ifdef __ARM_NEON
64#include <arm_neon.h>
65#endif
66#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
67#include <arm_sve.h>
68#endif
69
70#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && \
71 !defined(__xlC__)
72#include <ppu_intrinsics.h>
73#endif
74#ifdef __ALTIVEC__
75#include <altivec.h>
76#undef bool
77#ifndef _ARCH_PWR8
78#include <pveclib/vec_int64_ppc.h>
79#endif
80#endif
81
82#ifdef __riscv_v
83#include <riscv_vector.h>
84#endif
85#endif
86
87#if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE)
88#define SVE_QUALIFIERS __arm_streaming_compatible
89#elif defined(__ARM_FEATURE_SME)
90#define SVE_QUALIFIERS __arm_streaming
91#else
92#define SVE_QUALIFIERS
93#endif
94
95#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
96 defined(__clang__) && defined(__CUDA__)
97#define QUALIFIERS static __forceinline__ __device__
98#elif defined(__OPENCL_VERSION__)
99#define QUALIFIERS static inline
100#else
101#define QUALIFIERS inline
102#include "myintrin.h"
103#endif
104
105#define PHILOX_W32_0 (0x9E3779B9)
106#define PHILOX_W32_1 (0xBB67AE85)
107#define PHILOX_M4x32_0 (0xD2511F53)
108#define PHILOX_M4x32_1 (0xCD9E8D57)
109#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
110#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
111
112#ifdef __OPENCL_VERSION__
113#include "opencl_stdint.h"
114typedef uint32_t uint32;
115typedef uint64_t uint64;
116#else
117#ifndef __HIPCC_RTC__
118#include <cstdint>
119#endif
120typedef std::uint32_t uint32;
121typedef std::uint64_t uint64;
122#endif
123
124#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && \
125 __ARM_FEATURE_SVE_BITS > 0
130#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
133#endif
134
136#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && \
137 (!defined(__clang__) || !defined(__CUDA__))
138 // host code
139#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
140 *hip = __mulhwu(a, b);
141 return a * b;
142#elif defined(__OPENCL_VERSION__)
143 *hip = mul_hi(a, b);
144 return a * b;
145#else
146 uint64 product = ((uint64)a) * ((uint64)b);
147 *hip = product >> 32;
148 return (uint32)product;
149#endif
150#else
151 // device code
152 *hip = __umulhi(a, b);
153 return a * b;
154#endif
155}
156
158 uint32 hi0;
159 uint32 hi1;
162
163 ctr[0] = hi1 ^ ctr[1] ^ key[0];
164 ctr[1] = lo1;
165 ctr[2] = hi0 ^ ctr[3] ^ key[1];
166 ctr[3] = lo0;
167}
168
173
175 uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
176 return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE / 2.0);
177}
178
182 double *rnd1, double *rnd2)
183#else
184 double &rnd1, double &rnd2)
185#endif
186{
187 uint32 key[2] = {key0, key1};
188 uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
189 _philox4x32round(ctr, key); // 1
191 _philox4x32round(ctr, key); // 2
193 _philox4x32round(ctr, key); // 3
195 _philox4x32round(ctr, key); // 4
197 _philox4x32round(ctr, key); // 5
199 _philox4x32round(ctr, key); // 6
201 _philox4x32round(ctr, key); // 7
203 _philox4x32round(ctr, key); // 8
205 _philox4x32round(ctr, key); // 9
207 _philox4x32round(ctr, key); // 10
208
209#ifdef __OPENCL_VERSION__
210 *rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
211 *rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
212#else
213 rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
214 rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
215#endif
216}
217
221 float *rnd1, float *rnd2, float *rnd3,
222 float *rnd4)
223#else
224 float &rnd1, float &rnd2, float &rnd3,
225 float &rnd4)
226#endif
227{
228 uint32 key[2] = {key0, key1};
229 uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
230 _philox4x32round(ctr, key); // 1
232 _philox4x32round(ctr, key); // 2
234 _philox4x32round(ctr, key); // 3
236 _philox4x32round(ctr, key); // 4
238 _philox4x32round(ctr, key); // 5
240 _philox4x32round(ctr, key); // 6
242 _philox4x32round(ctr, key); // 7
244 _philox4x32round(ctr, key); // 8
246 _philox4x32round(ctr, key); // 9
248 _philox4x32round(ctr, key); // 10
249
250#ifdef __OPENCL_VERSION__
255#else
260#endif
261}
262
263#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && \
264 !defined(__HIP_DEVICE_COMPILE__) && \
265 (!defined(__clang__) || !defined(__CUDA__))
266#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
274
279
284
285 ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
286 ctr[1] = lo1;
287 ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
288 ctr[3] = lo0;
289}
290
294}
295
296template <bool high>
298 // convert 32 to 64 bit
299 if (high) {
302 } else {
305 }
306
307 // calculate z = x ^ y << (53 - 32))
308 __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
309 z = _mm_xor_si128(x, z);
310
311 // convert uint64 to double
313 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
314#ifdef __FMA__
317#else
320#endif
321
322 return rs;
323}
324
328 __m128 &rnd4) {
330 __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
331 _philox4x32round(ctr, key); // 1
333 _philox4x32round(ctr, key); // 2
335 _philox4x32round(ctr, key); // 3
337 _philox4x32round(ctr, key); // 4
339 _philox4x32round(ctr, key); // 5
341 _philox4x32round(ctr, key); // 6
343 _philox4x32round(ctr, key); // 7
345 _philox4x32round(ctr, key); // 8
347 _philox4x32round(ctr, key); // 9
349 _philox4x32round(ctr, key); // 10
350
351 // convert uint32 to float
356 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
357#ifdef __FMA__
366#else
375#endif
376}
377
383 __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
384 _philox4x32round(ctr, key); // 1
386 _philox4x32round(ctr, key); // 2
388 _philox4x32round(ctr, key); // 3
390 _philox4x32round(ctr, key); // 4
392 _philox4x32round(ctr, key); // 5
394 _philox4x32round(ctr, key); // 6
396 _philox4x32round(ctr, key); // 7
398 _philox4x32round(ctr, key); // 8
400 _philox4x32round(ctr, key); // 9
402 _philox4x32round(ctr, key); // 10
403
408}
409
413 __m128 &rnd4) {
417
419}
420
428
430 rnd2hi);
431}
432
439
442 ignore);
443}
444#endif
445
446#ifdef __ALTIVEC__
447QUALIFIERS void _philox4x32round(__vector unsigned int *ctr,
448 __vector unsigned int *key) {
449#ifndef _ARCH_PWR8
450 __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
451 __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
452 __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
453 __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
454#elif defined(_ARCH_PWR10)
455 __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
456 __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
457 __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
458 __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
459#else
460 __vector unsigned int lohi0a =
462 __vector unsigned int lohi0b =
464 __vector unsigned int lohi1a =
466 __vector unsigned int lohi1b =
468
469#ifdef __LITTLE_ENDIAN__
470 __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
471 __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
472 __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
473 __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
474#else
475 __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
476 __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
477 __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
478 __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
479#endif
480#endif
481
482 ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
483 ctr[1] = lo1;
484 ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
485 ctr[3] = lo0;
486}
487
488QUALIFIERS void _philox4x32bumpkey(__vector unsigned int *key) {
491}
492
493#ifdef __VSX__
494template <bool high>
495QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x,
496 __vector unsigned int y) {
497 // convert 32 to 64 bit
498#ifdef __LITTLE_ENDIAN__
499 if (high) {
500 x = vec_mergel(x, vec_splats(0U));
501 y = vec_mergel(y, vec_splats(0U));
502 } else {
503 x = vec_mergeh(x, vec_splats(0U));
504 y = vec_mergeh(y, vec_splats(0U));
505 }
506#else
507 if (high) {
508 x = vec_mergel(vec_splats(0U), x);
509 y = vec_mergel(vec_splats(0U), y);
510 } else {
511 x = vec_mergeh(vec_splats(0U), x);
512 y = vec_mergeh(vec_splats(0U), y);
513 }
514#endif
515
516 // calculate z = x ^ y << (53 - 32))
517#ifdef _ARCH_PWR8
518 __vector unsigned long long z =
519 vec_sl((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
520#else
521 __vector unsigned long long z =
522 vec_vsld((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
523#endif
524 z = vec_xor((__vector unsigned long long)x, z);
525
526 // convert uint64 to double
527#ifdef __xlC__
528 __vector double rs = vec_ctd(z, 0);
529#else
530 __vector double rs = vec_ctf(z, 0);
531#endif
532 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
535
536 return rs;
537}
538#endif
539
540QUALIFIERS void philox_float4(__vector unsigned int ctr0,
541 __vector unsigned int ctr1,
542 __vector unsigned int ctr2,
543 __vector unsigned int ctr3, uint32 key0,
544 uint32 key1, __vector float &rnd1,
545 __vector float &rnd2, __vector float &rnd3,
546 __vector float &rnd4) {
547 __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
548 __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
549 _philox4x32round(ctr, key); // 1
551 _philox4x32round(ctr, key); // 2
553 _philox4x32round(ctr, key); // 3
555 _philox4x32round(ctr, key); // 4
557 _philox4x32round(ctr, key); // 5
559 _philox4x32round(ctr, key); // 6
561 _philox4x32round(ctr, key); // 7
563 _philox4x32round(ctr, key); // 8
565 _philox4x32round(ctr, key); // 9
567 _philox4x32round(ctr, key); // 10
568
569 // convert uint32 to float
570 rnd1 = vec_ctf(ctr[0], 0);
571 rnd2 = vec_ctf(ctr[1], 0);
572 rnd3 = vec_ctf(ctr[2], 0);
573 rnd4 = vec_ctf(ctr[3], 0);
574 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
583}
584
585#ifdef __VSX__
586QUALIFIERS void philox_double2(__vector unsigned int ctr0,
587 __vector unsigned int ctr1,
588 __vector unsigned int ctr2,
589 __vector unsigned int ctr3, uint32 key0,
590 uint32 key1, __vector double &rnd1lo,
591 __vector double &rnd1hi, __vector double &rnd2lo,
592 __vector double &rnd2hi) {
593 __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
594 __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
595 _philox4x32round(ctr, key); // 1
597 _philox4x32round(ctr, key); // 2
599 _philox4x32round(ctr, key); // 3
601 _philox4x32round(ctr, key); // 4
603 _philox4x32round(ctr, key); // 5
605 _philox4x32round(ctr, key); // 6
607 _philox4x32round(ctr, key); // 7
609 _philox4x32round(ctr, key); // 8
611 _philox4x32round(ctr, key); // 9
613 _philox4x32round(ctr, key); // 10
614
619}
620#endif
621
624 uint32 key1, __vector float &rnd1,
625 __vector float &rnd2, __vector float &rnd3,
626 __vector float &rnd4) {
627 __vector unsigned int ctr0v = vec_splats(ctr0);
628 __vector unsigned int ctr2v = vec_splats(ctr2);
629 __vector unsigned int ctr3v = vec_splats(ctr3);
630
632}
633
636 __vector float &rnd1, __vector float &rnd2,
637 __vector float &rnd3, __vector float &rnd4) {
638 philox_float4(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1, rnd1,
639 rnd2, rnd3, rnd4);
640}
641
642#ifdef __VSX__
645 uint32 key1, __vector double &rnd1lo,
646 __vector double &rnd1hi, __vector double &rnd2lo,
647 __vector double &rnd2hi) {
648 __vector unsigned int ctr0v = vec_splats(ctr0);
649 __vector unsigned int ctr2v = vec_splats(ctr2);
650 __vector unsigned int ctr3v = vec_splats(ctr3);
651
653 rnd2hi);
654}
655
658 uint32 key1, __vector double &rnd1,
659 __vector double &rnd2) {
660 __vector unsigned int ctr0v = vec_splats(ctr0);
661 __vector unsigned int ctr2v = vec_splats(ctr2);
662 __vector unsigned int ctr3v = vec_splats(ctr3);
663
664 __vector double ignore;
666 ignore);
667}
668
671 __vector double &rnd1, __vector double &rnd2) {
672 philox_double2(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
673 rnd1, rnd2);
674}
675#endif
676#endif
677
678#if defined(__ARM_NEON)
688
693
694 ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
695 ctr[1] = lo1;
696 ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
697 ctr[3] = lo0;
698}
699
703}
704
705template <bool high>
707 // convert 32 to 64 bit
708 if (high) {
709 x = vzip2q_u32(x, vdupq_n_u32(0));
710 y = vzip2q_u32(y, vdupq_n_u32(0));
711 } else {
712 x = vzip1q_u32(x, vdupq_n_u32(0));
713 y = vzip1q_u32(y, vdupq_n_u32(0));
714 }
715
716 // calculate z = x ^ y << (53 - 32))
719
720 // convert uint64 to double
722 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
725
726 return rs;
727}
728
734 uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
735 _philox4x32round(ctr, key); // 1
737 _philox4x32round(ctr, key); // 2
739 _philox4x32round(ctr, key); // 3
741 _philox4x32round(ctr, key); // 4
743 _philox4x32round(ctr, key); // 5
745 _philox4x32round(ctr, key); // 6
747 _philox4x32round(ctr, key); // 7
749 _philox4x32round(ctr, key); // 8
751 _philox4x32round(ctr, key); // 9
753 _philox4x32round(ctr, key); // 10
754
755 // convert uint32 to float
756 rnd1 = vcvtq_f32_u32(ctr[0]);
757 rnd2 = vcvtq_f32_u32(ctr[1]);
758 rnd3 = vcvtq_f32_u32(ctr[2]);
759 rnd4 = vcvtq_f32_u32(ctr[3]);
760 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
769}
770
777 uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
778 _philox4x32round(ctr, key); // 1
780 _philox4x32round(ctr, key); // 2
782 _philox4x32round(ctr, key); // 3
784 _philox4x32round(ctr, key); // 4
786 _philox4x32round(ctr, key); // 5
788 _philox4x32round(ctr, key); // 6
790 _philox4x32round(ctr, key); // 7
792 _philox4x32round(ctr, key); // 8
794 _philox4x32round(ctr, key); // 9
796 _philox4x32round(ctr, key); // 10
797
802}
803
811
813}
814
815#ifndef _MSC_VER
821 rnd2, rnd3, rnd4);
822}
823#endif
824
832
834 rnd2hi);
835}
836
843
846 ignore);
847}
848
849#ifndef _MSC_VER
854 rnd1, rnd2);
855}
856#endif
857#endif
858
859#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
870
871 ctr = svset4_u32(
872 ctr, 0,
875 svget2_u32(key, 0)));
876 ctr = svset4_u32(ctr, 1, lo1);
877 ctr = svset4_u32(
878 ctr, 2,
881 svget2_u32(key, 1)));
882 ctr = svset4_u32(ctr, 3, lo0);
883}
884
886 key = svset2_u32(
887 key, 0,
889 key = svset2_u32(
890 key, 1,
892}
893
894template <bool high>
897 // convert 32 to 64 bit
898 if (high) {
899 x = svzip2_u32(x, svdup_u32(0));
900 y = svzip2_u32(y, svdup_u32(0));
901 } else {
902 x = svzip1_u32(x, svdup_u32(0));
903 y = svzip1_u32(y, svdup_u32(0));
904 }
905
906 // calculate z = x ^ y << (53 - 32))
907 svuint64_t z =
910
911 // convert uint64 to double
913 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
916
917 return rs;
918}
919
927 _philox4x32round(ctr, key); // 1
929 _philox4x32round(ctr, key); // 2
931 _philox4x32round(ctr, key); // 3
933 _philox4x32round(ctr, key); // 4
935 _philox4x32round(ctr, key); // 5
937 _philox4x32round(ctr, key); // 6
939 _philox4x32round(ctr, key); // 7
941 _philox4x32round(ctr, key); // 8
943 _philox4x32round(ctr, key); // 9
945 _philox4x32round(ctr, key); // 10
946
947 // convert uint32 to float
952 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
961}
962
970 _philox4x32round(ctr, key); // 1
972 _philox4x32round(ctr, key); // 2
974 _philox4x32round(ctr, key); // 3
976 _philox4x32round(ctr, key); // 4
978 _philox4x32round(ctr, key); // 5
980 _philox4x32round(ctr, key); // 6
982 _philox4x32round(ctr, key); // 7
984 _philox4x32round(ctr, key); // 8
986 _philox4x32round(ctr, key); // 9
988 _philox4x32round(ctr, key); // 10
989
994}
995
1004
1006}
1007
1014 rnd2, rnd3, rnd4);
1015}
1016
1025
1027 rnd2hi);
1028}
1029
1037
1040 ignore);
1041}
1042
1048 rnd1, rnd2);
1049}
1050#endif
1051
1052#if defined(__riscv_v)
1068
1072 ctr1 = lo1;
1076 ctr3 = lo0;
1077}
1078
1086}
1087
1088template <bool high>
1090 // convert 32 to 64 bit
1091 if (high) {
1092 size_t s = __riscv_vsetvlmax_e32m1();
1093 x = __riscv_vslidedown_vx_u32m1(x, s / 2, s);
1094 y = __riscv_vslidedown_vx_u32m1(y, s / 2, s);
1095 }
1100
1101 // calculate z = x ^ y << (53 - 32))
1102 vuint64m1_t z =
1105
1106 // convert uint64 to double
1108 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
1110 rs,
1115
1116 return rs;
1117}
1118
1123 vfloat32m1_t &rnd4) {
1145
1146 // convert uint32 to float
1151 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
1153 rnd1,
1159 rnd2,
1165 rnd3,
1171 rnd4,
1176}
1177
1204
1209}
1210
1218
1220}
1221
1227 key0, key1, rnd1, rnd2, rnd3, rnd4);
1228}
1229
1237
1239 rnd2hi);
1240}
1241
1248
1251 ignore);
1252}
1253
1258 key0, key1, rnd1, rnd2);
1259}
1260#endif
1261
1262#ifdef __AVX2__
1270
1275
1280
1282 ctr[1] = lo1;
1284 ctr[3] = lo0;
1285}
1286
1290}
1291
1292template <bool high>
1294 // convert 32 to 64 bit
1295 if (high) {
1298 } else {
1301 }
1302
1303 // calculate z = x ^ y << (53 - 32))
1304 __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
1305 z = _mm256_xor_si256(x, z);
1306
1307 // convert uint64 to double
1309 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
1310#ifdef __FMA__
1313#else
1316#endif
1317
1318 return rs;
1319}
1320
1324 __m256 &rnd4) {
1326 __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
1327 _philox4x32round(ctr, key); // 1
1329 _philox4x32round(ctr, key); // 2
1331 _philox4x32round(ctr, key); // 3
1333 _philox4x32round(ctr, key); // 4
1335 _philox4x32round(ctr, key); // 5
1337 _philox4x32round(ctr, key); // 6
1339 _philox4x32round(ctr, key); // 7
1341 _philox4x32round(ctr, key); // 8
1343 _philox4x32round(ctr, key); // 9
1345 _philox4x32round(ctr, key); // 10
1346
1347 // convert uint32 to float
1352 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
1353#ifdef __FMA__
1362#else
1371#endif
1372}
1373
1379 __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
1380 _philox4x32round(ctr, key); // 1
1382 _philox4x32round(ctr, key); // 2
1384 _philox4x32round(ctr, key); // 3
1386 _philox4x32round(ctr, key); // 4
1388 _philox4x32round(ctr, key); // 5
1390 _philox4x32round(ctr, key); // 6
1392 _philox4x32round(ctr, key); // 7
1394 _philox4x32round(ctr, key); // 8
1396 _philox4x32round(ctr, key); // 9
1398 _philox4x32round(ctr, key); // 10
1399
1404}
1405
1409 __m256 &rnd4) {
1413
1415}
1416
1424
1426 rnd2hi);
1427}
1428
1431 __m256d &rnd1, __m256d &rnd2) {
1432#if 0
1436
1439#else
1445#endif
1446}
1447#endif
1448
1449#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
1457
1462
1467
1469 ctr[1] = lo1;
1471 ctr[3] = lo0;
1472}
1473
1477}
1478
1479template <bool high>
1481 // convert 32 to 64 bit
1482 if (high) {
1485 } else {
1488 }
1489
1490 // calculate z = x ^ y << (53 - 32))
1491 __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
1492 z = _mm512_xor_si512(x, z);
1493
1494 // convert uint64 to double
1496 // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
1499
1500 return rs;
1501}
1502
1506 __m512 &rnd4) {
1508 __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
1509 _philox4x32round(ctr, key); // 1
1511 _philox4x32round(ctr, key); // 2
1513 _philox4x32round(ctr, key); // 3
1515 _philox4x32round(ctr, key); // 4
1517 _philox4x32round(ctr, key); // 5
1519 _philox4x32round(ctr, key); // 6
1521 _philox4x32round(ctr, key); // 7
1523 _philox4x32round(ctr, key); // 8
1525 _philox4x32round(ctr, key); // 9
1527 _philox4x32round(ctr, key); // 10
1528
1529 // convert uint32 to float
1534 // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
1543}
1544
1550 __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
1551 _philox4x32round(ctr, key); // 1
1553 _philox4x32round(ctr, key); // 2
1555 _philox4x32round(ctr, key); // 3
1557 _philox4x32round(ctr, key); // 4
1559 _philox4x32round(ctr, key); // 5
1561 _philox4x32round(ctr, key); // 6
1563 _philox4x32round(ctr, key); // 7
1565 _philox4x32round(ctr, key); // 8
1567 _philox4x32round(ctr, key); // 9
1569 _philox4x32round(ctr, key); // 10
1570
1575}
1576
1580 __m512 &rnd4) {
1584
1586}
1587
1595
1597 rnd2hi);
1598}
1599
1602 __m512d &rnd1, __m512d &rnd2) {
1603#if 0
1607
1610#else
1616#endif
1617}
1618#endif
1619#endif
1620
1621#undef QUALIFIERS
1622#undef SVE_QUALIFIERS
1623#undef PHILOX_W32_0
1624#undef PHILOX_W32_1
1625#undef PHILOX_M4x32_0
1626#undef PHILOX_M4x32_1
1627#undef TWOPOW53_INV_DOUBLE
1628#undef TWOPOW32_INV_FLOAT
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
Philox counter-based RNG utility functions.
QUALIFIERS void _philox4x32bumpkey(uint32 *key)
QUALIFIERS void _philox4x32round(uint32 *ctr, uint32 *key)
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32 *hip)
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, double &rnd1, double &rnd2)
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float &rnd1, float &rnd2, float &rnd3, float &rnd4)
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)