ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
lattice_boltzmann/generated_kernels/philox_rand.h
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2021, Michael Kuron.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions, and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions, and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * @file
 * Philox counter-based RNG from @cite salmon11a.
 * Adapted from the pystencils source file
 * https://i10git.cs.fau.de/pycodegen/pystencils/-/blob/896b4192/pystencils/include/philox_rand.h
 */

#include <cstdint>

#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER)
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif

#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) &&       \
    !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include <altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif

#if defined(__CUDA_ARCH__) || defined(__clang__) && defined(__CUDA__)
#if defined(__clang__) && defined(QUALIFIERS)
#undef QUALIFIERS
#endif
#define QUALIFIERS static __forceinline__ __device__
#else
#if defined(__clang__) && defined(QUALIFIERS)
#undef QUALIFIERS
#endif
#define QUALIFIERS inline
#include "myintrin.h"
#endif

#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
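// Note: PHILOX_W32_* are the Weyl-sequence key increments and PHILOX_M4x32_*
// the round multipliers of the Philox4x32 generator specified in
// @cite salmon11a. TWOPOW53_INV_DOUBLE is exactly 2^-53 and
// TWOPOW32_INV_FLOAT is 2^-32 rounded to float; they rescale the random
// integers produced below to the open unit interval (0,1).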

typedef std::uint32_t uint32;
typedef std::uint64_t uint64;

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) &&          \
    __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st
    __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#endif

QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32 *hip) {
#ifndef __CUDA_ARCH__
  // host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
  *hip = __mulhwu(a, b);
  return a * b;
#else
  uint64 product = ((uint64)a) * ((uint64)b);
  *hip = product >> 32;
  return (uint32)product;
#endif
#else
  // device code
  *hip = __umulhi(a, b);
  return a * b;
#endif
}
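// mulhilo32() returns the low 32 bits of the 64-bit product a*b and stores
// the high 32 bits in *hip. Illustrative example (not part of the original
// source): a = 0xFFFFFFFFu and b = 2u give the product 0x1FFFFFFFE, so the
// call returns 0xFFFFFFFE and sets the high word to 1:
//   uint32 hi;
//   uint32 lo = mulhilo32(0xFFFFFFFFu, 2u, &hi); // lo == 0xFFFFFFFE, hi == 1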

QUALIFIERS void _philox4x32round(uint32 *ctr, uint32 *key) {
  uint32 hi0;
  uint32 hi1;
  uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
  uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);

  ctr[0] = hi1 ^ ctr[1] ^ key[0];
  ctr[1] = lo1;
  ctr[2] = hi0 ^ ctr[3] ^ key[1];
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(uint32 *key) {
  key[0] += PHILOX_W32_0;
  key[1] += PHILOX_W32_1;
}

QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y) {
  double z = (double)((uint64)x ^ ((uint64)y << (53 - 32)));
  return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE / 2.0);
}
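// _uniform_double_hq() builds a 53-bit random integer from two 32-bit words
// (x supplies the low 32 bits, the low 21 bits of y the high part) and maps
// it affinely into the open interval (0,1) as z * 2^-53 + 2^-54. Sketch of
// the resulting range (illustrative, not from the original source):
//   z = 0        -> 2^-54        (smallest value, strictly above 0)
//   z = 2^53 - 1 -> 1 - 2^-54    (largest value, strictly below 1)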

QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               double &rnd1, double &rnd2) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
  rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float &rnd1, float &rnd2, float &rnd3,
                              float &rnd4) {
  uint32 key[2] = {key0, key1};
  uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1 = (float)ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd2 = (float)ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd3 = (float)ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
  rnd4 = (float)ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
}
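// Usage sketch (illustrative, not part of the original header): each distinct
// (ctr0..ctr3, key0, key1) tuple deterministically yields its own draw, so a
// caller advances counters instead of carrying generator state:
//   float r1, r2, r3, r4;
//   philox_float4(time_step, cell_index, 0u, 0u, seed, 0u, r1, r2, r3, r4);
//   double d1, d2;
//   philox_double2(time_step, cell_index, 0u, 0u, seed, 0u, d1, d2);
// Here time_step, cell_index and seed are hypothetical uint32 variables.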
#ifndef __CUDA_ARCH__
#if defined(__SSE4_1__) || defined(_MSC_VER)
QUALIFIERS void _philox4x32round(__m128i *ctr, __m128i *key) {
  __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi0b = _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32),
                                 _mm_set1_epi32(PHILOX_M4x32_0));
  __m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
  __m128i lohi1b = _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32),
                                 _mm_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);

  __m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
  __m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
  __m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
  __m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m128i *key) {
  key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
  key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
}
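// In the SIMD variants below, every vector lane carries an independent
// counter/key tuple, so a single call evaluates one Philox stream per lane
// (four for SSE). The AVX2, AVX512, NEON, SVE and AltiVec paths that follow
// repeat the same structure at their respective vector widths.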

template <bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
    y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
  } else {
    x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
    y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
  }

  // calculate z = x ^ y << (53 - 32))
  __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm_xor_si128(x, z);

  // convert uint64 to double
  __m128d rs = _my_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
  rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE),
                    _mm_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#else
  rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
  rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#endif

  return rs;
}

QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                              __m128i ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _my_cvtepu32_ps(ctr[0]);
  rnd2 = _my_cvtepu32_ps(ctr[1]);
  rnd3 = _my_cvtepu32_ps(ctr[2]);
  rnd4 = _my_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
  rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT),
                      _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#else
  rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
  rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#endif
}

QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2,
                               __m128i ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
  __m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m128 &rnd1, __m128 &rnd2, __m128 &rnd3,
                              __m128 &rnd4) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1lo, __m128d &rnd1hi,
                               __m128d &rnd2lo, __m128d &rnd2hi) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m128d &rnd1, __m128d &rnd2) {
  __m128i ctr0v = _mm_set1_epi32(ctr0);
  __m128i ctr2v = _mm_set1_epi32(ctr2);
  __m128i ctr3v = _mm_set1_epi32(ctr3);

  __m128d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}
#endif

#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector unsigned int *ctr,
                                 __vector unsigned int *key) {
#ifndef _ARCH_PWR8
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
  __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
  __vector unsigned int lohi0a =
      (__vector unsigned int)vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi0b =
      (__vector unsigned int)vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
  __vector unsigned int lohi1a =
      (__vector unsigned int)vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
  __vector unsigned int lohi1b =
      (__vector unsigned int)vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));

#ifdef __LITTLE_ENDIAN__
  __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
  __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
  __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
  __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
  __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif

  ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__vector unsigned int *key) {
  key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
  key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}

#ifdef __VSX__
template <bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x,
                                              __vector unsigned int y) {
  // convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
  if (high) {
    x = vec_mergel(x, vec_splats(0U));
    y = vec_mergel(y, vec_splats(0U));
  } else {
    x = vec_mergeh(x, vec_splats(0U));
    y = vec_mergeh(y, vec_splats(0U));
  }
#else
  if (high) {
    x = vec_mergel(vec_splats(0U), x);
    y = vec_mergel(vec_splats(0U), y);
  } else {
    x = vec_mergeh(vec_splats(0U), x);
    y = vec_mergeh(vec_splats(0U), y);
  }
#endif

  // calculate z = x ^ y << (53 - 32))
#ifdef _ARCH_PWR8
  __vector unsigned long long z =
      vec_sl((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#else
  __vector unsigned long long z =
      vec_vsld((__vector unsigned long long)y, vec_splats(53ULL - 32ULL));
#endif
  z = vec_xor((__vector unsigned long long)x, z);

  // convert uint64 to double
#ifdef __xlC__
  __vector double rs = vec_ctd(z, 0);
#else
  __vector double rs = vec_ctf(z, 0);
#endif
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE),
                vec_splats(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}
#endif

QUALIFIERS void philox_float4(__vector unsigned int ctr0,
                              __vector unsigned int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = vec_ctf(ctr[0], 0);
  rnd2 = vec_ctf(ctr[1], 0);
  rnd3 = vec_ctf(ctr[2], 0);
  rnd4 = vec_ctf(ctr[3], 0);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT),
                  vec_splats(TWOPOW32_INV_FLOAT / 2.0f));
}

#ifdef __VSX__
QUALIFIERS void philox_double2(__vector unsigned int ctr0,
                               __vector unsigned int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
  __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif

QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1,
                              uint32 ctr2, uint32 ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector int ctr1,
                              __vector unsigned int ctr2,
                              __vector unsigned int ctr3, uint32 key0,
                              uint32 key1, __vector float &rnd1,
                              __vector float &rnd2, __vector float &rnd3,
                              __vector float &rnd4) {
  philox_float4(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
                rnd1, rnd2, rnd3, rnd4);
}

#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1lo,
                               __vector double &rnd1hi, __vector double &rnd2lo,
                               __vector double &rnd2hi) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1,
                               uint32 ctr2, uint32 ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  __vector unsigned int ctr0v = vec_splats(ctr0);
  __vector unsigned int ctr2v = vec_splats(ctr2);
  __vector unsigned int ctr3v = vec_splats(ctr3);

  __vector double ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector int ctr1,
                               __vector unsigned int ctr2,
                               __vector unsigned int ctr3, uint32 key0,
                               uint32 key1, __vector double &rnd1,
                               __vector double &rnd2) {
  philox_double2(ctr0, (__vector unsigned int)ctr1, ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif
#endif

#if defined(__ARM_NEON)
QUALIFIERS void _philox4x32round(uint32x4_t *ctr, uint32x4_t *key) {
  uint32x4_t lohi0a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi0b =
      vreinterpretq_u32_u64(vmull_high_u32(ctr[0], vdupq_n_u32(PHILOX_M4x32_0)));
  uint32x4_t lohi1a = vreinterpretq_u32_u64(
      vmull_u32(vget_low_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
  uint32x4_t lohi1b =
      vreinterpretq_u32_u64(vmull_high_u32(ctr[2], vdupq_n_u32(PHILOX_M4x32_1)));

  uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
  uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
  uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
  uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);

  ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(uint32x4_t *key) {
  key[0] = vaddq_u32(key[0], vdupq_n_u32(PHILOX_W32_0));
  key[1] = vaddq_u32(key[1], vdupq_n_u32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS float64x2_t _uniform_double_hq(uint32x4_t x, uint32x4_t y) {
  // convert 32 to 64 bit
  if (high) {
    x = vzip2q_u32(x, vdupq_n_u32(0));
    y = vzip2q_u32(y, vdupq_n_u32(0));
  } else {
    x = vzip1q_u32(x, vdupq_n_u32(0));
    y = vzip1q_u32(y, vdupq_n_u32(0));
  }

  // calculate z = x ^ y << (53 - 32))
  uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
  z = veorq_u64(vreinterpretq_u64_u32(x), z);

  // convert uint64 to double
  float64x2_t rs = vcvtq_f64_u64(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE / 2.0), rs,
                 vdupq_n_f64(TWOPOW53_INV_DOUBLE));

  return rs;
}

QUALIFIERS void philox_float4(uint32x4_t ctr0, uint32x4_t ctr1,
                              uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                              uint32 key1, float32x4_t &rnd1,
                              float32x4_t &rnd2, float32x4_t &rnd3,
                              float32x4_t &rnd4) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = vcvtq_f32_u32(ctr[0]);
  rnd2 = vcvtq_f32_u32(ctr[1]);
  rnd3 = vcvtq_f32_u32(ctr[2]);
  rnd4 = vcvtq_f32_u32(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0f), rnd1,
                   vdupq_n_f32(TWOPOW32_INV_FLOAT));
  rnd2 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0f), rnd2,
                   vdupq_n_f32(TWOPOW32_INV_FLOAT));
  rnd3 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0f), rnd3,
                   vdupq_n_f32(TWOPOW32_INV_FLOAT));
  rnd4 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT / 2.0f), rnd4,
                   vdupq_n_f32(TWOPOW32_INV_FLOAT));
}

QUALIFIERS void philox_double2(uint32x4_t ctr0, uint32x4_t ctr1,
                               uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                               uint32 key1, float64x2_t &rnd1lo,
                               float64x2_t &rnd1hi, float64x2_t &rnd2lo,
                               float64x2_t &rnd2hi) {
  uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
  uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(uint32x4_t ctr0, int32x4_t ctr1, uint32x4_t ctr2,
                              uint32x4_t ctr3, uint32 key0, uint32 key1,
                              float32x4_t &rnd1, float32x4_t &rnd2,
                              float32x4_t &rnd3, float32x4_t &rnd4) {
  philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1lo, float64x2_t &rnd1hi,
                               float64x2_t &rnd2lo, float64x2_t &rnd2hi) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               float64x2_t &rnd1, float64x2_t &rnd2) {
  uint32x4_t ctr0v = vdupq_n_u32(ctr0);
  uint32x4_t ctr2v = vdupq_n_u32(ctr2);
  uint32x4_t ctr3v = vdupq_n_u32(ctr3);

  float64x2_t ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(uint32x4_t ctr0, int32x4_t ctr1,
                               uint32x4_t ctr2, uint32x4_t ctr3, uint32 key0,
                               uint32 key1, float64x2_t &rnd1,
                               float64x2_t &rnd2) {
  philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif

#if defined(__ARM_FEATURE_SVE)
QUALIFIERS void _philox4x32round(svuint32x4_t &ctr, svuint32x2_t &key) {
  svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                               svdup_u32(PHILOX_M4x32_0));
  svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                               svdup_u32(PHILOX_M4x32_1));
  svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0),
                                svdup_u32(PHILOX_M4x32_0));
  svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2),
                                svdup_u32(PHILOX_M4x32_1));

  ctr = svset4_u32(
      ctr, 0,
      sveor_u32_x(svptrue_b32(),
                  sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)),
                  svget2_u32(key, 0)));
  ctr = svset4_u32(ctr, 1, lo1);
  ctr = svset4_u32(
      ctr, 2,
      sveor_u32_x(svptrue_b32(),
                  sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)),
                  svget2_u32(key, 1)));
  ctr = svset4_u32(ctr, 3, lo0);
}

QUALIFIERS void _philox4x32bumpkey(svuint32x2_t &key) {
  key = svset2_u32(
      key, 0,
      svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
  key = svset2_u32(
      key, 1,
      svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
}

template <bool high>
QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) {
  // convert 32 to 64 bit
  if (high) {
    x = svzip2_u32(x, svdup_u32(0));
    y = svzip2_u32(y, svdup_u32(0));
  } else {
    x = svzip1_u32(x, svdup_u32(0));
    y = svzip1_u32(y, svdup_u32(0));
  }

  // calculate z = x ^ y << (53 - 32))
  svuint64_t z =
      svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
  z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);

  // convert uint64 to double
  svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE),
                   svdup_f64(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}

QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1,
                              svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                              uint32 key1, svfloat32_st &rnd1,
                              svfloat32_st &rnd2, svfloat32_st &rnd3,
                              svfloat32_st &rnd4) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
  rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
  rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
  rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = svmad_f32_x(svptrue_b32(), rnd3, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = svmad_f32_x(svptrue_b32(), rnd4, svdup_f32(TWOPOW32_INV_FLOAT),
                     svdup_f32(TWOPOW32_INV_FLOAT / 2.0f));
}

QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1,
                               svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                               uint32 key1, svfloat64_st &rnd1lo,
                               svfloat64_st &rnd1hi, svfloat64_st &rnd2lo,
                               svfloat64_st &rnd2hi) {
  svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
  svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
  rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
  rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
}

QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_float4(svuint32_t ctr0, svint32_t ctr1, svuint32_t ctr2,
                              svuint32_t ctr3, uint32 key0, uint32 key1,
                              svfloat32_st &rnd1, svfloat32_st &rnd2,
                              svfloat32_st &rnd3, svfloat32_st &rnd4) {
  philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1lo, svfloat64_st &rnd1hi,
                               svfloat64_st &rnd2lo, svfloat64_st &rnd2hi) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               svfloat64_st &rnd1, svfloat64_st &rnd2) {
  svuint32_t ctr0v = svdup_u32(ctr0);
  svuint32_t ctr2v = svdup_u32(ctr2);
  svuint32_t ctr3v = svdup_u32(ctr3);

  svfloat64_st ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2,
                 ignore);
}

QUALIFIERS void philox_double2(svuint32_t ctr0, svint32_t ctr1,
                               svuint32_t ctr2, svuint32_t ctr3, uint32 key0,
                               uint32 key1, svfloat64_st &rnd1,
                               svfloat64_st &rnd2) {
  philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1,
                 rnd1, rnd2);
}
#endif

#ifdef __AVX2__
QUALIFIERS void _philox4x32round(__m256i *ctr, __m256i *key) {
  __m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_0));
  __m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
  __m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32),
                                    _mm256_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
  lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
  lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
  lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);

  __m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
  __m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
  __m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
  __m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m256i *key) {
  key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
  key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
  } else {
    x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
    y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
  }

  // calculate z = x ^ y << (53 - 32))
  __m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm256_xor_si256(x, z);

  // convert uint64 to double
  __m256d rs = _my256_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
  rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE),
                       _mm256_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#else
  rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
  rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));
#endif

  return rs;
}

QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                              __m256i ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _my256_cvtepu32_ps(ctr[0]);
  rnd2 = _my256_cvtepu32_ps(ctr[1]);
  rnd3 = _my256_cvtepu32_ps(ctr[2]);
  rnd4 = _my256_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
  rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#else
  rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
  rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
#endif
}

QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2,
                               __m256i ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
  __m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m256 &rnd1, __m256 &rnd2, __m256 &rnd3,
                              __m256 &rnd4) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1lo, __m256d &rnd1hi,
                               __m256d &rnd2lo, __m256d &rnd2hi) {
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m256d &rnd1, __m256d &rnd2) {
#if 0
  __m256i ctr0v = _mm256_set1_epi32(ctr0);
  __m256i ctr2v = _mm256_set1_epi32(ctr2);
  __m256i ctr3v = _mm256_set1_epi32(ctr3);

  __m256d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm256_extracti128_si256(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
  rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif

#ifdef __AVX512F__
QUALIFIERS void _philox4x32round(__m512i *ctr, __m512i *key) {
  __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_0));
  __m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
  __m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32),
                                    _mm512_set1_epi32(PHILOX_M4x32_1));

  lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
  lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
  lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
  lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);

  __m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
  __m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
  __m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
  __m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);

  ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
  ctr[1] = lo1;
  ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
  ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(__m512i *key) {
  key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
  key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
}

template <bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y) {
  // convert 32 to 64 bit
  if (high) {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
  } else {
    x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
    y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
  }

  // calculate z = x ^ y << (53 - 32))
  __m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
  z = _mm512_xor_si512(x, z);

  // convert uint64 to double
  __m512d rs = _mm512_cvtepu64_pd(z);
  // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
  rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE),
                       _mm512_set1_pd(TWOPOW53_INV_DOUBLE / 2.0));

  return rs;
}

QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                              __m512i ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  // convert uint32 to float
  rnd1 = _mm512_cvtepu32_ps(ctr[0]);
  rnd2 = _mm512_cvtepu32_ps(ctr[1]);
  rnd3 = _mm512_cvtepu32_ps(ctr[2]);
  rnd4 = _mm512_cvtepu32_ps(ctr[3]);
  // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
  rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
  rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT),
                         _mm512_set1_ps(TWOPOW32_INV_FLOAT / 2.0f));
}

QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2,
                               __m512i ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
  __m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
  _philox4x32round(ctr, key); // 1
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 2
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 3
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 4
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 5
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 6
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 7
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 8
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 9
  _philox4x32bumpkey(key);
  _philox4x32round(ctr, key); // 10

  rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
  rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
  rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
  rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                              uint32 ctr3, uint32 key0, uint32 key1,
                              __m512 &rnd1, __m512 &rnd2, __m512 &rnd3,
                              __m512 &rnd4) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1lo, __m512d &rnd1hi,
                               __m512d &rnd2lo, __m512d &rnd2hi) {
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo,
                 rnd2hi);
}

QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2,
                               uint32 ctr3, uint32 key0, uint32 key1,
                               __m512d &rnd1, __m512d &rnd2) {
#if 0
  __m512i ctr0v = _mm512_set1_epi32(ctr0);
  __m512i ctr2v = _mm512_set1_epi32(ctr2);
  __m512i ctr3v = _mm512_set1_epi32(ctr3);

  __m512d ignore;
  philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
  __m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
  philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0,
                 key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
  rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
  rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
#endif