ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
lattice_boltzmann/generated_kernels/myintrin.h

/*
Copyright 2019-2021, Michael Kuron.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions, and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
 * @file
 * Philox counter-based RNG utility functions.
 * Adapted from the pystencils source file
 * https://i10git.cs.fau.de/pycodegen/pystencils/-/blob/39c214af/pystencils/include/myintrin.h
 * Relies on a QUALIFIERS macro defined by the including file
 * (see e.g. philox_rand.h).
 */

#pragma once

#if defined(__SSE2__) || defined(_MSC_VER)
// Unsigned 32-bit to float conversion. Without AVX-512VL there is no packed
// unsigned converter, so compute 2 * float(v >> 1) + float(v & 1) with the
// signed converter; v >> 1 always fits in a signed 32-bit lane.
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) {
#ifdef __AVX512VL__
  return _mm_cvtepu32_ps(v);
#else
  __m128i v2 = _mm_srli_epi32(v, 1);
  __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
  __m128 v2f = _mm_cvtepi32_ps(v2);
  __m128 v1f = _mm_cvtepi32_ps(v1);
  return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
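
The fallback branch deserves a word: _mm_cvtepi32_ps treats its input as
signed, so inputs of 2^31 or more would come out negative. Halving first keeps
the intermediate in signed range. A minimal scalar sketch of the same
computation (the function name is hypothetical, not part of this header):

    #include <cstdint>
    #include <cstdio>

    // float(v) via 2 * float(v >> 1) + float(v & 1); both conversions are
    // signed-safe because v >> 1 < 2^31 and (v & 1) <= 1.
    float cvtepu32_ps_scalar(std::uint32_t v) {
      float v2f = static_cast<float>(static_cast<std::int32_t>(v >> 1));
      float v1f = static_cast<float>(static_cast<std::int32_t>(v & 1u));
      return v2f + v2f + v1f;
    }

    int main() {
      // 4294967295 is not exactly representable in float; it rounds to 2^32.
      std::printf("%.1f\n", cvtepu32_ps_scalar(4294967295u)); // 4294967296.0
    }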

// In-place 4x4 transpose of the 32-bit lanes spread across four registers:
// two rounds of unpacking turn rows into columns.
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i &R0, __m128i &R1, __m128i &R2,
                                     __m128i &R3) {
  __m128i T0, T1, T2, T3;
  T0 = _mm_unpacklo_epi32(R0, R1);
  T1 = _mm_unpacklo_epi32(R2, R3);
  T2 = _mm_unpackhi_epi32(R0, R1);
  T3 = _mm_unpackhi_epi32(R2, R3);
  R0 = _mm_unpacklo_epi64(T0, T1);
  R1 = _mm_unpackhi_epi64(T0, T1);
  R2 = _mm_unpacklo_epi64(T2, T3);
  R3 = _mm_unpackhi_epi64(T2, T3);
}
#endif
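
A sketch of the transpose in use, assuming this header is included and
QUALIFIERS expands to something like a plain inline qualifier on the host
(its exact definition depends on the including kernel):

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      alignas(16) std::int32_t m[4][4] = {{0, 1, 2, 3},
                                          {4, 5, 6, 7},
                                          {8, 9, 10, 11},
                                          {12, 13, 14, 15}};
      __m128i r0 = _mm_load_si128(reinterpret_cast<const __m128i *>(m[0]));
      __m128i r1 = _mm_load_si128(reinterpret_cast<const __m128i *>(m[1]));
      __m128i r2 = _mm_load_si128(reinterpret_cast<const __m128i *>(m[2]));
      __m128i r3 = _mm_load_si128(reinterpret_cast<const __m128i *>(m[3]));
      _MY_TRANSPOSE4_EPI32(r0, r1, r2, r3); // rows become columns
      _mm_store_si128(reinterpret_cast<__m128i *>(m[0]), r0);
      std::printf("%d %d %d %d\n", m[0][0], m[0][1], m[0][2], m[0][3]); // 0 4 8 12
    }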

#if defined(__SSE4_1__) || defined(_MSC_VER)
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 &&            \
    !defined(__clang__)
// The fallback relies on exact floating-point cancellation; keep GCC from
// reassociating the subtraction away under fast-math optimization.
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d
_my_cvtepu64_pd(const __m128i x) {
#ifdef __AVX512VL__
  return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(16))))x, __m128d);
#else
  __m128i xH = _mm_srli_epi64(x, 32);
  xH = _mm_or_si128(
      xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
  __m128i xL = _mm_blend_epi16(
      x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
  __m128d f =
      _mm_sub_pd(_mm_castsi128_pd(xH),
                 _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
  return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
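
The non-clang fallback is the classic magic-number conversion: ORing the high
32 bits of x into the mantissa of a double whose exponent encodes 2^84 yields
the value 2^84 + xH*2^32, and blending the low 32 bits into a double encoding
2^52 yields 2^52 + xL. Subtracting the combined bias 2^84 + 2^52 (exact, since
the difference is a 32-significant-bit multiple of 2^32) and adding the two
parts reproduces xH*2^32 + xL = x with one final rounding. A scalar model of
the same bit manipulation (hypothetical helper, C++20 for std::bit_cast):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    double cvtepu64_pd_scalar(std::uint64_t x) {
      // Plant the high half in the mantissa of 2^84 (bit 0 is worth 2^32)
      // and the low half in the mantissa of 2^52 (bit 0 is worth 1).
      double hi = std::bit_cast<double>(
          std::bit_cast<std::uint64_t>(0x1p84) | (x >> 32));
      double lo = std::bit_cast<double>(
          std::bit_cast<std::uint64_t>(0x1p52) | (x & 0xFFFFFFFFu));
      double f = hi - (0x1p84 + 0x1p52); // exact: removes both biases
      return f + lo; // rounds once, as a real conversion would
    }

    int main() {
      std::printf("%.0f\n", cvtepu64_pd_scalar(~0ull)); // 18446744073709551616
    }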

#ifdef __AVX2__
// GCC only gained _mm256_set_m128i/_mm256_set_m128d in version 8; emulate
// them via insertf128 on older compilers.
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128i(hi, lo);
#else
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}

QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128d(hi, lo);
#else
  return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
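
As elsewhere in the intrinsics API, the high half comes first in the argument
list. A quick sanity-check sketch, assuming this header is included,
QUALIFIERS expands to a host-side inline, and AVX2 is enabled (e.g. -mavx2):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m256i v = _my256_set_m128i(_mm_set1_epi32(2) /* hi */,
                                   _mm_set1_epi32(1) /* lo */);
      alignas(32) std::int32_t out[8];
      _mm256_store_si256(reinterpret_cast<__m256i *>(out), v);
      std::printf("%d %d\n", out[0], out[7]); // 1 2
    }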

// AVX2 version of the halving trick from _my_cvtepu32_ps above.
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) {
#ifdef __AVX512VL__
  return _mm256_cvtepu32_ps(v);
#else
  __m256i v2 = _mm256_srli_epi32(v, 1);
  __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
  __m256 v2f = _mm256_cvtepi32_ps(v2);
  __m256 v1f = _mm256_cvtepi32_ps(v1);
  return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}

#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 &&            \
    !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
// AVX2 version of the magic-number trick from _my_cvtepu64_pd above.
QUALIFIERS __m256d
_my256_cvtepu64_pd(const __m256i x) {
#ifdef __AVX512VL__
  return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(32))))x, __m256d);
#else
  __m256i xH = _mm256_srli_epi64(x, 32);
  xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(
                               19342813113834066795298816.))); // 2^84
  __m256i xL = _mm256_blend_epi16(
      x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)),
      0xcc); // 2^52
  __m256d f = _mm256_sub_pd(
      _mm256_castsi256_pd(xH),
      _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
  return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif

#ifdef __AVX512F__
// Assemble 512-bit vectors from smaller pieces; a occupies the lowest lanes.
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b,
                                    __m128i a) {
  return _mm512_inserti32x4(
      _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c,
                         2),
      d, 3);
}

QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a) {
  return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif
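
A matching sketch for the 512-bit assembly (a lands in the lowest 128 bits,
d in the highest), again assuming the header is included, QUALIFIERS expands
to inline, and AVX-512F is available:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m512i v = _my512_set_m128i(_mm_set1_epi32(3), _mm_set1_epi32(2),
                                   _mm_set1_epi32(1), _mm_set1_epi32(0));
      alignas(64) std::int32_t out[16];
      _mm512_store_si512(out, v);
      std::printf("%d %d %d %d\n", out[0], out[4], out[8], out[12]); // 0 1 2 3
    }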