ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
FixedFlux_single_precision_CUDA.cu
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FixedFlux_single_precision_CUDA.cu
//! \author pystencils
//======================================================================================================================

// kernel generated with pystencils v1.3.7+13.gdfd203a, lbmpy v1.3.7+10.gd3f6236, sympy v1.12.1, lbmpy_walberla/pystencils_walberla from waLBerla commit c69cb11d6a95d32b2280544d3d9abde1fe5fdbb5

#include "FixedFlux_single_precision_CUDA.h"
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "gpu/ErrorChecking.h"

#define FUNC_PREFIX __global__
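
// FUNC_PREFIX marks the kernel below as a CUDA __global__ entry point; the
// CPU variants of these generated kernels define it as empty instead.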

using namespace std;

namespace walberla {
namespace pystencils {

#if defined(__NVCC__)
#define RESTRICT __restrict__
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177 // unused variable
#else
#pragma push
#pragma diag_suppress 177 // unused variable
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#else
// clang compiling CUDA code in host mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#define RESTRICT __restrict__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

// NOLINTBEGIN(readability-non-const-parameter*)
namespace internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA {
static FUNC_PREFIX __launch_bounds__(256) void fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA(float *RESTRICT const _data_flux, uint8_t *RESTRICT const _data_indexVector, int64_t const _stride_flux_0, int64_t const _stride_flux_1, int64_t const _stride_flux_2, int64_t const _stride_flux_3, int32_t indexVectorSize) {
  if (blockDim.x * blockIdx.x + threadIdx.x < indexVectorSize) {
    // Byte offset of this thread's 28-byte index-vector record.
    const int64_t idx = 28 * ((int64_t)blockDim.x * blockIdx.x + threadIdx.x);
    // Boundary cell coordinates.
    const int32_t x = *((int32_t *)(&_data_indexVector[idx]));
    const int32_t y = *((int32_t *)(&_data_indexVector[idx + 4]));
    const int32_t z = *((int32_t *)(&_data_indexVector[idx + 8]));

    const int32_t cx[] = {0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1};
    const int32_t cy[] = {0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1};
    const int32_t cz[] = {0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1};
    const int32_t invdir[] = {0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13, 26, 25, 24, 23, 22, 21, 20, 19};
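    // cx/cy/cz enumerate the D3Q27 stencil directions and invdir maps each
    // direction to its inverse. The generated branches below have these
    // offsets folded in as literal constants, so the arrays are never
    // referenced; the "unused variable" diagnostics suppressed above
    // (warning 177, -Wunused-variable) guard against exactly this.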

    // Stencil direction to apply at this cell.
    const int32_t dir = *((int32_t *)(&_data_indexVector[idx + 12]));
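    // One record is 28 bytes: int32 x, y, z, dir at byte offsets 0, 4, 8, 12,
    // followed by the three float components of the prescribed flux at
    // offsets 16, 20, 24.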

    // Prescribed flux vector.
    const float flux_0 = *((float *)(&_data_indexVector[idx + 16]));
    const float flux_1 = *((float *)(&_data_indexVector[idx + 20]));
    const float flux_2 = *((float *)(&_data_indexVector[idx + 24]));

    if (dir == 26) {
      float *RESTRICT _data_flux_10_20_39 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 9 * _stride_flux_3;
      _data_flux_10_20_39[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1 - 0.1111111111111111f * flux_2;
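      // This and every branch below store w * (c_d . flux), with weight
      // w = 1/9 and c_d the branch's stencil direction; depending on the sign
      // of c_d, the write targets this cell or the adjacent cell under the
      // inverse direction's slot, so each boundary link is written once.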
    } else if (dir == 25) {
      float *RESTRICT _data_flux_1m1_2m1_312 = _data_flux + _stride_flux_1 * y - _stride_flux_1 + _stride_flux_2 * z - _stride_flux_2 + 12 * _stride_flux_3;
      _data_flux_1m1_2m1_312[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 24) {
      float *RESTRICT _data_flux_10_20_311 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 11 * _stride_flux_3;
      _data_flux_10_20_311[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_2 + 0.1111111111111111f * flux_1;
    } else if (dir == 23) {
      float *RESTRICT _data_flux_11_2m1_310 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z - _stride_flux_2 + 10 * _stride_flux_3;
      _data_flux_11_2m1_310[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 22) {
      float *RESTRICT _data_flux_10_20_310 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 10 * _stride_flux_3;
      _data_flux_10_20_310[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 21) {
      float *RESTRICT _data_flux_1m1_21_311 = _data_flux + _stride_flux_1 * y - _stride_flux_1 + _stride_flux_2 * z + _stride_flux_2 + 11 * _stride_flux_3;
      _data_flux_1m1_21_311[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_2 + 0.1111111111111111f * flux_1;
    } else if (dir == 20) {
      float *RESTRICT _data_flux_10_20_312 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 12 * _stride_flux_3;
      _data_flux_10_20_312[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 19) {
      float *RESTRICT _data_flux_11_21_39 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z + _stride_flux_2 + 9 * _stride_flux_3;
      _data_flux_11_21_39[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1 - 0.1111111111111111f * flux_2;
    } else if (dir == 18) {
      float *RESTRICT _data_flux_10_2m1_36 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z - _stride_flux_2 + 6 * _stride_flux_3;
      _data_flux_10_2m1_36[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_2;
    } else if (dir == 17) {
      float *RESTRICT _data_flux_10_20_35 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 5 * _stride_flux_3;
      _data_flux_10_20_35[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_2;
    } else if (dir == 16) {
      float *RESTRICT _data_flux_10_20_37 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 7 * _stride_flux_3;
      _data_flux_10_20_37[_stride_flux_0 * x] = -0.1111111111111111f * flux_1 - 0.1111111111111111f * flux_2;
    } else if (dir == 15) {
      float *RESTRICT _data_flux_11_2m1_38 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z - _stride_flux_2 + 8 * _stride_flux_3;
      _data_flux_11_2m1_38[_stride_flux_0 * x] = -0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 14) {
      float *RESTRICT _data_flux_10_21_35 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + _stride_flux_2 + 5 * _stride_flux_3;
      _data_flux_10_21_35[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_2;
    } else if (dir == 13) {
      float *RESTRICT _data_flux_10_20_36 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 6 * _stride_flux_3;
      _data_flux_10_20_36[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_2;
    } else if (dir == 12) {
      float *RESTRICT _data_flux_10_20_38 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 8 * _stride_flux_3;
      _data_flux_10_20_38[_stride_flux_0 * x] = -0.1111111111111111f * flux_1 + 0.1111111111111111f * flux_2;
    } else if (dir == 11) {
      float *RESTRICT _data_flux_11_21_37 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z + _stride_flux_2 + 7 * _stride_flux_3;
      _data_flux_11_21_37[_stride_flux_0 * x] = -0.1111111111111111f * flux_1 - 0.1111111111111111f * flux_2;
    } else if (dir == 10) {
      float *RESTRICT _data_flux_1m1_20_34 = _data_flux + _stride_flux_1 * y - _stride_flux_1 + _stride_flux_2 * z + 4 * _stride_flux_3;
      _data_flux_1m1_20_34[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_1;
    } else if (dir == 9) {
      float *RESTRICT _data_flux_10_20_33 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 3 * _stride_flux_3;
      _data_flux_10_20_33[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1;
    } else if (dir == 8) {
      float *RESTRICT _data_flux_11_20_33 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z + 3 * _stride_flux_3;
      _data_flux_11_20_33[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0 - 0.1111111111111111f * flux_1;
    } else if (dir == 7) {
      float *RESTRICT _data_flux_10_20_34 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 4 * _stride_flux_3;
      _data_flux_10_20_34[_stride_flux_0 * x] = -0.1111111111111111f * flux_0 + 0.1111111111111111f * flux_1;
    } else if (dir == 6) {
      float *RESTRICT _data_flux_10_20_32 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + 2 * _stride_flux_3;
      _data_flux_10_20_32[_stride_flux_0 * x] = -0.1111111111111111f * flux_2;
    } else if (dir == 5) {
      float *RESTRICT _data_flux_10_21_32 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + _stride_flux_2 + 2 * _stride_flux_3;
      _data_flux_10_21_32[_stride_flux_0 * x] = -0.1111111111111111f * flux_2;
    } else if (dir == 4) {
      float *RESTRICT _data_flux_10_20_30 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z;
      _data_flux_10_20_30[_stride_flux_0 * x + _stride_flux_0] = -0.1111111111111111f * flux_0;
    } else if (dir == 3) {
      float *RESTRICT _data_flux_10_20_30 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z;
      _data_flux_10_20_30[_stride_flux_0 * x] = -0.1111111111111111f * flux_0;
    } else if (dir == 2) {
      float *RESTRICT _data_flux_10_20_31 = _data_flux + _stride_flux_1 * y + _stride_flux_2 * z + _stride_flux_3;
      _data_flux_10_20_31[_stride_flux_0 * x] = -0.1111111111111111f * flux_1;
    } else if (dir == 1) {
      float *RESTRICT _data_flux_11_20_31 = _data_flux + _stride_flux_1 * y + _stride_flux_1 + _stride_flux_2 * z + _stride_flux_3;
      _data_flux_11_20_31[_stride_flux_0 * x] = -0.1111111111111111f * flux_1;
    }
  }
}
} // namespace internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA
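
// For reference, a host-side sketch of the record decoded by the kernel; the
// field names here are assumptions, the authoritative definition lives in the
// generated header:
//
//   struct IndexInfo {
//     int32_t x, y, z, dir;           // byte offsets 0, 4, 8, 12
//     float flux_0, flux_1, flux_2;   // byte offsets 16, 20, 24
//   };                                // sizeof(IndexInfo) == 28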

// NOLINTEND(readability-non-const-parameter*)

#if defined(__NVCC__)
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic pop
#else
#pragma pop
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#pragma clang diagnostic pop
#else
// clang compiling CUDA code in host mode
#pragma clang diagnostic pop
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#pragma GCC diagnostic pop
#endif

void FixedFlux_single_precision_CUDA::run_impl(IBlock *block, IndexVectors::Type type, gpuStream_t stream) {
  auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
  int32_t indexVectorSize = int32_c(indexVectors->indexVector(type).size());
  if (indexVectorSize == 0)
    return;

  auto pointer = indexVectors->pointerGpu(type);

  uint8_t *_data_indexVector = reinterpret_cast<uint8_t *>(pointer);

  auto flux = block->getData<gpu::GPUField<float>>(fluxID);

  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(flux->nrOfGhostLayers()))
  float *RESTRICT const _data_flux = flux->dataAt(0, 0, 0, 0);
  const int64_t _stride_flux_0 = int64_t(flux->xStride());
  const int64_t _stride_flux_1 = int64_t(flux->yStride());
  const int64_t _stride_flux_2 = int64_t(flux->zStride());
  const int64_t _stride_flux_3 = int64_t(1 * int64_t(flux->fStride()));
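  // Launch configuration: one thread per boundary link, at most 256 threads
  // per block (matching the kernel's __launch_bounds__(256)), with the grid
  // size rounded up so that all indexVectorSize records are covered.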
  dim3 _block(uint32_c(((256 < indexVectorSize) ? 256 : indexVectorSize)), uint32_c(1), uint32_c(1));
  dim3 _grid(uint32_c(((indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ((int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize))) + 1)), uint32_c(1), uint32_c(1));
  internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA::fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA<<<_grid, _block, 0, stream>>>(_data_flux, _data_indexVector, _stride_flux_0, _stride_flux_1, _stride_flux_2, _stride_flux_3, indexVectorSize);
}

void FixedFlux_single_precision_CUDA::run(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::ALL, stream);
}

void FixedFlux_single_precision_CUDA::inner(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::INNER, stream);
}

void FixedFlux_single_precision_CUDA::outer(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::OUTER, stream);
}

} // namespace pystencils
} // namespace walberla
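
// Example usage (a sketch; `blocks`, `fixedFlux`, and `stream` are assumed to
// be set up elsewhere and are not part of this file): the boundary handler is
// applied once per block of the block forest, e.g.
//
//   for (auto &block : *blocks)
//     fixedFlux.run(&block, stream);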