// ESPResSo — Extensible Simulation Package for Research on Soft Matter Systems
// File: FixedFlux_single_precision_CUDA.cu (pystencils-generated)
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FixedFlux_single_precision_CUDA.cu
//! \author pystencils
//======================================================================================================================

// kernel generated with pystencils v1.4+1.ge851f4e, lbmpy v1.4+1.ge9efe34, sympy v1.12.1, lbmpy_walberla/pystencils_walberla from waLBerla commit 007e77e077ad9d22b5eed6f3d3118240993e553c
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "gpu/ErrorChecking.h"

#define FUNC_PREFIX __global__

using namespace std;
namespace walberla {
namespace pystencils {

#if defined(__NVCC__)
#define RESTRICT __restrict__
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177 // unused variable
#else
#pragma push
#pragma diag_suppress 177 // unused variable
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#else
// clang compiling CUDA code in host mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#define RESTRICT __restrict__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

// NOLINTBEGIN(readability-non-const-parameter*)
namespace internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA {

// Applies a fixed-flux boundary condition to a single-precision flux field:
// one CUDA thread per boundary link listed in the index vector.
//
// Each 28-byte index-vector record holds four int32 values (x, y, z, dir)
// at byte offsets 0/4/8/12, followed by three float flux components at
// offsets 16/20/24 (offsets taken from the generated reads in this kernel).
// `dir` is in [1, 26] — apparently a D3Q27-style stencil without the center
// direction (TODO confirm against the lbmpy stencil definition) — and selects
// which of the 13 flux slots (f index 0..12) of which neighboring cell
// receives the contribution. The constant 0.1111111111111111f is 1/9, baked
// in by the code generator.
//
// Launch configuration (see run_impl): 1D grid, at most 256 threads per
// block, matching __launch_bounds__(256); threads past indexVectorSize exit.
//
// NOTE(review): the original generated source was recovered from a Doxygen
// listing that dropped the per-branch pointer-definition lines; the address
// offsets below were reconstructed from the generated variable names
// (e.g. _data_flux_1m1_2m1_312 => y-1, z-1, f=12) and the visible
// subscripts. Verify against freshly regenerated output.
static FUNC_PREFIX __launch_bounds__(256) void fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA(float *RESTRICT const _data_flux, uint8_t *RESTRICT const _data_indexVector, int64_t const _stride_flux_0, int64_t const _stride_flux_1, int64_t const _stride_flux_2, int64_t const _stride_flux_3, int32_t indexVectorSize) {
  const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
  // Grid may be padded past the number of boundary links; guard the tail.
  if (tid >= uint32_t(indexVectorSize))
    return;

  // Byte offset of this thread's 28-byte index-vector record.
  const uint32_t rec = 28u * tid;
  const int32_t x = *((const int32_t *)(&_data_indexVector[rec + 0u]));
  const int32_t y = *((const int32_t *)(&_data_indexVector[rec + 4u]));
  const int32_t z = *((const int32_t *)(&_data_indexVector[rec + 8u]));
  const int32_t dir = *((const int32_t *)(&_data_indexVector[rec + 12u]));

  // Prescribed flux vector for this boundary link.
  const float flux_0 = *((const float *)(&_data_indexVector[rec + 16u]));
  const float flux_1 = *((const float *)(&_data_indexVector[rec + 20u]));
  const float flux_2 = *((const float *)(&_data_indexVector[rec + 24u]));

  const float w = 0.1111111111111111f; // 1/9 lattice weight

  // Linear index of cell (x, y, z), f = 0, in the flux field.
  const int64_t base = _stride_flux_0 * x + _stride_flux_1 * y + _stride_flux_2 * z;
  const int64_t s0 = _stride_flux_0;
  const int64_t s1 = _stride_flux_1;
  const int64_t s2 = _stride_flux_2;
  const int64_t s3 = _stride_flux_3;

  // Per-direction target cell offset and flux contribution. The float term
  // order of each expression is kept exactly as generated so results stay
  // bit-identical.
  switch (dir) {
  case 1:
    _data_flux[base + s1 + s3] = -w * flux_1;
    break;
  case 2:
    _data_flux[base + s3] = -w * flux_1;
    break;
  case 3:
    _data_flux[base] = -w * flux_0;
    break;
  case 4:
    _data_flux[base + s0] = -w * flux_0;
    break;
  case 5:
    _data_flux[base + s2 + 2 * s3] = -w * flux_2;
    break;
  case 6:
    _data_flux[base + 2 * s3] = -w * flux_2;
    break;
  case 7:
    _data_flux[base + 4 * s3] = -w * flux_0 + w * flux_1;
    break;
  case 8:
    _data_flux[base + s0 + s1 + 3 * s3] = -w * flux_0 - w * flux_1;
    break;
  case 9:
    _data_flux[base + 3 * s3] = -w * flux_0 - w * flux_1;
    break;
  case 10:
    _data_flux[base + s0 - s1 + 4 * s3] = -w * flux_0 + w * flux_1;
    break;
  case 11:
    _data_flux[base + s1 + s2 + 7 * s3] = -w * flux_1 - w * flux_2;
    break;
  case 12:
    _data_flux[base + 8 * s3] = -w * flux_1 + w * flux_2;
    break;
  case 13:
    _data_flux[base + 6 * s3] = -w * flux_0 + w * flux_2;
    break;
  case 14:
    _data_flux[base + s0 + s2 + 5 * s3] = -w * flux_0 - w * flux_2;
    break;
  case 15:
    _data_flux[base + s1 - s2 + 8 * s3] = -w * flux_1 + w * flux_2;
    break;
  case 16:
    _data_flux[base + 7 * s3] = -w * flux_1 - w * flux_2;
    break;
  case 17:
    _data_flux[base + 5 * s3] = -w * flux_0 - w * flux_2;
    break;
  case 18:
    _data_flux[base + s0 - s2 + 6 * s3] = -w * flux_0 + w * flux_2;
    break;
  case 19:
    _data_flux[base + s0 + s1 + s2 + 9 * s3] = -w * flux_0 - w * flux_1 - w * flux_2;
    break;
  case 20:
    _data_flux[base + 12 * s3] = -w * flux_0 + w * flux_1 + w * flux_2;
    break;
  case 21:
    _data_flux[base + s0 - s1 + s2 + 11 * s3] = -w * flux_0 - w * flux_2 + w * flux_1;
    break;
  case 22:
    _data_flux[base + 10 * s3] = -w * flux_0 - w * flux_1 + w * flux_2;
    break;
  case 23:
    _data_flux[base + s0 + s1 - s2 + 10 * s3] = -w * flux_0 - w * flux_1 + w * flux_2;
    break;
  case 24:
    _data_flux[base + 11 * s3] = -w * flux_0 - w * flux_2 + w * flux_1;
    break;
  case 25:
    _data_flux[base + s0 - s1 - s2 + 12 * s3] = -w * flux_0 + w * flux_1 + w * flux_2;
    break;
  case 26:
    _data_flux[base + 9 * s3] = -w * flux_0 - w * flux_1 - w * flux_2;
    break;
  default:
    // Unknown direction: generated code writes nothing.
    break;
  }
}
} // namespace internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA
// NOLINTEND(readability-non-const-parameter*)

#if defined(__NVCC__)
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic pop
#else
#pragma pop
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#pragma clang diagnostic pop
#else
// clang compiling CUDA code in host mode
#pragma clang diagnostic pop
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#pragma GCC diagnostic pop
#endif
// Launches the fixed-flux boundary kernel for one block's index vector of the
// given type on the given GPU stream. No-op when the block has no boundary
// links of that type. The launch is asynchronous with respect to the host;
// callers synchronize via the stream.
void FixedFlux_single_precision_CUDA::run_impl(IBlock *block, IndexVectors::Type type, gpuStream_t stream) {
  auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
  int32_t indexVectorSize = int32_c(indexVectors->indexVector(type).size());
  // Empty index vector: a zero-sized launch would be invalid, so bail out.
  if (indexVectorSize == 0)
    return;

  auto pointer = indexVectors->pointerGpu(type);

  uint8_t *_data_indexVector = reinterpret_cast<uint8_t *>(pointer);

  auto flux = block->getData<gpu::GPUField<float>>(fluxID);

  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(flux->nrOfGhostLayers()))
  float *RESTRICT const _data_flux = flux->dataAt(0, 0, 0, 0);
  const int64_t _stride_flux_0 = int64_t(flux->xStride());
  const int64_t _stride_flux_1 = int64_t(flux->yStride());
  const int64_t _stride_flux_2 = int64_t(flux->zStride());
  const int64_t _stride_flux_3 = int64_t(1 * int64_t(flux->fStride()));

  // 1D launch: min(256, indexVectorSize) threads per block (256 matches the
  // kernel's __launch_bounds__), grid sized by ceiling division so every
  // index-vector entry gets a thread; the kernel bounds-checks the tail.
  const int32_t threadsPerBlock = (indexVectorSize < 256) ? indexVectorSize : 256;
  const int32_t numBlocks = (indexVectorSize + threadsPerBlock - 1) / threadsPerBlock;
  dim3 _block(uint32_c(threadsPerBlock), uint32_c(1), uint32_c(1));
  dim3 _grid(uint32_c(numBlocks), uint32_c(1), uint32_c(1));
  internal_fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA::fixedflux_single_precision_cuda_boundary_FixedFlux_single_precision_CUDA<<<_grid, _block, 0, stream>>>(_data_flux, _data_indexVector, _stride_flux_0, _stride_flux_1, _stride_flux_2, _stride_flux_3, indexVectorSize);
}
// NOTE(review): the Doxygen listing this file was recovered from collapsed
// the remaining member definitions (original lines 319-331, presumably the
// public run()/operator() wrappers generated alongside run_impl) — they are
// not reproduced here; regenerate the file to restore them.

} // namespace pystencils
} // namespace walberla