ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems

DynamicUBBDoublePrecisionCUDA.cu
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file DynamicUBBDoublePrecisionCUDA.cu
//! \author pystencils
//======================================================================================================================

// kernel generated with pystencils v1.3.7, lbmpy v1.3.7, sympy v1.12.1, lbmpy_walberla/pystencils_walberla from waLBerla commit f36fa0a68bae59f0b516f6587ea8fa7c24a41141

#include "DynamicUBBDoublePrecisionCUDA.h"
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "gpu/ErrorChecking.h"

#define FUNC_PREFIX __global__

using namespace std;

namespace walberla {
namespace lbm {

#if defined(__NVCC__)
#define RESTRICT __restrict__
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic push
#pragma nv_diag_suppress 177 // unused variable
#else
#pragma push
#pragma diag_suppress 177 // unused variable
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#else
// clang compiling CUDA code in host mode
#define RESTRICT __restrict__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wstrict-aliasing"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wconversion"
#pragma clang diagnostic ignored "-Wsign-compare"
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#define RESTRICT __restrict__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#elif defined(_MSC_VER)
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

// NOLINTBEGIN(readability-non-const-parameter*)
namespace internal_dynamicubbdoubleprecisioncuda_boundary_DynamicUBBDoublePrecisionCUDA {
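// Each index-vector entry is a 40-byte record: four int32_t fields at byte
// offsets 0, 4, 8, 12 (the boundary cell coordinates x, y, z and the lattice
// link direction `dir`), followed by three doubles at offsets 16, 24, 32
// (the prescribed wall velocity). One CUDA thread processes one entry.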
static FUNC_PREFIX __launch_bounds__(256) void dynamicubbdoubleprecisioncuda_boundary_DynamicUBBDoublePrecisionCUDA(uint8_t *RESTRICT const _data_indexVector, double *RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) {

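  // D3Q19 stencil tables: for every link direction, the index of the inverse
  // direction and the x/y/z components of the direction vector.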
  const int32_t f_in_inv_dir_idx[] = {0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13};
  const int32_t f_in_inv_offsets_x[] = {0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1};
  const int32_t f_in_inv_offsets_y[] = {0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0};
  const int32_t f_in_inv_offsets_z[] = {0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1};

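  // D3Q19 lattice weights: 1/3 for the rest population, 1/18 for the six
  // axis-aligned links, 1/36 for the twelve diagonal links.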
  const double weights[] = {((double)(0.33333333333333333)), ((double)(0.055555555555555556)), ((double)(0.055555555555555556)), ((double)(0.055555555555555556)), ((double)(0.055555555555555556)), ((double)(0.055555555555555556)), ((double)(0.055555555555555556)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778)), ((double)(0.027777777777777778))};

  const int32_t neighbour_offset_x[] = {0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1};
  const int32_t neighbour_offset_y[] = {0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0};
  const int32_t neighbour_offset_z[] = {0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1};

  if (blockDim.x * blockIdx.x + threadIdx.x < indexVectorSize) {
    uint8_t *RESTRICT _data_indexVector_10 = _data_indexVector;
    const int32_t x = *((int32_t *)(&_data_indexVector_10[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
    uint8_t *RESTRICT _data_indexVector_14 = _data_indexVector + 4;
    const int32_t y = *((int32_t *)(&_data_indexVector_14[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
    uint8_t *RESTRICT _data_indexVector_18 = _data_indexVector + 8;
    const int32_t z = *((int32_t *)(&_data_indexVector_18[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
    uint8_t *RESTRICT _data_indexVector_112 = _data_indexVector + 12;
    const int32_t dir = *((int32_t *)(&_data_indexVector_112[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]));
    double *RESTRICT _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 10 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 14 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 18 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 4 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 8 * _stride_pdfs_3;
    const double vel0Term = _data_pdfs_10_20_310[_stride_pdfs_0 * x] + _data_pdfs_10_20_314[_stride_pdfs_0 * x] + _data_pdfs_10_20_318[_stride_pdfs_0 * x] + _data_pdfs_10_20_34[_stride_pdfs_0 * x] + _data_pdfs_10_20_38[_stride_pdfs_0 * x];
    double *RESTRICT _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 11 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 15 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 7 * _stride_pdfs_3;
    const double vel1Term = _data_pdfs_10_20_311[_stride_pdfs_0 * x] + _data_pdfs_10_20_315[_stride_pdfs_0 * x] + _data_pdfs_10_20_31[_stride_pdfs_0 * x] + _data_pdfs_10_20_37[_stride_pdfs_0 * x];
    double *RESTRICT _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 12 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 13 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 5 * _stride_pdfs_3;
    const double vel2Term = _data_pdfs_10_20_312[_stride_pdfs_0 * x] + _data_pdfs_10_20_313[_stride_pdfs_0 * x] + _data_pdfs_10_20_35[_stride_pdfs_0 * x];
    double *RESTRICT _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z;
    double *RESTRICT _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 16 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 17 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 2 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 3 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_36 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 6 * _stride_pdfs_3;
    double *RESTRICT _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + 9 * _stride_pdfs_3;
    const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0 * x] + _data_pdfs_10_20_316[_stride_pdfs_0 * x] + _data_pdfs_10_20_317[_stride_pdfs_0 * x] + _data_pdfs_10_20_32[_stride_pdfs_0 * x] + _data_pdfs_10_20_33[_stride_pdfs_0 * x] + _data_pdfs_10_20_36[_stride_pdfs_0 * x] + _data_pdfs_10_20_39[_stride_pdfs_0 * x];
    double *RESTRICT _data_pdfs582130af3d909897 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_1 * f_in_inv_offsets_y[dir] + _stride_pdfs_2 * z + _stride_pdfs_2 * f_in_inv_offsets_z[dir] + _stride_pdfs_3 * f_in_inv_dir_idx[dir];
    uint8_t *RESTRICT _data_indexVector_116 = _data_indexVector + 16;
    uint8_t *RESTRICT _data_indexVector_124 = _data_indexVector + 24;
    uint8_t *RESTRICT _data_indexVector_132 = _data_indexVector + 32;
    double *RESTRICT _data_pdfs_10_20fc4162366eb9c5b1 = _data_pdfs + _stride_pdfs_1 * y + _stride_pdfs_2 * z + _stride_pdfs_3 * dir;
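    // Velocity bounce-back (UBB): reflect the population leaving the fluid
    // cell along `dir` into the inverse direction at the neighbouring cell,
    // with a momentum correction proportional to the wall velocity:
    //   f_inv(x + c) = f_dir(x) - 6 * w_dir * rho * (c . u_wall)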
    _data_pdfs582130af3d909897[_stride_pdfs_0 * x + _stride_pdfs_0 * f_in_inv_offsets_x[dir]] = -rho * (6.0 * ((double)(neighbour_offset_x[dir])) * *((double *)(&_data_indexVector_116[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x])) + 6.0 * ((double)(neighbour_offset_y[dir])) * *((double *)(&_data_indexVector_124[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x])) + 6.0 * ((double)(neighbour_offset_z[dir])) * *((double *)(&_data_indexVector_132[40 * blockDim.x * blockIdx.x + 40 * threadIdx.x]))) * weights[dir] + _data_pdfs_10_20fc4162366eb9c5b1[_stride_pdfs_0 * x];
  }
}
} // namespace internal_dynamicubbdoubleprecisioncuda_boundary_DynamicUBBDoublePrecisionCUDA

// NOLINTEND(readability-non-const-parameter*)

#if defined(__NVCC__)
#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#pragma nv_diagnostic pop
#else
#pragma pop
#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
#elif defined(__clang__)
#if defined(__CUDA__)
#if defined(__CUDA_ARCH__)
// clang compiling CUDA code in device mode
#pragma clang diagnostic pop
#else
// clang compiling CUDA code in host mode
#pragma clang diagnostic pop
#endif // defined(__CUDA_ARCH__)
#endif // defined(__CUDA__)
#elif defined(__GNUC__) or defined(__GNUG__)
#pragma GCC diagnostic pop
#endif

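// Host-side launcher: fetch the index vector for the requested sweep type
// (ALL, INNER, or OUTER), the PDF field and its strides, then launch the
// boundary kernel on the given GPU stream.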
void DynamicUBBDoublePrecisionCUDA::run_impl(IBlock *block, IndexVectors::Type type, gpuStream_t stream) {
  auto *indexVectors = block->getData<IndexVectors>(indexVectorID);
  int32_t indexVectorSize = int32_c(indexVectors->indexVector(type).size());
  if (indexVectorSize == 0)
    return;

  auto pointer = indexVectors->pointerGpu(type);

  uint8_t *_data_indexVector = reinterpret_cast<uint8_t *>(pointer);

  auto pdfs = block->getData<gpu::GPUField<double>>(pdfsID);

  WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
  double *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
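  // Launch configuration: blocks of at most 256 threads (matching the
  // __launch_bounds__(256) qualifier on the kernel) and enough blocks to
  // cover all indexVectorSize entries (ceiling division).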
  dim3 _block(uint32_c(((256 < indexVectorSize) ? 256 : indexVectorSize)), uint32_c(1), uint32_c(1));
  dim3 _grid(uint32_c(((indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ((int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize))) + 1)), uint32_c(1), uint32_c(1));
  internal_dynamicubbdoubleprecisioncuda_boundary_DynamicUBBDoublePrecisionCUDA::dynamicubbdoubleprecisioncuda_boundary_DynamicUBBDoublePrecisionCUDA<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}

void DynamicUBBDoublePrecisionCUDA::run(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::ALL, stream);
}

void DynamicUBBDoublePrecisionCUDA::inner(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::INNER, stream);
}

void DynamicUBBDoublePrecisionCUDA::outer(IBlock *block, gpuStream_t stream) {
  run_impl(block, IndexVectors::OUTER, stream);
}

} // namespace lbm
} // namespace walberla