ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
StreamSweepSinglePrecisionCUDA.cu
Go to the documentation of this file.
1//======================================================================================================================
2//
3// This file is part of waLBerla. waLBerla is free software: you can
4// redistribute it and/or modify it under the terms of the GNU General Public
5// License as published by the Free Software Foundation, either version 3 of
6// the License, or (at your option) any later version.
7//
8// waLBerla is distributed in the hope that it will be useful, but WITHOUT
9// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11// for more details.
12//
13// You should have received a copy of the GNU General Public License along
14// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
15//
16//! \file StreamSweepSinglePrecisionCUDA.cu
17//! \author pystencils
18//======================================================================================================================
19
20// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 0c8b4b926c6979288fd8a6846d02ec0870e1fe41
21
22#include <cmath>
23
25#include "core/DataTypes.h"
26#include "core/Macros.h"
27
28#define FUNC_PREFIX __global__
29
30#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
31#pragma GCC diagnostic push
32#pragma GCC diagnostic ignored "-Wfloat-equal"
33#pragma GCC diagnostic ignored "-Wshadow"
34#pragma GCC diagnostic ignored "-Wconversion"
35#pragma GCC diagnostic ignored "-Wunused-variable"
36#endif
37
38#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
39#pragma warning push
40#pragma warning(disable : 1599)
41#endif
42
43using namespace std;
44
45namespace walberla {
46namespace pystencils {
47
namespace internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda {
/**
 * Streaming step combined with a velocity update; one thread handles one cell.
 *
 * Launch layout: 3D grid of 3D blocks, at most 256 threads per block
 * (__launch_bounds__). No shared memory and no block-level synchronisation
 * is used. Thread (i, j, k) of the launch handles cell (i+1, j+1, k+1)
 * relative to the base pointers; a thread does work only if its cell index is
 * smaller than `_size_force_d - 1` along every axis d, so the first and the
 * last layer of the addressed box are only ever read, never written.
 *
 * Per cell the kernel
 *   1. reads 19 populations from `_data_pdfs`, population q being taken from
 *      the neighbouring cell listed in the table below (the offsets match a
 *      D3Q19 pull pattern),
 *   2. sums them into a density and three momentum-density components,
 *   3. writes `momentum / density + 0.5 * force / density` to `_data_velocity`,
 *   4. writes the 19 populations, unchanged, to the cell itself in
 *      `_data_pdfs_tmp`.
 *
 * \param _data_force     force field, 3 components per cell (read only)
 * \param _data_pdfs      source populations, 19 per cell (read only)
 * \param _data_pdfs_tmp  destination populations, 19 per cell (written)
 * \param _data_velocity  velocity field, 3 components per cell (written)
 * \param _size_force_0   extent of the addressed box along x, y, z
 *                        (`_size_force_1`, `_size_force_2` likewise)
 * \param _stride_*_0..2  element strides along x, y, z of the respective field
 * \param _stride_*_3     element stride between components of the respective field
 *
 * All four pointers are RESTRICT-qualified (macro defined outside this file,
 * presumably `__restrict__`), i.e. the buffers are expected not to overlap.
 */
static FUNC_PREFIX __launch_bounds__(256) void streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda(
    float *RESTRICT const _data_force, float *RESTRICT const _data_pdfs,
    float *RESTRICT _data_pdfs_tmp, float *RESTRICT _data_velocity,
    int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2,
    int64_t const _stride_force_0, int64_t const _stride_force_1,
    int64_t const _stride_force_2, int64_t const _stride_force_3,
    int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1,
    int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3,
    int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1,
    int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3,
    int64_t const _stride_velocity_0, int64_t const _stride_velocity_1,
    int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
  // Cell handled by this thread; the +1 skips the first layer behind the
  // base pointers.
  const int64_t x = blockDim.x * blockIdx.x + threadIdx.x + 1;
  const int64_t y = blockDim.y * blockIdx.y + threadIdx.y + 1;
  const int64_t z = blockDim.z * blockIdx.z + threadIdx.z + 1;

  // Surplus threads of the last block along any axis, and the last layer of
  // the box, are left alone.
  if (x >= _size_force_0 - 1 || y >= _size_force_1 - 1 || z >= _size_force_2 - 1) {
    return;
  }

  // Reads population q of the cell displaced by (dx, dy, dz) from this one.
  const auto pull = [&](const int64_t dx, const int64_t dy, const int64_t dz,
                        const int64_t q) -> float {
    return _data_pdfs[_stride_pdfs_0 * (x + dx) + _stride_pdfs_1 * (y + dy) +
                      _stride_pdfs_2 * (z + dz) + _stride_pdfs_3 * q];
  };

  // Populations arriving at this cell, indexed by population number.
  //                  dx  dy  dz   q
  const float f[19] = {
      pull(0, 0, 0, 0),
      pull(0, -1, 0, 1),
      pull(0, 1, 0, 2),
      pull(1, 0, 0, 3),
      pull(-1, 0, 0, 4),
      pull(0, 0, -1, 5),
      pull(0, 0, 1, 6),
      pull(1, -1, 0, 7),
      pull(-1, -1, 0, 8),
      pull(1, 1, 0, 9),
      pull(-1, 1, 0, 10),
      pull(0, -1, -1, 11),
      pull(0, 1, -1, 12),
      pull(1, 0, -1, 13),
      pull(-1, 0, -1, 14),
      pull(0, -1, 1, 15),
      pull(0, 1, 1, 16),
      pull(1, 0, 1, 17),
      pull(-1, 0, 1, 18),
  };

  // Partial sums shared between the density and the momentum components.
  // The order of the additions is kept fixed on purpose: floating-point
  // addition is not associative.
  const float vel0Term = f[10] + f[14] + f[18] + f[4] + f[8];
  const float mom_x = -f[13] - f[17] - f[3] - f[7] - f[9] + vel0Term;
  const float vel1Term = f[1] + f[11] + f[15] + f[7];
  const float mom_y = -f[10] - f[12] - f[16] - f[2] + f[8] - f[9] + vel1Term;
  const float vel2Term = f[12] + f[13] + f[5];
  const float rho = f[0] + f[16] + f[17] + f[2] + f[3] + f[6] + f[9] + vel0Term + vel1Term + vel2Term;
  const float mom_z = f[11] + f[14] - f[15] - f[16] - f[17] - f[18] - f[6] + vel2Term;

  // Velocity = momentum density / density, plus half of the force / density.
  const float inv_rho = (1.0f) / (rho);
  const int64_t force_cell = _stride_force_0 * x + _stride_force_1 * y + _stride_force_2 * z;
  const float u_x = mom_x * inv_rho + 0.5f * inv_rho * _data_force[force_cell];
  const float u_y = mom_y * inv_rho + 0.5f * inv_rho * _data_force[force_cell + _stride_force_3];
  const float u_z = mom_z * inv_rho + 0.5f * inv_rho * _data_force[force_cell + 2 * _stride_force_3];

  const int64_t velocity_cell = _stride_velocity_0 * x + _stride_velocity_1 * y + _stride_velocity_2 * z;
  _data_velocity[velocity_cell] = u_x;
  _data_velocity[velocity_cell + _stride_velocity_3] = u_y;
  _data_velocity[velocity_cell + 2 * _stride_velocity_3] = u_z;

  // Store every population at this cell of the destination field.
  const int64_t tmp_cell = _stride_pdfs_tmp_0 * x + _stride_pdfs_tmp_1 * y + _stride_pdfs_tmp_2 * z;
#pragma unroll
  for (int64_t q = 0; q < 19; ++q) {
    _data_pdfs_tmp[tmp_cell + q * _stride_pdfs_tmp_3] = f[q];
  }
}
} // namespace internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda
152
/**
 * Applies the stream sweep to every interior cell of one block.
 *
 * The kernel receives base pointers to cell (-1, -1, -1) of each field and
 * extents of "interior size + 2", so that its index range 1 .. size-2 covers
 * exactly the interior cells while the surrounding layer stays readable for
 * the neighbour accesses. The launch is enqueued on \p stream without any
 * synchronisation; afterwards the data pointers of the pdf field and of its
 * temporary twin are swapped.
 *
 * NOTE(review): the line carrying this function's signature was missing from
 * the listing this block was recovered from. Class name and parameter types
 * follow runOnCellInterval() further down, parameter names follow their use
 * in the body; the member name `run` should be confirmed against the class
 * declaration.
 *
 * \param block   block whose velocity, force and pdf fields are processed
 * \param stream  GPU stream on which the kernel is launched
 */
void StreamSweepSinglePrecisionCUDA::run(IBlock *block, gpuStream_t stream) {
  auto velocity = block->getData<gpu::GPUField<float>>(velocityID);
  auto force = block->getData<gpu::GPUField<float>>(forceID);
  auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);

  // Extents handed to the kernel: interior size plus one layer on each side.
  WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 2))
  const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 2);
  WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 2))
  const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 2);
  WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 2))
  const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 2);

  // Number of cells that actually get updated along each axis.
  const int64_t nx = _size_force_0 - 2;
  const int64_t ny = _size_force_1 - 2;
  const int64_t nz = _size_force_2 - 2;

  // Nothing to update. Without this guard the launch-configuration arithmetic
  // below would divide (or take a remainder) by zero; runOnCellInterval()
  // protects itself against the same situation with its ci.empty() check.
  if (nx <= 0 || ny <= 0 || nz <= 0) {
    return;
  }

  gpu::GPUField<float> *pdfs_tmp;
  {
    // Getting temporary field pdfs_tmp
    auto it = cache_pdfs_.find(pdfs);
    if (it != cache_pdfs_.end()) {
      pdfs_tmp = *it;
    } else {
      pdfs_tmp = pdfs->cloneUninitialized();
      cache_pdfs_.insert(pdfs_tmp);
    }
  }

  // Base pointers: cell (-1, -1, -1), component 0 of every field.
  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()))
  WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
  float *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()))
  WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
  float *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()))
  WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
  float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
  WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()))
  WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
  float *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);

  // Element strides of every field along x, y, z and between components.
  const int64_t _stride_force_0 = int64_t(force->xStride());
  const int64_t _stride_force_1 = int64_t(force->yStride());
  const int64_t _stride_force_2 = int64_t(force->zStride());
  const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
  const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
  const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
  const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
  const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
  const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
  const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
  const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
  const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
  const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
  const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
  const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
  const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));

  // Launch configuration. The block is filled x first (up to 16 threads),
  // then y, then z, never exceeding 16 * 16 = 256 threads in total, which is
  // the kernel's __launch_bounds__. The grid is the per-axis ceiling of
  // cells / block extent.
  const auto smaller = [](const int64_t a, const int64_t b) { return (a < b) ? a : b; };
  const auto blocks_for = [](const int64_t cells, const int64_t per_block) {
    return (cells % per_block == 0) ? cells / per_block : cells / per_block + 1;
  };

  const int64_t block_x = smaller(16, nx);
  const int64_t block_y_raw = smaller(ny, 16 * ((int64_t)(16) / block_x));
  const int64_t block_y = smaller(1024, block_y_raw);
  const int64_t block_z = smaller(64, smaller(nz, (int64_t)(256) / (block_x * block_y_raw)));

  dim3 _block(uint32_t(block_x), uint32_t(block_y), uint32_t(block_z));
  dim3 _grid(uint32_t(blocks_for(nx, block_x)), uint32_t(blocks_for(ny, block_y)), uint32_t(blocks_for(nz, block_z)));

  internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda::streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda<<<_grid, _block, 0, stream>>>(
      _data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity,
      _size_force_0, _size_force_1, _size_force_2,
      _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3,
      _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3,
      _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3,
      _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);

  // The freshly written populations become the current ones.
  pdfs->swapDataPointers(pdfs_tmp);
}
211
212void StreamSweepSinglePrecisionCUDA::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream) {
213 CellInterval ci = globalCellInterval;
214 CellInterval blockBB = blocks->getBlockCellBB(*block);
215 blockBB.expand(ghostLayers);
216 ci.intersect(blockBB);
217 blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
218 if (ci.empty())
219 return;
220
221 auto velocity = block->getData<gpu::GPUField<float>>(velocityID);
222 auto force = block->getData<gpu::GPUField<float>>(forceID);
223 auto pdfs = block->getData<gpu::GPUField<float>>(pdfsID);
224 gpu::GPUField<float> *pdfs_tmp;
225 {
226 // Getting temporary field pdfs_tmp
227 auto it = cache_pdfs_.find(pdfs);
228 if (it != cache_pdfs_.end()) {
229 pdfs_tmp = *it;
230 } else {
231 pdfs_tmp = pdfs->cloneUninitialized();
232 cache_pdfs_.insert(pdfs_tmp);
233 }
234 }
235
236 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()))
237 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()))
238 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()))
239 float *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
240 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
241 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
242 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
243 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()))
244 float *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
245 WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
246 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
247 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
248 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()))
249 float *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
250 WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx)
251 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()))
252 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()))
253 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()))
254 float *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
255 WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx)
256 WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 2))
257 const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 2);
258 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
259 WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 2))
260 const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 2);
261 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
262 WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 2))
263 const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 2);
264 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
265 const int64_t _stride_force_0 = int64_t(force->xStride());
266 const int64_t _stride_force_1 = int64_t(force->yStride());
267 const int64_t _stride_force_2 = int64_t(force->zStride());
268 const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
269 const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
270 const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
271 const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
272 const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
273 const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
274 const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
275 const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
276 const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
277 const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
278 const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
279 const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
280 const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
281 dim3 _block(uint32_t(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)), uint32_t(((1024 < ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))), uint32_t(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))))));
282 dim3 _grid(uint32_t(((_size_force_0 - 2) % (((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)) == 0 ? (int64_t)(_size_force_0 - 2) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)) : ((int64_t)(_size_force_0 - 2) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))) + 1)), uint32_t(((_size_force_1 - 2) % (((1024 < ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))) == 0 ? (int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))) : ((int64_t)(_size_force_1 - 2) / (int64_t)(((1024 < ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))) ? 1024 : ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) + 1)), uint32_t(((_size_force_2 - 2) % (((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 
16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))))) == 0 ? (int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))))) ? 
64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))))) : ((int64_t)(_size_force_2 - 2) / (int64_t)(((64 < ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))))))) ? 64 : ((_size_force_2 - 2 < ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2))))))) ? _size_force_2 - 2 : ((int64_t)(256) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2) * ((_size_force_1 - 2 < 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 16 : _size_force_0 - 2)))) ? _size_force_1 - 2 : 16 * ((int64_t)(16) / (int64_t)(((16 < _size_force_0 - 2) ? 
16 : _size_force_0 - 2)))))))))) + 1)));
283 internal_streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda::streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda<<<_grid, _block, 0, stream>>>(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_0, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
284 pdfs->swapDataPointers(pdfs_tmp);
285}
286
287} // namespace pystencils
288} // namespace walberla
289
290#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
291#pragma GCC diagnostic pop
292#endif
293
294#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
295#pragma warning pop
296#endif
#define FUNC_PREFIX
\file AdvectiveFluxKernel_double_precision.cpp \ingroup lbm \author lbmpy
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
__global__ float * force
void runOnCellInterval(const shared_ptr< StructuredBlockStorage > &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block, gpuStream_t stream=nullptr)
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:174
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_velocity_1
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_force_2
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const _size_force_2
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_tmp_0
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const _stride_force_0
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_tmp_2
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_3
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_force_3
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_tmp_1
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_tmp_3
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const _stride_force_1
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_velocity_0
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_2
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_velocity_2
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT _data_velocity
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const _size_force_1
static FUNC_PREFIX __launch_bounds__(256) void streamsweepsingleprecisioncuda_streamsweepsingleprecisioncuda(float *RESTRICT const _data_force
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_1
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const _size_force_0
static FUNC_PREFIX float *RESTRICT const float *RESTRICT float *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_pdfs_0
static Utils::Vector3d velocity(Particle const &p_ref, Particle const &p_vs)
Velocity of the virtual site.
Definition relative.cpp:64