ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
PackInfoVecSinglePrecisionCUDA.cu
Go to the documentation of this file.
1//======================================================================================================================
2//
3// This file is part of waLBerla. waLBerla is free software: you can
4// redistribute it and/or modify it under the terms of the GNU General Public
5// License as published by the Free Software Foundation, either version 3 of
6// the License, or (at your option) any later version.
7//
8// waLBerla is distributed in the hope that it will be useful, but WITHOUT
9// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11// for more details.
12//
13// You should have received a copy of the GNU General Public License along
14// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
15//
16//! \\file PackInfoVecSinglePrecisionCUDA.cpp
17//! \\author pystencils
18//======================================================================================================================
19
20// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
21
22#include "core/DataTypes.h"
23#include "core/cell/CellInterval.h"
24#include "domain_decomposition/IBlock.h"
25#include "stencil/Directions.h"
26
28
29#define FUNC_PREFIX __global__
30
31#if defined(__NVCC__)
32#define RESTRICT __restrict__
33#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
34#pragma nv_diagnostic push
35#pragma nv_diag_suppress 177 // unused variable
36#else
37#pragma push
38#pragma diag_suppress 177 // unused variable
39#endif // defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
40#elif defined(__clang__)
41#if defined(__CUDA__)
42#if defined(__CUDA_ARCH__)
43// clang compiling CUDA code in device mode
44#define RESTRICT __restrict__
45#pragma clang diagnostic push
46#pragma clang diagnostic ignored "-Wunused-variable"
47#else
48// clang compiling CUDA code in host mode
49#define RESTRICT __restrict__
50#pragma clang diagnostic push
51#pragma clang diagnostic ignored "-Wunused-variable"
52#endif // defined(__CUDA_ARCH__)
53#endif // defined(__CUDA__)
54#elif defined(__GNUC__) or defined(__GNUG__)
55#define RESTRICT __restrict__
56#pragma GCC diagnostic push
57#pragma GCC diagnostic ignored "-Wunused-variable"
58#elif defined(_MSC_VER)
59#define RESTRICT __restrict
60#else
61#define RESTRICT
62#endif
63
64namespace walberla {
65namespace pystencils {
66
67using walberla::cell::CellInterval;
68using walberla::stencil::Direction;
69
70namespace internal_pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE {
71static FUNC_PREFIX __launch_bounds__(256) void pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE(float *RESTRICT _data_buffer, float *RESTRICT const _data_field, int64_t const _size_field_0, int64_t const _size_field_1, int64_t const _size_field_2, int64_t const _stride_field_0, int64_t const _stride_field_1, int64_t const _stride_field_2, int64_t const _stride_field_3) {
72 if (blockDim.x * blockIdx.x + threadIdx.x < _size_field_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_field_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_field_2) {
73 const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x;
74 const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y;
75 const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z;
76 _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0] = _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2];
77 _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0 + 1] = _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2 + _stride_field_3];
78 _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0 + 2] = _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2 + 2 * _stride_field_3];
79 }
80}
81} // namespace internal_pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE
82
83namespace internal_unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE {
84static FUNC_PREFIX __launch_bounds__(256) void unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE(float *RESTRICT const _data_buffer, float *RESTRICT _data_field, int64_t const _size_field_0, int64_t const _size_field_1, int64_t const _size_field_2, int64_t const _stride_field_0, int64_t const _stride_field_1, int64_t const _stride_field_2, int64_t const _stride_field_3) {
85 if (blockDim.x * blockIdx.x + threadIdx.x < _size_field_0 && blockDim.y * blockIdx.y + threadIdx.y < _size_field_1 && blockDim.z * blockIdx.z + threadIdx.z < _size_field_2) {
86 const int64_t ctr_0 = blockDim.x * blockIdx.x + threadIdx.x;
87 const int64_t ctr_1 = blockDim.y * blockIdx.y + threadIdx.y;
88 const int64_t ctr_2 = blockDim.z * blockIdx.z + threadIdx.z;
89 _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2] = _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0];
90 _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2 + _stride_field_3] = _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0 + 1];
91 _data_field[_stride_field_0 * ctr_0 + _stride_field_1 * ctr_1 + _stride_field_2 * ctr_2 + 2 * _stride_field_3] = _data_buffer[3 * _size_field_0 * _size_field_1 * ctr_2 + 3 * _size_field_0 * ctr_1 + 3 * ctr_0 + 2];
92 }
93}
94} // namespace internal_unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE
95
96void PackInfoVecSinglePrecisionCUDA::pack(Direction dir, unsigned char *byte_buffer, IBlock *block, gpuStream_t stream) {
97 float *buffer = reinterpret_cast<float *>(byte_buffer);
98
99 auto field = block->getData<gpu::GPUField<float>>(fieldID);
100
101 CellInterval ci;
102 field->getSliceBeforeGhostLayer(dir, ci, 1, false);
103
104 switch (dir) {
105 case stencil::SW:
106 case stencil::BW:
107 case stencil::W:
108 case stencil::TW:
109 case stencil::NW:
110 case stencil::BS:
111 case stencil::S:
112 case stencil::TS:
113 case stencil::B:
114 case stencil::C:
115 case stencil::T:
116 case stencil::BN:
117 case stencil::N:
118 case stencil::TN:
119 case stencil::SE:
120 case stencil::BE:
121 case stencil::E:
122 case stencil::TE:
123 case stencil::NE: {
124 float *RESTRICT _data_buffer = buffer;
125 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(field->nrOfGhostLayers()))
126 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(field->nrOfGhostLayers()))
127 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(field->nrOfGhostLayers()))
128 float *RESTRICT const _data_field = field->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
129 WALBERLA_ASSERT_GREATER_EQUAL(field->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
130 const int64_t _size_field_0 = int64_t(int64_c(ci.xSize()) + 0);
131 WALBERLA_ASSERT_GREATER_EQUAL(field->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
132 const int64_t _size_field_1 = int64_t(int64_c(ci.ySize()) + 0);
133 WALBERLA_ASSERT_GREATER_EQUAL(field->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
134 const int64_t _size_field_2 = int64_t(int64_c(ci.zSize()) + 0);
135 const int64_t _stride_field_0 = int64_t(field->xStride());
136 const int64_t _stride_field_1 = int64_t(field->yStride());
137 const int64_t _stride_field_2 = int64_t(field->zStride());
138 const int64_t _stride_field_3 = int64_t(1 * int64_t(field->fStride()));
139 dim3 _block(uint32_c(((128 < _size_field_0) ? 128 : _size_field_0)), uint32_c(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))), uint32_c(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))));
140 dim3 _grid(uint32_c(((_size_field_0) % (((128 < _size_field_0) ? 128 : _size_field_0)) == 0 ? (int64_t)(_size_field_0) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)) : ((int64_t)(_size_field_0) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))) + 1)), uint32_c(((_size_field_1) % (((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))) == 0 ? (int64_t)(_size_field_1) / (int64_t)(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))) : ((int64_t)(_size_field_1) / (int64_t)(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) + 1)), uint32_c(((_size_field_2) % (((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))) == 0 ? (int64_t)(_size_field_2) / (int64_t)(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))) : ((int64_t)(_size_field_2) / (int64_t)(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))))) + 1)));
141 internal_pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE::pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_field, _size_field_0, _size_field_1, _size_field_2, _stride_field_0, _stride_field_1, _stride_field_2, _stride_field_3);
142 break;
143 }
144
145 default:
146 return;
147 }
148}
149
150void PackInfoVecSinglePrecisionCUDA::unpack(Direction dir, unsigned char *byte_buffer, IBlock *block, gpuStream_t stream) {
151 float *buffer = reinterpret_cast<float *>(byte_buffer);
152
153 auto field = block->getData<gpu::GPUField<float>>(fieldID);
154
155 CellInterval ci;
156 field->getGhostRegion(dir, ci, 1, false);
157 auto communciationDirection = stencil::inverseDir[dir];
158
159 switch (communciationDirection) {
160 case stencil::SW:
161 case stencil::BW:
162 case stencil::W:
163 case stencil::TW:
164 case stencil::NW:
165 case stencil::BS:
166 case stencil::S:
167 case stencil::TS:
168 case stencil::B:
169 case stencil::C:
170 case stencil::T:
171 case stencil::BN:
172 case stencil::N:
173 case stencil::TN:
174 case stencil::SE:
175 case stencil::BE:
176 case stencil::E:
177 case stencil::TE:
178 case stencil::NE: {
179 float *RESTRICT const _data_buffer = buffer;
180 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(field->nrOfGhostLayers()))
181 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(field->nrOfGhostLayers()))
182 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(field->nrOfGhostLayers()))
183 float *RESTRICT _data_field = field->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
184 WALBERLA_ASSERT_GREATER_EQUAL(field->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
185 const int64_t _size_field_0 = int64_t(int64_c(ci.xSize()) + 0);
186 WALBERLA_ASSERT_GREATER_EQUAL(field->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
187 const int64_t _size_field_1 = int64_t(int64_c(ci.ySize()) + 0);
188 WALBERLA_ASSERT_GREATER_EQUAL(field->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
189 const int64_t _size_field_2 = int64_t(int64_c(ci.zSize()) + 0);
190 const int64_t _stride_field_0 = int64_t(field->xStride());
191 const int64_t _stride_field_1 = int64_t(field->yStride());
192 const int64_t _stride_field_2 = int64_t(field->zStride());
193 const int64_t _stride_field_3 = int64_t(1 * int64_t(field->fStride()));
194 dim3 _block(uint32_c(((128 < _size_field_0) ? 128 : _size_field_0)), uint32_c(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))), uint32_c(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))));
195 dim3 _grid(uint32_c(((_size_field_0) % (((128 < _size_field_0) ? 128 : _size_field_0)) == 0 ? (int64_t)(_size_field_0) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)) : ((int64_t)(_size_field_0) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))) + 1)), uint32_c(((_size_field_1) % (((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))) == 0 ? (int64_t)(_size_field_1) / (int64_t)(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))) : ((int64_t)(_size_field_1) / (int64_t)(((1024 < ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))) ? 1024 : ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) + 1)), uint32_c(((_size_field_2) % (((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))) == 0 ? (int64_t)(_size_field_2) / (int64_t)(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))))) : ((int64_t)(_size_field_2) / (int64_t)(((64 < ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))) ? 64 : ((_size_field_2 < ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0))))))) ? _size_field_2 : ((int64_t)(256) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0) * ((_size_field_1 < 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))) ? _size_field_1 : 2 * ((int64_t)(128) / (int64_t)(((128 < _size_field_0) ? 128 : _size_field_0)))))))))) + 1)));
196 internal_unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE::unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_field, _size_field_0, _size_field_1, _size_field_2, _stride_field_0, _stride_field_1, _stride_field_2, _stride_field_3);
197 break;
198 }
199
200 default:
201 return;
202 }
203}
204
205uint_t PackInfoVecSinglePrecisionCUDA::size(stencil::Direction dir, IBlock *block) {
206 auto field = block->getData<gpu::GPUField<float>>(fieldID);
207
208 CellInterval ci;
209 field->getGhostRegion(dir, ci, 1, false);
210
211 uint_t elementsPerCell = 0;
212
213 switch (dir) {
214 case stencil::SW:
215 case stencil::BW:
216 case stencil::W:
217 case stencil::TW:
218 case stencil::NW:
219 case stencil::BS:
220 case stencil::S:
221 case stencil::TS:
222 case stencil::B:
223 case stencil::C:
224 case stencil::T:
225 case stencil::BN:
226 case stencil::N:
227 case stencil::TN:
228 case stencil::SE:
229 case stencil::BE:
230 case stencil::E:
231 case stencil::TE:
232 case stencil::NE:
233 elementsPerCell = 3;
234 break;
235
236 default:
237 elementsPerCell = 0;
238 }
239 return ci.numCells() * elementsPerCell * sizeof(float);
240}
241
242} // namespace pystencils
243} // namespace walberla
#define FUNC_PREFIX
\file AdvectiveFluxKernel_double_precision.cpp \author pystencils
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
void unpack(stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream) override
uint_t size(stencil::Direction dir, IBlock *block) override
void pack(stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream) override
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:172
static FUNC_PREFIX double *RESTRICT const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_field_1
static FUNC_PREFIX double *RESTRICT const int64_t const int64_t const int64_t const _size_field_2
static FUNC_PREFIX __launch_bounds__(256) void pack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE(double *RESTRICT _data_buffer
static FUNC_PREFIX double *RESTRICT const int64_t const int64_t const _size_field_1
static FUNC_PREFIX double *RESTRICT const int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_field_2
static FUNC_PREFIX double *RESTRICT const int64_t const int64_t const int64_t const int64_t const _stride_field_0
static FUNC_PREFIX double *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const int64_t const _stride_field_2
static FUNC_PREFIX __launch_bounds__(256) void unpack_SW_BW_W_TW_NW_BS_S_TS_B_C_T_BN_N_TN_SE_BE_E_TE_NE(double *RESTRICT const _data_buffer
static FUNC_PREFIX double *RESTRICT int64_t const int64_t const int64_t const int64_t const _stride_field_0
static FUNC_PREFIX double *RESTRICT int64_t const int64_t const int64_t const int64_t const int64_t const _stride_field_1
static FUNC_PREFIX double *RESTRICT int64_t const int64_t const int64_t const _size_field_2
\file PackInfoPdfDoublePrecision.cpp \author pystencils