FieldAccessorsSinglePrecisionCUDA.cu
1/*
2 * Copyright (C) 2023-2024 The ESPResSo project
3 * Copyright (C) 2020 The waLBerla project
4 *
5 * This file is part of ESPResSo.
6 *
7 * ESPResSo is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * ESPResSo is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21// kernel generated with pystencils v1.3.7, lbmpy v1.3.7, sympy v1.12.1, lbmpy_walberla/pystencils_walberla from waLBerla commit f36fa0a68bae59f0b516f6587ea8fa7c24a41141
22
23/**
24 * @file
25 * Lattice field accessors.
26 * Adapted from the waLBerla source file
27 * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
28 */
29
30#include <core/DataTypes.h>
31#include <core/cell/Cell.h>
32#include <core/cell/CellInterval.h>
33#include <core/math/Matrix3.h>
34#include <core/math/Vector3.h>
35
36#include <field/iterators/IteratorMacros.h>
37
38#include <gpu/FieldAccessor.h>
39#include <gpu/FieldIndexing.h>
40#include <gpu/GPUField.h>
41#include <gpu/Kernel.h>
42
43#include <thrust/device_ptr.h>
44#include <thrust/device_vector.h>
45
46#include <array>
47#include <vector>
48
49#if defined(__NVCC__)
50#define RESTRICT __restrict__
51#pragma nv_diagnostic push
52#pragma nv_diag_suppress 177 // unused variable
53#elif defined(__clang__)
54#if defined(__CUDA__)
55#if defined(__CUDA_ARCH__)
56// clang compiling CUDA code in device mode
57#define RESTRICT __restrict__
58#pragma clang diagnostic push
59#pragma clang diagnostic ignored "-Wunused-variable"
60#else
61// clang compiling CUDA code in host mode
62#define RESTRICT __restrict__
63#pragma clang diagnostic push
64#pragma clang diagnostic ignored "-Wunused-variable"
65#endif
66#endif
67#elif defined(__GNUC__) or defined(__GNUG__)
68#define RESTRICT __restrict__
69#pragma GCC diagnostic push
70#pragma GCC diagnostic ignored "-Wunused-variable"
71#elif defined(_MSC_VER)
72#define RESTRICT __restrict
73#else
74#define RESTRICT
75#endif
76
77/** @brief Get linear index of flattened data with original layout @c fzyx. */
78static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset) {
79 auto const x = threadIdx.x;
80 auto const y = blockIdx.x;
81 auto const z = blockIdx.y;
82 auto const f = blockIdx.z;
83 auto const ySize = gridDim.x;
84 auto const zSize = gridDim.y;
85 auto const fSize = fOffset;
86 return f +
87 z * fSize +
88 y * fSize * zSize +
89 x * fSize * zSize * ySize;
90}
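// Note (illustrative, assuming the launch configuration set up by gpu::make_kernel):
// threadIdx.x indexes x, blockIdx.x indexes y, blockIdx.y indexes z and blockIdx.z
// indexes f, so the returned offset walks f fastest, matching the fzyx layout.
// For example, with fOffset = 19 and a thread at (x, y, z, f) = (2, 1, 0, 3), the
// offset is 3 + 0 * 19 + 1 * 19 * gridDim.y + 2 * 19 * gridDim.y * gridDim.x.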
91
92namespace walberla {
93namespace lbm {
94namespace accessor {
95
96namespace Population {
97// LCOV_EXCL_START
98__global__ void kernel_get(
99 gpu::FieldAccessor<float> pdf,
100 float *RESTRICT pop) {
101 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
102 pdf.set(blockIdx, threadIdx);
103 pop += offset;
104 if (pdf.isValidPosition()) {
105 pop[0u] = pdf.get(0u);
106 pop[1u] = pdf.get(1u);
107 pop[2u] = pdf.get(2u);
108 pop[3u] = pdf.get(3u);
109 pop[4u] = pdf.get(4u);
110 pop[5u] = pdf.get(5u);
111 pop[6u] = pdf.get(6u);
112 pop[7u] = pdf.get(7u);
113 pop[8u] = pdf.get(8u);
114 pop[9u] = pdf.get(9u);
115 pop[10u] = pdf.get(10u);
116 pop[11u] = pdf.get(11u);
117 pop[12u] = pdf.get(12u);
118 pop[13u] = pdf.get(13u);
119 pop[14u] = pdf.get(14u);
120 pop[15u] = pdf.get(15u);
121 pop[16u] = pdf.get(16u);
122 pop[17u] = pdf.get(17u);
123 pop[18u] = pdf.get(18u);
124 }
125}
126
127__global__ void kernel_set(
128 gpu::FieldAccessor<float> pdf,
129 float const *RESTRICT pop) {
130 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
131 pdf.set(blockIdx, threadIdx);
132 pop += offset;
133 if (pdf.isValidPosition()) {
134 pdf.get(0u) = pop[0u];
135 pdf.get(1u) = pop[1u];
136 pdf.get(2u) = pop[2u];
137 pdf.get(3u) = pop[3u];
138 pdf.get(4u) = pop[4u];
139 pdf.get(5u) = pop[5u];
140 pdf.get(6u) = pop[6u];
141 pdf.get(7u) = pop[7u];
142 pdf.get(8u) = pop[8u];
143 pdf.get(9u) = pop[9u];
144 pdf.get(10u) = pop[10u];
145 pdf.get(11u) = pop[11u];
146 pdf.get(12u) = pop[12u];
147 pdf.get(13u) = pop[13u];
148 pdf.get(14u) = pop[14u];
149 pdf.get(15u) = pop[15u];
150 pdf.get(16u) = pop[16u];
151 pdf.get(17u) = pop[17u];
152 pdf.get(18u) = pop[18u];
153 }
154}
155
156__global__ void kernel_broadcast(
157 gpu::FieldAccessor<float> pdf,
158 float const *RESTRICT pop) {
159 pdf.set(blockIdx, threadIdx);
160 if (pdf.isValidPosition()) {
161 pdf.get(0u) = pop[0u];
162 pdf.get(1u) = pop[1u];
163 pdf.get(2u) = pop[2u];
164 pdf.get(3u) = pop[3u];
165 pdf.get(4u) = pop[4u];
166 pdf.get(5u) = pop[5u];
167 pdf.get(6u) = pop[6u];
168 pdf.get(7u) = pop[7u];
169 pdf.get(8u) = pop[8u];
170 pdf.get(9u) = pop[9u];
171 pdf.get(10u) = pop[10u];
172 pdf.get(11u) = pop[11u];
173 pdf.get(12u) = pop[12u];
174 pdf.get(13u) = pop[13u];
175 pdf.get(14u) = pop[14u];
176 pdf.get(15u) = pop[15u];
177 pdf.get(16u) = pop[16u];
178 pdf.get(17u) = pop[17u];
179 pdf.get(18u) = pop[18u];
180 }
181}
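// Note: unlike kernel_set, kernel_broadcast does not advance pop by a per-cell
// offset; every thread reads the same 19 values, so the host-side initialize()
// below can fill the whole field (including ghost layers) from one population set.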
182
183__global__ void kernel_set_vel(
184 gpu::FieldAccessor<float> pdf,
185 gpu::FieldAccessor<float> velocity,
186 gpu::FieldAccessor<float> force,
187 float const *RESTRICT pop) {
188 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
189 pdf.set(blockIdx, threadIdx);
190 velocity.set(blockIdx, threadIdx);
191 force.set(blockIdx, threadIdx);
192 pop += offset;
193 if (pdf.isValidPosition()) {
194 const float f_0 = pdf.get(0u) = pop[0u];
195 const float f_1 = pdf.get(1u) = pop[1u];
196 const float f_2 = pdf.get(2u) = pop[2u];
197 const float f_3 = pdf.get(3u) = pop[3u];
198 const float f_4 = pdf.get(4u) = pop[4u];
199 const float f_5 = pdf.get(5u) = pop[5u];
200 const float f_6 = pdf.get(6u) = pop[6u];
201 const float f_7 = pdf.get(7u) = pop[7u];
202 const float f_8 = pdf.get(8u) = pop[8u];
203 const float f_9 = pdf.get(9u) = pop[9u];
204 const float f_10 = pdf.get(10u) = pop[10u];
205 const float f_11 = pdf.get(11u) = pop[11u];
206 const float f_12 = pdf.get(12u) = pop[12u];
207 const float f_13 = pdf.get(13u) = pop[13u];
208 const float f_14 = pdf.get(14u) = pop[14u];
209 const float f_15 = pdf.get(15u) = pop[15u];
210 const float f_16 = pdf.get(16u) = pop[16u];
211 const float f_17 = pdf.get(17u) = pop[17u];
212 const float f_18 = pdf.get(18u) = pop[18u];
213 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
214 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
215 const float vel1Term = f_1 + f_11 + f_15 + f_7;
216 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
217 const float vel2Term = f_12 + f_13 + f_5;
218 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
219 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
220 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
221 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
222 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
223 const float rho_inv = float{1} / rho;
224 velocity.get(0u) = md_0 * rho_inv;
225 velocity.get(1u) = md_1 * rho_inv;
226 velocity.get(2u) = md_2 * rho_inv;
227 }
228}
229// LCOV_EXCL_STOP
230
231std::array<float, 19u> get(
232 gpu::GPUField<float> const *pdf_field,
233 Cell const &cell) {
234 CellInterval ci(cell, cell);
235 thrust::device_vector<float> dev_data(19u);
236 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
237 auto kernel = gpu::make_kernel(kernel_get);
238 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
239 kernel.addParam(dev_data_ptr);
240 kernel();
241 std::array<float, 19u> pop;
242 thrust::copy(dev_data.begin(), dev_data.end(), pop.data());
243 return pop;
244}
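// Usage sketch (illustrative only; assumes a valid pdf_field pointer, e.g. obtained
// from a waLBerla block): read, rescale and write back the populations of one cell.
//
//   auto pop = Population::get(pdf_field, Cell{0, 0, 0});
//   for (auto &p : pop) {
//     p *= 2.0f;
//   }
//   Population::set(pdf_field, pop, Cell{0, 0, 0});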
245
246void set(
247 gpu::GPUField<float> *pdf_field,
248 std::array<float, 19u> const &pop,
249 Cell const &cell) {
250 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
251 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
252 CellInterval ci(cell, cell);
253 auto kernel = gpu::make_kernel(kernel_set);
254 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
255 kernel.addParam(const_cast<const float *>(dev_data_ptr));
256 kernel();
257}
258
259void set(
260 gpu::GPUField<float> *pdf_field,
261 gpu::GPUField<float> *velocity_field,
262 gpu::GPUField<float> const *force_field,
263 std::array<float, 19u> const &pop,
264 Cell const &cell) {
265 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
266 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
267 CellInterval ci(cell, cell);
268 auto kernel = gpu::make_kernel(kernel_set_vel);
269 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
270 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
271 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
272 kernel.addParam(const_cast<const float *>(dev_data_ptr));
273 kernel();
274}
275
276void initialize(
277 gpu::GPUField<float> *pdf_field,
278 std::array<float, 19u> const &pop) {
279 CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
280 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
281 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
282 auto kernel = gpu::make_kernel(kernel_broadcast);
283 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
284 kernel.addParam(const_cast<const float *>(dev_data_ptr));
285 kernel();
286}
287
288std::vector<float> get(
289 gpu::GPUField<float> const *pdf_field,
290 CellInterval const &ci) {
291 thrust::device_vector<float> dev_data(ci.numCells() * 19u);
292 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
293 auto kernel = gpu::make_kernel(kernel_get);
294 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
295 kernel.addParam(dev_data_ptr);
296 kernel();
297 std::vector<float> out(ci.numCells() * 19u);
298 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
299 return out;
300}
301
302void set(
303 gpu::GPUField<float> *pdf_field,
304 std::vector<float> const &values,
305 CellInterval const &ci) {
306 thrust::device_vector<float> dev_data(values.begin(), values.end());
307 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
308 auto kernel = gpu::make_kernel(kernel_set);
309 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
310 kernel.addParam(const_cast<const float *>(dev_data_ptr));
311 kernel();
312}
313
314void set(
315 gpu::GPUField<float> *pdf_field,
316 gpu::GPUField<float> *velocity_field,
317 gpu::GPUField<float> const *force_field,
318 std::vector<float> const &values,
319 CellInterval const &ci) {
320 thrust::device_vector<float> dev_data(values.begin(), values.end());
321 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
322 auto kernel = gpu::make_kernel(kernel_set_vel);
323 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
324 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
325 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
326 kernel.addParam(const_cast<const float *>(dev_data_ptr));
327 kernel();
328}
329} // namespace Population
330
331namespace Vector {
332// LCOV_EXCL_START
333__global__ void kernel_get(
334 gpu::FieldAccessor<float> vec,
335 float *u_out) {
336 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
337 vec.set(blockIdx, threadIdx);
338 u_out += offset;
339 if (vec.isValidPosition()) {
340 u_out[0u] = vec.get(0u);
341 u_out[1u] = vec.get(1u);
342 u_out[2u] = vec.get(2u);
343 }
344}
345
346__global__ void kernel_set(
347 gpu::FieldAccessor<float> vec,
348 float const *RESTRICT u_in) {
349 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
350 vec.set(blockIdx, threadIdx);
351 u_in += offset;
352 if (vec.isValidPosition()) {
353 vec.get(0u) = u_in[0u];
354 vec.get(1u) = u_in[1u];
355 vec.get(2u) = u_in[2u];
356 }
357}
358
359__global__ void kernel_broadcast(
360 gpu::FieldAccessor<float> vec,
361 float const *RESTRICT u_in) {
362 vec.set(blockIdx, threadIdx);
363 if (vec.isValidPosition()) {
364 vec.get(0u) = u_in[0u];
365 vec.get(1u) = u_in[1u];
366 vec.get(2u) = u_in[2u];
367 }
368}
369
370__global__ void kernel_add(
371 gpu::FieldAccessor<float> vec,
372 float const *RESTRICT u_in) {
373 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
374 vec.set(blockIdx, threadIdx);
375 u_in += offset;
376 if (vec.isValidPosition()) {
377 vec.get(0u) += u_in[0u];
378 vec.get(1u) += u_in[1u];
379 vec.get(2u) += u_in[2u];
380 }
381}
382
383__global__ void kernel_broadcast_add(
384 gpu::FieldAccessor<float> vec,
385 float const *RESTRICT u_in) {
386 vec.set(blockIdx, threadIdx);
387 if (vec.isValidPosition()) {
388 vec.get(0u) += u_in[0u];
389 vec.get(1u) += u_in[1u];
390 vec.get(2u) += u_in[2u];
391 }
392}
393// LCOV_EXCL_STOP
394
395Vector3<float> get(
396 gpu::GPUField<float> const *vec_field,
397 Cell const &cell) {
398 CellInterval ci(cell, cell);
399 thrust::device_vector<float> dev_data(3u);
400 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
401 auto kernel = gpu::make_kernel(kernel_get);
402 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
403 kernel.addParam(dev_data_ptr);
404 kernel();
405 Vector3<float> vec;
406 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
407 return vec;
408}
409
410void set(
411 gpu::GPUField<float> *vec_field,
412 Vector3<float> const &vec,
413 Cell const &cell) {
414 CellInterval ci(cell, cell);
415 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
416 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
417 auto kernel = gpu::make_kernel(kernel_set);
418 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
419 kernel.addParam(const_cast<const float *>(dev_data_ptr));
420 kernel();
421}
422
423void add(
424 gpu::GPUField<float> *vec_field,
425 Vector3<float> const &vec,
426 Cell const &cell) {
427 CellInterval ci(cell, cell);
428 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
429 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
430 auto kernel = gpu::make_kernel(kernel_add);
431 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
432 kernel.addParam(const_cast<const float *>(dev_data_ptr));
433 kernel();
434}
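// Usage sketch (illustrative only; force_field is a hypothetical 3-component
// vector field): accumulate an external force density on a single cell.
//
//   Vector::add(force_field, Vector3<float>{0.0f, 0.0f, 1e-3f}, Cell{4, 2, 7});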
435
436void initialize(
437 gpu::GPUField<float> *vec_field,
438 Vector3<float> const &vec) {
439 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
440 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
441 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
442 auto kernel = gpu::make_kernel(kernel_broadcast);
443 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
444 kernel.addParam(const_cast<const float *>(dev_data_ptr));
445 kernel();
446}
447
448void add_to_all(
449 gpu::GPUField<float> *vec_field,
450 Vector3<float> const &vec) {
451 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
452 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
453 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
454 auto kernel = gpu::make_kernel(kernel_broadcast_add);
455 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
456 kernel.addParam(const_cast<const float *>(dev_data_ptr));
457 kernel();
458}
459
460std::vector<float> get(
461 gpu::GPUField<float> const *vec_field,
462 CellInterval const &ci) {
463 thrust::device_vector<float> dev_data(ci.numCells() * 3u);
464 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
465 auto kernel = gpu::make_kernel(kernel_get);
466 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
467 kernel.addParam(dev_data_ptr);
468 kernel();
469 std::vector<float> out(ci.numCells() * 3u);
470 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
471 return out;
472}
473
474void set(
475 gpu::GPUField<float> *vec_field,
476 std::vector<float> const &values,
477 CellInterval const &ci) {
478 thrust::device_vector<float> dev_data(values.begin(), values.end());
479 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
480 auto kernel = gpu::make_kernel(kernel_set);
481 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
482 kernel.addParam(const_cast<const float *>(dev_data_ptr));
483 kernel();
484}
485} // namespace Vector
486
487namespace Interpolation {
488// LCOV_EXCL_START
489/** @brief Calculate interpolation weights. */
490static __forceinline__ __device__ void calculate_weights(
491 float const *RESTRICT const pos,
492 int *RESTRICT const corner,
493 float *RESTRICT const weights,
494 uint gl) {
495#pragma unroll
496 for (int dim = 0; dim < 3; ++dim) {
497 auto const fractional_index = pos[dim] - float{0.5};
498 auto const nmp = floorf(fractional_index);
499 auto const distance = fractional_index - nmp - float{0.5};
500 corner[dim] = __float2int_rn(nmp) + static_cast<int>(gl);
501 weights[dim * 2 + 0] = float{0.5} - distance;
502 weights[dim * 2 + 1] = float{0.5} + distance;
503 }
504}
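// Worked example (illustrative): for pos[dim] = 1.2 and gl = 1 the loop computes
// fractional_index = 0.7, nmp = 0 and distance = 0.2, giving corner[dim] = 1 and
// weights 0.5 - 0.2 = 0.3 (lower neighbor) and 0.5 + 0.2 = 0.7 (upper neighbor),
// i.e. the per-direction factors of a trilinear interpolation stencil.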
505
506__global__ void kernel_get(
507 gpu::FieldAccessor<float> vec,
508 float const *RESTRICT const pos,
509 float *RESTRICT const vel,
510 uint n_pos,
511 uint gl) {
512
513 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
514 blockDim.x * blockIdx.x + threadIdx.x;
515
516 vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
517 if (vec.isValidPosition() and pos_index < n_pos) {
518 auto const array_offset = pos_index * uint(3u);
519 int corner[3];
520 float weights[3][2];
521 calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
522#pragma unroll
523 for (int i = 0; i < 2; i++) {
524 auto const cx = corner[0] + i;
525 auto const wx = weights[0][i];
526#pragma unroll
527 for (int j = 0; j < 2; j++) {
528 auto const cy = corner[1] + j;
529 auto const wxy = wx * weights[1][j];
530#pragma unroll
531 for (int k = 0; k < 2; k++) {
532 auto const cz = corner[2] + k;
533 auto const weight = wxy * weights[2][k];
534 vel[array_offset + 0u] += weight * vec.getNeighbor(cx, cy, cz, 0u);
535 vel[array_offset + 1u] += weight * vec.getNeighbor(cx, cy, cz, 1u);
536 vel[array_offset + 2u] += weight * vec.getNeighbor(cx, cy, cz, 2u);
537 }
538 }
539 }
540 }
541}
542
543__global__ void kernel_set(
544 gpu::FieldAccessor<float> vec,
545 float const *RESTRICT const pos,
546 float const *RESTRICT const forces,
547 uint n_pos,
548 uint gl) {
549
550 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
551 blockDim.x * blockIdx.x + threadIdx.x;
552
553 vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
554 if (vec.isValidPosition() and pos_index < n_pos) {
555 auto const array_offset = pos_index * uint(3u);
556 int corner[3];
557 float weights[3][2];
558 calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
559#pragma unroll
560 for (int i = 0; i < 2; i++) {
561 auto const cx = corner[0] + i;
562 auto const wx = weights[0][i];
563#pragma unroll
564 for (int j = 0; j < 2; j++) {
565 auto const cy = corner[1] + j;
566 auto const wxy = wx * weights[1][j];
567#pragma unroll
568 for (int k = 0; k < 2; k++) {
569 auto const cz = corner[2] + k;
570 auto const weight = wxy * weights[2][k];
571 atomicAdd(&vec.getNeighbor(cx, cy, cz, 0u),
572 weight * forces[array_offset + 0u]);
573 atomicAdd(&vec.getNeighbor(cx, cy, cz, 1u),
574 weight * forces[array_offset + 1u]);
575 atomicAdd(&vec.getNeighbor(cx, cy, cz, 2u),
576 weight * forces[array_offset + 2u]);
577 }
578 }
579 }
580 }
581}
582// LCOV_EXCL_STOP
583
584static dim3 calculate_dim_grid(uint const threads_x,
585 uint const blocks_per_grid_y,
586 uint const threads_per_block) {
587 assert(threads_x >= 1u);
588 assert(blocks_per_grid_y >= 1u);
589 assert(threads_per_block >= 1u);
590 auto const threads_y = threads_per_block * blocks_per_grid_y;
591 auto const blocks_per_grid_x = (threads_x + threads_y - 1) / threads_y;
592 return make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
593}
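// Worked example (illustrative): for threads_x = 1000 positions, blocks_per_grid_y = 4
// and threads_per_block = 64, threads_y = 256 and blocks_per_grid_x = (1000 + 255) / 256 = 4,
// i.e. a 4 x 4 grid of 64-thread blocks covering at least 1000 threads.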
594
595std::vector<float>
596get(
597 gpu::GPUField<float> const *vec_field,
598 std::vector<float> const &pos,
599 uint gl) {
600 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
601 thrust::device_vector<float> dev_vel(pos.size());
602 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
603 auto const dev_vel_ptr = thrust::raw_pointer_cast(dev_vel.data());
604
605 auto const threads_per_block = uint(64u);
606 auto const n_pos = static_cast<uint>(pos.size() / 3ul);
607 auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
608 kernel_get<<<dim_grid, threads_per_block, 0u, nullptr>>>(
609 gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
610 dev_pos_ptr, dev_vel_ptr, n_pos, gl);
611
612 std::vector<float> out(pos.size());
613 thrust::copy(dev_vel.begin(), dev_vel.end(), out.data());
614 return out;
615}
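// Usage sketch (illustrative only; assumes positions are given in block-local
// lattice units, flattened as x0, y0, z0, x1, y1, z1, ...): interpolate the
// velocity field at two points with one ghost layer.
//
//   std::vector<float> pos = {1.5f, 1.5f, 1.5f, 2.0f, 3.25f, 0.5f};
//   auto vel = Interpolation::get(vec_field, pos, 1u);
//   // vel holds 6 floats: the interpolated (vx, vy, vz) for each position.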
616
617void set(
618 gpu::GPUField<float> const *vec_field,
619 std::vector<float> const &pos,
620 std::vector<float> const &forces,
621 uint gl) {
622 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
623 thrust::device_vector<float> dev_for(forces.begin(), forces.end());
624 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
625 auto const dev_for_ptr = thrust::raw_pointer_cast(dev_for.data());
626
627 auto const threads_per_block = uint(64u);
628 auto const n_pos = static_cast<uint>(pos.size() / 3ul);
629 auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
630 kernel_set<<<dim_grid, threads_per_block, 0u, nullptr>>>(
631 gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
632 dev_pos_ptr, dev_for_ptr, n_pos, gl);
633}
634} // namespace Interpolation
635
636namespace Equilibrium {
637// LCOV_EXCL_START
638__device__ void kernel_set_device(
639 gpu::FieldAccessor<float> pdf,
640 float const *RESTRICT const u,
641 float rho) {
642
643 pdf.get(0u) = rho * -0.33333333333333331f * (u[0] * u[0]) + rho * -0.33333333333333331f * (u[1] * u[1]) + rho * -0.33333333333333331f * (u[2] * u[2]) + rho * 0.33333333333333331f;
644 pdf.get(1u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[1] + rho * 0.16666666666666666f * (u[1] * u[1]);
645 pdf.get(2u) = rho * -0.16666666666666666f * u[1] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[1] * u[1]);
646 pdf.get(3u) = rho * -0.16666666666666666f * u[0] + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[0] * u[0]);
647 pdf.get(4u) = rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[0] + rho * 0.16666666666666666f * (u[0] * u[0]);
648 pdf.get(5u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[2] + rho * 0.16666666666666666f * (u[2] * u[2]);
649 pdf.get(6u) = rho * -0.16666666666666666f * u[2] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[2] * u[2]);
650 pdf.get(7u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
651 pdf.get(8u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
652 pdf.get(9u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
653 pdf.get(10u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
654 pdf.get(11u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
655 pdf.get(12u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
656 pdf.get(13u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
657 pdf.get(14u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
658 pdf.get(15u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
659 pdf.get(16u) = rho * -0.083333333333333329f * u[1] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
660 pdf.get(17u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
661 pdf.get(18u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
662}
663// LCOV_EXCL_STOP
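// Note: these polynomials are the second-order D3Q19 equilibrium; the constant
// factors are the lattice weights w_0 = 1/3, w_{1..6} = 1/18 and w_{7..18} = 1/36,
// so for u = (0, 0, 0) the function reduces to pdf.get(i) = rho * w_i.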
664} // namespace Equilibrium
665
666namespace Density {
667// LCOV_EXCL_START
668__global__ void kernel_get(
669 gpu::FieldAccessor<float> pdf,
670 float *RESTRICT rho_out) {
671 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
672 pdf.set(blockIdx, threadIdx);
673 rho_out += offset;
674 if (pdf.isValidPosition()) {
675 float const f_0 = pdf.get(0u);
676 float const f_1 = pdf.get(1u);
677 float const f_2 = pdf.get(2u);
678 float const f_3 = pdf.get(3u);
679 float const f_4 = pdf.get(4u);
680 float const f_5 = pdf.get(5u);
681 float const f_6 = pdf.get(6u);
682 float const f_7 = pdf.get(7u);
683 float const f_8 = pdf.get(8u);
684 float const f_9 = pdf.get(9u);
685 float const f_10 = pdf.get(10u);
686 float const f_11 = pdf.get(11u);
687 float const f_12 = pdf.get(12u);
688 float const f_13 = pdf.get(13u);
689 float const f_14 = pdf.get(14u);
690 float const f_15 = pdf.get(15u);
691 float const f_16 = pdf.get(16u);
692 float const f_17 = pdf.get(17u);
693 float const f_18 = pdf.get(18u);
694 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
695 const float vel1Term = f_1 + f_11 + f_15 + f_7;
696 const float vel2Term = f_12 + f_13 + f_5;
697 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
698 rho_out[0u] = rho;
699 }
700}
701
702__global__ void kernel_set(
703 gpu::FieldAccessor<float> pdf,
704 float const *RESTRICT rho_in) {
705 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
706 pdf.set(blockIdx, threadIdx);
707 rho_in += offset;
708 if (pdf.isValidPosition()) {
709 float const f_0 = pdf.get(0u);
710 float const f_1 = pdf.get(1u);
711 float const f_2 = pdf.get(2u);
712 float const f_3 = pdf.get(3u);
713 float const f_4 = pdf.get(4u);
714 float const f_5 = pdf.get(5u);
715 float const f_6 = pdf.get(6u);
716 float const f_7 = pdf.get(7u);
717 float const f_8 = pdf.get(8u);
718 float const f_9 = pdf.get(9u);
719 float const f_10 = pdf.get(10u);
720 float const f_11 = pdf.get(11u);
721 float const f_12 = pdf.get(12u);
722 float const f_13 = pdf.get(13u);
723 float const f_14 = pdf.get(14u);
724 float const f_15 = pdf.get(15u);
725 float const f_16 = pdf.get(16u);
726 float const f_17 = pdf.get(17u);
727 float const f_18 = pdf.get(18u);
728 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
729 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
730 const float vel1Term = f_1 + f_11 + f_15 + f_7;
731 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
732 const float vel2Term = f_12 + f_13 + f_5;
733 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
734 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
735
736 // calculate current velocity (before density change)
737 float const rho_inv = float{1} / rho;
738 float const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
739
740 Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);
741 }
742}
743// LCOV_EXCL_STOP
744
745float get(
746 gpu::GPUField<float> const *pdf_field,
747 Cell const &cell) {
748 CellInterval ci(cell, cell);
749 thrust::device_vector<float> dev_data(1u);
750 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
751 auto kernel = gpu::make_kernel(kernel_get);
752 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
753 kernel.addParam(dev_data_ptr);
754 kernel();
755 float rho = dev_data[0u];
756 return rho;
757}
758
759std::vector<float> get(
760 gpu::GPUField<float> const *pdf_field,
761 CellInterval const &ci) {
762 thrust::device_vector<float> dev_data(ci.numCells());
763 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
764 auto kernel = gpu::make_kernel(kernel_get);
765 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
766 kernel.addParam(dev_data_ptr);
767 kernel();
768 std::vector<float> out(dev_data.size());
769 thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
770 return out;
771}
772
773void set(
774 gpu::GPUField<float> *pdf_field,
775 const float rho,
776 Cell const &cell) {
777 CellInterval ci(cell, cell);
778 thrust::device_vector<float> dev_data(1u, rho);
779 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
780 auto kernel = gpu::make_kernel(kernel_set);
781 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
782 kernel.addParam(const_cast<const float *>(dev_data_ptr));
783 kernel();
784}
785
786void set(
787 gpu::GPUField<float> *pdf_field,
788 std::vector<float> const &values,
789 CellInterval const &ci) {
790 thrust::device_vector<float> dev_data(values.begin(), values.end());
791 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
792 auto kernel = gpu::make_kernel(kernel_set);
793 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
794 kernel.addParam(const_cast<const float *>(dev_data_ptr));
795 kernel();
796}
797} // namespace Density
798
799namespace Velocity {
800// LCOV_EXCL_START
801__global__ void kernel_get(
802 gpu::FieldAccessor<float> pdf,
803 gpu::FieldAccessor<float> force,
804 float *RESTRICT u_out) {
805 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
806 pdf.set(blockIdx, threadIdx);
807 force.set(blockIdx, threadIdx);
808 u_out += offset;
809 if (pdf.isValidPosition()) {
810 float const f_0 = pdf.get(0u);
811 float const f_1 = pdf.get(1u);
812 float const f_2 = pdf.get(2u);
813 float const f_3 = pdf.get(3u);
814 float const f_4 = pdf.get(4u);
815 float const f_5 = pdf.get(5u);
816 float const f_6 = pdf.get(6u);
817 float const f_7 = pdf.get(7u);
818 float const f_8 = pdf.get(8u);
819 float const f_9 = pdf.get(9u);
820 float const f_10 = pdf.get(10u);
821 float const f_11 = pdf.get(11u);
822 float const f_12 = pdf.get(12u);
823 float const f_13 = pdf.get(13u);
824 float const f_14 = pdf.get(14u);
825 float const f_15 = pdf.get(15u);
826 float const f_16 = pdf.get(16u);
827 float const f_17 = pdf.get(17u);
828 float const f_18 = pdf.get(18u);
829 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
830 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
831 const float vel1Term = f_1 + f_11 + f_15 + f_7;
832 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
833 const float vel2Term = f_12 + f_13 + f_5;
834 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
835 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
836 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
837 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
838 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
839 auto const rho_inv = float{1} / rho;
840 u_out[0u] = md_0 * rho_inv;
841 u_out[1u] = md_1 * rho_inv;
842 u_out[2u] = md_2 * rho_inv;
843 }
844}
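// Note: md_* adds half of the body force to the bare momentum density, i.e. the
// velocity reported here is the half-force-corrected velocity commonly used with
// forced LB schemes; kernel_set below subtracts the same half-force term again.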
845
846__global__ void kernel_set(
847 gpu::FieldAccessor<float> pdf,
848 gpu::FieldAccessor<float> velocity,
849 gpu::FieldAccessor<float> force,
850 float const *RESTRICT u_in) {
851 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
852 pdf.set(blockIdx, threadIdx);
853 velocity.set(blockIdx, threadIdx);
854 force.set(blockIdx, threadIdx);
855 u_in += offset;
856 if (pdf.isValidPosition()) {
857 float const f_0 = pdf.get(0u);
858 float const f_1 = pdf.get(1u);
859 float const f_2 = pdf.get(2u);
860 float const f_3 = pdf.get(3u);
861 float const f_4 = pdf.get(4u);
862 float const f_5 = pdf.get(5u);
863 float const f_6 = pdf.get(6u);
864 float const f_7 = pdf.get(7u);
865 float const f_8 = pdf.get(8u);
866 float const f_9 = pdf.get(9u);
867 float const f_10 = pdf.get(10u);
868 float const f_11 = pdf.get(11u);
869 float const f_12 = pdf.get(12u);
870 float const f_13 = pdf.get(13u);
871 float const f_14 = pdf.get(14u);
872 float const f_15 = pdf.get(15u);
873 float const f_16 = pdf.get(16u);
874 float const f_17 = pdf.get(17u);
875 float const f_18 = pdf.get(18u);
876 float const *RESTRICT const u = u_in;
877 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
878 const float vel1Term = f_1 + f_11 + f_15 + f_7;
879 const float vel2Term = f_12 + f_13 + f_5;
880 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
881 const float u_0 = -force.get(0) * 0.50000000000000000f / rho + u[0];
882 const float u_1 = -force.get(1) * 0.50000000000000000f / rho + u[1];
883 const float u_2 = -force.get(2) * 0.50000000000000000f / rho + u[2];
884 velocity.get(0u) = u_in[0u];
885 velocity.get(1u) = u_in[1u];
886 velocity.get(2u) = u_in[2u];
887
888 float u_new[3] = {u_0, u_1, u_2};
889
890 Equilibrium::kernel_set_device(pdf, u_new, rho);
891 }
892}
893// LCOV_EXCL_STOP
894
895Vector3<float> get(
896 gpu::GPUField<float> const *pdf_field,
897 gpu::GPUField<float> const *force_field,
898 Cell const &cell) {
899 CellInterval ci(cell, cell);
900 thrust::device_vector<float> dev_data(3u);
901 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
902 auto kernel = gpu::make_kernel(kernel_get);
903 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
904 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
905 kernel.addParam(dev_data_ptr);
906 kernel();
907 Vector3<float> vec;
908 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
909 return vec;
910}
911
912std::vector<float> get(
913 gpu::GPUField<float> const *pdf_field,
914 gpu::GPUField<float> const *force_field,
915 CellInterval const &ci) {
916 thrust::device_vector<float> dev_data(3u * ci.numCells());
917 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
918 auto kernel = gpu::make_kernel(kernel_get);
919 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
920 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
921 kernel.addParam(dev_data_ptr);
922 kernel();
923 std::vector<float> out(dev_data.size());
924 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
925 return out;
926}
927
928void set(
929 gpu::GPUField<float> *pdf_field,
930 gpu::GPUField<float> *velocity_field,
931 gpu::GPUField<float> const *force_field,
932 Vector3<float> const &u,
933 Cell const &cell) {
934 CellInterval ci(cell, cell);
935 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
936 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
937 auto kernel = gpu::make_kernel(kernel_set);
938 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
939 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
940 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
941 kernel.addParam(const_cast<const float *>(dev_data_ptr));
942 kernel();
943}
944
945void set(
946 gpu::GPUField<float> *pdf_field,
947 gpu::GPUField<float> *velocity_field,
948 gpu::GPUField<float> const *force_field,
949 std::vector<float> const &values,
950 CellInterval const &ci) {
951 thrust::device_vector<float> dev_data(values.begin(), values.end());
952 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
953 auto kernel = gpu::make_kernel(kernel_set);
954 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
955 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
956 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
957 kernel.addParam(const_cast<const float *>(dev_data_ptr));
958 kernel();
959}
960} // namespace Velocity
961
962namespace Force {
963// LCOV_EXCL_START
964__global__ void kernel_set(
965 gpu::FieldAccessor<float> pdf,
966 gpu::FieldAccessor<float> velocity,
967 gpu::FieldAccessor<float> force,
968 float const *RESTRICT f_in) {
969 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
970 pdf.set(blockIdx, threadIdx);
971 velocity.set(blockIdx, threadIdx);
972 force.set(blockIdx, threadIdx);
973 f_in += offset;
974 if (pdf.isValidPosition()) {
975 float const f_0 = pdf.get(0u);
976 float const f_1 = pdf.get(1u);
977 float const f_2 = pdf.get(2u);
978 float const f_3 = pdf.get(3u);
979 float const f_4 = pdf.get(4u);
980 float const f_5 = pdf.get(5u);
981 float const f_6 = pdf.get(6u);
982 float const f_7 = pdf.get(7u);
983 float const f_8 = pdf.get(8u);
984 float const f_9 = pdf.get(9u);
985 float const f_10 = pdf.get(10u);
986 float const f_11 = pdf.get(11u);
987 float const f_12 = pdf.get(12u);
988 float const f_13 = pdf.get(13u);
989 float const f_14 = pdf.get(14u);
990 float const f_15 = pdf.get(15u);
991 float const f_16 = pdf.get(16u);
992 float const f_17 = pdf.get(17u);
993 float const f_18 = pdf.get(18u);
994 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
995 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
996 const float vel1Term = f_1 + f_11 + f_15 + f_7;
997 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
998 const float vel2Term = f_12 + f_13 + f_5;
999 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1000 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1001 const float md_0 = f_in[0u] * 0.50000000000000000f + momdensity_0;
1002 const float md_1 = f_in[1u] * 0.50000000000000000f + momdensity_1;
1003 const float md_2 = f_in[2u] * 0.50000000000000000f + momdensity_2;
1004 auto const rho_inv = float{1} / rho;
1005
1006 force.get(0u) = f_in[0u];
1007 force.get(1u) = f_in[1u];
1008 force.get(2u) = f_in[2u];
1009
1010 velocity.get(0u) = md_0 * rho_inv;
1011 velocity.get(1u) = md_1 * rho_inv;
1012 velocity.get(2u) = md_2 * rho_inv;
1013 }
1014}
1015// LCOV_EXCL_STOP
1016
1017void set(gpu::GPUField<float> const *pdf_field,
1018 gpu::GPUField<float> *velocity_field,
1019 gpu::GPUField<float> *force_field,
1020 Vector3<float> const &u,
1021 Cell const &cell) {
1022 CellInterval ci(cell, cell);
1023 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
1024 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1025 auto kernel = gpu::make_kernel(kernel_set);
1026 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1027 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1028 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1029 kernel.addParam(const_cast<const float *>(dev_data_ptr));
1030 kernel();
1031}
1032
1033void set(gpu::GPUField<float> const *pdf_field,
1034 gpu::GPUField<float> *velocity_field,
1035 gpu::GPUField<float> *force_field,
1036 std::vector<float> const &values,
1037 CellInterval const &ci) {
1038 thrust::device_vector<float> dev_data(values.begin(), values.end());
1039 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1040 auto kernel = gpu::make_kernel(kernel_set);
1041 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1042 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1043 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1044 kernel.addParam(const_cast<const float *>(dev_data_ptr));
1045 kernel();
1046}
1047} // namespace Force
1048
1049namespace MomentumDensity {
1050// LCOV_EXCL_START
1051__global__ void kernel_get(
1052 gpu::FieldAccessor<float> pdf,
1053 gpu::FieldAccessor<float> force,
1054 float *RESTRICT out) {
1055 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
1056 pdf.set(blockIdx, threadIdx);
1057 force.set(blockIdx, threadIdx);
1058 out += offset;
1059 if (pdf.isValidPosition()) {
1060 float const f_0 = pdf.get(0u);
1061 float const f_1 = pdf.get(1u);
1062 float const f_2 = pdf.get(2u);
1063 float const f_3 = pdf.get(3u);
1064 float const f_4 = pdf.get(4u);
1065 float const f_5 = pdf.get(5u);
1066 float const f_6 = pdf.get(6u);
1067 float const f_7 = pdf.get(7u);
1068 float const f_8 = pdf.get(8u);
1069 float const f_9 = pdf.get(9u);
1070 float const f_10 = pdf.get(10u);
1071 float const f_11 = pdf.get(11u);
1072 float const f_12 = pdf.get(12u);
1073 float const f_13 = pdf.get(13u);
1074 float const f_14 = pdf.get(14u);
1075 float const f_15 = pdf.get(15u);
1076 float const f_16 = pdf.get(16u);
1077 float const f_17 = pdf.get(17u);
1078 float const f_18 = pdf.get(18u);
1079 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
1080 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
1081 const float vel1Term = f_1 + f_11 + f_15 + f_7;
1082 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
1083 const float vel2Term = f_12 + f_13 + f_5;
1084 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1085 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1086 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
1087 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
1088 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
1089 out[0u] = md_0;
1090 out[1u] = md_1;
1091 out[2u] = md_2;
1092 }
1093}
1094// LCOV_EXCL_STOP
1095
1096Vector3<float> reduce(
1097 gpu::GPUField<float> const *pdf_field,
1098 gpu::GPUField<float> const *force_field) {
1099 auto const ci = pdf_field->xyzSize();
1100 thrust::device_vector<float> dev_data(3u * ci.numCells());
1101 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1102 auto kernel = gpu::make_kernel(kernel_get);
1103 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1104 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1105 kernel.addParam(dev_data_ptr);
1106 kernel();
1107 std::vector<float> out(dev_data.size());
1108 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1109 Vector3<float> mom(float{0});
1110 for (auto i = uint_t{0u}; i < 3u * ci.numCells(); i += 3u) {
1111 mom[0u] += out[i + 0u];
1112 mom[1u] += out[i + 1u];
1113 mom[2u] += out[i + 2u];
1114 }
1115 return mom;
1116}
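// Note (illustrative): reduce() sums the half-force-corrected momentum density over
// the interior cells of this field only; any reduction across blocks or MPI ranks
// is left to the caller.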
1117} // namespace MomentumDensity
1118
1119namespace PressureTensor {
1120// LCOV_EXCL_START
1121__global__ void kernel_get(
1122 gpu::FieldAccessor<float> pdf,
1123 float *RESTRICT p_out) {
1124 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
1125 pdf.set(blockIdx, threadIdx);
1126 p_out += offset;
1127 if (pdf.isValidPosition()) {
1128 float const f_0 = pdf.get(0u);
1129 float const f_1 = pdf.get(1u);
1130 float const f_2 = pdf.get(2u);
1131 float const f_3 = pdf.get(3u);
1132 float const f_4 = pdf.get(4u);
1133 float const f_5 = pdf.get(5u);
1134 float const f_6 = pdf.get(6u);
1135 float const f_7 = pdf.get(7u);
1136 float const f_8 = pdf.get(8u);
1137 float const f_9 = pdf.get(9u);
1138 float const f_10 = pdf.get(10u);
1139 float const f_11 = pdf.get(11u);
1140 float const f_12 = pdf.get(12u);
1141 float const f_13 = pdf.get(13u);
1142 float const f_14 = pdf.get(14u);
1143 float const f_15 = pdf.get(15u);
1144 float const f_16 = pdf.get(16u);
1145 float const f_17 = pdf.get(17u);
1146 float const f_18 = pdf.get(18u);
1147 const float p_0 = f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
1148 const float p_1 = -f_10 - f_7 + f_8 + f_9;
1149 const float p_2 = -f_13 + f_14 + f_17 - f_18;
1150 const float p_3 = -f_10 - f_7 + f_8 + f_9;
1151 const float p_4 = f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
1152 const float p_5 = f_11 - f_12 - f_15 + f_16;
1153 const float p_6 = -f_13 + f_14 + f_17 - f_18;
1154 const float p_7 = f_11 - f_12 - f_15 + f_16;
1155 const float p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
1156 p_out[0u] = p_0;
1157 p_out[1u] = p_1;
1158 p_out[2u] = p_2;
1159 p_out[3u] = p_3;
1160 p_out[4u] = p_4;
1161 p_out[5u] = p_5;
1162 p_out[6u] = p_6;
1163 p_out[7u] = p_7;
1164 p_out[8u] = p_8;
1165 }
1166}
1167// LCOV_EXCL_STOP
1168
1169Matrix3<float> get(
1170 gpu::GPUField<float> const *pdf_field,
1171 Cell const &cell) {
1172 CellInterval ci(cell, cell);
1173 thrust::device_vector<float> dev_data(9u);
1174 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1175 auto kernel = gpu::make_kernel(kernel_get);
1176 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1177 kernel.addParam(dev_data_ptr);
1178 kernel();
1179 Matrix3<float> out;
1180 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1181 return out;
1182}
1183
1184std::vector<float> get(
1185 gpu::GPUField<float> const *pdf_field,
1186 CellInterval const &ci) {
1187 thrust::device_vector<float> dev_data(9u * ci.numCells());
1188 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1189 auto kernel = gpu::make_kernel(kernel_get);
1190 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1191 kernel.addParam(dev_data_ptr);
1192 kernel();
1193 std::vector<float> out(dev_data.size());
1194 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1195 return out;
1196}
1197
1198Matrix3<float> reduce(
1199 gpu::GPUField<float> const *pdf_field) {
1200 auto const ci = pdf_field->xyzSize();
1201 thrust::device_vector<float> dev_data(9u * ci.numCells());
1202 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1203 auto kernel = gpu::make_kernel(kernel_get);
1204 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1205 kernel.addParam(dev_data_ptr);
1206 kernel();
1207 std::vector<float> out(dev_data.size());
1208 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1209 Matrix3<float> pressureTensor(float{0});
1210 for (auto i = uint_t{0u}; i < 9u * ci.numCells(); i += 9u) {
1211 pressureTensor[0u] += out[i + 0u];
1212 pressureTensor[1u] += out[i + 1u];
1213 pressureTensor[2u] += out[i + 2u];
1214
1215 pressureTensor[3u] += out[i + 3u];
1216 pressureTensor[4u] += out[i + 4u];
1217 pressureTensor[5u] += out[i + 5u];
1218
1219 pressureTensor[6u] += out[i + 6u];
1220 pressureTensor[7u] += out[i + 7u];
1221 pressureTensor[8u] += out[i + 8u];
1222 }
1223 return pressureTensor;
1224}
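// Note (illustrative): as with MomentumDensity::reduce(), this sums the per-cell
// pressure tensor over the interior cells of this field only and does not
// normalize by the cell count.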
1225} // namespace PressureTensor
1226
1227} // namespace accessor
1228} // namespace lbm
1229} // namespace walberla