ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
FieldAccessorsSinglePrecisionCUDA.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2023-2024 The ESPResSo project
3 * Copyright (C) 2020 The waLBerla project
4 *
5 * This file is part of ESPResSo.
6 *
7 * ESPResSo is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * ESPResSo is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
22
23/**
24 * @file
25 * Lattice field accessors.
26 * Adapted from the waLBerla source file
27 * https://i10git.cs.fau.de/walberla/walberla/-/blob/a16141524c58ab88386e2a0f8fdd7c63c5edd704/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
28 */
29
30#include <core/DataTypes.h>
31#include <core/cell/Cell.h>
32#include <core/cell/CellInterval.h>
33#include <core/math/Matrix3.h>
34#include <core/math/Vector3.h>
35
36#include <field/iterators/IteratorMacros.h>
37
38#include <gpu/FieldAccessor.h>
39#include <gpu/FieldIndexing.h>
40#include <gpu/GPUField.h>
41#include <gpu/Kernel.h>
42
43#include <thrust/device_ptr.h>
44#include <thrust/device_vector.h>
45
46#include <array>
47#include <vector>
48
49#if defined(__NVCC__)
50#define RESTRICT __restrict__
51#pragma nv_diagnostic push
52#pragma nv_diag_suppress 177 // unused variable
53#elif defined(__clang__)
54#if defined(__CUDA__)
55#if defined(__CUDA_ARCH__)
56// clang compiling CUDA code in device mode
57#define RESTRICT __restrict__
58#pragma clang diagnostic push
59#pragma clang diagnostic ignored "-Wunused-variable"
60#else
61// clang compiling CUDA code in host mode
62#define RESTRICT __restrict__
63#pragma clang diagnostic push
64#pragma clang diagnostic ignored "-Wunused-variable"
65#endif
66#endif
67#elif defined(__GNUC__) or defined(__GNUG__)
68#define RESTRICT __restrict__
69#pragma GCC diagnostic push
70#pragma GCC diagnostic ignored "-Wunused-variable"
71#elif defined(_MSC_VER)
72#define RESTRICT __restrict
73#else
74#define RESTRICT
75#endif
76
77/** @brief Get linear index of flattened data with original layout @c fzyx. */
78static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset) {
79 auto const x = threadIdx.x;
80 auto const y = blockIdx.x;
81 auto const z = blockIdx.y;
82 auto const f = blockIdx.z;
83 auto const ySize = gridDim.x;
84 auto const zSize = gridDim.y;
85 auto const fSize = fOffset;
86 return f +
87 z * fSize +
88 y * fSize * zSize +
89 x * fSize * zSize * ySize;
90}
91
92namespace walberla {
93namespace lbm {
94namespace accessor {
95
96namespace Population {
97// LCOV_EXCL_START
98__global__ void kernel_get(
99 gpu::FieldAccessor<float> pdf,
100 float *RESTRICT pop) {
101 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
102 pdf.set(blockIdx, threadIdx);
103 pop += offset;
104 if (pdf.isValidPosition()) {
105 pop[0u] = pdf.get(0u);
106 pop[1u] = pdf.get(1u);
107 pop[2u] = pdf.get(2u);
108 pop[3u] = pdf.get(3u);
109 pop[4u] = pdf.get(4u);
110 pop[5u] = pdf.get(5u);
111 pop[6u] = pdf.get(6u);
112 pop[7u] = pdf.get(7u);
113 pop[8u] = pdf.get(8u);
114 pop[9u] = pdf.get(9u);
115 pop[10u] = pdf.get(10u);
116 pop[11u] = pdf.get(11u);
117 pop[12u] = pdf.get(12u);
118 pop[13u] = pdf.get(13u);
119 pop[14u] = pdf.get(14u);
120 pop[15u] = pdf.get(15u);
121 pop[16u] = pdf.get(16u);
122 pop[17u] = pdf.get(17u);
123 pop[18u] = pdf.get(18u);
124 }
125}
126
127__global__ void kernel_set(
128 gpu::FieldAccessor<float> pdf,
129 float const *RESTRICT pop) {
130 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
131 pdf.set(blockIdx, threadIdx);
132 pop += offset;
133 if (pdf.isValidPosition()) {
134 pdf.get(0u) = pop[0u];
135 pdf.get(1u) = pop[1u];
136 pdf.get(2u) = pop[2u];
137 pdf.get(3u) = pop[3u];
138 pdf.get(4u) = pop[4u];
139 pdf.get(5u) = pop[5u];
140 pdf.get(6u) = pop[6u];
141 pdf.get(7u) = pop[7u];
142 pdf.get(8u) = pop[8u];
143 pdf.get(9u) = pop[9u];
144 pdf.get(10u) = pop[10u];
145 pdf.get(11u) = pop[11u];
146 pdf.get(12u) = pop[12u];
147 pdf.get(13u) = pop[13u];
148 pdf.get(14u) = pop[14u];
149 pdf.get(15u) = pop[15u];
150 pdf.get(16u) = pop[16u];
151 pdf.get(17u) = pop[17u];
152 pdf.get(18u) = pop[18u];
153 }
154}
155
156__global__ void kernel_broadcast(
157 gpu::FieldAccessor<float> pdf,
158 float const *RESTRICT pop) {
159 pdf.set(blockIdx, threadIdx);
160 if (pdf.isValidPosition()) {
161 pdf.get(0u) = pop[0u];
162 pdf.get(1u) = pop[1u];
163 pdf.get(2u) = pop[2u];
164 pdf.get(3u) = pop[3u];
165 pdf.get(4u) = pop[4u];
166 pdf.get(5u) = pop[5u];
167 pdf.get(6u) = pop[6u];
168 pdf.get(7u) = pop[7u];
169 pdf.get(8u) = pop[8u];
170 pdf.get(9u) = pop[9u];
171 pdf.get(10u) = pop[10u];
172 pdf.get(11u) = pop[11u];
173 pdf.get(12u) = pop[12u];
174 pdf.get(13u) = pop[13u];
175 pdf.get(14u) = pop[14u];
176 pdf.get(15u) = pop[15u];
177 pdf.get(16u) = pop[16u];
178 pdf.get(17u) = pop[17u];
179 pdf.get(18u) = pop[18u];
180 }
181}
182
183__global__ void kernel_set_vel(
184 gpu::FieldAccessor<float> pdf,
185 gpu::FieldAccessor<float> velocity,
186 gpu::FieldAccessor<float> force,
187 float const *RESTRICT pop) {
188 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
189 pdf.set(blockIdx, threadIdx);
190 velocity.set(blockIdx, threadIdx);
191 force.set(blockIdx, threadIdx);
192 pop += offset;
193 if (pdf.isValidPosition()) {
194 const float f_0 = pdf.get(0u) = pop[0u];
195 const float f_1 = pdf.get(1u) = pop[1u];
196 const float f_2 = pdf.get(2u) = pop[2u];
197 const float f_3 = pdf.get(3u) = pop[3u];
198 const float f_4 = pdf.get(4u) = pop[4u];
199 const float f_5 = pdf.get(5u) = pop[5u];
200 const float f_6 = pdf.get(6u) = pop[6u];
201 const float f_7 = pdf.get(7u) = pop[7u];
202 const float f_8 = pdf.get(8u) = pop[8u];
203 const float f_9 = pdf.get(9u) = pop[9u];
204 const float f_10 = pdf.get(10u) = pop[10u];
205 const float f_11 = pdf.get(11u) = pop[11u];
206 const float f_12 = pdf.get(12u) = pop[12u];
207 const float f_13 = pdf.get(13u) = pop[13u];
208 const float f_14 = pdf.get(14u) = pop[14u];
209 const float f_15 = pdf.get(15u) = pop[15u];
210 const float f_16 = pdf.get(16u) = pop[16u];
211 const float f_17 = pdf.get(17u) = pop[17u];
212 const float f_18 = pdf.get(18u) = pop[18u];
213 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
214 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
215 const float vel1Term = f_1 + f_11 + f_15 + f_7;
216 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
217 const float vel2Term = f_12 + f_13 + f_5;
218 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
219 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
220 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
221 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
222 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
223 const float rho_inv = float{1} / rho;
224 velocity.get(0u) = md_0 * rho_inv;
225 velocity.get(1u) = md_1 * rho_inv;
226 velocity.get(2u) = md_2 * rho_inv;
227 }
228}
229// LCOV_EXCL_STOP
230
231std::array<float, 19u> get(
232 gpu::GPUField<float> const *pdf_field,
233 Cell const &cell) {
234 CellInterval ci(cell, cell);
235 thrust::device_vector<float> dev_data(19u);
236 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
237 auto kernel = gpu::make_kernel(kernel_get);
238 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
239 kernel.addParam(dev_data_ptr);
240 kernel();
241 std::array<float, 19u> pop;
242 thrust::copy(dev_data.begin(), dev_data.end(), pop.data());
243 return pop;
244}
245
246void set(
247 gpu::GPUField<float> *pdf_field,
248 std::array<float, 19u> const &pop,
249 Cell const &cell) {
250 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
251 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
252 CellInterval ci(cell, cell);
253 auto kernel = gpu::make_kernel(kernel_set);
254 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
255 kernel.addParam(const_cast<const float *>(dev_data_ptr));
256 kernel();
257}
258
259void set(
260 gpu::GPUField<float> *pdf_field,
261 gpu::GPUField<float> *velocity_field,
262 gpu::GPUField<float> const *force_field,
263 std::array<float, 19u> const &pop,
264 Cell const &cell) {
265 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
266 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
267 CellInterval ci(cell, cell);
268 auto kernel = gpu::make_kernel(kernel_set_vel);
269 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
270 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
271 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
272 kernel.addParam(const_cast<const float *>(dev_data_ptr));
273 kernel();
274}
275
277 gpu::GPUField<float> *pdf_field,
278 std::array<float, 19u> const &pop) {
279 CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
280 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
281 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
282 auto kernel = gpu::make_kernel(kernel_broadcast);
283 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
284 kernel.addParam(const_cast<const float *>(dev_data_ptr));
285 kernel();
286}
287
288std::vector<float> get(
289 gpu::GPUField<float> const *pdf_field,
290 CellInterval const &ci) {
291 thrust::device_vector<float> dev_data(ci.numCells() * 19u);
292 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
293 auto kernel = gpu::make_kernel(kernel_get);
294 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
295 kernel.addParam(dev_data_ptr);
296 kernel();
297 std::vector<float> out(ci.numCells() * 19u);
298 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
299 return out;
300}
301
302void set(
303 gpu::GPUField<float> *pdf_field,
304 std::vector<float> const &values,
305 CellInterval const &ci) {
306 thrust::device_vector<float> dev_data(values.begin(), values.end());
307 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
308 auto kernel = gpu::make_kernel(kernel_set);
309 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
310 kernel.addParam(const_cast<const float *>(dev_data_ptr));
311 kernel();
312}
313
314void set(
315 gpu::GPUField<float> *pdf_field,
316 gpu::GPUField<float> *velocity_field,
317 gpu::GPUField<float> const *force_field,
318 std::vector<float> const &values,
319 CellInterval const &ci) {
320 thrust::device_vector<float> dev_data(values.begin(), values.end());
321 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
322 auto kernel = gpu::make_kernel(kernel_set_vel);
323 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
324 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
325 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
326 kernel.addParam(const_cast<const float *>(dev_data_ptr));
327 kernel();
328}
329} // namespace Population
330
331namespace Vector {
332// LCOV_EXCL_START
333__global__ void kernel_get(
334 gpu::FieldAccessor<float> vec,
335 float *u_out) {
336 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
337 vec.set(blockIdx, threadIdx);
338 u_out += offset;
339 if (vec.isValidPosition()) {
340 u_out[0u] = vec.get(0u);
341 u_out[1u] = vec.get(1u);
342 u_out[2u] = vec.get(2u);
343 }
344}
345
346__global__ void kernel_set(
347 gpu::FieldAccessor<float> vec,
348 float const *RESTRICT u_in) {
349 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
350 vec.set(blockIdx, threadIdx);
351 u_in += offset;
352 if (vec.isValidPosition()) {
353 vec.get(0u) = u_in[0u];
354 vec.get(1u) = u_in[1u];
355 vec.get(2u) = u_in[2u];
356 }
357}
358
359__global__ void kernel_broadcast(
360 gpu::FieldAccessor<float> vec,
361 float const *RESTRICT u_in) {
362 vec.set(blockIdx, threadIdx);
363 if (vec.isValidPosition()) {
364 vec.get(0u) = u_in[0u];
365 vec.get(1u) = u_in[1u];
366 vec.get(2u) = u_in[2u];
367 }
368}
369
370__global__ void kernel_add(
371 gpu::FieldAccessor<float> vec,
372 float const *RESTRICT u_in) {
373 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
374 vec.set(blockIdx, threadIdx);
375 u_in += offset;
376 if (vec.isValidPosition()) {
377 vec.get(0u) += u_in[0u];
378 vec.get(1u) += u_in[1u];
379 vec.get(2u) += u_in[2u];
380 }
381}
382
383__global__ void kernel_broadcast_add(
384 gpu::FieldAccessor<float> vec,
385 float const *RESTRICT u_in) {
386 vec.set(blockIdx, threadIdx);
387 if (vec.isValidPosition()) {
388 vec.get(0u) += u_in[0u];
389 vec.get(1u) += u_in[1u];
390 vec.get(2u) += u_in[2u];
391 }
392}
393// LCOV_EXCL_STOP
394
395Vector3<float> get(
396 gpu::GPUField<float> const *vec_field,
397 Cell const &cell) {
398 CellInterval ci(cell, cell);
399 thrust::device_vector<float> dev_data(3u);
400 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
401 auto kernel = gpu::make_kernel(kernel_get);
402 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
403 kernel.addParam(dev_data_ptr);
404 kernel();
405 Vector3<float> vec;
406 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
407 return vec;
408}
409
410void set(
411 gpu::GPUField<float> *vec_field,
412 Vector3<float> const &vec,
413 Cell const &cell) {
414 CellInterval ci(cell, cell);
415 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
416 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
417 auto kernel = gpu::make_kernel(kernel_set);
418 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
419 kernel.addParam(const_cast<const float *>(dev_data_ptr));
420 kernel();
421}
422
423void add(
424 gpu::GPUField<float> *vec_field,
425 Vector3<float> const &vec,
426 Cell const &cell) {
427 CellInterval ci(cell, cell);
428 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
429 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
430 auto kernel = gpu::make_kernel(kernel_add);
431 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
432 kernel.addParam(const_cast<const float *>(dev_data_ptr));
433 kernel();
434}
435
437 gpu::GPUField<float> *vec_field,
438 Vector3<float> const &vec) {
439 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
440 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
441 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
442 auto kernel = gpu::make_kernel(kernel_broadcast);
443 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
444 kernel.addParam(const_cast<const float *>(dev_data_ptr));
445 kernel();
446}
447
449 gpu::GPUField<float> *vec_field,
450 Vector3<float> const &vec) {
451 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
452 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
453 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
454 auto kernel = gpu::make_kernel(kernel_broadcast_add);
455 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
456 kernel.addParam(const_cast<const float *>(dev_data_ptr));
457 kernel();
458}
459
460std::vector<float> get(
461 gpu::GPUField<float> const *vec_field,
462 CellInterval const &ci) {
463 thrust::device_vector<float> dev_data(ci.numCells() * 3u);
464 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
465 auto kernel = gpu::make_kernel(kernel_get);
466 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
467 kernel.addParam(dev_data_ptr);
468 kernel();
469 std::vector<float> out(ci.numCells() * 3u);
470 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
471 return out;
472}
473
474void set(
475 gpu::GPUField<float> *vec_field,
476 std::vector<float> const &values,
477 CellInterval const &ci) {
478 thrust::device_vector<float> dev_data(values.begin(), values.end());
479 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
480 auto kernel = gpu::make_kernel(kernel_set);
481 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
482 kernel.addParam(const_cast<const float *>(dev_data_ptr));
483 kernel();
484}
485} // namespace Vector
486
487namespace Interpolation {
488// LCOV_EXCL_START
489/** @brief Calculate interpolation weights. */
490static __forceinline__ __device__ void calculate_weights(
491 float const *RESTRICT const pos,
492 int *RESTRICT const corner,
493 float *RESTRICT const weights,
494 uint gl) {
495#pragma unroll
496 for (int dim = 0; dim < 3; ++dim) {
497 auto const fractional_index = pos[dim] - float{0.5};
498 auto const nmp = floorf(fractional_index);
499 auto const distance = fractional_index - nmp - float{0.5};
500 corner[dim] = __float2int_rn(nmp) + static_cast<int>(gl);
501 weights[dim * 2 + 0] = float{0.5} - distance;
502 weights[dim * 2 + 1] = float{0.5} + distance;
503 }
504}
505
506__global__ void kernel_get(
507 gpu::FieldAccessor<float> vec,
508 float const *RESTRICT const pos,
509 float *RESTRICT const vel,
510 uint n_pos,
511 uint gl) {
512
513 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
514 blockDim.x * blockIdx.x + threadIdx.x;
515
516 vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
517 if (vec.isValidPosition() and pos_index < n_pos) {
518 auto const array_offset = pos_index * uint(3u);
519 int corner[3];
520 float weights[3][2];
521 calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
522#pragma unroll
523 for (int i = 0; i < 2; i++) {
524 auto const cx = corner[0] + i;
525 auto const wx = weights[0][i];
526#pragma unroll
527 for (int j = 0; j < 2; j++) {
528 auto const cy = corner[1] + j;
529 auto const wxy = wx * weights[1][j];
530#pragma unroll
531 for (int k = 0; k < 2; k++) {
532 auto const cz = corner[2] + k;
533 auto const weight = wxy * weights[2][k];
534 vel[array_offset + 0u] += weight * vec.getNeighbor(cx, cy, cz, 0u);
535 vel[array_offset + 1u] += weight * vec.getNeighbor(cx, cy, cz, 1u);
536 vel[array_offset + 2u] += weight * vec.getNeighbor(cx, cy, cz, 2u);
537 }
538 }
539 }
540 }
541}
542
543__global__ void kernel_set(
544 gpu::FieldAccessor<float> vec,
545 float const *RESTRICT const pos,
546 float const *RESTRICT const forces,
547 uint n_pos,
548 uint gl) {
549
550 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
551 blockDim.x * blockIdx.x + threadIdx.x;
552
553 vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
554 if (vec.isValidPosition() and pos_index < n_pos) {
555 auto const array_offset = pos_index * uint(3u);
556 int corner[3];
557 float weights[3][2];
558 calculate_weights(pos + array_offset, corner, &weights[0][0], gl);
559#pragma unroll
560 for (int i = 0; i < 2; i++) {
561 auto const cx = corner[0] + i;
562 auto const wx = weights[0][i];
563#pragma unroll
564 for (int j = 0; j < 2; j++) {
565 auto const cy = corner[1] + j;
566 auto const wxy = wx * weights[1][j];
567#pragma unroll
568 for (int k = 0; k < 2; k++) {
569 auto const cz = corner[2] + k;
570 auto const weight = wxy * weights[2][k];
571 atomicAdd(&vec.getNeighbor(cx, cy, cz, 0u),
572 weight * forces[array_offset + 0u]);
573 atomicAdd(&vec.getNeighbor(cx, cy, cz, 1u),
574 weight * forces[array_offset + 1u]);
575 atomicAdd(&vec.getNeighbor(cx, cy, cz, 2u),
576 weight * forces[array_offset + 2u]);
577 }
578 }
579 }
580 }
581}
582// LCOV_EXCL_STOP
583
584static dim3 calculate_dim_grid(uint const threads_x,
585 uint const blocks_per_grid_y,
586 uint const threads_per_block) {
587 assert(threads_x >= 1u);
588 assert(blocks_per_grid_y >= 1u);
589 assert(threads_per_block >= 1u);
590 auto const threads_y = threads_per_block * blocks_per_grid_y;
591 auto const blocks_per_grid_x = (threads_x + threads_y - 1) / threads_y;
592 return make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
593}
594
595std::vector<float>
597 gpu::GPUField<float> const *vec_field,
598 std::vector<float> const &pos,
599 uint gl) {
600 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
601 thrust::device_vector<float> dev_vel(pos.size());
602 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
603 auto const dev_vel_ptr = thrust::raw_pointer_cast(dev_vel.data());
604
605 auto const threads_per_block = uint(64u);
606 auto const n_pos = static_cast<uint>(pos.size() / 3ul);
607 auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
608 kernel_get<<<dim_grid, threads_per_block, 0u, nullptr>>>(
609 gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
610 dev_pos_ptr, dev_vel_ptr, n_pos, gl);
611
612 std::vector<float> out(pos.size());
613 thrust::copy(dev_vel.begin(), dev_vel.end(), out.data());
614 return out;
615}
616
617void set(
618 gpu::GPUField<float> const *vec_field,
619 std::vector<float> const &pos,
620 std::vector<float> const &forces,
621 uint gl) {
622 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
623 thrust::device_vector<float> dev_for(forces.begin(), forces.end());
624 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
625 auto const dev_for_ptr = thrust::raw_pointer_cast(dev_for.data());
626
627 auto const threads_per_block = uint(64u);
628 auto const n_pos = static_cast<uint>(pos.size() / 3ul);
629 auto const dim_grid = calculate_dim_grid(n_pos, 4u, threads_per_block);
630 kernel_set<<<dim_grid, threads_per_block, 0u, nullptr>>>(
631 gpu::FieldIndexing<float>::withGhostLayerXYZ(*vec_field, gl).gpuAccess(),
632 dev_pos_ptr, dev_for_ptr, n_pos, gl);
633}
634} // namespace Interpolation
635
636namespace Equilibrium {
637// LCOV_EXCL_START
638__device__ void kernel_set_device(
639 gpu::FieldAccessor<float> pdf,
640 float const *RESTRICT const u,
641 float rho) {
642
643 pdf.get(0u) = rho * -0.33333333333333331f * (u[0] * u[0]) + rho * -0.33333333333333331f * (u[1] * u[1]) + rho * -0.33333333333333331f * (u[2] * u[2]) + rho * 0.33333333333333331f;
644 pdf.get(1u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[1] + rho * 0.16666666666666666f * (u[1] * u[1]);
645 pdf.get(2u) = rho * -0.16666666666666666f * u[1] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[1] * u[1]);
646 pdf.get(3u) = rho * -0.16666666666666666f * u[0] + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[0] * u[0]);
647 pdf.get(4u) = rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[0] + rho * 0.16666666666666666f * (u[0] * u[0]);
648 pdf.get(5u) = rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * u[2] + rho * 0.16666666666666666f * (u[2] * u[2]);
649 pdf.get(6u) = rho * -0.16666666666666666f * u[2] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.055555555555555552f + rho * 0.16666666666666666f * (u[2] * u[2]);
650 pdf.get(7u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
651 pdf.get(8u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
652 pdf.get(9u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
653 pdf.get(10u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[0] * u[1] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
654 pdf.get(11u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
655 pdf.get(12u) = rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
656 pdf.get(13u) = rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
657 pdf.get(14u) = rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
658 pdf.get(15u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[1] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
659 pdf.get(16u) = rho * -0.083333333333333329f * u[1] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
660 pdf.get(17u) = rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
661 pdf.get(18u) = rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[0] * u[2] + rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
662}
663// LCOV_EXCL_STOP
664} // namespace Equilibrium
665
666namespace Density {
667// LCOV_EXCL_START
668__global__ void kernel_get(
669 gpu::FieldAccessor<float> pdf,
670 float *RESTRICT rho_out) {
671 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
672 pdf.set(blockIdx, threadIdx);
673 rho_out += offset;
674 if (pdf.isValidPosition()) {
675 float const f_0 = pdf.get(0u);
676 float const f_1 = pdf.get(1u);
677 float const f_2 = pdf.get(2u);
678 float const f_3 = pdf.get(3u);
679 float const f_4 = pdf.get(4u);
680 float const f_5 = pdf.get(5u);
681 float const f_6 = pdf.get(6u);
682 float const f_7 = pdf.get(7u);
683 float const f_8 = pdf.get(8u);
684 float const f_9 = pdf.get(9u);
685 float const f_10 = pdf.get(10u);
686 float const f_11 = pdf.get(11u);
687 float const f_12 = pdf.get(12u);
688 float const f_13 = pdf.get(13u);
689 float const f_14 = pdf.get(14u);
690 float const f_15 = pdf.get(15u);
691 float const f_16 = pdf.get(16u);
692 float const f_17 = pdf.get(17u);
693 float const f_18 = pdf.get(18u);
694 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
695 const float vel1Term = f_1 + f_11 + f_15 + f_7;
696 const float vel2Term = f_12 + f_13 + f_5;
697 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
698 rho_out[0u] = rho;
699 }
700}
701
702__global__ void kernel_set(
703 gpu::FieldAccessor<float> pdf,
704 float const *RESTRICT rho_in) {
705 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
706 pdf.set(blockIdx, threadIdx);
707 rho_in += offset;
708 if (pdf.isValidPosition()) {
709 float const f_0 = pdf.get(0u);
710 float const f_1 = pdf.get(1u);
711 float const f_2 = pdf.get(2u);
712 float const f_3 = pdf.get(3u);
713 float const f_4 = pdf.get(4u);
714 float const f_5 = pdf.get(5u);
715 float const f_6 = pdf.get(6u);
716 float const f_7 = pdf.get(7u);
717 float const f_8 = pdf.get(8u);
718 float const f_9 = pdf.get(9u);
719 float const f_10 = pdf.get(10u);
720 float const f_11 = pdf.get(11u);
721 float const f_12 = pdf.get(12u);
722 float const f_13 = pdf.get(13u);
723 float const f_14 = pdf.get(14u);
724 float const f_15 = pdf.get(15u);
725 float const f_16 = pdf.get(16u);
726 float const f_17 = pdf.get(17u);
727 float const f_18 = pdf.get(18u);
728 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
729 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
730 const float vel1Term = f_1 + f_11 + f_15 + f_7;
731 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
732 const float vel2Term = f_12 + f_13 + f_5;
733 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
734 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
735
736 // calculate current velocity (before density change)
737 float const rho_inv = float{1} / rho;
738 float const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
739
740 Equilibrium::kernel_set_device(pdf, u_old, rho_in[0u]);
741 }
742}
743// LCOV_EXCL_STOP
744
745float get(
746 gpu::GPUField<float> const *pdf_field,
747 Cell const &cell) {
748 CellInterval ci(cell, cell);
749 thrust::device_vector<float> dev_data(1u);
750 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
751 auto kernel = gpu::make_kernel(kernel_get);
752 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
753 kernel.addParam(dev_data_ptr);
754 kernel();
755 float rho = dev_data[0u];
756 return rho;
757}
758
759void set(
760 gpu::GPUField<float> *pdf_field,
761 const float rho,
762 Cell const &cell) {
763 CellInterval ci(cell, cell);
764 thrust::device_vector<float> dev_data(1u, rho);
765 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
766 auto kernel = gpu::make_kernel(kernel_set);
767 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
768 kernel.addParam(const_cast<const float *>(dev_data_ptr));
769 kernel();
770}
771
772std::vector<float> get(
773 gpu::GPUField<float> const *pdf_field,
774 CellInterval const &ci) {
775 thrust::device_vector<float> dev_data(ci.numCells());
776 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
777 auto kernel = gpu::make_kernel(kernel_get);
778 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
779 kernel.addParam(dev_data_ptr);
780 kernel();
781 std::vector<float> out(dev_data.size());
782 thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
783 return out;
784}
785
786void set(
787 gpu::GPUField<float> *pdf_field,
788 std::vector<float> const &values,
789 CellInterval const &ci) {
790 thrust::device_vector<float> dev_data(values.begin(), values.end());
791 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
792 auto kernel = gpu::make_kernel(kernel_set);
793 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
794 kernel.addParam(const_cast<const float *>(dev_data_ptr));
795 kernel();
796}
797} // namespace Density
798
799namespace Velocity {
800// LCOV_EXCL_START
801__global__ void kernel_get(
802 gpu::FieldAccessor<float> pdf,
803 gpu::FieldAccessor<float> force,
804 float *RESTRICT u_out) {
805 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
806 pdf.set(blockIdx, threadIdx);
807 force.set(blockIdx, threadIdx);
808 u_out += offset;
809 if (pdf.isValidPosition()) {
810 float const f_0 = pdf.get(0u);
811 float const f_1 = pdf.get(1u);
812 float const f_2 = pdf.get(2u);
813 float const f_3 = pdf.get(3u);
814 float const f_4 = pdf.get(4u);
815 float const f_5 = pdf.get(5u);
816 float const f_6 = pdf.get(6u);
817 float const f_7 = pdf.get(7u);
818 float const f_8 = pdf.get(8u);
819 float const f_9 = pdf.get(9u);
820 float const f_10 = pdf.get(10u);
821 float const f_11 = pdf.get(11u);
822 float const f_12 = pdf.get(12u);
823 float const f_13 = pdf.get(13u);
824 float const f_14 = pdf.get(14u);
825 float const f_15 = pdf.get(15u);
826 float const f_16 = pdf.get(16u);
827 float const f_17 = pdf.get(17u);
828 float const f_18 = pdf.get(18u);
829 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
830 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
831 const float vel1Term = f_1 + f_11 + f_15 + f_7;
832 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
833 const float vel2Term = f_12 + f_13 + f_5;
834 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
835 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
836 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
837 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
838 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
839 auto const rho_inv = float{1} / rho;
840 u_out[0u] = md_0 * rho_inv;
841 u_out[1u] = md_1 * rho_inv;
842 u_out[2u] = md_2 * rho_inv;
843 }
844}
845
846__global__ void kernel_set(
847 gpu::FieldAccessor<float> pdf,
848 gpu::FieldAccessor<float> velocity,
849 gpu::FieldAccessor<float> force,
850 float const *RESTRICT u_in) {
851 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
852 pdf.set(blockIdx, threadIdx);
853 velocity.set(blockIdx, threadIdx);
854 force.set(blockIdx, threadIdx);
855 u_in += offset;
856 if (pdf.isValidPosition()) {
857 float const f_0 = pdf.get(0u);
858 float const f_1 = pdf.get(1u);
859 float const f_2 = pdf.get(2u);
860 float const f_3 = pdf.get(3u);
861 float const f_4 = pdf.get(4u);
862 float const f_5 = pdf.get(5u);
863 float const f_6 = pdf.get(6u);
864 float const f_7 = pdf.get(7u);
865 float const f_8 = pdf.get(8u);
866 float const f_9 = pdf.get(9u);
867 float const f_10 = pdf.get(10u);
868 float const f_11 = pdf.get(11u);
869 float const f_12 = pdf.get(12u);
870 float const f_13 = pdf.get(13u);
871 float const f_14 = pdf.get(14u);
872 float const f_15 = pdf.get(15u);
873 float const f_16 = pdf.get(16u);
874 float const f_17 = pdf.get(17u);
875 float const f_18 = pdf.get(18u);
876 float const *RESTRICT const u = u_in;
877 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
878 const float vel1Term = f_1 + f_11 + f_15 + f_7;
879 const float vel2Term = f_12 + f_13 + f_5;
880 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
881 const float u_0 = -force.get(0) * 0.50000000000000000f / rho + u[0];
882 const float u_1 = -force.get(1) * 0.50000000000000000f / rho + u[1];
883 const float u_2 = -force.get(2) * 0.50000000000000000f / rho + u[2];
884 velocity.get(0u) = u_in[0u];
885 velocity.get(1u) = u_in[1u];
886 velocity.get(2u) = u_in[2u];
887
888 float u_new[3] = {u_0, u_1, u_2};
889
890 Equilibrium::kernel_set_device(pdf, u_new, rho);
891 }
892}
893// LCOV_EXCL_STOP
894
895Vector3<float> get(
896 gpu::GPUField<float> const *pdf_field,
897 gpu::GPUField<float> const *force_field,
898 Cell const &cell) {
899 CellInterval ci(cell, cell);
900 thrust::device_vector<float> dev_data(3u);
901 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
902 auto kernel = gpu::make_kernel(kernel_get);
903 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
904 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
905 kernel.addParam(dev_data_ptr);
906 kernel();
907 Vector3<float> vec;
908 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
909 return vec;
910}
911
912std::vector<float> get(
913 gpu::GPUField<float> const *pdf_field,
914 gpu::GPUField<float> const *force_field,
915 CellInterval const &ci) {
916 thrust::device_vector<float> dev_data(3u);
917 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
918 auto kernel = gpu::make_kernel(kernel_get);
919 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
920 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
921 kernel.addParam(dev_data_ptr);
922 kernel();
923 std::vector<float> out(dev_data.size());
924 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
925 return out;
926}
927
928void set(
929 gpu::GPUField<float> *pdf_field,
930 gpu::GPUField<float> *velocity_field,
931 gpu::GPUField<float> const *force_field,
932 Vector3<float> const &u,
933 Cell const &cell) {
934 CellInterval ci(cell, cell);
935 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
936 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
937 auto kernel = gpu::make_kernel(kernel_set);
938 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
939 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
940 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
941 kernel.addParam(const_cast<const float *>(dev_data_ptr));
942 kernel();
943}
944
945void set(
946 gpu::GPUField<float> *pdf_field,
947 gpu::GPUField<float> *velocity_field,
948 gpu::GPUField<float> const *force_field,
949 std::vector<float> const &values,
950 CellInterval const &ci) {
951 thrust::device_vector<float> dev_data(values.begin(), values.end());
952 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
953 auto kernel = gpu::make_kernel(kernel_set);
954 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
955 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
956 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
957 kernel.addParam(const_cast<const float *>(dev_data_ptr));
958 kernel();
959}
960} // namespace Velocity
961
962namespace Force {
963// LCOV_EXCL_START
964__global__ void kernel_set(
965 gpu::FieldAccessor<float> pdf,
966 gpu::FieldAccessor<float> velocity,
967 gpu::FieldAccessor<float> force,
968 float const *RESTRICT f_in) {
969 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
970 pdf.set(blockIdx, threadIdx);
971 velocity.set(blockIdx, threadIdx);
972 force.set(blockIdx, threadIdx);
973 f_in += offset;
974 if (pdf.isValidPosition()) {
975 float const f_0 = pdf.get(0u);
976 float const f_1 = pdf.get(1u);
977 float const f_2 = pdf.get(2u);
978 float const f_3 = pdf.get(3u);
979 float const f_4 = pdf.get(4u);
980 float const f_5 = pdf.get(5u);
981 float const f_6 = pdf.get(6u);
982 float const f_7 = pdf.get(7u);
983 float const f_8 = pdf.get(8u);
984 float const f_9 = pdf.get(9u);
985 float const f_10 = pdf.get(10u);
986 float const f_11 = pdf.get(11u);
987 float const f_12 = pdf.get(12u);
988 float const f_13 = pdf.get(13u);
989 float const f_14 = pdf.get(14u);
990 float const f_15 = pdf.get(15u);
991 float const f_16 = pdf.get(16u);
992 float const f_17 = pdf.get(17u);
993 float const f_18 = pdf.get(18u);
994 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
995 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
996 const float vel1Term = f_1 + f_11 + f_15 + f_7;
997 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
998 const float vel2Term = f_12 + f_13 + f_5;
999 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1000 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1001 const float md_0 = f_in[0u] * 0.50000000000000000f + momdensity_0;
1002 const float md_1 = f_in[1u] * 0.50000000000000000f + momdensity_1;
1003 const float md_2 = f_in[2u] * 0.50000000000000000f + momdensity_2;
1004 auto const rho_inv = float{1} / rho;
1005
1006 force.get(0u) = f_in[0u];
1007 force.get(1u) = f_in[1u];
1008 force.get(2u) = f_in[2u];
1009
1010 velocity.get(0u) = md_0 * rho_inv;
1011 velocity.get(1u) = md_1 * rho_inv;
1012 velocity.get(2u) = md_2 * rho_inv;
1013 }
1014}
1015// LCOV_EXCL_STOP
1016
1017void set(gpu::GPUField<float> const *pdf_field,
1018 gpu::GPUField<float> *velocity_field,
1019 gpu::GPUField<float> *force_field,
1020 Vector3<float> const &u,
1021 Cell const &cell) {
1022 CellInterval ci(cell, cell);
1023 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
1024 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1025 auto kernel = gpu::make_kernel(kernel_set);
1026 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1027 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1028 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1029 kernel.addParam(const_cast<const float *>(dev_data_ptr));
1030 kernel();
1031}
1032
1033void set(gpu::GPUField<float> const *pdf_field,
1034 gpu::GPUField<float> *velocity_field,
1035 gpu::GPUField<float> *force_field,
1036 std::vector<float> const &values,
1037 CellInterval const &ci) {
1038 thrust::device_vector<float> dev_data(values.begin(), values.end());
1039 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1040 auto kernel = gpu::make_kernel(kernel_set);
1041 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1042 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1043 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1044 kernel.addParam(const_cast<const float *>(dev_data_ptr));
1045 kernel();
1046}
1047} // namespace Force
1048
1049namespace MomentumDensity {
1050// LCOV_EXCL_START
1051__global__ void kernel_sum(
1052 gpu::FieldAccessor<float> pdf,
1053 gpu::FieldAccessor<float> force,
1054 float *RESTRICT out) {
1055 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
1056 pdf.set(blockIdx, threadIdx);
1057 force.set(blockIdx, threadIdx);
1058 out += offset;
1059 if (pdf.isValidPosition()) {
1060 float const f_0 = pdf.get(0u);
1061 float const f_1 = pdf.get(1u);
1062 float const f_2 = pdf.get(2u);
1063 float const f_3 = pdf.get(3u);
1064 float const f_4 = pdf.get(4u);
1065 float const f_5 = pdf.get(5u);
1066 float const f_6 = pdf.get(6u);
1067 float const f_7 = pdf.get(7u);
1068 float const f_8 = pdf.get(8u);
1069 float const f_9 = pdf.get(9u);
1070 float const f_10 = pdf.get(10u);
1071 float const f_11 = pdf.get(11u);
1072 float const f_12 = pdf.get(12u);
1073 float const f_13 = pdf.get(13u);
1074 float const f_14 = pdf.get(14u);
1075 float const f_15 = pdf.get(15u);
1076 float const f_16 = pdf.get(16u);
1077 float const f_17 = pdf.get(17u);
1078 float const f_18 = pdf.get(18u);
1079 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
1080 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
1081 const float vel1Term = f_1 + f_11 + f_15 + f_7;
1082 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
1083 const float vel2Term = f_12 + f_13 + f_5;
1084 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1085 const float rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1086 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
1087 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
1088 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
1089 out[0u] += md_0;
1090 out[1u] += md_1;
1091 out[2u] += md_2;
1092 }
1093}
1094// LCOV_EXCL_STOP
1095
1096Vector3<float> reduce(
1097 gpu::GPUField<float> const *pdf_field,
1098 gpu::GPUField<float> const *force_field) {
1099 thrust::device_vector<float> dev_data(3u, float{0});
1100 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1101 WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
1102 Cell cell(x, y, z);
1103 CellInterval ci(cell, cell);
1104 auto kernel = gpu::make_kernel(kernel_sum);
1105 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1106 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1107 kernel.addParam(dev_data_ptr);
1108 kernel();
1109 });
1110 Vector3<float> mom(float{0});
1111 thrust::copy(dev_data.begin(), dev_data.begin() + 3u, mom.data());
1112 return mom;
1113}
1114} // namespace MomentumDensity
1115
1116namespace PressureTensor {
1117// LCOV_EXCL_START
1118__global__ void kernel_get(
1119 gpu::FieldAccessor<float> pdf,
1120 float *RESTRICT p_out) {
1121 auto const offset = getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
1122 pdf.set(blockIdx, threadIdx);
1123 p_out += offset;
1124 if (pdf.isValidPosition()) {
1125 float const f_0 = pdf.get(0u);
1126 float const f_1 = pdf.get(1u);
1127 float const f_2 = pdf.get(2u);
1128 float const f_3 = pdf.get(3u);
1129 float const f_4 = pdf.get(4u);
1130 float const f_5 = pdf.get(5u);
1131 float const f_6 = pdf.get(6u);
1132 float const f_7 = pdf.get(7u);
1133 float const f_8 = pdf.get(8u);
1134 float const f_9 = pdf.get(9u);
1135 float const f_10 = pdf.get(10u);
1136 float const f_11 = pdf.get(11u);
1137 float const f_12 = pdf.get(12u);
1138 float const f_13 = pdf.get(13u);
1139 float const f_14 = pdf.get(14u);
1140 float const f_15 = pdf.get(15u);
1141 float const f_16 = pdf.get(16u);
1142 float const f_17 = pdf.get(17u);
1143 float const f_18 = pdf.get(18u);
1144 const float p_0 = f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
1145 const float p_1 = -f_10 - f_7 + f_8 + f_9;
1146 const float p_2 = -f_13 + f_14 + f_17 - f_18;
1147 const float p_3 = -f_10 - f_7 + f_8 + f_9;
1148 const float p_4 = f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
1149 const float p_5 = f_11 - f_12 - f_15 + f_16;
1150 const float p_6 = -f_13 + f_14 + f_17 - f_18;
1151 const float p_7 = f_11 - f_12 - f_15 + f_16;
1152 const float p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
1153 p_out[0u] = p_0;
1154 p_out[1u] = p_1;
1155 p_out[2u] = p_2;
1156 p_out[3u] = p_3;
1157 p_out[4u] = p_4;
1158 p_out[5u] = p_5;
1159 p_out[6u] = p_6;
1160 p_out[7u] = p_7;
1161 p_out[8u] = p_8;
1162 }
1163}
1164// LCOV_EXCL_STOP
1165
1166Matrix3<float> get(
1167 gpu::GPUField<float> const *pdf_field,
1168 Cell const &cell) {
1169 CellInterval ci(cell, cell);
1170 thrust::device_vector<float> dev_data(9u);
1171 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1172 auto kernel = gpu::make_kernel(kernel_get);
1173 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1174 kernel.addParam(dev_data_ptr);
1175 kernel();
1176 Matrix3<float> out;
1177 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1178 return out;
1179}
1180
1181std::vector<float> get(
1182 gpu::GPUField<float> const *pdf_field,
1183 CellInterval const &ci) {
1184 thrust::device_vector<float> dev_data(9u * ci.numCells());
1185 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1186 auto kernel = gpu::make_kernel(kernel_get);
1187 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1188 kernel.addParam(dev_data_ptr);
1189 kernel();
1190 std::vector<float> out(dev_data.size());
1191 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1192 return out;
1193}
1194} // namespace PressureTensor
1195
1196} // namespace accessor
1197} // namespace lbm
1198} // namespace walberla
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset)
Get linear index of flattened data with original layout fzyx.
static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset)
Get linear index of flattened data with original layout fzyx.
Definition Cell.hpp:97
static double weight(int type, double r_cut, double k, double r)
Definition dpd.cpp:78
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT rho_out)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, double const rho_in, Cell const &cell)
double get(GhostLayerField< double, uint_t{19u}> const *pdf_field, Cell const &cell)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, double const *RESTRICT rho_in)
__device__ void kernel_set_device(gpu::FieldAccessor< double > pdf, double const *RESTRICT const u, double rho)
void set(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> *velocity_field, GhostLayerField< double, uint_t{3u}> *force_field, Vector3< double > const &force, Cell const &cell)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT f_in)
__global__ void kernel_get(gpu::FieldAccessor< double > vec, double const *RESTRICT const pos, double *RESTRICT const vel, uint n_pos, uint gl)
static dim3 calculate_dim_grid(uint const threads_x, uint const blocks_per_grid_y, uint const threads_per_block)
std::vector< double > get(gpu::GPUField< double > const *vec_field, std::vector< double > const &pos, uint gl)
__global__ void kernel_set(gpu::FieldAccessor< double > vec, double const *RESTRICT const pos, double const *RESTRICT const forces, uint n_pos, uint gl)
void set(gpu::GPUField< double > const *vec_field, std::vector< double > const &pos, std::vector< double > const &forces, uint gl)
static __forceinline__ __device__ void calculate_weights(double const *RESTRICT const pos, int *RESTRICT const corner, double *RESTRICT const weights, uint gl)
Calculate interpolation weights.
__global__ void kernel_sum(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > force, double *RESTRICT out)
auto reduce(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> const *force_field)
__global__ void kernel_set_vel(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT pop)
__global__ void kernel_broadcast(gpu::FieldAccessor< double > pdf, double const *RESTRICT pop)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, double const *RESTRICT pop)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, std::array< double, 19u > const &pop, Cell const &cell)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT pop)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, Cell const &cell)
void initialize(GhostLayerField< double, uint_t{19u}> *pdf_field, std::array< double, 19u > const &pop)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT p_out)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, Cell const &cell)
void initialize(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec)
__global__ void kernel_get(gpu::FieldAccessor< double > vec, double *u_out)
__global__ void kernel_broadcast(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void add(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec, Cell const &cell)
__global__ void kernel_set(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void set(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec, Cell const &cell)
auto get(GhostLayerField< double, uint_t{3u}> const *vec_field, Cell const &cell)
__global__ void kernel_broadcast_add(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void add_to_all(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec)
__global__ void kernel_add(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, GhostLayerField< double, uint_t{3u}> *velocity_field, GhostLayerField< double, uint_t{3u}> const *force_field, Vector3< double > const &u, Cell const &cell)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > force, double *RESTRICT u_out)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT u_in)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> const *force_field, Cell const &cell)
\file PackInfoPdfDoublePrecision.cpp \author pystencils
static Utils::Vector3d velocity(Particle const &p_ref, Particle const &p_vs)
Velocity of the virtual site.
Definition relative.cpp:64