30#include <core/DataTypes.h>
31#include <core/cell/Cell.h>
32#include <core/cell/CellInterval.h>
33#include <core/math/Matrix3.h>
34#include <core/math/Vector3.h>
36#include <field/iterators/IteratorMacros.h>
38#include <gpu/FieldAccessor.h>
39#include <gpu/FieldIndexing.h>
40#include <gpu/GPUField.h>
41#include <gpu/Kernel.h>
43#include <thrust/copy.h>
44#include <thrust/device_ptr.h>
45#include <thrust/device_vector.h>
46#include <thrust/functional.h>
47#include <thrust/transform.h>
53#define RESTRICT __restrict__
54#elif defined(__clang__)
56#if defined(__CUDA_ARCH__)
58#define RESTRICT __restrict__
61#define RESTRICT __restrict__
64#elif defined(__GNUC__) or defined(__GNUG__)
65#define RESTRICT __restrict__
66#elif defined(_MSC_VER)
67#define RESTRICT __restrict
74static __forceinline__ __device__ uint
getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset) {
75 auto const x = threadIdx.x;
76 auto const y = blockIdx.x;
77 auto const z = blockIdx.y;
78 auto const f = blockIdx.z;
79 auto const ySize = gridDim.x;
80 auto const zSize = gridDim.y;
81 auto const fSize = fOffset;
85 x * fSize * zSize * ySize;
93 __host__ __device__
constexpr float operator()(
float const &x)
const {
100 uint
const blocks_per_grid_y,
101 uint
const threads_per_block) {
102 assert(threads_x >= 1u);
103 assert(blocks_per_grid_y >= 1u);
104 assert(threads_per_block >= 1u);
105 auto const threads_y = threads_per_block * blocks_per_grid_y;
106 auto const blocks_per_grid_x = (threads_x + threads_y - 1) / threads_y;
107 return make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
114namespace Population {
117 gpu::FieldAccessor<float> pdf,
119 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
120 pdf.set(blockIdx, threadIdx);
122 if (pdf.isValidPosition()) {
123 pop[0u] = pdf.get(0u);
124 pop[1u] = pdf.get(1u);
125 pop[2u] = pdf.get(2u);
126 pop[3u] = pdf.get(3u);
127 pop[4u] = pdf.get(4u);
128 pop[5u] = pdf.get(5u);
129 pop[6u] = pdf.get(6u);
130 pop[7u] = pdf.get(7u);
131 pop[8u] = pdf.get(8u);
132 pop[9u] = pdf.get(9u);
133 pop[10u] = pdf.get(10u);
134 pop[11u] = pdf.get(11u);
135 pop[12u] = pdf.get(12u);
136 pop[13u] = pdf.get(13u);
137 pop[14u] = pdf.get(14u);
138 pop[15u] = pdf.get(15u);
139 pop[16u] = pdf.get(16u);
140 pop[17u] = pdf.get(17u);
141 pop[18u] = pdf.get(18u);
146 gpu::FieldAccessor<float> pdf,
148 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
149 pdf.set(blockIdx, threadIdx);
151 if (pdf.isValidPosition()) {
152 pdf.get(0u) = pop[0u];
153 pdf.get(1u) = pop[1u];
154 pdf.get(2u) = pop[2u];
155 pdf.get(3u) = pop[3u];
156 pdf.get(4u) = pop[4u];
157 pdf.get(5u) = pop[5u];
158 pdf.get(6u) = pop[6u];
159 pdf.get(7u) = pop[7u];
160 pdf.get(8u) = pop[8u];
161 pdf.get(9u) = pop[9u];
162 pdf.get(10u) = pop[10u];
163 pdf.get(11u) = pop[11u];
164 pdf.get(12u) = pop[12u];
165 pdf.get(13u) = pop[13u];
166 pdf.get(14u) = pop[14u];
167 pdf.get(15u) = pop[15u];
168 pdf.get(16u) = pop[16u];
169 pdf.get(17u) = pop[17u];
170 pdf.get(18u) = pop[18u];
175 gpu::FieldAccessor<float> pdf,
177 pdf.set(blockIdx, threadIdx);
178 if (pdf.isValidPosition()) {
179 pdf.get(0u) = pop[0u];
180 pdf.get(1u) = pop[1u];
181 pdf.get(2u) = pop[2u];
182 pdf.get(3u) = pop[3u];
183 pdf.get(4u) = pop[4u];
184 pdf.get(5u) = pop[5u];
185 pdf.get(6u) = pop[6u];
186 pdf.get(7u) = pop[7u];
187 pdf.get(8u) = pop[8u];
188 pdf.get(9u) = pop[9u];
189 pdf.get(10u) = pop[10u];
190 pdf.get(11u) = pop[11u];
191 pdf.get(12u) = pop[12u];
192 pdf.get(13u) = pop[13u];
193 pdf.get(14u) = pop[14u];
194 pdf.get(15u) = pop[15u];
195 pdf.get(16u) = pop[16u];
196 pdf.get(17u) = pop[17u];
197 pdf.get(18u) = pop[18u];
202 gpu::FieldAccessor<float> pdf,
204 gpu::FieldAccessor<float> force,
206 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 19u);
207 pdf.set(blockIdx, threadIdx);
209 force.set(blockIdx, threadIdx);
211 if (pdf.isValidPosition()) {
212 const float f_0 = pdf.get(0u) = pop[0u];
213 const float f_1 = pdf.get(1u) = pop[1u];
214 const float f_2 = pdf.get(2u) = pop[2u];
215 const float f_3 = pdf.get(3u) = pop[3u];
216 const float f_4 = pdf.get(4u) = pop[4u];
217 const float f_5 = pdf.get(5u) = pop[5u];
218 const float f_6 = pdf.get(6u) = pop[6u];
219 const float f_7 = pdf.get(7u) = pop[7u];
220 const float f_8 = pdf.get(8u) = pop[8u];
221 const float f_9 = pdf.get(9u) = pop[9u];
222 const float f_10 = pdf.get(10u) = pop[10u];
223 const float f_11 = pdf.get(11u) = pop[11u];
224 const float f_12 = pdf.get(12u) = pop[12u];
225 const float f_13 = pdf.get(13u) = pop[13u];
226 const float f_14 = pdf.get(14u) = pop[14u];
227 const float f_15 = pdf.get(15u) = pop[15u];
228 const float f_16 = pdf.get(16u) = pop[16u];
229 const float f_17 = pdf.get(17u) = pop[17u];
230 const float f_18 = pdf.get(18u) = pop[18u];
231 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
232 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
233 const float vel1Term = f_1 + f_11 + f_15 + f_7;
234 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
235 const float vel2Term = f_12 + f_13 + f_5;
236 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
237 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
238 const float rho = delta_rho + 1;
239 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
240 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
241 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
242 const float rho_inv =
float{1} / rho;
250std::array<float, 19u>
get(
251 gpu::GPUField<float>
const *pdf_field,
253 CellInterval ci(cell, cell);
254 thrust::device_vector<float> dev_data(19u);
255 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
257 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
258 kernel.addParam(dev_data_ptr);
260 std::array<float, 19u> pop;
261 thrust::copy(dev_data.begin(), dev_data.end(), pop.data());
266 gpu::GPUField<float> *pdf_field,
267 std::array<float, 19u>
const &pop,
269 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
270 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
271 CellInterval ci(cell, cell);
273 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
274 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
279 gpu::GPUField<float> *pdf_field,
280 gpu::GPUField<float> *velocity_field,
281 gpu::GPUField<float>
const *force_field,
282 std::array<float, 19u>
const &pop,
284 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
285 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
286 CellInterval ci(cell, cell);
288 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
289 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
290 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
291 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
296 gpu::GPUField<float> *pdf_field,
297 std::array<float, 19u>
const &pop) {
298 CellInterval ci = pdf_field->xyzSizeWithGhostLayer();
299 thrust::device_vector<float> dev_data(pop.begin(), pop.end());
300 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
302 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
303 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
308 gpu::GPUField<float>
const *pdf_field,
309 CellInterval
const &ci) {
310 thrust::device_vector<float> dev_data(ci.numCells() * 19u);
311 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
313 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
314 kernel.addParam(dev_data_ptr);
316 std::vector<float> out(ci.numCells() * 19u);
317 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
322 gpu::GPUField<float> *pdf_field,
323 std::vector<float>
const &values,
324 CellInterval
const &ci) {
325 thrust::device_vector<float> dev_data(values.begin(), values.end());
326 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
328 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
329 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
334 gpu::GPUField<float> *pdf_field,
335 gpu::GPUField<float> *velocity_field,
336 gpu::GPUField<float>
const *force_field,
337 std::vector<float>
const &values,
338 CellInterval
const &ci) {
339 thrust::device_vector<float> dev_data(values.begin(), values.end());
340 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
342 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
343 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
344 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
345 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
353 gpu::FieldAccessor<float> vec,
355 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
356 vec.set(blockIdx, threadIdx);
358 if (vec.isValidPosition()) {
359 u_out[0u] = vec.get(0u);
360 u_out[1u] = vec.get(1u);
361 u_out[2u] = vec.get(2u);
366 gpu::FieldAccessor<float> vec,
368 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
369 vec.set(blockIdx, threadIdx);
371 if (vec.isValidPosition()) {
372 vec.get(0u) = u_in[0u];
373 vec.get(1u) = u_in[1u];
374 vec.get(2u) = u_in[2u];
379 gpu::FieldAccessor<float> vec,
381 vec.set(blockIdx, threadIdx);
382 if (vec.isValidPosition()) {
383 vec.get(0u) = u_in[0u];
384 vec.get(1u) = u_in[1u];
385 vec.get(2u) = u_in[2u];
390 gpu::FieldAccessor<float> vec,
392 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
393 vec.set(blockIdx, threadIdx);
395 if (vec.isValidPosition()) {
396 vec.get(0u) += u_in[0u];
397 vec.get(1u) += u_in[1u];
398 vec.get(2u) += u_in[2u];
403 gpu::FieldAccessor<float> vec,
405 vec.set(blockIdx, threadIdx);
406 if (vec.isValidPosition()) {
407 vec.get(0u) += u_in[0u];
408 vec.get(1u) += u_in[1u];
409 vec.get(2u) += u_in[2u];
414 gpu::FieldAccessor<float> vec,
419 uint index = blockIdx.y * gridDim.x * blockDim.x +
420 blockDim.x * blockIdx.x + threadIdx.x;
422 vec.set({0u, 0u, 0u}, {0u, 0u, 0u});
423 if (vec.isValidPosition() and index < length) {
424 auto const array_index = index * uint(3u);
425 auto const cx = indices[array_index + 0u];
426 auto const cy = indices[array_index + 1u];
427 auto const cz = indices[array_index + 2u];
429 for (uint cf = 0u; cf < 3u; ++cf) {
430 vec.getNeighbor(cx, cy, cz, cf) = values[array_index + cf];
437 gpu::GPUField<float>
const *vec_field,
439 CellInterval ci(cell, cell);
440 thrust::device_vector<float> dev_data(3u);
441 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
443 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
444 kernel.addParam(dev_data_ptr);
447 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
452 gpu::GPUField<float> *vec_field,
453 Vector3<float>
const &vec,
455 CellInterval ci(cell, cell);
456 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
457 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
459 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
460 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
465 gpu::GPUField<float> *vec_field,
466 Vector3<float>
const &vec,
468 CellInterval ci(cell, cell);
469 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
470 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
472 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
473 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
478 gpu::GPUField<float> *vec_field,
479 Vector3<float>
const &vec) {
480 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
481 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
482 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
484 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
485 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
490 gpu::GPUField<float> *vec_field,
491 Vector3<float>
const &vec) {
492 CellInterval ci = vec_field->xyzSizeWithGhostLayer();
493 thrust::device_vector<float> dev_data(vec.data(), vec.data() + 3u);
494 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
496 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
497 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
502 gpu::GPUField<float>
const *vec_field,
503 CellInterval
const &ci) {
504 thrust::device_vector<float> dev_data(ci.numCells() * 3u);
505 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
507 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
508 kernel.addParam(dev_data_ptr);
510 std::vector<float> out(ci.numCells() * 3u);
511 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
516 gpu::GPUField<float> *vec_field,
517 std::vector<float>
const &values,
518 CellInterval
const &ci) {
519 thrust::device_vector<float> dev_data(values.begin(), values.end());
520 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
522 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*vec_field, ci));
523 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
528 gpu::GPUField<float>
const *field,
529 thrust::device_vector<int>
const &indices,
530 thrust::device_vector<float>
const &values,
532 auto const dev_idx_ptr = thrust::raw_pointer_cast(indices.data());
533 auto const dev_val_ptr = thrust::raw_pointer_cast(values.data());
535 auto const threads_per_block = uint(64u);
536 auto const length =
static_cast<uint
>(indices.size() / 3ul);
538 kernel_set_from_list<<<dim_grid, threads_per_block, 0u, nullptr>>>(
539 gpu::FieldIndexing<float>::withGhostLayerXYZ(*field, gl).gpuAccess(),
540 dev_idx_ptr, dev_val_ptr, length);
544namespace Interpolation {
553 for (
int dim = 0; dim < 3; ++dim) {
554 auto const fractional_index = pos[dim] -
float{0.5};
555 auto const nmp = floorf(fractional_index);
556 auto const distance = fractional_index - nmp -
float{0.5};
557 corner[dim] = __float2int_rn(nmp) +
static_cast<int>(gl);
558 weights[dim * 2 + 0] =
float{0.5} - distance;
559 weights[dim * 2 + 1] =
float{0.5} + distance;
564 gpu::FieldAccessor<float> pdf,
571 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
572 blockDim.x * blockIdx.x + threadIdx.x;
574 pdf.set({0u, 0u, 0u}, {0u, 0u, 0u});
575 if (pdf.isValidPosition() and pos_index < n_pos) {
576 auto const array_offset = pos_index * uint(3u);
581 for (
int i = 0; i < 2; i++) {
582 auto const cx = corner[0] + i;
583 auto const wx = weights[0][i];
585 for (
int j = 0; j < 2; j++) {
586 auto const cy = corner[1] + j;
587 auto const wxy = wx * weights[1][j];
589 for (
int k = 0; k < 2; k++) {
590 auto const cz = corner[2] + k;
591 auto const weight = wxy * weights[2][k];
592 float const f_0 = pdf.getNeighbor(cx, cy, cz, 0u);
593 float const f_1 = pdf.getNeighbor(cx, cy, cz, 1u);
594 float const f_2 = pdf.getNeighbor(cx, cy, cz, 2u);
595 float const f_3 = pdf.getNeighbor(cx, cy, cz, 3u);
596 float const f_4 = pdf.getNeighbor(cx, cy, cz, 4u);
597 float const f_5 = pdf.getNeighbor(cx, cy, cz, 5u);
598 float const f_6 = pdf.getNeighbor(cx, cy, cz, 6u);
599 float const f_7 = pdf.getNeighbor(cx, cy, cz, 7u);
600 float const f_8 = pdf.getNeighbor(cx, cy, cz, 8u);
601 float const f_9 = pdf.getNeighbor(cx, cy, cz, 9u);
602 float const f_10 = pdf.getNeighbor(cx, cy, cz, 10u);
603 float const f_11 = pdf.getNeighbor(cx, cy, cz, 11u);
604 float const f_12 = pdf.getNeighbor(cx, cy, cz, 12u);
605 float const f_13 = pdf.getNeighbor(cx, cy, cz, 13u);
606 float const f_14 = pdf.getNeighbor(cx, cy, cz, 14u);
607 float const f_15 = pdf.getNeighbor(cx, cy, cz, 15u);
608 float const f_16 = pdf.getNeighbor(cx, cy, cz, 16u);
609 float const f_17 = pdf.getNeighbor(cx, cy, cz, 17u);
610 float const f_18 = pdf.getNeighbor(cx, cy, cz, 18u);
611 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
612 const float vel1Term = f_1 + f_11 + f_15 + f_7;
613 const float vel2Term = f_12 + f_13 + f_5;
614 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
615 const float rho =
density * (delta_rho + 1);
616 rho_out[pos_index] += weight * rho;
624 gpu::FieldAccessor<float> vel,
630 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
631 blockDim.x * blockIdx.x + threadIdx.x;
633 vel.set({0u, 0u, 0u}, {0u, 0u, 0u});
634 if (vel.isValidPosition() and pos_index < n_pos) {
635 auto const array_offset = pos_index * uint(3u);
640 for (
int i = 0; i < 2; i++) {
641 auto const cx = corner[0] + i;
642 auto const wx = weights[0][i];
644 for (
int j = 0; j < 2; j++) {
645 auto const cy = corner[1] + j;
646 auto const wxy = wx * weights[1][j];
648 for (
int k = 0; k < 2; k++) {
649 auto const cz = corner[2] + k;
650 auto const weight = wxy * weights[2][k];
652 for (uint cf = 0u; cf < 3u; ++cf) {
653 vel_out[array_offset + cf] += weight * vel.getNeighbor(cx, cy, cz, cf);
662 gpu::FieldAccessor<float> force,
668 uint pos_index = blockIdx.y * gridDim.x * blockDim.x +
669 blockDim.x * blockIdx.x + threadIdx.x;
671 force.set({0u, 0u, 0u}, {0u, 0u, 0u});
672 if (force.isValidPosition() and pos_index < n_pos) {
673 auto const array_offset = pos_index * uint(3u);
678 for (
int i = 0; i < 2; i++) {
679 auto const cx = corner[0] + i;
680 auto const wx = weights[0][i];
682 for (
int j = 0; j < 2; j++) {
683 auto const cy = corner[1] + j;
684 auto const wxy = wx * weights[1][j];
686 for (
int k = 0; k < 2; k++) {
687 auto const cz = corner[2] + k;
688 auto const weight = wxy * weights[2][k];
690 for (uint cf = 0u; cf < 3u; ++cf) {
691 atomicAdd(&force.getNeighbor(cx, cy, cz, cf),
692 weight * forces[array_offset + cf]);
703 gpu::GPUField<float>
const *field,
704 std::vector<float>
const &pos,
707 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
708 thrust::device_vector<float> dev_rho(pos.size() / 3ul);
709 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
710 auto const dev_rho_ptr = thrust::raw_pointer_cast(dev_rho.data());
712 auto const threads_per_block = uint(64u);
713 auto const n_pos =
static_cast<uint
>(pos.size() / 3ul);
715 kernel_get_rho<<<dim_grid, threads_per_block, 0u, nullptr>>>(
716 gpu::FieldIndexing<float>::withGhostLayerXYZ(*field, gl).gpuAccess(),
717 dev_pos_ptr, dev_rho_ptr,
density, n_pos, gl);
719 std::vector<float> out(dev_rho.size());
720 thrust::copy(dev_rho.begin(), dev_rho.end(), out.data());
726 gpu::GPUField<float>
const *field,
727 std::vector<float>
const &pos,
729 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
730 thrust::device_vector<float> dev_vel(pos.size());
731 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
732 auto const dev_vel_ptr = thrust::raw_pointer_cast(dev_vel.data());
734 auto const threads_per_block = uint(64u);
735 auto const n_pos =
static_cast<uint
>(pos.size() / 3ul);
737 kernel_get_vel<<<dim_grid, threads_per_block, 0u, nullptr>>>(
738 gpu::FieldIndexing<float>::withGhostLayerXYZ(*field, gl).gpuAccess(),
739 dev_pos_ptr, dev_vel_ptr, n_pos, gl);
741 std::vector<float> out(dev_vel.size());
742 thrust::copy(dev_vel.begin(), dev_vel.end(), out.data());
747 gpu::GPUField<float>
const *field,
748 std::vector<float>
const &pos,
749 std::vector<float>
const &forces,
751 thrust::device_vector<float> dev_pos(pos.begin(), pos.end());
752 thrust::device_vector<float> dev_for(forces.begin(), forces.end());
753 auto const dev_pos_ptr = thrust::raw_pointer_cast(dev_pos.data());
754 auto const dev_for_ptr = thrust::raw_pointer_cast(dev_for.data());
756 auto const threads_per_block = uint(64u);
757 auto const n_pos =
static_cast<uint
>(pos.size() / 3ul);
759 kernel_add_force<<<dim_grid, threads_per_block, 0u, nullptr>>>(
760 gpu::FieldIndexing<float>::withGhostLayerXYZ(*field, gl).gpuAccess(),
761 dev_pos_ptr, dev_for_ptr, n_pos, gl);
765namespace Equilibrium {
768 gpu::FieldAccessor<float> pdf,
772 float delta_rho = rho -
float{1};
774 pdf.get(0u) = delta_rho * 0.33333333333333331f + rho * -0.33333333333333331f * (u[0] * u[0]) + rho * -0.33333333333333331f * (u[1] * u[1]) + rho * -0.33333333333333331f * (u[2] * u[2]);
775 pdf.get(1u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.16666666666666666f * u[1] + rho * 0.16666666666666666f * (u[1] * u[1]);
776 pdf.get(2u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * u[1] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.16666666666666666f * (u[1] * u[1]);
777 pdf.get(3u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * u[0] + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.16666666666666666f * (u[0] * u[0]);
778 pdf.get(4u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * -0.16666666666666666f * (u[2] * u[2]) + rho * 0.16666666666666666f * u[0] + rho * 0.16666666666666666f * (u[0] * u[0]);
779 pdf.get(5u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.16666666666666666f * u[2] + rho * 0.16666666666666666f * (u[2] * u[2]);
780 pdf.get(6u) = delta_rho * 0.055555555555555552f + rho * -0.16666666666666666f * u[2] + rho * -0.16666666666666666f * (u[0] * u[0]) + rho * -0.16666666666666666f * (u[1] * u[1]) + rho * 0.16666666666666666f * (u[2] * u[2]);
781 pdf.get(7u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[1] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
782 pdf.get(8u) = delta_rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
783 pdf.get(9u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.25f * u[0] * u[1];
784 pdf.get(10u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[0] * u[1] + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[1] * u[1]);
785 pdf.get(11u) = delta_rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
786 pdf.get(12u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[1] + rho * -0.25f * u[1] * u[2] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
787 pdf.get(13u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[0] + rho * -0.25f * u[0] * u[2] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
788 pdf.get(14u) = delta_rho * 0.027777777777777776f + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
789 pdf.get(15u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[1] * u[2] + rho * 0.083333333333333329f * u[1] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]);
790 pdf.get(16u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[1] + rho * -0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[1] * u[1]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[1] * u[2];
791 pdf.get(17u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[0] + rho * -0.083333333333333329f * u[2] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]) + rho * 0.25f * u[0] * u[2];
792 pdf.get(18u) = delta_rho * 0.027777777777777776f + rho * -0.083333333333333329f * u[2] + rho * -0.25f * u[0] * u[2] + rho * 0.083333333333333329f * u[0] + rho * 0.083333333333333329f * (u[0] * u[0]) + rho * 0.083333333333333329f * (u[2] * u[2]);
800 gpu::FieldAccessor<float> pdf,
803 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
804 pdf.set(blockIdx, threadIdx);
806 if (pdf.isValidPosition()) {
807 float const f_0 = pdf.get(0u);
808 float const f_1 = pdf.get(1u);
809 float const f_2 = pdf.get(2u);
810 float const f_3 = pdf.get(3u);
811 float const f_4 = pdf.get(4u);
812 float const f_5 = pdf.get(5u);
813 float const f_6 = pdf.get(6u);
814 float const f_7 = pdf.get(7u);
815 float const f_8 = pdf.get(8u);
816 float const f_9 = pdf.get(9u);
817 float const f_10 = pdf.get(10u);
818 float const f_11 = pdf.get(11u);
819 float const f_12 = pdf.get(12u);
820 float const f_13 = pdf.get(13u);
821 float const f_14 = pdf.get(14u);
822 float const f_15 = pdf.get(15u);
823 float const f_16 = pdf.get(16u);
824 float const f_17 = pdf.get(17u);
825 float const f_18 = pdf.get(18u);
826 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
827 const float vel1Term = f_1 + f_11 + f_15 + f_7;
828 const float vel2Term = f_12 + f_13 + f_5;
829 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
830 const float rho =
density * (delta_rho + 1);
836 gpu::FieldAccessor<float> pdf,
839 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 1u);
840 pdf.set(blockIdx, threadIdx);
842 if (pdf.isValidPosition()) {
843 float const f_0 = pdf.get(0u);
844 float const f_1 = pdf.get(1u);
845 float const f_2 = pdf.get(2u);
846 float const f_3 = pdf.get(3u);
847 float const f_4 = pdf.get(4u);
848 float const f_5 = pdf.get(5u);
849 float const f_6 = pdf.get(6u);
850 float const f_7 = pdf.get(7u);
851 float const f_8 = pdf.get(8u);
852 float const f_9 = pdf.get(9u);
853 float const f_10 = pdf.get(10u);
854 float const f_11 = pdf.get(11u);
855 float const f_12 = pdf.get(12u);
856 float const f_13 = pdf.get(13u);
857 float const f_14 = pdf.get(14u);
858 float const f_15 = pdf.get(15u);
859 float const f_16 = pdf.get(16u);
860 float const f_17 = pdf.get(17u);
861 float const f_18 = pdf.get(18u);
862 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
863 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
864 const float vel1Term = f_1 + f_11 + f_15 + f_7;
865 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
866 const float vel2Term = f_12 + f_13 + f_5;
867 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
868 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
869 const float rho = delta_rho + 1;
872 float const rho_inv =
float{1} / rho;
873 float const u_old[3] = {momdensity_0 * rho_inv, momdensity_1 * rho_inv, momdensity_2 * rho_inv};
881 gpu::GPUField<float>
const *pdf_field,
884 CellInterval ci(cell, cell);
885 thrust::device_vector<float> dev_data(1u);
886 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
888 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
889 kernel.addParam(dev_data_ptr);
893 thrust::copy(dev_data.begin(), dev_data.end(), &rho);
898 gpu::GPUField<float>
const *pdf_field,
900 CellInterval
const &ci) {
901 thrust::device_vector<float> dev_data(ci.numCells());
902 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
904 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
905 kernel.addParam(dev_data_ptr);
908 std::vector<float> out(dev_data.size());
909 thrust::copy(dev_data.begin(), dev_data.end(), out.begin());
914 gpu::GPUField<float> *pdf_field,
918 CellInterval ci(cell, cell);
919 thrust::device_vector<float> dev_data(1u, rho);
920 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
922 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
923 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
929 gpu::GPUField<float> *pdf_field,
930 std::vector<float>
const &values,
932 CellInterval
const &ci) {
933 thrust::device_vector<float> dev_data(values.begin(), values.end());
934 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
936 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
937 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
946 gpu::FieldAccessor<float> pdf,
947 gpu::FieldAccessor<float> force,
949 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
950 pdf.set(blockIdx, threadIdx);
951 force.set(blockIdx, threadIdx);
953 if (pdf.isValidPosition()) {
954 float const f_0 = pdf.get(0u);
955 float const f_1 = pdf.get(1u);
956 float const f_2 = pdf.get(2u);
957 float const f_3 = pdf.get(3u);
958 float const f_4 = pdf.get(4u);
959 float const f_5 = pdf.get(5u);
960 float const f_6 = pdf.get(6u);
961 float const f_7 = pdf.get(7u);
962 float const f_8 = pdf.get(8u);
963 float const f_9 = pdf.get(9u);
964 float const f_10 = pdf.get(10u);
965 float const f_11 = pdf.get(11u);
966 float const f_12 = pdf.get(12u);
967 float const f_13 = pdf.get(13u);
968 float const f_14 = pdf.get(14u);
969 float const f_15 = pdf.get(15u);
970 float const f_16 = pdf.get(16u);
971 float const f_17 = pdf.get(17u);
972 float const f_18 = pdf.get(18u);
973 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
974 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
975 const float vel1Term = f_1 + f_11 + f_15 + f_7;
976 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
977 const float vel2Term = f_12 + f_13 + f_5;
978 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
979 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
980 const float rho = delta_rho + 1;
981 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
982 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
983 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
984 auto const rho_inv =
float{1} / rho;
985 u_out[0u] = md_0 * rho_inv;
986 u_out[1u] = md_1 * rho_inv;
987 u_out[2u] = md_2 * rho_inv;
992 gpu::FieldAccessor<float> pdf,
994 gpu::FieldAccessor<float> force,
996 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
997 pdf.set(blockIdx, threadIdx);
999 force.set(blockIdx, threadIdx);
1001 if (pdf.isValidPosition()) {
1002 float const f_0 = pdf.get(0u);
1003 float const f_1 = pdf.get(1u);
1004 float const f_2 = pdf.get(2u);
1005 float const f_3 = pdf.get(3u);
1006 float const f_4 = pdf.get(4u);
1007 float const f_5 = pdf.get(5u);
1008 float const f_6 = pdf.get(6u);
1009 float const f_7 = pdf.get(7u);
1010 float const f_8 = pdf.get(8u);
1011 float const f_9 = pdf.get(9u);
1012 float const f_10 = pdf.get(10u);
1013 float const f_11 = pdf.get(11u);
1014 float const f_12 = pdf.get(12u);
1015 float const f_13 = pdf.get(13u);
1016 float const f_14 = pdf.get(14u);
1017 float const f_15 = pdf.get(15u);
1018 float const f_16 = pdf.get(16u);
1019 float const f_17 = pdf.get(17u);
1020 float const f_18 = pdf.get(18u);
1021 float const *
RESTRICT const u = u_in;
1022 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
1023 const float vel1Term = f_1 + f_11 + f_15 + f_7;
1024 const float vel2Term = f_12 + f_13 + f_5;
1025 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1026 const float rho = delta_rho + 1;
1027 const float u_0 = -force.get(0) * 0.50000000000000000f / rho + u[0];
1028 const float u_1 = -force.get(1) * 0.50000000000000000f / rho + u[1];
1029 const float u_2 = -force.get(2) * 0.50000000000000000f / rho + u[2];
1034 float u_new[3] = {u_0, u_1, u_2};
1042 gpu::GPUField<float>
const *pdf_field,
1043 gpu::GPUField<float>
const *force_field,
1045 CellInterval ci(cell, cell);
1046 thrust::device_vector<float> dev_data(3u);
1047 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1049 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1050 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1051 kernel.addParam(dev_data_ptr);
1054 thrust::copy(dev_data.begin(), dev_data.end(), vec.data());
1059 gpu::GPUField<float>
const *pdf_field,
1060 gpu::GPUField<float>
const *force_field,
1061 CellInterval
const &ci) {
1062 thrust::device_vector<float> dev_data(3u);
1063 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1065 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1066 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1067 kernel.addParam(dev_data_ptr);
1069 std::vector<float> out(dev_data.size());
1070 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1075 gpu::GPUField<float> *pdf_field,
1076 gpu::GPUField<float> *velocity_field,
1077 gpu::GPUField<float>
const *force_field,
1078 Vector3<float>
const &u,
1080 CellInterval ci(cell, cell);
1081 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
1082 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1084 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1085 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1086 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1087 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
1092 gpu::GPUField<float> *pdf_field,
1093 gpu::GPUField<float> *velocity_field,
1094 gpu::GPUField<float>
const *force_field,
1095 std::vector<float>
const &values,
1096 CellInterval
const &ci) {
1097 thrust::device_vector<float> dev_data(values.begin(), values.end());
1098 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1100 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1101 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1102 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1103 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
1111 gpu::FieldAccessor<float> pdf,
1112 gpu::FieldAccessor<float>
velocity,
1113 gpu::FieldAccessor<float> force,
1116 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
1117 pdf.set(blockIdx, threadIdx);
1119 force.set(blockIdx, threadIdx);
1121 if (pdf.isValidPosition()) {
1122 float const f_0 = pdf.get(0u);
1123 float const f_1 = pdf.get(1u);
1124 float const f_2 = pdf.get(2u);
1125 float const f_3 = pdf.get(3u);
1126 float const f_4 = pdf.get(4u);
1127 float const f_5 = pdf.get(5u);
1128 float const f_6 = pdf.get(6u);
1129 float const f_7 = pdf.get(7u);
1130 float const f_8 = pdf.get(8u);
1131 float const f_9 = pdf.get(9u);
1132 float const f_10 = pdf.get(10u);
1133 float const f_11 = pdf.get(11u);
1134 float const f_12 = pdf.get(12u);
1135 float const f_13 = pdf.get(13u);
1136 float const f_14 = pdf.get(14u);
1137 float const f_15 = pdf.get(15u);
1138 float const f_16 = pdf.get(16u);
1139 float const f_17 = pdf.get(17u);
1140 float const f_18 = pdf.get(18u);
1141 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
1142 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
1143 const float vel1Term = f_1 + f_11 + f_15 + f_7;
1144 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
1145 const float vel2Term = f_12 + f_13 + f_5;
1146 const float delta_rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
1147 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1148 const float rho = delta_rho + 1;
1149 const float md_0 = momdensity_0 + f_in[0u] * 0.50000000000000000f /
density;
1150 const float md_1 = momdensity_1 + f_in[1u] * 0.50000000000000000f /
density;
1151 const float md_2 = momdensity_2 + f_in[2u] * 0.50000000000000000f /
density;
1152 auto const rho_inv =
float{1} / rho;
1153 auto const density_inv =
float{1} /
density;
1155 force.get(0u) = f_in[0u] * density_inv;
1156 force.get(1u) = f_in[1u] * density_inv;
1157 force.get(2u) = f_in[2u] * density_inv;
1166void set(gpu::GPUField<float>
const *pdf_field,
1167 gpu::GPUField<float> *velocity_field,
1168 gpu::GPUField<float> *force_field,
1169 Vector3<float>
const &u,
1172 CellInterval ci(cell, cell);
1173 thrust::device_vector<float> dev_data(u.data(), u.data() + 3u);
1174 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1176 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1177 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1178 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1179 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
1184void set(gpu::GPUField<float>
const *pdf_field,
1185 gpu::GPUField<float> *velocity_field,
1186 gpu::GPUField<float> *force_field,
1187 std::vector<float>
const &values,
1189 CellInterval
const &ci) {
1190 thrust::device_vector<float> dev_data(values.begin(), values.end());
1191 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1193 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1194 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*velocity_field, ci));
1195 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1196 kernel.addParam(
const_cast<const float *
>(dev_data_ptr));
1202namespace MomentumDensity {
1205 gpu::FieldAccessor<float> pdf,
1206 gpu::FieldAccessor<float> force,
1209 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 3u);
1210 pdf.set(blockIdx, threadIdx);
1211 force.set(blockIdx, threadIdx);
1213 if (pdf.isValidPosition()) {
1214 float const f_1 = pdf.get(1u);
1215 float const f_2 = pdf.get(2u);
1216 float const f_3 = pdf.get(3u);
1217 float const f_4 = pdf.get(4u);
1218 float const f_5 = pdf.get(5u);
1219 float const f_6 = pdf.get(6u);
1220 float const f_7 = pdf.get(7u);
1221 float const f_8 = pdf.get(8u);
1222 float const f_9 = pdf.get(9u);
1223 float const f_10 = pdf.get(10u);
1224 float const f_11 = pdf.get(11u);
1225 float const f_12 = pdf.get(12u);
1226 float const f_13 = pdf.get(13u);
1227 float const f_14 = pdf.get(14u);
1228 float const f_15 = pdf.get(15u);
1229 float const f_16 = pdf.get(16u);
1230 float const f_17 = pdf.get(17u);
1231 float const f_18 = pdf.get(18u);
1232 const float vel0Term = f_10 + f_14 + f_18 + f_4 + f_8;
1233 const float momdensity_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term;
1234 const float vel1Term = f_1 + f_11 + f_15 + f_7;
1235 const float momdensity_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term;
1236 const float vel2Term = f_12 + f_13 + f_5;
1237 const float momdensity_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term;
1238 const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
1239 const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
1240 const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
1249 gpu::GPUField<float>
const *pdf_field,
1250 gpu::GPUField<float>
const *force_field,
1252 auto const ci = pdf_field->xyzSize();
1253 thrust::device_vector<float> dev_data(3u * ci.numCells());
1254 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1256 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1257 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
1258 kernel.addParam(dev_data_ptr);
1261 std::vector<float> out(dev_data.size());
1262 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1263 Vector3<float> mom(
float{0});
1264 for (
auto i = uint_t{0u}; i < 3u * ci.numCells(); i += 3u) {
1265 mom[0u] += out[i + 0u];
1266 mom[1u] += out[i + 1u];
1267 mom[2u] += out[i + 2u];
1273namespace PressureTensor {
1276 gpu::FieldAccessor<float> pdf,
1278 auto const offset =
getLinearIndex(blockIdx, threadIdx, gridDim, blockDim, 9u);
1279 pdf.set(blockIdx, threadIdx);
1281 if (pdf.isValidPosition()) {
1282 float const f_1 = pdf.getNeighbor(0, -1, 0, 1u);
1283 float const f_2 = pdf.getNeighbor(0, 1, 0, 2u);
1284 float const f_3 = pdf.getNeighbor(1, 0, 0, 3u);
1285 float const f_4 = pdf.getNeighbor(-1, 0, 0, 4u);
1286 float const f_5 = pdf.getNeighbor(0, 0, -1, 5u);
1287 float const f_6 = pdf.getNeighbor(0, 0, 1, 6u);
1288 float const f_7 = pdf.getNeighbor(1, -1, 0, 7u);
1289 float const f_8 = pdf.getNeighbor(-1, -1, 0, 8u);
1290 float const f_9 = pdf.getNeighbor(1, 1, 0, 9u);
1291 float const f_10 = pdf.getNeighbor(-1, 1, 0, 10u);
1292 float const f_11 = pdf.getNeighbor(0, -1, -1, 11u);
1293 float const f_12 = pdf.getNeighbor(0, 1, -1, 12u);
1294 float const f_13 = pdf.getNeighbor(1, 0, -1, 13u);
1295 float const f_14 = pdf.getNeighbor(-1, 0, -1, 14u);
1296 float const f_15 = pdf.getNeighbor(0, -1, 1, 15u);
1297 float const f_16 = pdf.getNeighbor(0, 1, 1, 16u);
1298 float const f_17 = pdf.getNeighbor(1, 0, 1, 17u);
1299 float const f_18 = pdf.getNeighbor(-1, 0, 1, 18u);
1300 const float p_0 = f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9 + 0.33333333333333333f;
1301 const float p_1 = -f_10 - f_7 + f_8 + f_9;
1302 const float p_2 = -f_13 + f_14 + f_17 - f_18;
1303 const float p_3 = -f_10 - f_7 + f_8 + f_9;
1304 const float p_4 = f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9 + 0.33333333333333333f;
1305 const float p_5 = f_11 - f_12 - f_15 + f_16;
1306 const float p_6 = -f_13 + f_14 + f_17 - f_18;
1307 const float p_7 = f_11 - f_12 - f_15 + f_16;
1308 const float p_8 = f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6 + 0.33333333333333333f;
1323 gpu::GPUField<float>
const *pdf_field,
1326 CellInterval ci(cell, cell);
1327 thrust::device_vector<float> dev_data(9u);
1328 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1330 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1331 kernel.addParam(dev_data_ptr);
1334 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1339 gpu::GPUField<float>
const *pdf_field,
1341 CellInterval
const &ci) {
1342 thrust::device_vector<float> dev_data(9u * ci.numCells());
1343 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1345 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1346 kernel.addParam(dev_data_ptr);
1348 std::vector<float> out(dev_data.size());
1349 thrust::transform(dev_data.begin(), dev_data.end(), dev_data.begin(),
1351 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1356 gpu::GPUField<float>
const *pdf_field,
1358 auto const ci = pdf_field->xyzSize();
1359 thrust::device_vector<float> dev_data(9u * ci.numCells());
1360 auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
1362 kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
1363 kernel.addParam(dev_data_ptr);
1365 std::vector<float> out(dev_data.size());
1366 thrust::copy(dev_data.begin(), dev_data.end(), out.data());
1367 Matrix3<float> pressureTensor(
float{0});
1368 for (
auto i = uint_t{0u}; i < 9u * ci.numCells(); i += 9u) {
1369 pressureTensor[0u] += out[i + 0u];
1370 pressureTensor[1u] += out[i + 1u];
1371 pressureTensor[2u] += out[i + 2u];
1373 pressureTensor[3u] += out[i + 3u];
1374 pressureTensor[4u] += out[i + 4u];
1375 pressureTensor[5u] += out[i + 5u];
1377 pressureTensor[6u] += out[i + 6u];
1378 pressureTensor[7u] += out[i + 7u];
1379 pressureTensor[8u] += out[i + 8u];
1381 return pressureTensor *
density;
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
static __forceinline__ __device__ uint getLinearIndex(uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim, uint fOffset)
Get linear index of flattened data with original layout fzyx.
static dim3 calculate_dim_grid(uint const threads_x, uint const blocks_per_grid_y, uint const threads_per_block)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, double const rho_in, double const density, Cell const &cell)
double get(GhostLayerField< double, uint_t{19u}> const *pdf_field, double const density, Cell const &cell)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT rho_out, double const density)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, double const *RESTRICT rho_in, double const density)
__device__ void kernel_set_device(gpu::FieldAccessor< double > pdf, double const *RESTRICT const u, double rho)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT f_in, double const density)
void set(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> *velocity_field, GhostLayerField< double, uint_t{3u}> *force_field, Vector3< double > const &force, double const density, Cell const &cell)
__global__ void kernel_get_rho(gpu::FieldAccessor< double > pdf, double const *RESTRICT const pos, double *RESTRICT const rho_out, double const density, uint n_pos, uint gl)
void add_force(gpu::GPUField< double > const *field, std::vector< double > const &pos, std::vector< double > const &forces, uint gl)
__global__ void kernel_add_force(gpu::FieldAccessor< double > force, double const *RESTRICT const pos, double const *RESTRICT const forces, uint n_pos, uint gl)
std::vector< double > get_rho(gpu::GPUField< double > const *field, std::vector< double > const &pos, double const density, uint gl)
__global__ void kernel_get_vel(gpu::FieldAccessor< double > vel, double const *RESTRICT const pos, double *RESTRICT const vel_out, uint n_pos, uint gl)
std::vector< double > get_vel(gpu::GPUField< double > const *field, std::vector< double > const &pos, uint gl)
static __forceinline__ __device__ void calculate_weights(double const *RESTRICT const pos, int *RESTRICT const corner, double *RESTRICT const weights, uint gl)
Calculate interpolation weights.
auto reduce(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> const *force_field, double const density)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > force, double *RESTRICT out, double const density)
__global__ void kernel_set_vel(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT pop)
__global__ void kernel_broadcast(gpu::FieldAccessor< double > pdf, double const *RESTRICT pop)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, double const *RESTRICT pop)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, std::array< double, 19u > const &pop, Cell const &cell)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT pop)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, Cell const &cell)
void initialize(GhostLayerField< double, uint_t{19u}> *pdf_field, std::array< double, 19u > const &pop)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, double *RESTRICT p_out)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, double const density, Cell const &cell)
auto reduce(GhostLayerField< double, uint_t{19u}> const *pdf_field, double const density)
void initialize(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec)
__global__ void kernel_get(gpu::FieldAccessor< double > vec, double *u_out)
void set_from_list(gpu::GPUField< double > const *field, thrust::device_vector< int > const &indices, thrust::device_vector< double > const &values, uint gl)
__global__ void kernel_broadcast(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void add(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec, Cell const &cell)
__global__ void kernel_set(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void set(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec, Cell const &cell)
__global__ void kernel_set_from_list(gpu::FieldAccessor< double > vec, int const *RESTRICT const indices, double const *RESTRICT const values, uint length)
auto get(GhostLayerField< double, uint_t{3u}> const *vec_field, Cell const &cell)
__global__ void kernel_broadcast_add(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void add_to_all(GhostLayerField< double, uint_t{3u}> *vec_field, Vector3< double > const &vec)
__global__ void kernel_add(gpu::FieldAccessor< double > vec, double const *RESTRICT u_in)
void set(GhostLayerField< double, uint_t{19u}> *pdf_field, GhostLayerField< double, uint_t{3u}> *velocity_field, GhostLayerField< double, uint_t{3u}> const *force_field, Vector3< double > const &u, Cell const &cell)
__global__ void kernel_get(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > force, double *RESTRICT u_out)
__global__ void kernel_set(gpu::FieldAccessor< double > pdf, gpu::FieldAccessor< double > velocity, gpu::FieldAccessor< double > force, double const *RESTRICT u_in)
auto get(GhostLayerField< double, uint_t{19u}> const *pdf_field, GhostLayerField< double, uint_t{3u}> const *force_field, Cell const &cell)
\file PackInfoPdfDoublePrecision.cpp \author pystencils
static Utils::Vector3d velocity(Particle const &p_ref, Particle const &p_vs)
Velocity of the virtual site.
Rescale values in a device vector by some constant.
__host__ __device__ constexpr float operator()(float const &x) const
algo_rescale(float scale_factor)