ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
p3m_gpu_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2026 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20/**
21 * @file
22 *
23 * P3M electrostatics on GPU.
24 *
25 * The corresponding header file is @ref p3m_gpu_cuda.cuh.
26 */
27
28#include <config/config.hpp>
29
30#ifdef ESPRESSO_P3M
31
32#define ESPRESSO_P3M_GPU_FLOAT
33// #define ESPRESSO_P3M_GPU_REAL_DOUBLE
34
35#ifdef ESPRESSO_P3M_GPU_FLOAT
36#define REAL_TYPE float
37#define FFT_TYPE_COMPLEX cufftComplex
38#define FFT_FORW_FFT cufftExecR2C
39#define FFT_BACK_FFT cufftExecC2R
40#define FFT_PLAN_FORW_FLAG CUFFT_R2C
41#define FFT_PLAN_BACK_FLAG CUFFT_C2R
42#endif
43
44#ifdef ESPRESSO_P3M_GPU_REAL_DOUBLE
45#define REAL_TYPE double
46#define FFT_TYPE_COMPLEX cufftDoubleComplex
47#define FFT_FORW_FFT cufftExecD2Z
48#define FFT_BACK_FFT cufftExecZ2D
49#define FFT_PLAN_FORW_FLAG CUFFT_D2Z
50#define FFT_PLAN_BACK_FLAG CUFFT_Z2D
51#endif
52
54
55#include "cuda/utils.cuh"
56#include "p3m/math.hpp"
57#include "system/System.hpp"
58
60
63#include <utils/math/sqr.hpp>
64
65#include <cuda.h>
66#include <cufft.h>
67
68#include <algorithm>
69#include <cassert>
70#include <cstddef>
71#include <limits>
72#include <numbers>
73#include <stdexcept>
74
75#if defined(OMPI_MPI_H) || defined(_MPI_H)
76#error CU-file includes mpi.h! This should not happen!
77#endif
78
79using Utils::int_pow;
80using Utils::sqr;
81
/** Per-run parameters and device buffers for the P3M GPU kernels.
 *  All pointers are device pointers; the struct is passed by value to
 *  kernels, so it must stay trivially copyable.
 *  (Field declarations reconstructed from the documented member list —
 *  verify against upstream.)
 */
struct P3MGpuData {
  /** Charge mesh */
  FFT_TYPE_COMPLEX *charge_mesh;
  /** Force meshes */
  FFT_TYPE_COMPLEX *force_mesh_x;
  FFT_TYPE_COMPLEX *force_mesh_y;
  FFT_TYPE_COMPLEX *force_mesh_z;
  /** Influence Function */
  REAL_TYPE *G_hat;
  /** Charge assignment order */
  int cao;
  /** Total number of mesh points (including padding) */
  int mesh_size;
  /** Ewald parameter */
  REAL_TYPE alpha;
  /** Number of particles */
  unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode
  /** Box size */
  REAL_TYPE box[3];
  /** Mesh dimensions */
  int mesh[3];
  /** Padded size */
  int mesh_z_padded;
  /** Inverse mesh spacing */
  REAL_TYPE hi[3];
  /** Position shift */
  REAL_TYPE pos_shift;
};
110
/** cuFFT plan handles for the forward (R2C) and backward (C2R) transforms. */
struct P3MGpuFftPlan {
  /** Forward FFT plan */
  cufftHandle forw_plan;
  /** Backward FFT plan */
  cufftHandle back_plan;
};
117
/** Owner of all device-side P3M GPU state.
 *  Device buffers and cuFFT plans are released via free_device_memory(),
 *  which is idempotent thanks to the null-pointer checks.
 *  (Struct header reconstructed from the documented member list —
 *  verify against upstream.)
 */
struct P3MGpuParams {
  /** Whether the device buffers and plans are currently valid. */
  bool is_initialized = false;
  /** Kernel parameters and device buffers. */
  P3MGpuData p3m_gpu_data;
  /** FFT plans. */
  P3MGpuFftPlan p3m_fft;

  ~P3MGpuParams() { free_device_memory(); }

  /** Release all device buffers and destroy the FFT plans. */
  void free_device_memory() {
    auto const free_device_pointer = [](auto *&ptr) {
      if (ptr != nullptr) {
        cuda_safe_mem(cudaFree(reinterpret_cast<void *>(ptr)));
        ptr = nullptr;
      }
    };
    free_device_pointer(p3m_gpu_data.charge_mesh);
    free_device_pointer(p3m_gpu_data.force_mesh_x);
    free_device_pointer(p3m_gpu_data.force_mesh_y);
    free_device_pointer(p3m_gpu_data.force_mesh_z);
    free_device_pointer(p3m_gpu_data.G_hat);
    cufftDestroy(p3m_fft.forw_plan);
    cufftDestroy(p3m_fft.back_plan);
    is_initialized = false;
  }
};
142
/** Choose the launch decomposition for the assignment kernels.
 *  Each particle occupies cao^3 threads; pack as many particles into a
 *  block as fit under the 1024-threads-per-block limit, then derive the
 *  number of blocks needed to cover all particles (at least one block).
 *  @return pair of (particles per block, number of blocks)
 */
static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
  auto const threads_per_part = Utils::int_pow<3>(cao);
  auto parts_per_block = 1u;
  while ((parts_per_block + 1u) * threads_per_part <= 1024u) {
    ++parts_per_block;
  }
  assert((n_part / parts_per_block) <= std::numeric_limits<unsigned>::max());
  auto const full_blocks = static_cast<unsigned int>(n_part / parts_per_block);
  auto const n_blocks = ((n_part % parts_per_block) == 0u)
                            ? std::max(1u, full_blocks)
                            : full_blocks + 1u;
  assert(n_blocks <= std::numeric_limits<unsigned>::max());
  return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
}
155
/** Fold a 1D block count into a 2D grid.
 *  Keeps grid.x at or below 65536 by growing grid.y, rounding grid.x up
 *  when n_blocks is not divisible (kernels must bounds-check the overshoot).
 */
dim3 p3m_make_grid(unsigned int n_blocks) {
  dim3 grid(n_blocks, 1u, 1u);
  while (grid.x > 65536u) {
    ++grid.y;
    auto const quotient = n_blocks / grid.y;
    grid.x =
        ((n_blocks % grid.y) == 0u) ? std::max(1u, quotient) : quotient + 1u;
  }
  return grid;
}
167
/** Aliasing sums for the ik-differentiated optimal influence function
 *  at mesh frequency (NX, NY, NZ).
 *  @param[in]  p        P3M parameters (box, mesh, alpha).
 *  @param[out] Zaehler  numerator, one component per direction
 *  @param[out] Nenner   denominator (sum of assignment-function weights)
 */
template <int cao>
__device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                        int NZ, REAL_TYPE *Zaehler,
                                        REAL_TYPE *Nenner) {
  REAL_TYPE S1, S2, S3;
  REAL_TYPE zwi;
  int MX, MY, MZ;
  REAL_TYPE NMX, NMY, NMZ;
  REAL_TYPE NM2;
  REAL_TYPE TE;
  REAL_TYPE Leni[3];  // inverse box lengths
  REAL_TYPE Meshi[3]; // inverse mesh sizes
  for (int i = 0; i < 3; ++i) {
    Leni[i] = 1.0f / p.box[i];
    Meshi[i] = 1.0f / static_cast<REAL_TYPE>(p.mesh[i]);
  }

  Zaehler[0] = Zaehler[1] = Zaehler[2] = *Nenner = 0.0;

  // sum over aliases within +/- P3M_BRILLOUIN zones in each direction
  for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
    // fold mesh index to a signed frequency, then add the alias offset
    NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                 p.mesh[0] * MX);
    // sinc^(2*cao): Fourier transform of the cao-order assignment function
    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
    for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
      NMY = static_cast<REAL_TYPE>(
          ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
      for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
        NMZ = static_cast<REAL_TYPE>(
            ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));

        // squared norm of the aliased wave vector (in units of 1/L)
        NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
        *Nenner += S3;

        // Gaussian screening factor exp(-(pi/alpha)^2 * k^2)
        TE = exp(-sqr(std::numbers::pi_v<REAL_TYPE> / (p.alpha)) * NM2);
        zwi = S3 * TE / NM2;
        Zaehler[0] += NMX * zwi * Leni[0];
        Zaehler[1] += NMY * zwi * Leni[1];
        Zaehler[2] += NMZ * zwi * Leni[2];
      }
    }
  }
}
212
/** Calculate the optimal influence function for ik-differentiation.
 *  One thread per k-space mesh point; the launch configuration must cover
 *  mesh[0] x mesh[1] x (mesh[2]/2 + 1) points (excess threads return early).
 *  (Kernel signature restored from the documented declaration — it was
 *  lost in the extracted listing.)
 */
template <int cao>
__global__ void calculate_influence_function_device(const P3MGpuData p) {

  const auto NX = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
  const auto NY = static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
  const auto NZ = static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
  REAL_TYPE Dnx, Dny, Dnz;
  REAL_TYPE Zaehler[3] = {0.0, 0.0, 0.0}, Nenner = 0.0;
  REAL_TYPE zwi;
  auto index = 0;
  REAL_TYPE Leni[3]; // inverse box lengths
  for (int i = 0; i < 3; ++i) {
    Leni[i] = REAL_TYPE{1} / p.box[i];
  }

  // guard against the launch overshoot (grid is rounded up)
  if ((NX >= p.mesh[0]) || (NY >= p.mesh[1]) || (NZ >= (p.mesh[2] / 2 + 1)))
    return;

  index = NX * p.mesh[1] * (p.mesh[2] / 2 + 1) + NY * (p.mesh[2] / 2 + 1) + NZ;

  // zero out the k = 0 mode and the Nyquist modes
  if (((NX == 0) && (NY == 0) && (NZ == 0)) ||
      ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) &&
       (NZ % (p.mesh[2] / 2) == 0))) {
    p.G_hat[index] = 0;
  } else {
    Aliasing_sums_ik<cao>(p, NX, NY, NZ, Zaehler, &Nenner);

    // fold mesh indices to signed frequencies
    Dnx = static_cast<REAL_TYPE>((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX);
    Dny = static_cast<REAL_TYPE>((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY);
    Dnz = static_cast<REAL_TYPE>((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ);

    zwi = Dnx * Zaehler[0] * Leni[0] + Dny * Zaehler[1] * Leni[1] +
          Dnz * Zaehler[2] * Leni[2];
    zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) *
            sqr(Nenner));
    p.G_hat[index] = REAL_TYPE{2} * zwi / std::numbers::pi_v<REAL_TYPE>;
  }
}
252
namespace {
/** Linear index into the padded real-space mesh (row-major, z fastest). */
__device__ inline auto linear_index_r(P3MGpuData const &p, int i, int j,
                                      int k) {
  auto const stride_j = p.mesh_z_padded;
  auto const stride_i = p.mesh[1] * stride_j;
  return static_cast<unsigned int>(stride_i * i + stride_j * j + k);
}

/** Linear index into the k-space mesh of size Nx * Ny * (Nz/2 + 1). */
__device__ inline auto linear_index_k(P3MGpuData const &p, int i, int j,
                                      int k) {
  auto const stride_j = p.mesh[2] / 2 + 1;
  auto const stride_i = p.mesh[1] * stride_j;
  return static_cast<unsigned int>(stride_i * i + stride_j * j + k);
}
} // namespace
266
/** Apply the ik-differentiation operator in Fourier space.
 *  Writes the three components of the field mesh from the convolved
 *  charge mesh. Expected launch: gridDim = (mesh[0], mesh[1], 1),
 *  blockDim.x = mesh[2]/2 + 1.
 *  (The declaration of @c buf was lost in the extracted listing and is
 *  restored here; it is required by the assignments below.)
 */
__global__ void apply_diff_op(const P3MGpuData p) {
  auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
                                           static_cast<int>(blockIdx.y),
                                           static_cast<int>(threadIdx.x));

  auto const bidx = static_cast<int>(blockIdx.x);
  auto const bidy = static_cast<int>(blockIdx.y);
  // fold mesh indices to signed frequencies
  auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx;
  auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy;
  auto const nz = static_cast<int>(threadIdx.x);

  const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index];
  // multiply by 2*pi*i: (x + iy) -> 2*pi*(-y + ix)
  FFT_TYPE_COMPLEX buf;
  buf.x = REAL_TYPE(-2) * std::numbers::pi_v<REAL_TYPE> * meshw.y;
  buf.y = REAL_TYPE(+2) * std::numbers::pi_v<REAL_TYPE> * meshw.x;

  p.force_mesh_x[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.x / p.box[0];
  p.force_mesh_x[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.y / p.box[0];

  p.force_mesh_y[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.x / p.box[1];
  p.force_mesh_y[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.y / p.box[1];

  p.force_mesh_z[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.x / p.box[2];
  p.force_mesh_z[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.y / p.box[2];
}
298
/** Fold a mesh index into [0, mesh) assuming it is off by at most one
 *  period in either direction. */
__device__ inline int wrap_index(const int ind, const int mesh) {
  auto folded = ind;
  if (folded < 0)
    folded += mesh;
  else if (folded >= mesh)
    folded -= mesh;
  return folded;
}
306
/** Multiply the charge mesh by the influence function in k-space.
 *  Expected launch: gridDim = (mesh[0], mesh[1], 1),
 *  blockDim.x = mesh[2]/2 + 1.
 */
__global__ void apply_influence_function(const P3MGpuData p) {
  auto const idx = linear_index_k(p, static_cast<int>(blockIdx.x),
                                  static_cast<int>(blockIdx.y),
                                  static_cast<int>(threadIdx.x));
  auto const g = p.G_hat[idx];
  p.charge_mesh[idx].x *= g;
  p.charge_mesh[idx].y *= g;
}
315
/** Interpolate particle charges onto the real-space mesh with cao-order
 *  B-splines. Expected launch: blockDim = (parts_per_block * cao, cao, cao),
 *  i.e. each particle gets a cao^3 sub-block of threads, one per affected
 *  mesh point. When @p shared is true, interpolation weights are computed
 *  once per particle into dynamic shared memory
 *  (3 * parts_per_block * cao floats).
 */
template <int cao, bool shared>
__global__ void assign_charge_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     unsigned int const parts_per_block) {
  // decompose threadIdx.x into (particle slot, x-offset within stencil)
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= params.n_part)
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  // view the complex charge mesh as a padded real-valued mesh
  auto *charge_mesh = (REAL_TYPE *)params.charge_mesh;

  // position in mesh units, shifted per the assignment order
  m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;

  // round to the nearest mesh point
  nmp_x = static_cast<int>(floorf(m_pos[0] + 0.5f));
  nmp_y = static_cast<int>(floorf(m_pos[1] + 0.5f));
  nmp_z = static_cast<int>(floorf(m_pos[2] + 0.5f));

  // fractional distance to that mesh point (B-spline argument)
  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  // move to this thread's stencil point and fold back into the box
  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);

  extern __shared__ float weights[];

  if (shared) {
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    // one weight per (stencil point, direction); computed by the
    // threadIdx.z == 0 plane only, then shared with the whole block
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    auto const c = weights[3u * offset + 3u * cao_id_x] *
                   weights[3u * offset + 3u * threadIdx.y + 1u] *
                   weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id];
    // mesh points may be written by several particles concurrently
    atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));

  } else {
    auto const c =
        Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) * part_q[id] *
        Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
        Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]);
    atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));
  }
}
378
/** Interpolate particle charges onto the mesh.
 *  Dispatches on the charge assignment order; the shared-memory kernel
 *  variant is used for cao > 2.
 */
void assign_charges(P3MGpuData const &params,
                    float const *const __restrict__ part_pos,
                    float const *const __restrict__ part_q) {
  auto const cao = static_cast<unsigned int>(params.cao);
  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
  dim3 block(parts_per_block * cao, cao, cao);
  dim3 grid = p3m_make_grid(n_blocks);

  // Dynamic shared memory holds 3 interpolation weights per stencil point
  // per particle. The kernel declares the buffer as float, so size it in
  // float (was sizeof(REAL_TYPE), which over-allocates when REAL_TYPE is
  // double; this also matches assign_forces()).
  auto const data_length = std::size_t(3u) *
                           static_cast<std::size_t>(parts_per_block) *
                           static_cast<std::size_t>(cao) * sizeof(float);
  switch (params.cao) {
  case 1:
    (assign_charge_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 2:
    (assign_charge_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 3:
    (assign_charge_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 4:
    (assign_charge_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 5:
    (assign_charge_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 6:
    (assign_charge_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 7:
    (assign_charge_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_charge", __FILE__, __LINE__);
}
424
/** Interpolate mesh forces back onto the particles (mirror of
 *  assign_charge_kernel). Expected launch:
 *  blockDim = (parts_per_block * cao, cao, cao); when @p shared is true,
 *  dynamic shared memory of 3 * parts_per_block * cao floats is required.
 *  Results are accumulated into @p part_f with atomics.
 */
template <int cao, bool shared>
__global__ void assign_forces_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     float *const __restrict__ part_f,
                                     REAL_TYPE prefactor,
                                     unsigned int const parts_per_block) {
  // decompose threadIdx.x into (particle slot, x-offset within stencil)
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= static_cast<unsigned>(params.n_part))
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  // position in mesh units, shifted per the assignment order
  m_pos[0] = part_pos[3u * id + 0u] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3u * id + 1u] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3u * id + 2u] * params.hi[2] - params.pos_shift;

  // round to the nearest mesh point
  nmp_x = static_cast<int>(floorf(m_pos[0] + REAL_TYPE{0.5}));
  nmp_y = static_cast<int>(floorf(m_pos[1] + REAL_TYPE{0.5}));
  nmp_z = static_cast<int>(floorf(m_pos[2] + REAL_TYPE{0.5}));

  // fractional distance to that mesh point (B-spline argument)
  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  // move to this thread's stencil point and fold back into the box
  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);

  extern __shared__ float weights[];

  // charge times prefactor; negative sign folded in here
  REAL_TYPE c = -prefactor * part_q[id];
  if (shared) {
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    // one weight per (stencil point, direction); computed by the
    // threadIdx.z == 0 plane only, then shared with the whole block
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    c *= REAL_TYPE(weights[3u * offset + 3u * cao_id_x] *
                   weights[3u * offset + 3u * threadIdx.y + 1u] *
                   weights[3u * offset + 3u * threadIdx.z + 2u]);
  } else {
    c *=
        REAL_TYPE(Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) *
                  Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
                  Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]));
  }

  // view the complex force meshes as padded real-valued meshes
  const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x;
  const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y;
  const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z;

  // cao^3 threads accumulate into the same particle's force
  atomicAdd(&(part_f[3u * id + 0u]), float(c * force_mesh_x[index]));
  atomicAdd(&(part_f[3u * id + 1u]), float(c * force_mesh_y[index]));
  atomicAdd(&(part_f[3u * id + 2u]), float(c * force_mesh_z[index]));
}
493
/** Interpolate mesh forces back onto the particles.
 *  Dispatches on the charge assignment order; the shared-memory kernel
 *  variant is only faster for cao > 2.
 */
void assign_forces(P3MGpuData const &params,
                   float const *const __restrict__ part_pos,
                   float const *const __restrict__ part_q,
                   float *const __restrict__ part_f,
                   REAL_TYPE const prefactor) {
  auto const cao = static_cast<unsigned int>(params.cao);
  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
  dim3 const block(parts_per_block * cao, cao, cao);
  dim3 const grid = p3m_make_grid(n_blocks);

  // 3 interpolation weights per stencil point per particle (float,
  // matching the kernel's shared-memory declaration)
  auto const shared_bytes = std::size_t(3u) *
                            static_cast<std::size_t>(parts_per_block) *
                            static_cast<std::size_t>(cao) * sizeof(float);
  switch (params.cao) {
  case 1:
    (assign_forces_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 2:
    (assign_forces_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 3:
    (assign_forces_kernel<3, true>)<<<grid, block, shared_bytes, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 4:
    (assign_forces_kernel<4, true>)<<<grid, block, shared_bytes, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 5:
    (assign_forces_kernel<5, true>)<<<grid, block, shared_bytes, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 6:
    (assign_forces_kernel<6, true>)<<<grid, block, shared_bytes, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 7:
    (assign_forces_kernel<7, true>)<<<grid, block, shared_bytes, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
}
543
544/**
545 * @brief Initialize the internal data structure of the P3M GPU.
546 * Mainly allocation on the device and influence function calculation.
547 * Be advised: this needs `mesh^3*5*sizeof(REAL_TYPE)` of device memory.
548 * We use real to complex FFTs, so the size of the reciprocal mesh
549 * is (cuFFT convention) `Nx * Ny * ( Nz /2 + 1 )`.
550 */
void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
                  Utils::Vector3i const &mesh, double alpha,
                  Utils::Vector3d const &box_l, std::size_t n_part) {
  assert(mesh != Utils::Vector3i::broadcast(-1));

  // lazily create the shared state on first use
  if (not data) {
    data = std::make_shared<P3MGpuParams>();
  }

  auto &p3m_gpu_data = data->p3m_gpu_data;
  bool do_reinit = false, mesh_changed = false;
  assert(n_part <= std::numeric_limits<unsigned int>::max());
  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);

  // detect parameter changes: any of them invalidates the influence
  // function (do_reinit); a mesh change additionally invalidates the
  // device buffers and FFT plans (mesh_changed)
  if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
    p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
    do_reinit = true;
  }

  if (not data->is_initialized or p3m_gpu_data.cao != cao) {
    p3m_gpu_data.cao = cao;
    // NOLINTNEXTLINE(bugprone-integer-division)
    p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
    do_reinit = true;
  }

  if (not data->is_initialized or mesh != Utils::Vector3i(p3m_gpu_data.mesh)) {
    std::ranges::copy(mesh, p3m_gpu_data.mesh);
    mesh_changed = true;
    do_reinit = true;
  }

  // compare box lengths with a float-epsilon tolerance since they are
  // stored as REAL_TYPE on the device
  if (auto constexpr eps =
          static_cast<double>(std::numeric_limits<float>::epsilon());
      not data->is_initialized or
      (box_l - Utils::Vector3d(p3m_gpu_data.box)).norm() >= eps) {
    std::ranges::copy(box_l, p3m_gpu_data.box);
    do_reinit = true;
  }

  // cuFFT in-place R2C layout: the z-dimension is padded to 2*(Nz/2+1)
  p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
  p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;

  // inverse mesh spacing per direction
  for (auto i = 0u; i < 3u; ++i) {
    p3m_gpu_data.hi[i] =
        static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
  }

  // buffer sizes depend on the mesh: release and reallocate on change
  if (data->is_initialized and mesh_changed) {
    data->free_device_memory();
    data->is_initialized = false;
  }

  if (not data->is_initialized and p3m_gpu_data.mesh_size > 0) {
    /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */
    auto const cmesh_size =
        static_cast<std::size_t>(p3m_gpu_data.mesh[0]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[1]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[2] / 2 + 1);
    auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX);
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat),
                             cmesh_size * sizeof(REAL_TYPE)));

    {
#ifdef ESPRESSO_FPE
      // cuFFT builds device kernels using CUDA-JIT
      // (https://docs.nvidia.com/cuda/archive/13.1.1/cufft/#plan-initialization-time)
      // please note this operation is not guaranteed to succeed for all
      // mesh sizes, and in rare cases, it can send the SIGFPE signal
      auto const trap_pause = fe_trap::make_shared_pause_scoped();
#endif
      if (cufftPlan3d(&(data->p3m_fft.forw_plan), mesh[0], mesh[1], mesh[2],
                      FFT_PLAN_FORW_FLAG) != CUFFT_SUCCESS or
          cufftPlan3d(&(data->p3m_fft.back_plan), mesh[0], mesh[1], mesh[2],
                      FFT_PLAN_BACK_FLAG) != CUFFT_SUCCESS) {
        throw std::runtime_error("Unable to create fft plan");
      }
    }
  }

  // recompute the influence function whenever any parameter changed;
  // the launch configuration covers mesh[0] x mesh[1] x (mesh[2]/2 + 1)
  // k-space points (the kernel bounds-checks the overshoot)
  if ((do_reinit or not data->is_initialized) and p3m_gpu_data.mesh_size > 0) {
    dim3 grid(1, 1, 1);
    dim3 block(1, 1, 1);
    block.x = static_cast<unsigned>(512 / mesh[0] + 1);
    block.y = static_cast<unsigned>(mesh[1]);
    block.z = 1;
    grid.x = static_cast<unsigned>(mesh[0]) / block.x + 1;
    grid.z = static_cast<unsigned>(mesh[2]) / 2 + 1;

    // dispatch on the runtime charge assignment order
    switch (p3m_gpu_data.cao) {
    case 1:
      KERNELCALL(calculate_influence_function_device<1>, grid, block,
                 p3m_gpu_data);
      break;
    case 2:
      KERNELCALL(calculate_influence_function_device<2>, grid, block,
                 p3m_gpu_data);
      break;
    case 3:
      KERNELCALL(calculate_influence_function_device<3>, grid, block,
                 p3m_gpu_data);
      break;
    case 4:
      KERNELCALL(calculate_influence_function_device<4>, grid, block,
                 p3m_gpu_data);
      break;
    case 5:
      KERNELCALL(calculate_influence_function_device<5>, grid, block,
                 p3m_gpu_data);
      break;
    case 6:
      KERNELCALL(calculate_influence_function_device<6>, grid, block,
                 p3m_gpu_data);
      break;
    case 7:
      KERNELCALL(calculate_influence_function_device<7>, grid, block,
                 p3m_gpu_data);
      break;
    }
  }
  if (p3m_gpu_data.mesh_size > 0)
    data->is_initialized = true;
}
678
/**
 * \brief The long-range part of the P3M algorithm.
 * Assigns charges to the mesh, convolves with the influence function,
 * differentiates in k-space, transforms back and interpolates forces
 * onto the particles.
 * (Function signature and the forward-FFT call line restored from the
 * documented declaration — they were lost in the extracted listing.)
 */
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
                                double prefactor, std::size_t n_part) {
  auto &p3m_gpu_data = data.p3m_gpu_data;
  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);

  if (n_part == 0u)
    return;

  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const forces_device = gpu.get_particle_forces_device();

  // one thread per k-space mesh point for the convolution kernels
  dim3 gridConv(static_cast<unsigned>(p3m_gpu_data.mesh[0]),
                static_cast<unsigned>(p3m_gpu_data.mesh[1]), 1u);
  dim3 threadsConv(static_cast<unsigned>(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u);

  auto const volume =
      Utils::product(Utils::Vector3<REAL_TYPE>(p3m_gpu_data.box));
  auto const pref = static_cast<REAL_TYPE>(prefactor) / (volume * REAL_TYPE{2});

  // clear the (padded) real-space charge mesh before accumulation
  cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0,
                           static_cast<std::size_t>(p3m_gpu_data.mesh_size) *
                               sizeof(REAL_TYPE)));

  /* Interpolate the charges to the mesh */
  assign_charges(p3m_gpu_data, positions_device, charges_device);

  /* Do forward FFT of the charge mesh */
  if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                   (REAL_TYPE *)p3m_gpu_data.charge_mesh,
                   p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
    throw std::runtime_error("CUFFT error: Forward FFT failed");
  }

  /* Do convolution */
  KERNELCALL(apply_influence_function, gridConv, threadsConv, p3m_gpu_data);

  /* Take derivative */
  KERNELCALL(apply_diff_op, gridConv, threadsConv, p3m_gpu_data);

  /* Transform the components of the electric field back */
  // NOTE(review): unlike the forward transform, these return codes are
  // not checked — confirm whether that is intentional
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_x,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_x);
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_y,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_y);
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_z,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_z);

  /* Assign the forces from the mesh back to the particles */
  assign_forces(p3m_gpu_data, positions_device, charges_device, forces_device,
                pref);
}
734
735#endif // ESPRESSO_P3M
Particle data communication manager for the GPU.
float * get_particle_charges_device() const
float * get_particle_forces_device() const
float * get_particle_positions_device() const
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Definition Vector.hpp:131
static std::shared_ptr< scoped_pause > make_shared_pause_scoped()
Generate a shared handle to temporarily disable any currently active exception trap for the lifetime ...
Definition fe_trap.cpp:144
void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, const char *function, const char *file, unsigned int line)
In case of error during a CUDA operation, print the error message and exit.
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:175
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
Definition math.hpp:38
T product(Vector< T, N > const &v)
Definition Vector.hpp:372
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
Definition sqr.hpp:28
DEVICE_QUALIFIER constexpr T int_pow(T x)
Calculate integer powers.
Definition int_pow.hpp:34
__device__ auto linear_index_k(P3MGpuData const &p, int i, int j, int k)
__device__ auto linear_index_r(P3MGpuData const &p, int i, int j, int k)
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
Definition math.hpp:71
__device__ static void Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, int NZ, REAL_TYPE *Zaehler, REAL_TYPE *Nenner)
#define FFT_BACK_FFT
__global__ void assign_charge_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, unsigned int const parts_per_block)
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part)
The long-range part of the P3M algorithm.
__global__ void apply_influence_function(const P3MGpuData p)
void assign_charges(P3MGpuData const &params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q)
#define FFT_PLAN_FORW_FLAG
__global__ void apply_diff_op(const P3MGpuData p)
void p3m_gpu_init(std::shared_ptr< P3MGpuParams > &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part)
Initialize the internal data structure of the P3M GPU.
static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part)
#define REAL_TYPE
void assign_forces(P3MGpuData const &params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE const prefactor)
__global__ void calculate_influence_function_device(const P3MGpuData p)
#define FFT_TYPE_COMPLEX
__device__ int wrap_index(const int ind, const int mesh)
#define FFT_PLAN_BACK_FLAG
dim3 p3m_make_grid(unsigned int n_blocks)
__global__ void assign_forces_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE prefactor, unsigned int const parts_per_block)
#define FFT_FORW_FFT
int mesh[3]
Mesh dimensions.
REAL_TYPE pos_shift
Position shift.
REAL_TYPE * G_hat
Influence Function.
FFT_TYPE_COMPLEX * force_mesh_x
Force meshes.
int mesh_z_padded
Padded size.
int mesh_size
Total number of mesh points (including padding)
int cao
Charge assignment order.
unsigned int n_part
Number of particles.
FFT_TYPE_COMPLEX * force_mesh_z
REAL_TYPE hi[3]
Inverse mesh spacing.
FFT_TYPE_COMPLEX * charge_mesh
Charge mesh.
REAL_TYPE box[3]
Box size.
FFT_TYPE_COMPLEX * force_mesh_y
REAL_TYPE alpha
Ewald parameter.
cufftHandle forw_plan
Forward FFT plan.
cufftHandle back_plan
Backward FFT plan.
void free_device_memory()
P3MGpuData p3m_gpu_data
P3MGpuFftPlan p3m_fft
#define cuda_safe_mem(a)
Definition utils.cuh:73
#define KERNELCALL(_function, _grid, _block,...)
Definition utils.cuh:79