ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
p3m_gpu_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2022 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20/**
21 * @file
22 *
23 * P3M electrostatics on GPU.
24 *
25 * The corresponding header file is @ref p3m_gpu_cuda.cuh.
26 */
27
28#include "config/config.hpp"
29
30#ifdef ELECTROSTATICS
31
32#define P3M_GPU_FLOAT
33// #define P3M_GPU_REAL_DOUBLE
34
35#ifdef P3M_GPU_FLOAT
36#define REAL_TYPE float
37#define FFT_TYPE_COMPLEX cufftComplex
38#define FFT_FORW_FFT cufftExecR2C
39#define FFT_BACK_FFT cufftExecC2R
40#define FFT_PLAN_FORW_FLAG CUFFT_R2C
41#define FFT_PLAN_BACK_FLAG CUFFT_C2R
42#endif
43
44#ifdef P3M_GPU_REAL_DOUBLE
45#define REAL_TYPE double
46#define FFT_TYPE_COMPLEX cufftDoubleComplex
47#define FFT_FORW_FFT cufftExecD2Z
48#define FFT_BACK_FFT cufftExecZ2D
49#define FFT_PLAN_FORW_FLAG CUFFT_D2Z
50#define FFT_PLAN_BACK_FLAG CUFFT_Z2D
51#endif
52
54
55#include "cuda/utils.cuh"
56#include "p3m/math.hpp"
57#include "system/System.hpp"
58
61#include <utils/math/sqr.hpp>
62
63#include <cuda.h>
64#include <cufft.h>
65
66#include <algorithm>
67#include <cassert>
68#include <cstddef>
69#include <limits>
70#include <numbers>
71#include <stdexcept>
72
73#if defined(OMPI_MPI_H) || defined(_MPI_H)
74#error CU-file includes mpi.h! This should not happen!
75#endif
76
77using Utils::int_pow;
78using Utils::sqr;
79
80struct P3MGpuData {
81 /** Charge mesh */
83 /** Force meshes */
87 /** Influence Function */
89 /** Charge assignment order */
90 int cao;
91 /** Total number of mesh points (including padding) */
93 /** Ewald parameter */
95 /** Number of particles */
96 unsigned int n_part; // oddity: size_t causes UB with GCC 11.4 in Debug mode
97 /** Box size */
99 /** Mesh dimensions */
100 int mesh[3];
101 /** Padded size */
103 /** Inverse mesh spacing */
105 /** Position shift */
107};
108
110 /** Forward FFT plan */
111 cufftHandle forw_plan;
112 /** Backward FFT plan */
113 cufftHandle back_plan;
114};
115
120
122
124 auto const free_device_pointer = [](auto *&ptr) {
125 if (ptr != nullptr) {
126 cuda_safe_mem(cudaFree(reinterpret_cast<void *>(ptr)));
127 ptr = nullptr;
128 }
129 };
130 free_device_pointer(p3m_gpu_data.charge_mesh);
131 free_device_pointer(p3m_gpu_data.force_mesh_x);
132 free_device_pointer(p3m_gpu_data.force_mesh_y);
133 free_device_pointer(p3m_gpu_data.force_mesh_z);
134 free_device_pointer(p3m_gpu_data.G_hat);
135 cufftDestroy(p3m_fft.forw_plan);
136 cufftDestroy(p3m_fft.back_plan);
137 is_initialized = false;
138 }
139};
140
141static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
142 auto const cao3 = Utils::int_pow<3>(cao);
143 auto parts_per_block = 1u;
144 while ((parts_per_block + 1u) * cao3 <= 1024u) {
145 ++parts_per_block;
146 }
147 assert((n_part / parts_per_block) <= std::numeric_limits<unsigned>::max());
148 auto n = static_cast<unsigned int>(n_part / parts_per_block);
149 auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u;
150 assert(n_blocks <= std::numeric_limits<unsigned>::max());
151 return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
152}
153
154dim3 p3m_make_grid(unsigned int n_blocks) {
155 dim3 grid(n_blocks, 1u, 1u);
156 while (grid.x > 65536u) {
157 grid.y++;
158 if ((n_blocks % grid.y) == 0u)
159 grid.x = std::max(1u, n_blocks / grid.y);
160 else
161 grid.x = n_blocks / grid.y + 1u;
162 }
163 return grid;
164}
165
166template <int cao>
167__device__ void static Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
168 int NZ, REAL_TYPE *Zaehler,
169 REAL_TYPE *Nenner) {
170 REAL_TYPE S1, S2, S3;
171 REAL_TYPE zwi;
172 int MX, MY, MZ;
173 REAL_TYPE NMX, NMY, NMZ;
174 REAL_TYPE NM2;
175 REAL_TYPE TE;
176 REAL_TYPE Leni[3];
177 REAL_TYPE Meshi[3];
178 for (int i = 0; i < 3; ++i) {
179 Leni[i] = 1.0f / p.box[i];
180 Meshi[i] = 1.0f / static_cast<REAL_TYPE>(p.mesh[i]);
181 }
182
183 Zaehler[0] = Zaehler[1] = Zaehler[2] = *Nenner = 0.0;
184
185 for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
186 NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
187 p.mesh[0] * MX);
188 S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
189 for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
190 NMY = static_cast<REAL_TYPE>(
191 ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
192 S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
193 for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
194 NMZ = static_cast<REAL_TYPE>(
195 ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
196 S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));
197
198 NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
199 *Nenner += S3;
200
201 TE = exp(-sqr(std::numbers::pi_v<REAL_TYPE> / (p.alpha)) * NM2);
202 zwi = S3 * TE / NM2;
203 Zaehler[0] += NMX * zwi * Leni[0];
204 Zaehler[1] += NMY * zwi * Leni[1];
205 Zaehler[2] += NMZ * zwi * Leni[2];
206 }
207 }
208 }
209}
210
211/* Calculate influence function */
212template <int cao>
214
215 const auto NX = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
216 const auto NY = static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
217 const auto NZ = static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
218 REAL_TYPE Dnx, Dny, Dnz;
219 REAL_TYPE Zaehler[3] = {0.0, 0.0, 0.0}, Nenner = 0.0;
220 REAL_TYPE zwi;
221 auto index = 0;
222 REAL_TYPE Leni[3];
223 for (int i = 0; i < 3; ++i) {
224 Leni[i] = REAL_TYPE{1} / p.box[i];
225 }
226
227 if ((NX >= p.mesh[0]) || (NY >= p.mesh[1]) || (NZ >= (p.mesh[2] / 2 + 1)))
228 return;
229
230 index = NX * p.mesh[1] * (p.mesh[2] / 2 + 1) + NY * (p.mesh[2] / 2 + 1) + NZ;
231
232 if (((NX == 0) && (NY == 0) && (NZ == 0)) ||
233 ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) &&
234 (NZ % (p.mesh[2] / 2) == 0))) {
235 p.G_hat[index] = 0;
236 } else {
237 Aliasing_sums_ik<cao>(p, NX, NY, NZ, Zaehler, &Nenner);
238
239 Dnx = static_cast<REAL_TYPE>((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX);
240 Dny = static_cast<REAL_TYPE>((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY);
241 Dnz = static_cast<REAL_TYPE>((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ);
242
243 zwi = Dnx * Zaehler[0] * Leni[0] + Dny * Zaehler[1] * Leni[1] +
244 Dnz * Zaehler[2] * Leni[2];
245 zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) *
246 sqr(Nenner));
247 p.G_hat[index] = REAL_TYPE{2} * zwi / std::numbers::pi_v<REAL_TYPE>;
248 }
249}
250
251namespace {
252__device__ inline auto linear_index_r(P3MGpuData const &p, int i, int j,
253 int k) {
254 return static_cast<unsigned int>(p.mesh[1] * p.mesh_z_padded * i +
255 p.mesh_z_padded * j + k);
256}
257
258__device__ inline auto linear_index_k(P3MGpuData const &p, int i, int j,
259 int k) {
260 return static_cast<unsigned int>(p.mesh[1] * (p.mesh[2] / 2 + 1) * i +
261 (p.mesh[2] / 2 + 1) * j + k);
262}
263} // namespace
264
265__global__ void apply_diff_op(const P3MGpuData p) {
266 auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
267 static_cast<int>(blockIdx.y),
268 static_cast<int>(threadIdx.x));
269
270 auto const bidx = static_cast<int>(blockIdx.x);
271 auto const bidy = static_cast<int>(blockIdx.y);
272 auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx;
273 auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy;
274 auto const nz = static_cast<int>(threadIdx.x);
275
276 const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index];
278 buf.x = REAL_TYPE(-2) * std::numbers::pi_v<REAL_TYPE> * meshw.y;
279 buf.y = REAL_TYPE(+2) * std::numbers::pi_v<REAL_TYPE> * meshw.x;
280
281 p.force_mesh_x[linear_index].x =
282 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.x / p.box[0];
283 p.force_mesh_x[linear_index].y =
284 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.y / p.box[0];
285
286 p.force_mesh_y[linear_index].x =
287 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.x / p.box[1];
288 p.force_mesh_y[linear_index].y =
289 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.y / p.box[1];
290
291 p.force_mesh_z[linear_index].x =
292 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.x / p.box[2];
293 p.force_mesh_z[linear_index].y =
294 static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.y / p.box[2];
295}
296
297__device__ inline int wrap_index(const int ind, const int mesh) {
298 if (ind < 0)
299 return ind + mesh;
300 if (ind >= mesh)
301 return ind - mesh;
302 return ind;
303}
304
305__global__ void apply_influence_function(const P3MGpuData p) {
306 auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
307 static_cast<int>(blockIdx.y),
308 static_cast<int>(threadIdx.x));
309
310 p.charge_mesh[linear_index].x *= p.G_hat[linear_index];
311 p.charge_mesh[linear_index].y *= p.G_hat[linear_index];
312}
313
314template <int cao, bool shared>
316 float const *const __restrict__ part_pos,
317 float const *const __restrict__ part_q,
318 unsigned int const parts_per_block) {
319 auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
320 auto const cao_id_x =
321 threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
322 /* id of the particle */
323 auto const id =
324 parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
325 if (id >= params.n_part)
326 return;
327 /* position relative to the closest gird point */
328 REAL_TYPE m_pos[3];
329 /* index of the nearest mesh point */
330 int nmp_x, nmp_y, nmp_z;
331
332 auto *charge_mesh = (REAL_TYPE *)params.charge_mesh;
333
334 m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
335 m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
336 m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;
337
338 nmp_x = static_cast<int>(floorf(m_pos[0] + 0.5f));
339 nmp_y = static_cast<int>(floorf(m_pos[1] + 0.5f));
340 nmp_z = static_cast<int>(floorf(m_pos[2] + 0.5f));
341
342 m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
343 m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
344 m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);
345
346 nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
347 nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
348 nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);
349
350 auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);
351
352 extern __shared__ float weights[];
353
354 if (shared) {
355 auto const offset = static_cast<unsigned int>(cao) * part_in_block;
356 if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
357 weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
358 Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
359 }
360
361 __syncthreads();
362
363 auto const c = weights[3u * offset + 3u * cao_id_x] *
364 weights[3u * offset + 3u * threadIdx.y + 1u] *
365 weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id];
366 atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));
367
368 } else {
369 auto const c =
370 Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) * part_q[id] *
371 Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
372 Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]);
373 atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));
374 }
375}
376
378 float const *const __restrict__ part_pos,
379 float const *const __restrict__ part_q) {
380 auto const cao = static_cast<unsigned int>(params.cao);
381 auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
382 dim3 block(parts_per_block * cao, cao, cao);
383 dim3 grid = p3m_make_grid(n_blocks);
384
385 auto const data_length = std::size_t(3u) *
386 static_cast<std::size_t>(parts_per_block) *
387 static_cast<std::size_t>(cao) * sizeof(REAL_TYPE);
388 switch (params.cao) {
389 case 1:
390 (assign_charge_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
391 params, part_pos, part_q, parts_per_block);
392 break;
393 case 2:
394 (assign_charge_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
395 params, part_pos, part_q, parts_per_block);
396 break;
397 case 3:
398 (assign_charge_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
399 params, part_pos, part_q, parts_per_block);
400 break;
401 case 4:
402 (assign_charge_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
403 params, part_pos, part_q, parts_per_block);
404 break;
405 case 5:
406 (assign_charge_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
407 params, part_pos, part_q, parts_per_block);
408 break;
409 case 6:
410 (assign_charge_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
411 params, part_pos, part_q, parts_per_block);
412 break;
413 case 7:
414 (assign_charge_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
415 params, part_pos, part_q, parts_per_block);
416 break;
417 default:
418 break;
419 }
420 cuda_check_errors_exit(block, grid, "assign_charge", __FILE__, __LINE__);
421}
422
423template <int cao, bool shared>
425 float const *const __restrict__ part_pos,
426 float const *const __restrict__ part_q,
427 float *const __restrict__ part_f,
428 REAL_TYPE prefactor,
429 unsigned int const parts_per_block) {
430 auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
431 auto const cao_id_x =
432 threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
433 /* id of the particle */
434 auto const id =
435 parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
436 if (id >= static_cast<unsigned>(params.n_part))
437 return;
438 /* position relative to the closest grid point */
439 REAL_TYPE m_pos[3];
440 /* index of the nearest mesh point */
441 int nmp_x, nmp_y, nmp_z;
442
443 m_pos[0] = part_pos[3u * id + 0u] * params.hi[0] - params.pos_shift;
444 m_pos[1] = part_pos[3u * id + 1u] * params.hi[1] - params.pos_shift;
445 m_pos[2] = part_pos[3u * id + 2u] * params.hi[2] - params.pos_shift;
446
447 nmp_x = static_cast<int>(floorf(m_pos[0] + REAL_TYPE{0.5}));
448 nmp_y = static_cast<int>(floorf(m_pos[1] + REAL_TYPE{0.5}));
449 nmp_z = static_cast<int>(floorf(m_pos[2] + REAL_TYPE{0.5}));
450
451 m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
452 m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
453 m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);
454
455 nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
456 nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
457 nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);
458
459 auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);
460
461 extern __shared__ float weights[];
462
463 REAL_TYPE c = -prefactor * part_q[id];
464 if (shared) {
465 auto const offset = static_cast<unsigned int>(cao) * part_in_block;
466 if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
467 weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
468 Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
469 }
470
471 __syncthreads();
472
473 c *= REAL_TYPE(weights[3u * offset + 3u * cao_id_x] *
474 weights[3u * offset + 3u * threadIdx.y + 1u] *
475 weights[3u * offset + 3u * threadIdx.z + 2u]);
476 } else {
477 c *=
478 REAL_TYPE(Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) *
479 Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
480 Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]));
481 }
482
483 const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x;
484 const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y;
485 const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z;
486
487 atomicAdd(&(part_f[3u * id + 0u]), float(c * force_mesh_x[index]));
488 atomicAdd(&(part_f[3u * id + 1u]), float(c * force_mesh_y[index]));
489 atomicAdd(&(part_f[3u * id + 2u]), float(c * force_mesh_z[index]));
490}
491
493 float const *const __restrict__ part_pos,
494 float const *const __restrict__ part_q,
495 float *const __restrict__ part_f,
496 REAL_TYPE const prefactor) {
497 auto const cao = static_cast<unsigned int>(params.cao);
498 auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
499 dim3 block(parts_per_block * cao, cao, cao);
500 dim3 grid = p3m_make_grid(n_blocks);
501
502 /* Switch for assignment templates, the shared version only is faster for cao
503 * > 2 */
504 auto const data_length = std::size_t(3u) *
505 static_cast<std::size_t>(parts_per_block) *
506 static_cast<std::size_t>(cao) * sizeof(float);
507 switch (params.cao) {
508 case 1:
509 (assign_forces_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
510 params, part_pos, part_q, part_f, prefactor, parts_per_block);
511 break;
512 case 2:
513 (assign_forces_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
514 params, part_pos, part_q, part_f, prefactor, parts_per_block);
515 break;
516 case 3:
517 (assign_forces_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
518 params, part_pos, part_q, part_f, prefactor, parts_per_block);
519 break;
520 case 4:
521 (assign_forces_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
522 params, part_pos, part_q, part_f, prefactor, parts_per_block);
523 break;
524 case 5:
525 (assign_forces_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
526 params, part_pos, part_q, part_f, prefactor, parts_per_block);
527 break;
528 case 6:
529 (assign_forces_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
530 params, part_pos, part_q, part_f, prefactor, parts_per_block);
531 break;
532 case 7:
533 (assign_forces_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
534 params, part_pos, part_q, part_f, prefactor, parts_per_block);
535 break;
536 default:
537 break;
538 }
539 cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
540}
541
542/**
543 * @brief Initialize the internal data structure of the P3M GPU.
544 * Mainly allocation on the device and influence function calculation.
545 * Be advised: this needs `mesh^3*5*sizeof(REAL_TYPE)` of device memory.
546 * We use real to complex FFTs, so the size of the reciprocal mesh
547 * is (cuFFT convention) `Nx * Ny * ( Nz /2 + 1 )`.
548 */
549void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
550 Utils::Vector3i const &mesh, double alpha,
551 Utils::Vector3d const &box_l, std::size_t n_part) {
552 if (mesh == Utils::Vector3i::broadcast(-1))
553 throw std::runtime_error("P3M: invalid mesh size");
554
555 if (not data) {
556 data = std::make_shared<P3MGpuParams>();
557 }
558
559 auto &p3m_gpu_data = data->p3m_gpu_data;
560 bool do_reinit = false, mesh_changed = false;
561 assert(n_part <= std::numeric_limits<unsigned int>::max());
562 p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
563
564 if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
565 p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
566 do_reinit = true;
567 }
568
569 if (not data->is_initialized or p3m_gpu_data.cao != cao) {
570 p3m_gpu_data.cao = cao;
571 // NOLINTNEXTLINE(bugprone-integer-division)
572 p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
573 do_reinit = true;
574 }
575
576 if (not data->is_initialized or mesh != Utils::Vector3i(p3m_gpu_data.mesh)) {
577 std::copy(mesh.begin(), mesh.end(), p3m_gpu_data.mesh);
578 mesh_changed = true;
579 do_reinit = true;
580 }
581
582 if (auto constexpr eps =
583 static_cast<double>(std::numeric_limits<float>::epsilon());
584 not data->is_initialized or
585 (box_l - Utils::Vector3d(p3m_gpu_data.box)).norm() >= eps) {
586 std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
587 do_reinit = true;
588 }
589
590 p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
591 p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;
592
593 for (auto i = 0u; i < 3u; ++i) {
594 p3m_gpu_data.hi[i] =
595 static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
596 }
597
598 if (data->is_initialized and mesh_changed) {
599 data->free_device_memory();
600 data->is_initialized = false;
601 }
602
603 if (not data->is_initialized and p3m_gpu_data.mesh_size > 0) {
604 /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */
605 auto const cmesh_size =
606 static_cast<std::size_t>(p3m_gpu_data.mesh[0]) *
607 static_cast<std::size_t>(p3m_gpu_data.mesh[1]) *
608 static_cast<std::size_t>(p3m_gpu_data.mesh[2] / 2 + 1);
609 auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX);
610 cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len));
611 cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len));
612 cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len));
613 cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len));
614 cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat),
615 cmesh_size * sizeof(REAL_TYPE)));
616
617 if (cufftPlan3d(&(data->p3m_fft.forw_plan), mesh[0], mesh[1], mesh[2],
618 FFT_PLAN_FORW_FLAG) != CUFFT_SUCCESS or
619 cufftPlan3d(&(data->p3m_fft.back_plan), mesh[0], mesh[1], mesh[2],
620 FFT_PLAN_BACK_FLAG) != CUFFT_SUCCESS) {
621 throw std::runtime_error("Unable to create fft plan");
622 }
623 }
624
625 if ((do_reinit or not data->is_initialized) and p3m_gpu_data.mesh_size > 0) {
626 dim3 grid(1, 1, 1);
627 dim3 block(1, 1, 1);
628 block.x = static_cast<unsigned>(512 / mesh[0] + 1);
629 block.y = static_cast<unsigned>(mesh[1]);
630 block.z = 1;
631 grid.x = static_cast<unsigned>(mesh[0]) / block.x + 1;
632 grid.z = static_cast<unsigned>(mesh[2]) / 2 + 1;
633
634 switch (p3m_gpu_data.cao) {
635 case 1:
636 KERNELCALL(calculate_influence_function_device<1>, grid, block,
637 p3m_gpu_data);
638 break;
639 case 2:
640 KERNELCALL(calculate_influence_function_device<2>, grid, block,
641 p3m_gpu_data);
642 break;
643 case 3:
644 KERNELCALL(calculate_influence_function_device<3>, grid, block,
645 p3m_gpu_data);
646 break;
647 case 4:
648 KERNELCALL(calculate_influence_function_device<4>, grid, block,
649 p3m_gpu_data);
650 break;
651 case 5:
652 KERNELCALL(calculate_influence_function_device<5>, grid, block,
653 p3m_gpu_data);
654 break;
655 case 6:
656 KERNELCALL(calculate_influence_function_device<6>, grid, block,
657 p3m_gpu_data);
658 break;
659 case 7:
660 KERNELCALL(calculate_influence_function_device<7>, grid, block,
661 p3m_gpu_data);
662 break;
663 }
664 }
665 if (p3m_gpu_data.mesh_size > 0)
666 data->is_initialized = true;
667}
668
669/**
670 * \brief The long-range part of the P3M algorithm.
671 */
673 double prefactor, std::size_t n_part) {
674 auto &p3m_gpu_data = data.p3m_gpu_data;
675 p3m_gpu_data.n_part = static_cast<unsigned>(n_part);
676
677 if (n_part == 0u)
678 return;
679
680 auto const positions_device = gpu.get_particle_positions_device();
681 auto const charges_device = gpu.get_particle_charges_device();
682 auto const forces_device = gpu.get_particle_forces_device();
683
684 dim3 gridConv(static_cast<unsigned>(p3m_gpu_data.mesh[0]),
685 static_cast<unsigned>(p3m_gpu_data.mesh[1]), 1u);
686 dim3 threadsConv(static_cast<unsigned>(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u);
687
688 auto const volume =
689 Utils::product(Utils::Vector3<REAL_TYPE>(p3m_gpu_data.box));
690 auto const pref = static_cast<REAL_TYPE>(prefactor) / (volume * REAL_TYPE{2});
691
692 cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0,
693 static_cast<std::size_t>(p3m_gpu_data.mesh_size) *
694 sizeof(REAL_TYPE)));
695
696 /* Interpolate the charges to the mesh */
697 assign_charges(p3m_gpu_data, positions_device, charges_device);
698
699 /* Do forward FFT of the charge mesh */
701 (REAL_TYPE *)p3m_gpu_data.charge_mesh,
702 p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
703 throw std::runtime_error("CUFFT error: Forward FFT failed");
704 }
705
706 /* Do convolution */
707 KERNELCALL(apply_influence_function, gridConv, threadsConv, p3m_gpu_data);
708
709 /* Take derivative */
710 KERNELCALL(apply_diff_op, gridConv, threadsConv, p3m_gpu_data);
711
712 /* Transform the components of the electric field back */
713 FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_x,
714 (REAL_TYPE *)p3m_gpu_data.force_mesh_x);
715 FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_y,
716 (REAL_TYPE *)p3m_gpu_data.force_mesh_y);
717 FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_z,
718 (REAL_TYPE *)p3m_gpu_data.force_mesh_z);
719
720 /* Assign the forces from the mesh back to the particles */
721 assign_forces(p3m_gpu_data, positions_device, charges_device, forces_device,
722 pref);
723}
724
725#endif // ELECTROSTATICS
Particle data communication manager for the GPU.
float * get_particle_charges_device() const
float * get_particle_forces_device() const
float * get_particle_positions_device() const
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value)
Create a vector that has all entries set to the same value.
Definition Vector.hpp:110
void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, const char *function, const char *file, unsigned int line)
In case of error during a CUDA operation, print the error message and exit.
This file contains the defaults for ESPResSo.
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
Definition config.hpp:53
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:172
T product(Vector< T, N > const &v)
Definition Vector.hpp:374
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
Definition sqr.hpp:28
DEVICE_QUALIFIER constexpr T int_pow(T x)
Calculate integer powers.
Definition int_pow.hpp:61
__device__ auto linear_index_k(P3MGpuData const &p, int i, int j, int k)
__device__ auto linear_index_r(P3MGpuData const &p, int i, int j, int k)
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
Definition math.hpp:53
__device__ static void Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, int NZ, REAL_TYPE *Zaehler, REAL_TYPE *Nenner)
#define FFT_BACK_FFT
__global__ void assign_charge_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, unsigned int const parts_per_block)
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part)
The long-range part of the P3M algorithm.
__global__ void apply_influence_function(const P3MGpuData p)
void assign_charges(P3MGpuData const &params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q)
#define FFT_PLAN_FORW_FLAG
__global__ void apply_diff_op(const P3MGpuData p)
void p3m_gpu_init(std::shared_ptr< P3MGpuParams > &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part)
Initialize the internal data structure of the P3M GPU.
static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part)
#define REAL_TYPE
void assign_forces(P3MGpuData const &params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE const prefactor)
__global__ void calculate_influence_function_device(const P3MGpuData p)
#define FFT_TYPE_COMPLEX
__device__ int wrap_index(const int ind, const int mesh)
#define FFT_PLAN_BACK_FLAG
dim3 p3m_make_grid(unsigned int n_blocks)
__global__ void assign_forces_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE prefactor, unsigned int const parts_per_block)
#define FFT_FORW_FFT
static SteepestDescentParameters params
Currently active steepest descent instance.
int mesh[3]
Mesh dimensions.
REAL_TYPE pos_shift
Position shift.
REAL_TYPE * G_hat
Influence Function.
FFT_TYPE_COMPLEX * force_mesh_x
Force meshes.
int mesh_z_padded
Padded size.
int mesh_size
Total number of mesh points (including padding)
int cao
Charge assignment order.
unsigned int n_part
Number of particles.
FFT_TYPE_COMPLEX * force_mesh_z
REAL_TYPE hi[3]
Inverse mesh spacing.
FFT_TYPE_COMPLEX * charge_mesh
Charge mesh.
REAL_TYPE box[3]
Box size.
FFT_TYPE_COMPLEX * force_mesh_y
REAL_TYPE alpha
Ewald parameter.
cufftHandle forw_plan
Forward FFT plan.
cufftHandle back_plan
Backward FFT plan.
void free_device_memory()
P3MGpuData p3m_gpu_data
P3MGpuFftPlan p3m_fft
DEVICE_QUALIFIER constexpr iterator begin() noexcept
Definition Array.hpp:128
DEVICE_QUALIFIER constexpr iterator end() noexcept
Definition Array.hpp:140
#define cuda_safe_mem(a)
Definition utils.cuh:73
#define KERNELCALL(_function, _grid, _block,...)
Definition utils.cuh:79