#ifndef P3M_GPU_REAL_DOUBLE
#define REAL_TYPE float
#define FFT_TYPE_COMPLEX cufftComplex
#define FFT_FORW_FFT cufftExecR2C
#define FFT_BACK_FFT cufftExecC2R
#define FFT_PLAN_FORW_FLAG CUFFT_R2C
#define FFT_PLAN_BACK_FLAG CUFFT_C2R
#endif

#ifdef P3M_GPU_REAL_DOUBLE
#define REAL_TYPE double
#define FFT_TYPE_COMPLEX cufftDoubleComplex
#define FFT_FORW_FFT cufftExecD2Z
#define FFT_BACK_FFT cufftExecZ2D
#define FFT_PLAN_FORW_FLAG CUFFT_D2Z
#define FFT_PLAN_BACK_FLAG CUFFT_Z2D
#endif
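// Precision is selected at compile time: building with P3M_GPU_REAL_DOUBLE
// defined (how the flag is plumbed through is a build-system detail) switches
// REAL_TYPE and the cuFFT entry points from the single-precision R2C/C2R pair
// to the double-precision D2Z/Z2D pair, so every kernel below is written
// against REAL_TYPE / FFT_TYPE_COMPLEX rather than a concrete type.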
#include "system/System.hpp"
#if defined(OMPI_MPI_H) || defined(_MPI_H)
#error CU-file includes mpi.h! This should not happen!
#endif
  auto const free_device_pointer = [](auto *&ptr) {
    if (ptr != nullptr) {
      cuda_safe_mem(cudaFree(reinterpret_cast<void *>(ptr)));
      ptr = nullptr;
    }
  };
static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part) {
  auto const cao3 = Utils::int_pow<3>(cao);
  auto parts_per_block = 1u;
  // pack as many particles into one block as the 1024-thread limit allows
  while ((parts_per_block + 1u) * cao3 <= 1024u) {
    ++parts_per_block;
  }
  assert((n_part / parts_per_block) <= std::numeric_limits<unsigned>::max());
  auto n = static_cast<unsigned int>(n_part / parts_per_block);
  auto n_blocks = ((n_part % parts_per_block) == 0u) ? std::max(1u, n) : n + 1u;
  assert(n_blocks <= std::numeric_limits<unsigned>::max());
  return std::make_pair(parts_per_block, static_cast<unsigned>(n_blocks));
}
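// Worked example (hypothetical numbers): for cao = 3, cao3 = 27, so
// parts_per_block grows to 37 (37 * 27 = 999 <= 1024, while 38 * 27 = 1026
// exceeds the limit). With n_part = 1000, n = 1000 / 37 = 27 with remainder 1,
// hence n_blocks = 28 and the last block is only partially filled.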
dim3 p3m_make_grid(unsigned int n_blocks) {
  dim3 grid(n_blocks, 1u, 1u);
  // CUDA caps gridDim.x; fold excess blocks into the y-dimension
  while (grid.x > 65536u) {
    ++grid.y;
    if ((n_blocks % grid.y) == 0u)
      grid.x = std::max(1u, n_blocks / grid.y);
    else
      grid.x = n_blocks / grid.y + 1u;
  }
  return grid;
}
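// Example: n_blocks = 100000 exceeds the 65536 limit, so grid.y becomes 2;
// since 100000 % 2 == 0 this yields grid = (50000, 2, 1), and 50000 <= 65536
// terminates the loop. Kernels must therefore recover the flat block id as
// blockIdx.x * gridDim.y + blockIdx.y, as the assignment kernels below do.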
template <int cao>
__device__ static void Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                        int NZ, REAL_TYPE *Zaehler,
                                        REAL_TYPE *Nenner) {
  REAL_TYPE S1, S2, S3;
  REAL_TYPE NMX, NMY, NMZ, NM2, TE, zwi;
  REAL_TYPE Meshi[3], Leni[3];

  for (int i = 0; i < 3; ++i) {
    Meshi[i] = REAL_TYPE{1} / static_cast<REAL_TYPE>(p.mesh[i]);
    Leni[i] = 1.0f / p.box[i];
  }

  // "Zaehler"/"Nenner" (German): numerator vector and denominator of G_hat
  Zaehler[0] = Zaehler[1] = Zaehler[2] = *Nenner = 0.0;

  // sum over the aliases of the mesh wave vector (NX, NY, NZ)
  for (int MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
    NMX = static_cast<REAL_TYPE>(
        ((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) + p.mesh[0] * MX);
    S1 = int_pow<2 * cao>(math::sinc(Meshi[0] * NMX));
    for (int MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
      NMY = static_cast<REAL_TYPE>(
          ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
      S2 = S1 * int_pow<2 * cao>(math::sinc(Meshi[1] * NMY));
      for (int MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
        NMZ = static_cast<REAL_TYPE>(
            ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
        S3 = S2 * int_pow<2 * cao>(math::sinc(Meshi[2] * NMZ));

        NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
        *Nenner += S3;

        TE = exp(-sqr(std::numbers::pi_v<REAL_TYPE> / (p.alpha)) * NM2);
        zwi = S3 * TE / NM2;
        Zaehler[0] += NMX * zwi * Leni[0];
        Zaehler[1] += NMY * zwi * Leni[1];
        Zaehler[2] += NMZ * zwi * Leni[2];
      }
    }
  }
}
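// The loops above accumulate, for mesh vector n = (NX, NY, NZ) and its
// aliases n_m = n + m * mesh with |m_i| <= P3M_BRILLOUIN:
//   Nenner    = sum_m U^2(n_m),   U^2 = prod_i sinc^(2*cao)(n_{m,i} / mesh_i)
//   Zaehler_i = sum_m (n_{m,i} / L_i) * U^2(n_m)
//               * exp(-(pi/alpha)^2 |n_m / L|^2) / |n_m / L|^2
// i.e. the denominator and numerator of the ik-differentiated optimal
// influence function in the Hockney/Eastwood sense.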
template <int cao>
__global__ void calculate_influence_function_device(const P3MGpuData p) {
  const auto NX = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
  const auto NY = static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
  const auto NZ = static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
  REAL_TYPE Dnx, Dny, Dnz;
  REAL_TYPE Zaehler[3] = {0.0, 0.0, 0.0}, Nenner = 0.0;
  REAL_TYPE zwi;
  REAL_TYPE Leni[3];

  for (int i = 0; i < 3; ++i) {
    Leni[i] = 1.0f / p.box[i];
  }

  if ((NX >= p.mesh[0]) || (NY >= p.mesh[1]) || (NZ >= (p.mesh[2] / 2 + 1)))
    return;

  auto const index =
      NX * p.mesh[1] * (p.mesh[2] / 2 + 1) + NY * (p.mesh[2] / 2 + 1) + NZ;

  if (((NX == 0) && (NY == 0) && (NZ == 0)) ||
      ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) &&
       (NZ % (p.mesh[2] / 2) == 0))) {
    // the zero mode and the Nyquist modes of the influence function vanish
    p.G_hat[index] = 0;
  } else {
    Aliasing_sums_ik<cao>(p, NX, NY, NZ, Zaehler, &Nenner);

    Dnx = static_cast<REAL_TYPE>((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX);
    Dny = static_cast<REAL_TYPE>((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY);
    Dnz = static_cast<REAL_TYPE>((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ);

    zwi = Dnx * Zaehler[0] * Leni[0] + Dny * Zaehler[1] * Leni[1] +
          Dnz * Zaehler[2] * Leni[2];
    zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) *
            sqr(Nenner));
    p.G_hat[index] = REAL_TYPE{2} * zwi / std::numbers::pi_v<REAL_TYPE>;
  }
}
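// Putting it together, each Fourier mode receives
//   G_hat(n) = (2 / pi) * [ D(n) . Zaehler ] / [ |D(n)|^2 * Nenner^2 ]
// where D(n) = (Dnx, Dny, Dnz) / L is the folded mesh wave vector used by
// the ik-differentiation, and the zero/Nyquist modes are explicitly zeroed.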
__device__ auto linear_index_k(P3MGpuData const &p, int i, int j, int k) {
  return static_cast<unsigned int>(p.mesh[1] * (p.mesh[2] / 2 + 1) * i +
                                   (p.mesh[2] / 2 + 1) * j + k);
}
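// Example: for mesh = (8, 8, 8) the complex (half-spectrum) z-extent is
// 8 / 2 + 1 = 5, so linear_index_k(p, 1, 2, 3) = 8 * 5 * 1 + 5 * 2 + 3 = 53.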
__global__ void apply_diff_op(const P3MGpuData p) {
  auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
                                           static_cast<int>(blockIdx.y),
                                           static_cast<int>(threadIdx.x));

  auto const bidx = static_cast<int>(blockIdx.x);
  auto const bidy = static_cast<int>(blockIdx.y);
  auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx;
  auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy;
  auto const nz = static_cast<int>(threadIdx.x);

  auto const meshw = p.charge_mesh[linear_index];
  FFT_TYPE_COMPLEX buf;
  buf.x = REAL_TYPE(-2) * std::numbers::pi_v<REAL_TYPE> * meshw.y;
  buf.y = REAL_TYPE(+2) * std::numbers::pi_v<REAL_TYPE> * meshw.x;

  p.force_mesh_x[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.x / p.box[0];
  p.force_mesh_x[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.y / p.box[0];

  p.force_mesh_y[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.x / p.box[1];
  p.force_mesh_y[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.y / p.box[1];

  p.force_mesh_z[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.x / p.box[2];
  p.force_mesh_z[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.y / p.box[2];
}
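// The buf swap implements multiplication by 2*pi*i: for a Fourier coefficient
// a + i*b, i * (a + i*b) = -b + i*a, which is exactly
// (buf.x, buf.y) = 2*pi*(-meshw.y, +meshw.x). Scaling by n_d / L_d then turns
// the convolved potential mesh into the three gradient (force) meshes.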
__device__ inline int wrap_index(const int ind, const int mesh) {
  if (ind < 0)
    return ind + mesh;
  if (ind >= mesh)
    return ind - mesh;
  return ind;
}
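// E.g. wrap_index(-1, 8) == 7 and wrap_index(8, 8) == 0; the callers only
// ever shift an in-range nearest-mesh-point index by at most cao points, so
// a single fold suffices (assuming cao < mesh).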
__global__ void apply_influence_function(const P3MGpuData p) {
  auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
                                           static_cast<int>(blockIdx.y),
                                           static_cast<int>(threadIdx.x));
  // multiply the Fourier-space charge mesh by the optimal influence function
  p.charge_mesh[linear_index].x *= p.G_hat[linear_index];
  p.charge_mesh[linear_index].y *= p.G_hat[linear_index];
}
template <int cao, bool shared>
__global__ void assign_charge_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     unsigned int const parts_per_block) {
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= static_cast<unsigned>(params.n_part))
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  auto *charge_mesh = reinterpret_cast<REAL_TYPE *>(params.charge_mesh);

  m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;

  nmp_x = static_cast<int>(floorf(m_pos[0] + 0.5f));
  nmp_y = static_cast<int>(floorf(m_pos[1] + 0.5f));
  nmp_z = static_cast<int>(floorf(m_pos[2] + 0.5f));

  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = static_cast<unsigned int>(params.mesh_z_padded) *
                         (static_cast<unsigned int>(params.mesh[1]) *
                              static_cast<unsigned int>(nmp_x) +
                          static_cast<unsigned int>(nmp_y)) +
                     static_cast<unsigned int>(nmp_z);

  extern __shared__ float weights[];

  if (shared) {
    // cache the 3 * cao one-dimensional B-spline weights per particle
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    auto const c = weights[3u * offset + 3u * cao_id_x] *
                   weights[3u * offset + 3u * threadIdx.y + 1u] *
                   weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id];
    atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));
  } else {
    auto const c =
        Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) * part_q[id] *
        Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
        Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]);
    atomicAdd(&(charge_mesh[index]), REAL_TYPE(c));
  }
}
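// Design note: one block handles parts_per_block particles with a
// cao x cao x cao thread brick per particle, so each thread adds exactly one
// weighted charge fraction to the mesh. The shared-memory variant caches the
// 3 * cao one-dimensional B-spline weights per particle, and every thread
// then multiplies three cached values instead of evaluating Utils::bspline
// up to three times per thread.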
void assign_charges(P3MGpuData const &params,
                    float const *const __restrict__ part_pos,
                    float const *const __restrict__ part_q) {
  auto const cao = static_cast<unsigned int>(params.cao);
  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
  dim3 block(parts_per_block * cao, cao, cao);
  dim3 grid = p3m_make_grid(n_blocks);

  /* shared memory for the cached B-spline weights (cao >= 3 only) */
  auto const data_length = std::size_t(3u) *
                           static_cast<std::size_t>(parts_per_block) *
                           static_cast<std::size_t>(cao) * sizeof(REAL_TYPE);
  switch (cao) {
  case 1:
    (assign_charge_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 2:
    (assign_charge_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 3:
    (assign_charge_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 4:
    (assign_charge_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 5:
    (assign_charge_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 6:
    (assign_charge_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 7:
    (assign_charge_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_charges", __FILE__, __LINE__);
}
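// For cao = 1 and 2 the kernels are launched without shared memory
// (template argument shared = false): with at most 8 threads per particle,
// recomputing the B-splines is presumably cheaper than paying for the extra
// __syncthreads() the cached variant requires.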
template <int cao, bool shared>
__global__ void assign_forces_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     float *const __restrict__ part_f,
                                     REAL_TYPE prefactor,
                                     unsigned int const parts_per_block) {
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= static_cast<unsigned>(params.n_part))
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  m_pos[0] = part_pos[3u * id + 0u] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3u * id + 1u] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3u * id + 2u] * params.hi[2] - params.pos_shift;

  nmp_x = static_cast<int>(floorf(m_pos[0] + REAL_TYPE{0.5}));
  nmp_y = static_cast<int>(floorf(m_pos[1] + REAL_TYPE{0.5}));
  nmp_z = static_cast<int>(floorf(m_pos[2] + REAL_TYPE{0.5}));

  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = static_cast<unsigned int>(params.mesh_z_padded) *
                         (static_cast<unsigned int>(params.mesh[1]) *
                              static_cast<unsigned int>(nmp_x) +
                          static_cast<unsigned int>(nmp_y)) +
                     static_cast<unsigned int>(nmp_z);

  extern __shared__ float weights[];

  REAL_TYPE c = -prefactor * part_q[id];
  if (shared) {
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    c *= REAL_TYPE(weights[3u * offset + 3u * cao_id_x] *
                   weights[3u * offset + 3u * threadIdx.y + 1u] *
                   weights[3u * offset + 3u * threadIdx.z + 2u]);
  } else {
    c *= REAL_TYPE(Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) *
                   Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
                   Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]));
  }

  auto const *force_mesh_x =
      reinterpret_cast<REAL_TYPE const *>(params.force_mesh_x);
  auto const *force_mesh_y =
      reinterpret_cast<REAL_TYPE const *>(params.force_mesh_y);
  auto const *force_mesh_z =
      reinterpret_cast<REAL_TYPE const *>(params.force_mesh_z);

  atomicAdd(&(part_f[3u * id + 0u]), float(c * force_mesh_x[index]));
  atomicAdd(&(part_f[3u * id + 1u]), float(c * force_mesh_y[index]));
  atomicAdd(&(part_f[3u * id + 2u]), float(c * force_mesh_z[index]));
}
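// Sign convention: c carries -prefactor * q_i, so the atomicAdd accumulates
// F_i = -q_i * (potential gradient interpolated from the three force meshes),
// using the same B-spline weights that spread the charge in the forward pass.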
void assign_forces(P3MGpuData const &params,
                   float const *const __restrict__ part_pos,
                   float const *const __restrict__ part_q,
                   float *const __restrict__ part_f,
                   REAL_TYPE const prefactor) {
  auto const cao = static_cast<unsigned int>(params.cao);
  auto const [parts_per_block, n_blocks] = p3m_calc_blocks(cao, params.n_part);
  dim3 block(parts_per_block * cao, cao, cao);
  dim3 grid = p3m_make_grid(n_blocks);

  /* shared memory for the cached B-spline weights (cao >= 3 only) */
  auto const data_length = std::size_t(3u) *
                           static_cast<std::size_t>(parts_per_block) *
                           static_cast<std::size_t>(cao) * sizeof(float);
  switch (cao) {
  case 1:
    (assign_forces_kernel<1, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 2:
    (assign_forces_kernel<2, false>)<<<grid, block, std::size_t(0u), nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 3:
    (assign_forces_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 4:
    (assign_forces_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 5:
    (assign_forces_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 6:
    (assign_forces_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 7:
    (assign_forces_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
}
void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
                  Utils::Vector3i const &mesh, double alpha,
                  Utils::Vector3d const &box_l, std::size_t n_part) {
  if (mesh == Utils::Vector3i::broadcast(-1))
    throw std::runtime_error("P3M: invalid mesh size");

  if (not data) {
    data = std::make_shared<P3MGpuParams>();
  }

  auto &p3m_gpu_data = data->p3m_gpu_data;
  bool do_reinit = false, mesh_changed = false;
  assert(n_part <= std::numeric_limits<unsigned int>::max());
  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);

  if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
    p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
    do_reinit = true;
  }

  if (not data->is_initialized or p3m_gpu_data.cao != cao) {
    p3m_gpu_data.cao = cao;
    // integer division is intentional
    p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
    do_reinit = true;
  }

  if (not data->is_initialized or mesh != Utils::Vector3i(p3m_gpu_data.mesh)) {
    std::copy(mesh.begin(), mesh.end(), p3m_gpu_data.mesh);
    mesh_changed = true;
    do_reinit = true;
  }

  // reinit when the box length has changed by more than a float epsilon
  if (auto constexpr eps =
          static_cast<double>(std::numeric_limits<float>::epsilon());
      not data->is_initialized or
      std::abs(static_cast<double>(p3m_gpu_data.box[0]) - box_l[0]) > eps or
      std::abs(static_cast<double>(p3m_gpu_data.box[1]) - box_l[1]) > eps or
      std::abs(static_cast<double>(p3m_gpu_data.box[2]) - box_l[2]) > eps) {
    std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
    do_reinit = true;
  }

  p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
  p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;

  for (auto i = 0u; i < 3u; ++i) {
    p3m_gpu_data.hi[i] =
        static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
  }

  if (data->is_initialized and mesh_changed) {
    data->free_device_memory();
    data->is_initialized = false;
  }

  if (not data->is_initialized and p3m_gpu_data.mesh_size > 0) {
    /* size of the complex mesh Nx * Ny * (Nz / 2 + 1) */
    auto const cmesh_size =
        static_cast<std::size_t>(p3m_gpu_data.mesh[0]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[1]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[2] / 2 + 1);
    auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX);
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat),
                             cmesh_size * sizeof(REAL_TYPE)));

    if (cufftPlan3d(&(data->p3m_fft.forw_plan), mesh[0], mesh[1], mesh[2],
                    FFT_PLAN_FORW_FLAG) != CUFFT_SUCCESS or
        cufftPlan3d(&(data->p3m_fft.back_plan), mesh[0], mesh[1], mesh[2],
                    FFT_PLAN_BACK_FLAG) != CUFFT_SUCCESS) {
      throw std::runtime_error("Unable to create fft plan");
    }
  }

  if ((do_reinit or not data->is_initialized) and p3m_gpu_data.mesh_size > 0) {
    dim3 grid(1u, 1u, 1u);
    dim3 block(1u, 1u, 1u);
    block.x = static_cast<unsigned>(512 / mesh[0] + 1);
    block.y = static_cast<unsigned>(mesh[1]);
    block.z = 1u;
    grid.x = static_cast<unsigned>(mesh[0]) / block.x + 1;
    grid.z = static_cast<unsigned>(mesh[2]) / 2 + 1;

    switch (p3m_gpu_data.cao) {
    case 1:
      KERNELCALL(calculate_influence_function_device<1>, grid, block,
                 p3m_gpu_data);
      break;
    case 2:
      KERNELCALL(calculate_influence_function_device<2>, grid, block,
                 p3m_gpu_data);
      break;
    case 3:
      KERNELCALL(calculate_influence_function_device<3>, grid, block,
                 p3m_gpu_data);
      break;
    case 4:
      KERNELCALL(calculate_influence_function_device<4>, grid, block,
                 p3m_gpu_data);
      break;
    case 5:
      KERNELCALL(calculate_influence_function_device<5>, grid, block,
                 p3m_gpu_data);
      break;
    case 6:
      KERNELCALL(calculate_influence_function_device<6>, grid, block,
                 p3m_gpu_data);
      break;
    case 7:
      KERNELCALL(calculate_influence_function_device<7>, grid, block,
                 p3m_gpu_data);
      break;
    default:
      break;
    }
  }
  if (p3m_gpu_data.mesh_size > 0)
    data->is_initialized = true;
}
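// Minimal call-order sketch (hypothetical values; in ESPResSo the System
// class drives this): initialize once, then recompute forces every step.
//
//   std::shared_ptr<P3MGpuParams> data;
//   p3m_gpu_init(data, /* cao */ 7, /* mesh */ {32, 32, 32},
//                /* alpha */ 1.0, /* box_l */ {10., 10., 10.}, n_part);
//   p3m_gpu_add_farfield_force(*data, gpu_particle_data, prefactor, n_part);
//
// p3m_gpu_init is cheap to call again with unchanged parameters: the
// do_reinit / mesh_changed flags ensure that meshes and FFT plans are only
// reallocated when the tuning parameters actually change.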
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
                                double prefactor, std::size_t n_part) {
  auto &p3m_gpu_data = data.p3m_gpu_data;
  assert(n_part <= std::numeric_limits<unsigned int>::max());
  p3m_gpu_data.n_part = static_cast<unsigned>(n_part);

  if (p3m_gpu_data.n_part == 0u)
    return;

  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const forces_device = gpu.get_particle_forces_device();

  dim3 gridConv(static_cast<unsigned>(p3m_gpu_data.mesh[0]),
                static_cast<unsigned>(p3m_gpu_data.mesh[1]), 1u);
  dim3 threadsConv(static_cast<unsigned>(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u);

  auto const volume =
      Utils::product(Utils::Vector3<REAL_TYPE>(p3m_gpu_data.box));
  /* force normalization (assumption: Coulomb prefactor over twice the box
     volume, pairing with the 2/pi factor folded into G_hat) */
  auto const force_prefac =
      static_cast<REAL_TYPE>(prefactor) / (volume * REAL_TYPE{2});

  cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0,
                           static_cast<std::size_t>(p3m_gpu_data.mesh_size) *
                               sizeof(REAL_TYPE)));

  /* interpolate the charges onto the mesh */
  assign_charges(p3m_gpu_data, positions_device, charges_device);

  /* forward FFT of the charge mesh */
  if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                   reinterpret_cast<REAL_TYPE *>(p3m_gpu_data.charge_mesh),
                   p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
    throw std::runtime_error("CUFFT error: Forward FFT failed");
  }

  /* convolution with the optimal influence function */
  KERNELCALL(apply_influence_function, gridConv, threadsConv, p3m_gpu_data);

  /* ik-differentiation: build the three Fourier-space force meshes */
  KERNELCALL(apply_diff_op, gridConv, threadsConv, p3m_gpu_data);

  /* backward FFT of the three force meshes */
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_x,
               reinterpret_cast<REAL_TYPE *>(p3m_gpu_data.force_mesh_x));
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_y,
               reinterpret_cast<REAL_TYPE *>(p3m_gpu_data.force_mesh_y));
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_z,
               reinterpret_cast<REAL_TYPE *>(p3m_gpu_data.force_mesh_z));

  /* interpolate the forces back onto the particles */
  assign_forces(p3m_gpu_data, positions_device, charges_device, forces_device,
                force_prefac);
}
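// Pipeline summary: charge assignment -> forward R2C FFT -> multiplication by
// G_hat -> ik-differentiation into three force meshes -> three backward C2R
// FFTs -> force interpolation back onto the particles. Everything stays on
// the device; only the tuning parameters cross the host boundary.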