32#define ESPRESSO_P3M_GPU_FLOAT
35#ifdef ESPRESSO_P3M_GPU_FLOAT
36#define REAL_TYPE float
37#define FFT_TYPE_COMPLEX cufftComplex
38#define FFT_FORW_FFT cufftExecR2C
39#define FFT_BACK_FFT cufftExecC2R
40#define FFT_PLAN_FORW_FLAG CUFFT_R2C
41#define FFT_PLAN_BACK_FLAG CUFFT_C2R
44#ifdef ESPRESSO_P3M_GPU_REAL_DOUBLE
45#define REAL_TYPE double
46#define FFT_TYPE_COMPLEX cufftDoubleComplex
47#define FFT_FORW_FFT cufftExecD2Z
48#define FFT_BACK_FFT cufftExecZ2D
49#define FFT_PLAN_FORW_FLAG CUFFT_D2Z
50#define FFT_PLAN_BACK_FLAG CUFFT_Z2D
57#include "system/System.hpp"
73#if defined(OMPI_MPI_H) || defined(_MPI_H)
74#error CU-file includes mpi.h! This should not happen!
125 if (ptr !=
nullptr) {
142 auto const cao3 = Utils::int_pow<3>(cao);
156 while (
grid.x > 65536u) {
178 for (
int i = 0; i < 3; ++i) {
201 TE =
exp(-sqr(std::numbers::pi_v<REAL_TYPE> / (p.
alpha)) *
NM2);
223 for (
int i = 0; i < 3; ++i) {
232 if (((
NX == 0) && (
NY == 0) && (
NZ == 0)) ||
233 ((
NX % (p.
mesh[0] / 2) == 0) && (
NY % (p.
mesh[1] / 2) == 0) &&
234 (
NZ % (p.
mesh[2] / 2) == 0))) {
260 return static_cast<unsigned int>(p.
mesh[1] * (p.
mesh[2] / 2 + 1) * i +
261 (p.
mesh[2] / 2 + 1) *
j + k);
282 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
nx) * buf.x / p.
box[0];
284 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
nx) * buf.y / p.
box[0];
287 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
ny) * buf.x / p.
box[1];
289 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
ny) * buf.y / p.
box[1];
292 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
nz) * buf.x / p.
box[2];
294 static_cast<decltype(FFT_TYPE_COMPLEX::x)
>(
nz) * buf.y / p.
box[2];
314template <
int cao,
bool shared>
355 auto const offset =
static_cast<unsigned int>(cao) *
part_in_block;
363 auto const c = weights[3u * offset + 3u *
cao_id_x] *
364 weights[3u * offset + 3u *
threadIdx.y + 1u] *
370 Utils::bspline<cao>(
static_cast<int>(
cao_id_x), m_pos[0]) *
part_q[id] *
371 Utils::bspline<cao>(
static_cast<int>(
threadIdx.y), m_pos[1]) *
372 Utils::bspline<cao>(
static_cast<int>(
threadIdx.z), m_pos[2]);
380 auto const cao =
static_cast<unsigned int>(params.
cao);
387 static_cast<std::size_t
>(cao) *
sizeof(
REAL_TYPE);
388 switch (params.
cao) {
423template <
int cao,
bool shared>
436 if (
id >=
static_cast<unsigned>(params.
n_part))
465 auto const offset =
static_cast<unsigned int>(cao) *
part_in_block;
474 weights[3u * offset + 3u *
threadIdx.y + 1u] *
475 weights[3u * offset + 3u *
threadIdx.z + 2u]);
479 Utils::bspline<cao>(
static_cast<int>(
threadIdx.y), m_pos[1]) *
480 Utils::bspline<cao>(
static_cast<int>(
threadIdx.z), m_pos[2]));
497 auto const cao =
static_cast<unsigned int>(params.
cao);
506 static_cast<std::size_t
>(cao) *
sizeof(
float);
507 switch (params.
cao) {
555 data = std::make_shared<P3MGpuParams>();
558 auto &p3m_gpu_data = data->p3m_gpu_data;
560 assert(n_part <= std::numeric_limits<unsigned int>::max());
561 p3m_gpu_data.n_part =
static_cast<unsigned>(n_part);
563 if (
not data->is_initialized
or p3m_gpu_data.alpha != alpha) {
564 p3m_gpu_data.alpha =
static_cast<REAL_TYPE>(alpha);
568 if (
not data->is_initialized
or p3m_gpu_data.cao != cao) {
569 p3m_gpu_data.cao = cao;
571 p3m_gpu_data.pos_shift =
static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
576 std::ranges::copy(mesh, p3m_gpu_data.mesh);
581 if (
auto constexpr eps =
582 static_cast<double>(std::numeric_limits<float>::epsilon());
583 not data->is_initialized
or
585 std::ranges::copy(
box_l, p3m_gpu_data.box);
589 p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
590 p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;
592 for (
auto i = 0
u; i < 3u; ++i) {
594 static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
598 data->free_device_memory();
599 data->is_initialized =
false;
602 if (
not data->is_initialized
and p3m_gpu_data.mesh_size > 0) {
605 static_cast<std::size_t
>(p3m_gpu_data.mesh[0]) *
606 static_cast<std::size_t
>(p3m_gpu_data.mesh[1]) *
607 static_cast<std::size_t
>(p3m_gpu_data.mesh[2] / 2 + 1);
616 if (
cufftPlan3d(&(data->p3m_fft.forw_plan), mesh[0], mesh[1], mesh[2],
618 cufftPlan3d(&(data->p3m_fft.back_plan), mesh[0], mesh[1], mesh[2],
620 throw std::runtime_error(
"Unable to create fft plan");
627 block.x =
static_cast<unsigned>(512 / mesh[0] + 1);
628 block.y =
static_cast<unsigned>(mesh[1]);
630 grid.x =
static_cast<unsigned>(mesh[0]) /
block.x + 1;
631 grid.z =
static_cast<unsigned>(mesh[2]) / 2 + 1;
633 switch (p3m_gpu_data.cao) {
664 if (p3m_gpu_data.mesh_size > 0)
665 data->is_initialized =
true;
672 double prefactor, std::size_t n_part) {
674 p3m_gpu_data.
n_part =
static_cast<unsigned>(n_part);
683 dim3 gridConv(
static_cast<unsigned>(p3m_gpu_data.mesh[0]),
684 static_cast<unsigned>(p3m_gpu_data.mesh[1]), 1u);
685 dim3 threadsConv(
static_cast<unsigned>(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u);
692 static_cast<std::size_t
>(p3m_gpu_data.mesh_size) *
702 throw std::runtime_error(
"CUFFT error: Forward FFT failed");
Particle data communication manager for the GPU.
float * get_particle_charges_device() const
float * get_particle_forces_device() const
float * get_particle_positions_device() const
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
void cuda_check_errors_exit(const dim3 &block, const dim3 &grid, const char *function, const char *file, unsigned int line)
In case of error during a CUDA operation, print the error message and exit.
static double * block(double *p, std::size_t index, std::size_t size)
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
T product(Vector< T, N > const &v)
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
DEVICE_QUALIFIER constexpr T int_pow(T x)
Calculate integer powers.
__device__ auto linear_index_k(P3MGpuData const &p, int i, int j, int k)
__device__ auto linear_index_r(P3MGpuData const &p, int i, int j, int k)
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
__device__ static void Aliasing_sums_ik(const P3MGpuData p, int NX, int NY, int NZ, REAL_TYPE *Zaehler, REAL_TYPE *Nenner)
__global__ void assign_charge_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, unsigned int const parts_per_block)
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part)
The long-range part of the P3M algorithm.
__global__ void apply_influence_function(const P3MGpuData p)
void assign_charges(P3MGpuData const ¶ms, float const *const __restrict__ part_pos, float const *const __restrict__ part_q)
#define FFT_PLAN_FORW_FLAG
__global__ void apply_diff_op(const P3MGpuData p)
void p3m_gpu_init(std::shared_ptr< P3MGpuParams > &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part)
Initialize the internal data structure of the P3M GPU.
static auto p3m_calc_blocks(unsigned int cao, std::size_t n_part)
void assign_forces(P3MGpuData const ¶ms, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE const prefactor)
__global__ void calculate_influence_function_device(const P3MGpuData p)
__device__ int wrap_index(const int ind, const int mesh)
#define FFT_PLAN_BACK_FLAG
dim3 p3m_make_grid(unsigned int n_blocks)
__global__ void assign_forces_kernel(P3MGpuData const params, float const *const __restrict__ part_pos, float const *const __restrict__ part_q, float *const __restrict__ part_f, REAL_TYPE prefactor, unsigned int const parts_per_block)
int mesh[3]
Mesh dimensions.
REAL_TYPE pos_shift
Position shift.
REAL_TYPE * G_hat
Influence Function.
FFT_TYPE_COMPLEX * force_mesh_x
Force meshes.
int mesh_z_padded
Padded size.
int mesh_size
Total number of mesh points (including padding)
int cao
Charge assignment order.
unsigned int n_part
Number of particles.
FFT_TYPE_COMPLEX * force_mesh_z
REAL_TYPE hi[3]
Inverse mesh spacing.
FFT_TYPE_COMPLEX * charge_mesh
Charge mesh.
REAL_TYPE box[3]
Box size.
FFT_TYPE_COMPLEX * force_mesh_y
REAL_TYPE alpha
Ewald parameter.
cufftHandle forw_plan
Forward FFT plan.
cufftHandle back_plan
Backward FFT plan.
void free_device_memory()
#define KERNELCALL(_function, _grid, _block,...)