#include "system/System.hpp"

#if defined(OMPI_MPI_H) || defined(_MPI_H)
#error CU-file includes mpi.h! This should not happen!
#endif

#define cudaSetDevice(d)
__constant__ float uz[1];
void CoulombMMM1DGpu::modpsi_init() {
  // build offset/length tables that linearize the jagged modPsi table
  std::vector<unsigned int> linModPsi_offsets(modPsi.size());
  std::vector<unsigned int> linModPsi_lengths(modPsi.size());
  for (std::size_t i = 0; i < modPsi.size(); i++) {
    if (i != 0u)
      linModPsi_offsets[i] =
          linModPsi_offsets[i - 1] + linModPsi_lengths[i - 1];
    linModPsi_lengths[i] = static_cast<unsigned int>(modPsi[i].size());
  }

  // flatten the Taylor coefficients into a single float buffer
  std::vector<float> linModPsi(linModPsi_offsets[modPsi.size() - 1] +
                               linModPsi_lengths[modPsi.size() - 1]);
  for (std::size_t i = 0; i < modPsi.size(); i++) {
    for (std::size_t j = 0; j < modPsi[i].size(); j++) {
      linModPsi[linModPsi_offsets[i] + j] = static_cast<float>(modPsi[i][j]);
    }
  }
  auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] +
                             linModPsi_lengths[modPsi.size() - 1];
  if (linModPsiSize > static_cast<unsigned int>(modpsi_constant_size))
    throw std::runtime_error(
        "__constant__ device_linModPsi[] is not large enough");
  // upload the linearized tables into __constant__ device memory
  cuda_safe_mem(cudaMemcpyToSymbol(::device_linModPsi_offsets,
                                   linModPsi_offsets.data(),
                                   modPsi.size() * sizeof(int)));
  cuda_safe_mem(cudaMemcpyToSymbol(::device_linModPsi_lengths,
                                   linModPsi_lengths.data(),
                                   modPsi.size() * sizeof(int)));
  cuda_safe_mem(cudaMemcpyToSymbol(::device_linModPsi, linModPsi.data(),
                                   linModPsiSize * sizeof(float)));
  auto const n_modPsi = static_cast<int>(modPsi.size() >> 1);
  cuda_safe_mem(cudaMemcpyToSymbol(::device_n_modPsi, &n_modPsi, sizeof(int)));
}
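As an aside, the linearization above is an instance of a generic pattern for flattening a jagged table into one contiguous buffer plus offset/length arrays. A minimal host-only sketch; the struct and function names here are illustrative, not part of this file:

#include <cstddef>
#include <vector>

struct FlatTable {
  std::vector<float> data;                    // all coefficients, back to back
  std::vector<unsigned int> offsets, lengths; // one entry per row
};

FlatTable flatten(std::vector<std::vector<double>> const &jagged) {
  FlatTable out;
  for (auto const &row : jagged) {
    out.offsets.push_back(static_cast<unsigned int>(out.data.size()));
    out.lengths.push_back(static_cast<unsigned int>(row.size()));
    for (double v : row)
      out.data.push_back(static_cast<float>(v)); // narrow to GPU precision
  }
  return out;
}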
  auto const n_ops_per_thread = n_ops / static_cast<std::size_t>(numThreads);
  return 1u + static_cast<unsigned int>(n_ops_per_thread);
}
void CoulombMMM1DGpu::setup() {
  auto const box_z = static_cast<float>(system.box_geo->length()[2]);
  auto const n_part = system.gpu.n_particles();
  if (not m_is_tuned and n_part != 0) {
    // ...
  }
  if (box_z != host_boxz) {
    set_params(box_z, 0, -1, -1, -1);
  }
  if (n_part == host_npart and pairs != -1) {
    // ...
  }
  // buffer for the O(N^2) pairwise force matrix
  auto const part_mem_size = 3ul * Utils::sqr(n_part) * sizeof(float);
  // ...
  std::size_t freeMem, totalMem;
  cudaMemGetInfo(&freeMem, &totalMem);
  if (freeMem / 2 < part_mem_size) {
    // not enough device memory for the pair buffer: use direct accumulation
    fprintf(stderr, "Switching to atomicAdd due to memory constraints.\n");
    // ...
  }
  if (dev_forcePairs) {
    // ...
  }
  cuda_safe_mem(cudaMalloc((void **)&dev_forcePairs, part_mem_size));
  // ...
  if (dev_energyBlocks) {
    // ...
  }
  host_npart = static_cast<unsigned int>(n_part);
  // ...
  if (dev_forcePairs) {
    // ...
  }
  if (dev_energyBlocks) {
    // ...
  }
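For scale, the pair buffer above grows quadratically with the particle count: for N = 10^4 particles,

\[ 3 N^2 \cdot \mathrm{sizeof(float)} = 3 \cdot (10^4)^2 \cdot 4\,\mathrm{B} = 1.2 \times 10^9\,\mathrm{B} \approx 1.2\,\mathrm{GB}, \]

which is why setup() falls back to plain atomicAdd accumulation unless at least twice that much device memory is free.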
__forceinline__ __device__ float sqpow(float x) { return x * x; }
__forceinline__ __device__ float cbpow(float x) { return x * x * x; }
__device__ void sumReduction(float *input, float *sum) {
  auto const tid = threadIdx.x;
  // shared-memory tree reduction: halve the active range each iteration
  for (auto i = blockDim.x / 2; i > 0; i /= 2) {
    if (tid < i)
      input[tid] += input[i + tid];
    __syncthreads();
  }
  // ...
}
__global__ void sumKernel(float *data, std::size_t N) {
  extern __shared__ float partialsums[];
  // ...
  std::size_t const tid = threadIdx.x;
  // ...
  for (std::size_t i = 0; i < N; i += blockDim.x) {
    if (i + tid >= N)
      partialsums[tid] = 0.f; // zero-pad past the end of the data
    else
      partialsums[tid] = data[i + tid];
    // ...
  }
  // ...
}
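Putting the two device functions together: a self-contained single-block sum, equivalent in spirit to sumKernel feeding sumReduction. The kernel name and simplifications are mine; it assumes blockDim.x is a power of two:

#include <cstddef>

__global__ void sumSketch(float *data, std::size_t N) {
  extern __shared__ float partialsums[];
  auto const tid = threadIdx.x;
  float acc = 0.f;
  for (std::size_t i = tid; i < N; i += blockDim.x)
    acc += data[i]; // each thread folds a strided slice of the input
  partialsums[tid] = acc;
  __syncthreads();
  for (auto s = blockDim.x / 2; s > 0; s /= 2) { // tree reduction
    if (tid < s)
      partialsums[tid] += partialsums[tid + s];
    __syncthreads();
  }
  if (tid == 0)
    data[0] = partialsums[0];
}

Launch as sumSketch<<<1, threads, threads * sizeof(float)>>>(data, N); the dynamic shared-memory size must match the block size.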
__global__ void besselTuneKernel(int *result, float far_switch_radius,
                                 int maxCut) {
  constexpr auto c_2pif = 2 * Utils::pi<float>();
  auto const arg = c_2pif * *uz * far_switch_radius;
  auto const pref = 4 * *uz * max(1.0f, c_2pif * *uz);
  // ...
    err = pref * dev_K1(arg * static_cast<float>(P)) * exp(arg) / arg *
          (static_cast<float>(P) - 1 + 1 / arg);
  // ...
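Read directly off the code above, with \( \alpha = 2\pi u_z r_\mathrm{sw} \), the truncation-error estimate evaluated for each candidate cutoff P is

\[ \mathrm{err}(P) = \underbrace{4 u_z \max(1,\, 2\pi u_z)}_{\texttt{pref}} \cdot \frac{K_1(\alpha P)\, e^{\alpha}}{\alpha} \left(P - 1 + \frac{1}{\alpha}\right), \]

which the kernel compares against the maxPWerror tolerance to pick the smallest acceptable Bessel cutoff.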
  auto const maxrad = host_boxz;
  // scan candidate switching radii in steps of 5% of the box length
  float besttime = INFINITY;
  auto radius = 0.05 * maxrad;
  while (radius < maxrad) {
    // ...
    auto const runtime = force_benchmark();
    if (runtime < besttime) {
      // ...
    }
    radius += 0.05 * maxrad;
  }
  constexpr auto maxCut = 30;
  int *dev_cutoff;
  cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int)));
  KERNELCALL(besselTuneKernel, 1, 1, dev_cutoff, far_switch_radius_f, maxCut);
  cuda_safe_mem(cudaMemcpy(&best_cutoff, dev_cutoff, sizeof(int),
                           cudaMemcpyDeviceToHost));
  // ...
  throw std::runtime_error(
      "No reasonable Bessel cutoff could be determined.");
void CoulombMMM1DGpu::set_params(double boxz, double prefactor,
                                 double maxPWerror, double far_switch_radius,
                                 int bessel_cutoff) {
  if (far_switch_radius > boxz) {
    throw std::runtime_error(
        "switching radius must not be larger than box length");
  }
  auto const far_switch_radius_sq_f =
      static_cast<float>(Utils::sqr(far_switch_radius));
  cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_sq,
                                   &far_switch_radius_sq_f, sizeof(float)));
  // ...
  host_boxz = static_cast<float>(boxz);
  auto const uz = 1.0f / host_boxz;
  cuda_safe_mem(cudaMemcpyToSymbol(::uz, &uz, sizeof(float)));
  // ...
  auto const prefactor_f = static_cast<float>(prefactor);
  cuda_safe_mem(cudaMemcpyToSymbol(::coulomb_prefactor, &prefactor_f,
                                   sizeof(float)));
  // ...
  auto const maxPWerror_f = static_cast<float>(maxPWerror);
  cuda_safe_mem(cudaMemcpyToSymbol(::maxPWerror, &maxPWerror_f,
                                   sizeof(float)));
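All of these uploads rely on cudaMemcpyToSymbol(), which copies host data into a __constant__ device symbol. A self-contained sketch of the pattern; the array name and capacity are illustrative:

#include <cuda_runtime.h>
#include <vector>

__constant__ float device_table[256]; // fixed compile-time capacity

void upload_table(std::vector<float> const &host_table) {
  // the copy must fit within the compile-time capacity of the symbol
  cudaMemcpyToSymbol(device_table, host_table.data(),
                     host_table.size() * sizeof(float));
}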
__global__ void forcesKernel(float const *const __restrict__ r,
                             float const *const __restrict__ q,
                             float *const __restrict__ force, std::size_t N,
                             int pairs) {
  constexpr auto c_2pif = 2.f * Utils::pi<float>();
  // ...
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
       tid += blockDim.x * gridDim.x) {
    auto const p1 = tid % N, p2 = tid / N;
    auto x = r[3 * p2 + 0] - r[3 * p1 + 0];
    auto y = r[3 * p2 + 1] - r[3 * p1 + 1];
    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
    // ...
    auto rxy = sqrt(rxy2);
    // ...
    // fold z back into the primary box along the periodic axis
    while (fabs(z) > *boxz / 2.f)
      z -= (z > 0.f ? 1.f : -1.f) * *boxz;
    // ...
    auto const uzz = *uz * z;
    auto const uzr = *uz * rxy;
    auto const sum_r_old = sum_r;
    // ...
    sum_r += 2 * static_cast<float>(n) * mpe * uzrpow;
    // ...
    sum_z += mpo * uzrpow;
      float arg = c_2pif * *uz * static_cast<float>(p);
      sum_r += static_cast<float>(p) * dev_K1(arg * rxy) * cos(arg * z);
      sum_z += static_cast<float>(p) * dev_K0(arg * rxy) * sin(arg * z);
    // ...
    sum_r *= sqpow(*uz) * 4.f * c_2pif;
    sum_z *= sqpow(*uz) * 4.f * c_2pif;
    sum_r += 2.f * *uz / rxy;
    // ...
    if (pairs) {
      // store the full pair-force matrix for later reduction
      force[3 * (p1 + p2 * N) + 0] = pref * sum_r / rxy * x;
      force[3 * (p1 + p2 * N) + 1] = pref * sum_r / rxy * y;
      force[3 * (p1 + p2 * N) + 2] = pref * sum_z;
    } else {
      // accumulate directly into the per-particle force
      atomicAdd(&force[3 * p2 + 0], pref * sum_r / rxy * x);
      atomicAdd(&force[3 * p2 + 1], pref * sum_r / rxy * y);
      atomicAdd(&force[3 * p2 + 2], pref * sum_z);
    }
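In the else branch, many threads can target the same particle p2 at once, so a plain += would be a data race. A minimal sketch of the atomicAdd accumulation pattern (illustrative kernel, not from this file):

#include <cstddef>

__global__ void accumulateSketch(float const *contrib, float *totals,
                                 std::size_t n_pairs, std::size_t n_part) {
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < n_pairs;
       tid += blockDim.x * gridDim.x) {
    auto const target = tid % n_part;         // many tids map to one particle
    atomicAdd(&totals[target], contrib[tid]); // hardware-serialized update
  }
}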
__global__ void energiesKernel(float const *const __restrict__ r,
                               float const *const __restrict__ q,
                               float *const __restrict__ energy,
                               std::size_t N, int pairs) {
  constexpr auto c_2pif = 2.f * Utils::pi<float>();
  constexpr auto c_gammaf = Utils::gamma<float>();
  extern __shared__ float partialsums[];
  // ...
  partialsums[threadIdx.x] = 0;
  // ...
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
       tid += blockDim.x * gridDim.x) {
    auto const p1 = tid % N, p2 = tid / N;
    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
    auto const rxy2 = sqpow(r[3 * p2 + 0] - r[3 * p1 + 0]) +
                      sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]);
    auto rxy = sqrt(rxy2);
    // ...
    while (fabs(z) > *boxz / 2.f)
      z -= (z > 0.f ? 1.f : -1.f) * *boxz;
    // ...
    auto const uzz = *uz * z;
    auto const uzr2 = sqpow(*uz * rxy);
    auto const sum_e_old = sum_e;
    // ...
    sum_e += mpe * uzrpow;
    // ...
    sum_e -= 2.f * *uz * c_gammaf;
    sum_e += rsqrt(rxy2 + sqpow(z));
      sum_e = -(log(rxy * *uz / 2.f) + c_gammaf) / 2.f;
      // ...
      auto const arg = c_2pif * *uz * static_cast<float>(p);
      sum_e += dev_K0(arg * rxy) * cos(arg * z);
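Transcribed into formula form, the far-formula branch above computes, per particle pair and up to the overall prefactor applied afterwards,

\[ E_\mathrm{far}(\rho, z) \propto -\frac{1}{2}\left(\ln\frac{\rho\, u_z}{2} + \gamma\right) + \sum_{p=1}^{P} K_0(2\pi u_z p\, \rho) \cos(2\pi u_z p\, z), \]

where \( \rho \) is the transverse distance (rxy), \( u_z = 1/L_z \) the inverse box length, \( \gamma \) the Euler-Mascheroni constant (c_gammaf), \( K_0 \) the modified Bessel function of the second kind, and P the tuned Bessel cutoff.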
__global__ void vectorReductionKernel(float const *src, float *dst,
                                      std::size_t N) {
  // ...
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
       tid += blockDim.x * gridDim.x) {
    auto const offset = tid % N;
    for (std::size_t i = 0; tid + i * N < tStop; i++) {
      // ...
      for (std::size_t d = 0; d < 3; d++) {
        dst[3 * offset + d] -= src[3 * (tid + i * N) + d];
      }
    }
  }
}
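For clarity, a host-side C++ equivalent of the reduction indexing above; a sketch assuming tStop = N*N, with the subtraction mirroring the kernel's sign convention:

#include <cstddef>
#include <vector>

void reduce_pairs(std::vector<float> const &src, std::vector<float> &dst,
                  std::size_t N) {
  for (std::size_t p = 0; p < N; p++)      // particle receiving the force
    for (std::size_t i = 0; i < N; i++)    // its interaction partner slot
      for (std::size_t d = 0; d < 3; d++)  // x, y, z components
        dst[3 * p + d] -= src[3 * (p + i * N) + d];
}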
    throw std::runtime_error("MMM1D was not initialized correctly");
  // ...
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const forces_device = gpu.get_particle_forces_device();
  auto const n_part = gpu.n_particles();
  // ...
      charges_device, dev_forcePairs, n_part, pairs)
  // ...
      forces_device, n_part)
  // ...
      charges_device, forces_device, n_part, pairs)
__global__ void scaleAndAddKernel(float *const dst, float const *const src,
                                  std::size_t N, float factor) {
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
       tid += blockDim.x * gridDim.x) {
    dst[tid] += src[tid] * factor;
  }
}
    throw std::runtime_error("MMM1D was not initialized correctly");
  // ...
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const energy_device = gpu.get_energy_device();
  auto const n_part = gpu.n_particles();
  auto const shared = numThreads * static_cast<unsigned>(sizeof(float));
  // ...
      positions_device, charges_device, dev_energyBlocks, n_part,
  // ...
  // each unordered pair is visited twice, hence the factor 1/2
  auto constexpr factor = 0.5f;
  // ...
      dev_energyBlocks, 1, factor);
float CoulombMMM1DGpu::force_benchmark() {
  cudaEvent_t eventStart, eventStop;
  // ...
  float *dev_f_benchmark;
  // ...
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const n_part = gpu.n_particles();
  // ...
  cuda_safe_mem(cudaMalloc((void **)&dev_f_benchmark,
                           3ul * n_part * sizeof(float)));
  // ...
      charges_device, dev_f_benchmark, n_part, 0)
  // ...
  cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop));
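The elapsed-time measurement follows the standard CUDA event recipe; a self-contained sketch with the benchmarked launch elided (error checking omitted):

#include <cuda_runtime.h>

float time_kernel_ms() {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  // ... launch the kernel to be measured here ...
  cudaEventRecord(stop);
  cudaEventSynchronize(stop); // block until work recorded before `stop` is done
  float elapsed_ms = 0.f;
  cudaEventElapsedTime(&elapsed_ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return elapsed_ms;
}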
double far_switch_radius_sq
void add_long_range_forces()
void add_long_range_energy()
double prefactor
Electrostatics prefactor.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
This file contains the defaults for ESPResSo.
std::vector< std::vector< double > > modPsi
Table of the Taylor expansions of the modified polygamma functions.
void create_mod_psi_up_to(int new_n)
Create both the even and odd polygamma functions up to order 2*new_n.
Common parts of the MMM family of methods for the electrostatic interaction: MMM1D and ELC.
MMM1D algorithm for long-range Coulomb interactions on the GPU.
__constant__ float device_linModPsi[modpsi_constant_size]
__global__ void vectorReductionKernel(float const *src, float *dst, std::size_t N)
__constant__ int bessel_cutoff[1]
__constant__ float far_switch_radius_sq[1]
static constexpr int deviceCount
__constant__ float maxPWerror[1]
__device__ float dev_mod_psi_odd(int n, float x)
__global__ void forcesKernel(float const *const __restrict__ r, float const *const __restrict__ q, float *const __restrict__ force, std::size_t N, int pairs)
__constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order]
__constant__ int device_n_modPsi[1]
__device__ void sumReduction(float *input, float *sum)
__global__ void energiesKernel(float const *const __restrict__ r, float const *const __restrict__ q, float *const __restrict__ energy, std::size_t N, int pairs)
static auto numBlocksOps(std::size_t n_ops)
Get number of blocks for a given number of operations.
__constant__ float boxz[1]
__forceinline__ __device__ float cbpow(float x)
static constexpr unsigned int numThreads
__forceinline__ __device__ float sqpow(float x)
__global__ void besselTuneKernel(int *result, float far_switch_radius, int maxCut)
__constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order]
__global__ void sumKernel(float *data, std::size_t N)
__device__ float dev_mod_psi_even(int n, float x)
static auto numBlocks(std::size_t n_part)
Get number of blocks for a given number of particles.
__global__ void scaleAndAddKernel(float *const dst, float const *const src, std::size_t N, float factor)
__constant__ float coulomb_prefactor[1]
constexpr int modpsi_order
constexpr int modpsi_constant_size
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
This file contains implementations of the modified Bessel functions of the first and second kind.
__device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x)
__device__ float dev_K0(float x)
__device__ float dev_K1(float x)
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
#define KERNELCALL(_function, _grid, _block,...)