ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
mmm1d_gpu_cuda.cu
/*
 * Copyright (C) 2014-2022 The ESPResSo project
 *
 * This file is part of ESPResSo.
 *
 * ESPResSo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ESPResSo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/** @file
 * This file contains the code for the polygamma expansions used for the
 * near formulas of MMM1D on GPU, as well as the force kernels.
 */

#include "config/config.hpp"

#ifdef MMM1D_GPU

#include "electrostatics/mmm-common.hpp"
#include "electrostatics/mmm1d_gpu.hpp"
#include "electrostatics/specfunc.cuh"

#include "BoxGeometry.hpp"
#include "cuda/utils.cuh"
#include "system/System.hpp"

#include <utils/constants.hpp>
#include <utils/math/sqr.hpp>

#include <cuda.h>

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <vector>

#if defined(OMPI_MPI_H) || defined(_MPI_H)
#error CU-file includes mpi.h! This should not happen!
#endif

// the code is mostly multi-GPU capable, but ESPResSo is not yet
static constexpr int deviceCount = 1;
static constexpr unsigned int numThreads = 64u;

#undef cudaSetDevice
#define cudaSetDevice(d)

__constant__ float far_switch_radius_sq[1] = {0.05f * 0.05f};
__constant__ float boxz[1];
__constant__ float uz[1];
__constant__ float coulomb_prefactor[1] = {1.0f};
__constant__ int bessel_cutoff[1] = {5};
__constant__ float maxPWerror[1] = {1e-5f};

// As the coefficients are stored in __constant__ memory, the array needs to be
// sized in advance. We don't know exactly how many coefficients per order, so
// we size it generously.
constexpr int modpsi_order = 30;
constexpr int modpsi_constant_size = modpsi_order * modpsi_order * 2;

// linearized array on device
__constant__ int device_n_modPsi[1] = {0};
__constant__ unsigned int device_linModPsi_offsets[2 * modpsi_order];
__constant__ unsigned int device_linModPsi_lengths[2 * modpsi_order];
__constant__ float device_linModPsi[modpsi_constant_size];

__device__ float dev_mod_psi_even(int n, float x) {
  return evaluateAsTaylorSeriesAt(
      &device_linModPsi[device_linModPsi_offsets[2 * n]],
      static_cast<int>(device_linModPsi_lengths[2 * n]), x * x);
}

__device__ float dev_mod_psi_odd(int n, float x) {
  return x * evaluateAsTaylorSeriesAt(
                 &device_linModPsi[device_linModPsi_offsets[2 * n + 1]],
                 static_cast<int>(device_linModPsi_lengths[2 * n + 1]), x * x);
}

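// A host-side analogue of dev_mod_psi_even(), shown here only as an
// illustrative sketch (it is not part of the build and is never called by the
// kernels). It assumes evaluateAsTaylorSeriesAt() in specfunc.cuh is a plain
// Horner evaluation of the stored Taylor coefficients in x^2, which is exactly
// what the device tables above linearize.
[[maybe_unused]] static double
host_mod_psi_even(std::vector<std::vector<double>> const &psi, int n,
                  double x) {
  auto const &c = psi[2 * static_cast<std::size_t>(n)]; // even-order table
  auto acc = 0.0;
  for (auto it = c.rbegin(); it != c.rend(); ++it)
    acc = acc * (x * x) + *it; // Horner scheme in powers of x^2
  return acc;
}
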
void CoulombMMM1DGpu::modpsi_init() {
  create_mod_psi_up_to(modpsi_order);

  // linearized array on host
  std::vector<unsigned int> linModPsi_offsets(modPsi.size());
  std::vector<unsigned int> linModPsi_lengths(modPsi.size());
  for (std::size_t i = 0; i < modPsi.size(); i++) {
    if (i)
      linModPsi_offsets[i] =
          linModPsi_offsets[i - 1] + linModPsi_lengths[i - 1];
    linModPsi_lengths[i] = static_cast<unsigned int>(modPsi[i].size());
  }

  // linearize the coefficients array
  std::vector<float> linModPsi(linModPsi_offsets[modPsi.size() - 1] +
                               linModPsi_lengths[modPsi.size() - 1]);
  for (std::size_t i = 0; i < modPsi.size(); i++) {
    for (std::size_t j = 0; j < modPsi[i].size(); j++) {
      linModPsi[linModPsi_offsets[i] + j] = static_cast<float>(modPsi[i][j]);
    }
  }

  for (int d = 0; d < deviceCount; d++) {
    cudaSetDevice(d);

    // copy to GPU
    auto const linModPsiSize = linModPsi_offsets[modPsi.size() - 1] +
                               linModPsi_lengths[modPsi.size() - 1];
    if (linModPsiSize > static_cast<unsigned int>(modpsi_constant_size)) {
      throw std::runtime_error(
          "__constant__ device_linModPsi[] is not large enough");
    }
    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_offsets,
                                     linModPsi_offsets.data(),
                                     modPsi.size() * sizeof(int)));
    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi_lengths,
                                     linModPsi_lengths.data(),
                                     modPsi.size() * sizeof(int)));
    cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi, linModPsi.data(),
                                     linModPsiSize * sizeof(float)));
    auto const n_modPsi = static_cast<int>(modPsi.size() >> 1);
    cuda_safe_mem(cudaMemcpyToSymbol(device_n_modPsi, &n_modPsi, sizeof(int)));
  }
}

/** @brief Get number of blocks for a given number of operations. */
static auto numBlocksOps(std::size_t n_ops) {
  auto const n_ops_per_thread = n_ops / static_cast<std::size_t>(numThreads);
  return 1u + static_cast<unsigned int>(n_ops_per_thread);
}

/** @brief Get number of blocks for a given number of particles. */
static auto numBlocks(std::size_t n_part) {
  auto b = numBlocksOps(Utils::sqr(n_part)); // algorithm is N-square
  if (b > 65535u)
    b = 65535u;
  return b;
}
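
// Worked example for the block-count helpers above (illustration only): for
// n_part = 1000 particles the pair kernels sweep n_part^2 = 1,000,000 ordered
// pairs, so numBlocksOps() returns 1 + 1,000,000 / 64 = 15,626 blocks with
// numThreads = 64. The cap at 65535 corresponds to the maximum x-dimension of
// a 1D grid on older devices; it is harmless because all kernels below use
// grid-stride loops and simply pick up the remaining work in later strides.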

void CoulombMMM1DGpu::setup() {
  auto &system = get_system();
  auto const box_z = static_cast<float>(system.box_geo->length()[2]);
  auto const n_part = system.gpu.n_particles();
  if (not m_is_tuned and n_part != 0) {
    set_params(box_z, prefactor, maxPWerror, far_switch_radius, bessel_cutoff);
    tune(maxPWerror, far_switch_radius, bessel_cutoff);
  }
  if (box_z != host_boxz) {
    set_params(box_z, 0, -1, -1, -1);
  }
  // skip device memory reallocation if device memory is already
  // allocated with the correct vector lengths
  if (n_part == host_npart and pairs != -1) {
    return;
  }
  // For all but the largest systems, it is faster to store force pairs
  // and then sum them up. Atomics are slow, so we only use them when
  // we're limited by device memory.
  auto const part_mem_size = 3ul * Utils::sqr(n_part) * sizeof(float);
  pairs = 2;
  for (int d = 0; d < deviceCount; d++) {
    cudaSetDevice(d);

    std::size_t freeMem, totalMem;
    cudaMemGetInfo(&freeMem, &totalMem);
    if (freeMem / 2 < part_mem_size) {
      // don't use more than half the device's memory
      fprintf(stderr, "Switching to atomicAdd due to memory constraints.\n");
      pairs = 0;
      break;
    }
  }
  if (dev_forcePairs) {
    cuda_safe_mem(cudaFree(dev_forcePairs));
  }
  if (pairs) {
    // we need memory to store force pairs
    cuda_safe_mem(cudaMalloc((void **)&dev_forcePairs, part_mem_size));
  }
  if (dev_energyBlocks) {
    cuda_safe_mem(cudaFree(dev_energyBlocks));
  }
  cuda_safe_mem(cudaMalloc((void **)&dev_energyBlocks,
                           numBlocks(n_part) * sizeof(float)));
  host_npart = static_cast<unsigned int>(n_part);
}

CoulombMMM1DGpu::~CoulombMMM1DGpu() {
  if (dev_forcePairs) {
    cuda_safe_mem(cudaFree(dev_forcePairs));
  }
  if (dev_energyBlocks) {
    cuda_safe_mem(cudaFree(dev_energyBlocks));
  }
}

__forceinline__ __device__ float sqpow(float x) { return x * x; }
__forceinline__ __device__ float cbpow(float x) { return x * x * x; }

__device__ void sumReduction(float *input, float *sum) {
  auto const tid = threadIdx.x;
  for (auto i = blockDim.x / 2; i > 0; i /= 2) {
    __syncthreads();
    if (tid < i)
      input[tid] += input[i + tid];
  }
  __syncthreads();
  if (tid == 0)
    sum[0] = input[0];
}

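// sumReduction() is a shared-memory tree reduction: it assumes blockDim.x is a
// power of two (numThreads = 64 satisfies this) and that `input` points to a
// shared array holding one partial value per thread; the __syncthreads()
// barriers keep every halving step in lockstep. sumKernel() below runs it from
// a single block to accumulate the per-block energies written by
// energiesKernel().
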
__global__ void sumKernel(float *data, std::size_t N) {
  extern __shared__ float partialsums[];
  if (blockIdx.x != 0)
    return;
  std::size_t const tid = threadIdx.x;
  auto result = 0.f;

  for (std::size_t i = 0; i < N; i += blockDim.x) {
    if (i + tid >= N)
      partialsums[tid] = 0.f;
    else
      partialsums[tid] = data[i + tid];

    sumReduction(partialsums, &result);
    if (tid == 0) {
      if (i == 0)
        data[0] = 0.f;
      data[0] += result;
    }
  }
}

__global__ void besselTuneKernel(int *result, float far_switch_radius,
                                 int maxCut) {
  constexpr auto c_2pif = 2 * Utils::pi<float>();
  auto const arg = c_2pif * *uz * far_switch_radius;
  auto const pref = 4 * *uz * max(1.0f, c_2pif * *uz);
  float err;
  int P = 1;
  do {
    err = pref * dev_K1(arg * static_cast<float>(P)) * exp(arg) / arg *
          (static_cast<float>(P) - 1 + 1 / arg);
    P++;
  } while (err > *maxPWerror && P <= maxCut);
  P--;

  result[0] = P;
}

void CoulombMMM1DGpu::tune(double maxPWerror, double far_switch_radius,
                           int bessel_cutoff) {

  if (far_switch_radius < 0.0 && bessel_cutoff < 0) {
    // autodetermine switching radius and Bessel cutoff
    auto const maxrad = host_boxz;
    auto bestrad = 0.0;
    float besttime = INFINITY;
    auto radius = 0.05 * maxrad;
    while (radius < maxrad) {
      set_params(0, 0, maxPWerror, radius, bessel_cutoff);
      tune(maxPWerror, radius, -2); // tune Bessel cutoff
      auto const runtime = force_benchmark();
      if (runtime < besttime) {
        besttime = runtime;
        bestrad = radius;
      }
      radius += 0.05 * maxrad;
    }
    set_params(0, 0, maxPWerror, bestrad, bessel_cutoff);
    tune(maxPWerror, bestrad, -2); // tune Bessel cutoff
  } else if (bessel_cutoff < 0) {
    // autodetermine Bessel cutoff
    auto const far_switch_radius_f = static_cast<float>(far_switch_radius);
    int *dev_cutoff;
    constexpr auto maxCut = 30;
    cuda_safe_mem(cudaMalloc((void **)&dev_cutoff, sizeof(int)));
    besselTuneKernel<<<dim3(1), dim3(1), 0, nullptr>>>(
        dev_cutoff, far_switch_radius_f, maxCut);
    int best_cutoff = 0;
    cuda_safe_mem(cudaMemcpy(&best_cutoff, dev_cutoff, sizeof(int),
                             cudaMemcpyDeviceToHost));
    cuda_safe_mem(cudaFree(dev_cutoff));
    if (bessel_cutoff != -2 && best_cutoff >= maxCut) {
      // we already had our switching radius and only needed to
      // determine the cutoff, i.e. this was the final tuning round
      throw std::runtime_error(
          "No reasonable Bessel cutoff could be determined.");
    }

    set_params(0, 0, maxPWerror, far_switch_radius, best_cutoff);
  }
}
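
// Tuning strategy used above: with both parameters unset, tune() scans the
// switching radius in steps of 5% of the box length; for each candidate it
// first tunes the Bessel cutoff (the recursive call with bessel_cutoff == -2)
// and then times forcesKernel() via force_benchmark(), keeping the fastest
// radius. With only the cutoff unset, besselTuneKernel() increases the cutoff
// until the estimated truncation error of the far formula drops below
// maxPWerror.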

void CoulombMMM1DGpu::set_params(double boxz, double prefactor,
                                 double maxPWerror, double far_switch_radius,
                                 int bessel_cutoff) {
  if (boxz > 0.0 && far_switch_radius > boxz) {
    throw std::runtime_error(
        "switching radius must not be larger than box length");
  }

  for (int d = 0; d < deviceCount; d++) {
    cudaSetDevice(d);
    if (far_switch_radius >= 0.0) {
      this->far_switch_radius = far_switch_radius;
      far_switch_radius_sq = Utils::sqr(far_switch_radius);
      auto const far_switch_radius_sq_f =
          static_cast<float>(far_switch_radius_sq);
      cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_sq,
                                       &far_switch_radius_sq_f, sizeof(float)));
    }
    if (boxz > 0.0) {
      host_boxz = static_cast<float>(boxz);
      auto const uz = 1.0f / host_boxz;
      cuda_safe_mem(cudaMemcpyToSymbol(::boxz, &host_boxz, sizeof(float)));
      cuda_safe_mem(cudaMemcpyToSymbol(::uz, &uz, sizeof(float)));
    }
    if (prefactor != 0.0) {
      this->prefactor = prefactor;
      auto const prefactor_f = static_cast<float>(prefactor);
      cuda_safe_mem(
          cudaMemcpyToSymbol(::coulomb_prefactor, &prefactor_f, sizeof(float)));
    }
    if (bessel_cutoff > 0) {
      this->bessel_cutoff = bessel_cutoff;
      cuda_safe_mem(
          cudaMemcpyToSymbol(::bessel_cutoff, &bessel_cutoff, sizeof(int)));
    }
    if (maxPWerror > 0.0) {
      this->maxPWerror = maxPWerror;
      auto const maxPWerror_f = static_cast<float>(maxPWerror);
      cuda_safe_mem(
          cudaMemcpyToSymbol(::maxPWerror, &maxPWerror_f, sizeof(float)));
    }
  }
  m_is_tuned = false;
}

__global__ void forcesKernel(float const *const __restrict__ r,
                             float const *const __restrict__ q,
                             float *const __restrict__ force, std::size_t N,
                             int pairs) {

  constexpr auto c_2pif = 2.f * Utils::pi<float>();
  auto const tStop = Utils::sqr(N);

  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
       tid += blockDim.x * gridDim.x) {
    auto const p1 = tid % N, p2 = tid / N;
    auto x = r[3 * p2 + 0] - r[3 * p1 + 0];
    auto y = r[3 * p2 + 1] - r[3 * p1 + 1];
    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
    auto const rxy2 = sqpow(x) + sqpow(y);
    auto rxy = sqrt(rxy2);
    auto sum_r = 0.f;
    auto sum_z = 0.f;

    while (fabs(z) > *boxz / 2.f) // make sure we take the shortest distance
      z -= (z > 0.f ? 1.f : -1.f) * *boxz;

    if (p1 == p2) {
      // particle exerts no force on itself
      rxy = 1.f; // so the division at the end doesn't fail with NaN
                 // (sum_r is 0 anyway)
    } else if (rxy2 <= *far_switch_radius_sq) {
      // near formula
      auto const uzz = *uz * z;
      auto const uzr = *uz * rxy;
      sum_z = dev_mod_psi_odd(0, uzz);
      auto uzrpow = uzr;
      for (int n = 1; n < *device_n_modPsi; n++) {
        auto const sum_r_old = sum_r;
        auto const mpe = dev_mod_psi_even(n, uzz);
        auto const mpo = dev_mod_psi_odd(n, uzz);

        sum_r += 2 * static_cast<float>(n) * mpe * uzrpow;
        uzrpow *= uzr;
        sum_z += mpo * uzrpow;
        uzrpow *= uzr;

        if (fabs(sum_r_old - sum_r) < *maxPWerror)
          break;
      }

      sum_r *= sqpow(*uz);
      sum_z *= sqpow(*uz);

      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z)));
      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z + *boxz)));
      sum_r += rxy * cbpow(rsqrt(rxy2 + sqpow(z - *boxz)));

      sum_z += z * cbpow(rsqrt(rxy2 + sqpow(z)));
      sum_z += (z + *boxz) * cbpow(rsqrt(rxy2 + sqpow(z + *boxz)));
      sum_z += (z - *boxz) * cbpow(rsqrt(rxy2 + sqpow(z - *boxz)));

      if (rxy == 0.f) {
        // particles at the same radial position only exert a force
        // in z direction
        rxy = 1.f; // so the division at the end doesn't fail with NaN
                   // (sum_r is 0 anyway)
      }
    } else {
      // far formula
      for (int p = 1; p < *bessel_cutoff; p++) {
        float arg = c_2pif * *uz * static_cast<float>(p);
        sum_r += static_cast<float>(p) * dev_K1(arg * rxy) * cos(arg * z);
        sum_z += static_cast<float>(p) * dev_K0(arg * rxy) * sin(arg * z);
      }
      sum_r *= sqpow(*uz) * 4.f * c_2pif;
      sum_z *= sqpow(*uz) * 4.f * c_2pif;
      sum_r += 2.f * *uz / rxy;
    }

    auto const pref = *coulomb_prefactor * q[p1] * q[p2];
    if (pairs) {
      force[3 * (p1 + p2 * N) + 0] = pref * sum_r / rxy * x;
      force[3 * (p1 + p2 * N) + 1] = pref * sum_r / rxy * y;
      force[3 * (p1 + p2 * N) + 2] = pref * sum_z;
    } else {
      atomicAdd(&force[3 * p2 + 0], pref * sum_r / rxy * x);
      atomicAdd(&force[3 * p2 + 1], pref * sum_r / rxy * y);
      atomicAdd(&force[3 * p2 + 2], pref * sum_z);
    }
  }
}
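
// The branch structure above follows MMM1D: the z component is first folded
// back into the primary image (minimum-image convention along the periodic
// axis). Within the switching radius (rxy2 <= far_switch_radius_sq) the "near"
// formula is used, i.e. the polygamma Taylor series in uz*z and uz*rxy plus
// the three nearest real-space images along z; beyond it, the "far" formula
// sums Bessel terms K1/K0(2*pi*p*uz*rxy) weighted by cos/sin(2*pi*p*uz*z) up
// to the Bessel cutoff. The early break on |sum_r_old - sum_r| < maxPWerror
// truncates the near series once it has converged to the requested accuracy.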

__global__ void energiesKernel(float const *const __restrict__ r,
                               float const *const __restrict__ q,
                               float *const __restrict__ energy, std::size_t N,
                               int pairs) {

  constexpr auto c_2pif = 2.f * Utils::pi<float>();
  constexpr auto c_gammaf = Utils::gamma<float>();
  auto const tStop = Utils::sqr(N);

  extern __shared__ float partialsums[];
  if (!pairs) {
    partialsums[threadIdx.x] = 0;
    __syncthreads();
  }
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < tStop;
       tid += blockDim.x * gridDim.x) {
    auto const p1 = tid % N, p2 = tid / N;
    auto z = r[3 * p2 + 2] - r[3 * p1 + 2];
    auto const rxy2 = sqpow(r[3 * p2 + 0] - r[3 * p1 + 0]) +
                      sqpow(r[3 * p2 + 1] - r[3 * p1 + 1]);
    auto rxy = sqrt(rxy2);
    auto sum_e = 0.f;

    while (fabs(z) > *boxz / 2.f) // make sure we take the shortest distance
      z -= (z > 0.f ? 1.f : -1.f) * *boxz;

    if (p1 == p2) // particle exerts no force on itself
    {
    } else if (rxy2 <= *far_switch_radius_sq) // near formula
    {
      auto const uzz = *uz * z;
      auto const uzr2 = sqpow(*uz * rxy);
      auto uzrpow = uzr2;
      sum_e = dev_mod_psi_even(0, uzz);
      for (int n = 1; n < *device_n_modPsi; n++) {
        auto const sum_e_old = sum_e;
        auto const mpe = dev_mod_psi_even(n, uzz);
        sum_e += mpe * uzrpow;
        uzrpow *= uzr2;

        if (fabs(sum_e_old - sum_e) < *maxPWerror)
          break;
      }

      sum_e *= -1.f * *uz;
      sum_e -= 2.f * *uz * c_gammaf;
      sum_e += rsqrt(rxy2 + sqpow(z));
      sum_e += rsqrt(rxy2 + sqpow(z + *boxz));
      sum_e += rsqrt(rxy2 + sqpow(z - *boxz));
    } else // far formula
    {
      sum_e = -(log(rxy * *uz / 2.f) + c_gammaf) / 2.f;
      for (int p = 1; p < *bessel_cutoff; p++) {
        auto const arg = c_2pif * *uz * static_cast<float>(p);
        sum_e += dev_K0(arg * rxy) * cos(arg * z);
      }
      sum_e *= *uz * 4.f;
    }

    if (pairs) {
      energy[p1 + p2 * N] = *coulomb_prefactor * q[p1] * q[p2] * sum_e;
    } else {
      partialsums[threadIdx.x] += *coulomb_prefactor * q[p1] * q[p2] * sum_e;
    }
  }
  if (!pairs) {
    sumReduction(partialsums, &energy[blockIdx.x]);
  }
}

__global__ void vectorReductionKernel(float const *src, float *dst,
                                      std::size_t N) {

  auto const tStop = Utils::sqr(N);

  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
       tid += blockDim.x * gridDim.x) {
    auto const offset = tid % N;
    for (std::size_t i = 0; tid + i * N < tStop; i++) {
#pragma unroll 3
      for (std::size_t d = 0; d < 3; d++) {
        dst[3 * offset + d] -= src[3 * (tid + i * N) + d];
      }
    }
  }
}
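
// vectorReductionKernel() turns the N x N pair table written by forcesKernel()
// in "pairs" mode into per-particle forces. The entry at index (p1 + p2 * N)
// stores the force exerted by particle p1 on particle p2 (the same value the
// atomicAdd() branch adds to p2 directly), so for each particle the kernel
// subtracts the forces that particle exerts on all others, i.e. it accumulates
// the reaction forces via Newton's third law.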

void CoulombMMM1DGpu::add_long_range_forces() {
  setup();

  if (pairs < 0) {
    throw std::runtime_error("MMM1D was not initialized correctly");
  }

  auto &gpu = get_system().gpu;
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const forces_device = gpu.get_particle_forces_device();
  auto const n_part = gpu.n_particles();
  if (pairs) {
    // if we calculate force pairs, we need to reduce them to forces
    auto const numBlocksRed = numBlocksOps(n_part);
    KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
               charges_device, dev_forcePairs, n_part, pairs)
    KERNELCALL(vectorReductionKernel, numBlocksRed, numThreads, dev_forcePairs,
               forces_device, n_part)
  } else {
    KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
               charges_device, forces_device, n_part, pairs)
  }
}

__global__ void scaleAndAddKernel(float *const dst, float const *const src,
                                  std::size_t N, float factor) {
  for (std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
       tid += blockDim.x * gridDim.x) {
    dst[tid] += src[tid] * factor;
  }
}

void CoulombMMM1DGpu::add_long_range_energy() {
  setup();

  if (pairs < 0) {
    throw std::runtime_error("MMM1D was not initialized correctly");
  }

  auto &gpu = get_system().gpu;
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const energy_device = gpu.get_energy_device();
  auto const n_part = gpu.n_particles();
  auto const shared = numThreads * static_cast<unsigned>(sizeof(float));
  KERNELCALL_shared(energiesKernel, numBlocks(n_part), numThreads, shared,
                    positions_device, charges_device, dev_energyBlocks, n_part,
                    0);
  KERNELCALL_shared(sumKernel, 1, numThreads, shared, dev_energyBlocks,
                    numBlocks(n_part));
  // we count every interaction twice, so halve the total energy
  auto constexpr factor = 0.5f;
  KERNELCALL(scaleAndAddKernel, 1, 1, &(energy_device->coulomb),
             dev_energyBlocks, 1, factor);
}

float CoulombMMM1DGpu::force_benchmark() {
  cudaEvent_t eventStart, eventStop;
  float elapsedTime;
  float *dev_f_benchmark;

  auto &gpu = get_system().gpu;
  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const n_part = gpu.n_particles();
  cuda_safe_mem(
      cudaMalloc((void **)&dev_f_benchmark, 3ul * n_part * sizeof(float)));
  cuda_safe_mem(cudaEventCreate(&eventStart));
  cuda_safe_mem(cudaEventCreate(&eventStop));
  cuda_safe_mem(cudaEventRecord(eventStart, stream[0]));
  KERNELCALL(forcesKernel, numBlocks(n_part), numThreads, positions_device,
             charges_device, dev_f_benchmark, n_part, 0)
  cuda_safe_mem(cudaEventRecord(eventStop, stream[0]));
  cuda_safe_mem(cudaEventSynchronize(eventStop));
  cuda_safe_mem(cudaEventElapsedTime(&elapsedTime, eventStart, eventStop));
  cuda_safe_mem(cudaEventDestroy(eventStart));
  cuda_safe_mem(cudaEventDestroy(eventStop));
  cuda_safe_mem(cudaFree(dev_f_benchmark));

  return elapsedTime;
}

#endif // MMM1D_GPU