ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
p3m_heffte.impl.hpp
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2025 The ESPResSo project
3 * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
4 * Max-Planck-Institute for Polymer Research, Theory Group
5 *
6 * This file is part of ESPResSo.
7 *
8 * ESPResSo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * ESPResSo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22#include <config/config.hpp>
23
24#ifdef ESPRESSO_P3M
25
28
31#ifdef ESPRESSO_CUDA
34#endif // ESPRESSO_CUDA
36
37#include "electrostatics/p3m_heffte.hpp" // must be included after coulomb.hpp
38
39#include "p3m/P3MFFT.hpp"
41#include "p3m/TuningLogger.hpp"
43#include "p3m/for_each_3d.hpp"
45#include "p3m/math.hpp"
46
47#include "BoxGeometry.hpp"
48#include "LocalBox.hpp"
49#include "Particle.hpp"
51#include "ParticleRange.hpp"
52#include "PropagationMode.hpp"
53#include "actor/visitors.hpp"
54#include "aosoa_pack.hpp"
58#include "communication.hpp"
59#include "errorhandling.hpp"
61#include "npt.hpp"
62#include "p3m/send_mesh.hpp"
65#include "system/System.hpp"
66#include "tuning.hpp"
67
68#include <utils/Vector.hpp>
71#include <utils/math/sqr.hpp>
73
74#include <boost/mpi/collectives/all_reduce.hpp>
75#include <boost/mpi/collectives/broadcast.hpp>
76#include <boost/mpi/collectives/reduce.hpp>
77#include <boost/mpi/communicator.hpp>
78#include <boost/range/combine.hpp>
79#include <boost/range/numeric.hpp>
80
81#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
82#include <Kokkos_Core.hpp>
83#include <omp.h>
84#endif
85
86#include <algorithm>
87#include <array>
88#include <cassert>
89#include <complex>
90#include <cstddef>
91#include <functional>
92#include <initializer_list>
93#include <numbers>
94#include <optional>
95#include <span>
96#include <sstream>
97#include <stdexcept>
98#include <string>
99#include <tuple>
100#include <utility>
101#include <vector>
102
103template <typename FloatType>
104std::complex<FloatType>
105multiply_complex_by_imaginary(std::complex<FloatType> const &z, FloatType k) {
106 // Perform the multiplication manually: (re + i*imag) * (i*k)
107 return std::complex<FloatType>(-z.imag() * k, z.real() * k);
108}
109
110template <typename FloatType>
111std::complex<FloatType>
112multiply_complex_by_real(std::complex<FloatType> const &z, FloatType k) {
113 // Perform the multiplication manually: (re + i*imag) * k
114 return std::complex<FloatType>(z.real() * k, z.imag() * k);
115}
116
118 Utils::Vector3i const &mesh) {
119 return mesh[0u] % node_grid[0u] == 0 and mesh[1u] % node_grid[1u] == 0 and
120 mesh[2u] % node_grid[2u] == 0;
121}
122
123template <typename FloatType, Arch Architecture, class FFTConfig>
124void CoulombP3MHeffte<FloatType, Architecture,
125 FFTConfig>::count_charged_particles() {
126 struct Res {
127 std::size_t local_n = std::size_t{0u};
128 double local_q = 0.0;
129 double local_q2 = 0.0;
130 };
131 Reduction::AddPartialResultKernel<Res> kernel = [](Res &acc, auto const &p) {
132 if (p.q() != 0.0) {
133 acc.local_n++;
134 acc.local_q2 += Utils::sqr(p.q());
135 acc.local_q += p.q();
136 }
137 };
138
139 Reduction::ReductionOp<Res> reduce = [](Res &a, Res const &b) {
140 a.local_n += b.local_n;
141 a.local_q += b.local_q;
142 a.local_q2 += b.local_q2;
143 };
144 auto res = reduce_over_local_particles(*(get_system().cell_structure), kernel,
145 reduce);
146
147 boost::mpi::all_reduce(comm_cart, res.local_n, p3m.sum_qpart, std::plus<>());
148 boost::mpi::all_reduce(comm_cart, res.local_q2, p3m.sum_q2, std::plus<>());
149 boost::mpi::all_reduce(comm_cart, res.local_q, p3m.square_sum_q,
150 std::plus<>());
151 p3m.square_sum_q = Utils::sqr(p3m.square_sum_q);
152}
153
154/** Calculate the optimal influence function of @cite hockney88a.
155 * (optimised for force calculations)
156 *
157 * Each node calculates only the values for its domain in k-space.
158 *
159 * See also: @cite hockney88a eq. 8-22 (p. 275). Note the somewhat
160 * different convention for the prefactors, which is described in
161 * @cite deserno98a @cite deserno98b.
162 */
163template <typename FloatType, Arch Architecture, class FFTConfig>
164void CoulombP3MHeffte<FloatType, Architecture,
165 FFTConfig>::calc_influence_function_force() {
166 p3m.g_force = grid_influence_function<FloatType, 1, P3M_BRILLOUIN,
167 FFTConfig::k_space_order>(
168 p3m.params, p3m.fft->ks_local_ld_index(), p3m.fft->ks_local_ur_index(),
169 get_system().box_geo->length_inv());
170 if constexpr (FFTConfig::use_r2c) {
171 influence_function_r2c<FFTConfig::r2c_dir>(p3m.g_force, p3m.params.mesh,
172 p3m.fft->ks_local_size(),
173 p3m.fft->ks_local_ld_index());
174 }
175}
176
177/** Calculate the influence function optimized for the energy and the
178 * self energy correction.
179 */
180template <typename FloatType, Arch Architecture, class FFTConfig>
181void CoulombP3MHeffte<FloatType, Architecture,
182 FFTConfig>::calc_influence_function_energy() {
183 p3m.g_energy = grid_influence_function<FloatType, 0, P3M_BRILLOUIN,
184 FFTConfig::k_space_order>(
185 p3m.params, p3m.fft->ks_local_ld_index(), p3m.fft->ks_local_ur_index(),
186 get_system().box_geo->length_inv());
187 if constexpr (FFTConfig::use_r2c) {
188 influence_function_r2c<FFTConfig::r2c_dir>(p3m.g_energy, p3m.params.mesh,
189 p3m.fft->ks_local_size(),
190 p3m.fft->ks_local_ld_index());
191 }
192}
193
194/** Aliasing sum used by @ref p3m_k_space_error. */
196 Utils::Vector3i const &mesh,
197 Utils::Vector3d const &mesh_i, int cao,
198 double alpha_L_i) {
199
200 auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
201 auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
202 auto constexpr exp_min = -708.4; // for IEEE-compatible double
203 auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
204 auto alias1 = 0.;
205 auto alias2 = 0.;
206
207 Utils::Vector3i indices{};
208 Utils::Vector3i nm{};
209 Utils::Vector3d fnm{};
211 mesh_start, mesh_stop, indices,
212 [&]() {
213 auto const norm_sq = nm.norm2();
214 auto const exponent = -factor1 * norm_sq;
215 auto const exp_limit = (exp_min + std::log(norm_sq)) / 2.;
216 auto const ex = (exponent < exp_limit) ? 0. : std::exp(exponent);
217 auto const energy = std::pow(Utils::product(fnm), 2 * cao);
218 alias1 += Utils::sqr(ex) / norm_sq;
219 alias2 += energy * ex * (shift * nm) / norm_sq;
220 },
221 [&](unsigned dim, int n) {
222 nm[dim] = shift[dim] + n * mesh[dim];
223 fnm[dim] = math::sinc(nm[dim] * mesh_i[dim]);
224 });
225
226 return std::make_pair(alias1, alias2);
227}
228
229/** Calculate the real space contribution to the rms error in the force (as
230 * described by Kolafa and Perram).
231 * \param pref Prefactor of Coulomb interaction.
232 * \param r_cut_iL rescaled real space cutoff for p3m method.
233 * \param n_c_part number of charged particles in the system.
234 * \param sum_q2 sum of square of charges in the system
235 * \param alpha_L rescaled Ewald splitting parameter.
236 * \param box_l box dimensions.
237 * \return real space error
238 */
239inline double p3m_real_space_error(double pref, double r_cut_iL,
240 std::size_t n_c_part, double sum_q2,
241 double alpha_L,
242 Utils::Vector3d const &box_l) {
243 auto const volume = Utils::product(box_l);
244 return (2. * pref * sum_q2 * exp(-Utils::sqr(r_cut_iL * alpha_L))) /
245 sqrt(static_cast<double>(n_c_part) * r_cut_iL * box_l[0] * volume);
246}
247
248/** Calculate the analytic expression of the error estimate for the
249 * P3M method in @cite hockney88a (eq. 8-23 p. 275) in
250 * order to obtain the rms error in the force for a system of N
251 * randomly distributed particles in a cubic box (k-space part).
252 * \param pref Prefactor of Coulomb interaction.
253 * \param mesh number of mesh points in one direction.
254 * \param cao charge assignment order.
255 * \param n_c_part number of charged particles in the system.
256 * \param sum_q2 sum of square of charges in the system
257 * \param alpha_L rescaled Ewald splitting parameter.
258 * \param box_l box dimensions.
259 * \return reciprocal (k) space error
260 */
261inline double p3m_k_space_error(double pref, Utils::Vector3i const &mesh,
262 int cao, std::size_t n_c_part, double sum_q2,
263 double alpha_L, Utils::Vector3d const &box_l) {
264
265 auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
266 auto const mesh_i = 1. / Utils::Vector3d(mesh);
267 auto const alpha_L_i = 1. / alpha_L;
268 auto const mesh_stop = mesh / 2;
269 auto const mesh_start = -mesh_stop;
270 auto indices = Utils::Vector3i{};
271 auto values = Utils::Vector3d{};
272 auto he_q = 0.;
273
275 mesh_start, mesh_stop, indices,
276 [&]() {
277 if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
278 auto const n2 = indices.norm2();
279 auto const cs = Utils::product(values);
280 auto const [alias1, alias2] =
281 p3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
282 auto const d = alias1 - Utils::sqr(alias2 / cs) / n2;
283 /* at high precision, d can become negative due to extinction;
284 also, don't take values that have no significant digits left*/
285 if (d > 0. and std::fabs(d / alias1) > round_error_prec) {
286 he_q += d;
287 }
288 }
289 },
290 [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
291 values[dim] = cotangent_sum(n, mesh_i[dim]);
292 });
293
294 return 2. * pref * sum_q2 * sqrt(he_q / static_cast<double>(n_c_part)) /
295 (box_l[1] * box_l[2]);
296}
297
298template <typename FloatType, Arch Architecture, class FFTConfig>
300 assert(p3m.params.mesh >= Utils::Vector3i::broadcast(1));
301 assert(p3m.params.cao >= p3m_min_cao and p3m.params.cao <= p3m_max_cao);
302 assert(p3m.params.alpha > 0.);
303
304 auto const &system = get_system();
305 auto const &box_geo = *system.box_geo;
306 auto const &local_geo = *system.local_geo;
307 auto const skin = system.cell_structure->get_verlet_skin();
308
309 p3m.params.cao3 = Utils::int_pow<3>(p3m.params.cao);
310 p3m.params.recalc_a_ai_cao_cut(box_geo.length());
311
312 sanity_checks();
313
314 auto const &solver = system.coulomb.impl->solver;
315 double elc_layer = 0.;
316 if (auto actor = get_actor_by_type<ElectrostaticLayerCorrection>(solver)) {
317 elc_layer = actor->elc.space_layer;
318 }
319
320 p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo, skin, elc_layer);
321 p3m.fft = std::make_shared<P3MFFT<FloatType, FFTConfig>>(
322 ::comm_cart, p3m.params.mesh, p3m.local_mesh.ld_no_halo,
323 p3m.local_mesh.ur_no_halo, ::communicator.node_grid);
324 auto const rs_array_size =
325 static_cast<std::size_t>(Utils::product(p3m.local_mesh.dim));
326 auto const rs_array_size_no_halo =
327 static_cast<std::size_t>(Utils::product(p3m.local_mesh.dim_no_halo));
328 auto const fft_mesh_size =
329 static_cast<std::size_t>(Utils::product(p3m.fft->ks_local_size()));
330 p3m.rs_charge_density.resize(rs_array_size);
331 p3m.ks_charge_density.resize(fft_mesh_size);
332 for (auto d : {0u, 1u, 2u}) {
333 p3m.ks_E_fields[d].resize(fft_mesh_size);
334 p3m.rs_E_fields[d].resize(rs_array_size);
335 p3m.rs_E_fields_no_halo[d].resize(rs_array_size_no_halo);
336 }
337 p3m.calc_differential_operator();
338
339 /* fix box length dependent constants */
340 scaleby_box_l();
341
342 count_charged_particles();
343}
344
345namespace {
346template <int cao> struct AssignCharge {
347 void operator()(auto &p3m, double q,
348 InterpolationWeights<cao> const &weights) {
349 using CoulombP3MState = std::remove_reference_t<decltype(p3m)>;
350 using value_type = CoulombP3MState::value_type;
351 p3m_interpolate(p3m.local_mesh, weights, [q, &p3m](int ind, double w) {
352 p3m.rs_charge_density[ind] += value_type(w * q);
353 });
354 }
355
356 void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos,
357 p3m_interpolation_cache &inter_weights) {
358 auto constexpr memory_order = Utils::MemoryOrder::ROW_MAJOR;
359 auto const weights = p3m_calculate_interpolation_weights<cao, memory_order>(
360 real_pos, p3m.params.ai, p3m.local_mesh);
361 inter_weights.store(weights);
362 this->operator()(p3m, q, weights);
363 }
364
365 void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos) {
366 auto constexpr memory_order = Utils::MemoryOrder::ROW_MAJOR;
367 auto const weights = p3m_calculate_interpolation_weights<cao, memory_order>(
368 real_pos, p3m.params.ai, p3m.local_mesh);
369 this->operator()(p3m, q, weights);
370 }
371
372#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
373 void operator()(auto &p3m, auto &cell_structure) {
374 using CoulombP3MState = std::remove_reference_t<decltype(p3m)>;
375 using value_type = CoulombP3MState::value_type;
376 auto constexpr memory_order = Utils::MemoryOrder::ROW_MAJOR;
377 auto const &aosoa = cell_structure.get_aosoa();
378 auto const n_part = cell_structure.count_local_particles();
379 p3m.inter_weights.zfill(n_part); // allocate buffer for parallel write
381 "InterpolateCharges", std::size_t{0u}, n_part, [&](auto p_index) {
382 auto const tid = omp_get_thread_num();
383 auto const pos = aosoa.get_vector_at(aosoa.position, p_index);
384 auto const q = aosoa.charge(p_index);
385 auto const weights =
386 p3m_calculate_interpolation_weights<cao, memory_order>(
387 pos, p3m.params.ai, p3m.local_mesh);
388 p3m.inter_weights.store_at(p_index, weights);
390 p3m.local_mesh, weights, [&, tid, q](int ind, double w) {
391 p3m.rs_charge_density_kokkos(tid, ind) += value_type(w * q);
392 });
393 });
394 Kokkos::fence();
395 using execution_space = Kokkos::DefaultExecutionSpace;
396 int num_threads = execution_space().concurrency();
397 Kokkos::RangePolicy<execution_space> policy(std::size_t{0},
398 p3m.local_mesh.size);
399 Kokkos::parallel_for("ReduceInterpolatedCharges", policy,
400 [&p3m, num_threads](std::size_t const i) {
401 value_type acc{};
402 for (int tid = 0; tid < num_threads; ++tid) {
403 acc += p3m.rs_charge_density_kokkos(tid, i);
404 }
405 p3m.rs_charge_density.at(i) += acc;
406 });
407 Kokkos::fence();
408 }
409#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
410 void operator()(auto &p3m, auto const &p_q_pos_range) {
411 for (auto zipped : p_q_pos_range) {
412 auto const p_q = boost::get<0>(zipped);
413 if (p_q != 0.0) {
414 auto const &p_pos = boost::get<1>(zipped);
415 this->operator()(p3m, p_q, p_pos, p3m.inter_weights);
416 }
417 }
418 }
419#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
420};
421} // namespace
422
423template <typename FloatType, Arch Architecture, class FFTConfig>
425 [[maybe_unused]] ParticleRange const &particles) {
426 prepare_fft_mesh(true);
427
428#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
429 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
430 p3m.params.cao, p3m, *get_system().cell_structure);
431#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
432 auto p_q_range = ParticlePropertyRange::charge_range(particles);
433 auto p_pos_range = ParticlePropertyRange::pos_range(particles);
434
435 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
436 p3m.params.cao, p3m, boost::combine(p_q_range, p_pos_range));
437#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
438}
439
440template <typename FloatType, Arch Architecture, class FFTConfig>
442 double q, Utils::Vector3d const &real_pos, bool skip_cache) {
443 if (skip_cache) {
444 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
445 p3m.params.cao, p3m, q, real_pos);
446 } else {
447 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
448 p3m.params.cao, p3m, q, real_pos, p3m.inter_weights);
449 }
450}
451
452namespace {
453template <int cao> struct AssignForces {
454 void operator()(auto &p3m, auto force_prefac,
455#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
456 CellStructure &cell_structure
457#else
458 auto const &p_q_force_range
459#endif
460 ) const {
461
462 assert(cao == p3m.inter_weights.cao());
463
464 auto const kernel = [&p3m](auto pref, auto &p_force, std::size_t p_index) {
465 auto const weights = p3m.inter_weights.template load<cao>(p_index);
466
467 Utils::Vector3d force{};
468 p3m_interpolate(p3m.local_mesh, weights,
469 [&force, &p3m](int ind, double w) {
470 force[0u] += w * double(p3m.rs_E_fields[0u][ind]);
471 force[1u] += w * double(p3m.rs_E_fields[1u][ind]);
472 force[2u] += w * double(p3m.rs_E_fields[2u][ind]);
473 });
474
475#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
476 auto const thread_id = omp_get_thread_num();
477 p_force(p_index, thread_id, 0) -= pref * force[0];
478 p_force(p_index, thread_id, 1) -= pref * force[1];
479 p_force(p_index, thread_id, 2) -= pref * force[2];
480#else
481 p_force -= pref * force;
482#endif
483 };
484
485#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
486 auto const n_part = cell_structure.count_local_particles();
487 auto const &aosoa = cell_structure.get_aosoa();
488 auto &local_force = cell_structure.get_local_force();
490 "AssignForces", std::size_t{0u}, n_part, [&](std::size_t p_index) {
491 if (auto const pref = aosoa.charge(p_index) * force_prefac) {
492 kernel(pref, local_force, p_index);
493 }
494 });
495#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
496 /* charged particle counter */
497 std::size_t p_index{0ul};
498
499 for (auto zipped : p_q_force_range) {
500 auto p_q = boost::get<0>(zipped);
501 if (p_q != 0.) {
502 auto &p_force = boost::get<1>(zipped);
503 kernel(p_q * force_prefac, p_force, p_index);
504 ++p_index;
505 }
506 }
507#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
508 }
509};
510} // namespace
511
512#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
513inline auto calc_dipole_moment(boost::mpi::communicator const &comm,
514 auto const &cs, auto const &box_geo) {
515 auto const local_dip = reduce_over_local_particles<Utils::Vector3d>(
516 cs,
517 [&box_geo](Utils::Vector3d &acc, Particle const &p) {
518 acc += p.q() * box_geo.unfolded_position(p.pos(), p.image_box());
519 },
520 [](Utils::Vector3d &a, Utils::Vector3d const &b) { a = a + b; });
521 return boost::mpi::all_reduce(comm, local_dip, std::plus<>());
522}
523#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
524inline auto calc_dipole_moment(boost::mpi::communicator const &comm,
525 auto const &p_q_unfolded_pos_range) {
526 auto const local_dip =
527 boost::accumulate(p_q_unfolded_pos_range, Utils::Vector3d{},
528 [](Utils::Vector3d const &dip, auto const &q_pos) {
529 auto const p_q = boost::get<0>(q_pos);
530 auto const &p_unfolded_pos = boost::get<1>(q_pos);
531 return dip + p_q * p_unfolded_pos;
532 });
533 return boost::mpi::all_reduce(comm, local_dip, std::plus<>());
534}
535#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
536
537template <typename FloatType, Arch Architecture, class FFTConfig>
538void CoulombP3MHeffte<FloatType, Architecture,
539 FFTConfig>::kernel_ks_charge_density() {
540 // halo communication of real space charge density
541 p3m.halo_comm.gather_grid(comm_cart, p3m.rs_charge_density.data(),
542 p3m.local_mesh.dim);
543
544 // get real space charge density without ghost layers
545 auto charge_density_no_halos =
546 extract_block<Utils::MemoryOrder::ROW_MAJOR, FFTConfig::r_space_order>(
547 p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
548 p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur);
549
550 // Set up the FFT using the Heffte library.
551 // This is in global mesh coordinates without any ghost layers
552 // The memory layout has to be specified, so the parts of
553 // the mesh held by each MPI rank are assembled correctly.
554 p3m.fft->forward(charge_density_no_halos.data(),
555 p3m.ks_charge_density.data());
556}
557
558template <typename FloatType, Arch Architecture, class FFTConfig>
559void CoulombP3MHeffte<FloatType, Architecture,
560 FFTConfig>::kernel_rs_electric_field() {
561 auto const mesh_start = p3m.fft->ks_local_ld_index();
562 auto const mesh_stop = p3m.fft->ks_local_ur_index();
563 auto const &box_geo = *get_system().box_geo;
564
565 // i*k differentiation
566 auto const wavevector =
567 Utils::Vector3<FloatType>((2. * std::numbers::pi) * box_geo.length_inv());
568
569 // compute electric field, Eq. (3.49) @cite deserno00b
570 for_each_3d_lin<FFTConfig::k_space_order>(
571 mesh_start, mesh_stop,
572 [&](Utils::Vector3i const &indices, int local_index) {
573#ifdef ESPRESSO_ADDITIONAL_CHECKS
574 assert(local_index ==
575 Utils::get_linear_index<FFTConfig::k_space_order>(
576 indices - mesh_start, p3m.fft->ks_local_size()));
577#endif
578 auto const phi_hat = multiply_complex_by_real(
579 p3m.ks_charge_density[local_index], p3m.g_force[local_index]);
580
581 for (auto d : {0u, 1u, 2u}) {
582 // wave vector of the current mesh point
583 auto const k = FloatType(p3m.d_op[d][indices[d]]) * wavevector[d];
584 // electric field in k-space
585 p3m.ks_E_fields[d][local_index] =
587 }
588 });
589
590 // back-transform the k-space electric field to real space
591 auto const size = p3m.local_mesh.ur_no_halo - p3m.local_mesh.ld_no_halo;
592 auto const rs_mesh_size_no_halo = Utils::product(size);
593 for (auto d : {0u, 1u, 2u}) {
594 auto k_space = p3m.ks_E_fields[d].data();
595 auto r_space = p3m.rs_E_fields_no_halo[d].data();
596 p3m.fft->backward(k_space, r_space);
597
598 // add zeros around the E-field in real space to make room for ghost layers
599 auto const begin = p3m.rs_E_fields_no_halo[d].begin();
600 p3m.rs_E_fields[d] =
601 pad_with_zeros_discard_imag<FFTConfig::r_space_order,
603 std::span(begin, rs_mesh_size_no_halo), p3m.local_mesh.dim_no_halo,
604 p3m.local_mesh.n_halo_ld, p3m.local_mesh.n_halo_ur);
605 }
606
607 // ghost communicate the boundary layers of the E-field in real space
608 std::array<FloatType *, 3u> rs_fields = {{p3m.rs_E_fields[0u].data(),
609 p3m.rs_E_fields[1u].data(),
610 p3m.rs_E_fields[2u].data()}};
611 p3m.halo_comm.spread_grid(comm_cart, rs_fields, p3m.local_mesh.dim);
612}
613
614/** @details Calculate the long range electrostatics part of the pressure
615 * tensor. This is part \f$\Pi_{\textrm{dir}, \alpha, \beta}\f$ eq. (2.6)
616 * in @cite essmann95a. The part \f$\Pi_{\textrm{corr}, \alpha, \beta}\f$
617 * eq. (2.8) is not present here since M is the empty set in our simulations.
618 */
619template <typename FloatType, Arch Architecture, class FFTConfig>
622 ParticleRange const &particles) {
623 auto const &box_geo = *get_system().box_geo;
624 Utils::Vector9d node_k_space_pressure_tensor{};
625
626 if (p3m.sum_q2 > 0.) {
627 charge_assign(particles);
628 kernel_ks_charge_density();
629
630 auto constexpr r2c_dir = FFTConfig::r2c_dir;
631 auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
632 auto const &global_size = p3m.params.mesh;
633 auto const local_size = p3m.fft->ks_local_size();
634 auto const local_origin = p3m.fft->ks_local_ld_index();
635 auto const half_alpha_inv_sq = Utils::sqr(1. / 2. / p3m.params.alpha);
636 auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
637 auto const cutoff_left = 1 - local_origin[r2c_dir];
638 auto const cutoff_right = global_size[r2c_dir] / 2 - local_origin[r2c_dir];
639 auto local_index = Utils::Vector3i::broadcast(0);
640 auto &short_dim = local_index[r2c_dir];
641 auto diagonal = 0.;
642 std::size_t index = 0u;
643 for_each_3d_order<FFTConfig::k_space_order>(
644 mesh_start, local_size, local_index, [&]() {
645 if (short_dim <= cutoff_right) {
646 auto const global_index = local_index + local_origin;
647 auto const kx = p3m.d_op[0u][global_index[0u]] * wavevector[0u];
648 auto const ky = p3m.d_op[1u][global_index[1u]] * wavevector[1u];
649 auto const kz = p3m.d_op[2u][global_index[2u]] * wavevector[2u];
650 auto const norm_sq =
651 Utils::sqr(kx) + Utils::sqr(ky) + Utils::sqr(kz);
652
653 if (norm_sq != 0.) {
654 auto cell_energy =
655 static_cast<double>(p3m.g_energy[index] *
656 std::norm(p3m.ks_charge_density[index]));
657 if (short_dim >= cutoff_left and short_dim <= cutoff_right - 1) {
658 // k-space symmetry: double counting except in the first and
659 // last planes of the short dimension; although the wavevector
660 // points in the opposite direction in the redundant region of
661 // k-space, the product of two components of the wavevector
662 // cancels out the negative sign
663 cell_energy *= 2.;
664 }
665 auto const vterm = -2. * (1. / norm_sq + half_alpha_inv_sq);
666 auto const pref = cell_energy * vterm;
667 diagonal += cell_energy;
668 node_k_space_pressure_tensor[0u] += pref * kx * kx; /* sigma_xx */
669 node_k_space_pressure_tensor[1u] += pref * kx * ky; /* sigma_xy */
670 node_k_space_pressure_tensor[2u] += pref * kx * kz; /* sigma_xz */
671 node_k_space_pressure_tensor[4u] += pref * ky * ky; /* sigma_yy */
672 node_k_space_pressure_tensor[5u] += pref * ky * kz; /* sigma_yz */
673 node_k_space_pressure_tensor[8u] += pref * kz * kz; /* sigma_zz */
674 }
675 }
676 ++index;
677 });
678
679 node_k_space_pressure_tensor[0u] += diagonal;
680 node_k_space_pressure_tensor[4u] += diagonal;
681 node_k_space_pressure_tensor[8u] += diagonal;
682 node_k_space_pressure_tensor[3u] = node_k_space_pressure_tensor[1u];
683 node_k_space_pressure_tensor[6u] = node_k_space_pressure_tensor[2u];
684 node_k_space_pressure_tensor[7u] = node_k_space_pressure_tensor[5u];
685 }
686
687 return node_k_space_pressure_tensor * prefactor / (2. * box_geo.volume());
688}
689
690template <typename FloatType, Arch Architecture, class FFTConfig>
692 bool force_flag, bool energy_flag, ParticleRange const &particles) {
693
694 auto const &system = get_system();
695 auto const &box_geo = *system.box_geo;
696#ifdef ESPRESSO_NPT
697 auto const npt_flag = force_flag and system.has_npt_enabled();
698#else
699 auto constexpr npt_flag = false;
700#endif
701 if (p3m.sum_qpart == 0u) {
702 return 0.;
703 }
704
705 if (not has_actor_of_type<ElectrostaticLayerCorrection>(
706 system.coulomb.impl->solver)) {
707 charge_assign(particles);
708 }
709
710 kernel_ks_charge_density();
711
712#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
713 auto &cell_structure = *system.cell_structure;
714 auto const &local_force = cell_structure.get_local_force();
715 auto const &aosoa = cell_structure.get_aosoa();
716#else
717 auto p_q_range = ParticlePropertyRange::charge_range(particles);
718 auto p_force_range = ParticlePropertyRange::force_range(particles);
719 auto p_unfolded_pos_range =
721#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
722
723 // The dipole moment is only needed if we don't have metallic boundaries
724 auto const box_dipole = (p3m.params.epsilon != P3M_EPSILON_METALLIC)
725 ? std::make_optional(calc_dipole_moment(
726#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
727 comm_cart, cell_structure, box_geo))
728#else
729 comm_cart,
730 boost::combine(p_q_range,
731 p_unfolded_pos_range)))
732#endif
733 : std::nullopt;
734 auto const volume = box_geo.volume();
735 auto const pref =
736 4. * std::numbers::pi / volume / (2. * p3m.params.epsilon + 1.);
737 auto energy = 0.;
738
739 /* === k-space force calculation === */
740 if (force_flag) {
741 kernel_rs_electric_field();
742
743 // assign particle forces
744 auto const force_prefac = prefactor / volume;
745#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
746 auto &particle_data = cell_structure;
747#else
748 auto const particle_data = boost::combine(p_q_range, p_force_range);
749#endif
750 Utils::integral_parameter<int, AssignForces, p3m_min_cao, p3m_max_cao>(
751 p3m.params.cao, p3m, force_prefac, particle_data);
752
753 // add dipole forces
754 // Eq. (3.19) @cite deserno00b
755 if (box_dipole) {
756 auto const dm = prefactor * pref * box_dipole.value();
757#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
758 auto const n_part = cell_structure.count_local_particles();
760 "AssignForcesBoxDipole", std::size_t{0u}, n_part,
761 [&aosoa, &local_force, dm](auto p_index) {
762 auto const thread_id = omp_get_thread_num();
763 auto const q = aosoa.charge(p_index);
764 local_force(p_index, thread_id, 0) -= q * dm[0];
765 local_force(p_index, thread_id, 1) -= q * dm[1];
766 local_force(p_index, thread_id, 2) -= q * dm[2];
767 });
768#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
769 for (auto zipped : boost::combine(p_q_range, p_force_range)) {
770 auto p_q = boost::get<0>(zipped);
771 auto &p_force = boost::get<1>(zipped);
772 p_force -= p_q * dm;
773 }
774#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
775 }
776 }
777
778 /* === k-space energy calculation === */
779 if (energy_flag or npt_flag) {
780 auto constexpr r2c_dir = FFTConfig::r2c_dir;
781 auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
782 auto const &global_size = p3m.params.mesh;
783 auto const local_size = p3m.fft->ks_local_size();
784 auto const local_origin = p3m.fft->ks_local_ld_index();
785 auto const cutoff_left = 1 - local_origin[r2c_dir];
786 auto const cutoff_right = global_size[r2c_dir] / 2 - local_origin[r2c_dir];
787 auto local_index = Utils::Vector3i::broadcast(0);
788 auto &short_dim = local_index[r2c_dir];
789 auto node_energy = 0.;
790 std::size_t index = 0u;
791 for_each_3d_order<FFTConfig::k_space_order>(
792 mesh_start, local_size, local_index, [&]() {
793 if (short_dim <= cutoff_right) {
794 auto const &cell_field = p3m.ks_charge_density[index];
795 auto cell_energy = static_cast<double>(p3m.g_energy[index] *
796 std::norm(cell_field));
797 if (short_dim >= cutoff_left and short_dim <= cutoff_right - 1) {
798 // leverage symmetry of k-space: double counting except in the
799 // first and last planes of the short dimension
800 cell_energy += cell_energy;
801 }
802 node_energy += cell_energy;
803 }
804 ++index;
805 });
806 node_energy /= 2. * volume;
807
808 // add up energy contributions from all mpi ranks
809 boost::mpi::reduce(::comm_cart, node_energy, energy, std::plus<>(), 0);
810 if (this_node == 0) {
811 /* self energy correction */
812 // Eq. (3.8) @cite deserno00b
813 energy -= p3m.sum_q2 * p3m.params.alpha * std::numbers::inv_sqrtpi;
814 /* net charge correction */
815 // Eq. (3.11) @cite deserno00b
816 energy -= p3m.square_sum_q * std::numbers::pi /
817 (2. * volume * Utils::sqr(p3m.params.alpha));
818 /* dipole correction */
819 // Eq. (3.9) @cite deserno00b
820 if (box_dipole) {
821 energy += pref * box_dipole.value().norm2();
822 }
823 }
824 energy *= prefactor;
825#ifdef ESPRESSO_NPT
826 if (npt_flag) {
827 get_system().npt_add_virial_contribution(energy);
828 }
829#endif
830 if (not energy_flag) {
831 energy = 0.;
832 }
833 }
834
835 return energy;
836}
837
838template <typename FloatType, Arch Architecture, class FFTConfig>
842 double m_mesh_density_min = -1., m_mesh_density_max = -1.;
843 // indicates if mesh should be tuned
844 bool m_tune_mesh = false;
845 std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
846
847protected:
848 P3MParameters &get_params() override { return p3m.params; }
849
850 constexpr std::tuple<int, int, int> get_memory_layout() const {
851 auto constexpr memory_order = FFTConfig::k_space_order;
852 if constexpr (memory_order == Utils::MemoryOrder::COLUMN_MAJOR) {
853 return {2, 1, 0};
854 }
855 return {0, 1, 2};
856 }
857
858public:
859 CoulombTuningAlgorithm(System::System &system, auto &input_p3m,
860 double prefactor, int timings,
861 decltype(m_tune_limits) tune_limits)
862 : TuningAlgorithm(system, prefactor, timings), p3m{input_p3m},
863 m_tune_limits{std::move(tune_limits)} {}
864
865 void on_solver_change() const override { m_system.on_coulomb_change(); }
866
867 void setup_logger(bool verbose) override {
868 auto const &box_geo = *m_system.box_geo;
869#ifdef ESPRESSO_CUDA
870 auto const on_gpu = Architecture == Arch::CUDA;
871#else
872 auto const on_gpu = false;
873#endif
874 m_logger = std::make_unique<TuningLogger>(
875 verbose and this_node == 0, (on_gpu) ? "CoulombP3MGPU" : "CoulombP3M",
877 m_logger->tuning_goals(p3m.params.accuracy, m_prefactor,
878 box_geo.length()[0], p3m.sum_qpart, p3m.sum_q2);
879 m_logger->log_tuning_start();
880 }
881
882 std::optional<std::string>
883 layer_correction_veto_r_cut(double r_cut) const override {
884 auto const &solver = m_system.coulomb.impl->solver;
885 if (auto actor = get_actor_by_type<ElectrostaticLayerCorrection>(solver)) {
886 return actor->veto_r_cut(r_cut);
887 }
888 return {};
889 }
890
891 std::optional<std::string> fft_decomposition_veto(
892 Utils::Vector3i const &mesh_size_r_space) const override {
893#ifdef ESPRESSO_CUDA
894 if constexpr (Architecture == Arch::CUDA) {
895 return std::nullopt;
896 }
897#endif
898 auto const [KX, KY, KZ] = get_memory_layout();
899 auto valid_decomposition = false;
900 // calculate box size in k-space
901 Utils::Vector3i mesh_size_k_space = {};
902 boost::mpi::reduce(
903 ::comm_cart, p3m.fft->ks_local_ur_index(), mesh_size_k_space,
904 [](Utils::Vector3i const &lhs, Utils::Vector3i const &rhs) {
905 return Utils::Vector3i{{std::max(lhs[0u], rhs[0u]),
906 std::max(lhs[1u], rhs[1u]),
907 std::max(lhs[2u], rhs[2u])}};
908 },
909 0);
910 if constexpr (FFTConfig::use_r2c) {
911 // adjust for reduced dimension
912 mesh_size_k_space[FFTConfig::r2c_dir] -= 1;
913 mesh_size_k_space[FFTConfig::r2c_dir] *= 2;
914 }
915 // check consistency with box size in real-space
916 if (::this_node == 0) {
917 auto const &node_grid = ::communicator.node_grid;
918 valid_decomposition =
919 (mesh_size_r_space[0u] == mesh_size_k_space[KX] and
920 mesh_size_r_space[1u] == mesh_size_k_space[KY] and
921 mesh_size_r_space[2u] == mesh_size_k_space[KZ] and
922 is_node_grid_compatible_with_mesh(node_grid, mesh_size_r_space));
923 }
924 boost::mpi::broadcast(::comm_cart, valid_decomposition, 0);
925 std::optional<std::string> retval{"conflict with FFT domain decomposition"};
926 if (valid_decomposition) {
927 retval = std::nullopt;
928 }
929 return retval;
930 }
931
932 std::tuple<double, double, double, double>
934 double r_cut_iL) const override {
935
936 auto const &box_geo = *m_system.box_geo;
937 double alpha_L, rs_err, ks_err;
938
939 /* calc maximal real space error for setting */
940 rs_err = p3m_real_space_error(m_prefactor, r_cut_iL, p3m.sum_qpart,
941 p3m.sum_q2, 0., box_geo.length());
942
943 if (std::numbers::sqrt2 * rs_err > p3m.params.accuracy) {
944 /* assume rs_err = ks_err -> rs_err = accuracy/sqrt(2.0) -> alpha_L */
945 alpha_L = sqrt(log(std::numbers::sqrt2 * rs_err / p3m.params.accuracy)) /
946 r_cut_iL;
947 } else {
948 /* even alpha=0 is ok, however, we cannot choose it since it kills the
949 k-space error formula.
950 Anyways, this very likely NOT the optimal solution */
951 alpha_L = 0.1;
952 }
953
954 /* calculate real-space and k-space error for this alpha_L */
955 rs_err = p3m_real_space_error(m_prefactor, r_cut_iL, p3m.sum_qpart,
956 p3m.sum_q2, alpha_L, box_geo.length());
957#ifdef ESPRESSO_CUDA
958 if constexpr (Architecture == Arch::CUDA) {
959 if (this_node == 0) {
960 ks_err =
961 p3m_k_space_error_gpu(m_prefactor, mesh.data(), cao, p3m.sum_qpart,
962 p3m.sum_q2, alpha_L, box_geo.length().data());
963 }
964 boost::mpi::broadcast(comm_cart, ks_err, 0);
965 } else
966#endif
967 ks_err = p3m_k_space_error(m_prefactor, mesh, cao, p3m.sum_qpart,
968 p3m.sum_q2, alpha_L, box_geo.length());
969
970 return {Utils::Vector2d{rs_err, ks_err}.norm(), rs_err, ks_err, alpha_L};
971 }
972
973 void determine_mesh_limits() override {
974 auto const &box_geo = *m_system.box_geo;
975 auto const mesh_density =
976 static_cast<double>(p3m.params.mesh[0]) * box_geo.length_inv()[0];
977
978 if (p3m.params.mesh == Utils::Vector3i::broadcast(-1)) {
979 /* avoid using more than 1 GB of FFT arrays */
980 auto const normalized_box_dim = std::cbrt(box_geo.volume());
981 auto const max_npart_per_dim = 512.;
982 /* simple heuristic to limit the tried meshes if the accuracy cannot
983 be obtained with smaller meshes, but normally not all these
984 meshes have to be tested */
985 auto const min_npart_per_dim = std::min(
986 max_npart_per_dim, std::cbrt(static_cast<double>(p3m.sum_qpart)));
987 m_mesh_density_min = min_npart_per_dim / normalized_box_dim;
988 m_mesh_density_max = max_npart_per_dim / normalized_box_dim;
989 if (m_tune_limits.first or m_tune_limits.second) {
990 auto const &box_l = box_geo.length();
991 auto const dim = std::max({box_l[0], box_l[1], box_l[2]});
992 if (m_tune_limits.first) {
993 m_mesh_density_min = static_cast<double>(*m_tune_limits.first) / dim;
994 }
995 if (m_tune_limits.second) {
996 m_mesh_density_max = static_cast<double>(*m_tune_limits.second) / dim;
997 }
998 }
999 m_tune_mesh = true;
1000 } else {
1001 m_mesh_density_min = m_mesh_density_max = mesh_density;
1002 assert(p3m.params.mesh[0] >= 1);
1003 if (p3m.params.mesh[1] == -1 and p3m.params.mesh[2] == -1) {
1004 // determine the two missing values by rescaling by the box length
1005 for (auto i : {1u, 2u}) {
1006 p3m.params.mesh[i] =
1007 static_cast<int>(std::round(mesh_density * box_geo.length()[i]));
1008 // make the mesh even in all directions
1009 p3m.params.mesh[i] += p3m.params.mesh[i] % 2;
1010 }
1011 }
1012 m_logger->report_fixed_mesh(p3m.params.mesh);
1013 }
1014 }
1015
1017 auto const &box_geo = *m_system.box_geo;
1018 auto const &solver = m_system.coulomb.impl->solver;
1019 auto tuned_params = TuningAlgorithm::Parameters{};
1020 auto time_best = time_sentinel;
1021 auto mesh_density = m_mesh_density_min;
1022 auto current_mesh = p3m.params.mesh;
1023 if (m_tune_mesh) {
1024 for (auto i : {0u, 1u, 2u}) {
1025 current_mesh[i] =
1026 static_cast<int>(std::round(box_geo.length()[i] * mesh_density));
1027 // make the mesh even in all directions
1028 current_mesh[i] += current_mesh[i] % 2;
1029 }
1030 }
1031
1032 while (mesh_density <= m_mesh_density_max) {
1033 auto trial_params = TuningAlgorithm::Parameters{};
1034 trial_params.mesh = current_mesh;
1035 trial_params.cao = cao_best;
1036 trial_params.cao = cao_best;
1037
1038 auto const trial_time =
1039 get_m_time(trial_params.mesh, trial_params.cao, trial_params.r_cut_iL,
1040 trial_params.alpha_L, trial_params.accuracy);
1041
1042 if (trial_time >= 0.) {
1043 /* the optimum r_cut for this mesh is the upper limit for higher meshes,
1044 everything else is slower */
1045 if (has_actor_of_type<CoulombP3M>(solver)) {
1046 m_r_cut_iL_max = trial_params.r_cut_iL;
1047 }
1048
1049 if (trial_time < time_best) {
1050 /* new optimum */
1051 reset_n_trials();
1052 tuned_params = trial_params;
1053 time_best = tuned_params.time = trial_time;
1054 } else if (trial_time > time_best + time_granularity or
1055 get_n_trials() > max_n_consecutive_trials) {
1056 /* no hope of further optimisation */
1057 break;
1058 }
1059 }
1060 if (m_tune_mesh) {
1061 current_mesh += Utils::Vector3i::broadcast(2);
1062 mesh_density = current_mesh[0] / box_geo.length()[0];
1063 } else {
1064 break;
1065 }
1066 }
1067 return tuned_params;
1068 }
1069};
1070
1071template <typename FloatType, Arch Architecture, class FFTConfig>
1073 auto &system = get_system();
1074 auto const &box_geo = *system.box_geo;
1075 if (p3m.params.alpha_L == 0. and p3m.params.alpha != 0.) {
1076 p3m.params.alpha_L = p3m.params.alpha * box_geo.length()[0];
1077 }
1078 if (p3m.params.r_cut_iL == 0. and p3m.params.r_cut != 0.) {
1079 p3m.params.r_cut_iL = p3m.params.r_cut * box_geo.length_inv()[0];
1080 }
1081 if (not is_tuned()) {
1082 count_charged_particles();
1083 if (p3m.sum_qpart == 0) {
1084 throw std::runtime_error(
1085 "CoulombP3M: no charged particles in the system");
1086 }
1087 try {
1089 system, p3m, prefactor, tuning.timings, tuning.limits);
1090 parameters.setup_logger(tuning.verbose);
1091 // parameter ranges
1092 parameters.determine_mesh_limits();
1093 parameters.determine_r_cut_limits();
1094 parameters.determine_cao_limits(7);
1095 // run tuning algorithm
1096 parameters.tune();
1097 m_is_tuned = true;
1098 system.on_coulomb_change();
1099 } catch (...) {
1100 p3m.params.tuning = false;
1101 throw;
1102 }
1103 }
1104 init();
1105}
1106
1108 auto const &system = get_system();
1109 auto const &box_geo = *system.box_geo;
1110 auto const &local_geo = *system.local_geo;
1111 for (auto i = 0u; i < 3u; i++) {
1112 /* check k-space cutoff */
1113 if (p3m_params.cao_cut[i] >= box_geo.length_half()[i]) {
1114 std::stringstream msg;
1115 msg << "P3M_init: k-space cutoff " << p3m_params.cao_cut[i]
1116 << " is larger than half of box dimension " << box_geo.length()[i];
1117 throw std::runtime_error(msg.str());
1118 }
1119 if (p3m_params.cao_cut[i] >= local_geo.length()[i]) {
1120 std::stringstream msg;
1121 msg << "P3M_init: k-space cutoff " << p3m_params.cao_cut[i]
1122 << " is larger than local box dimension " << local_geo.length()[i];
1123 throw std::runtime_error(msg.str());
1124 }
1125 }
1126
1128 if ((box_geo.length()[0] != box_geo.length()[1]) or
1129 (box_geo.length()[1] != box_geo.length()[2]) or
1130 (p3m_params.mesh[0] != p3m_params.mesh[1]) or
1131 (p3m_params.mesh[1] != p3m_params.mesh[2])) {
1132 throw std::runtime_error(
1133 "CoulombP3M: non-metallic epsilon requires cubic box");
1134 }
1135 }
1136}
1137
1139 auto const &box_geo = *get_system().box_geo;
1140 if (!box_geo.periodic(0) or !box_geo.periodic(1) or !box_geo.periodic(2)) {
1141 throw std::runtime_error(
1142 "CoulombP3M: requires periodicity (True, True, True)");
1143 }
1144}
1145
1147 auto const &local_geo = *get_system().local_geo;
1148 if (local_geo.cell_structure_type() != CellStructureType::REGULAR and
1149 local_geo.cell_structure_type() != CellStructureType::HYBRID) {
1150 throw std::runtime_error(
1151 "CoulombP3M: requires the regular or hybrid decomposition cell system");
1152 }
1153 if (::communicator.size > 1 and
1154 local_geo.cell_structure_type() == CellStructureType::HYBRID) {
1155 throw std::runtime_error(
1156 "CoulombP3M: does not work with the hybrid decomposition cell system, "
1157 "if using more than one MPI node");
1158 }
1159}
1160
1161template <typename FloatType, Arch Architecture, class FFTConfig>
1163 auto const &box_geo = *get_system().box_geo;
1164 p3m.params.r_cut = p3m.params.r_cut_iL * box_geo.length()[0];
1165 p3m.params.alpha = p3m.params.alpha_L * box_geo.length_inv()[0];
1166 p3m.params.recalc_a_ai_cao_cut(box_geo.length());
1168 sanity_checks_boxl();
1169 calc_influence_function_force();
1170 calc_influence_function_energy();
1171 p3m.halo_comm.resize(::comm_cart, p3m.local_mesh);
1172}
1173
1174#ifdef ESPRESSO_CUDA
1175template <typename FloatType, Arch Architecture, class FFTConfig>
1177 add_long_range_forces_gpu(ParticleRange const &particles) {
1178 if constexpr (Architecture == Arch::CUDA) {
1179#ifdef ESPRESSO_NPT
1180 if (get_system().has_npt_enabled()) {
1181 get_system().npt_add_virial_contribution(long_range_energy(particles));
1182 }
1183#else
1184 static_cast<void>(particles);
1185#endif
1186 if (this_node == 0) {
1187 auto &gpu = get_system().gpu;
1188 p3m_gpu_add_farfield_force(*m_gpu_data, gpu, prefactor,
1189 gpu.n_particles());
1190 }
1191 }
1192}
1193
1194/* Initialize the CPU kernels.
1195 * This operation is time-consuming and sets up data members
1196 * that are only relevant for ELC force corrections, since the
1197 * GPU implementation uses CPU kernels to compute energies.
1198 */
1199template <typename FloatType, Arch Architecture, class FFTConfig>
1201 if constexpr (Architecture == Arch::CUDA) {
1202 auto &system = get_system();
1203 if (has_actor_of_type<ElectrostaticLayerCorrection>(
1204 system.coulomb.impl->solver)) {
1205 init_cpu_kernels();
1206 }
1207 p3m_gpu_init(m_gpu_data, p3m.params.cao, p3m.params.mesh, p3m.params.alpha,
1208 system.box_geo->length(), system.gpu.n_particles());
1209 }
1210}
1211
1212template <typename FloatType, Arch Architecture, class FFTConfig>
1214 if constexpr (Architecture == Arch::CUDA) {
1215 auto &gpu_particle_data = get_system().gpu;
1217 gpu_particle_data.enable_property(GpuParticleData::prop::q);
1218 gpu_particle_data.enable_property(GpuParticleData::prop::pos);
1219 }
1220}
1221#endif // ESPRESSO_CUDA
1222
1223#endif // ESPRESSO_P3M
@ HYBRID
Hybrid decomposition.
@ REGULAR
Regular decomposition.
Vector implementation and trait types for boost qvm interoperability.
Describes a cell structure / cell system.
void determine_mesh_limits() override
std::optional< std::string > layer_correction_veto_r_cut(double r_cut) const override
TuningAlgorithm::Parameters get_time() override
void setup_logger(bool verbose) override
std::tuple< double, double, double, double > calculate_accuracy(Utils::Vector3i const &mesh, int cao, double r_cut_iL) const override
void on_solver_change() const override
CoulombTuningAlgorithm(System::System &system, auto &input_p3m, double prefactor, int timings, decltype(m_tune_limits) tune_limits)
constexpr std::tuple< int, int, int > get_memory_layout() const
std::optional< std::string > fft_decomposition_veto(Utils::Vector3i const &mesh_size_r_space) const override
P3MParameters & get_params() override
void enable_property(std::size_t property)
std::size_t n_particles() const
A range of particles.
Main system class.
std::shared_ptr< LocalBox > local_geo
GpuParticleData gpu
void npt_add_virial_contribution(double energy)
Definition npt.cpp:137
bool has_npt_enabled() const
Coulomb::Solver coulomb
std::shared_ptr< BoxGeometry > box_geo
Tuning algorithm for P3M.
System::System & m_system
void determine_cao_limits(int initial_cao)
Determine a sensible range for the charge assignment order.
void determine_r_cut_limits()
Determine a sensible range for the real-space cutoff.
std::unique_ptr< TuningLogger > m_logger
DEVICE_QUALIFIER constexpr pointer data() noexcept
Definition Array.hpp:132
T norm() const
Definition Vector.hpp:160
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Definition Vector.hpp:132
Cache for interpolation weights.
void zfill(std::size_t size)
Fill cache with zero-initialized data.
void store(InterpolationWeights< cao > const &weights)
Push back weights for one point.
Communicator communicator
boost::mpi::communicator comm_cart
The communicator.
int this_node
The number of this node.
constexpr auto round_error_prec
Precision below which a double-precision float is assumed to be zero.
Definition config.hpp:38
void charge_assign(elc_data const &elc, CoulombP3M &solver, combined_ranges const &p_q_pos_range)
Definition elc.cpp:1118
ELC algorithm for long-range Coulomb interactions.
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
auto pad_with_zeros_discard_imag(std::span< T > cropped_array, Utils::Vector3i const &cropped_dim, Utils::Vector3i const &pad_left, Utils::Vector3i const &pad_right)
Pad a 3D matrix with zeros to restore halo regions.
and std::invocable< Projector, unsigned, int > void for_each_3d(detail::IndexVectorConcept auto &&start, detail::IndexVectorConcept auto &&stop, detail::IndexVectorConcept auto &&counters, Kernel &&kernel, Projector &&projector=detail::noop_projector)
Repeat an operation on every element of a 3D grid.
std::vector< FloatType > grid_influence_function(P3MParameters const &params, Utils::Vector3i const &n_start, Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l)
Map influence function over a grid.
void p3m_interpolate(P3MLocalMesh const &local_mesh, WeightsStorage< cao > const &weights, Kernel kernel)
P3M grid interpolation.
constexpr int p3m_min_cao
Minimal charge assignment order.
Definition math.hpp:48
constexpr int p3m_max_cao
Maximal charge assignment order.
Definition math.hpp:50
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
Definition math.hpp:38
auto charge_range(ParticleRange const &particles)
auto pos_range(ParticleRange const &particles)
auto force_range(ParticleRange const &particles)
auto unfolded_pos_range(ParticleRange const &particles, BoxGeometry const &box)
std::function< void(ResultType &, ResultType const &)> ReductionOp
Join two partial reduction results.
std::function< void(ResultType &, Particle const &)> AddPartialResultKernel
Kernel that adds the result from a single particle to a reduction.
System & get_system()
T product(Vector< T, N > const &v)
Definition Vector.hpp:373
VectorXd< 3 > Vector3d
Definition Vector.hpp:185
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
Definition sqr.hpp:28
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
Definition math.hpp:71
auto get_analytic_cotangent_sum_kernel(int cao)
Definition math.hpp:146
STL namespace.
Exports for the NpT code.
auto constexpr P3M_EPSILON_METALLIC
This value indicates metallic boundary conditions.
P3M algorithm for long-range Coulomb interaction.
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part)
The long-range part of the P3M algorithm.
void p3m_gpu_init(std::shared_ptr< P3MGpuParams > &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part)
Initialize the internal data structure of the P3M GPU.
P3M electrostatics on GPU.
double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, int npart, double sum_q2, double alpha_L, const double *box)
double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, int cao, std::size_t n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l)
Calculate the analytic expression of the error estimate for the P3M method in (eq.
std::complex< FloatType > multiply_complex_by_real(std::complex< FloatType > const &z, FloatType k)
auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift, Utils::Vector3i const &mesh, Utils::Vector3d const &mesh_i, int cao, double alpha_L_i)
Aliasing sum used by p3m_k_space_error.
double p3m_real_space_error(double pref, double r_cut_iL, std::size_t n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l)
Calculate the real space contribution to the rms error in the force (as described by Kolafa and Perra...
std::complex< FloatType > multiply_complex_by_imaginary(std::complex< FloatType > const &z, FloatType k)
auto calc_dipole_moment(boost::mpi::communicator const &comm, auto const &cs, auto const &box_geo)
bool is_node_grid_compatible_with_mesh(Utils::Vector3i const &node_grid, Utils::Vector3i const &mesh)
ResultType reduce_over_local_particles(CellStructure const &cs, Reduction::AddPartialResultKernel< ResultType > add_partial, Reduction::ReductionOp< ResultType > reduce_op)
performs a reduction over all particles
ESPRESSO_ATTR_ALWAYS_INLINE void kokkos_parallel_range_for(auto const &name, auto start, auto end, auto const &kernel)
Utils::Vector3i node_grid
void tune() override
void charge_assign(ParticleRange const &particles) override
void assign_charge(double q, Utils::Vector3d const &real_pos, bool skip_cache) override
double long_range_kernel(bool force_flag, bool energy_flag, ParticleRange const &particles)
Compute the k-space part of forces and energies.
void add_long_range_forces_gpu(ParticleRange const &particles)
Utils::Vector9d long_range_pressure(ParticleRange const &particles) override
void scaleby_box_l() override
Base class for the electrostatics P3M algorithm.
std::size_t sum_qpart
number of charged particles.
FloatType value_type
std::shared_ptr< P3MFFT< FloatType, FFTConfig > > fft
p3m_send_mesh< FloatType > halo_comm
double sum_q2
Sum of square of charges.
p3m_interpolation_cache inter_weights
void sanity_checks_periodicity() const
void sanity_checks_boxl() const
Checks for correctness of the k-space cutoff.
void sanity_checks_cell_structure() const
P3MParameters const & p3m_params
Definition p3m.hpp:58
std::unique_ptr< Implementation > impl
Pointer-to-implementation.
static constexpr std::size_t force
static constexpr std::size_t pos
static constexpr std::size_t q
Interpolation weights for one point.
void recalc_ld_pos(P3MParameters const &params)
Recalculate quantities derived from the mesh and box length: ld_pos (position of the left down mesh).
Structure to hold P3M parameters and some dependent variables.
Utils::Vector3d cao_cut
cutoff for charge assignment.
double alpha
unscaled alpha_L for use with fast inline functions only
double r_cut_iL
cutoff radius for real space electrostatics (>0), rescaled to r_cut_iL = r_cut * box_l_i.
int cao
charge assignment order ([0,7]).
double accuracy
accuracy of the actual parameter set.
double alpha_L
Ewald splitting parameter (0.
double r_cut
unscaled r_cut_iL for use with fast inline functions only
void recalc_a_ai_cao_cut(Utils::Vector3d const &box_l)
Recalculate quantities derived from the mesh and box length: a, ai and cao_cut.
bool tuning
tuning or production?
Utils::Vector3i mesh
number of mesh points per coordinate direction (>0), in real space.
double epsilon
epsilon of the "surrounding dielectric".
P3MParameters params
P3M base parameters.
P3MLocalMesh local_mesh
Local mesh geometry information for this MPI rank.
Struct holding all information for one particle.
Definition Particle.hpp:450
auto const & q() const
Definition Particle.hpp:593
auto const & image_box() const
Definition Particle.hpp:499
auto const & pos() const
Definition Particle.hpp:486
void operator()(auto &p3m, double q, InterpolationWeights< cao > const &weights)
void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos)
void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, p3m_interpolation_cache &inter_weights)
void operator()(auto &p3m, auto force_prefac, CellStructure &cell_structure) const