ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
dp3m_heffte.impl.hpp
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2026 The ESPResSo project
3 * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
4 * Max-Planck-Institute for Polymer Research, Theory Group
5 *
6 * This file is part of ESPResSo.
7 *
8 * ESPResSo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * ESPResSo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22#include <config/config.hpp>
23
24#ifdef ESPRESSO_DP3M
25
27
30
31#include "magnetostatics/dp3m_heffte.hpp" // must be included after dipoles.hpp
32
33#include "fft/fft.hpp"
34#include "p3m/P3MFFT.hpp"
36#include "p3m/TuningLogger.hpp"
37#include "p3m/common.hpp"
40#include "p3m/interpolation.hpp"
41#include "p3m/math.hpp"
42
43#include "BoxGeometry.hpp"
44#include "LocalBox.hpp"
45#include "Particle.hpp"
46#include "PropagationMode.hpp"
49#include "communication.hpp"
50#include "errorhandling.hpp"
52#include "npt.hpp"
53#include "system/System.hpp"
54#include "tuning.hpp"
55
56#include <utils/Vector.hpp>
59#include <utils/math/sqr.hpp>
60
61#include <boost/mpi/collectives/all_reduce.hpp>
62#include <boost/mpi/collectives/reduce.hpp>
63
64#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
65#include <Kokkos_Core.hpp>
66#include <omp.h>
67#endif
68
69#include <algorithm>
70#include <array>
71#include <cmath>
72#include <cstddef>
73#include <cstdio>
74#include <functional>
75#include <iterator>
76#include <memory>
77#include <numbers>
78#include <optional>
79#include <span>
80#include <sstream>
81#include <stdexcept>
82#include <tuple>
83#include <utility>
84#include <vector>
85
86#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
87#ifndef NDEBUG
88template <typename T>
89bool heffte_almost_equal(T const &value, T const &reference) {
90 auto const diff = std::abs(value - reference);
91 using FT = std::remove_cvref_t<decltype(diff)>;
92 auto constexpr atol = std::is_same_v<FT, float> ? FT{2e-4} : FT{1e-6};
93 auto constexpr rtol = std::is_same_v<FT, float> ? FT{5e-5} : FT{1e-5};
94 auto const non_zero = std::abs(reference) != FT{0};
95 return (diff < atol) or (non_zero and (diff / std::abs(reference) < rtol));
96}
97#endif // not NDEBUG
98#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
99
100template <typename FloatType, Arch Architecture, class FFTConfig>
101void DipolarP3MHeffte<FloatType, Architecture,
103 auto local_n = std::size_t{0u};
104 double local_mu2 = 0.;
105
106 for (auto const &p : get_system().cell_structure->local_particles()) {
107 if (p.dipm() != 0.) {
108 local_mu2 += p.calc_dip().norm2();
109 local_n++;
110 }
111 }
112
113 boost::mpi::all_reduce(comm_cart, local_mu2, dp3m.sum_mu2, std::plus<>());
114 boost::mpi::all_reduce(comm_cart, local_n, dp3m.sum_dip_part, std::plus<>());
115}
116
117inline double dp3m_k_space_error(double box_size, int mesh, int cao,
118 std::size_t n_c_part, double sum_q2,
119 double alpha_L);
120
121inline double dp3m_real_space_error(double box_size, double r_cut_iL,
122 std::size_t n_c_part, double sum_q2,
123 double alpha_L);
124
125/** Compute the value of alpha through a bisection method.
126 * Based on eq. (33) @cite wang01a.
127 */
128double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part,
129 double sum_q2, double x1, double x2, double xacc,
130 double tuned_accuracy);
131
132template <typename FloatType, Arch Architecture, class FFTConfig>
133double DipolarP3MHeffte<FloatType, Architecture,
134 FFTConfig>::calc_average_self_energy_k_space() const {
135 auto const &box_geo = *get_system().box_geo;
136 auto const node_phi =
138 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
139
140 double phi = 0.;
141 boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
142 phi /= 3. * box_geo.length()[0] * Utils::int_pow<3>(dp3m.params.mesh[0]);
143 return phi * std::numbers::pi;
144}
145
146template <typename FloatType, Arch Architecture, class FFTConfig>
148 assert(dp3m.params.mesh >= Utils::Vector3i::broadcast(1));
149 assert(dp3m.params.cao >= p3m_min_cao and dp3m.params.cao <= p3m_max_cao);
150 assert(dp3m.params.alpha > 0.);
151
152 auto const &system = get_system();
153 auto const &box_geo = *system.box_geo;
154 auto const &local_geo = *system.local_geo;
155 auto const verlet_skin = system.cell_structure->get_verlet_skin();
156
157 dp3m.params.cao3 = Utils::int_pow<3>(dp3m.params.cao);
158 dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
159
160 assert(dp3m.fft);
161 dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
162 dp3m.fft_buffers->init_halo();
163 dp3m.fft->init(dp3m.params);
164 dp3m.mesh.ks_pnum = dp3m.fft->get_ks_pnum();
165 dp3m.fft_buffers->init_meshes(dp3m.fft->get_ca_mesh_size());
166 dp3m.update_mesh_views();
167#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
168 dp3m.heffte.world_size = comm_cart.size();
169 dp3m.heffte.fft = std::make_shared<P3MFFT<FloatType, FFTConfig>>(
170 ::comm_cart, dp3m.params.mesh, dp3m.local_mesh.ld_no_halo,
171 dp3m.local_mesh.ur_no_halo, ::communicator.node_grid);
172 dp3m.resize_heffte_buffers();
173#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
174 dp3m.calc_differential_operator();
175
176 /* fix box length dependent constants */
177 scaleby_box_l();
178
180}
181
182namespace {
183template <int cao> struct AssignDipole {
184#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
185 void operator()(auto &dp3m, auto &cell_structure) {
186 using DipolarP3MState = std::remove_reference_t<decltype(dp3m)>;
187 using value_type = DipolarP3MState::value_type;
189 auto const &aosoa = cell_structure.get_aosoa();
190 auto const &unique_particles = cell_structure.get_unique_particles();
191 auto const n_part = cell_structure.count_local_particles();
192 dp3m.inter_weights.zfill(n_part); // allocate buffer for parallel write
194 "InterpolateDipoles", std::size_t{0u}, n_part, [&](auto p_index) {
195 auto const tid = omp_get_thread_num();
196 auto const p_pos = aosoa.get_vector_at(aosoa.position, p_index);
197 auto const dip = unique_particles.at(p_index)->calc_dip();
198 auto const weights =
200 p_pos, dp3m.params.ai, dp3m.local_mesh);
201 dp3m.inter_weights.store_at(p_index, weights);
203 dp3m.local_mesh, weights, [&dip, tid, &dp3m](int ind, double w) {
204 dp3m.rs_fields_kokkos(tid, 0u, ind) += value_type(w * dip[0u]);
205 dp3m.rs_fields_kokkos(tid, 1u, ind) += value_type(w * dip[1u]);
206 dp3m.rs_fields_kokkos(tid, 2u, ind) += value_type(w * dip[2u]);
207 });
208 });
209 Kokkos::fence();
210 using execution_space = Kokkos::DefaultExecutionSpace;
211 int num_threads = execution_space().concurrency();
212 Kokkos::RangePolicy<execution_space> policy(std::size_t{0},
213 dp3m.local_mesh.size);
214 Kokkos::parallel_for("ReduceInterpolatedDipoles", policy,
215 [&dp3m, num_threads](std::size_t const i) {
216 for (int dir = 0; dir < 3; ++dir) {
217 value_type acc{};
218 for (int tid = 0; tid < num_threads; ++tid) {
219 acc += dp3m.rs_fields_kokkos(tid, dir, i);
220 }
221 dp3m.mesh.rs_fields[dir][i] += acc;
222#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
223 dp3m.heffte.rs_dipole_density[dir][i] += acc;
224#endif
225 }
226 });
227 Kokkos::fence();
228 }
229#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
230 void operator()(auto &dp3m, Utils::Vector3d const &real_pos,
231 Utils::Vector3d const &dip) const {
232 using DipolarP3MState = std::remove_reference_t<decltype(dp3m)>;
233 using value_type = DipolarP3MState::value_type;
236 real_pos, dp3m.params.ai, dp3m.local_mesh);
238 dp3m.local_mesh, weights, [&dip, &dp3m](int ind, double w) {
239 dp3m.mesh.rs_fields[0u][ind] += value_type(w * dip[0u]);
240 dp3m.mesh.rs_fields[1u][ind] += value_type(w * dip[1u]);
241 dp3m.mesh.rs_fields[2u][ind] += value_type(w * dip[2u]);
242#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
243 dp3m.heffte.rs_dipole_density[0u][ind] += value_type(w * dip[0u]);
244 dp3m.heffte.rs_dipole_density[1u][ind] += value_type(w * dip[1u]);
245 dp3m.heffte.rs_dipole_density[2u][ind] += value_type(w * dip[2u]);
246#endif
247 });
248
249 dp3m.inter_weights.template store<cao>(weights);
250 }
251#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
252};
253} // namespace
254
255template <typename FloatType, Arch Architecture, class FFTConfig>
257 prepare_fft_mesh();
258
259#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
260 Utils::integral_parameter<int, AssignDipole, p3m_min_cao, p3m_max_cao>(
261 dp3m.params.cao, dp3m, *get_system().cell_structure);
262#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
263 for (auto const &p : get_system().cell_structure->local_particles()) {
264 if (p.dipm() != 0.) {
265 Utils::integral_parameter<int, AssignDipole, p3m_min_cao, p3m_max_cao>(
266 dp3m.params.cao, dp3m, p.pos(), p.calc_dip());
267 }
268 }
269#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
270}
271
272namespace {
273template <int cao> struct AssignTorques {
274 void operator()(auto &dp3m, double prefac, int d_rs,
275 CellStructure &cell_structure) const {
276
277 assert(cao == dp3m.inter_weights.cao());
278
279 auto const kernel = [d_rs, &dp3m](auto const &pref, auto &p_torque,
280 std::size_t p_index) {
281 auto const weights = dp3m.inter_weights.template load<cao>(p_index);
283 p3m_interpolate(dp3m.local_mesh, weights,
284 [&E, &dp3m, d_rs](int ind, double w) {
285 // heFFTe data: dp3m.heffte.ks_scalar.real()
286 E[d_rs] += w * double(dp3m.mesh.rs_scalar[ind]);
287 });
288
289 auto const torque = vector_product(pref, E);
290#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
291 auto const thread_id = omp_get_thread_num();
292 p_torque(p_index, thread_id, 0) -= torque[0];
293 p_torque(p_index, thread_id, 1) -= torque[1];
294 p_torque(p_index, thread_id, 2) -= torque[2];
295#else
296 p_torque -= torque;
297#endif
298 };
299
300#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
301 auto const n_part = dp3m.inter_weights.size();
302 auto const &unique_particles = cell_structure.get_unique_particles();
303 auto &local_torque = cell_structure.get_local_torque();
305 "AssignTorques", std::size_t{0u}, n_part, [&](std::size_t p_index) {
306 auto const &p = *unique_particles.at(p_index);
307 if (p.dipm() != 0.) {
308 kernel(p.calc_dip() * prefac, local_torque, p_index);
309 }
310 });
311#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
312 /* magnetic particle index */
313 auto p_index = std::size_t{0ul};
314
315 for (auto &p : cell_structure.local_particles()) {
316 if (p.dipm() != 0.) {
317 kernel(p.calc_dip() * prefac, p.torque(), p_index);
318 ++p_index;
319 }
320 }
321#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
322 }
323};
324
325template <int cao> struct AssignForcesDip {
326 void operator()(auto &dp3m, double prefac, int d_rs,
327 CellStructure &cell_structure) const {
328
329 assert(cao == dp3m.inter_weights.cao());
330
331 auto const kernel = [d_rs, &dp3m](auto const &pref, auto &p_force,
332 std::size_t p_index) {
333 auto const weights = dp3m.inter_weights.template load<cao>(p_index);
334
336 p3m_interpolate(dp3m.local_mesh, weights, [&E, &dp3m](int ind, double w) {
337 // heFFTe data: dp3m.heffte.rs_B_fields
338 E[0u] += w * double(dp3m.mesh.rs_fields[0u][ind]);
339 E[1u] += w * double(dp3m.mesh.rs_fields[1u][ind]);
340 E[2u] += w * double(dp3m.mesh.rs_fields[2u][ind]);
341 });
342
343#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
344 auto const thread_id = omp_get_thread_num();
345 p_force(p_index, thread_id, d_rs) += pref * E;
346#else
347 p_force[d_rs] += pref * E;
348#endif
349 };
350
351#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
352 auto const n_part = dp3m.inter_weights.size();
353 auto const &unique_particles = cell_structure.get_unique_particles();
354 auto &local_force = cell_structure.get_local_force();
356 "AssignForcesDip", std::size_t{0u}, n_part, [&](std::size_t p_index) {
357 auto const &p = *unique_particles.at(p_index);
358 if (p.dipm() != 0.) {
359 kernel(p.calc_dip() * prefac, local_force, p_index);
360 }
361 });
362#else // ESPRESSO_SHARED_MEMORY_PARALLELISM
363 /* magnetic particle index */
364 auto p_index = std::size_t{0ul};
365
366 for (auto &p : cell_structure.local_particles()) {
367 if (p.dipm() != 0.) {
368 kernel(p.calc_dip() * prefac, p.force(), p_index);
369 ++p_index;
370 }
371 }
372#endif // ESPRESSO_SHARED_MEMORY_PARALLELISM
373 }
374};
375} // namespace
376
377#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
378template <typename FloatType, class FFTConfig>
380 auto const rs_array_size =
381 static_cast<std::size_t>(Utils::product(this->local_mesh.dim));
382 auto const rs_array_size_no_halo =
383 static_cast<std::size_t>(Utils::product(this->local_mesh.dim_no_halo));
384 auto const fft_mesh_size =
385 static_cast<std::size_t>(Utils::product(heffte.fft->ks_local_size()));
386 for (auto d : {0u, 1u, 2u}) {
387 heffte.rs_dipole_density[d].resize(rs_array_size);
388 heffte.ks_dipole_density[d].resize(fft_mesh_size);
389 heffte.rs_B_fields[d].resize(rs_array_size);
390 heffte.rs_B_fields_no_halo[d].resize(rs_array_size_no_halo);
391 }
392 heffte.ks_B_field_storage.resize(fft_mesh_size);
393 heffte.ks_scalar.resize(fft_mesh_size);
394}
395#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
396
397template <typename FloatType, Arch Architecture, class FFTConfig>
399 bool force_flag, bool energy_flag) {
400 /* k-space energy */
401 double energy = 0.;
402 auto const &system = get_system();
403 auto const &box_geo = *system.box_geo;
404 auto const dipole_prefac = prefactor / Utils::product(dp3m.params.mesh);
405#ifdef ESPRESSO_NPT
406 auto const npt_flag = force_flag and system.has_npt_enabled();
407#else
408 auto constexpr npt_flag = false;
409#endif
410
411 auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
413#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
414 auto constexpr r2c_dir = FFTConfig::r2c_dir;
415 auto const rs_local_size = dp3m.heffte.fft->rs_local_size();
416 auto const local_size = dp3m.heffte.fft->ks_local_size();
418 if constexpr (FFTConfig::use_r2c) {
419 local_size_full[r2c_dir] -= 1;
420 local_size_full[r2c_dir] *= 2;
421 }
422 auto const local_origin = dp3m.heffte.fft->ks_local_ld_index();
423#ifndef NDEBUG
424 auto const line_stride = local_size_full[0];
425 auto const plane_stride = local_size_full[0] * local_size_full[0];
426#endif
427 auto const &global_size = dp3m.params.mesh;
428 auto const cutoff_left = 1 - local_origin[r2c_dir];
429 auto const cutoff_right = global_size[r2c_dir] / 2 - local_origin[r2c_dir];
430 auto &short_dim = local_index[r2c_dir];
431 dp3m.resize_heffte_buffers();
432#endif
433
434 if (dp3m.sum_mu2 > 0.) {
435 dipole_assign();
436 dp3m.fft_buffers->perform_vector_halo_gather();
437 for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
438 dp3m.fft->forward_fft(rs_mesh);
439 }
440 dp3m.update_mesh_views();
441
442#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
443 if (dp3m.heffte.world_size == 1) {
444 // halo communication of real space dipoles density
445 std::array<FloatType *, 3u> rs_fields = {
446 {dp3m.heffte.rs_dipole_density[0u].data(),
447 dp3m.heffte.rs_dipole_density[1u].data(),
448 dp3m.heffte.rs_dipole_density[2u].data()}};
449 dp3m.heffte.halo_comm.gather_grid(::comm_cart, rs_fields,
450 dp3m.local_mesh.dim);
451
452 for (auto dir : {0u, 1u, 2u}) {
453 // get real-space dipoles density without ghost layers
455 FFTConfig::r_space_order>(
456 dp3m.heffte.rs_dipole_density[dir], dp3m.local_mesh.dim,
457 dp3m.local_mesh.n_halo_ld,
458 dp3m.local_mesh.dim - dp3m.local_mesh.n_halo_ur);
459 // re-order data in row-major
460 std::vector<FloatType> rs_field_no_halo_reorder;
462 std::size_t index_row_major = 0u;
464 mesh_start, rs_local_size, local_index, [&]() {
465 auto constexpr KX = 1, KY = 2, KZ = 0;
466 auto const index = local_index[KZ] +
467 rs_local_size[0] * local_index[KY] +
468 Utils::sqr(rs_local_size[0]) * local_index[KX];
470 rs_field_no_halo[index];
472 });
473 dp3m.heffte.fft->forward(rs_field_no_halo_reorder.data(),
474 dp3m.heffte.ks_dipole_density[dir].data());
475#ifndef NDEBUG
476 if (not dp3m.params.tuning) {
477 std::size_t index_row_major_r2c = 0u;
480 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
481 auto constexpr KX = 2, KY = 0, KZ = 1;
482 auto const index_fft_legacy = local_index[KZ] +
485 auto const old_value = std::complex<FloatType>{
486 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy],
487 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy + 1]};
488 auto const &new_value =
489 dp3m.heffte.ks_dipole_density[dir][index_row_major_r2c];
492 }
493 });
494 }
495#endif // not NDEBUG
496 }
497 }
498#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
499 }
500
501 /* === k-space energy calculation === */
502 if (energy_flag or npt_flag) {
503 /*********************
504 Dipolar energy
505 **********************/
506 if (dp3m.sum_mu2 > 0.) {
507 /* i*k differentiation for dipolar gradients:
508 * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
509
510 auto index = std::size_t(0u);
511 auto it_energy = dp3m.g_energy.begin();
512 auto node_energy = 0.;
513 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
514 auto constexpr KX = 2, KY = 0, KZ = 1;
515 auto const shift = local_index + dp3m.mesh.start;
516 auto const &d_op = dp3m.d_op[0u];
517 auto const &mesh_dip = dp3m.mesh.rs_fields;
518 // Re(mu)*k
519 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
520 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
521 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
522 ++index;
523 // Im(mu)*k
524 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
525 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
526 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
527 ++index;
528 node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
529 std::advance(it_energy, 1);
530 });
531#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
532 if (dp3m.heffte.world_size == 1) {
533 [[maybe_unused]] auto node_energy_heffte = 0.;
534 std::size_t index_row_major_r2c = 0u;
537 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
539 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
540 auto const cell_field =
542 FloatType(dp3m.d_op[0u][global_index[1u]]) +
544 FloatType(dp3m.d_op[1u][global_index[2u]]) +
546 FloatType(dp3m.d_op[2u][global_index[0u]]);
547 auto cell_energy = static_cast<double>(
548 dp3m.heffte.g_energy[index_row_major_r2c] *
549 std::norm(cell_field));
550 if (FFTConfig::use_r2c and (short_dim >= cutoff_left and
551 short_dim <= cutoff_right - 1)) {
552 // k-space symmetry: double counting except in the first and
553 // last planes of the short dimension; although the wavevector
554 // points in the opposite direction in the redundant region of
555 // k-space, the product of two components of the wavevector
556 // cancels out the negative sign
557 cell_energy *= 2.;
558 }
560 }
562 });
563 assert(heffte_almost_equal(static_cast<FloatType>(node_energy_heffte),
564 static_cast<FloatType>(node_energy)));
565 }
566#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
567 node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
568 boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
569
570 if (dp3m.energy_correction == 0.)
571 calc_energy_correction();
572
573 if (this_node == 0) {
574 /* self energy correction */
575 energy -= prefactor * dp3m.sum_mu2 * std::numbers::inv_sqrtpi *
576 (2. / 3.) * Utils::int_pow<3>(dp3m.params.alpha);
577
578 /* dipolar energy correction due to systematic Madelung-self effects */
579 energy += prefactor * dp3m.energy_correction / box_geo.volume();
580 }
581 }
582 } // if (energy_flag)
583
584 /* === k-space force calculation === */
585 if (force_flag) {
586 /****************************
587 * DIPOLAR TORQUES (k-space)
588 ****************************/
589 if (dp3m.sum_mu2 > 0.) {
590 auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
591 dp3m.ks_scalar.resize(dp3m.local_mesh.size);
592 /* fill in ks_scalar array for torque calculation */
593 {
594 auto index{std::size_t(0u)};
595 auto it_energy = dp3m.g_energy.begin();
596 auto it_ks_scalar = dp3m.ks_scalar.begin();
597 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() mutable {
598 auto constexpr KX = 2, KY = 0, KZ = 1;
599 auto const shift = local_index + dp3m.mesh.start;
600 auto const &d_op = dp3m.d_op[0u];
601 auto const &mesh_dip = dp3m.mesh.rs_fields;
602 // Re(mu)*k
603 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
604 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
605 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
606 ++index;
607 // Im(mu)*k
608 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
609 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
610 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
611 ++index;
612 *it_ks_scalar = *it_energy * std::complex<FloatType>{re, im};
613 std::advance(it_energy, 1);
614 std::advance(it_ks_scalar, 1);
615 });
616 }
617#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
618 if (dp3m.heffte.world_size == 1) {
619 std::size_t index_row_major_r2c = 0u;
622 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
624 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
625 dp3m.heffte.ks_scalar[index_row_major_r2c] =
626 dp3m.heffte.g_energy[index_row_major_r2c] *
628 FloatType(dp3m.d_op[0u][global_index[1u]]) +
630 FloatType(dp3m.d_op[1u][global_index[2u]]) +
632 FloatType(dp3m.d_op[2u][global_index[0u]]));
633#ifndef NDEBUG
634 if (not dp3m.params.tuning) {
635 auto constexpr KX = 2, KY = 0, KZ = 1;
636 auto const index_fft_legacy = local_index[KZ] +
640 dp3m.heffte.ks_scalar[index_row_major_r2c],
641 dp3m.ks_scalar[index_fft_legacy]));
642 }
643#endif // not NDEBUG
645 }
646 });
647 }
648#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
649
650 /* Torque component loop */
651 for (int d = 0; d < 3; d++) {
652 auto it_ks_scalar = dp3m.ks_scalar.begin();
653 auto index = 0u;
654 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
655 auto const &offset = dp3m.mesh.start;
656 auto const &d_op = dp3m.d_op[0u];
657 auto const d_op_val = FloatType(d_op[local_index[d] + offset[d]]);
658 auto const &value = *it_ks_scalar;
659 dp3m.mesh.rs_scalar[index] = d_op_val * value.real();
660 ++index;
661 dp3m.mesh.rs_scalar[index] = d_op_val * value.imag();
662 ++index;
663 std::advance(it_ks_scalar, 1);
664 });
665#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
666 if (dp3m.heffte.world_size == 1) {
667 unsigned int constexpr d_ks[3] = {2u, 0u, 1u};
668 std::size_t index_row_major_r2c = 0u;
671 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
673 auto const d_op_val =
674 FloatType(dp3m.d_op[d][global_index[d_ks[d]]]);
675 dp3m.heffte.ks_B_field_storage[index_row_major_r2c] =
676 d_op_val * dp3m.heffte.ks_scalar[index_row_major_r2c];
677#ifndef NDEBUG
678 if (not dp3m.params.tuning) {
679 auto constexpr KX = 2, KY = 0, KZ = 1;
680 auto const index_fft_legacy =
683 auto const old_value = std::complex<FloatType>{
684 dp3m.mesh.rs_scalar[2 * index_fft_legacy],
685 dp3m.mesh.rs_scalar[2 * index_fft_legacy + 1]};
686 auto const &new_value =
687 dp3m.heffte.ks_B_field_storage[index_row_major_r2c];
689 }
690#endif // not NDEBUG
692 }
693 });
694 dp3m.heffte.fft->backward(dp3m.heffte.ks_B_field_storage.data(),
695 dp3m.heffte.rs_B_fields_no_halo[d].data());
696 // pad zeros around the B-field in real space for ghost layers
697 dp3m.heffte.rs_B_fields[d] =
698 pad_with_zeros_discard_imag<FFTConfig::r_space_order,
700 std::span(dp3m.heffte.rs_B_fields_no_halo[d]),
701 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
702 dp3m.local_mesh.n_halo_ur);
703 // communicate ghost layers of the B-field in real space
704 dp3m.heffte.halo_comm.spread_grid(::comm_cart,
705 dp3m.heffte.rs_B_fields[d].data(),
706 dp3m.local_mesh.dim);
707 }
708#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
709 dp3m.fft->backward_fft(dp3m.fft_buffers->get_scalar_mesh());
710 // communicate ghost layers of the B-field in real space
711 dp3m.fft_buffers->perform_scalar_halo_spread();
712 // assign torque component from mesh to particle
713 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
714 Utils::integral_parameter<int, AssignTorques, p3m_min_cao, p3m_max_cao>(
715 dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs,
716 *system.cell_structure);
717 }
718
719 /***************************
720 DIPOLAR FORCES (k-space)
721 ****************************/
722 // Compute forces after torques because the algorithm below overwrites the
723 // grids dp3m.mesh.rs_fields !
724 // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
725 // number to 6 !
726 /* fill in ks_scalar array for force calculation */
727 {
728 auto it_force = dp3m.g_force.begin();
729 auto it_ks_scalar = dp3m.ks_scalar.begin();
730 std::size_t index = 0u;
731 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
732 auto constexpr KX = 2, KY = 0, KZ = 1;
733 auto const shift = local_index + dp3m.mesh.start;
734 auto const &d_op = dp3m.d_op[0u];
735 auto const &mesh_dip = dp3m.mesh.rs_fields;
736 // Re(mu)*k
737 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
738 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
739 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
740 ++index;
741 // Im(mu)*k
742 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
743 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
744 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
745 ++index;
746 *it_ks_scalar = {*it_force * im, *it_force * (-re)};
747 std::advance(it_force, 1);
748 std::advance(it_ks_scalar, 1);
749 });
750 }
751
752#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
753 if (dp3m.heffte.world_size == 1) {
754 std::size_t index_row_major_r2c = 0u;
757 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
759 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
760 auto const value =
761 dp3m.heffte.g_force[index_row_major_r2c] *
763 FloatType(dp3m.d_op[0u][global_index[1u]]) +
765 FloatType(dp3m.d_op[1u][global_index[2u]]) +
767 FloatType(dp3m.d_op[2u][global_index[0u]]));
768 dp3m.heffte.ks_scalar[index_row_major_r2c] = {value.imag(),
769 -value.real()};
770#ifndef NDEBUG
771 if (not dp3m.params.tuning) {
772 auto constexpr KX = 2, KY = 0, KZ = 1;
773 auto const index_fft_legacy = local_index[KZ] +
777 dp3m.heffte.ks_scalar[index_row_major_r2c],
778 dp3m.ks_scalar[index_fft_legacy]));
779 }
780#endif // not NDEBUG
782 }
783 });
784 }
785#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
786
787 /* Force component loop */
788 for (int d = 0; d < 3; d++) {
789 std::size_t index = 0u;
790 auto it_ks_scalar = dp3m.ks_scalar.begin();
791 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
792 auto constexpr KX = 2, KY = 0, KZ = 1;
793 auto const shift = local_index + dp3m.mesh.start;
794 auto const &d_op = dp3m.d_op[0u];
795 auto const &mesh_dip = dp3m.mesh.rs_fields;
796 auto const d_op_val = FloatType(d_op[shift[d]]);
797 auto const f = *it_ks_scalar * d_op_val;
798 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.real();
799 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.real();
800 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.real();
801 ++index;
802 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.imag();
803 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.imag();
804 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.imag();
805 ++index;
806 std::advance(it_ks_scalar, 1);
807 });
808
809#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
810 if (dp3m.heffte.world_size == 1) {
811 std::size_t index_row_major_r2c = 0u;
814 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
815 auto constexpr KX = 1, KY = 2, KZ = 0;
817 auto const remapped_index =
820 auto const d_op_val =
821 FloatType(dp3m.d_op[d][global_index[d]]);
822 auto &mesh_dip = dp3m.heffte.ks_dipole_density;
824 FloatType(dp3m.d_op[d][global_index[2u]]) * d_op_val *
825 dp3m.heffte.ks_scalar[remapped_index];
827 FloatType(dp3m.d_op[d][global_index[0u]]) * d_op_val *
828 dp3m.heffte.ks_scalar[remapped_index];
830 FloatType(dp3m.d_op[d][global_index[1u]]) * d_op_val *
831 dp3m.heffte.ks_scalar[remapped_index];
832#ifndef NDEBUG
833 if (not FFTConfig::use_r2c and not dp3m.params.tuning) {
834 auto const index_fft_legacy = local_index[2] +
837 for (int j = 0; j < 3; ++j) {
838 auto const old_value = std::complex<FloatType>{
839 dp3m.mesh.rs_fields[j][2 * index_fft_legacy],
840 dp3m.mesh.rs_fields[j][2 * index_fft_legacy + 1]};
841 auto const &new_value = mesh_dip[j][index_row_major_r2c];
843 }
844 }
845#endif // not NDEBUG
847 }
848 });
849 for (int dir = 0u; dir < 3u; ++dir) {
850 dp3m.heffte.fft->backward(
851 dp3m.heffte.ks_dipole_density[dir].data(),
852 dp3m.heffte.rs_B_fields_no_halo[dir].data());
853 // pad zeros around the B-field in real space for ghost layers
854 dp3m.heffte.rs_B_fields[d] =
855 pad_with_zeros_discard_imag<FFTConfig::r_space_order,
857 std::span(dp3m.heffte.rs_B_fields_no_halo[dir]),
858 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
859 dp3m.local_mesh.n_halo_ur);
860 }
861 // communicate ghost layers of the B-field in real space
862 auto rs_fields =
863 std::array<FloatType *, 3u>{{dp3m.heffte.rs_B_fields[0u].data(),
864 dp3m.heffte.rs_B_fields[1u].data(),
865 dp3m.heffte.rs_B_fields[2u].data()}};
866 dp3m.heffte.halo_comm.spread_grid(::comm_cart, rs_fields,
867 dp3m.local_mesh.dim);
868 }
869#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
870 for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
871 dp3m.fft->backward_fft(rs_mesh);
872 }
873 // communicate ghost layers of the B-field in real space
874 dp3m.fft_buffers->perform_vector_halo_spread();
875 // assign force component from mesh to particle
876 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
877 Utils::integral_parameter<int, AssignForcesDip, p3m_min_cao,
879 dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
880 *system.cell_structure);
881 }
882 } /* if (dp3m.sum_mu2 > 0) */
883 } /* if (force_flag) */
884
885 if (dp3m.params.epsilon != P3M_EPSILON_METALLIC) {
886 auto const surface_term =
887 calc_surface_term(force_flag, energy_flag or npt_flag);
888 if (this_node == 0) {
889 energy += surface_term;
890 }
891 }
892#ifdef ESPRESSO_NPT
893 if (npt_flag) {
895 }
896#endif
897 if (not energy_flag) {
898 energy = 0.;
899 }
900
901 return energy;
902}
903
904template <typename FloatType, Arch Architecture, class FFTConfig>
906 bool force_flag, bool energy_flag) {
907 auto const &system = get_system();
908 auto const &box_geo = *system.box_geo;
909 auto const particles = system.cell_structure->local_particles();
910 auto const pref = prefactor * 4. * std::numbers::pi / box_geo.volume() /
911 (2. * dp3m.params.epsilon + 1.);
912 auto const n_local_part = particles.size();
913
914 // We put all the dipolar momenta in a the arrays mx,my,mz according to the
915 // id-number of the particles
916 std::vector<double> mx(n_local_part);
917 std::vector<double> my(n_local_part);
918 std::vector<double> mz(n_local_part);
919
920 std::size_t ip = 0u;
921 for (auto const &p : particles) {
922 auto const dip = p.calc_dip();
923 mx[ip] = dip[0u];
924 my[ip] = dip[1u];
925 mz[ip] = dip[2u];
926 ip++;
927 }
928
929 // we will need the sum of all dipolar momenta vectors
930 auto local_dip = Utils::Vector3d{};
931 for (std::size_t i = 0u; i < n_local_part; i++) {
932 local_dip[0u] += mx[i];
933 local_dip[1u] += my[i];
934 local_dip[2u] += mz[i];
935 }
936 auto const box_dip =
937 boost::mpi::all_reduce(comm_cart, local_dip, std::plus<>());
938
939 double energy = 0.;
940 if (energy_flag) {
941 double sum_e = 0.;
942 for (std::size_t i = 0u; i < n_local_part; i++) {
943 sum_e += mx[i] * box_dip[0] + my[i] * box_dip[1] + mz[i] * box_dip[2];
944 }
945 energy =
946 0.5 * pref * boost::mpi::all_reduce(comm_cart, sum_e, std::plus<>());
947 }
948
949 if (force_flag) {
950
951 std::vector<double> sumix(n_local_part);
952 std::vector<double> sumiy(n_local_part);
953 std::vector<double> sumiz(n_local_part);
954
955 for (std::size_t i = 0u; i < n_local_part; i++) {
956 sumix[i] = my[i] * box_dip[2u] - mz[i] * box_dip[1u];
957 sumiy[i] = mz[i] * box_dip[0u] - mx[i] * box_dip[2u];
958 sumiz[i] = mx[i] * box_dip[1u] - my[i] * box_dip[0u];
959 }
960
961 ip = 0u;
962 for (auto &p : particles) {
963 auto &torque = p.torque();
964 torque[0u] -= pref * sumix[ip];
965 torque[1u] -= pref * sumiy[ip];
966 torque[2u] -= pref * sumiz[ip];
967 ip++;
968 }
969 }
970
971 return energy;
972}
973
974template <typename FloatType, Arch Architecture, class FFTConfig>
975void DipolarP3MHeffte<FloatType, Architecture,
976 FFTConfig>::calc_influence_function_force() {
977 dp3m.g_force = grid_influence_function_dipolar<FloatType, 3, P3M_BRILLOUIN,
978 FFTConfig::k_space_order>(
979 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
980 get_system().box_geo->length_inv());
981#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
982 if (dp3m.heffte.world_size == 1) {
983 dp3m.heffte.g_force =
985 FFTConfig::k_space_order>(
986 dp3m.params, dp3m.heffte.fft->ks_local_ld_index(),
987 dp3m.heffte.fft->ks_local_ur_index(),
988 get_system().box_geo->length_inv());
989 if constexpr (FFTConfig::use_r2c) {
991 dp3m.heffte.g_force, dp3m.params.mesh,
992 dp3m.heffte.fft->ks_local_size(),
993 dp3m.heffte.fft->ks_local_ld_index());
994 }
995 }
996#endif
997}
998
999template <typename FloatType, Arch Architecture, class FFTConfig>
1000void DipolarP3MHeffte<FloatType, Architecture,
1001 FFTConfig>::calc_influence_function_energy() {
1002 dp3m.g_energy = grid_influence_function_dipolar<FloatType, 2, P3M_BRILLOUIN,
1003 FFTConfig::k_space_order>(
1004 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
1005 get_system().box_geo->length_inv());
1006#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
1007 if (dp3m.heffte.world_size == 1) {
1008 dp3m.heffte.g_energy =
1010 FFTConfig::k_space_order>(
1011 dp3m.params, dp3m.heffte.fft->ks_local_ld_index(),
1012 dp3m.heffte.fft->ks_local_ur_index(),
1013 get_system().box_geo->length_inv());
1014 if constexpr (FFTConfig::use_r2c) {
1016 dp3m.heffte.g_energy, dp3m.params.mesh,
1017 dp3m.heffte.fft->ks_local_size(),
1018 dp3m.heffte.fft->ks_local_ld_index());
1019 }
1020 }
1021#endif
1022}
1023
1024template <typename FloatType, Arch Architecture, class FFTConfig>
1027 int m_mesh_max = -1, m_mesh_min = -1;
1028 std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
1029
1030public:
1032 double prefactor, int timings,
1033 decltype(m_tune_limits) tune_limits)
1034 : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m},
1035 m_tune_limits{std::move(tune_limits)} {}
1036
1037 P3MParameters &get_params() override { return dp3m.params; }
1038
1039 void on_solver_change() const override { m_system.on_dipoles_change(); }
1040
1041 std::optional<std::string>
1042 layer_correction_veto_r_cut(double) const override {
1043 return {};
1044 }
1045
1046 void setup_logger(bool verbose) override {
1047 auto const &box_geo = *m_system.box_geo;
1048 m_logger = std::make_unique<TuningLogger>(
1049 verbose and this_node == 0, "DipolarP3M", TuningLogger::Mode::Dipolar);
1050 m_logger->tuning_goals(dp3m.params.accuracy, m_prefactor,
1051 box_geo.length()[0], dp3m.sum_dip_part,
1052 dp3m.sum_mu2);
1053 m_logger->log_tuning_start();
1054 }
1055
1056 std::tuple<double, double, double, double>
1058 double r_cut_iL) const override {
1059
1060 double alpha_L, rs_err, ks_err;
1061 auto const &box_geo = *m_system.box_geo;
1062
1063 /* calc maximal real space error for setting */
1064 rs_err = dp3m_real_space_error(box_geo.length()[0], r_cut_iL,
1065 dp3m.sum_dip_part, dp3m.sum_mu2, 0.001);
1066 // alpha cannot be zero for dipoles because real-space formula breaks down
1067
1068 if (std::numbers::sqrt2 * rs_err > dp3m.params.accuracy) {
1069 /* assume rs_err = ks_err -> rs_err = accuracy/sqrt(2.0) -> alpha_L */
1070 alpha_L = dp3m_rtbisection(
1071 box_geo.length()[0], r_cut_iL, dp3m.sum_dip_part, dp3m.sum_mu2,
1072 0.0001 * box_geo.length()[0], 5. * box_geo.length()[0], 0.0001,
1073 dp3m.params.accuracy);
1074 } else {
1075 /* even alpha=0 is ok, however, we cannot choose it since it kills the
1076 k-space error formula.
1077 Anyways, this very likely NOT the optimal solution */
1078 alpha_L = 0.1;
1079 }
1080
1081 /* calculate real-space and k-space error for this alpha_L */
1082 rs_err = dp3m_real_space_error(box_geo.length()[0], r_cut_iL,
1083 dp3m.sum_dip_part, dp3m.sum_mu2, alpha_L);
1084 ks_err = dp3m_k_space_error(box_geo.length()[0], mesh[0], cao,
1085 dp3m.sum_dip_part, dp3m.sum_mu2, alpha_L);
1086
1087 return {Utils::Vector2d{rs_err, ks_err}.norm(), rs_err, ks_err, alpha_L};
1088 }
1089
1090 void determine_mesh_limits() override {
1091 if (dp3m.params.mesh[0] == -1) {
1092 /* simple heuristic to limit the tried meshes if the accuracy cannot
1093 be obtained with smaller meshes, but normally not all these
1094 meshes have to be tested */
1095 auto const expo = std::log(std::cbrt(dp3m.sum_dip_part)) / std::log(2.);
1096 /* Medium-educated guess for the minimal mesh */
1097 m_mesh_min = static_cast<int>(std::round(std::pow(2., std::floor(expo))));
1098 /* avoid using more than 1 GB of FFT arrays */
1099 m_mesh_max = 128;
1100 if (m_tune_limits.first) {
1101 m_mesh_min = *m_tune_limits.first;
1102 }
1103 if (m_tune_limits.second) {
1104 m_mesh_max = *m_tune_limits.second;
1105 }
1106 } else {
1107 m_mesh_min = m_mesh_max = dp3m.params.mesh[0];
1108 m_logger->report_fixed_mesh(dp3m.params.mesh);
1109 }
1110 }
1111
1114 auto time_best = time_sentinel;
1115 for (auto tmp_mesh = m_mesh_min; tmp_mesh <= m_mesh_max; tmp_mesh += 2) {
1118 trial_params.cao = cao_best;
1119
1120 auto const trial_time =
1122 trial_params.alpha_L, trial_params.accuracy);
1123
1124 /* this mesh does not work at all */
1125 if (trial_time < 0.)
1126 continue;
1127
1128 /* the optimum r_cut for this mesh is the upper limit for higher meshes,
1129 everything else is slower */
1130 m_r_cut_iL_max = trial_params.r_cut_iL;
1131
1132 if (trial_time < time_best) {
1133 /* new optimum */
1139 /* no hope of further optimisation */
1140 break;
1141 }
1142 }
1143 return tuned_params;
1144 }
1145};
1146
1147template <typename FloatType, Arch Architecture, class FFTConfig>
1149 auto &system = get_system();
1150 auto const &box_geo = *system.box_geo;
1151 if (dp3m.params.alpha_L == 0. and dp3m.params.alpha != 0.) {
1152 dp3m.params.alpha_L = dp3m.params.alpha * box_geo.length()[0];
1153 }
1154 if (dp3m.params.r_cut_iL == 0. and dp3m.params.r_cut != 0.) {
1155 dp3m.params.r_cut_iL = dp3m.params.r_cut * box_geo.length_inv()[0];
1156 }
1157 if (not is_tuned()) {
1159 if (dp3m.sum_dip_part == 0) {
1160 throw std::runtime_error(
1161 "DipolarP3M: no dipolar particles in the system");
1162 }
1163 try {
1165 system, dp3m, prefactor, tuning.timings, tuning.limits);
1166 parameters.setup_logger(tuning.verbose);
1167 // parameter ranges
1168 parameters.determine_mesh_limits();
1169 parameters.determine_r_cut_limits();
1170 parameters.determine_cao_limits(3);
1171 // run tuning algorithm
1172 parameters.tune();
1173 m_is_tuned = true;
1174 system.on_dipoles_change();
1175 } catch (...) {
1176 dp3m.params.tuning = false;
1177 throw;
1178 }
1179 }
1180 init();
1181}
1182
1183/** Tuning dipolar-P3M */
1184inline auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
1185 double mesh_i, int cao, double alpha_L_i) {
1186
1189 auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
1190 auto alias1 = 0.;
1191 auto alias2 = 0.;
1192
1198 [&]() {
1199 auto const norm_sq = nm.norm2();
1200 auto const ex = std::exp(-factor1 * norm_sq);
1201 auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
1203 alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
1204 },
1205 [&](unsigned dim, int n) {
1206 nm[dim] = shift[dim] + n * mesh;
1207 fnm[dim] = math::sinc(nm[dim] * mesh_i);
1208 });
1209
1210 return std::make_pair(alias1, alias2);
1211}
1212
1213/** Calculate the k-space error of dipolar-P3M */
1214inline double dp3m_k_space_error(double box_size, int mesh, int cao,
1215 std::size_t n_c_part, double sum_q2,
1216 double alpha_L) {
1217
1219 auto const mesh_i = 1. / static_cast<double>(mesh);
1220 auto const alpha_L_i = 1. / alpha_L;
1221 auto const mesh_stop = Utils::Vector3i::broadcast(mesh / 2);
1222 auto const mesh_start = -mesh_stop;
1223 auto indices = Utils::Vector3i{};
1224 auto values = Utils::Vector3d{};
1225 auto he_q = 0.;
1226
1229 [&]() {
1230 if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
1231 auto const n2 = indices.norm2();
1232 auto const cs = Utils::product(values);
1233 auto const [alias1, alias2] =
1235 auto const d =
1236 alias1 - Utils::sqr(alias2 / cs) /
1237 Utils::int_pow<3>(static_cast<double>(n2));
1238 /* at high precision, d can become negative due to extinction;
1239 also, don't take values that have no significant digits left*/
1240 if (d > 0. and std::fabs(d / alias1) > round_error_prec)
1241 he_q += d;
1242 }
1243 },
1244 [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
1245 values[dim] = cotangent_sum(n, mesh_i);
1246 });
1247
1248 return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
1249 sqrt(he_q / static_cast<double>(n_c_part)) /
1250 Utils::int_pow<4>(box_size);
1251}
1252
1253/** Calculate the value of the errors for the REAL part of the force in terms
1254 * of the splitting parameter alpha of Ewald. Based on eq. (33) @cite wang01a.
1255 *
1256 * Please note that in this more refined approach we don't use
1257 * eq. (37), but eq. (33) which maintains all the powers in alpha.
1258 */
1259inline double dp3m_real_space_error(double box_size, double r_cut_iL,
1260 std::size_t n_c_part, double sum_q2,
1261 double alpha_L) {
1262 auto constexpr exp_min = -708.4; // for IEEE-compatible double
1263 double d_error_f, d_cc, d_dc, d_con;
1264
1265 auto const d_rcut = r_cut_iL * box_size;
1266 auto const d_rcut2 = Utils::sqr(d_rcut);
1267 auto const d_rcut4 = Utils::sqr(d_rcut2);
1268
1269 auto const d_a2 = Utils::sqr(alpha_L) / Utils::sqr(box_size);
1270 auto const exponent = -d_a2 * d_rcut2;
1271 auto const exp_term = (exponent < exp_min) ? 0. : std::exp(exponent);
1272 auto const d_c = sum_q2 * exp_term;
1273
1274 d_cc = 4. * Utils::sqr(d_a2) * Utils::sqr(d_rcut2) + 6. * d_a2 * d_rcut2 + 3.;
1275
1276 d_dc = 8. * Utils::int_pow<3>(d_a2) * Utils::int_pow<3>(d_rcut2) +
1277 20. * Utils::sqr(d_a2) * d_rcut4 + 30. * d_a2 * d_rcut2 + 15.;
1278
1279 d_con = 1. / sqrt(Utils::int_pow<3>(box_size) * Utils::sqr(d_a2) * d_rcut *
1280 Utils::sqr(d_rcut4) * static_cast<double>(n_c_part));
1281
1282 d_error_f = d_c * d_con *
1283 sqrt((13. / 6.) * Utils::sqr(d_cc) +
1284 (2. / 15.) * Utils::sqr(d_dc) - (13. / 15.) * d_cc * d_dc);
1285
1286 return d_error_f;
1287}
1288
1289/** Using bisection, find the root of a function "func-tuned_accuracy/sqrt(2.)"
1290 * known to lie between x1 and x2. The root, returned as rtbis, will be
1291 * refined until its accuracy is \f$\pm\f$ @p xacc.
1292 */
1293double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part,
1294 double sum_q2, double x1, double x2, double xacc,
1295 double tuned_accuracy) {
1296 constexpr int JJ_RTBIS_MAX = 40;
1297
1298 auto const constant = tuned_accuracy / std::numbers::sqrt2;
1299
1300 auto const f1 =
1301 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, x1) -
1302 constant;
1303 auto const f2 =
1304 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, x2) -
1305 constant;
1306 if (f1 * f2 >= 0.0) {
1307 throw std::runtime_error(
1308 "Root must be bracketed for bisection in dp3m_rtbisection");
1309 }
1310 // Orient the search dx, and set rtb to x1 or x2 ...
1311 double dx;
1312 double rtb = f1 < 0.0 ? (dx = x2 - x1, x1) : (dx = x1 - x2, x2);
1313 for (int j = 1; j <= JJ_RTBIS_MAX; j++) {
1314 auto const xmid = rtb + (dx *= 0.5);
1315 auto const fmid =
1316 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, xmid) -
1317 constant;
1318 if (fmid <= 0.0)
1319 rtb = xmid;
1320 if (fabs(dx) < xacc || fmid == 0.0)
1321 return rtb;
1322 }
1323 throw std::runtime_error("Too many bisections in dp3m_rtbisection");
1324}
1325
1327 auto const &system = get_system();
1328 auto const &box_geo = *system.box_geo;
1329 auto const &local_geo = *system.local_geo;
1330 for (auto i = 0u; i < 3u; i++) {
1331 /* check k-space cutoff */
1332 if (dp3m_params.cao_cut[i] >= box_geo.length_half()[i]) {
1333 std::stringstream msg;
1334 msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
1335 << " is larger than half of box dimension " << box_geo.length()[i];
1336 throw std::runtime_error(msg.str());
1337 }
1338 if (dp3m_params.cao_cut[i] >= local_geo.length()[i]) {
1339 std::stringstream msg;
1340 msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
1341 << " is larger than local box dimension " << local_geo.length()[i];
1342 throw std::runtime_error(msg.str());
1343 }
1344 }
1345
1346 if ((box_geo.length()[0] != box_geo.length()[1]) or
1347 (box_geo.length()[1] != box_geo.length()[2])) {
1348 throw std::runtime_error("DipolarP3M: requires a cubic box");
1349 }
1350}
1351
1353 auto const &box_geo = *get_system().box_geo;
1354 if (!box_geo.periodic(0) or !box_geo.periodic(1) or !box_geo.periodic(2)) {
1355 throw std::runtime_error(
1356 "DipolarP3M: requires periodicity (True, True, True)");
1357 }
1358}
1359
1361 auto const &local_geo = *get_system().local_geo;
1362 if (local_geo.cell_structure_type() != CellStructureType::REGULAR and
1363 local_geo.cell_structure_type() != CellStructureType::HYBRID) {
1364 throw std::runtime_error(
1365 "DipolarP3M: requires the regular or hybrid decomposition cell system");
1366 }
1367 if (::communicator.size > 1 and
1368 local_geo.cell_structure_type() == CellStructureType::HYBRID) {
1369 throw std::runtime_error(
1370 "DipolarP3M: does not work with the hybrid decomposition cell system, "
1371 "if using more than one MPI node");
1372 }
1373}
1374
1376 auto const &node_grid = ::communicator.node_grid;
1377 if (node_grid[0] < node_grid[1] or node_grid[1] < node_grid[2]) {
1378 throw std::runtime_error(
1379 "DipolarP3M: node grid must be sorted, largest first");
1380 }
1381}
1382
1383template <typename FloatType, Arch Architecture, class FFTConfig>
1385 auto const &box_geo = *get_system().box_geo;
1386 dp3m.params.r_cut = dp3m.params.r_cut_iL * box_geo.length()[0];
1387 dp3m.params.alpha = dp3m.params.alpha_L * box_geo.length_inv()[0];
1388 dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
1389 dp3m.local_mesh.recalc_ld_pos(dp3m.params);
1390 sanity_checks_boxl();
1391 calc_influence_function_force();
1392 calc_influence_function_energy();
1393 dp3m.energy_correction = 0.;
1394#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
1395 if (dp3m.heffte.world_size == 1) {
1396 dp3m.heffte.halo_comm.resize(::comm_cart, dp3m.local_mesh);
1397 }
1398#endif
1399}
1400
1401template <typename FloatType, Arch Architecture, class FFTConfig>
1402void DipolarP3MHeffte<FloatType, Architecture,
1403 FFTConfig>::calc_energy_correction() {
1404 auto const &box_geo = *get_system().box_geo;
1405 auto const Ukp3m = calc_average_self_energy_k_space() * box_geo.volume();
1406 auto const Ewald_volume = Utils::int_pow<3>(dp3m.params.alpha_L);
1407 auto const Eself = -2. * Ewald_volume * std::numbers::inv_sqrtpi / 3.;
1408 dp3m.energy_correction =
1409 -dp3m.sum_mu2 * (Ukp3m + Eself + 2. * std::numbers::pi / 3.);
1410}
1411
1412#ifdef ESPRESSO_NPT
1413template <typename FloatType, Arch Architecture, class FFTConfig>
1414void DipolarP3MHeffte<FloatType, Architecture,
1415 FFTConfig>::npt_add_virial_contribution(double energy)
1416 const {
1417 get_system().npt_add_virial_contribution(energy);
1418}
1419#endif // ESPRESSO_NPT
1420
1421#endif // ESPRESSO_DP3M
@ HYBRID
Hybrid decomposition.
@ REGULAR
Regular decomposition.
Vector implementation and trait types for boost qvm interoperability.
Describes a cell structure / cell system.
auto & get_local_force()
auto & get_local_torque()
auto const & get_unique_particles() const
ParticleRange local_particles() const
std::tuple< double, double, double, double > calculate_accuracy(Utils::Vector3i const &mesh, int cao, double r_cut_iL) const override
TuningAlgorithm::Parameters get_time() override
DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m, double prefactor, int timings, decltype(m_tune_limits) tune_limits)
void on_solver_change() const override
void determine_mesh_limits() override
P3MParameters & get_params() override
std::optional< std::string > layer_correction_veto_r_cut(double) const override
void setup_logger(bool verbose) override
base_type::size_type size() const
Main system class.
void npt_add_virial_contribution(double energy)
Definition npt.cpp:137
std::shared_ptr< CellStructure > cell_structure
std::shared_ptr< BoxGeometry > box_geo
Tuning algorithm for P3M.
double get_m_time(Utils::Vector3i const &mesh, int &tuned_cao, double &tuned_r_cut_iL, double &tuned_alpha_L, double &tuned_accuracy)
Get the optimal alpha and the corresponding computation time for a fixed mesh.
static auto constexpr time_sentinel
Value for invalid time measurements.
static auto constexpr max_n_consecutive_trials
Maximal number of consecutive trials that don't improve runtime.
System::System & m_system
std::unique_ptr< TuningLogger > m_logger
static auto constexpr time_granularity
Granularity of the time measurement (milliseconds).
T norm() const
Definition Vector.hpp:159
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Definition Vector.hpp:131
void zfill(std::size_t size)
Fill cache with zero-initialized data.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
Communicator communicator
boost::mpi::communicator comm_cart
The communicator.
int this_node
The number of this node.
constexpr auto round_error_prec
Precision below which a double-precision float is assumed to be zero.
Definition config.hpp:47
__device__ void vector_product(float const *a, float const *b, float *out)
static std::size_t count_magnetic_particles(ParticleRange const &particles)
Definition dlc.cpp:400
P3M algorithm for long-range magnetic dipole-dipole interaction.
double dp3m_real_space_error(double box_size, double r_cut_iL, std::size_t n_c_part, double sum_q2, double alpha_L)
Calculate the value of the errors for the REAL part of the force in terms of the splitting parameter ...
double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part, double sum_q2, double x1, double x2, double xacc, double tuned_accuracy)
Compute the value of alpha through a bisection method.
auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh, double mesh_i, int cao, double alpha_L_i)
Tuning dipolar-P3M.
double dp3m_k_space_error(double box_size, int mesh, int cao, std::size_t n_c_part, double sum_q2, double alpha_L)
Calculate the k-space error of dipolar-P3M.
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
Routines, row decomposition, data structures and communication for the 3D-FFT.
auto pad_with_zeros_discard_imag(std::span< T > cropped_array, Utils::Vector3i const &cropped_dim, Utils::Vector3i const &pad_left, Utils::Vector3i const &pad_right)
Pad a 3D matrix with zeros to restore halo regions.
auto extract_block(Container const &in_array, Utils::Vector3i const &dimensions, Utils::Vector3i const &start, Utils::Vector3i const &stop)
and std::invocable< Projector, unsigned, int > void for_each_3d(detail::IndexVectorConcept auto &&start, detail::IndexVectorConcept auto &&stop, detail::IndexVectorConcept auto &&counters, Kernel &&kernel, Projector &&projector=detail::noop_projector)
Repeat an operation on every element of a 3D grid.
std::vector< FloatType > grid_influence_function_dipolar(P3MParameters const &params, Utils::Vector3i const &n_start, Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l)
Map influence function over a grid.
void p3m_interpolate(P3MLocalMesh const &local_mesh, WeightsStorage< cao > const &weights, Kernel kernel)
P3M grid interpolation.
constexpr int p3m_min_cao
Minimal charge assignment order.
Definition math.hpp:48
constexpr int p3m_max_cao
Maximal charge assignment order.
Definition math.hpp:50
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
Definition math.hpp:38
System & get_system()
T product(Vector< T, N > const &v)
Definition Vector.hpp:372
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
Definition sqr.hpp:28
decltype(auto) integral_parameter(T i, Args &&...args)
Generate a call table for an integral non-type template parameter.
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
Definition math.hpp:71
auto get_analytic_cotangent_sum_kernel(int cao)
Definition math.hpp:146
STL namespace.
Exports for the NpT code.
Common functions for dipolar and charge P3M.
auto constexpr P3M_EPSILON_METALLIC
This value indicates metallic boundary conditions.
ESPRESSO_ATTR_ALWAYS_INLINE void kokkos_parallel_range_for(auto const &name, auto start, auto end, auto const &kernel)
Utils::Vector3i node_grid
double calc_surface_term(bool force_flag, bool energy_flag) override
void dipole_assign() override
void scaleby_box_l() override
double long_range_kernel(bool force_flag, bool energy_flag)
Compute the k-space part of forces and energies.
Base class for the magnetostatics P3M algorithm.
double sum_mu2
Sum of square of magnetic dipoles.
p3m_interpolation_cache inter_weights
FloatType value_type
std::size_t sum_dip_part
number of dipolar particles.
p3m_send_mesh< FloatType > halo_comm
double energy_correction
cached k-space self-energy correction
void resize_heffte_buffers()
struct DipolarP3MState::@1 heffte
void sanity_checks_boxl() const
Checks for correctness of the k-space cutoff.
void sanity_checks_cell_structure() const
P3MParameters const & dp3m_params
Definition dp3m.hpp:55
void sanity_checks_periodicity() const
void sanity_checks_node_grid() const
void recalc_ld_pos(P3MParameters const &params)
Recalculate quantities derived from the mesh and box length: ld_pos (position of the left down mesh).
Structure to hold P3M parameters and some dependent variables.
Utils::Vector3d cao_cut
cutoff for charge assignment.
double alpha
unscaled alpha_L for use with fast inline functions only
double r_cut_iL
cutoff radius for real space electrostatics (>0), rescaled to r_cut_iL = r_cut * box_l_i.
double accuracy
accuracy of the actual parameter set.
double alpha_L
Ewald splitting parameter (0.
double r_cut
unscaled r_cut_iL for use with fast inline functions only
void recalc_a_ai_cao_cut(Utils::Vector3d const &box_l)
Recalculate quantities derived from the mesh and box length: a, ai and cao_cut.
bool tuning
tuning or production?
Utils::Vector3i mesh
number of mesh points per coordinate direction (>0), in real space.
P3MParameters params
P3M base parameters.
P3MLocalMesh local_mesh
Local mesh geometry information for this MPI rank.
void operator()(auto &dp3m, double prefac, int d_rs, CellStructure &cell_structure) const
void operator()(auto &dp3m, double prefac, int d_rs, CellStructure &cell_structure) const