ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
dp3m_heffte.impl.hpp
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2026 The ESPResSo project
3 * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
4 * Max-Planck-Institute for Polymer Research, Theory Group
5 *
6 * This file is part of ESPResSo.
7 *
8 * ESPResSo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * ESPResSo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22#include <config/config.hpp>
23
24#ifdef ESPRESSO_DP3M
25
27
30
31#include "magnetostatics/dp3m_heffte.hpp" // must be included after dipoles.hpp
32
33#include "fft/fft.hpp"
34#include "p3m/P3MFFT.hpp"
36#include "p3m/TuningLogger.hpp"
37#include "p3m/common.hpp"
40#include "p3m/interpolation.hpp"
41#include "p3m/math.hpp"
42
43#include "BoxGeometry.hpp"
44#include "LocalBox.hpp"
45#include "Particle.hpp"
46#include "PropagationMode.hpp"
49#include "communication.hpp"
50#include "errorhandling.hpp"
52#include "npt.hpp"
53#include "system/System.hpp"
54#include "tuning.hpp"
55
56#include <utils/Vector.hpp>
59#include <utils/math/sqr.hpp>
60
61#include <boost/mpi/collectives/all_reduce.hpp>
62#include <boost/mpi/collectives/reduce.hpp>
63
64#include <Kokkos_Core.hpp>
65#include <omp.h>
66
67#include <algorithm>
68#include <array>
69#include <cmath>
70#include <cstddef>
71#include <cstdio>
72#include <functional>
73#include <iterator>
74#include <memory>
75#include <numbers>
76#include <optional>
77#include <span>
78#include <sstream>
79#include <stdexcept>
80#include <tuple>
81#include <utility>
82#include <vector>
83
84#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
85#ifndef NDEBUG
86template <typename T>
87bool heffte_almost_equal(T const &value, T const &reference) {
88 auto const diff = std::abs(value - reference);
89 using FT = std::remove_cvref_t<decltype(diff)>;
90 auto constexpr atol = std::is_same_v<FT, float> ? FT{2e-4} : FT{1e-6};
91 auto constexpr rtol = std::is_same_v<FT, float> ? FT{5e-5} : FT{1e-5};
92 auto const non_zero = std::abs(reference) != FT{0};
93 return (diff < atol) or (non_zero and (diff / std::abs(reference) < rtol));
94}
95#endif // not NDEBUG
96#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
97
98template <typename FloatType, Arch Architecture, class FFTConfig>
99void DipolarP3MHeffte<FloatType, Architecture,
101 auto local_n = std::size_t{0u};
102 double local_mu2 = 0.;
103
104 for (auto const &p : get_system().cell_structure->local_particles()) {
105 if (p.dipm() != 0.) {
106 local_mu2 += p.calc_dip().norm2();
107 local_n++;
108 }
109 }
110
111 boost::mpi::all_reduce(comm_cart, local_mu2, dp3m.sum_mu2, std::plus<>());
112 boost::mpi::all_reduce(comm_cart, local_n, dp3m.sum_dip_part, std::plus<>());
113}
114
115inline double dp3m_k_space_error(double box_size, int mesh, int cao,
116 std::size_t n_c_part, double sum_q2,
117 double alpha_L);
118
119inline double dp3m_real_space_error(double box_size, double r_cut_iL,
120 std::size_t n_c_part, double sum_q2,
121 double alpha_L);
122
123/** Compute the value of alpha through a bisection method.
124 * Based on eq. (33) @cite wang01a.
125 */
126double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part,
127 double sum_q2, double x1, double x2, double xacc,
128 double tuned_accuracy);
129
130template <typename FloatType, Arch Architecture, class FFTConfig>
131double DipolarP3MHeffte<FloatType, Architecture,
132 FFTConfig>::calc_average_self_energy_k_space() const {
133 auto const &box_geo = *get_system().box_geo;
134 auto const node_phi =
135 grid_influence_function_self_energy<FloatType, P3M_BRILLOUIN>(
136 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop, dp3m.g_energy);
137
138 double phi = 0.;
139 boost::mpi::reduce(comm_cart, node_phi, phi, std::plus<>(), 0);
140 phi /= 3. * box_geo.length()[0] * Utils::int_pow<3>(dp3m.params.mesh[0]);
141 return phi * std::numbers::pi;
142}
143
144template <typename FloatType, Arch Architecture, class FFTConfig>
146 assert(dp3m.params.mesh >= Utils::Vector3i::broadcast(1));
147 assert(dp3m.params.cao >= p3m_min_cao and dp3m.params.cao <= p3m_max_cao);
148 assert(dp3m.params.alpha > 0.);
149
150 auto const &system = get_system();
151 auto const &box_geo = *system.box_geo;
152 auto const &local_geo = *system.local_geo;
153 auto const verlet_skin = system.cell_structure->get_verlet_skin();
154
155 dp3m.params.cao3 = Utils::int_pow<3>(dp3m.params.cao);
156 dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
157
158 assert(dp3m.fft);
159 dp3m.local_mesh.calc_local_ca_mesh(dp3m.params, local_geo, verlet_skin, 0.);
160 dp3m.fft_buffers->init_halo();
161 dp3m.fft->init(dp3m.params);
162 dp3m.mesh.ks_pnum = dp3m.fft->get_ks_pnum();
163 dp3m.fft_buffers->init_meshes(dp3m.fft->get_ca_mesh_size());
164 dp3m.update_mesh_views();
165#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
166 dp3m.heffte.world_size = comm_cart.size();
167 dp3m.heffte.fft = std::make_shared<P3MFFT<FloatType, FFTConfig>>(
168 ::comm_cart, dp3m.params.mesh, dp3m.local_mesh.ld_no_halo,
169 dp3m.local_mesh.ur_no_halo, ::communicator.node_grid);
170 dp3m.resize_heffte_buffers();
171#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
172 dp3m.calc_differential_operator();
173
174 /* fix box length dependent constants */
175 scaleby_box_l();
176
178}
179
180namespace {
181template <int cao> struct AssignDipole {
182 void operator()(auto &dp3m, auto &cell_structure) {
183 using DipolarP3MState = std::remove_reference_t<decltype(dp3m)>;
184 using value_type = DipolarP3MState::value_type;
185 auto constexpr memory_order = Utils::MemoryOrder::ROW_MAJOR;
186 auto const &aosoa = cell_structure.get_aosoa();
187 auto const &unique_particles = cell_structure.get_unique_particles();
188 auto const n_part = cell_structure.count_local_particles();
189 dp3m.inter_weights.zfill(n_part); // allocate buffer for parallel write
191 "InterpolateDipoles", std::size_t{0u}, n_part, [&](auto p_index) {
192 auto const tid = omp_get_thread_num();
193 auto const p_pos = aosoa.get_span_at(aosoa.position, p_index);
194 auto const dip = unique_particles.at(p_index)->calc_dip();
195 auto const weights =
196 p3m_calculate_interpolation_weights<cao, memory_order>(
197 p_pos, dp3m.params.ai, dp3m.local_mesh);
198 dp3m.inter_weights.store_at(p_index, weights);
200 dp3m.local_mesh, weights, [&dip, tid, &dp3m](int ind, double w) {
201 dp3m.rs_fields_kokkos(tid, 0u, ind) += value_type(w * dip[0u]);
202 dp3m.rs_fields_kokkos(tid, 1u, ind) += value_type(w * dip[1u]);
203 dp3m.rs_fields_kokkos(tid, 2u, ind) += value_type(w * dip[2u]);
204 });
205 });
206 Kokkos::fence();
207 using execution_space = Kokkos::DefaultExecutionSpace;
208 int num_threads = execution_space().concurrency();
209 Kokkos::RangePolicy<execution_space> policy(std::size_t{0},
210 dp3m.local_mesh.size);
211 Kokkos::parallel_for("ReduceInterpolatedDipoles", policy,
212 [&dp3m, num_threads](std::size_t const i) {
213 for (int dir = 0; dir < 3; ++dir) {
214 value_type acc{};
215 for (int tid = 0; tid < num_threads; ++tid) {
216 acc += dp3m.rs_fields_kokkos(tid, dir, i);
217 }
218 dp3m.mesh.rs_fields[dir][i] += acc;
219#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
220 dp3m.heffte.rs_dipole_density[dir][i] += acc;
221#endif
222 }
223 });
224 Kokkos::fence();
225 }
226};
227} // namespace
228
229template <typename FloatType, Arch Architecture, class FFTConfig>
231 prepare_fft_mesh();
232
233 Utils::integral_parameter<int, AssignDipole, p3m_min_cao, p3m_max_cao>(
234 dp3m.params.cao, dp3m, *get_system().cell_structure);
235}
236
237namespace {
238template <int cao> struct AssignTorques {
239 void operator()(auto &dp3m, double prefac, int d_rs,
240 CellStructure &cell_structure) const {
241
242 assert(cao == dp3m.inter_weights.cao());
243
244 auto const kernel = [d_rs, &dp3m](auto const &pref, auto &p_torque,
245 std::size_t p_index) {
246 auto const weights = dp3m.inter_weights.template load<cao>(p_index);
247 Utils::Vector3d E{};
248 p3m_interpolate(dp3m.local_mesh, weights,
249 [&E, &dp3m, d_rs](int ind, double w) {
250 // heFFTe data: dp3m.heffte.ks_scalar.real()
251 E[d_rs] += w * double(dp3m.mesh.rs_scalar[ind]);
252 });
253
254 auto const torque = vector_product(pref, E);
255 auto const thread_id = omp_get_thread_num();
256 p_torque(p_index, thread_id, 0) -= torque[0];
257 p_torque(p_index, thread_id, 1) -= torque[1];
258 p_torque(p_index, thread_id, 2) -= torque[2];
259 };
260
261 auto const n_part = dp3m.inter_weights.size();
262 auto const &unique_particles = cell_structure.get_unique_particles();
263 auto &local_torque = cell_structure.get_local_torque();
265 "AssignTorques", std::size_t{0u}, n_part, [&](std::size_t p_index) {
266 auto const &p = *unique_particles.at(p_index);
267 if (p.dipm() != 0.) {
268 kernel(p.calc_dip() * prefac, local_torque, p_index);
269 }
270 });
271 }
272};
273
274template <int cao> struct AssignForcesDip {
275 void operator()(auto &dp3m, double prefac, int d_rs,
276 CellStructure &cell_structure) const {
277
278 assert(cao == dp3m.inter_weights.cao());
279
280 auto const kernel = [d_rs, &dp3m](auto const &pref, auto &p_force,
281 std::size_t p_index) {
282 auto const weights = dp3m.inter_weights.template load<cao>(p_index);
283
284 Utils::Vector3d E{};
285 p3m_interpolate(dp3m.local_mesh, weights, [&E, &dp3m](int ind, double w) {
286 // heFFTe data: dp3m.heffte.rs_B_fields
287 E[0u] += w * double(dp3m.mesh.rs_fields[0u][ind]);
288 E[1u] += w * double(dp3m.mesh.rs_fields[1u][ind]);
289 E[2u] += w * double(dp3m.mesh.rs_fields[2u][ind]);
290 });
291
292 auto const thread_id = omp_get_thread_num();
293 p_force(p_index, thread_id, d_rs) += pref * E;
294 };
295
296 auto const n_part = dp3m.inter_weights.size();
297 auto const &unique_particles = cell_structure.get_unique_particles();
298 auto &local_force = cell_structure.get_local_force();
300 "AssignForcesDip", std::size_t{0u}, n_part, [&](std::size_t p_index) {
301 auto const &p = *unique_particles.at(p_index);
302 if (p.dipm() != 0.) {
303 kernel(p.calc_dip() * prefac, local_force, p_index);
304 }
305 });
306 }
307};
308} // namespace
309
310#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
311template <typename FloatType, class FFTConfig>
313 auto const rs_array_size =
314 static_cast<std::size_t>(Utils::product(this->local_mesh.dim));
315 auto const rs_array_size_no_halo =
316 static_cast<std::size_t>(Utils::product(this->local_mesh.dim_no_halo));
317 auto const fft_mesh_size =
318 static_cast<std::size_t>(Utils::product(heffte.fft->ks_local_size()));
319 for (auto d : {0u, 1u, 2u}) {
320 heffte.rs_dipole_density[d].resize(rs_array_size);
321 heffte.ks_dipole_density[d].resize(fft_mesh_size);
322 heffte.rs_B_fields[d].resize(rs_array_size);
323 heffte.rs_B_fields_no_halo[d].resize(rs_array_size_no_halo);
324 }
325 heffte.ks_B_field_storage.resize(fft_mesh_size);
326 heffte.ks_scalar.resize(fft_mesh_size);
327}
328#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
329
330template <typename FloatType, Arch Architecture, class FFTConfig>
332 bool force_flag, bool energy_flag) {
333 /* k-space energy */
334 double energy = 0.;
335 auto const &system = get_system();
336 auto const &box_geo = *system.box_geo;
337 auto const dipole_prefac = prefactor / Utils::product(dp3m.params.mesh);
338#ifdef ESPRESSO_NPT
339 auto const npt_flag = force_flag and system.has_npt_enabled();
340#else
341 auto constexpr npt_flag = false;
342#endif
343
344 auto constexpr mesh_start = Utils::Vector3i::broadcast(0);
345 auto local_index = Utils::Vector3i::broadcast(0);
346#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
347 auto constexpr r2c_dir = FFTConfig::r2c_dir;
348 auto const rs_local_size = dp3m.heffte.fft->rs_local_size();
349 auto const local_size = dp3m.heffte.fft->ks_local_size();
350 auto local_size_full = local_size;
351 if constexpr (FFTConfig::use_r2c) {
352 local_size_full[r2c_dir] -= 1;
353 local_size_full[r2c_dir] *= 2;
354 }
355 auto const local_origin = dp3m.heffte.fft->ks_local_ld_index();
356#ifndef NDEBUG
357 auto const line_stride = local_size_full[0];
358 auto const plane_stride = local_size_full[0] * local_size_full[0];
359#endif
360 auto const &global_size = dp3m.params.mesh;
361 auto const cutoff_left = 1 - local_origin[r2c_dir];
362 auto const cutoff_right = global_size[r2c_dir] / 2 - local_origin[r2c_dir];
363 auto &short_dim = local_index[r2c_dir];
364 dp3m.resize_heffte_buffers();
365#endif
366
367 if (dp3m.sum_mu2 > 0.) {
368 dipole_assign();
369 dp3m.fft_buffers->perform_vector_halo_gather();
370 for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
371 dp3m.fft->forward_fft(rs_mesh);
372 }
373 dp3m.update_mesh_views();
374
375#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
376 if (dp3m.heffte.world_size == 1) {
377 // halo communication of real space dipoles density
378 std::array<FloatType *, 3u> rs_fields = {
379 {dp3m.heffte.rs_dipole_density[0u].data(),
380 dp3m.heffte.rs_dipole_density[1u].data(),
381 dp3m.heffte.rs_dipole_density[2u].data()}};
382 dp3m.heffte.halo_comm.gather_grid(::comm_cart, rs_fields,
383 dp3m.local_mesh.dim);
384
385 for (auto dir : {0u, 1u, 2u}) {
386 // get real-space dipoles density without ghost layers
387 auto rs_field_no_halo = extract_block<Utils::MemoryOrder::ROW_MAJOR,
388 FFTConfig::r_space_order>(
389 dp3m.heffte.rs_dipole_density[dir], dp3m.local_mesh.dim,
390 dp3m.local_mesh.n_halo_ld,
391 dp3m.local_mesh.dim - dp3m.local_mesh.n_halo_ur);
392 // re-order data in row-major
393 std::vector<FloatType> rs_field_no_halo_reorder;
394 rs_field_no_halo_reorder.resize(rs_field_no_halo.size());
395 std::size_t index_row_major = 0u;
396 for_each_3d_order<FFTConfig::k_space_order>(
397 mesh_start, rs_local_size, local_index, [&]() {
398 auto constexpr KX = 1, KY = 2, KZ = 0;
399 auto const index = local_index[KZ] +
400 rs_local_size[0] * local_index[KY] +
401 Utils::sqr(rs_local_size[0]) * local_index[KX];
402 rs_field_no_halo_reorder[index_row_major] =
403 rs_field_no_halo[index];
404 ++index_row_major;
405 });
406 dp3m.heffte.fft->forward(rs_field_no_halo_reorder.data(),
407 dp3m.heffte.ks_dipole_density[dir].data());
408#ifndef NDEBUG
409 if (not dp3m.params.tuning) {
410 std::size_t index_row_major_r2c = 0u;
411 for_each_3d_order<FFTConfig::k_space_order>(
412 mesh_start, local_size, local_index, [&]() {
413 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
414 auto constexpr KX = 2, KY = 0, KZ = 1;
415 auto const index_fft_legacy = local_index[KZ] +
416 line_stride * local_index[KY] +
417 plane_stride * local_index[KX];
418 auto const old_value = std::complex<FloatType>{
419 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy],
420 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy + 1]};
421 auto const &new_value =
422 dp3m.heffte.ks_dipole_density[dir][index_row_major_r2c];
423 assert(heffte_almost_equal(new_value, old_value));
424 ++index_row_major_r2c;
425 }
426 });
427 }
428#endif // not NDEBUG
429 }
430 }
431#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
432 }
433
434 /* === k-space energy calculation === */
435 if (energy_flag or npt_flag) {
436 /*********************
437 Dipolar energy
438 **********************/
439 if (dp3m.sum_mu2 > 0.) {
440 /* i*k differentiation for dipolar gradients:
441 * |(\Fourier{\vect{mu}}(k)\cdot \vect{k})|^2 */
442
443 auto index = std::size_t(0u);
444 auto it_energy = dp3m.g_energy.begin();
445 auto node_energy = 0.;
446 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
447 auto constexpr KX = 2, KY = 0, KZ = 1;
448 auto const shift = local_index + dp3m.mesh.start;
449 auto const &d_op = dp3m.d_op[0u];
450 auto const &mesh_dip = dp3m.mesh.rs_fields;
451 // Re(mu)*k
452 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
453 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
454 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
455 ++index;
456 // Im(mu)*k
457 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
458 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
459 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
460 ++index;
461 node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
462 std::advance(it_energy, 1);
463 });
464#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
465 if (dp3m.heffte.world_size == 1) {
466 [[maybe_unused]] auto node_energy_heffte = 0.;
467 std::size_t index_row_major_r2c = 0u;
468 for_each_3d_order<FFTConfig::k_space_order>(
469 mesh_start, local_size, local_index, [&]() {
470 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
471 auto const global_index = local_origin + local_index;
472 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
473 auto const cell_field =
474 mesh_dip[0u][index_row_major_r2c] *
475 FloatType(dp3m.d_op[0u][global_index[1u]]) +
476 mesh_dip[1u][index_row_major_r2c] *
477 FloatType(dp3m.d_op[1u][global_index[2u]]) +
478 mesh_dip[2u][index_row_major_r2c] *
479 FloatType(dp3m.d_op[2u][global_index[0u]]);
480 auto cell_energy = static_cast<double>(
481 dp3m.heffte.g_energy[index_row_major_r2c] *
482 std::norm(cell_field));
483 if (FFTConfig::use_r2c and (short_dim >= cutoff_left and
484 short_dim <= cutoff_right - 1)) {
485 // k-space symmetry: double counting except in the first and
486 // last planes of the short dimension; although the wavevector
487 // points in the opposite direction in the redundant region of
488 // k-space, the product of two components of the wavevector
489 // cancels out the negative sign
490 cell_energy *= 2.;
491 }
492 node_energy_heffte += cell_energy;
493 }
494 ++index_row_major_r2c;
495 });
496 assert(heffte_almost_equal(static_cast<FloatType>(node_energy_heffte),
497 static_cast<FloatType>(node_energy)));
498 }
499#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
500 node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
501 boost::mpi::reduce(comm_cart, node_energy, energy, std::plus<>(), 0);
502
503 if (dp3m.energy_correction == 0.)
504 calc_energy_correction();
505
506 if (this_node == 0) {
507 /* self energy correction */
508 energy -= prefactor * dp3m.sum_mu2 * std::numbers::inv_sqrtpi *
509 (2. / 3.) * Utils::int_pow<3>(dp3m.params.alpha);
510
511 /* dipolar energy correction due to systematic Madelung-self effects */
512 energy += prefactor * dp3m.energy_correction / box_geo.volume();
513 }
514 }
515 } // if (energy_flag)
516
517 /* === k-space force calculation === */
518 if (force_flag) {
519 /****************************
520 * DIPOLAR TORQUES (k-space)
521 ****************************/
522 if (dp3m.sum_mu2 > 0.) {
523 auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
524 dp3m.ks_scalar.resize(dp3m.local_mesh.size);
525 /* fill in ks_scalar array for torque calculation */
526 {
527 auto index{std::size_t(0u)};
528 auto it_energy = dp3m.g_energy.begin();
529 auto it_ks_scalar = dp3m.ks_scalar.begin();
530 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() mutable {
531 auto constexpr KX = 2, KY = 0, KZ = 1;
532 auto const shift = local_index + dp3m.mesh.start;
533 auto const &d_op = dp3m.d_op[0u];
534 auto const &mesh_dip = dp3m.mesh.rs_fields;
535 // Re(mu)*k
536 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
537 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
538 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
539 ++index;
540 // Im(mu)*k
541 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
542 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
543 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
544 ++index;
545 *it_ks_scalar = *it_energy * std::complex<FloatType>{re, im};
546 std::advance(it_energy, 1);
547 std::advance(it_ks_scalar, 1);
548 });
549 }
550#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
551 if (dp3m.heffte.world_size == 1) {
552 std::size_t index_row_major_r2c = 0u;
553 for_each_3d_order<FFTConfig::k_space_order>(
554 mesh_start, local_size, local_index, [&]() {
555 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
556 auto const global_index = local_origin + local_index;
557 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
558 dp3m.heffte.ks_scalar[index_row_major_r2c] =
559 dp3m.heffte.g_energy[index_row_major_r2c] *
560 (mesh_dip[0u][index_row_major_r2c] *
561 FloatType(dp3m.d_op[0u][global_index[1u]]) +
562 mesh_dip[1u][index_row_major_r2c] *
563 FloatType(dp3m.d_op[1u][global_index[2u]]) +
564 mesh_dip[2u][index_row_major_r2c] *
565 FloatType(dp3m.d_op[2u][global_index[0u]]));
566#ifndef NDEBUG
567 if (not dp3m.params.tuning) {
568 auto constexpr KX = 2, KY = 0, KZ = 1;
569 auto const index_fft_legacy = local_index[KZ] +
570 line_stride * local_index[KY] +
571 plane_stride * local_index[KX];
572 assert(heffte_almost_equal(
573 dp3m.heffte.ks_scalar[index_row_major_r2c],
574 dp3m.ks_scalar[index_fft_legacy]));
575 }
576#endif // not NDEBUG
577 ++index_row_major_r2c;
578 }
579 });
580 }
581#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
582
583 /* Torque component loop */
584 for (int d = 0; d < 3; d++) {
585 auto it_ks_scalar = dp3m.ks_scalar.begin();
586 auto index = 0u;
587 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
588 auto const &offset = dp3m.mesh.start;
589 auto const &d_op = dp3m.d_op[0u];
590 auto const d_op_val = FloatType(d_op[local_index[d] + offset[d]]);
591 auto const &value = *it_ks_scalar;
592 dp3m.mesh.rs_scalar[index] = d_op_val * value.real();
593 ++index;
594 dp3m.mesh.rs_scalar[index] = d_op_val * value.imag();
595 ++index;
596 std::advance(it_ks_scalar, 1);
597 });
598#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
599 if (dp3m.heffte.world_size == 1) {
600 unsigned int constexpr d_ks[3] = {2u, 0u, 1u};
601 std::size_t index_row_major_r2c = 0u;
602 for_each_3d_order<FFTConfig::k_space_order>(
603 mesh_start, local_size, local_index, [&]() {
604 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
605 auto const global_index = local_origin + local_index;
606 auto const d_op_val =
607 FloatType(dp3m.d_op[d][global_index[d_ks[d]]]);
608 dp3m.heffte.ks_B_field_storage[index_row_major_r2c] =
609 d_op_val * dp3m.heffte.ks_scalar[index_row_major_r2c];
610#ifndef NDEBUG
611 if (not dp3m.params.tuning) {
612 auto constexpr KX = 2, KY = 0, KZ = 1;
613 auto const index_fft_legacy =
614 local_index[KZ] + line_stride * local_index[KY] +
615 plane_stride * local_index[KX];
616 auto const old_value = std::complex<FloatType>{
617 dp3m.mesh.rs_scalar[2 * index_fft_legacy],
618 dp3m.mesh.rs_scalar[2 * index_fft_legacy + 1]};
619 auto const &new_value =
620 dp3m.heffte.ks_B_field_storage[index_row_major_r2c];
621 assert(heffte_almost_equal(new_value, old_value));
622 }
623#endif // not NDEBUG
624 ++index_row_major_r2c;
625 }
626 });
627 dp3m.heffte.fft->backward(dp3m.heffte.ks_B_field_storage.data(),
628 dp3m.heffte.rs_B_fields_no_halo[d].data());
629 // pad zeros around the B-field in real space for ghost layers
630 dp3m.heffte.rs_B_fields[d] =
631 pad_with_zeros_discard_imag<FFTConfig::r_space_order,
633 std::span(dp3m.heffte.rs_B_fields_no_halo[d]),
634 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
635 dp3m.local_mesh.n_halo_ur);
636 // communicate ghost layers of the B-field in real space
637 dp3m.heffte.halo_comm.spread_grid(::comm_cart,
638 dp3m.heffte.rs_B_fields[d].data(),
639 dp3m.local_mesh.dim);
640 }
641#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
642 dp3m.fft->backward_fft(dp3m.fft_buffers->get_scalar_mesh());
643 // communicate ghost layers of the B-field in real space
644 dp3m.fft_buffers->perform_scalar_halo_spread();
645 // assign torque component from mesh to particle
646 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
647 Utils::integral_parameter<int, AssignTorques, p3m_min_cao, p3m_max_cao>(
648 dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs,
649 *system.cell_structure);
650 }
651
652 /***************************
653 DIPOLAR FORCES (k-space)
654 ****************************/
655 // Compute forces after torques because the algorithm below overwrites the
656 // grids dp3m.mesh.rs_fields !
657 // Note: I'll do here 9 inverse FFTs. By symmetry, we can reduce this
658 // number to 6 !
659 /* fill in ks_scalar array for force calculation */
660 {
661 auto it_force = dp3m.g_force.begin();
662 auto it_ks_scalar = dp3m.ks_scalar.begin();
663 std::size_t index = 0u;
664 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
665 auto constexpr KX = 2, KY = 0, KZ = 1;
666 auto const shift = local_index + dp3m.mesh.start;
667 auto const &d_op = dp3m.d_op[0u];
668 auto const &mesh_dip = dp3m.mesh.rs_fields;
669 // Re(mu)*k
670 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
671 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
672 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
673 ++index;
674 // Im(mu)*k
675 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
676 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
677 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
678 ++index;
679 *it_ks_scalar = {*it_force * im, *it_force * (-re)};
680 std::advance(it_force, 1);
681 std::advance(it_ks_scalar, 1);
682 });
683 }
684
685#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
686 if (dp3m.heffte.world_size == 1) {
687 std::size_t index_row_major_r2c = 0u;
688 for_each_3d_order<FFTConfig::k_space_order>(
689 mesh_start, local_size, local_index, [&]() {
690 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
691 auto const global_index = local_origin + local_index;
692 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
693 auto const value =
694 dp3m.heffte.g_force[index_row_major_r2c] *
695 (mesh_dip[0u][index_row_major_r2c] *
696 FloatType(dp3m.d_op[0u][global_index[1u]]) +
697 mesh_dip[1u][index_row_major_r2c] *
698 FloatType(dp3m.d_op[1u][global_index[2u]]) +
699 mesh_dip[2u][index_row_major_r2c] *
700 FloatType(dp3m.d_op[2u][global_index[0u]]));
701 dp3m.heffte.ks_scalar[index_row_major_r2c] = {value.imag(),
702 -value.real()};
703#ifndef NDEBUG
704 if (not dp3m.params.tuning) {
705 auto constexpr KX = 2, KY = 0, KZ = 1;
706 auto const index_fft_legacy = local_index[KZ] +
707 line_stride * local_index[KY] +
708 plane_stride * local_index[KX];
709 assert(heffte_almost_equal(
710 dp3m.heffte.ks_scalar[index_row_major_r2c],
711 dp3m.ks_scalar[index_fft_legacy]));
712 }
713#endif // not NDEBUG
714 ++index_row_major_r2c;
715 }
716 });
717 }
718#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
719
720 /* Force component loop */
721 for (int d = 0; d < 3; d++) {
722 std::size_t index = 0u;
723 auto it_ks_scalar = dp3m.ks_scalar.begin();
724 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
725 auto constexpr KX = 2, KY = 0, KZ = 1;
726 auto const shift = local_index + dp3m.mesh.start;
727 auto const &d_op = dp3m.d_op[0u];
728 auto const &mesh_dip = dp3m.mesh.rs_fields;
729 auto const d_op_val = FloatType(d_op[shift[d]]);
730 auto const f = *it_ks_scalar * d_op_val;
731 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.real();
732 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.real();
733 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.real();
734 ++index;
735 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.imag();
736 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.imag();
737 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.imag();
738 ++index;
739 std::advance(it_ks_scalar, 1);
740 });
741
742#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
743 if (dp3m.heffte.world_size == 1) {
744 std::size_t index_row_major_r2c = 0u;
745 for_each_3d_order<FFTConfig::k_space_order>(
746 mesh_start, local_size, local_index, [&]() {
747 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
748 auto constexpr KX = 1, KY = 2, KZ = 0;
749 auto const global_index = local_origin + local_index;
750 auto const remapped_index =
751 local_index[KZ] + local_index[KY] * local_size[KZ] +
752 local_index[KX] * local_size[KZ] * local_size[KY];
753 auto const d_op_val =
754 FloatType(dp3m.d_op[d][global_index[d]]);
755 auto &mesh_dip = dp3m.heffte.ks_dipole_density;
756 mesh_dip[0u][index_row_major_r2c] =
757 FloatType(dp3m.d_op[d][global_index[2u]]) * d_op_val *
758 dp3m.heffte.ks_scalar[remapped_index];
759 mesh_dip[1u][index_row_major_r2c] =
760 FloatType(dp3m.d_op[d][global_index[0u]]) * d_op_val *
761 dp3m.heffte.ks_scalar[remapped_index];
762 mesh_dip[2u][index_row_major_r2c] =
763 FloatType(dp3m.d_op[d][global_index[1u]]) * d_op_val *
764 dp3m.heffte.ks_scalar[remapped_index];
765#ifndef NDEBUG
766 if (not FFTConfig::use_r2c and not dp3m.params.tuning) {
767 auto const index_fft_legacy = local_index[2] +
768 line_stride * local_index[1] +
769 plane_stride * local_index[0];
770 for (int j = 0; j < 3; ++j) {
771 auto const old_value = std::complex<FloatType>{
772 dp3m.mesh.rs_fields[j][2 * index_fft_legacy],
773 dp3m.mesh.rs_fields[j][2 * index_fft_legacy + 1]};
774 auto const &new_value = mesh_dip[j][index_row_major_r2c];
775 assert(heffte_almost_equal(new_value, old_value));
776 }
777 }
778#endif // not NDEBUG
779 ++index_row_major_r2c;
780 }
781 });
782 for (int dir = 0u; dir < 3u; ++dir) {
783 dp3m.heffte.fft->backward(
784 dp3m.heffte.ks_dipole_density[dir].data(),
785 dp3m.heffte.rs_B_fields_no_halo[dir].data());
786 // pad zeros around the B-field in real space for ghost layers
787 dp3m.heffte.rs_B_fields[d] =
788 pad_with_zeros_discard_imag<FFTConfig::r_space_order,
790 std::span(dp3m.heffte.rs_B_fields_no_halo[dir]),
791 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
792 dp3m.local_mesh.n_halo_ur);
793 }
794 // communicate ghost layers of the B-field in real space
795 auto rs_fields =
796 std::array<FloatType *, 3u>{{dp3m.heffte.rs_B_fields[0u].data(),
797 dp3m.heffte.rs_B_fields[1u].data(),
798 dp3m.heffte.rs_B_fields[2u].data()}};
799 dp3m.heffte.halo_comm.spread_grid(::comm_cart, rs_fields,
800 dp3m.local_mesh.dim);
801 }
802#endif // ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
803 for (auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
804 dp3m.fft->backward_fft(rs_mesh);
805 }
806 // communicate ghost layers of the B-field in real space
807 dp3m.fft_buffers->perform_vector_halo_spread();
808 // assign force component from mesh to particle
809 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
810 Utils::integral_parameter<int, AssignForcesDip, p3m_min_cao,
812 dp3m.params.cao, dp3m, dipole_prefac * Utils::sqr(wavenumber), d_rs,
813 *system.cell_structure);
814 }
815 } /* if (dp3m.sum_mu2 > 0) */
816 } /* if (force_flag) */
817
818 if (dp3m.params.epsilon != P3M_EPSILON_METALLIC) {
819 auto const surface_term =
820 calc_surface_term(force_flag, energy_flag or npt_flag);
821 if (this_node == 0) {
822 energy += surface_term;
823 }
824 }
825#ifdef ESPRESSO_NPT
826 if (npt_flag) {
828 }
829#endif
830 if (not energy_flag) {
831 energy = 0.;
832 }
833
834 return energy;
835}
836
837template <typename FloatType, Arch Architecture, class FFTConfig>
839 bool force_flag, bool energy_flag) {
840 auto const &system = get_system();
841 auto const &box_geo = *system.box_geo;
842 auto const particles = system.cell_structure->local_particles();
843 auto const pref = prefactor * 4. * std::numbers::pi / box_geo.volume() /
844 (2. * dp3m.params.epsilon + 1.);
845 auto const n_local_part = particles.size();
846
847 // We put all the dipolar momenta in a the arrays mx,my,mz according to the
848 // id-number of the particles
849 std::vector<double> mx(n_local_part);
850 std::vector<double> my(n_local_part);
851 std::vector<double> mz(n_local_part);
852
853 std::size_t ip = 0u;
854 for (auto const &p : particles) {
855 auto const dip = p.calc_dip();
856 mx[ip] = dip[0u];
857 my[ip] = dip[1u];
858 mz[ip] = dip[2u];
859 ip++;
860 }
861
862 // we will need the sum of all dipolar momenta vectors
863 auto local_dip = Utils::Vector3d{};
864 for (std::size_t i = 0u; i < n_local_part; i++) {
865 local_dip[0u] += mx[i];
866 local_dip[1u] += my[i];
867 local_dip[2u] += mz[i];
868 }
869 auto const box_dip =
870 boost::mpi::all_reduce(comm_cart, local_dip, std::plus<>());
871
872 double energy = 0.;
873 if (energy_flag) {
874 double sum_e = 0.;
875 for (std::size_t i = 0u; i < n_local_part; i++) {
876 sum_e += mx[i] * box_dip[0] + my[i] * box_dip[1] + mz[i] * box_dip[2];
877 }
878 energy =
879 0.5 * pref * boost::mpi::all_reduce(comm_cart, sum_e, std::plus<>());
880 }
881
882 if (force_flag) {
883
884 std::vector<double> sumix(n_local_part);
885 std::vector<double> sumiy(n_local_part);
886 std::vector<double> sumiz(n_local_part);
887
888 for (std::size_t i = 0u; i < n_local_part; i++) {
889 sumix[i] = my[i] * box_dip[2u] - mz[i] * box_dip[1u];
890 sumiy[i] = mz[i] * box_dip[0u] - mx[i] * box_dip[2u];
891 sumiz[i] = mx[i] * box_dip[1u] - my[i] * box_dip[0u];
892 }
893
894 ip = 0u;
895 for (auto &p : particles) {
896 auto &torque = p.torque();
897 torque[0u] -= pref * sumix[ip];
898 torque[1u] -= pref * sumiy[ip];
899 torque[2u] -= pref * sumiz[ip];
900 ip++;
901 }
902 }
903
904 return energy;
905}
906
907template <typename FloatType, Arch Architecture, class FFTConfig>
908void DipolarP3MHeffte<FloatType, Architecture,
909 FFTConfig>::calc_influence_function_force() {
910 dp3m.g_force = grid_influence_function_dipolar<FloatType, 3, P3M_BRILLOUIN,
911 FFTConfig::k_space_order>(
912 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
913 get_system().box_geo->length_inv());
914#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
915 if (dp3m.heffte.world_size == 1) {
916 dp3m.heffte.g_force =
918 FFTConfig::k_space_order>(
919 dp3m.params, dp3m.heffte.fft->ks_local_ld_index(),
920 dp3m.heffte.fft->ks_local_ur_index(),
921 get_system().box_geo->length_inv());
922 if constexpr (FFTConfig::use_r2c) {
923 influence_function_r2c<FFTConfig::r2c_dir>(
924 dp3m.heffte.g_force, dp3m.params.mesh,
925 dp3m.heffte.fft->ks_local_size(),
926 dp3m.heffte.fft->ks_local_ld_index());
927 }
928 }
929#endif
930}
931
932template <typename FloatType, Arch Architecture, class FFTConfig>
933void DipolarP3MHeffte<FloatType, Architecture,
934 FFTConfig>::calc_influence_function_energy() {
935 dp3m.g_energy = grid_influence_function_dipolar<FloatType, 2, P3M_BRILLOUIN,
936 FFTConfig::k_space_order>(
937 dp3m.params, dp3m.mesh.start, dp3m.mesh.stop,
938 get_system().box_geo->length_inv());
939#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
940 if (dp3m.heffte.world_size == 1) {
941 dp3m.heffte.g_energy =
943 FFTConfig::k_space_order>(
944 dp3m.params, dp3m.heffte.fft->ks_local_ld_index(),
945 dp3m.heffte.fft->ks_local_ur_index(),
946 get_system().box_geo->length_inv());
947 if constexpr (FFTConfig::use_r2c) {
948 influence_function_r2c<FFTConfig::r2c_dir>(
949 dp3m.heffte.g_energy, dp3m.params.mesh,
950 dp3m.heffte.fft->ks_local_size(),
951 dp3m.heffte.fft->ks_local_ld_index());
952 }
953 }
954#endif
955}
956
957template <typename FloatType, Arch Architecture, class FFTConfig>
960 int m_mesh_max = -1, m_mesh_min = -1;
961 std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
962
963public:
964 DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m,
965 double prefactor, int timings,
966 decltype(m_tune_limits) tune_limits)
967 : TuningAlgorithm(system, prefactor, timings), dp3m{input_dp3m},
968 m_tune_limits{std::move(tune_limits)} {}
969
970 P3MParameters &get_params() override { return dp3m.params; }
971
972 void on_solver_change() const override { m_system.on_dipoles_change(); }
973
974 std::optional<std::string>
975 layer_correction_veto_r_cut(double) const override {
976 return {};
977 }
978
979 void setup_logger(bool verbose) override {
980 auto const &box_geo = *m_system.box_geo;
981 m_logger = std::make_unique<TuningLogger>(
982 verbose and this_node == 0, "DipolarP3M", TuningLogger::Mode::Dipolar);
983 m_logger->tuning_goals(dp3m.params.accuracy, m_prefactor,
984 box_geo.length()[0], dp3m.sum_dip_part,
985 dp3m.sum_mu2);
986 m_logger->log_tuning_start();
987 }
988
989 std::tuple<double, double, double, double>
991 double r_cut_iL) const override {
992
993 double alpha_L, rs_err, ks_err;
994 auto const &box_geo = *m_system.box_geo;
995
996 /* calc maximal real space error for setting */
997 rs_err = dp3m_real_space_error(box_geo.length()[0], r_cut_iL,
998 dp3m.sum_dip_part, dp3m.sum_mu2, 0.001);
999 // alpha cannot be zero for dipoles because real-space formula breaks down
1000
1001 if (std::numbers::sqrt2 * rs_err > dp3m.params.accuracy) {
1002 /* assume rs_err = ks_err -> rs_err = accuracy/sqrt(2.0) -> alpha_L */
1003 alpha_L = dp3m_rtbisection(
1004 box_geo.length()[0], r_cut_iL, dp3m.sum_dip_part, dp3m.sum_mu2,
1005 0.0001 * box_geo.length()[0], 5. * box_geo.length()[0], 0.0001,
1006 dp3m.params.accuracy);
1007 } else {
1008 /* even alpha=0 is ok, however, we cannot choose it since it kills the
1009 k-space error formula.
1010 Anyways, this very likely NOT the optimal solution */
1011 alpha_L = 0.1;
1012 }
1013
1014 /* calculate real-space and k-space error for this alpha_L */
1015 rs_err = dp3m_real_space_error(box_geo.length()[0], r_cut_iL,
1016 dp3m.sum_dip_part, dp3m.sum_mu2, alpha_L);
1017 ks_err = dp3m_k_space_error(box_geo.length()[0], mesh[0], cao,
1018 dp3m.sum_dip_part, dp3m.sum_mu2, alpha_L);
1019
1020 return {Utils::Vector2d{rs_err, ks_err}.norm(), rs_err, ks_err, alpha_L};
1021 }
1022
1023 void determine_mesh_limits() override {
1024 if (dp3m.params.mesh[0] == -1) {
1025 /* simple heuristic to limit the tried meshes if the accuracy cannot
1026 be obtained with smaller meshes, but normally not all these
1027 meshes have to be tested */
1028 auto const expo = std::log(std::cbrt(dp3m.sum_dip_part)) / std::log(2.);
1029 /* Medium-educated guess for the minimal mesh */
1030 m_mesh_min = static_cast<int>(std::round(std::pow(2., std::floor(expo))));
1031 /* avoid using more than 1 GB of FFT arrays */
1032 m_mesh_max = 128;
1033 if (m_tune_limits.first) {
1034 m_mesh_min = *m_tune_limits.first;
1035 }
1036 if (m_tune_limits.second) {
1037 m_mesh_max = *m_tune_limits.second;
1038 }
1039 } else {
1040 m_mesh_min = m_mesh_max = dp3m.params.mesh[0];
1041 m_logger->report_fixed_mesh(dp3m.params.mesh);
1042 }
1043 }
1044
1046 auto tuned_params = TuningAlgorithm::Parameters{};
1047 auto time_best = time_sentinel;
1048 for (auto tmp_mesh = m_mesh_min; tmp_mesh <= m_mesh_max; tmp_mesh += 2) {
1049 auto trial_params = TuningAlgorithm::Parameters{};
1050 trial_params.mesh = Utils::Vector3i::broadcast(tmp_mesh);
1051 trial_params.cao = cao_best;
1052
1053 auto const trial_time =
1054 get_m_time(trial_params.mesh, trial_params.cao, trial_params.r_cut_iL,
1055 trial_params.alpha_L, trial_params.accuracy);
1056
1057 /* this mesh does not work at all */
1058 if (trial_time < 0.)
1059 continue;
1060
1061 /* the optimum r_cut for this mesh is the upper limit for higher meshes,
1062 everything else is slower */
1063 m_r_cut_iL_max = trial_params.r_cut_iL;
1064
1065 if (trial_time < time_best) {
1066 /* new optimum */
1068 tuned_params = trial_params;
1069 time_best = tuned_params.time = trial_time;
1070 } else if (trial_time > time_best + time_granularity or
1072 /* no hope of further optimisation */
1073 break;
1074 }
1075 }
1076 return tuned_params;
1077 }
1078};
1079
1080template <typename FloatType, Arch Architecture, class FFTConfig>
1082 auto &system = get_system();
1083 auto const &box_geo = *system.box_geo;
1084 if (dp3m.params.alpha_L == 0. and dp3m.params.alpha != 0.) {
1085 dp3m.params.alpha_L = dp3m.params.alpha * box_geo.length()[0];
1086 }
1087 if (dp3m.params.r_cut_iL == 0. and dp3m.params.r_cut != 0.) {
1088 dp3m.params.r_cut_iL = dp3m.params.r_cut * box_geo.length_inv()[0];
1089 }
1090 if (not is_tuned()) {
1092 if (dp3m.sum_dip_part == 0) {
1093 throw std::runtime_error(
1094 "DipolarP3M: no dipolar particles in the system");
1095 }
1096 try {
1098 system, dp3m, prefactor, tuning.timings, tuning.limits);
1099 parameters.setup_logger(tuning.verbose);
1100 // parameter ranges
1101 parameters.determine_mesh_limits();
1102 parameters.determine_r_cut_limits();
1103 parameters.determine_cao_limits(3);
1104 // run tuning algorithm
1105 parameters.tune();
1106 m_is_tuned = true;
1107 system.on_dipoles_change();
1108 } catch (...) {
1109 dp3m.params.tuning = false;
1110 throw;
1111 }
1112 }
1113 init();
1114}
1115
1116/** Tuning dipolar-P3M */
1117inline auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh,
1118 double mesh_i, int cao, double alpha_L_i) {
1119
1120 auto constexpr mesh_start = Utils::Vector3i::broadcast(-P3M_BRILLOUIN);
1121 auto constexpr mesh_stop = Utils::Vector3i::broadcast(P3M_BRILLOUIN + 1);
1122 auto const factor1 = Utils::sqr(std::numbers::pi * alpha_L_i);
1123 auto alias1 = 0.;
1124 auto alias2 = 0.;
1125
1126 Utils::Vector3i indices{};
1127 Utils::Vector3i nm{};
1128 Utils::Vector3d fnm{};
1130 mesh_start, mesh_stop, indices,
1131 [&]() {
1132 auto const norm_sq = nm.norm2();
1133 auto const ex = std::exp(-factor1 * norm_sq);
1134 auto const U2 = std::pow(Utils::product(fnm), 2 * cao);
1135 alias1 += Utils::sqr(ex) * norm_sq;
1136 alias2 += U2 * ex * std::pow(shift * nm, 3) / norm_sq;
1137 },
1138 [&](unsigned dim, int n) {
1139 nm[dim] = shift[dim] + n * mesh;
1140 fnm[dim] = math::sinc(nm[dim] * mesh_i);
1141 });
1142
1143 return std::make_pair(alias1, alias2);
1144}
1145
1146/** Calculate the k-space error of dipolar-P3M */
1147inline double dp3m_k_space_error(double box_size, int mesh, int cao,
1148 std::size_t n_c_part, double sum_q2,
1149 double alpha_L) {
1150
1151 auto const cotangent_sum = math::get_analytic_cotangent_sum_kernel(cao);
1152 auto const mesh_i = 1. / static_cast<double>(mesh);
1153 auto const alpha_L_i = 1. / alpha_L;
1154 auto const mesh_stop = Utils::Vector3i::broadcast(mesh / 2);
1155 auto const mesh_start = -mesh_stop;
1156 auto indices = Utils::Vector3i{};
1157 auto values = Utils::Vector3d{};
1158 auto he_q = 0.;
1159
1161 mesh_start, mesh_stop, indices,
1162 [&]() {
1163 if ((indices[0] != 0) or (indices[1] != 0) or (indices[2] != 0)) {
1164 auto const n2 = indices.norm2();
1165 auto const cs = Utils::product(values);
1166 auto const [alias1, alias2] =
1167 dp3m_tune_aliasing_sums(indices, mesh, mesh_i, cao, alpha_L_i);
1168 auto const d =
1169 alias1 - Utils::sqr(alias2 / cs) /
1170 Utils::int_pow<3>(static_cast<double>(n2));
1171 /* at high precision, d can become negative due to extinction;
1172 also, don't take values that have no significant digits left*/
1173 if (d > 0. and std::fabs(d / alias1) > round_error_prec)
1174 he_q += d;
1175 }
1176 },
1177 [&values, &mesh_i, cotangent_sum](unsigned dim, int n) {
1178 values[dim] = cotangent_sum(n, mesh_i);
1179 });
1180
1181 return 8. * Utils::sqr(std::numbers::pi) / 3. * sum_q2 *
1182 sqrt(he_q / static_cast<double>(n_c_part)) /
1183 Utils::int_pow<4>(box_size);
1184}
1185
1186/** Calculate the value of the errors for the REAL part of the force in terms
1187 * of the splitting parameter alpha of Ewald. Based on eq. (33) @cite wang01a.
1188 *
1189 * Please note that in this more refined approach we don't use
1190 * eq. (37), but eq. (33) which maintains all the powers in alpha.
1191 */
1192inline double dp3m_real_space_error(double box_size, double r_cut_iL,
1193 std::size_t n_c_part, double sum_q2,
1194 double alpha_L) {
1195 auto constexpr exp_min = -708.4; // for IEEE-compatible double
1196 double d_error_f, d_cc, d_dc, d_con;
1197
1198 auto const d_rcut = r_cut_iL * box_size;
1199 auto const d_rcut2 = Utils::sqr(d_rcut);
1200 auto const d_rcut4 = Utils::sqr(d_rcut2);
1201
1202 auto const d_a2 = Utils::sqr(alpha_L) / Utils::sqr(box_size);
1203 auto const exponent = -d_a2 * d_rcut2;
1204 auto const exp_term = (exponent < exp_min) ? 0. : std::exp(exponent);
1205 auto const d_c = sum_q2 * exp_term;
1206
1207 d_cc = 4. * Utils::sqr(d_a2) * Utils::sqr(d_rcut2) + 6. * d_a2 * d_rcut2 + 3.;
1208
1209 d_dc = 8. * Utils::int_pow<3>(d_a2) * Utils::int_pow<3>(d_rcut2) +
1210 20. * Utils::sqr(d_a2) * d_rcut4 + 30. * d_a2 * d_rcut2 + 15.;
1211
1212 d_con = 1. / sqrt(Utils::int_pow<3>(box_size) * Utils::sqr(d_a2) * d_rcut *
1213 Utils::sqr(d_rcut4) * static_cast<double>(n_c_part));
1214
1215 d_error_f = d_c * d_con *
1216 sqrt((13. / 6.) * Utils::sqr(d_cc) +
1217 (2. / 15.) * Utils::sqr(d_dc) - (13. / 15.) * d_cc * d_dc);
1218
1219 return d_error_f;
1220}
1221
1222/** Using bisection, find the root of a function "func-tuned_accuracy/sqrt(2.)"
1223 * known to lie between x1 and x2. The root, returned as rtbis, will be
1224 * refined until its accuracy is \f$\pm\f$ @p xacc.
1225 */
1226double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part,
1227 double sum_q2, double x1, double x2, double xacc,
1228 double tuned_accuracy) {
1229 constexpr int JJ_RTBIS_MAX = 40;
1230
1231 auto const constant = tuned_accuracy / std::numbers::sqrt2;
1232
1233 auto const f1 =
1234 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, x1) -
1235 constant;
1236 auto const f2 =
1237 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, x2) -
1238 constant;
1239 if (f1 * f2 >= 0.0) {
1240 throw std::runtime_error(
1241 "Root must be bracketed for bisection in dp3m_rtbisection");
1242 }
1243 // Orient the search dx, and set rtb to x1 or x2 ...
1244 double dx;
1245 double rtb = f1 < 0.0 ? (dx = x2 - x1, x1) : (dx = x1 - x2, x2);
1246 for (int j = 1; j <= JJ_RTBIS_MAX; j++) {
1247 auto const xmid = rtb + (dx *= 0.5);
1248 auto const fmid =
1249 dp3m_real_space_error(box_size, r_cut_iL, n_c_part, sum_q2, xmid) -
1250 constant;
1251 if (fmid <= 0.0)
1252 rtb = xmid;
1253 if (fabs(dx) < xacc || fmid == 0.0)
1254 return rtb;
1255 }
1256 throw std::runtime_error("Too many bisections in dp3m_rtbisection");
1257}
1258
1260 auto const &system = get_system();
1261 auto const &box_geo = *system.box_geo;
1262 auto const &local_geo = *system.local_geo;
1263 for (auto i = 0u; i < 3u; i++) {
1264 /* check k-space cutoff */
1265 if (dp3m_params.cao_cut[i] >= box_geo.length_half()[i]) {
1266 std::stringstream msg;
1267 msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
1268 << " is larger than half of box dimension " << box_geo.length()[i];
1269 throw std::runtime_error(msg.str());
1270 }
1271 if (dp3m_params.cao_cut[i] >= local_geo.length()[i]) {
1272 std::stringstream msg;
1273 msg << "dipolar P3M_init: k-space cutoff " << dp3m_params.cao_cut[i]
1274 << " is larger than local box dimension " << local_geo.length()[i];
1275 throw std::runtime_error(msg.str());
1276 }
1277 }
1278
1279 if ((box_geo.length()[0] != box_geo.length()[1]) or
1280 (box_geo.length()[1] != box_geo.length()[2])) {
1281 throw std::runtime_error("DipolarP3M: requires a cubic box");
1282 }
1283}
1284
1286 auto const &box_geo = *get_system().box_geo;
1287 if (!box_geo.periodic(0) or !box_geo.periodic(1) or !box_geo.periodic(2)) {
1288 throw std::runtime_error(
1289 "DipolarP3M: requires periodicity (True, True, True)");
1290 }
1291}
1292
1294 auto const &local_geo = *get_system().local_geo;
1295 if (local_geo.cell_structure_type() != CellStructureType::REGULAR and
1296 local_geo.cell_structure_type() != CellStructureType::HYBRID) {
1297 throw std::runtime_error(
1298 "DipolarP3M: requires the regular or hybrid decomposition cell system");
1299 }
1300 if (::communicator.size > 1 and
1301 local_geo.cell_structure_type() == CellStructureType::HYBRID) {
1302 throw std::runtime_error(
1303 "DipolarP3M: does not work with the hybrid decomposition cell system, "
1304 "if using more than one MPI node");
1305 }
1306}
1307
1309 auto const &node_grid = ::communicator.node_grid;
1310 if (node_grid[0] < node_grid[1] or node_grid[1] < node_grid[2]) {
1311 throw std::runtime_error(
1312 "DipolarP3M: node grid must be sorted, largest first");
1313 }
1314}
1315
1316template <typename FloatType, Arch Architecture, class FFTConfig>
1318 auto const &box_geo = *get_system().box_geo;
1319 dp3m.params.r_cut = dp3m.params.r_cut_iL * box_geo.length()[0];
1320 dp3m.params.alpha = dp3m.params.alpha_L * box_geo.length_inv()[0];
1321 dp3m.params.recalc_a_ai_cao_cut(box_geo.length());
1322 dp3m.local_mesh.recalc_ld_pos(dp3m.params);
1323 sanity_checks_boxl();
1324 calc_influence_function_force();
1325 calc_influence_function_energy();
1326 dp3m.energy_correction = 0.;
1327#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
1328 if (dp3m.heffte.world_size == 1) {
1329 dp3m.heffte.halo_comm.resize(::comm_cart, dp3m.local_mesh);
1330 }
1331#endif
1332}
1333
1334template <typename FloatType, Arch Architecture, class FFTConfig>
1335void DipolarP3MHeffte<FloatType, Architecture,
1336 FFTConfig>::calc_energy_correction() {
1337 auto const &box_geo = *get_system().box_geo;
1338 auto const Ukp3m = calc_average_self_energy_k_space() * box_geo.volume();
1339 auto const Ewald_volume = Utils::int_pow<3>(dp3m.params.alpha_L);
1340 auto const Eself = -2. * Ewald_volume * std::numbers::inv_sqrtpi / 3.;
1341 dp3m.energy_correction =
1342 -dp3m.sum_mu2 * (Ukp3m + Eself + 2. * std::numbers::pi / 3.);
1343}
1344
1345#ifdef ESPRESSO_NPT
1346template <typename FloatType, Arch Architecture, class FFTConfig>
1347void DipolarP3MHeffte<FloatType, Architecture,
1348 FFTConfig>::npt_add_virial_contribution(double energy)
1349 const {
1350 get_system().npt_add_virial_contribution(energy);
1351}
1352#endif // ESPRESSO_NPT
1353
1354#endif // ESPRESSO_DP3M
@ HYBRID
Hybrid decomposition.
@ REGULAR
Regular decomposition.
Vector implementation and trait types for boost qvm interoperability.
Describes a cell structure / cell system.
auto & get_local_force()
auto & get_local_torque()
auto const & get_unique_particles() const
std::tuple< double, double, double, double > calculate_accuracy(Utils::Vector3i const &mesh, int cao, double r_cut_iL) const override
TuningAlgorithm::Parameters get_time() override
DipolarTuningAlgorithm(System::System &system, decltype(dp3m) &input_dp3m, double prefactor, int timings, decltype(m_tune_limits) tune_limits)
void on_solver_change() const override
void determine_mesh_limits() override
P3MParameters & get_params() override
std::optional< std::string > layer_correction_veto_r_cut(double) const override
void setup_logger(bool verbose) override
base_type::size_type size() const
Main system class.
std::shared_ptr< LocalBox > local_geo
void npt_add_virial_contribution(double energy)
Definition npt.cpp:137
std::shared_ptr< CellStructure > cell_structure
std::shared_ptr< BoxGeometry > box_geo
Tuning algorithm for P3M.
double get_m_time(Utils::Vector3i const &mesh, int &tuned_cao, double &tuned_r_cut_iL, double &tuned_alpha_L, double &tuned_accuracy)
Get the optimal alpha and the corresponding computation time for a fixed mesh.
static auto constexpr time_sentinel
Value for invalid time measurements.
static auto constexpr max_n_consecutive_trials
Maximal number of consecutive trials that don't improve runtime.
System::System & m_system
void determine_cao_limits(int initial_cao)
Determine a sensible range for the charge assignment order.
void determine_r_cut_limits()
Determine a sensible range for the real-space cutoff.
std::unique_ptr< TuningLogger > m_logger
static auto constexpr time_granularity
Granularity of the time measurement (milliseconds).
T norm() const
Definition Vector.hpp:159
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Definition Vector.hpp:131
void zfill(std::size_t size)
Fill cache with zero-initialized data.
Communicator communicator
boost::mpi::communicator comm_cart
The communicator.
int this_node
The number of this node.
constexpr auto round_error_prec
Precision below which a double-precision float is assumed to be zero.
Definition config.hpp:47
__device__ void vector_product(float const *a, float const *b, float *out)
static std::size_t count_magnetic_particles(ParticleRange const &particles)
Definition dlc.cpp:400
P3M algorithm for long-range magnetic dipole-dipole interaction.
double dp3m_real_space_error(double box_size, double r_cut_iL, std::size_t n_c_part, double sum_q2, double alpha_L)
Calculate the value of the errors for the REAL part of the force in terms of the splitting parameter ...
double dp3m_rtbisection(double box_size, double r_cut_iL, std::size_t n_c_part, double sum_q2, double x1, double x2, double xacc, double tuned_accuracy)
Compute the value of alpha through a bisection method.
auto dp3m_tune_aliasing_sums(Utils::Vector3i const &shift, int mesh, double mesh_i, int cao, double alpha_L_i)
Tuning dipolar-P3M.
double dp3m_k_space_error(double box_size, int mesh, int cao, std::size_t n_c_part, double sum_q2, double alpha_L)
Calculate the k-space error of dipolar-P3M.
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
Routines, row decomposition, data structures and communication for the 3D-FFT.
auto pad_with_zeros_discard_imag(std::span< T > cropped_array, Utils::Vector3i const &cropped_dim, Utils::Vector3i const &pad_left, Utils::Vector3i const &pad_right)
Pad a 3D matrix with zeros to restore halo regions.
auto extract_block(Container const &in_array, Utils::Vector3i const &dimensions, Utils::Vector3i const &start, Utils::Vector3i const &stop)
and std::invocable< Projector, unsigned, int > void for_each_3d(detail::IndexVectorConcept auto &&start, detail::IndexVectorConcept auto &&stop, detail::IndexVectorConcept auto &&counters, Kernel &&kernel, Projector &&projector=detail::noop_projector)
Repeat an operation on every element of a 3D grid.
std::vector< FloatType > grid_influence_function_dipolar(P3MParameters const &params, Utils::Vector3i const &n_start, Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l)
Map influence function over a grid.
void p3m_interpolate(P3MLocalMesh const &local_mesh, WeightsStorage< cao > const &weights, Kernel kernel)
P3M grid interpolation.
constexpr int p3m_min_cao
Minimal charge assignment order.
Definition math.hpp:48
constexpr int p3m_max_cao
Maximal charge assignment order.
Definition math.hpp:50
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
Definition math.hpp:38
System & get_system()
T product(Vector< T, N > const &v)
Definition Vector.hpp:372
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
Definition sqr.hpp:28
decltype(auto) integral_parameter(T i, Args &&...args)
Generate a call table for an integral non-type template parameter.
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
Definition math.hpp:71
auto get_analytic_cotangent_sum_kernel(int cao)
Definition math.hpp:146
STL namespace.
Exports for the NpT code.
Common functions for dipolar and charge P3M.
auto constexpr P3M_EPSILON_METALLIC
This value indicates metallic boundary conditions.
ESPRESSO_ATTR_ALWAYS_INLINE void kokkos_parallel_range_for(auto const &name, auto start, auto end, auto const &kernel)
Utils::Vector3i node_grid
double calc_surface_term(bool force_flag, bool energy_flag) override
void dipole_assign() override
void scaleby_box_l() override
double long_range_kernel(bool force_flag, bool energy_flag)
Compute the k-space part of forces and energies.
Base class for the magnetostatics P3M algorithm.
double sum_mu2
Sum of square of magnetic dipoles.
p3m_interpolation_cache inter_weights
FloatType value_type
std::size_t sum_dip_part
number of dipolar particles.
p3m_send_mesh< FloatType > halo_comm
double energy_correction
cached k-space self-energy correction
void resize_heffte_buffers()
struct DipolarP3MState::@1 heffte
void sanity_checks_boxl() const
Checks for correctness of the k-space cutoff.
void sanity_checks_cell_structure() const
P3MParameters const & dp3m_params
Definition dp3m.hpp:55
void sanity_checks_periodicity() const
void sanity_checks_node_grid() const
void recalc_ld_pos(P3MParameters const &params)
Recalculate quantities derived from the mesh and box length: ld_pos (position of the left down mesh).
Structure to hold P3M parameters and some dependent variables.
Utils::Vector3d cao_cut
cutoff for charge assignment.
double alpha
unscaled alpha_L for use with fast inline functions only
double r_cut_iL
cutoff radius for real space electrostatics (>0), rescaled to r_cut_iL = r_cut * box_l_i.
double accuracy
accuracy of the actual parameter set.
double alpha_L
Ewald splitting parameter (0.
double r_cut
unscaled r_cut_iL for use with fast inline functions only
void recalc_a_ai_cao_cut(Utils::Vector3d const &box_l)
Recalculate quantities derived from the mesh and box length: a, ai and cao_cut.
bool tuning
tuning or production?
Utils::Vector3i mesh
number of mesh points per coordinate direction (>0), in real space.
P3MParameters params
P3M base parameters.
P3MLocalMesh local_mesh
Local mesh geometry information for this MPI rank.
void operator()(auto &dp3m, double prefac, int d_rs, CellStructure &cell_structure) const
void operator()(auto &dp3m, double prefac, int d_rs, CellStructure &cell_structure) const