57#include "communication.hpp"
64#include "system/System.hpp"
73#include <boost/mpi/collectives/all_reduce.hpp>
74#include <boost/mpi/collectives/broadcast.hpp>
75#include <boost/mpi/collectives/reduce.hpp>
76#include <boost/mpi/communicator.hpp>
77#include <boost/range/combine.hpp>
78#include <boost/range/numeric.hpp>
80#include <Kokkos_Core.hpp>
81#include <Kokkos_ScatterView.hpp>
89#include <initializer_list>
100template <
typename FloatType>
101std::complex<FloatType>
104 return std::complex<FloatType>(-z.imag() * k, z.real() * k);
107template <
typename FloatType>
108std::complex<FloatType>
111 return std::complex<FloatType>(z.real() * k, z.imag() * k);
116 return mesh[0
u] % node_grid[0
u] == 0
and mesh[1u] % node_grid[1u] == 0
and
117 mesh[2u] % node_grid[2u] == 0;
120template <
typename FloatType, Arch Architecture,
class FFTConfig>
124 std::size_t
local_n = std::size_t{0
u};
132 acc.local_q += p.q();
137 a.local_n += b.local_n;
138 a.local_q += b.local_q;
139 a.local_q2 += b.local_q2;
144 boost::mpi::all_reduce(
comm_cart,
res.local_n, p3m.sum_qpart, std::plus<>());
145 boost::mpi::all_reduce(
comm_cart,
res.local_q2, p3m.sum_q2, std::plus<>());
146 boost::mpi::all_reduce(
comm_cart,
res.local_q, p3m.square_sum_q,
148 p3m.square_sum_q =
Utils::sqr(p3m.square_sum_q);
160template <
typename FloatType, Arch Architecture,
class FFTConfig>
164 FFTConfig::k_space_order>(
165 p3m.params, p3m.fft->ks_local_ld_index(), p3m.fft->ks_local_ur_index(),
166 get_system().
box_geo->length_inv());
167 if constexpr (FFTConfig::use_r2c) {
169 p3m.fft->ks_local_size(),
170 p3m.fft->ks_local_ld_index());
177template <
typename FloatType, Arch Architecture,
class FFTConfig>
181 FFTConfig::k_space_order>(
182 p3m.params, p3m.fft->ks_local_ld_index(), p3m.fft->ks_local_ur_index(),
183 get_system().
box_geo->length_inv());
184 if constexpr (FFTConfig::use_r2c) {
186 p3m.fft->ks_local_size(),
187 p3m.fft->ks_local_ld_index());
199 auto constexpr exp_min = -708.4;
218 [&](
unsigned dim,
int n) {
219 nm[dim] = shift[dim] + n * mesh[dim];
237 std::size_t
n_c_part,
double sum_q2,
241 return (2. * pref * sum_q2 *
exp(-
Utils::sqr(r_cut_iL * alpha_L))) /
242 sqrt(
static_cast<double>(
n_c_part) * r_cut_iL *
box_l[0] * volume);
259 int cao, std::size_t
n_c_part,
double sum_q2,
291 return 2. * pref * sum_q2 * sqrt(
he_q /
static_cast<double>(
n_c_part)) /
295template <
typename FloatType, Arch Architecture,
class FFTConfig>
299 assert(p3m.params.alpha > 0.);
301 auto const &
system = get_system();
302 auto const &box_geo = *
system.box_geo;
303 auto const &local_geo = *
system.local_geo;
304 auto const skin =
system.cell_structure->get_verlet_skin();
306 p3m.params.cao3 = Utils::int_pow<3>(p3m.params.cao);
307 p3m.params.recalc_a_ai_cao_cut(box_geo.length());
311 auto const &solver =
system.coulomb.impl->solver;
317 p3m.local_mesh.calc_local_ca_mesh(p3m.params, local_geo,
skin,
elc_layer);
318 p3m.fft = std::make_shared<P3MFFT<FloatType, FFTConfig>>(
319 ::comm_cart, p3m.params.mesh, p3m.local_mesh.ld_no_halo,
324 static_cast<std::size_t
>(
Utils::product(p3m.local_mesh.dim_no_halo));
326 static_cast<std::size_t
>(
Utils::product(p3m.fft->ks_local_size()));
329 for (
auto d : {0
u, 1u, 2u}) {
334 p3m.calc_differential_operator();
339 count_charged_particles();
348 p3m_interpolate(p3m.local_mesh, weights, [q, &p3m](
int ind,
double w) {
349 p3m.rs_charge_density[ind] += value_type(w * q);
357 real_pos.as_span(), p3m.params.ai, p3m.local_mesh);
358 inter_weights.
store(weights);
359 this->operator()(p3m, q, weights);
365 real_pos.as_span(), p3m.params.ai, p3m.local_mesh);
366 this->operator()(p3m, q, weights);
373 auto const &aosoa = cell_structure.get_aosoa();
374 auto const n_part = cell_structure.count_local_particles();
377 "InterpolateCharges", std::size_t{0
u}, n_part, [&](
auto p_index) {
379 auto const pos = aosoa.get_span_at(aosoa.position,
p_index);
380 auto const q = aosoa.charge(
p_index);
383 pos, p3m.params.ai, p3m.local_mesh);
384 p3m.inter_weights.store_at(
p_index, weights);
386 p3m.local_mesh, weights, [&,
tid, q](
int ind,
double w) {
387 p3m.rs_charge_density_kokkos(tid, ind) += value_type(w * q);
393 Kokkos::RangePolicy<execution_space> policy(std::size_t{0},
394 p3m.local_mesh.size);
395 Kokkos::parallel_for(
"ReduceInterpolatedCharges", policy,
399 acc += p3m.rs_charge_density_kokkos(
tid, i);
401 p3m.rs_charge_density.at(i) += acc;
408template <
typename FloatType, Arch Architecture,
class FFTConfig>
410 prepare_fft_mesh(
true);
412 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
413 p3m.params.cao, p3m, *get_system().cell_structure);
416template <
typename FloatType, Arch Architecture,
class FFTConfig>
420 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
423 Utils::integral_parameter<int, AssignCharge, p3m_min_cao, p3m_max_cao>(
424 p3m.params.cao, p3m, q,
real_pos, p3m.inter_weights);
433 assert(cao == p3m.inter_weights.cao());
435 auto const kernel = [&p3m](
auto pref,
auto &
p_force, std::size_t
p_index) {
440 [&force, &p3m](
int ind,
double w) {
441 force[0u] += w * double(p3m.rs_E_fields[0u][ind]);
442 force[1u] += w * double(p3m.rs_E_fields[1u][ind]);
443 force[2u] += w * double(p3m.rs_E_fields[2u][ind]);
446 auto access =
p_force.access();
447 access(
p_index, 0) -= pref * force[0];
448 access(
p_index, 1) -= pref * force[1];
449 access(
p_index, 2) -= pref * force[2];
453 auto const &aosoa = cell_structure.
get_aosoa();
456 "AssignForces", std::size_t{0
u}, n_part, [&](std::size_t
p_index) {
466 auto const &cs,
auto const &box_geo) {
470 acc += p.
q() * box_geo.unfolded_position(p.
pos(), p.
image_box());
473 return boost::mpi::all_reduce(comm,
local_dip, std::plus<>());
476template <
typename FloatType, Arch Architecture,
class FFTConfig>
480 p3m.halo_comm.gather_grid(
comm_cart, p3m.rs_charge_density.data(),
486 p3m.rs_charge_density, p3m.local_mesh.dim, p3m.local_mesh.n_halo_ld,
487 p3m.local_mesh.dim - p3m.local_mesh.n_halo_ur);
494 p3m.ks_charge_density.data());
497template <
typename FloatType, Arch Architecture,
class FFTConfig>
500 auto const mesh_start = p3m.fft->ks_local_ld_index();
501 auto const mesh_stop = p3m.fft->ks_local_ur_index();
512#ifdef ESPRESSO_ADDITIONAL_CHECKS
514 Utils::get_linear_index<FFTConfig::k_space_order>(
520 for (
auto d : {0
u, 1u, 2u}) {
530 auto const size = p3m.local_mesh.ur_no_halo - p3m.local_mesh.ld_no_halo;
532 for (
auto d : {0
u, 1u, 2u}) {
533 auto k_space = p3m.ks_E_fields[d].data();
534 auto r_space = p3m.rs_E_fields_no_halo[d].data();
538 auto const begin = p3m.rs_E_fields_no_halo[d].begin();
543 p3m.local_mesh.n_halo_ld, p3m.local_mesh.n_halo_ur);
547 std::array<FloatType *, 3u> rs_fields = {{p3m.rs_E_fields[0
u].data(),
548 p3m.rs_E_fields[1u].data(),
549 p3m.rs_E_fields[2u].data()}};
550 p3m.halo_comm.spread_grid(
comm_cart, rs_fields, p3m.local_mesh.dim);
558template <
typename FloatType, Arch Architecture,
class FFTConfig>
561 auto const &box_geo = *get_system().
box_geo;
564 if (p3m.sum_q2 > 0.) {
566 kernel_ks_charge_density();
568 auto constexpr r2c_dir = FFTConfig::r2c_dir;
571 auto const local_size = p3m.fft->ks_local_size();
574 auto const wavevector = (2. * std::numbers::pi) * box_geo.length_inv();
580 std::size_t index = 0
u;
593 static_cast<double>(p3m.g_energy[index] *
594 std::norm(p3m.ks_charge_density[index]));
628template <
typename FloatType, Arch Architecture,
class FFTConfig>
632 auto const &
system = get_system();
633 auto const &box_geo = *
system.box_geo;
639 if (p3m.sum_qpart == 0
u) {
642 auto &cell_structure = *
system.cell_structure;
645 system.coulomb.impl->solver)) {
649 kernel_ks_charge_density();
652 auto const &aosoa = cell_structure.get_aosoa();
659 auto const volume = box_geo.volume();
661 4. * std::numbers::pi / volume / (2. * p3m.params.epsilon + 1.);
666 kernel_rs_electric_field();
671 Utils::integral_parameter<int, AssignForces, p3m_min_cao, p3m_max_cao>(
678 auto const n_part = cell_structure.count_local_particles();
683 auto const q = aosoa.charge(
p_index);
693 auto constexpr r2c_dir = FFTConfig::r2c_dir;
696 auto const local_size = p3m.fft->ks_local_size();
703 std::size_t index = 0
u;
707 auto const &
cell_field = p3m.ks_charge_density[index];
708 auto cell_energy =
static_cast<double>(p3m.g_energy[index] *
726 energy -= p3m.sum_q2 * p3m.params.alpha * std::numbers::inv_sqrtpi;
729 energy -= p3m.square_sum_q * std::numbers::pi /
751template <
typename FloatType, Arch Architecture,
class FFTConfig>
755 double m_mesh_density_min = -1., m_mesh_density_max = -1.;
757 bool m_tune_mesh =
false;
758 std::pair<std::optional<int>, std::optional<int>> m_tune_limits;
773 double prefactor,
int timings,
785 auto const on_gpu =
false;
787 m_logger = std::make_unique<TuningLogger>(
795 std::optional<std::string>
799 return actor->veto_r_cut(r_cut);
818 return Utils::Vector3i{{std::max(lhs[0u], rhs[0u]),
819 std::max(lhs[1u], rhs[1u]),
820 std::max(lhs[2u], rhs[2u])}};
823 if constexpr (FFTConfig::use_r2c) {
838 std::optional<std::string>
retval{
"conflict with FFT domain decomposition"};
845 std::tuple<double, double, double, double>
847 double r_cut_iL)
const override {
849 auto const &box_geo = *m_system.box_geo;
854 p3m.
sum_q2, 0., box_geo.length());
869 p3m.
sum_q2, alpha_L, box_geo.length());
875 p3m.
sum_q2, alpha_L, box_geo.length().data());
881 p3m.
sum_q2, alpha_L, box_geo.length());
887 auto const &box_geo = *m_system.box_geo;
889 static_cast<double>(p3m.
params.
mesh[0]) * box_geo.length_inv()[0];
902 if (m_tune_limits.first
or m_tune_limits.second) {
903 auto const &
box_l = box_geo.length();
905 if (m_tune_limits.first) {
906 m_mesh_density_min =
static_cast<double>(*m_tune_limits.first) / dim;
908 if (m_tune_limits.second) {
909 m_mesh_density_max =
static_cast<double>(*m_tune_limits.second) / dim;
918 for (
auto i : {1u, 2u}) {
920 static_cast<int>(std::round(
mesh_density * box_geo.length()[i]));
930 auto const &box_geo = *m_system.box_geo;
931 auto const &solver = m_system.coulomb.impl->solver;
937 for (
auto i : {0
u, 1u, 2u}) {
939 static_cast<int>(std::round(box_geo.length()[i] *
mesh_density));
968 get_n_trials() > max_n_consecutive_trials) {
984template <
typename FloatType, Arch Architecture,
class FFTConfig>
986 auto &
system = get_system();
987 auto const &box_geo = *
system.box_geo;
994 if (
not is_tuned()) {
995 count_charged_particles();
997 throw std::runtime_error(
998 "CoulombP3M: no charged particles in the system");
1002 system, p3m, prefactor, tuning.timings, tuning.limits);
1011 system.on_coulomb_change();
1022 auto const &box_geo = *
system.box_geo;
1023 auto const &local_geo = *
system.local_geo;
1024 for (
auto i = 0
u; i < 3u; i++) {
1027 std::stringstream
msg;
1029 <<
" is larger than half of box dimension " << box_geo.length()[i];
1030 throw std::runtime_error(
msg.str());
1033 std::stringstream
msg;
1035 <<
" is larger than local box dimension " << local_geo.length()[i];
1036 throw std::runtime_error(
msg.str());
1041 if ((box_geo.length()[0] != box_geo.length()[1])
or
1042 (box_geo.length()[1] != box_geo.length()[2])
or
1045 throw std::runtime_error(
1046 "CoulombP3M: non-metallic epsilon requires cubic box");
1053 if (!box_geo.periodic(0)
or !box_geo.periodic(1)
or !box_geo.periodic(2)) {
1054 throw std::runtime_error(
1055 "CoulombP3M: requires periodicity (True, True, True)");
1060 auto const &local_geo = *
get_system().local_geo;
1063 throw std::runtime_error(
1064 "CoulombP3M: requires the regular or hybrid decomposition cell system");
1068 throw std::runtime_error(
1069 "CoulombP3M: does not work with the hybrid decomposition cell system, "
1070 "if using more than one MPI node");
1074template <
typename FloatType, Arch Architecture,
class FFTConfig>
1076 auto const &box_geo = *get_system().
box_geo;
1081 sanity_checks_boxl();
1082 calc_influence_function_force();
1083 calc_influence_function_energy();
1088template <
typename FloatType, Arch Architecture,
class FFTConfig>
1098 auto &gpu = *get_system().
gpu;
1110template <
typename FloatType, Arch Architecture,
class FFTConfig>
1113 auto &
system = get_system();
1115 system.coulomb.impl->solver)) {
1123template <
typename FloatType, Arch Architecture,
class FFTConfig>
@ HYBRID
Hybrid decomposition.
@ REGULAR
Regular decomposition.
Vector implementation and trait types for boost qvm interoperability.
Describes a cell structure / cell system.
std::size_t count_local_particles() const
void determine_mesh_limits() override
std::optional< std::string > layer_correction_veto_r_cut(double r_cut) const override
TuningAlgorithm::Parameters get_time() override
void setup_logger(bool verbose) override
std::tuple< double, double, double, double > calculate_accuracy(Utils::Vector3i const &mesh, int cao, double r_cut_iL) const override
void on_solver_change() const override
CoulombTuningAlgorithm(System::System &system, auto &input_p3m, double prefactor, int timings, decltype(m_tune_limits) tune_limits)
static constexpr std::tuple< int, int, int > get_memory_layout()
std::optional< std::string > fft_decomposition_veto(Utils::Vector3i const &mesh_size_r_space) const override
P3MParameters & get_params() override
void npt_add_virial_contribution(double energy)
std::shared_ptr< GpuParticleData > gpu
bool has_npt_enabled() const
std::shared_ptr< BoxGeometry > box_geo
Tuning algorithm for P3M.
System::System & m_system
std::unique_ptr< TuningLogger > m_logger
DEVICE_QUALIFIER constexpr pointer data() noexcept
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Cache for interpolation weights.
void zfill(std::size_t size)
Fill cache with zero-initialized data.
void store(InterpolationWeights< cao > const &weights)
Push back weights for one point.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
Communicator communicator
boost::mpi::communicator comm_cart
The communicator.
int this_node
The number of this node.
constexpr auto round_error_prec
Precision below which a double-precision float is assumed to be zero.
void charge_assign(elc_data const &elc, CoulombP3M &solver, combined_ranges const &p_q_pos_range)
ELC algorithm for long-range Coulomb interactions.
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
auto pad_with_zeros_discard_imag(std::span< T > cropped_array, Utils::Vector3i const &cropped_dim, Utils::Vector3i const &pad_left, Utils::Vector3i const &pad_right)
Pad a 3D matrix with zeros to restore halo regions.
and std::invocable< Projector, unsigned, int > void for_each_3d(detail::IndexVectorConcept auto &&start, detail::IndexVectorConcept auto &&stop, detail::IndexVectorConcept auto &&counters, Kernel &&kernel, Projector &&projector=detail::noop_projector)
Repeat an operation on every element of a 3D grid.
std::vector< FloatType > grid_influence_function(P3MParameters const ¶ms, Utils::Vector3i const &n_start, Utils::Vector3i const &n_stop, Utils::Vector3d const &inv_box_l)
Map influence function over a grid.
void p3m_interpolate(P3MLocalMesh const &local_mesh, WeightsStorage< cao > const &weights, Kernel kernel)
P3M grid interpolation.
constexpr int p3m_min_cao
Minimal charge assignment order.
constexpr int p3m_max_cao
Maximal charge assignment order.
#define P3M_BRILLOUIN
P3M: Number of Brillouin zones taken into account in the calculation of the optimal influence functio...
std::function< void(ResultType &, ResultType const &)> ReductionOp
Join two partial reduction results.
std::function< void(ResultType &, Particle const &)> AddPartialResultKernel
Kernel that adds the result from a single particle to a reduction.
T product(Vector< T, N > const &v)
DEVICE_QUALIFIER constexpr T sqr(T x)
Calculates the SQuaRe of x.
DEVICE_QUALIFIER auto sinc(T x)
Calculate the function .
auto get_analytic_cotangent_sum_kernel(int cao)
Exports for the NpT code.
auto constexpr P3M_EPSILON_METALLIC
This value indicates metallic boundary conditions.
P3M algorithm for long-range Coulomb interaction.
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu, double prefactor, std::size_t n_part)
The long-range part of the P3M algorithm.
void p3m_gpu_init(std::shared_ptr< P3MGpuParams > &data, int cao, Utils::Vector3i const &mesh, double alpha, Utils::Vector3d const &box_l, std::size_t n_part)
Initialize the internal data structure of the P3M GPU.
P3M electrostatics on GPU.
double p3m_k_space_error_gpu(double prefactor, const int *mesh, int cao, int npart, double sum_q2, double alpha_L, const double *box)
double p3m_k_space_error(double pref, Utils::Vector3i const &mesh, int cao, std::size_t n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l)
Calculate the analytic expression of the error estimate for the P3M method in (eq.
std::complex< FloatType > multiply_complex_by_real(std::complex< FloatType > const &z, FloatType k)
auto p3m_tune_aliasing_sums(Utils::Vector3i const &shift, Utils::Vector3i const &mesh, Utils::Vector3d const &mesh_i, int cao, double alpha_L_i)
Aliasing sum used by p3m_k_space_error.
double p3m_real_space_error(double pref, double r_cut_iL, std::size_t n_c_part, double sum_q2, double alpha_L, Utils::Vector3d const &box_l)
Calculate the real space contribution to the rms error in the force (as described by Kolafa and Perra...
std::complex< FloatType > multiply_complex_by_imaginary(std::complex< FloatType > const &z, FloatType k)
auto calc_dipole_moment(boost::mpi::communicator const &comm, auto const &cs, auto const &box_geo)
bool is_node_grid_compatible_with_mesh(Utils::Vector3i const &node_grid, Utils::Vector3i const &mesh)
ResultType reduce_over_local_particles(CellStructure const &cs, Reduction::AddPartialResultKernel< ResultType > add_partial, Reduction::ReductionOp< ResultType > reduce_op)
performs a reduction over all particles
ESPRESSO_ATTR_ALWAYS_INLINE void kokkos_parallel_range_for(auto const &name, auto start, auto end, auto const &kernel)
Utils::Vector3i node_grid
void assign_charge(double q, Utils::Vector3d const &real_pos, bool skip_cache) override
double long_range_kernel(bool force_flag, bool energy_flag)
Compute the k-space part of forces and energies.
void charge_assign() override
Utils::Vector9d long_range_pressure() override
void scaleby_box_l() override
Base class for the electrostatics P3M algorithm.
std::size_t sum_qpart
number of charged particles.
std::shared_ptr< P3MFFT< FloatType, FFTConfig > > fft
p3m_send_mesh< FloatType > halo_comm
double sum_q2
Sum of square of charges.
p3m_interpolation_cache inter_weights
void sanity_checks_periodicity() const
void sanity_checks_boxl() const
Checks for correctness of the k-space cutoff.
void sanity_checks_cell_structure() const
P3MParameters const & p3m_params
std::unique_ptr< Implementation > impl
Pointer-to-implementation.
static constexpr std::size_t force
static constexpr std::size_t pos
static constexpr std::size_t q
Interpolation weights for one point.
void recalc_ld_pos(P3MParameters const ¶ms)
Recalculate quantities derived from the mesh and box length: ld_pos (position of the left down mesh).
Structure to hold P3M parameters and some dependent variables.
Utils::Vector3d cao_cut
cutoff for charge assignment.
double alpha
unscaled alpha_L for use with fast inline functions only
double r_cut_iL
cutoff radius for real space electrostatics (>0), rescaled to r_cut_iL = r_cut * box_l_i.
int cao
charge assignment order ([0,7]).
double accuracy
accuracy of the actual parameter set.
double alpha_L
Ewald splitting parameter (0.
double r_cut
unscaled r_cut_iL for use with fast inline functions only
void recalc_a_ai_cao_cut(Utils::Vector3d const &box_l)
Recalculate quantities derived from the mesh and box length: a, ai and cao_cut.
bool tuning
tuning or production?
Utils::Vector3i mesh
number of mesh points per coordinate direction (>0), in real space.
double epsilon
epsilon of the "surrounding dielectric".
P3MParameters params
P3M base parameters.
P3MLocalMesh local_mesh
Local mesh geometry information for this MPI rank.
Struct holding all information for one particle.
auto const & image_box() const
void operator()(auto &p3m, double q, InterpolationWeights< cao > const &weights)
void operator()(auto &p3m, auto &cell_structure)
void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos)
void operator()(auto &p3m, double q, Utils::Vector3d const &real_pos, p3m_interpolation_cache &inter_weights)
void operator()(auto &p3m, auto force_prefac, CellStructure &cell_structure) const