29#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
30#include <Kokkos_Core.hpp>
72#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
74 std::vector<double> ca_frac_spillover;
76 std::vector<double> ca_frac;
79#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
81 std::vector<int> ca_fmp_spillover;
83 std::vector<int> ca_fmp;
87#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
89 : ca_frac(
"p3m_interpolation_cache::ca_frac", 0, 0),
90 ca_fmp(
"p3m_interpolation_cache::ca_fmp", 0, 0) {}
98#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
99 return ca_fmp.size() + ca_fmp_spillover.size();
101 return ca_fmp.size();
109 auto cao()
const {
return m_cao; }
117 auto const size_frac =
size *
static_cast<std::size_t
>(m_cao * 3);
118#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
120 Kokkos::realloc(Kokkos::WithoutInitializing, ca_fmp,
size);
121 Kokkos::realloc(Kokkos::WithoutInitializing, ca_frac,
size_frac);
123 assert(ca_frac_spillover.empty());
124 assert(ca_fmp_spillover.empty());
127 (std::distance(&ca_fmp(0), &ca_fmp(
size - 1)) ==
size - 1));
146#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
147 ca_fmp_spillover.emplace_back(weights.ind);
148 auto it = std::back_inserter(ca_frac_spillover);
149 std::ranges::copy(weights.w_x,
it);
150 std::ranges::copy(weights.w_y,
it);
151 std::ranges::copy(weights.w_z,
it);
153 ca_fmp.emplace_back(weights.ind);
154 auto it = std::back_inserter(ca_frac);
155 std::ranges::copy(weights.w_x,
it);
156 std::ranges::copy(weights.w_y,
it);
157 std::ranges::copy(weights.w_z,
it);
182#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
187 auto const offset =
p_index *
static_cast<std::size_t
>(
cao * 3);
208 double const *ptr =
nullptr;
209#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
210 if (
p_index >= ca_fmp.size()) {
212 index = ca_fmp_spillover[
p_index];
213 auto const offset =
p_index *
static_cast<std::size_t
>(
cao * 3);
214 ptr = std::as_const(ca_frac_spillover).data() + offset;
217 auto const offset =
p_index *
static_cast<std::size_t
>(
cao * 3);
218 ptr = std::as_const(ca_frac).data() + offset;
222 auto const offset =
p_index *
static_cast<std::size_t
>(
cao * 3);
223 ptr = std::as_const(ca_frac).data() + offset;
225 std::span<double const, cao> w_x(ptr,
cao);
226 std::advance(ptr,
cao);
227 std::span<double const, cao> w_y(ptr,
cao);
228 std::advance(ptr,
cao);
229 std::span<double const, cao> w_z(ptr,
cao);
230 return {index, w_x, w_y, w_z};
240#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
241 Kokkos::realloc(Kokkos::WithoutInitializing, ca_fmp, 0
ul);
242 Kokkos::realloc(Kokkos::WithoutInitializing, ca_frac, 0
ul);
243 ca_frac_spillover.clear();
244 ca_fmp_spillover.clear();
258template <
int cao, Utils::MemoryOrder memory_order>
264 auto const pos_shift = std::floor((cao - 1) / 2.0) - (cao % 2) / 2.0;
272 for (
unsigned int d = 0; d < 3; d++) {
274 auto const pos = ((position[d] - local_mesh.
ld_pos[d]) * ai[d]) - pos_shift;
276 nmp[d] =
static_cast<int>(pos);
279 dist[d] = (pos -
nmp[d]) - 0.5;
285 ret.
ind = Utils::get_linear_index<memory_order>(
nmp, local_mesh.
dim);
288 for (
int i = 0; i < cao; i++) {
313 auto q_ind = weights.ind;
314 for (
int i0 = 0;
i0 < cao;
i0++) {
315 auto const w_x = weights.w_x[
i0];
316 for (
int i1 = 0;
i1 < cao;
i1++) {
317 auto const w_xy = w_x * weights.w_y[
i1];
318 for (
int i2 = 0;
i2 < cao;
i2++) {
static DEVICE_QUALIFIER constexpr Vector< T, N > broadcast(typename Base::value_type const &value) noexcept
Create a vector that has all entries set to the same value.
Cache for interpolation weights.
void zfill(std::size_t size)
Fill cache with zero-initialized data.
void store(InterpolationWeights< cao > const &weights)
Push back weights for one point.
void store_at(std::size_t p_index, InterpolationWeights< cao > const &weights)
Insert weights for one point.
auto cao() const
Charge assignment order the weights are for.
void reset(int cao)
Reset the cache.
InterpolationWeightsView< cao > load(std::size_t p_index) const
Load entry from the cache.
p3m_interpolation_cache()
auto size() const
Number of points in the cache.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
InterpolationWeights< cao > p3m_calculate_interpolation_weights(const Utils::Vector3d &position, const Utils::Vector3d &ai, P3MLocalMesh const &local_mesh)
Calculate the P-th order interpolation weights.
void p3m_interpolate(P3MLocalMesh const &local_mesh, WeightsStorage< cao > const &weights, Kernel kernel)
P3M grid interpolation.
DEVICE_QUALIFIER auto bspline(int i, T x) -> T requires((order > 0) and(order<=7))
Formula of the B-spline.
int ind
Linear index of the corner of the interpolation cube.
std::span< double const, cao > w_z
std::span< double const, cao > w_x
Weights for the directions.
std::span< double const, cao > w_y
Interpolation weights for one point.
Utils::Array< double, cao > w_y
Utils::Array< double, cao > w_x
Weights for the directions.
int ind
Linear index of the corner of the interpolation cube.
Utils::Array< double, cao > w_z
Properties of the local mesh.
Utils::Vector3i dim
dimension (size) of local mesh including halo layers.
int q_2_off
offset between mesh lines of the last dimension
Utils::Vector3d ld_pos
position of the first local mesh point.
int q_21_off
offset between mesh lines of the two last dimensions