409 bool force_flag,
bool energy_flag,
ParticleRange const &particles) {
412 auto const &system = get_system();
413 auto const &box_geo = *system.box_geo;
414 auto const dipole_prefac = prefactor /
Utils::product(dp3m.params.mesh);
416 auto const npt_flag = force_flag and system.has_npt_enabled();
418 auto constexpr npt_flag =
false;
423#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
424 auto constexpr r2c_dir = FFTConfig::r2c_dir;
425 auto const rs_local_size = dp3m.heffte.fft->rs_local_size();
426 auto const local_size = dp3m.heffte.fft->ks_local_size();
427 auto local_size_full = local_size;
428 if constexpr (FFTConfig::use_r2c) {
429 local_size_full[r2c_dir] -= 1;
430 local_size_full[r2c_dir] *= 2;
432 auto const local_origin = dp3m.heffte.fft->ks_local_ld_index();
434 auto const line_stride = local_size_full[0];
435 auto const plane_stride = local_size_full[0] * local_size_full[0];
437 auto const &global_size = dp3m.params.mesh;
438 auto const cutoff_left = 1 - local_origin[r2c_dir];
439 auto const cutoff_right = global_size[r2c_dir] / 2 - local_origin[r2c_dir];
440 auto &short_dim = local_index[r2c_dir];
441 dp3m.resize_heffte_buffers();
444 if (dp3m.sum_mu2 > 0.) {
445 dipole_assign(particles);
446 dp3m.fft_buffers->perform_vector_halo_gather();
447 for (
auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
448 dp3m.fft->forward_fft(rs_mesh);
450 dp3m.update_mesh_views();
452#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
453 if (dp3m.heffte.world_size == 1) {
455 std::array<FloatType *, 3u> rs_fields = {
456 {dp3m.heffte.rs_dipole_density[0u].data(),
457 dp3m.heffte.rs_dipole_density[1u].data(),
458 dp3m.heffte.rs_dipole_density[2u].data()}};
459 dp3m.heffte.halo_comm.gather_grid(
::comm_cart, rs_fields,
460 dp3m.local_mesh.dim);
462 for (
auto dir : {0u, 1u, 2u}) {
465 FFTConfig::r_space_order>(
466 dp3m.heffte.rs_dipole_density[dir], dp3m.local_mesh.dim,
467 dp3m.local_mesh.n_halo_ld,
468 dp3m.local_mesh.dim - dp3m.local_mesh.n_halo_ur);
470 std::vector<FloatType> rs_field_no_halo_reorder;
471 rs_field_no_halo_reorder.resize(rs_field_no_halo.size());
472 std::size_t index_row_major = 0u;
473 for_each_3d_order<FFTConfig::k_space_order>(
474 mesh_start, rs_local_size, local_index, [&]() {
475 auto constexpr KX = 1, KY = 2, KZ = 0;
476 auto const index = local_index[KZ] +
477 rs_local_size[0] * local_index[KY] +
478 Utils::sqr(rs_local_size[0]) * local_index[KX];
479 rs_field_no_halo_reorder[index_row_major] =
480 rs_field_no_halo[index];
483 dp3m.heffte.fft->forward(rs_field_no_halo_reorder.data(),
484 dp3m.heffte.ks_dipole_density[dir].data());
486 if (not dp3m.params.tuning) {
487 std::size_t index_row_major_r2c = 0u;
488 for_each_3d_order<FFTConfig::k_space_order>(
489 mesh_start, local_size, local_index, [&]() {
490 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
491 auto constexpr KX = 2, KY = 0, KZ = 1;
492 auto const index_fft_legacy = local_index[KZ] +
493 line_stride * local_index[KY] +
494 plane_stride * local_index[KX];
495 auto const old_value = std::complex<FloatType>{
496 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy],
497 dp3m.mesh.rs_fields[dir][2 * index_fft_legacy + 1]};
498 auto const &new_value =
499 dp3m.heffte.ks_dipole_density[dir][index_row_major_r2c];
500 assert(heffte_almost_equal(new_value, old_value));
501 ++index_row_major_r2c;
512 if (energy_flag or npt_flag) {
516 if (dp3m.sum_mu2 > 0.) {
520 auto index = std::size_t(0u);
521 auto it_energy = dp3m.g_energy.begin();
522 auto node_energy = 0.;
523 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
524 auto constexpr KX = 2, KY = 0, KZ = 1;
525 auto const shift = local_index + dp3m.mesh.start;
526 auto const &d_op = dp3m.d_op[0u];
527 auto const &mesh_dip = dp3m.mesh.rs_fields;
529 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
530 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
531 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
534 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
535 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
536 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
538 node_energy += *it_energy * (Utils::sqr(re) + Utils::sqr(im));
539 std::advance(it_energy, 1);
541#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
542 if (dp3m.heffte.world_size == 1) {
543 [[maybe_unused]]
auto node_energy_heffte = 0.;
544 std::size_t index_row_major_r2c = 0u;
545 for_each_3d_order<FFTConfig::k_space_order>(
546 mesh_start, local_size, local_index, [&]() {
547 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
548 auto const global_index = local_origin + local_index;
549 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
550 auto const cell_field =
551 mesh_dip[0u][index_row_major_r2c] *
552 FloatType(dp3m.d_op[0u][global_index[1u]]) +
553 mesh_dip[1u][index_row_major_r2c] *
554 FloatType(dp3m.d_op[1u][global_index[2u]]) +
555 mesh_dip[2u][index_row_major_r2c] *
556 FloatType(dp3m.d_op[2u][global_index[0u]]);
557 auto cell_energy =
static_cast<double>(
558 dp3m.heffte.g_energy[index_row_major_r2c] *
559 std::norm(cell_field));
560 if (FFTConfig::use_r2c and (short_dim >= cutoff_left and
561 short_dim <= cutoff_right - 1)) {
569 node_energy_heffte += cell_energy;
571 ++index_row_major_r2c;
573 assert(heffte_almost_equal(
static_cast<FloatType
>(node_energy_heffte),
574 static_cast<FloatType
>(node_energy)));
577 node_energy *= dipole_prefac * std::numbers::pi * box_geo.length_inv()[0];
578 boost::mpi::reduce(
comm_cart, node_energy, energy, std::plus<>(), 0);
580 if (dp3m.energy_correction == 0.)
581 calc_energy_correction();
585 energy -= prefactor * dp3m.sum_mu2 * std::numbers::inv_sqrtpi *
586 (2. / 3.) * Utils::int_pow<3>(dp3m.params.alpha);
589 energy += prefactor * dp3m.energy_correction / box_geo.volume();
599 if (dp3m.sum_mu2 > 0.) {
600 auto const wavenumber = 2. * std::numbers::pi * box_geo.length_inv()[0u];
601 dp3m.ks_scalar.resize(dp3m.local_mesh.size);
604 auto index{std::size_t(0u)};
605 auto it_energy = dp3m.g_energy.begin();
606 auto it_ks_scalar = dp3m.ks_scalar.begin();
607 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]()
mutable {
608 auto constexpr KX = 2, KY = 0, KZ = 1;
609 auto const shift = local_index + dp3m.mesh.start;
610 auto const &d_op = dp3m.d_op[0u];
611 auto const &mesh_dip = dp3m.mesh.rs_fields;
613 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
614 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
615 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
618 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
619 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
620 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
622 *it_ks_scalar = *it_energy * std::complex<FloatType>{re, im};
623 std::advance(it_energy, 1);
624 std::advance(it_ks_scalar, 1);
627#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
628 if (dp3m.heffte.world_size == 1) {
629 std::size_t index_row_major_r2c = 0u;
630 for_each_3d_order<FFTConfig::k_space_order>(
631 mesh_start, local_size, local_index, [&]() {
632 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
633 auto const global_index = local_origin + local_index;
634 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
635 dp3m.heffte.ks_scalar[index_row_major_r2c] =
636 dp3m.heffte.g_energy[index_row_major_r2c] *
637 (mesh_dip[0u][index_row_major_r2c] *
638 FloatType(dp3m.d_op[0u][global_index[1u]]) +
639 mesh_dip[1u][index_row_major_r2c] *
640 FloatType(dp3m.d_op[1u][global_index[2u]]) +
641 mesh_dip[2u][index_row_major_r2c] *
642 FloatType(dp3m.d_op[2u][global_index[0u]]));
644 if (not dp3m.params.tuning) {
645 auto constexpr KX = 2, KY = 0, KZ = 1;
646 auto const index_fft_legacy = local_index[KZ] +
647 line_stride * local_index[KY] +
648 plane_stride * local_index[KX];
649 assert(heffte_almost_equal(
650 dp3m.heffte.ks_scalar[index_row_major_r2c],
651 dp3m.ks_scalar[index_fft_legacy]));
654 ++index_row_major_r2c;
661 for (
int d = 0; d < 3; d++) {
662 auto it_ks_scalar = dp3m.ks_scalar.begin();
664 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
665 auto const &offset = dp3m.mesh.start;
666 auto const &d_op = dp3m.d_op[0u];
667 auto const d_op_val = FloatType(d_op[local_index[d] + offset[d]]);
668 auto const &value = *it_ks_scalar;
669 dp3m.mesh.rs_scalar[index] = d_op_val * value.real();
671 dp3m.mesh.rs_scalar[index] = d_op_val * value.imag();
673 std::advance(it_ks_scalar, 1);
675#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
676 if (dp3m.heffte.world_size == 1) {
677 unsigned int constexpr d_ks[3] = {2u, 0u, 1u};
678 std::size_t index_row_major_r2c = 0u;
679 for_each_3d_order<FFTConfig::k_space_order>(
680 mesh_start, local_size, local_index, [&]() {
681 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
682 auto const global_index = local_origin + local_index;
683 auto const d_op_val =
684 FloatType(dp3m.d_op[d][global_index[d_ks[d]]]);
685 dp3m.heffte.ks_B_field_storage[index_row_major_r2c] =
686 d_op_val * dp3m.heffte.ks_scalar[index_row_major_r2c];
688 if (not dp3m.params.tuning) {
689 auto constexpr KX = 2, KY = 0, KZ = 1;
690 auto const index_fft_legacy =
691 local_index[KZ] + line_stride * local_index[KY] +
692 plane_stride * local_index[KX];
693 auto const old_value = std::complex<FloatType>{
694 dp3m.mesh.rs_scalar[2 * index_fft_legacy],
695 dp3m.mesh.rs_scalar[2 * index_fft_legacy + 1]};
696 auto const &new_value =
697 dp3m.heffte.ks_B_field_storage[index_row_major_r2c];
698 assert(heffte_almost_equal(new_value, old_value));
701 ++index_row_major_r2c;
704 dp3m.heffte.fft->backward(dp3m.heffte.ks_B_field_storage.data(),
705 dp3m.heffte.rs_B_fields_no_halo[d].data());
707 dp3m.heffte.rs_B_fields[d] =
710 std::span(dp3m.heffte.rs_B_fields_no_halo[d]),
711 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
712 dp3m.local_mesh.n_halo_ur);
715 dp3m.heffte.rs_B_fields[d].data(),
716 dp3m.local_mesh.dim);
719 dp3m.fft->backward_fft(dp3m.fft_buffers->get_scalar_mesh());
721 dp3m.fft_buffers->perform_scalar_halo_spread();
723 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
724#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
725 auto &particle_data = *system.cell_structure;
727 auto &particle_data = particles;
729 Utils::integral_parameter<int, AssignTorques, p3m_min_cao, p3m_max_cao>(
730 dp3m.params.cao, dp3m, dipole_prefac * wavenumber, d_rs,
743 auto it_force = dp3m.g_force.begin();
744 auto it_ks_scalar = dp3m.ks_scalar.begin();
745 std::size_t index = 0u;
746 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
747 auto constexpr KX = 2, KY = 0, KZ = 1;
748 auto const shift = local_index + dp3m.mesh.start;
749 auto const &d_op = dp3m.d_op[0u];
750 auto const &mesh_dip = dp3m.mesh.rs_fields;
752 auto const re = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
753 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
754 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
757 auto const im = mesh_dip[0u][index] * FloatType(d_op[shift[KX]]) +
758 mesh_dip[1u][index] * FloatType(d_op[shift[KY]]) +
759 mesh_dip[2u][index] * FloatType(d_op[shift[KZ]]);
761 *it_ks_scalar = {*it_force * im, *it_force * (-re)};
762 std::advance(it_force, 1);
763 std::advance(it_ks_scalar, 1);
767#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
768 if (dp3m.heffte.world_size == 1) {
769 std::size_t index_row_major_r2c = 0u;
770 for_each_3d_order<FFTConfig::k_space_order>(
771 mesh_start, local_size, local_index, [&]() {
772 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
773 auto const global_index = local_origin + local_index;
774 auto const &mesh_dip = dp3m.heffte.ks_dipole_density;
776 dp3m.heffte.g_force[index_row_major_r2c] *
777 (mesh_dip[0u][index_row_major_r2c] *
778 FloatType(dp3m.d_op[0u][global_index[1u]]) +
779 mesh_dip[1u][index_row_major_r2c] *
780 FloatType(dp3m.d_op[1u][global_index[2u]]) +
781 mesh_dip[2u][index_row_major_r2c] *
782 FloatType(dp3m.d_op[2u][global_index[0u]]));
783 dp3m.heffte.ks_scalar[index_row_major_r2c] = {value.imag(),
786 if (not dp3m.params.tuning) {
787 auto constexpr KX = 2, KY = 0, KZ = 1;
788 auto const index_fft_legacy = local_index[KZ] +
789 line_stride * local_index[KY] +
790 plane_stride * local_index[KX];
791 assert(heffte_almost_equal(
792 dp3m.heffte.ks_scalar[index_row_major_r2c],
793 dp3m.ks_scalar[index_fft_legacy]));
796 ++index_row_major_r2c;
803 for (
int d = 0; d < 3; d++) {
804 std::size_t index = 0u;
805 auto it_ks_scalar = dp3m.ks_scalar.begin();
806 for_each_3d(mesh_start, dp3m.mesh.size, local_index, [&]() {
807 auto constexpr KX = 2, KY = 0, KZ = 1;
808 auto const shift = local_index + dp3m.mesh.start;
809 auto const &d_op = dp3m.d_op[0u];
810 auto const &mesh_dip = dp3m.mesh.rs_fields;
811 auto const d_op_val = FloatType(d_op[shift[d]]);
812 auto const f = *it_ks_scalar * d_op_val;
813 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.real();
814 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.real();
815 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.real();
817 mesh_dip[0u][index] = FloatType(d_op[shift[KX]]) * f.imag();
818 mesh_dip[1u][index] = FloatType(d_op[shift[KY]]) * f.imag();
819 mesh_dip[2u][index] = FloatType(d_op[shift[KZ]]) * f.imag();
821 std::advance(it_ks_scalar, 1);
824#ifdef ESPRESSO_DP3M_HEFFTE_CROSS_CHECKS
825 if (dp3m.heffte.world_size == 1) {
826 std::size_t index_row_major_r2c = 0u;
827 for_each_3d_order<FFTConfig::k_space_order>(
828 mesh_start, local_size, local_index, [&]() {
829 if (not FFTConfig::use_r2c or (short_dim <= cutoff_right)) {
830 auto constexpr KX = 1, KY = 2, KZ = 0;
831 auto const global_index = local_origin + local_index;
832 auto const remapped_index =
833 local_index[KZ] + local_index[KY] * local_size[KZ] +
834 local_index[KX] * local_size[KZ] * local_size[KY];
835 auto const d_op_val =
836 FloatType(dp3m.d_op[d][global_index[d]]);
837 auto &mesh_dip = dp3m.heffte.ks_dipole_density;
838 mesh_dip[0u][index_row_major_r2c] =
839 FloatType(dp3m.d_op[d][global_index[2u]]) * d_op_val *
840 dp3m.heffte.ks_scalar[remapped_index];
841 mesh_dip[1u][index_row_major_r2c] =
842 FloatType(dp3m.d_op[d][global_index[0u]]) * d_op_val *
843 dp3m.heffte.ks_scalar[remapped_index];
844 mesh_dip[2u][index_row_major_r2c] =
845 FloatType(dp3m.d_op[d][global_index[1u]]) * d_op_val *
846 dp3m.heffte.ks_scalar[remapped_index];
848 if (not FFTConfig::use_r2c and not dp3m.params.tuning) {
849 auto const index_fft_legacy = local_index[2] +
850 line_stride * local_index[1] +
851 plane_stride * local_index[0];
852 for (
int j = 0; j < 3; ++j) {
853 auto const old_value = std::complex<FloatType>{
854 dp3m.mesh.rs_fields[j][2 * index_fft_legacy],
855 dp3m.mesh.rs_fields[j][2 * index_fft_legacy + 1]};
856 auto const &new_value = mesh_dip[j][index_row_major_r2c];
857 assert(heffte_almost_equal(new_value, old_value));
861 ++index_row_major_r2c;
864 for (
int dir = 0u; dir < 3u; ++dir) {
865 dp3m.heffte.fft->backward(
866 dp3m.heffte.ks_dipole_density[dir].data(),
867 dp3m.heffte.rs_B_fields_no_halo[dir].data());
869 dp3m.heffte.rs_B_fields[d] =
872 std::span(dp3m.heffte.rs_B_fields_no_halo[dir]),
873 dp3m.local_mesh.dim_no_halo, dp3m.local_mesh.n_halo_ld,
874 dp3m.local_mesh.n_halo_ur);
878 std::array<FloatType *, 3u>{{dp3m.heffte.rs_B_fields[0u].data(),
879 dp3m.heffte.rs_B_fields[1u].data(),
880 dp3m.heffte.rs_B_fields[2u].data()}};
881 dp3m.heffte.halo_comm.spread_grid(
::comm_cart, rs_fields,
882 dp3m.local_mesh.dim);
885 for (
auto &rs_mesh : dp3m.fft_buffers->get_vector_mesh()) {
886 dp3m.fft->backward_fft(rs_mesh);
889 dp3m.fft_buffers->perform_vector_halo_spread();
891 auto const d_rs = (d + dp3m.mesh.ks_pnum) % 3;
892#ifdef ESPRESSO_SHARED_MEMORY_PARALLELISM
893 auto &particle_data = *system.cell_structure;
895 auto &particle_data = particles;
899 dp3m.params.cao, dp3m, dipole_prefac *
Utils::sqr(wavenumber), d_rs,
906 auto const surface_term =
907 calc_surface_term(force_flag, energy_flag or npt_flag, particles);
909 energy += surface_term;
917 if (not energy_flag) {