22#include <boost/mpi/communicator.hpp>
23#include <boost/mpi/request.hpp>
34inline std::vector<int> displacements(std::span<int const>
sizes) {
38 for (std::size_t i = 0
u; i <
displ.size(); i++) {
47std::vector<boost::mpi::request>
48iall_gatherv_impl(boost::mpi::communicator
const &comm, T
const *
in_values,
51 auto const n_nodes = comm.size();
52 auto const rank = comm.rank();
59 std::vector<boost::mpi::request>
req;
60 for (
int i = 0; i <
n_nodes; i++) {
76 detail::displacements({
sizes,
static_cast<std::size_t
>(comm.size())});
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
auto iall_gatherv(boost::mpi::communicator const &comm, T const *in_values, int in_size, T *out_values, int const *sizes)