#include <boost/mpi/communicator.hpp>
#include <boost/mpi/datatype.hpp>
#include <boost/mpi/exception.hpp>
#include <boost/mpi/nonblocking.hpp>
#include <boost/mpl/bool.hpp>

#include <mpi.h>

#include <algorithm>
#include <stdexcept>
#include <vector>
template <typename T>
void gatherv_impl(const boost::mpi::communicator &comm, const T *in_values,
                  int in_size, T *out_values, const int *sizes,
                  const int *displs, int root, boost::mpl::true_) {
  auto const type = boost::mpi::get_mpi_datatype<T>(*in_values);

  /* MPI_Gatherv takes non-const pointers for the size and displacement
   * arrays, hence the const_casts. */
  BOOST_MPI_CHECK_RESULT(MPI_Gatherv,
                         (const_cast<T *>(in_values), in_size, type,
                          out_values, const_cast<int *>(sizes),
                          const_cast<int *>(displs), type, root, comm));
}
template <typename T>
void gatherv_impl(const boost::mpi::communicator &comm, const T *in_values,
                  int in_size, T *out_values, const int *sizes,
                  const int *displs, int root, boost::mpl::false_) {
  /* Arbitrary message tag; it only has to match between the send and
   * the receives below. */
  constexpr int tag = 42;

  if (comm.rank() == root) {
    auto const n_nodes = comm.size();

    /* Copy the root's own contribution into place. */
    std::copy_n(in_values, in_size, out_values + displs[root]);

    /* Post a non-blocking receive for every other rank... */
    std::vector<boost::mpi::request> req;
    for (int i = 0; i < n_nodes; i++) {
      if (i == root)
        continue;
      req.emplace_back(comm.irecv(i, tag, out_values + displs[i], sizes[i]));
    }

    /* ...and wait until all of them have completed. */
    boost::mpi::wait_all(req.begin(), req.end());
  } else {
    comm.send(root, tag, in_values, in_size);
  }
}
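/* Illustration, not part of the original header (GathervDemoPoint is a
 * made-up name): the boost::mpi::is_mpi_datatype trait drives the tag
 * dispatch between the two overloads above. Fundamental types map to
 * native MPI datatypes and take the MPI_Gatherv path, while types
 * without a datatype mapping fall back to the point-to-point path
 * (and would additionally need Boost.Serialization support to be
 * sent). */
struct GathervDemoPoint {
  double x, y;
};
static_assert(boost::mpi::is_mpi_datatype<int>::value,
              "int takes the MPI_Gatherv code path");
static_assert(!boost::mpi::is_mpi_datatype<GathervDemoPoint>::value,
              "a plain struct takes the point-to-point code path");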
/* Forward declaration of the general overload, defined below. */
template <typename T>
void gatherv(const boost::mpi::communicator &comm, const T *in_values,
             int in_size, T *out_values, const int *sizes, const int *displs,
             int root);

/* Overload that derives the displacements from the sizes: the values of
 * rank i are stored contiguously after those of ranks 0..i-1. */
template <typename T>
void gatherv(const boost::mpi::communicator &comm, const T *in_values,
             int in_size, T *out_values, const int *sizes, int root) {
  if (comm.rank() == root) {
    std::vector<int> displ(static_cast<unsigned int>(comm.size()));

    /* The displacements are the prefix sum of the sizes. */
    int offset = 0;
    for (unsigned int i = 0; i < displ.size(); i++) {
      displ[i] = offset;
      offset += sizes[i];
    }

    gatherv(comm, in_values, in_size, out_values, sizes, displ.data(), root);
  } else {
    gatherv(comm, in_values, in_size, out_values, nullptr, nullptr, root);
  }
}
/* Overload for non-root ranks, where neither an output buffer nor
 * sizes are needed. */
template <typename T>
void gatherv(const boost::mpi::communicator &comm, const T *in_values,
             int in_size, int root) {
  if (comm.rank() == root) {
    throw std::logic_error(
        "This overload can not be called on the root rank.");
  }

  gatherv(comm, in_values, in_size, static_cast<T *>(nullptr), nullptr, root);
}
/* General entry point: dispatch at compile time on whether T maps to a
 * native MPI datatype. */
template <typename T>
void gatherv(const boost::mpi::communicator &comm, const T *in_values,
             int in_size, T *out_values, const int *sizes, const int *displs,
             int root) {
  gatherv_impl(comm, in_values, in_size, out_values, sizes, displs, root,
               boost::mpi::is_mpi_datatype<T>{});
}
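/* Usage sketch, not part of the original header; demo_gatherv and the
 * payload layout are illustrative assumptions. Every rank contributes
 * rank + 1 integers, gathered contiguously on rank 0. */
inline void demo_gatherv(const boost::mpi::communicator &comm) {
  int const root = 0;
  std::vector<int> local(static_cast<unsigned int>(comm.rank()) + 1,
                         comm.rank());

  if (comm.rank() == root) {
    int const n = comm.size();
    std::vector<int> sizes(static_cast<unsigned int>(n));
    for (int i = 0; i < n; i++)
      sizes[static_cast<unsigned int>(i)] = i + 1;

    /* Total number of gathered values: 1 + 2 + ... + n. */
    std::vector<int> out(static_cast<unsigned int>(n * (n + 1) / 2));

    /* The overload without explicit displacements packs the
     * contributions back-to-back in rank order. */
    gatherv(comm, local.data(), static_cast<int>(local.size()), out.data(),
            sizes.data(), root);
  } else {
    /* Non-root ranks only provide their own data. */
    gatherv(comm, local.data(), static_cast<int>(local.size()), root);
  }
}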