26#include <boost/mpi/collectives.hpp>
27#include <boost/mpi/communicator.hpp>
52template <
typename T,
class Allocator>
54 boost::mpi::communicator
const &comm,
int root = 0) {
55 auto const n_elem =
static_cast<int>(buffer.size());
57 if (comm.rank() ==
root) {
58 std::vector<int>
sizes;
59 std::vector<int>
displ;
65 buffer.resize(
static_cast<unsigned int>(
tot_size));
69 for (
int i =
sizes[
root] - 1; i >= 0; --i) {
81 gatherv(comm, buffer.data(),
n_elem,
static_cast<T *
>(
nullptr),
nullptr,
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
void gather_buffer(std::vector< T, Allocator > &buffer, boost::mpi::communicator const &comm, int root=0)
Gather buffers whose sizes differ from node to node.
void gatherv(const boost::mpi::communicator &comm, const T *in_values, int in_size, T *out_values, const int *sizes, const int *displs, int root)