38#include <thrust/copy.h>
39#include <thrust/device_vector.h>
47#if defined(OMPI_MPI_H) || defined(_MPI_H)
48#error CU-file includes mpi.h! This should not happen!
// NOTE(review): this excerpt is a garbled numbered source listing; the
// leading integers (52, 55, 78, ...) are original line numbers fused into
// the text, and several original lines are missing between fragments.
// Code is left byte-identical; only review comments are added.

// Fragment of raw_data_pointer(): unwraps a thrust::device_vector into a
// raw device pointer via thrust::raw_pointer_cast (function header not
// visible in this excerpt).
52 return thrust::raw_pointer_cast(vec.data());

// byte_size(): number of bytes occupied by the elements of any span-like
// container (element count times element size). The closing brace is on a
// line missing from this excerpt.
55template <
class SpanLike> std::size_t
byte_size(SpanLike
const &v) {
56 return v.size() *
sizeof(
typename SpanLike::value_type);

// Fragment of resize_or_replace(): when the vector holds no allocation yet
// (capacity 0), replace it with a freshly constructed vector of n elements.
// The else-branch (plain resize, presumably) is not visible here.
78 if (vec.capacity() == 0) {
79 vec = thrust::device_vector<T>(n);

// Fragment of free_device_vector(): swap-with-temporary idiom; unlike
// clear(), swapping with an empty vector actually releases the device
// allocation when the temporary is destroyed.
87 thrust::device_vector<T>().swap(vec);
// In-class declaration fragment (Storage): releases the raw device buffers
// owned by the storage object (definition appears near the end of the file).
92 void free_device_memory();

// Fragment of Storage::make_shared(): heap-allocates the Storage that the
// shared_ptr-based cleanup machinery manages. Registration with the cleanup
// queue is on lines missing from this excerpt.
118 auto obj = std::make_shared<GpuParticleData::Storage>();

// Fragment of n_particles(): the device-side AoS buffer is the size of truth.
157 return m_data->particle_data_device.size();

// Fragments of the raw device-pointer getters for the SoA buffers
// (positions, dipoles, charges) and the device energy accumulator.
161 return m_data->particle_pos_device;
176 return m_data->particle_dip_device;
182 return m_data->particle_q_device;
187 return m_data->energy_device;

// Fragment of enable_property(): record that particle data must be
// re-uploaded, mark the specific property as needed, request a re-split of
// the AoS particle struct into SoA buffers, and enable transfers.
191 m_need_particles_update =
true;
192 m_data->m_need[property] =
true;
194 m_split_particle_struct =
true;
196 enable_particle_transfer();
// Signature fragment only; the body of has_compatible_device_impl() is
// missing from this excerpt.
199bool GpuParticleData::has_compatible_device_impl()
const {

// Fragment of gpu_init_particle_comm(): on failure (presumably a caught
// exception 'err' from a device check on missing lines) the error is
// reported to stderr; device buffers are then (re)allocated.
212void GpuParticleData::gpu_init_particle_comm() {
216 fprintf(stderr,
"ERROR: %s\n", err.what());
219 m_data->realloc_device_memory();

// Fragment of an asynchronous host-to-device copy (trailing arguments of a
// cudaMemcpyAsync call, presumably in Storage::copy_particles_to_device)
// issued on the module-wide stream[0] rather than the default stream.
244 cudaMemcpyHostToDevice,
stream[0]);
// copy_particles_to_device(): gather particle data on the head node, upload
// it, and — if the property set changed — re-create and re-split the SoA
// device buffers. All work is gated on m_communication_enabled.
247void GpuParticleData::copy_particles_to_device(
ParticleRange const &particles,
249 if (m_communication_enabled) {
// MPI-gather the per-rank particle data into the pinned host buffer.
250 gather_particle_data(particles, m_data->particle_data_host,
this_node);
252 m_data->copy_particles_to_device();
// Property set changed since last upload: rebuild SoA buffers before
// splitting the AoS particle struct into them.
253 if (m_split_particle_struct) {
254 m_data->realloc_device_memory();
255 m_data->split_particle_struct();

// Fragment of copy_forces_to_host(): download forces (and torques, when
// compiled with rotation support — the #ifdef is on missing lines), then
// scatter them back onto the local particles.
263 if (m_communication_enabled) {
266 m_data->copy_particle_forces_to_host();
268 m_data->copy_particle_torques_to_host();
272 auto forces_buffer = m_data->get_particle_forces_host_span();
274 auto torques_buffer = m_data->get_particle_torques_host_span();
280 particles_scatter_forces(particles, forces_buffer, torques_buffer);
// Fragment of clear_energy_on_device(): lazily allocate the device-side
// energy accumulator on first use (the cudaMalloc/cudaMemset calls are on
// lines missing from this excerpt).
285 if (m_communication_enabled) {
286 if (m_data->energy_device ==
nullptr) {

// Fragment of copy_energy_to_host(): blocking device-to-host copy of a
// single GpuEnergy struct, wrapped in the project's cuda_safe_mem error
// checker. Note: sizeof(GpuEnergy) of the struct, not of the pointer.
295 if (m_communication_enabled) {
296 cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
297 sizeof(
GpuEnergy), cudaMemcpyDeviceToHost));
// AoS -> SoA split kernels: each thread copies one particle's fields from
// the GpuParticle struct array into flat float buffers.
// NOTE(review): the bounds guard (presumably "if (idx >= n) return;") and,
// for split_kernel_r/split_kernel_dip, a likely "idx *= 3u" rescaling line
// sit on lines missing from this excerpt — that would reconcile the
// apparent inconsistency between "r[idx + 0u]" here and "r[3u * idx + 0u]"
// in split_kernel_rq. TODO confirm against the full source.

// Fragment of split_kernel_r(): positions only.
304 float *r, std::size_t n) {
305 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
309 auto const &p = particles[idx];
311 r[idx + 0
u] = p.p[0
u];
312 r[idx + 1u] = p.p[1u];
313 r[idx + 2u] = p.p[2u];

// Fragment of split_kernel_rq(): positions and charges in one pass.
319 float *r,
float *q, std::size_t n) {
320 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
324 auto const &p = particles[idx];
325 r[3u * idx + 0
u] = p.p[0
u];
326 r[3u * idx + 1u] = p.p[1u];
327 r[3u * idx + 2u] = p.p[2u];

// Fragment of split_kernel_q(): charges only (the q[...] store is on a
// missing line).
333 float *q, std::size_t n) {
334 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
338 auto const &p = particles[idx];

// Fragment of split_kernel_dip(): dipole moments only.
346 float *dip, std::size_t n) {
347 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
351 auto const &p = particles[idx];
355 dip[idx + 0
u] = p.dip[0
u];
356 dip[idx + 1u] = p.dip[1u];
357 dip[idx + 2u] = p.dip[2u];
// Fragment of Storage::split_particle_struct(): launch configuration and
// kernel dispatch for the AoS -> SoA split.
362 auto const n_part = particle_data_device.
size();

// 512 threads per 1-D block; grid size n_part/512 + 1 always launches one
// extra (possibly partial) block, so the kernels rely on their in-kernel
// bounds guard for the tail.
367 dim3
const threadsPerBlock{512u, 1u, 1u};
368 dim3
const numBlocks{
static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};

// Kernel selection (presumably keyed on the m_need property flags — the
// branch conditions are on lines missing from this excerpt). All launches
// use zero dynamic shared memory on the stream denoted by nullptr.
372 split_kernel_rq<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
374 particle_q_device, n_part);
376 split_kernel_q<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
381 split_kernel_r<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
386 split_kernel_dip<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
// Fragment of Storage::realloc_device_memory(): (re)allocate the raw SoA
// device buffers for each property that is needed, whenever the particle
// count changed or the buffer was never allocated. The cudaFree of the old
// pointer inside each "!= nullptr" branch sits on lines missing from this
// excerpt — TODO confirm; without it this would leak on resize.
394 auto const new_size = particle_data_device.size();
395 auto const resize_needed = new_size != current_size;

// Positions: 3 floats per particle.
396 if (m_need[
prop::pos] and (resize_needed or particle_pos_device ==
nullptr)) {
397 if (particle_pos_device !=
nullptr) {
401 cudaMalloc(&particle_pos_device, 3ul * new_size *
sizeof(
float)));

// Dipole moments: 3 floats per particle.
404 if (m_need[
prop::dip] and (resize_needed or particle_dip_device ==
nullptr)) {
405 if (particle_dip_device !=
nullptr) {
409 cudaMalloc(&particle_dip_device, 3ul * new_size *
sizeof(
float)));

// Charges: 1 float per particle.
413 if (m_need[
prop::q] and (resize_needed or particle_q_device ==
nullptr)) {
414 if (particle_q_device !=
nullptr) {
417 cuda_safe_mem(cudaMalloc(&particle_q_device, new_size *
sizeof(
float)));

// Remember the size the buffers were allocated for.
420 current_size = new_size;
// Storage::free_device_memory(): release every raw device buffer owned by
// the storage object. The generic lambda takes the pointer by reference so
// it can both free the allocation and reset the member to nullptr (the
// cudaFree/reset statements are on lines missing from this excerpt —
// TODO confirm).
423void GpuParticleData::Storage::free_device_memory() {
424 auto const free_device_pointer = [](
auto *&ptr) {
// Only touch pointers that were actually allocated.
425 if (ptr !=
nullptr) {
435 free_device_pointer(particle_pos_device);
437 free_device_pointer(particle_dip_device);
440 free_device_pointer(particle_q_device);
442 free_device_pointer(energy_device);
std::vector< T, CudaHostAllocator< T > > pinned_vector
void free_device_vector(thrust::device_vector< T > &vec)
void resize_or_replace(thrust::device_vector< T > &vec, std::size_t n)
Resize a thrust::device_vector.
T * raw_data_pointer(thrust::device_vector< T > &vec)
std::size_t byte_size(SpanLike const &v)
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, float *r, float *q, std::size_t n)
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, float *dip, std::size_t n)
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, float *r, std::size_t n)
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, float *q, std::size_t n)
Host and device containers for particle data.
thrust::device_vector< float > particle_torques_device
float * particle_q_device
void split_particle_struct()
pinned_vector< float > particle_torques_host
thrust::device_vector< GpuParticle > particle_data_device
void copy_particle_torques_to_host()
float * particle_pos_device
thrust::device_vector< float > particle_forces_device
GpuParticleData::prop::bitset m_need
Which particle properties are needed by GPU methods.
void copy_particle_forces_to_host()
Utils::Span< float > get_particle_forces_host_span()
pinned_vector< GpuParticle > particle_data_host
pinned_vector< float > particle_forces_host
GpuParticleData::GpuEnergy * energy_device
float * particle_dip_device
static auto make_shared(ResourceCleanup &cleanup_queue)
Utils::Span< float > get_particle_torques_host_span()
void copy_particles_to_device()
void realloc_device_memory()
float * get_particle_torques_device() const
float * get_particle_charges_device() const
void copy_forces_to_host(ParticleRange const &particles, int this_node)
GpuEnergy * get_energy_device() const
float * get_particle_dipoles_device() const
float * get_particle_forces_device() const
GpuEnergy copy_energy_to_host() const
void clear_energy_on_device()
void enable_property(std::size_t property)
std::size_t n_particles() const
float * get_particle_positions_device() const
base_type::size_type size() const
Attorney for a resource deallocator.
Queue to deallocate resources before normal program termination.
void push(std::shared_ptr< Container > const &resource)
Register a resource for cleanup.
A stripped-down version of std::span from C++17.
cudaStream_t stream[1]
CUDA streams used to overlap computation on the CPU and the GPU.
int this_node
The number of this node.
This file contains the defaults for ESPResSo.
void errexit()
Exit ungracefully; produce a core dump if that option is enabled.
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
void cuda_check_device()
Check that a device is available, that its compute capability is sufficient for ESPResSo,...
static auto numBlocks(std::size_t n_part)
Get number of blocks for a given number of particles.
Energies that are retrieved from the GPU.
Subset of Particle which is copied to the GPU.
Particle properties that need to be communicated to the GPU.
static constexpr std::size_t force
static constexpr std::size_t torque
static constexpr std::size_t pos
static constexpr std::size_t dip
static constexpr std::size_t q