36#include <thrust/copy.h>
37#include <thrust/device_vector.h>
46#if defined(OMPI_MPI_H) || defined(_MPI_H)
47#error CU-file includes mpi.h! This should not happen!
51 return thrust::raw_pointer_cast(vec.data());
54template <
class SpanLike> std::size_t
byte_size(SpanLike
const &v) {
55 return v.size() *
sizeof(
typename SpanLike::value_type);
77 if (vec.capacity() == 0) {
78 vec = thrust::device_vector<T>(n);
86 thrust::device_vector<T>().swap(vec);
91 void free_device_memory();
117 auto obj = std::make_shared<GpuParticleData::Storage>();
156 return m_data->particle_data_device.size();
160 return m_data->particle_pos_device;
175 return m_data->particle_dip_device;
181 return m_data->particle_q_device;
186 return m_data->energy_device;
190 m_need_particles_update =
true;
191 m_data->m_need[property] =
true;
193 m_split_particle_struct =
true;
195 enable_particle_transfer();
198bool GpuParticleData::has_compatible_device_impl()
const {
210void GpuParticleData::gpu_init_particle_comm() {
216 m_data->realloc_device_memory();
241 cudaMemcpyHostToDevice,
stream[0]);
244void GpuParticleData::copy_particles_to_device(
ParticleRange const &particles,
246 if (m_communication_enabled) {
247 gather_particle_data(particles, m_data->particle_data_host,
this_node);
249 m_data->copy_particles_to_device();
250 if (m_split_particle_struct) {
251 m_data->realloc_device_memory();
252 m_data->split_particle_struct();
260 if (m_communication_enabled) {
263 m_data->copy_particle_forces_to_host();
265 m_data->copy_particle_torques_to_host();
269 auto forces_buffer = m_data->get_particle_forces_host_span();
271 auto torques_buffer = m_data->get_particle_torques_host_span();
273 auto torques_buffer = std::span<float>();
277 particles_scatter_forces(particles, forces_buffer, torques_buffer);
282 if (m_communication_enabled) {
283 if (m_data->energy_device ==
nullptr) {
292 if (m_communication_enabled) {
293 cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
294 sizeof(
GpuEnergy), cudaMemcpyDeviceToHost));
301 float *r, std::size_t n) {
302 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
306 auto const &p = particles[idx];
308 r[idx + 0u] = p.p[0u];
309 r[idx + 1u] = p.p[1u];
310 r[idx + 2u] = p.p[2u];
316 float *r,
float *q, std::size_t n) {
317 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
321 auto const &p = particles[idx];
322 r[3u * idx + 0u] = p.p[0u];
323 r[3u * idx + 1u] = p.p[1u];
324 r[3u * idx + 2u] = p.p[2u];
330 float *q, std::size_t n) {
331 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
335 auto const &p = particles[idx];
343 float *dip, std::size_t n) {
344 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
348 auto const &p = particles[idx];
352 dip[idx + 0u] = p.dip[0u];
353 dip[idx + 1u] = p.dip[1u];
354 dip[idx + 2u] = p.dip[2u];
359 auto const n_part = particle_data_device.
size();
364 dim3
const threadsPerBlock{512u, 1u, 1u};
365 dim3
const numBlocks{
static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};
369 split_kernel_rq<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
371 particle_q_device, n_part);
373 split_kernel_q<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
378 split_kernel_r<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
383 split_kernel_dip<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
391 auto const new_size = particle_data_device.size();
392 auto const resize_needed = new_size != current_size;
393 if (m_need[
prop::pos] and (resize_needed or particle_pos_device ==
nullptr)) {
394 if (particle_pos_device !=
nullptr) {
398 cudaMalloc(&particle_pos_device, 3ul * new_size *
sizeof(
float)));
401 if (m_need[
prop::dip] and (resize_needed or particle_dip_device ==
nullptr)) {
402 if (particle_dip_device !=
nullptr) {
406 cudaMalloc(&particle_dip_device, 3ul * new_size *
sizeof(
float)));
410 if (m_need[
prop::q] and (resize_needed or particle_q_device ==
nullptr)) {
411 if (particle_q_device !=
nullptr) {
414 cuda_safe_mem(cudaMalloc(&particle_q_device, new_size *
sizeof(
float)));
417 current_size = new_size;
420void GpuParticleData::Storage::free_device_memory() {
421 auto const free_device_pointer = [](
auto *&ptr) {
422 if (ptr !=
nullptr) {
432 free_device_pointer(particle_pos_device);
434 free_device_pointer(particle_dip_device);
437 free_device_pointer(particle_q_device);
439 free_device_pointer(energy_device);
std::vector< T, CudaHostAllocator< T > > pinned_vector
void free_device_vector(thrust::device_vector< T > &vec)
void resize_or_replace(thrust::device_vector< T > &vec, std::size_t n)
Resize a thrust::device_vector, or replace it with a freshly allocated one when its capacity is zero.
T * raw_data_pointer(thrust::device_vector< T > &vec)
std::size_t byte_size(SpanLike const &v)
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, float *r, float *q, std::size_t n)
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, float *dip, std::size_t n)
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, float *r, std::size_t n)
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, float *q, std::size_t n)
Host and device containers for particle data.
thrust::device_vector< float > particle_torques_device
float * particle_q_device
void split_particle_struct()
std::span< float > get_particle_forces_host_span()
pinned_vector< float > particle_torques_host
thrust::device_vector< GpuParticle > particle_data_device
void copy_particle_torques_to_host()
float * particle_pos_device
thrust::device_vector< float > particle_forces_device
GpuParticleData::prop::bitset m_need
Which particle properties are needed by GPU methods.
void copy_particle_forces_to_host()
pinned_vector< GpuParticle > particle_data_host
pinned_vector< float > particle_forces_host
GpuParticleData::GpuEnergy * energy_device
float * particle_dip_device
static auto make_shared(ResourceCleanup &cleanup_queue)
std::span< float > get_particle_torques_host_span()
void copy_particles_to_device()
void realloc_device_memory()
float * get_particle_torques_device() const
float * get_particle_charges_device() const
void copy_forces_to_host(ParticleRange const &particles, int this_node)
GpuEnergy * get_energy_device() const
float * get_particle_dipoles_device() const
float * get_particle_forces_device() const
GpuEnergy copy_energy_to_host() const
void clear_energy_on_device()
void enable_property(std::size_t property)
std::size_t n_particles() const
float * get_particle_positions_device() const
base_type::size_type size() const
Attorney for a resource deallocator.
Queue to deallocate resources before normal program termination.
void push(std::shared_ptr< Container > const &resource)
Register a resource for cleanup.
Wrapper for CUDA runtime exceptions.
cudaStream_t stream[1]
CUDA streams that allow host (CPU) and device (GPU) computation to run concurrently.
int this_node
The number of this node.
This file contains the defaults for ESPResSo.
void invoke_skip_cuda_exceptions(F &&f, Args &&...args)
Invoke a function and silently ignore any thrown cuda_runtime_error error.
This file contains the error-handling code for severe errors, like a broken bond or illegal parameter ...
void cuda_check_device()
Check that a device is available, that its compute capability is sufficient for ESPResSo,...
Energies that are retrieved from the GPU.
Subset of Particle which is copied to the GPU.
Particle properties that need to be communicated to the GPU.
static constexpr std::size_t force
static constexpr std::size_t torque
static constexpr std::size_t pos
static constexpr std::size_t dip
static constexpr std::size_t q