36#include <thrust/copy.h>
37#include <thrust/device_vector.h>
46#if defined(OMPI_MPI_H) || defined(_MPI_H)
47#error CU-file includes mpi.h! This should not happen!
51 return thrust::raw_pointer_cast(vec.data());
54template <
class SpanLike> std::size_t
byte_size(SpanLike
const &v) {
55 return v.size() *
sizeof(
typename SpanLike::value_type);
77 if (vec.capacity() == 0) {
78 vec = thrust::device_vector<T>(n);
86 thrust::device_vector<T>().swap(vec);
91 void free_device_memory();
104#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
108#ifdef ESPRESSO_ROTATION
113#ifdef ESPRESSO_DIPOLES
116#ifdef ESPRESSO_ELECTROSTATICS
121 auto obj = std::make_shared<GpuParticleData::Storage>();
137#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
147#ifdef ESPRESSO_ROTATION
160#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
166#ifdef ESPRESSO_ROTATION
178 return m_data->particle_data_device.size();
182 return m_data->particle_pos_device;
188#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
194#ifdef ESPRESSO_ROTATION
200#ifdef ESPRESSO_DIPOLES
202 return m_data->particle_dip_device;
206#ifdef ESPRESSO_ELECTROSTATICS
208 return m_data->particle_q_device;
213 return m_data->energy_device;
217 m_need_particles_update =
true;
218 m_data->m_need[property] =
true;
219#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
222 m_split_particle_struct =
true;
226 m_split_particle_struct =
true;
229 enable_particle_transfer();
232bool GpuParticleData::has_compatible_device_impl()
const {
244void GpuParticleData::gpu_init_particle_comm() {
250 m_data->realloc_device_memory();
260#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
265#ifdef ESPRESSO_ROTATION
274#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
279#ifdef ESPRESSO_ROTATION
287 cudaMemcpyHostToDevice,
stream[0]);
290void GpuParticleData::copy_particles_to_device(
ParticleRange const &particles,
292 if (m_communication_enabled) {
293 gather_particle_data(particles, m_data->particle_data_host,
this_node);
295 m_data->copy_particles_to_device();
296 if (m_split_particle_struct) {
297 m_data->realloc_device_memory();
298 m_data->split_particle_struct();
306 if (m_communication_enabled) {
309 m_data->copy_particle_forces_to_host();
310#ifdef ESPRESSO_ROTATION
311 m_data->copy_particle_torques_to_host();
315 auto forces_buffer = m_data->get_particle_forces_host_span();
316#ifdef ESPRESSO_ROTATION
317 auto torques_buffer = m_data->get_particle_torques_host_span();
319 auto torques_buffer = std::span<float>();
323 particles_scatter_forces(particles, forces_buffer, torques_buffer);
326#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
329 if (m_communication_enabled) {
332 m_data->copy_particle_dip_fld_to_host();
335 auto dipole_field_buffer = m_data->get_particle_dip_fld_host_span();
338 particles_scatter_dip_fld(particles, dipole_field_buffer);
344 if (m_communication_enabled) {
345 if (m_data->energy_device ==
nullptr) {
354 if (m_communication_enabled) {
355 cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
356 sizeof(
GpuEnergy), cudaMemcpyDeviceToHost));
363 float *r, std::size_t n) {
364 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
368 auto const &p = particles[idx];
370 r[idx + 0u] = p.p[0u];
371 r[idx + 1u] = p.p[1u];
372 r[idx + 2u] = p.p[2u];
375#ifdef ESPRESSO_ELECTROSTATICS
378 float *r,
float *q, std::size_t n) {
379 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
383 auto const &p = particles[idx];
384 r[3u * idx + 0u] = p.p[0u];
385 r[3u * idx + 1u] = p.p[1u];
386 r[3u * idx + 2u] = p.p[2u];
392 float *q, std::size_t n) {
393 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
397 auto const &p = particles[idx];
402#ifdef ESPRESSO_DIPOLES
405 float *dip, std::size_t n) {
406 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
410 auto const &p = particles[idx];
414 dip[idx + 0u] = p.dip[0u];
415 dip[idx + 1u] = p.dip[1u];
416 dip[idx + 2u] = p.dip[2u];
421 auto const n_part = particle_data_device.
size();
426 dim3
const threadsPerBlock{512u, 1u, 1u};
427 dim3
const numBlocks{
static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};
429#ifdef ESPRESSO_ELECTROSTATICS
431 split_kernel_rq<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
433 particle_q_device, n_part);
435 split_kernel_q<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
440 split_kernel_r<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
443#ifdef ESPRESSO_DIPOLES
445 split_kernel_dip<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
453 auto const new_size = particle_data_device.size();
454 auto const resize_needed = new_size != current_size;
455 if (m_need[
prop::pos] and (resize_needed or particle_pos_device ==
nullptr)) {
456 if (particle_pos_device !=
nullptr) {
460 cudaMalloc(&particle_pos_device, 3ul * new_size *
sizeof(
float)));
462#ifdef ESPRESSO_DIPOLES
463 if (m_need[
prop::dip] and (resize_needed or particle_dip_device ==
nullptr)) {
464 if (particle_dip_device !=
nullptr) {
468 cudaMalloc(&particle_dip_device, 3ul * new_size *
sizeof(
float)));
471#ifdef ESPRESSO_ELECTROSTATICS
472 if (m_need[
prop::q] and (resize_needed or particle_q_device ==
nullptr)) {
473 if (particle_q_device !=
nullptr) {
476 cuda_safe_mem(cudaMalloc(&particle_q_device, new_size *
sizeof(
float)));
479 current_size = new_size;
482void GpuParticleData::Storage::free_device_memory() {
483 auto const free_device_pointer = [](
auto *&ptr) {
484 if (ptr !=
nullptr) {
491#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
494#ifdef ESPRESSO_ROTATION
497 free_device_pointer(particle_pos_device);
498#ifdef ESPRESSO_DIPOLES
499 free_device_pointer(particle_dip_device);
501#ifdef ESPRESSO_ELECTROSTATICS
502 free_device_pointer(particle_q_device);
504 free_device_pointer(energy_device);
std::vector< T, CudaHostAllocator< T > > pinned_vector
void free_device_vector(thrust::device_vector< T > &vec)
void resize_or_replace(thrust::device_vector< T > &vec, std::size_t n)
Resize a thrust::device_vector, or replace it with a freshly allocated one if it holds no capacity yet.
T * raw_data_pointer(thrust::device_vector< T > &vec)
std::size_t byte_size(SpanLike const &v)
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, float *r, float *q, std::size_t n)
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, float *dip, std::size_t n)
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, float *r, std::size_t n)
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, float *q, std::size_t n)
Host and device containers for particle data.
thrust::device_vector< float > particle_torques_device
float * particle_q_device
thrust::device_vector< float > particle_dip_fld_device
void split_particle_struct()
std::span< float > get_particle_forces_host_span()
pinned_vector< float > particle_torques_host
thrust::device_vector< GpuParticle > particle_data_device
void copy_particle_torques_to_host()
float * particle_pos_device
thrust::device_vector< float > particle_forces_device
void copy_particle_dip_fld_to_host()
GpuParticleData::prop::bitset m_need
Which particle properties are needed by GPU methods.
void copy_particle_forces_to_host()
pinned_vector< GpuParticle > particle_data_host
std::span< float > get_particle_dip_fld_host_span()
pinned_vector< float > particle_forces_host
GpuParticleData::GpuEnergy * energy_device
float * particle_dip_device
static auto make_shared(ResourceCleanup &cleanup_queue)
pinned_vector< float > particle_dip_fld_host
std::span< float > get_particle_torques_host_span()
void copy_particles_to_device()
void realloc_device_memory()
float * get_particle_dip_fld_device() const
float * get_particle_torques_device() const
float * get_particle_charges_device() const
void copy_forces_to_host(ParticleRange const &particles, int this_node)
GpuEnergy * get_energy_device() const
void copy_dip_fld_to_host(ParticleRange const &particles, int this_node)
float * get_particle_dipoles_device() const
float * get_particle_forces_device() const
GpuEnergy copy_energy_to_host() const
void clear_energy_on_device()
void enable_property(std::size_t property)
std::size_t n_particles() const
float * get_particle_positions_device() const
base_type::size_type size() const
Attorney for a resource deallocator.
Queue to deallocate resources before normal program termination.
void push(std::shared_ptr< Container > const &resource)
Register a resource for cleanup.
Wrapper for CUDA runtime exceptions.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
int this_node
The MPI rank of this node.
void invoke_skip_cuda_exceptions(F &&f, Args &&...args)
Invoke a function and silently ignore any thrown cuda_runtime_error.
void cuda_check_device()
Check that a device is available, that its compute capability is sufficient for ESPResSo,...
This file contains the error-handling code for severe errors, like a broken bond or an illegal parameter ...
Energies that are retrieved from the GPU.
Subset of Particle which is copied to the GPU.
Particle properties that need to be communicated to the GPU.
static constexpr std::size_t force
static constexpr std::size_t dip_fld
static constexpr std::size_t torque
static constexpr std::size_t pos
static constexpr std::size_t dip
static constexpr std::size_t q