ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
GpuParticleData_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2014-2026 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19/**
20 * @file
21 * CUDA kernels to convert the particles AoS to a SoA on the device.
22 */
23
24#include <config/config.hpp>
25
26#include "GpuParticleData.hpp"
27#include "System.hpp"
28
29#include "ParticleRange.hpp"
30#include "errorhandling.hpp"
31
32#include "cuda/init.hpp"
33#include "cuda/utils.cuh"
34
35#include <thrust/copy.h>
36#include <thrust/device_vector.h>
37
38#include <cuda.h>
39
40#include <cstddef>
41#include <cstdio>
42#include <memory>
43#include <span>
44
45#if defined(OMPI_MPI_H) || defined(_MPI_H)
46#error CU-file includes mpi.h! This should not happen!
47#endif
48
/** @brief Extract a raw device pointer from a @c thrust::device_vector,
 *  suitable for passing to CUDA kernels. */
template <class T> T *raw_data_pointer(thrust::device_vector<T> &vec) {
  auto *const device_ptr = thrust::raw_pointer_cast(vec.data());
  return device_ptr;
}
52
/** @brief Number of bytes covered by the elements of a contiguous container. */
template <class SpanLike> std::size_t byte_size(SpanLike const &v) {
  using value_type = typename SpanLike::value_type;
  return sizeof(value_type) * v.size();
}
56
57/**
58 * @brief Resize a @c thrust::device_vector.
59 *
60 * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939),
61 * resizing or appending to default constructed containers causes undefined
62 * behavior by dereferencing a null-pointer for certain types. This
63 * function is used instead of the resize member function to side-step
64 * the problem. This is done by replacing the existing vector by a new
65 * one constructed with the desired size if resizing from capacity zero.
66 * Behaves as-if @c vec.resize(n) was called.
67 * This is fixed in Thrust 1.11, shipped in CUDA 11.3
68 * (https://github.com/NVIDIA/thrust/commit/1c4f25d9).
 * However, Clang 20's UBSan still triggers a runtime error as of CUDA 12.0,
 * so the workaround is kept.
70 *
71 * @tparam T Type contained in the vector.
72 * @param vec Vector to resize.
73 * @param n Desired new size of the vector.
74 */
template <class T>
void resize_or_replace(thrust::device_vector<T> &vec, std::size_t n) {
  // A vector that has already allocated storage can be resized normally.
  if (vec.capacity() != 0u) {
    vec.resize(n);
    return;
  }
  // Never-allocated vector: replace it wholesale to side-step the thrust
  // null-pointer-dereference bug described above.
  vec = thrust::device_vector<T>(n);
}
83
/** @brief Release both the size and the capacity of a device vector.
 *  @c clear() alone keeps the allocation; swapping with an empty temporary
 *  forces the device memory to be freed immediately. */
template <typename T> void free_device_vector(thrust::device_vector<T> &vec) {
  vec.clear();
  thrust::device_vector<T> empty;
  empty.swap(vec);
}
88
89/** @brief Host and device containers for particle data. */
91public:
92 /** @brief Which particle properties are needed by GPU methods. */
95 std::size_t current_size = 0ul;
97 thrust::device_vector<GpuParticle> particle_data_device;
99 thrust::device_vector<float> particle_forces_device;
100#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
102 thrust::device_vector<float> particle_dip_fld_device;
103#endif
104#ifdef ESPRESSO_ROTATION
106 thrust::device_vector<float> particle_torques_device;
107#endif
108 float *particle_pos_device = nullptr;
109#ifdef ESPRESSO_DIPOLES
110 float *particle_dip_device = nullptr;
111#endif
112#ifdef ESPRESSO_ELECTROSTATICS
113 float *particle_q_device = nullptr;
114#endif
115
116 ~Storage();
121 if (not particle_forces_device.empty()) {
122 thrust::copy(particle_forces_device.begin(), particle_forces_device.end(),
123 particle_forces_host.begin());
124 }
125 }
126
127#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
129 if (not particle_dip_fld_device.empty()) {
130 thrust::copy(particle_dip_fld_device.begin(),
132 particle_dip_fld_host.begin());
133 }
134 }
135#endif
136
137#ifdef ESPRESSO_ROTATION
139 if (not particle_torques_device.empty()) {
140 thrust::copy(particle_torques_device.begin(),
142 particle_torques_host.begin());
143 }
144 }
145#endif
146 std::span<float> get_particle_forces_host_span() {
147 return {particle_forces_host.data(), particle_forces_host.size()};
148 }
149
150#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
152 return {particle_dip_fld_host.data(), particle_dip_fld_host.size()};
153 }
154#endif
155
156#ifdef ESPRESSO_ROTATION
158 return {particle_torques_host.data(), particle_torques_host.size()};
159 }
160#endif
161};
162
163// default ctor/dtor definitions must appear out-of-line due to the
164// forward-declaration of the Storage class
167
169 m_data = std::make_unique<GpuParticleData::Storage>();
170 get_system().cleanup_queue.push<DeviceMemory>(shared_from_this());
171}
172
/** @brief Tear down the GPU particle buffers by destroying the storage object. */
void GpuParticleData::deinitialize() noexcept { m_data.reset(); }
174
/** @brief Number of particles currently stored in the device AoS buffer. */
std::size_t GpuParticleData::n_particles() const {
  auto const &device_particles = m_data->particle_data_device;
  return device_particles.size();
}
178
180 return m_data->particle_pos_device;
181}
182
184 return raw_data_pointer(m_data->particle_forces_device);
185}
186#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
188 return raw_data_pointer(m_data->particle_dip_fld_device);
189}
190#endif
191
192#ifdef ESPRESSO_ROTATION
194 return raw_data_pointer(m_data->particle_torques_device);
195}
196#endif
197
198#ifdef ESPRESSO_DIPOLES
200 return m_data->particle_dip_device;
201}
202#endif
203
204#ifdef ESPRESSO_ELECTROSTATICS
206 return m_data->particle_q_device;
207}
208#endif
209
211 return m_data->energy_device;
212}
213
215 m_need_particles_update = true;
216 m_data->m_need[property] = true;
217#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
220 m_split_particle_struct = true;
221 }
222#else
224 m_split_particle_struct = true;
225 }
226#endif
227 enable_particle_transfer();
228}
229
230bool GpuParticleData::has_compatible_device_impl() const {
231 auto result = false;
232 invoke_skip_cuda_exceptions([&result]() {
234 result = true;
235 });
236 return result;
237}
238
239/**
240 * @brief Setup and call particle reallocation from the host.
241 */
242void GpuParticleData::gpu_init_particle_comm() {
243 try {
245 } catch (cuda_runtime_error const &err) {
246 throw cuda_fatal_error(err.what());
247 }
248 m_data->realloc_device_memory();
249}
250
252 // resize buffers
253 auto const n_part = particle_data_host.size();
255 particle_forces_host.resize(3ul * n_part);
257
258#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
259 particle_dip_fld_host.resize(3ul * n_part);
261#endif
262
263#ifdef ESPRESSO_ROTATION
264 particle_torques_host.resize(3ul * n_part);
266#endif
267
268 // zero out device memory for forces and torques
271
272#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
275#endif
276
277#ifdef ESPRESSO_ROTATION
280#endif
281
282 // copy particles to device
286}
287
/** @brief Gather particle data from all MPI ranks and upload it to the GPU.
 *  Only the head node (rank 0) performs the device upload; if any GPU method
 *  requested split (SoA) properties, the device buffers are (re)allocated and
 *  the AoS data is scattered into them afterwards. */
void GpuParticleData::copy_particles_to_device(ParticleRange const &particles,
                                               int this_node) {
  if (not m_communication_enabled) {
    return;
  }
  gather_particle_data(particles, m_data->particle_data_host, this_node);
  if (this_node != 0) {
    return;
  }
  m_data->copy_particles_to_device();
  if (m_split_particle_struct) {
    // reallocate before splitting: the particle count may have changed
    m_data->realloc_device_memory();
    m_data->split_particle_struct();
  }
}
301
303 int this_node) {
304 if (m_communication_enabled) {
305 // copy results from device memory to host memory
306 if (this_node == 0) {
307 m_data->copy_particle_forces_to_host();
308#ifdef ESPRESSO_ROTATION
309 m_data->copy_particle_torques_to_host();
310#endif
311 }
312
313 auto forces_buffer = m_data->get_particle_forces_host_span();
314#ifdef ESPRESSO_ROTATION
315 auto torques_buffer = m_data->get_particle_torques_host_span();
316#else
317 auto torques_buffer = std::span<float>();
318#endif
319
320 // add forces and torques to the particles
321 particles_scatter_forces(particles, forces_buffer, torques_buffer);
322 }
323}
324#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
326 int this_node) {
327 if (m_communication_enabled) {
328 // copy results from device memory to host memory
329 if (this_node == 0) {
330 m_data->copy_particle_dip_fld_to_host();
331 }
332
333 auto dipole_field_buffer = m_data->get_particle_dip_fld_host_span();
334
335 // add dip_fld to the particles
336 particles_scatter_dip_fld(particles, dipole_field_buffer);
337 }
338}
339#endif
340
342 if (m_communication_enabled) {
343 if (m_data->energy_device == nullptr) {
344 cuda_safe_mem(cudaMalloc(&m_data->energy_device, sizeof(GpuEnergy)));
345 }
346 cuda_safe_mem(cudaMemset(m_data->energy_device, 0, sizeof(GpuEnergy)));
347 }
348}
349
352 if (m_communication_enabled) {
353 cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
355 }
356 return energy_host;
357}
358
359// Position only
361 float *r, std::size_t n) {
362 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
363 if (idx >= n)
364 return;
365
366 auto const &p = particles[idx];
367 idx *= 3u;
368 r[idx + 0u] = p.p[0u];
369 r[idx + 1u] = p.p[1u];
370 r[idx + 2u] = p.p[2u];
371}
372
373#ifdef ESPRESSO_ELECTROSTATICS
374// Position and charge
376 float *r, float *q, std::size_t n) {
377 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
378 if (idx >= n)
379 return;
380
381 auto const &p = particles[idx];
382 r[3u * idx + 0u] = p.p[0u];
383 r[3u * idx + 1u] = p.p[1u];
384 r[3u * idx + 2u] = p.p[2u];
385 q[idx] = p.q;
386}
387
388// Charge only
390 float *q, std::size_t n) {
391 auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
392 if (idx >= n)
393 return;
394
395 auto const &p = particles[idx];
396 q[idx] = p.q;
397}
398#endif
399
400#ifdef ESPRESSO_DIPOLES
401// Dipole moment
403 float *dip, std::size_t n) {
404 auto idx = blockDim.x * blockIdx.x + threadIdx.x;
405 if (idx >= n)
406 return;
407
408 auto const &p = particles[idx];
409
410 idx *= 3u;
411
412 dip[idx + 0u] = p.dip[0u];
413 dip[idx + 1u] = p.dip[1u];
414 dip[idx + 2u] = p.dip[2u];
415}
416#endif
417
419 auto const n_part = particle_data_device.size();
420 if (n_part == 0ul)
421 return;
422
424 dim3 const threadsPerBlock{512u, 1u, 1u};
425 dim3 const numBlocks{static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};
426
427#ifdef ESPRESSO_ELECTROSTATICS
428 if (m_need[prop::q] and m_need[prop::pos]) {
430 raw_data_pointer(particle_data_device), particle_pos_device,
431 particle_q_device, n_part);
432 } else if (m_need[prop::q]) {
434 raw_data_pointer(particle_data_device), particle_q_device, n_part);
435 } else
436#endif
437 if (m_need[prop::pos]) {
439 raw_data_pointer(particle_data_device), particle_pos_device, n_part);
440 }
441#ifdef ESPRESSO_DIPOLES
442 if (m_need[prop::dip]) {
444 raw_data_pointer(particle_data_device), particle_dip_device, n_part);
445 }
446#endif
447}
448
451 auto const new_size = particle_data_device.size();
452 auto const resize_needed = new_size != current_size;
453 if (m_need[prop::pos] and (resize_needed or particle_pos_device == nullptr)) {
454 if (particle_pos_device != nullptr) {
455 cuda_safe_mem(cudaFree(particle_pos_device));
456 }
458 cudaMalloc(&particle_pos_device, 3ul * new_size * sizeof(float)));
459 }
460#ifdef ESPRESSO_DIPOLES
461 if (m_need[prop::dip] and (resize_needed or particle_dip_device == nullptr)) {
462 if (particle_dip_device != nullptr) {
463 cuda_safe_mem(cudaFree(particle_dip_device));
464 }
466 cudaMalloc(&particle_dip_device, 3ul * new_size * sizeof(float)));
467 }
468#endif
469#ifdef ESPRESSO_ELECTROSTATICS
470 if (m_need[prop::q] and (resize_needed or particle_q_device == nullptr)) {
471 if (particle_q_device != nullptr) {
472 cuda_safe_mem(cudaFree(particle_q_device));
473 }
474 cuda_safe_mem(cudaMalloc(&particle_q_device, new_size * sizeof(float)));
475 }
476#endif
477 current_size = new_size;
478}
479
481 auto const free_device_pointer = [](auto *&ptr) noexcept {
482 if (ptr != nullptr) {
483 cudaFree(reinterpret_cast<void *>(ptr));
484 ptr = nullptr;
485 }
486 };
487 free_device_pointer(particle_pos_device);
488#ifdef ESPRESSO_DIPOLES
489 free_device_pointer(particle_dip_device);
490#endif
491#ifdef ESPRESSO_ELECTROSTATICS
492 free_device_pointer(particle_q_device);
493#endif
494 free_device_pointer(energy_device);
495}
std::vector< T, CudaHostAllocator< T > > pinned_vector
void free_device_vector(thrust::device_vector< T > &vec)
void resize_or_replace(thrust::device_vector< T > &vec, std::size_t n)
Resize a thrust::device_vector.
T * raw_data_pointer(thrust::device_vector< T > &vec)
std::size_t byte_size(SpanLike const &v)
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, float *r, float *q, std::size_t n)
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, float *dip, std::size_t n)
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, float *r, std::size_t n)
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, float *q, std::size_t n)
Host and device containers for particle data.
thrust::device_vector< float > particle_torques_device
thrust::device_vector< float > particle_dip_fld_device
std::span< float > get_particle_forces_host_span()
pinned_vector< float > particle_torques_host
thrust::device_vector< GpuParticle > particle_data_device
thrust::device_vector< float > particle_forces_device
GpuParticleData::prop::bitset m_need
Which particle properties are needed by GPU methods.
pinned_vector< GpuParticle > particle_data_host
std::span< float > get_particle_dip_fld_host_span()
pinned_vector< float > particle_forces_host
GpuParticleData::GpuEnergy * energy_device
pinned_vector< float > particle_dip_fld_host
std::span< float > get_particle_torques_host_span()
float * get_particle_dip_fld_device() const
float * get_particle_torques_device() const
float * get_particle_charges_device() const
void copy_forces_to_host(ParticleRange const &particles, int this_node)
GpuEnergy * get_energy_device() const
void copy_dip_fld_to_host(ParticleRange const &particles, int this_node)
float * get_particle_dipoles_device() const
float * get_particle_forces_device() const
GpuEnergy copy_energy_to_host() const
void enable_property(std::size_t property)
std::size_t n_particles() const
float * get_particle_positions_device() const
A range of particles.
base_type::size_type size() const
Attorney for a resource deallocator.
Fatal CUDA exception.
Wrapper for CUDA runtime exceptions.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
int this_node
The number of this node.
void invoke_skip_cuda_exceptions(F &&f, Args &&...args)
Invoke a function and silently ignore any thrown cuda_runtime_error error.
void cuda_check_device()
Check that a device is available, that its compute capability is sufficient for ESPResSo,...
Definition init_cuda.cu:102
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
Energies that are retrieved from the GPU.
Subset of Particle which is copied to the GPU.
Particle properties that need to be communicated to the GPU.
static constexpr std::size_t force
static constexpr std::size_t dip_fld
std::bitset< 6 > bitset
static constexpr std::size_t torque
static constexpr std::size_t pos
static constexpr std::size_t dip
static constexpr std::size_t q
#define cuda_safe_mem(a)
Definition utils.cuh:73