ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
GpuParticleData_cuda.cu
/*
 * Copyright (C) 2014-2022 The ESPResSo project
 *
 * This file is part of ESPResSo.
 *
 * ESPResSo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ESPResSo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
/**
 * @file
 * CUDA kernels to convert the particles AoS to a SoA on the device.
 */

#include "config/config.hpp"

#include "GpuParticleData.hpp"
#include "ResourceCleanup.hpp"
#include "System.hpp"

#include "ParticleRange.hpp"
#include "errorhandling.hpp"

#include "cuda/init.hpp"
#include "cuda/utils.cuh"

#include <utils/Span.hpp>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <cuda.h>

#include <cstddef>
#include <cstdio>
#include <memory>

#if defined(OMPI_MPI_H) || defined(_MPI_H)
#error CU-file includes mpi.h! This should not happen!
#endif

template <class T> T *raw_data_pointer(thrust::device_vector<T> &vec) {
  return thrust::raw_pointer_cast(vec.data());
}

template <class SpanLike> std::size_t byte_size(SpanLike const &v) {
  return v.size() * sizeof(typename SpanLike::value_type);
}

/**
 * @brief Resize a @c thrust::device_vector.
 *
 * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939),
 * resizing or appending to default constructed containers causes undefined
 * behavior by dereferencing a null-pointer for certain types. This
 * function is used instead of the resize member function to side-step
 * the problem. This is done by replacing the existing vector by a new
 * one constructed with the desired size if resizing from capacity zero.
 * Behaves as-if @c vec.resize(n) was called.
 * This is fixed in Thrust 1.11, shipped in CUDA 11.3
 * (https://github.com/NVIDIA/thrust/commit/1c4f25d9).
 *
 * @tparam T Type contained in the vector.
 * @param vec Vector to resize.
 * @param n Desired new size of the vector.
 */
template <class T>
void resize_or_replace(thrust::device_vector<T> &vec, std::size_t n) {
  if (vec.capacity() == 0) {
    vec = thrust::device_vector<T>(n);
  } else {
    vec.resize(n);
  }
}
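// Usage sketch for the workaround above (illustration only, not part of the
// original file): a default-constructed device vector has capacity zero, so
// the first call goes through the replacement branch, while later calls fall
// back to thrust's own resize().
//
//   thrust::device_vector<float> buf;   // capacity() == 0
//   resize_or_replace(buf, 128ul);      // replaced by a new vector of size 128
//   resize_or_replace(buf, 256ul);      // plain buf.resize(256)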

template <typename T> void free_device_vector(thrust::device_vector<T> &vec) {
  vec.clear();
  thrust::device_vector<T>().swap(vec);
}
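// The swap with an empty temporary above is the usual shrink-to-fit idiom:
// clear() only resets the size, while swapping also releases the underlying
// device allocation right away instead of keeping the old capacity around.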

/** @brief Host and device containers for particle data. */
class GpuParticleData::Storage {
  void free_device_memory();
  using DeviceMemory = ResourceCleanup::Attorney<&Storage::free_device_memory>;
  friend DeviceMemory;

public:
  /** @brief Which particle properties are needed by GPU methods. */
  GpuParticleData::prop::bitset m_need{};
  GpuParticleData::GpuEnergy *energy_device = nullptr;
  std::size_t current_size = 0ul;
  pinned_vector<GpuParticle> particle_data_host;
  thrust::device_vector<GpuParticle> particle_data_device;
  pinned_vector<float> particle_forces_host;
  thrust::device_vector<float> particle_forces_device;
#ifdef ROTATION
  pinned_vector<float> particle_torques_host;
  thrust::device_vector<float> particle_torques_device;
#endif
  float *particle_pos_device = nullptr;
#ifdef DIPOLES
  float *particle_dip_device = nullptr;
#endif
#ifdef ELECTROSTATICS
  float *particle_q_device = nullptr;
#endif

  static auto make_shared(ResourceCleanup &cleanup_queue) {
    auto obj = std::make_shared<GpuParticleData::Storage>();
    cleanup_queue.push<DeviceMemory>(obj);
    return obj;
  }

  ~Storage() { free_device_memory(); }
  void realloc_device_memory();
  void split_particle_struct();
  void copy_particles_to_device();
  void copy_particle_forces_to_host() {
    if (not particle_forces_device.empty()) {
      thrust::copy(particle_forces_device.begin(), particle_forces_device.end(),
                   particle_forces_host.begin());
    }
  }
#ifdef ROTATION
  void copy_particle_torques_to_host() {
    if (not particle_torques_device.empty()) {
      thrust::copy(particle_torques_device.begin(),
                   particle_torques_device.end(),
                   particle_torques_host.begin());
    }
  }
#endif
  Utils::Span<float> get_particle_forces_host_span() {
    return {particle_forces_host.data(), particle_forces_host.size()};
  }
#ifdef ROTATION
  Utils::Span<float> get_particle_torques_host_span() {
    return {particle_torques_host.data(), particle_torques_host.size()};
  }
#endif
};

std::size_t GpuParticleData::n_particles() const {
  return m_data->particle_data_device.size();
}

float *GpuParticleData::get_particle_positions_device() const {
  return m_data->particle_pos_device;
}

float *GpuParticleData::get_particle_forces_device() const {
  return raw_data_pointer(m_data->particle_forces_device);
}

#ifdef ROTATION
float *GpuParticleData::get_particle_torques_device() const {
  return raw_data_pointer(m_data->particle_torques_device);
}
#endif

#ifdef DIPOLES
float *GpuParticleData::get_particle_dipoles_device() const {
  return m_data->particle_dip_device;
}
#endif

#ifdef ELECTROSTATICS
float *GpuParticleData::get_particle_charges_device() const {
  return m_data->particle_q_device;
}
#endif

GpuParticleData::GpuEnergy *GpuParticleData::get_energy_device() const {
  return m_data->energy_device;
}

void GpuParticleData::enable_property(std::size_t property) {
  m_need_particles_update = true;
  m_data->m_need[property] = true;
  if (property != prop::force and property != prop::torque) {
    m_split_particle_struct = true;
  }
  enable_particle_transfer();
}
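// Illustrative note: a GPU method that needs charges enables prop::q here;
// since that property is neither the force nor the torque, the
// m_split_particle_struct flag is set, so the next particle transfer also
// runs split_particle_struct() to populate particle_q_device.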

bool GpuParticleData::has_compatible_device_impl() const {
  auto result = true;
  try {
    cuda_check_device();
  } catch (cuda_runtime_error const &err) {
    result = false;
  }
  return result;
}

/**
 * @brief Setup and call particle reallocation from the host.
 */
void GpuParticleData::gpu_init_particle_comm() {
  try {
    cuda_check_device();
  } catch (cuda_runtime_error const &err) {
    fprintf(stderr, "ERROR: %s\n", err.what());
    errexit();
  }
  m_data->realloc_device_memory();
}

void GpuParticleData::Storage::copy_particles_to_device() {
  // resize buffers
  auto const n_part = particle_data_host.size();
  resize_or_replace(particle_data_device, n_part);
  particle_forces_host.resize(3ul * n_part);
  resize_or_replace(particle_forces_device, 3ul * n_part);
#ifdef ROTATION
  particle_torques_host.resize(3ul * n_part);
  resize_or_replace(particle_torques_device, 3ul * n_part);
#endif

  // zero out device memory for forces and torques
  cudaMemsetAsync(raw_data_pointer(particle_forces_device), 0x0,
                  byte_size(particle_forces_device), stream[0]);
#ifdef ROTATION
  cudaMemsetAsync(raw_data_pointer(particle_torques_device), 0x0,
                  byte_size(particle_torques_device), stream[0]);
#endif

  // copy particles to device
  cudaMemcpyAsync(raw_data_pointer(particle_data_device),
                  particle_data_host.data(), byte_size(particle_data_host),
                  cudaMemcpyHostToDevice, stream[0]);
}

void GpuParticleData::copy_particles_to_device(ParticleRange const &particles,
                                               int this_node) {
  if (m_communication_enabled) {
    gather_particle_data(particles, m_data->particle_data_host, this_node);
    if (this_node == 0) {
      m_data->copy_particles_to_device();
      if (m_split_particle_struct) {
        m_data->realloc_device_memory();
        m_data->split_particle_struct();
      }
    }
  }
}
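// Typical per-step flow (illustrative): the particles are gathered from all
// MPI ranks into particle_data_host, then rank 0 copies them to the device
// and splits the AoS into the SoA arrays; once the GPU kernels have run,
// copy_forces_to_host() below makes the reverse trip and scatters the forces
// (and torques) back onto the particles.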

void GpuParticleData::copy_forces_to_host(ParticleRange const &particles,
                                          int this_node) {
  if (m_communication_enabled) {
    // copy results from device memory to host memory
    if (this_node == 0) {
      m_data->copy_particle_forces_to_host();
#ifdef ROTATION
      m_data->copy_particle_torques_to_host();
#endif
    }

    auto forces_buffer = m_data->get_particle_forces_host_span();
#ifdef ROTATION
    auto torques_buffer = m_data->get_particle_torques_host_span();
#else
    auto torques_buffer = Utils::Span<float>{nullptr, std::size_t{0ul}};
#endif

    // add forces and torques to the particles
    particles_scatter_forces(particles, forces_buffer, torques_buffer);
  }
}

void GpuParticleData::clear_energy_on_device() {
  if (m_communication_enabled) {
    if (m_data->energy_device == nullptr) {
      cuda_safe_mem(cudaMalloc(&m_data->energy_device, sizeof(GpuEnergy)));
    }
    cuda_safe_mem(cudaMemset(m_data->energy_device, 0, sizeof(GpuEnergy)));
  }
}

GpuParticleData::GpuEnergy GpuParticleData::copy_energy_to_host() const {
  GpuEnergy energy_host{};
  if (m_communication_enabled) {
    cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
                             sizeof(GpuEnergy), cudaMemcpyDeviceToHost));
  }
  return energy_host;
}
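// Illustrative usage note: the first function above zeroes the device-side
// GpuEnergy accumulator (allocating it on first use) before the energy
// kernels run; copy_energy_to_host() then fetches the accumulated values
// once the kernels have finished.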

// Position only
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles,
                               float *r, std::size_t n) {
  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  idx *= 3u;
  r[idx + 0u] = p.p[0u];
  r[idx + 1u] = p.p[1u];
  r[idx + 2u] = p.p[2u];
}
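// Memory layout produced by the split kernels (illustrative): particle i
// stores its position in the flat device buffer as
//   r[3*i + 0] = x_i,  r[3*i + 1] = y_i,  r[3*i + 2] = z_i
// so one thread per particle writes three consecutive floats.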

#ifdef ELECTROSTATICS
// Position and charge
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles,
                                float *r, float *q, std::size_t n) {
  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  r[3u * idx + 0u] = p.p[0u];
  r[3u * idx + 1u] = p.p[1u];
  r[3u * idx + 2u] = p.p[2u];
  q[idx] = p.q;
}

// Charge only
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles,
                               float *q, std::size_t n) {
  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  q[idx] = p.q;
}
#endif

#ifdef DIPOLES
// Dipole moment
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles,
                                 float *dip, std::size_t n) {
  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];

  idx *= 3u;

  dip[idx + 0u] = p.dip[0u];
  dip[idx + 1u] = p.dip[1u];
  dip[idx + 2u] = p.dip[2u];
}
#endif

void GpuParticleData::Storage::split_particle_struct() {
  auto const n_part = particle_data_device.size();
  if (n_part == 0ul)
    return;

  dim3 const threadsPerBlock{512u, 1u, 1u};
  dim3 const numBlocks{static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};

#ifdef ELECTROSTATICS
  if (m_need[prop::q] and m_need[prop::pos]) {
    split_kernel_rq<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_pos_device,
        particle_q_device, n_part);
  } else if (m_need[prop::q]) {
    split_kernel_q<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_q_device, n_part);
  } else
#endif
      if (m_need[prop::pos]) {
    split_kernel_r<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_pos_device, n_part);
  }
#ifdef DIPOLES
  if (m_need[prop::dip]) {
    split_kernel_dip<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_dip_device, n_part);
  }
#endif
}
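// Launch configuration note (illustrative): with 512 threads per block,
// n_part / 512 + 1 blocks are launched, which always covers every particle
// at the cost of at most one extra block; the "idx >= n" guard in each
// kernel makes the surplus threads return immediately.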

void GpuParticleData::Storage::realloc_device_memory() {
  auto const new_size = particle_data_device.size();
  auto const resize_needed = new_size != current_size;
  if (m_need[prop::pos] and (resize_needed or particle_pos_device == nullptr)) {
    if (particle_pos_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_pos_device));
    }
    cuda_safe_mem(
        cudaMalloc(&particle_pos_device, 3ul * new_size * sizeof(float)));
  }
#ifdef DIPOLES
  if (m_need[prop::dip] and (resize_needed or particle_dip_device == nullptr)) {
    if (particle_dip_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_dip_device));
    }
    cuda_safe_mem(
        cudaMalloc(&particle_dip_device, 3ul * new_size * sizeof(float)));
  }
#endif
#ifdef ELECTROSTATICS
  if (m_need[prop::q] and (resize_needed or particle_q_device == nullptr)) {
    if (particle_q_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_q_device));
    }
    cuda_safe_mem(cudaMalloc(&particle_q_device, new_size * sizeof(float)));
  }
#endif
  current_size = new_size;
}

void GpuParticleData::Storage::free_device_memory() {
  auto const free_device_pointer = [](auto *&ptr) {
    if (ptr != nullptr) {
      cuda_safe_mem(cudaFree(reinterpret_cast<void *>(ptr)));
      ptr = nullptr;
    }
  };
  free_device_vector(particle_data_device);
  free_device_vector(particle_forces_device);
#ifdef ROTATION
  free_device_vector(particle_torques_device);
#endif
  free_device_pointer(particle_pos_device);
#ifdef DIPOLES
  free_device_pointer(particle_dip_device);
#endif
#ifdef ELECTROSTATICS
  free_device_pointer(particle_q_device);
#endif
  free_device_pointer(energy_device);
}