ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
GpuParticleData_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2014-2022 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19/**
20 * @file
21 * CUDA kernels to convert the particles AoS to a SoA on the device.
22 */
23
24#include <config/config.hpp>
25
26#include "GpuParticleData.hpp"
27#include "ResourceCleanup.hpp"
28#include "System.hpp"
29
30#include "ParticleRange.hpp"
31#include "errorhandling.hpp"
32
33#include "cuda/init.hpp"
34#include "cuda/utils.cuh"
35
36#include <thrust/copy.h>
37#include <thrust/device_vector.h>
38
39#include <cuda.h>
40
41#include <cstddef>
42#include <cstdio>
43#include <memory>
44#include <span>
45
46#if defined(OMPI_MPI_H) || defined(_MPI_H)
47#error CU-file includes mpi.h! This should not happen!
48#endif
49
/** @brief Expose the raw device pointer backing a @c thrust::device_vector. */
template <class T> T *raw_data_pointer(thrust::device_vector<T> &vec) {
  auto *device_ptr = thrust::raw_pointer_cast(vec.data());
  return device_ptr;
}
53
/** @brief Total size in bytes of the elements held by a contiguous container. */
template <class SpanLike> std::size_t byte_size(SpanLike const &v) {
  auto const element_size = sizeof(typename SpanLike::value_type);
  return element_size * v.size();
}
57
58/**
59 * @brief Resize a @c thrust::device_vector.
60 *
61 * Due to a bug in thrust (https://github.com/thrust/thrust/issues/939),
62 * resizing or appending to default constructed containers causes undefined
63 * behavior by dereferencing a null-pointer for certain types. This
64 * function is used instead of the resize member function to side-step
65 * the problem. This is done by replacing the existing vector by a new
66 * one constructed with the desired size if resizing from capacity zero.
67 * Behaves as-if @c vec.resize(n) was called.
68 * This is fixed in Thrust 1.11, shipped in CUDA 11.3
69 * (https://github.com/NVIDIA/thrust/commit/1c4f25d9).
70 *
71 * @tparam T Type contained in the vector.
72 * @param vec Vector to resize.
73 * @param n Desired new size of the vector.
74 */
template <class T>
void resize_or_replace(thrust::device_vector<T> &vec, std::size_t n) {
  // A zero-capacity vector is replaced wholesale to side-step the thrust
  // resize bug documented above; otherwise a plain resize is safe.
  if (vec.capacity() != 0u) {
    vec.resize(n);
  } else {
    vec = thrust::device_vector<T>(n);
  }
}
83
/** @brief Release the device memory held by a @c thrust::device_vector. */
template <typename T> void free_device_vector(thrust::device_vector<T> &vec) {
  // Swap with an empty temporary so the capacity is actually deallocated;
  // clear() alone only resets the size and keeps the allocation alive.
  thrust::device_vector<T> empty_vec{};
  vec.clear();
  vec.swap(empty_vec);
}
88
/** @brief Host and device containers for particle data. */
struct GpuParticleData::Storage {
  void free_device_memory();
  // NOTE(review): declaration lines below were lost in the documentation
  // extraction and reconstructed from the member index — verify upstream.
  using DeviceMemory = ResourceCleanup::Attorney<&Storage::free_device_memory>;
  friend DeviceMemory;

public:
  /** @brief Which particle properties are needed by GPU methods. */
  GpuParticleData::prop::bitset m_need;
  /** @brief Energy accumulator on the device (lazily allocated). */
  GpuParticleData::GpuEnergy *energy_device = nullptr;
  /** @brief Particle count the split SoA buffers were last sized for. */
  std::size_t current_size = 0ul;
  pinned_vector<GpuParticle> particle_data_host;
  thrust::device_vector<GpuParticle> particle_data_device;
  pinned_vector<float> particle_forces_host;
  thrust::device_vector<float> particle_forces_device;
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  pinned_vector<float> particle_dip_fld_host;
  thrust::device_vector<float> particle_dip_fld_device;
#endif
#ifdef ESPRESSO_ROTATION
  pinned_vector<float> particle_torques_host;
  thrust::device_vector<float> particle_torques_device;
#endif
  // SoA views extracted from the AoS particle data by the split kernels.
  float *particle_pos_device = nullptr;
#ifdef ESPRESSO_DIPOLES
  float *particle_dip_device = nullptr;
#endif
#ifdef ESPRESSO_ELECTROSTATICS
  float *particle_q_device = nullptr;
#endif

  /** @brief Create a storage object registered for deferred GPU cleanup. */
  static auto make_shared(ResourceCleanup &cleanup_queue) {
    auto obj = std::make_shared<GpuParticleData::Storage>();
    cleanup_queue.push<DeviceMemory>(obj);
    return obj;
  }

  ~Storage() { free_device_memory(); }
  void realloc_device_memory();
  void split_particle_struct();
  void copy_particles_to_device();
  void copy_particle_forces_to_host() {
    if (not particle_forces_device.empty()) {
      thrust::copy(particle_forces_device.begin(), particle_forces_device.end(),
                   particle_forces_host.begin());
    }
  }

#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  void copy_particle_dip_fld_to_host() {
    if (not particle_dip_fld_device.empty()) {
      thrust::copy(particle_dip_fld_device.begin(),
                   particle_dip_fld_device.end(),
                   particle_dip_fld_host.begin());
    }
  }
#endif

#ifdef ESPRESSO_ROTATION
  void copy_particle_torques_to_host() {
    if (not particle_torques_device.empty()) {
      thrust::copy(particle_torques_device.begin(),
                   particle_torques_device.end(),
                   particle_torques_host.begin());
    }
  }
#endif
  std::span<float> get_particle_forces_host_span() {
    return {particle_forces_host.data(), particle_forces_host.size()};
  }

#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  std::span<float> get_particle_dip_fld_host_span() {
    return {particle_dip_fld_host.data(), particle_dip_fld_host.size()};
  }
#endif

#ifdef ESPRESSO_ROTATION
  std::span<float> get_particle_torques_host_span() {
    return {particle_torques_host.data(), particle_torques_host.size()};
  }
#endif
};
172
176
/** @brief Number of particles currently stored in the device AoS buffer. */
std::size_t GpuParticleData::n_particles() const {
  auto const &device_particles = m_data->particle_data_device;
  return device_particles.size();
}
180
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_positions_device() const {
  return m_data->particle_pos_device;
}
184
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_forces_device() const {
  return raw_data_pointer(m_data->particle_forces_device);
}
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_dip_fld_device() const {
  return raw_data_pointer(m_data->particle_dip_fld_device);
}
#endif
193
#ifdef ESPRESSO_ROTATION
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_torques_device() const {
  return raw_data_pointer(m_data->particle_torques_device);
}
#endif
199
#ifdef ESPRESSO_DIPOLES
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_dipoles_device() const {
  return m_data->particle_dip_device;
}
#endif
205
#ifdef ESPRESSO_ELECTROSTATICS
// NOTE(review): signature line lost in extraction; restored from the index.
float *GpuParticleData::get_particle_charges_device() const {
  return m_data->particle_q_device;
}
#endif
211
// NOTE(review): signature line lost in extraction; restored from the index.
GpuParticleData::GpuEnergy *GpuParticleData::get_energy_device() const {
  return m_data->energy_device;
}
215
void GpuParticleData::enable_property(std::size_t property) {
  m_need_particles_update = true;
  m_data->m_need[property] = true;
  // Forces/torques (and dipole fields, when tracked) are results read back
  // from the device; only the remaining properties require splitting the
  // particle AoS into SoA buffers on the device.
  auto is_result_property = property == prop::force or property == prop::torque;
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  is_result_property = is_result_property or property == prop::dip_fld;
#endif
  if (not is_result_property) {
    m_split_particle_struct = true;
  }
  enable_particle_transfer();
}
231
bool GpuParticleData::has_compatible_device_impl() const {
  auto result = false;
  // result stays false if cuda_check_device() throws (exception is swallowed)
  invoke_skip_cuda_exceptions([&result]() {
    // NOTE(review): this call was dropped by the doc extraction and
    // reconstructed from the cuda/init.hpp index — verify upstream.
    cuda_check_device();
    result = true;
  });
  return result;
}
240
241/**
242 * @brief Setup and call particle reallocation from the host.
243 */
void GpuParticleData::gpu_init_particle_comm() {
  try {
    // NOTE(review): this call was dropped by the doc extraction and
    // reconstructed from the cuda/init.hpp index — verify upstream.
    cuda_check_device();
  } catch (cuda_runtime_error const &err) {
    // escalate a recoverable CUDA error to a fatal one during setup
    throw cuda_fatal_error(err.what());
  }
  m_data->realloc_device_memory();
}
252
/**
 * @brief Resize device buffers and upload the particle AoS asynchronously.
 * NOTE(review): signature and several continuation lines were dropped by the
 * doc extraction and reconstructed from surrounding code — verify upstream.
 */
void GpuParticleData::Storage::copy_particles_to_device() {
  // resize buffers
  auto const n_part = particle_data_host.size();
  resize_or_replace(particle_data_device, n_part);
  particle_forces_host.resize(3ul * n_part);
  resize_or_replace(particle_forces_device, 3ul * n_part);

#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  particle_dip_fld_host.resize(3ul * n_part);
  resize_or_replace(particle_dip_fld_device, 3ul * n_part);
#endif

#ifdef ESPRESSO_ROTATION
  particle_torques_host.resize(3ul * n_part);
  resize_or_replace(particle_torques_device, 3ul * n_part);
#endif

  // zero out device memory for forces and torques
  cudaMemsetAsync(raw_data_pointer(particle_forces_device), 0x0,
                  byte_size(particle_forces_device), stream[0]);
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  cudaMemsetAsync(raw_data_pointer(particle_dip_fld_device), 0x0,
                  byte_size(particle_dip_fld_device), stream[0]);
#endif

#ifdef ESPRESSO_ROTATION
  cudaMemsetAsync(raw_data_pointer(particle_torques_device), 0x0,
                  byte_size(particle_torques_device), stream[0]);
#endif

  // copy particles to device
  cudaMemcpyAsync(raw_data_pointer(particle_data_device),
                  particle_data_host.data(), byte_size(particle_data_host),
                  cudaMemcpyHostToDevice, stream[0]);
}
289
void GpuParticleData::copy_particles_to_device(ParticleRange const &particles,
                                               int this_node) {
  if (not m_communication_enabled) {
    return;
  }
  // gather runs on all MPI ranks; the device upload happens on the head node
  gather_particle_data(particles, m_data->particle_data_host, this_node);
  if (this_node != 0) {
    return;
  }
  m_data->copy_particles_to_device();
  if (m_split_particle_struct) {
    m_data->realloc_device_memory();
    m_data->split_particle_struct();
  }
}
303
// NOTE(review): the first signature line was dropped by the doc extraction
// and reconstructed from the index — verify upstream.
void GpuParticleData::copy_forces_to_host(ParticleRange const &particles,
                                          int this_node) {
  if (m_communication_enabled) {
    // copy results from device memory to host memory
    if (this_node == 0) {
      m_data->copy_particle_forces_to_host();
#ifdef ESPRESSO_ROTATION
      m_data->copy_particle_torques_to_host();
#endif
    }

    auto forces_buffer = m_data->get_particle_forces_host_span();
#ifdef ESPRESSO_ROTATION
    auto torques_buffer = m_data->get_particle_torques_host_span();
#else
    auto torques_buffer = std::span<float>();
#endif

    // add forces and torques to the particles
    particles_scatter_forces(particles, forces_buffer, torques_buffer);
  }
}
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
// NOTE(review): the first signature line was dropped by the doc extraction
// and reconstructed from the index — verify upstream.
void GpuParticleData::copy_dip_fld_to_host(ParticleRange const &particles,
                                           int this_node) {
  if (m_communication_enabled) {
    // copy results from device memory to host memory
    if (this_node == 0) {
      m_data->copy_particle_dip_fld_to_host();
    }

    auto dipole_field_buffer = m_data->get_particle_dip_fld_host_span();

    // add dip_fld to the particles
    particles_scatter_dip_fld(particles, dipole_field_buffer);
  }
}
#endif
342
// NOTE(review): the signature line was dropped by the doc extraction; the
// name is inferred from the energy members used below — verify upstream.
void GpuParticleData::clear_energy_on_device() {
  if (m_communication_enabled) {
    // lazily allocate the device-side energy accumulator on first use
    if (m_data->energy_device == nullptr) {
      cuda_safe_mem(cudaMalloc(&m_data->energy_device, sizeof(GpuEnergy)));
    }
    cuda_safe_mem(cudaMemset(m_data->energy_device, 0, sizeof(GpuEnergy)));
  }
}
351
// NOTE(review): the signature line was dropped by the doc extraction and
// reconstructed from the index — verify upstream.
GpuParticleData::GpuEnergy GpuParticleData::copy_energy_to_host() const {
  // zero-initialized result if GPU communication is disabled
  GpuEnergy energy_host{};
  if (m_communication_enabled) {
    cuda_safe_mem(cudaMemcpy(&energy_host, m_data->energy_device,
                             sizeof(GpuEnergy), cudaMemcpyDeviceToHost));
  }
  return energy_host;
}
360
// Position only
// One thread per particle; out-of-range threads return early.
// NOTE(review): the __global__ signature line was dropped by the doc
// extraction and restored from the index.
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles,
                               float *r, std::size_t n) {
  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  idx *= 3u; // 3 floats per particle position
  r[idx + 0u] = p.p[0u];
  r[idx + 1u] = p.p[1u];
  r[idx + 2u] = p.p[2u];
}
374
#ifdef ESPRESSO_ELECTROSTATICS
// Position and charge
// One thread per particle; out-of-range threads return early.
// NOTE(review): the __global__ signature line was dropped by the doc
// extraction and restored from the index.
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles,
                                float *r, float *q, std::size_t n) {
  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  r[3u * idx + 0u] = p.p[0u];
  r[3u * idx + 1u] = p.p[1u];
  r[3u * idx + 2u] = p.p[2u];
  q[idx] = p.q;
}
389
// Charge only
// One thread per particle; out-of-range threads return early.
// NOTE(review): the __global__ signature line was dropped by the doc
// extraction and restored from the index.
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles,
                               float *q, std::size_t n) {
  auto const idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];
  q[idx] = p.q;
}
#endif
401
#ifdef ESPRESSO_DIPOLES
// Dipole moment
// One thread per particle; out-of-range threads return early.
// NOTE(review): the __global__ signature line was dropped by the doc
// extraction and restored from the index.
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles,
                                 float *dip, std::size_t n) {
  auto idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;

  auto const &p = particles[idx];

  idx *= 3u; // 3 floats per dipole vector

  dip[idx + 0u] = p.dip[0u];
  dip[idx + 1u] = p.dip[1u];
  dip[idx + 2u] = p.dip[2u];
}
#endif
419
/**
 * @brief Convert the device AoS particle buffer into SoA arrays.
 * NOTE(review): the signature line was dropped by the doc extraction and
 * reconstructed from the call site — verify upstream.
 */
void GpuParticleData::Storage::split_particle_struct() {
  auto const n_part = particle_data_device.size();
  if (n_part == 0ul)
    return;

  // one thread per particle; integer division rounds down, hence the +1 block
  dim3 const threadsPerBlock{512u, 1u, 1u};
  dim3 const numBlocks{static_cast<unsigned>(n_part / threadsPerBlock.x + 1ul)};

#ifdef ESPRESSO_ELECTROSTATICS
  if (m_need[prop::q] and m_need[prop::pos]) {
    split_kernel_rq<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_pos_device,
        particle_q_device, n_part);
  } else if (m_need[prop::q]) {
    split_kernel_q<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_q_device, n_part);
  } else
#endif
  if (m_need[prop::pos]) {
    split_kernel_r<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_pos_device, n_part);
  }
#ifdef ESPRESSO_DIPOLES
  if (m_need[prop::dip]) {
    split_kernel_dip<<<numBlocks, threadsPerBlock, 0, nullptr>>>(
        raw_data_pointer(particle_data_device), particle_dip_device, n_part);
  }
#endif
}
450
/**
 * @brief (Re)allocate the split SoA device arrays when the particle count
 * changed or a needed array has not been allocated yet.
 * NOTE(review): the signature and three `cuda_safe_mem(` wrapper lines were
 * dropped by the doc extraction and reconstructed — verify upstream.
 */
void GpuParticleData::Storage::realloc_device_memory() {
  auto const new_size = particle_data_device.size();
  auto const resize_needed = new_size != current_size;
  if (m_need[prop::pos] and (resize_needed or particle_pos_device == nullptr)) {
    if (particle_pos_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_pos_device));
    }
    cuda_safe_mem(
        cudaMalloc(&particle_pos_device, 3ul * new_size * sizeof(float)));
  }
#ifdef ESPRESSO_DIPOLES
  if (m_need[prop::dip] and (resize_needed or particle_dip_device == nullptr)) {
    if (particle_dip_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_dip_device));
    }
    cuda_safe_mem(
        cudaMalloc(&particle_dip_device, 3ul * new_size * sizeof(float)));
  }
#endif
#ifdef ESPRESSO_ELECTROSTATICS
  if (m_need[prop::q] and (resize_needed or particle_q_device == nullptr)) {
    if (particle_q_device != nullptr) {
      cuda_safe_mem(cudaFree(particle_q_device));
    }
    cuda_safe_mem(cudaMalloc(&particle_q_device, new_size * sizeof(float)));
  }
#endif
  current_size = new_size;
}
481
/** @brief Release every device allocation owned by this storage object. */
void GpuParticleData::Storage::free_device_memory() {
  // helper: free a raw device allocation and null out the owning pointer
  auto const release_pointer = [](auto *&device_ptr) {
    if (device_ptr != nullptr) {
      cuda_safe_mem(cudaFree(reinterpret_cast<void *>(device_ptr)));
      device_ptr = nullptr;
    }
  };
  free_device_vector(particle_data_device);
  free_device_vector(particle_forces_device);
#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
  free_device_vector(particle_dip_fld_device);
#endif
#ifdef ESPRESSO_ROTATION
  free_device_vector(particle_torques_device);
#endif
  release_pointer(particle_pos_device);
#ifdef ESPRESSO_DIPOLES
  release_pointer(particle_dip_device);
#endif
#ifdef ESPRESSO_ELECTROSTATICS
  release_pointer(particle_q_device);
#endif
  release_pointer(energy_device);
}
std::vector< T, CudaHostAllocator< T > > pinned_vector
void free_device_vector(thrust::device_vector< T > &vec)
void resize_or_replace(thrust::device_vector< T > &vec, std::size_t n)
Resize a thrust::device_vector.
T * raw_data_pointer(thrust::device_vector< T > &vec)
std::size_t byte_size(SpanLike const &v)
__global__ void split_kernel_rq(GpuParticleData::GpuParticle *particles, float *r, float *q, std::size_t n)
__global__ void split_kernel_dip(GpuParticleData::GpuParticle *particles, float *dip, std::size_t n)
__global__ void split_kernel_r(GpuParticleData::GpuParticle *particles, float *r, std::size_t n)
__global__ void split_kernel_q(GpuParticleData::GpuParticle *particles, float *q, std::size_t n)
Host and device containers for particle data.
thrust::device_vector< float > particle_torques_device
thrust::device_vector< float > particle_dip_fld_device
std::span< float > get_particle_forces_host_span()
pinned_vector< float > particle_torques_host
thrust::device_vector< GpuParticle > particle_data_device
thrust::device_vector< float > particle_forces_device
GpuParticleData::prop::bitset m_need
Which particle properties are needed by GPU methods.
pinned_vector< GpuParticle > particle_data_host
std::span< float > get_particle_dip_fld_host_span()
pinned_vector< float > particle_forces_host
GpuParticleData::GpuEnergy * energy_device
static auto make_shared(ResourceCleanup &cleanup_queue)
pinned_vector< float > particle_dip_fld_host
std::span< float > get_particle_torques_host_span()
float * get_particle_dip_fld_device() const
float * get_particle_torques_device() const
float * get_particle_charges_device() const
void copy_forces_to_host(ParticleRange const &particles, int this_node)
GpuEnergy * get_energy_device() const
void copy_dip_fld_to_host(ParticleRange const &particles, int this_node)
float * get_particle_dipoles_device() const
float * get_particle_forces_device() const
GpuEnergy copy_energy_to_host() const
void enable_property(std::size_t property)
std::size_t n_particles() const
float * get_particle_positions_device() const
A range of particles.
base_type::size_type size() const
Attorney for a resource deallocator.
Queue to deallocate resources before normal program termination.
void push(std::shared_ptr< Container > const &resource)
Register a resource for cleanup.
Fatal CUDA exception.
Wrapper for CUDA runtime exceptions.
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
int this_node
The number of this node.
void invoke_skip_cuda_exceptions(F &&f, Args &&...args)
Invoke a function and silently ignore any thrown cuda_runtime_error error.
void cuda_check_device()
Check that a device is available, that its compute capability is sufficient for ESPResSo,...
Definition init_cuda.cu:102
This file contains the errorhandling code for severe errors, like a broken bond or illegal parameter ...
Energies that are retrieved from the GPU.
Subset of Particle which is copied to the GPU.
Particle properties that need to be communicated to the GPU.
static constexpr std::size_t force
static constexpr std::size_t dip_fld
std::bitset< 6 > bitset
static constexpr std::size_t torque
static constexpr std::size_t pos
static constexpr std::size_t dip
static constexpr std::size_t q
#define cuda_safe_mem(a)
Definition utils.cuh:73