ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
dipolar_direct_sum_gpu_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2022 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "config/config.hpp"
21
22#ifdef ESPRESSO_DIPOLAR_DIRECT_SUM
23
25
26#include "cuda/utils.cuh"
27
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
30
31#include <cuda.h>
32
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
35#endif
36
37// LCOV_EXCL_START
38__device__ inline float scalar_product(float const *a, float const *b) {
39 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
40}
41
42__device__ inline void vector_product(float const *a, float const *b,
43 float *out) {
44 out[0] = a[1] * b[2] - a[2] * b[1];
45 out[1] = a[2] * b[0] - a[0] * b[2];
46 out[2] = a[0] * b[1] - a[1] * b[0];
47}
48
49__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
50 float const b[3], float const box_l[3],
51 int const periodic[3]) {
52 for (int i = 0; i < 3; i++) {
53 res[i] = a[i] - b[i];
54 if (periodic[i])
55 res[i] -= floor(res[i] / box_l[i] + 0.5f) * box_l[i];
56 }
57}
58
59__device__ void dipole_ia_force(float const pf, float const dr[3],
60 float const *const dip1,
61 float const *const dip2, float *const f1,
62 float *const torque1, float *const torque2,
63 [[maybe_unused]] float *const dip_fld1,
64 [[maybe_unused]] float *const dip_fld2) {
65 // Powers of distance
66 auto const r_sq = scalar_product(dr, dr);
67 if (r_sq == 0.0f) {
68 f1[0] = f1[1] = f1[2] = 0.0f;
69 torque1[0] = torque1[1] = torque1[2] = 0.0f;
70 torque2[0] = torque2[1] = torque2[2] = 0.0f;
71#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
72 dip_fld1[0] = dip_fld1[1] = dip_fld1[2] = 0.0f;
73 dip_fld2[0] = dip_fld2[1] = dip_fld2[2] = 0.0f;
74#endif
75 return;
76 }
77 auto const r_sq_inv = 1.0f / r_sq;
78 auto const r_inv = rsqrtf(r_sq);
79 auto const r3_inv = 1.0f / r_sq * r_inv;
80 auto const r5_inv = r3_inv * r_sq_inv;
81 auto const r7_inv = r5_inv * r_sq_inv;
82
83 // Dot products
84 auto const pe1 = scalar_product(dip1, dip2);
85 auto const pe2 = scalar_product(dip1, dr);
86 auto const pe3 = scalar_product(dip2, dr);
87 auto const pe4 = 3.0f * r5_inv;
88
89#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
90 auto const rep1 = pe3 * pe4;
91 auto const rep2 = pe2 * pe4;
92 dip_fld1[0] = pf * (rep1 * dr[0] - dip2[0] * r3_inv);
93 dip_fld1[1] = pf * (rep1 * dr[1] - dip2[1] * r3_inv);
94 dip_fld1[2] = pf * (rep1 * dr[2] - dip2[2] * r3_inv);
95
96 dip_fld2[0] = pf * (rep2 * dr[0] - dip1[0] * r3_inv);
97 dip_fld2[1] = pf * (rep2 * dr[1] - dip1[1] * r3_inv);
98 dip_fld2[2] = pf * (rep2 * dr[2] - dip1[2] * r3_inv);
99#endif
100
101 // Force
102 auto const aa = pe4 * pe1;
103 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
104 auto const ab = aa + bb;
105 auto const cc = pe4 * pe3;
106 auto const dd = pe4 * pe2;
107
108 f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0]));
109 f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1]));
110 f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2]));
111
112 // Torques
113 float a[3];
114 vector_product(dip1, dip2, a);
115
116 float b[3];
117 vector_product(dip1, dr, b);
118
119 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
120 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
121 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
122
123 vector_product(dip2, dr, b);
124
125 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
126 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
127 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
128}
129
130__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2,
131 float const *dip1, float const *dip2,
132 float box_l[3], int periodic[3]) {
133 // Distance between particles
134 float dr[3];
135 get_mi_vector_dds(dr, r1, r2, box_l, periodic);
136
137 // Powers of distance
138 auto const r_sq = scalar_product(dr, dr);
139 auto const r_sq_inv = 1.0f / r_sq;
140 auto const r_inv = rsqrtf(r_sq);
141 auto const r3_inv = 1.0f / r_sq * r_inv;
142 auto const r5_inv = r3_inv * r_sq_inv;
143
144 // Dot products
145 auto const pe1 = scalar_product(dip1, dip2);
146 auto const pe2 = scalar_product(dip1, dr);
147 auto const pe3 = scalar_product(dip2, dr);
148 auto const pe4 = 3.0f * r5_inv;
149
150 // Energy
151 return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
152}
153// LCOV_EXCL_STOP
154
155__global__ void
156DipolarDirectSum_kernel_force(float const pf, unsigned int n,
157 float const *const pos, float const *const dip,
158 [[maybe_unused]] float *dip_fld, float *const f,
159 float *const torque, float const box_l[3],
160 int const periodic[3], int const n_replicas) {
161 unsigned int const i = blockIdx.x * blockDim.x + threadIdx.x;
162 if (i >= n)
163 return;
164
165 // Load particle i
166 auto const xi{pos[3u * i + 0u]}, yi{pos[3u * i + 1u]}, zi{pos[3u * i + 2u]};
167
168 auto const *const mi = dip + 3u * i;
169
170 // Per‐thread accumulators
171 float fsum[3] = {0.0f, 0.0f, 0.0f};
172 float tsum[3] = {0.0f, 0.0f, 0.0f};
173#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
174 float dfsum[3] = {0.0f, 0.0f, 0.0f};
175#endif
176
177 float fi[3], ti1[3], ti2[3];
178#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
179 float dfi[3], dfj[3];
180#else
181 float *const dfi{nullptr}, *const dfj{nullptr};
182#endif
183
184 // --- Minimal‐image branch (no replicas) ---
185 if (n_replicas == 0) {
186 for (unsigned int j = i + 1u; j < n; ++j) {
187 // Compute MIC vector
188 float d0[3];
189 get_mi_vector_dds(d0, pos + 3u * i, pos + 3u * j, box_l, periodic);
190 // Compute force/torque
191 dipole_ia_force(pf, d0, mi, dip + 3u * j, fi, ti1, ti2, dfi, dfj);
192
193 // Deposit on j (atomic)
194 for (unsigned int k = 0u; k < 3u; ++k) {
195 atomicAdd(f + 3u * j + k, -fi[k]);
196 atomicAdd(torque + 3u * j + k, ti2[k]);
197#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
198 atomicAdd(dip_fld + 3u * j + k, dfj[k]);
199#endif
200 }
201 // Accumulate on i
202 for (unsigned int k = 0u; k < 3u; ++k) {
203 fsum[k] += fi[k];
204 tsum[k] += ti1[k];
205#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
206 dfsum[k] += dfi[k];
207#endif
208 }
209 }
210 // Flush i's result
211 for (unsigned int k = 0u; k < 3u; ++k) {
212 atomicAdd(f + 3u * i + k, fsum[k]);
213 atomicAdd(torque + 3u * i + k, tsum[k]);
214#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
215 atomicAdd(dip_fld + 3u * i + k, dfsum[k]);
216#endif
217 }
218 return;
219 }
220 // --- Replica branch (n_replicas > 0) ---
221 int const nx = periodic[0] * n_replicas;
222 int const ny = periodic[1] * n_replicas;
223 int const nz = periodic[2] * n_replicas;
224
225 // Self‐images of i (exclude primary)
226 for (int dx = -nx; dx <= nx; ++dx) {
227 for (int dy = -ny; dy <= ny; ++dy) {
228 for (int dz = -nz; dz <= nz; ++dz) {
229 if (dx == 0 && dy == 0 && dz == 0)
230 continue;
231 float dr[3] = {static_cast<float>(dx) * box_l[0],
232 static_cast<float>(dy) * box_l[1],
233 static_cast<float>(dz) * box_l[2]};
234 dipole_ia_force(pf, dr, mi, mi, fi, ti1, ti2, dfi, dfj);
235 for (unsigned int k = 0u; k < 3u; ++k) {
236 fsum[k] += fi[k];
237 tsum[k] += ti1[k];
238#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
239 dfsum[k] += dfi[k];
240#endif
241 }
242 }
243 }
244 }
245
246 // Pairwise i <-> j over all images
247 for (unsigned int j = i + 1u; j < n; ++j) {
248 auto const xj{pos[3u * j + 0u]}, yj{pos[3u * j + 1u]}, zj{pos[3u * j + 2u]};
249 auto const *mj = dip + 3u * j;
250
251 for (int dx = -nx; dx <= nx; ++dx) {
252 for (int dy = -ny; dy <= ny; ++dy) {
253 for (int dz = -nz; dz <= nz; ++dz) {
254 float dr[3] = {(xi - xj) + static_cast<float>(dx) * box_l[0],
255 (yi - yj) + static_cast<float>(dy) * box_l[1],
256 (zi - zj) + static_cast<float>(dz) * box_l[2]};
257 dipole_ia_force(pf, dr, mi, mj, fi, ti1, ti2, dfi, dfj);
258
259 // Deposit on j (atomic)
260 for (unsigned int k = 0u; k < 3u; ++k) {
261 atomicAdd(f + 3u * j + k, -fi[k]);
262 atomicAdd(torque + 3u * j + k, ti2[k]);
263#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
264 atomicAdd(dip_fld + 3u * j + k, dfj[k]);
265#endif
266 fsum[k] += fi[k];
267 tsum[k] += ti1[k];
268#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
269 dfsum[k] += dfi[k];
270#endif
271 }
272 }
273 }
274 }
275 }
276
277 // Add i's total to global memory **atomically**,
278 // in case any other asynchronous update might race
279 for (unsigned int k = 0u; k < 3u; ++k) {
280 atomicAdd(f + 3u * i + k, fsum[k]);
281 atomicAdd(torque + 3u * i + k, tsum[k]);
282#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
283 atomicAdd(dip_fld + 3u * i + k, dfsum[k]);
284#endif
285 }
286}
287// LCOV_EXCL_START
288__device__ void dds_sumReduction(float *input, float *sum) {
289 auto const tid = static_cast<int>(threadIdx.x);
290 for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
291 __syncthreads();
292 if (tid < i / 2)
293 input[tid] += input[i / 2 + tid];
294 if ((i % 2 == 1) && (tid == 0))
295 input[tid] += input[i - 1];
296 }
297 __syncthreads();
298 if (tid == 0) {
299 sum[0] = input[0];
300 }
301}
302// LCOV_EXCL_STOP
303
304__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n,
305 float const *const pos,
306 float const *const dip,
307 float box_l[3], int periodic[3],
308 float *energySum) {
309
310 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
311 float sum = 0.0;
312 extern __shared__ float res[];
313
314 // There is one thread per particle. Each thread computes interactions
315 // with particles whose id is larger than the thread id.
316 // The result for the particle id equal to the thread id is added
317 // to global memory at the end.
318
319 if (i < n) {
320 // Summation for particle i
321 for (unsigned int j = i + 1; j < n; j++) {
322 sum += dipole_ia_energy(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i,
323 dip + 3 * j, box_l, periodic);
324 }
325
326 // Save per thread result into block shared mem
327 res[threadIdx.x] = sum;
328 } else
329 res[threadIdx.x] = 0;
330
331 // Sum results within a block
332 __syncthreads(); // Wait till all threads in block are done
333 dds_sumReduction(res, &(energySum[blockIdx.x]));
334}
335
336inline void copy_box_data(float **box_l_gpu, int **periodic_gpu,
337 float const *box_l, int const *periodic) {
338 auto const s_box = 3u * sizeof(float);
339 auto const s_per = 3u * sizeof(int);
340 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(box_l_gpu), s_box));
341 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(periodic_gpu), s_per));
342 cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice));
344 cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice));
345}
346
347void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n,
348 float const *const pos,
349 float const *const dip,
350 float *dip_fld, float *f,
351 float *torque, float box_l[3],
352 int periodic[3], int n_replicas) {
353
354 unsigned int const bs = 64;
355 dim3 grid(1, 1, 1);
356 dim3 block(1, 1, 1);
357
358 if (n == 0)
359 return;
360
361 if (n <= bs) {
362 grid.x = 1;
363 block.x = n;
364 } else {
365 grid.x = n / bs + 1;
366 block.x = bs;
367 }
368
369 float *box_l_gpu;
370 int *periodic_gpu;
371 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
372
373 KERNELCALL(DipolarDirectSum_kernel_force, grid, block, k, n, pos, dip,
374 dip_fld, f, torque, box_l_gpu, periodic_gpu, n_replicas);
375 cudaFree(box_l_gpu);
376 cudaFree(periodic_gpu);
377}
378
379void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n,
380 float const *const pos,
381 float const *const dip,
382 float box_l[3], int periodic[3],
383 float *E) {
384
385 unsigned int const bs = 512;
386 dim3 grid(1, 1, 1);
387 dim3 block(1, 1, 1);
388
389 if (n == 0)
390 return;
391
392 if (n <= bs) {
393 grid.x = 1;
394 block.x = n;
395 } else {
396 grid.x = n / bs + 1;
397 block.x = bs;
398 }
399
400 float *box_l_gpu;
401 int *periodic_gpu;
402 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
403
404 float *energySum;
405 cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x));
406
407 // This will sum the energies up to the block level
409 bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
410 energySum);
411
412 // Sum the results of all blocks
413 // One thread per block in the prev kernel
414 // KERNELCALL(sumKernel,1,1,energySum,block.x,E);
415 thrust::device_ptr<float> t(energySum);
416 float x = thrust::reduce(t, t + grid.x);
417 cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
418
419 cuda_safe_mem(cudaFree(energySum));
420 cuda_safe_mem(cudaFree(box_l_gpu));
421 cuda_safe_mem(cudaFree(periodic_gpu));
422}
423
424#endif // ESPRESSO_DIPOLAR_DIRECT_SUM
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
__device__ void dipole_ia_force(float const pf, float const dr[3], float const *const dip1, float const *const dip2, float *const f1, float *const torque1, float *const torque2, float *const dip_fld1, float *const dip_fld2)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float const *const pos, float const *const dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float const *const pos, float const *const dip, float *dip_fld, float *f, float *torque, float box_l[3], int periodic[3], int n_replicas)
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float const *const pos, float const *const dip, float box_l[3], int periodic[3], float *energySum)
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__device__ void vector_product(float const *a, float const *b, float *out)
__device__ void dds_sumReduction(float *input, float *sum)
__global__ void DipolarDirectSum_kernel_force(float const pf, unsigned int n, float const *const pos, float const *const dip, float *dip_fld, float *const f, float *const torque, float const box_l[3], int const periodic[3], int const n_replicas)
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:176
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
Definition utils.cuh:75
#define cuda_safe_mem(a)
Definition utils.cuh:73
#define KERNELCALL(_function, _grid, _block,...)
Definition utils.cuh:79