ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
dipolar_direct_sum_gpu_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2022 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "config/config.hpp"
21
22#ifdef DIPOLAR_DIRECT_SUM
23
25
26#include "cuda/utils.cuh"
27
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
30
31#include <cuda.h>
32
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
35#endif
36
37// LCOV_EXCL_START
38__device__ inline float scalar_product(float const *a, float const *b) {
39 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
40}
41
42__device__ inline void vector_product(float const *a, float const *b,
43 float *out) {
44 out[0] = a[1] * b[2] - a[2] * b[1];
45 out[1] = a[2] * b[0] - a[0] * b[2];
46 out[2] = a[0] * b[1] - a[1] * b[0];
47}
48
49__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
50 float const b[3], float const box_l[3],
51 int const periodic[3]) {
52 for (int i = 0; i < 3; i++) {
53 res[i] = a[i] - b[i];
54 if (periodic[i])
55 res[i] -= floor(res[i] / box_l[i] + 0.5f) * box_l[i];
56 }
57}
58
59__device__ void dipole_ia_force(float pf, float const *r1, float const *r2,
60 float const *dip1, float const *dip2, float *f1,
61 float *torque1, float *torque2, float box_l[3],
62 int periodic[3]) {
63 // Distance between particles
64 float dr[3];
65 get_mi_vector_dds(dr, r1, r2, box_l, periodic);
66
67 // Powers of distance
68 auto const r_sq = scalar_product(dr, dr);
69 auto const r_sq_inv = 1.0f / r_sq;
70 auto const r_inv = rsqrtf(r_sq);
71 auto const r3_inv = 1.0f / r_sq * r_inv;
72 auto const r5_inv = r3_inv * r_sq_inv;
73 auto const r7_inv = r5_inv * r_sq_inv;
74
75 // Dot products
76 auto const pe1 = scalar_product(dip1, dip2);
77 auto const pe2 = scalar_product(dip1, dr);
78 auto const pe3 = scalar_product(dip2, dr);
79 auto const pe4 = 3.0f * r5_inv;
80
81 // Force
82 auto const aa = pe4 * pe1;
83 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
84 auto const ab = aa + bb;
85 auto const cc = pe4 * pe3;
86 auto const dd = pe4 * pe2;
87
88 f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0]));
89 f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1]));
90 f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2]));
91
92#ifdef ROTATION
93 // Torques
94 float a[3];
95 vector_product(dip1, dip2, a);
96
97 float b[3];
98 vector_product(dip1, dr, b);
99
100 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
101 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
102 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
103
104 vector_product(dip2, dr, b);
105
106 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
107 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
108 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
109#endif
110}
111
112__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2,
113 float const *dip1, float const *dip2,
114 float box_l[3], int periodic[3]) {
115 // Distance between particles
116 float dr[3];
117 get_mi_vector_dds(dr, r1, r2, box_l, periodic);
118
119 // Powers of distance
120 auto const r_sq = scalar_product(dr, dr);
121 auto const r_sq_inv = 1.0f / r_sq;
122 auto const r_inv = rsqrtf(r_sq);
123 auto const r3_inv = 1.0f / r_sq * r_inv;
124 auto const r5_inv = r3_inv * r_sq_inv;
125
126 // Dot products
127 auto const pe1 = scalar_product(dip1, dip2);
128 auto const pe2 = scalar_product(dip1, dr);
129 auto const pe3 = scalar_product(dip2, dr);
130 auto const pe4 = 3.0f * r5_inv;
131
132 // Energy
133 return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
134}
135// LCOV_EXCL_STOP
136
137__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n,
138 float *pos, float *dip, float *f,
139 float *torque, float box_l[3],
140 int periodic[3]) {
141
142 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
143
144 if (i >= n)
145 return;
146
147 // Kahan summation based on the wikipedia article
148 // Force
149 float fi[3], fsum[3], tj[3];
150
151 // Torque
152 float ti[3], tsum[3];
153
154 // There is one thread per particle. Each thread computes interactions
155 // with particles whose id is smaller than the thread id.
156 // The force and torque of all the interaction partners of the current thread
157 // is atomically added to global results at once.
158 // The result for the particle id equal to the thread id is atomically added
159 // to global memory at the end.
160
161 // Clear summation vars
162 for (unsigned int j = 0; j < 3; j++) {
163 // Force
164 fsum[j] = 0;
165 // Torque
166 tsum[j] = 0;
167 }
168
169 for (unsigned int j = i + 1; j < n; j++) {
170 dipole_ia_force(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, fi,
171 ti, tj, box_l, periodic);
172 for (unsigned int k = 0; k < 3; k++) {
173 // Add rhs to global memory
174 atomicAdd(f + 3 * j + k, -fi[k]);
175 atomicAdd((torque + 3 * j + k), tj[k]);
176 tsum[k] += ti[k];
177 fsum[k] += fi[k];
178 }
179 }
180
181 // Add the left hand side result to global memory
182 for (int j = 0; j < 3; j++) {
183 atomicAdd(f + 3 * i + j, fsum[j]);
184 atomicAdd(torque + 3 * i + j, tsum[j]);
185 }
186}
187
188// LCOV_EXCL_START
189__device__ void dds_sumReduction(float *input, float *sum) {
190 auto const tid = static_cast<int>(threadIdx.x);
191 for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
192 __syncthreads();
193 if (tid < i / 2)
194 input[tid] += input[i / 2 + tid];
195 if ((i % 2 == 1) && (tid == 0))
196 input[tid] += input[i - 1];
197 }
198 __syncthreads();
199 if (tid == 0) {
200 sum[0] = input[0];
201 }
202}
203// LCOV_EXCL_STOP
204
205__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n,
206 float *pos, float *dip,
207 float box_l[3], int periodic[3],
208 float *energySum) {
209
210 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
211 float sum = 0.0;
212 extern __shared__ float res[];
213
214 // There is one thread per particle. Each thread computes interactions
215 // with particles whose id is larger than the thread id.
216 // The result for the particle id equal to the thread id is added
217 // to global memory at the end.
218
219 if (i < n) {
220 // Summation for particle i
221 for (unsigned int j = i + 1; j < n; j++) {
222 sum += dipole_ia_energy(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i,
223 dip + 3 * j, box_l, periodic);
224 }
225
226 // Save per thread result into block shared mem
227 res[threadIdx.x] = sum;
228 } else
229 res[threadIdx.x] = 0;
230
231 // Sum results within a block
232 __syncthreads(); // Wait till all threads in block are done
233 dds_sumReduction(res, &(energySum[blockIdx.x]));
234}
235
236inline void copy_box_data(float **box_l_gpu, int **periodic_gpu,
237 float const *box_l, int const *periodic) {
238 auto const s_box = 3u * sizeof(float);
239 auto const s_per = 3u * sizeof(int);
240 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(box_l_gpu), s_box));
241 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(periodic_gpu), s_per));
242 cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice));
244 cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice));
245}
246
247void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos,
248 float *dip, float *f, float *torque,
249 float box_l[3], int periodic[3]) {
250
251 unsigned int const bs = 64;
252 dim3 grid(1, 1, 1);
253 dim3 block(1, 1, 1);
254
255 if (n == 0)
256 return;
257
258 if (n <= bs) {
259 grid.x = 1;
260 block.x = n;
261 } else {
262 grid.x = n / bs + 1;
263 block.x = bs;
264 }
265
266 float *box_l_gpu;
267 int *periodic_gpu;
268 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
269
270 KERNELCALL(DipolarDirectSum_kernel_force, grid, block, k, n, pos, dip, f,
271 torque, box_l_gpu, periodic_gpu);
272 cudaFree(box_l_gpu);
273 cudaFree(periodic_gpu);
274}
275
276void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos,
277 float *dip, float box_l[3],
278 int periodic[3], float *E) {
279
280 unsigned int const bs = 512;
281 dim3 grid(1, 1, 1);
282 dim3 block(1, 1, 1);
283
284 if (n == 0)
285 return;
286
287 if (n <= bs) {
288 grid.x = 1;
289 block.x = n;
290 } else {
291 grid.x = n / bs + 1;
292 block.x = bs;
293 }
294
295 float *box_l_gpu;
296 int *periodic_gpu;
297 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
298
299 float *energySum;
300 cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x));
301
302 // This will sum the energies up to the block level
304 bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
305 energySum);
306
307 // Sum the results of all blocks
308 // One thread per block in the prev kernel
309 // KERNELCALL(sumKernel,1,1,energySum,block.x,E);
310 thrust::device_ptr<float> t(energySum);
311 float x = thrust::reduce(t, t + grid.x);
312 cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
313
314 cuda_safe_mem(cudaFree(energySum));
315 cuda_safe_mem(cudaFree(box_l_gpu));
316 cuda_safe_mem(cudaFree(periodic_gpu));
317}
318
319#endif
This file contains the defaults for ESPResSo.
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, float *torque2, float box_l[3], int periodic[3])
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *energySum)
__device__ void vector_product(float const *a, float const *b, float *out)
__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ void dds_sumReduction(float *input, float *sum)
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:172
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
Definition utils.cuh:75
#define cuda_safe_mem(a)
Definition utils.cuh:73
#define KERNELCALL(_function, _grid, _block,...)
Definition utils.cuh:79