ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
dipolar_direct_sum_gpu_cuda.cu
Go to the documentation of this file.
1/*
2 * Copyright (C) 2010-2022 The ESPResSo project
3 *
4 * This file is part of ESPResSo.
5 *
6 * ESPResSo is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * ESPResSo is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "config/config.hpp"
21
22#ifdef DIPOLAR_DIRECT_SUM
23
25
26#include "cuda/utils.cuh"
27
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
30
31#include <cuda.h>
32
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
35#endif
36
37__device__ inline float scalar_product(float const *a, float const *b) {
38 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
39}
40
41__device__ inline void vector_product(float const *a, float const *b,
42 float *out) {
43 out[0] = a[1] * b[2] - a[2] * b[1];
44 out[1] = a[2] * b[0] - a[0] * b[2];
45 out[2] = a[0] * b[1] - a[1] * b[0];
46}
47
48__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
49 float const b[3], float const box_l[3],
50 int const periodic[3]) {
51 for (int i = 0; i < 3; i++) {
52 res[i] = a[i] - b[i];
53 if (periodic[i])
54 res[i] -= floor(res[i] / box_l[i] + 0.5f) * box_l[i];
55 }
56}
57
58__device__ void dipole_ia_force(float pf, float const *r1, float const *r2,
59 float const *dip1, float const *dip2, float *f1,
60 float *torque1, float *torque2, float box_l[3],
61 int periodic[3]) {
62 // Distance between particles
63 float dr[3];
64 get_mi_vector_dds(dr, r1, r2, box_l, periodic);
65
66 // Powers of distance
67 auto const r_sq = scalar_product(dr, dr);
68 auto const r_sq_inv = 1.0f / r_sq;
69 auto const r_inv = rsqrtf(r_sq);
70 auto const r3_inv = 1.0f / r_sq * r_inv;
71 auto const r5_inv = r3_inv * r_sq_inv;
72 auto const r7_inv = r5_inv * r_sq_inv;
73
74 // Dot products
75 auto const pe1 = scalar_product(dip1, dip2);
76 auto const pe2 = scalar_product(dip1, dr);
77 auto const pe3 = scalar_product(dip2, dr);
78 auto const pe4 = 3.0f * r5_inv;
79
80 // Force
81 auto const aa = pe4 * pe1;
82 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
83 auto const ab = aa + bb;
84 auto const cc = pe4 * pe3;
85 auto const dd = pe4 * pe2;
86
87 f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0]));
88 f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1]));
89 f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2]));
90
91#ifdef ROTATION
92 // Torques
93 float a[3];
94 vector_product(dip1, dip2, a);
95
96 float b[3];
97 vector_product(dip1, dr, b);
98
99 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
100 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
101 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
102
103 vector_product(dip2, dr, b);
104
105 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
106 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
107 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
108#endif
109}
110
111__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2,
112 float const *dip1, float const *dip2,
113 float box_l[3], int periodic[3]) {
114 // Distance between particles
115 float dr[3];
116 get_mi_vector_dds(dr, r1, r2, box_l, periodic);
117
118 // Powers of distance
119 auto const r_sq = scalar_product(dr, dr);
120 auto const r_sq_inv = 1.0f / r_sq;
121 auto const r_inv = rsqrtf(r_sq);
122 auto const r3_inv = 1.0f / r_sq * r_inv;
123 auto const r5_inv = r3_inv * r_sq_inv;
124
125 // Dot products
126 auto const pe1 = scalar_product(dip1, dip2);
127 auto const pe2 = scalar_product(dip1, dr);
128 auto const pe3 = scalar_product(dip2, dr);
129 auto const pe4 = 3.0f * r5_inv;
130
131 // Energy
132 return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
133}
134
135__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n,
136 float *pos, float *dip, float *f,
137 float *torque, float box_l[3],
138 int periodic[3]) {
139
140 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
141
142 if (i >= n)
143 return;
144
145 // Kahan summation based on the wikipedia article
146 // Force
147 float fi[3], fsum[3], tj[3];
148
149 // Torque
150 float ti[3], tsum[3];
151
152 // There is one thread per particle. Each thread computes interactions
153 // with particles whose id is smaller than the thread id.
154 // The force and torque of all the interaction partners of the current thread
155 // is atomically added to global results at once.
156 // The result for the particle id equal to the thread id is atomically added
157 // to global memory at the end.
158
159 // Clear summation vars
160 for (unsigned int j = 0; j < 3; j++) {
161 // Force
162 fsum[j] = 0;
163 // Torque
164 tsum[j] = 0;
165 }
166
167 for (unsigned int j = i + 1; j < n; j++) {
168 dipole_ia_force(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, fi,
169 ti, tj, box_l, periodic);
170 for (unsigned int k = 0; k < 3; k++) {
171 // Add rhs to global memory
172 atomicAdd(f + 3 * j + k, -fi[k]);
173 atomicAdd((torque + 3 * j + k), tj[k]);
174 tsum[k] += ti[k];
175 fsum[k] += fi[k];
176 }
177 }
178
179 // Add the left hand side result to global memory
180 for (int j = 0; j < 3; j++) {
181 atomicAdd(f + 3 * i + j, fsum[j]);
182 atomicAdd(torque + 3 * i + j, tsum[j]);
183 }
184}
185
186__device__ void dds_sumReduction(float *input, float *sum) {
187 auto const tid = static_cast<int>(threadIdx.x);
188 for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
190 if (tid < i / 2)
191 input[tid] += input[i / 2 + tid];
192 if ((i % 2 == 1) && (tid == 0))
193 input[tid] += input[i - 1];
194 }
196 if (tid == 0) {
197 sum[0] = input[0];
198 }
199}
200
201__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n,
202 float *pos, float *dip,
203 float box_l[3], int periodic[3],
204 float *energySum) {
205
206 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
207 float sum = 0.0;
208 extern __shared__ float res[];
209
210 // There is one thread per particle. Each thread computes interactions
211 // with particles whose id is larger than the thread id.
212 // The result for the particle id equal to the thread id is added
213 // to global memory at the end.
214
215 if (i < n) {
216 // Summation for particle i
217 for (unsigned int j = i + 1; j < n; j++) {
218 sum += dipole_ia_energy(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i,
219 dip + 3 * j, box_l, periodic);
220 }
221
222 // Save per thread result into block shared mem
223 res[threadIdx.x] = sum;
224 } else
225 res[threadIdx.x] = 0;
226
227 // Sum results within a block
228 __syncthreads(); // Wait till all threads in block are done
229 dds_sumReduction(res, &(energySum[blockIdx.x]));
230}
231
232inline void copy_box_data(float **box_l_gpu, int **periodic_gpu,
233 float const *box_l, int const *periodic) {
234 auto const s_box = 3u * sizeof(float);
235 auto const s_per = 3u * sizeof(int);
236 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(box_l_gpu), s_box));
237 cuda_safe_mem(cudaMalloc(reinterpret_cast<void **>(periodic_gpu), s_per));
238 cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice));
240 cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice));
241}
242
243void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos,
244 float *dip, float *f, float *torque,
245 float box_l[3], int periodic[3]) {
246
247 unsigned int const bs = 64;
248 dim3 grid(1, 1, 1);
249 dim3 block(1, 1, 1);
250
251 if (n == 0)
252 return;
253
254 if (n <= bs) {
255 grid.x = 1;
256 block.x = n;
257 } else {
258 grid.x = n / bs + 1;
259 block.x = bs;
260 }
261
262 float *box_l_gpu;
263 int *periodic_gpu;
264 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
265
267 torque, box_l_gpu, periodic_gpu);
268 cudaFree(box_l_gpu);
269 cudaFree(periodic_gpu);
270}
271
272void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos,
273 float *dip, float box_l[3],
274 int periodic[3], float *E) {
275
276 unsigned int const bs = 512;
277 dim3 grid(1, 1, 1);
278 dim3 block(1, 1, 1);
279
280 if (n == 0)
281 return;
282
283 if (n <= bs) {
284 grid.x = 1;
285 block.x = n;
286 } else {
287 grid.x = n / bs + 1;
288 block.x = bs;
289 }
290
291 float *box_l_gpu;
292 int *periodic_gpu;
293 copy_box_data(&box_l_gpu, &periodic_gpu, box_l, periodic);
294
295 float *energySum;
296 cuda_safe_mem(cudaMalloc(&energySum, sizeof(float) * grid.x));
297
298 // This will sum the energies up to the block level
300 bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
301 energySum);
302
303 // Sum the results of all blocks
304 // One thread per block in the prev kernel
305 // KERNELCALL(sumKernel,1,1,energySum,block.x,E);
306 thrust::device_ptr<float> t(energySum);
307 float x = thrust::reduce(t, t + grid.x);
308 cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
309
310 cuda_safe_mem(cudaFree(energySum));
311 cuda_safe_mem(cudaFree(box_l_gpu));
312 cuda_safe_mem(cudaFree(periodic_gpu));
313}
314
315#endif
float sum
float f[3]
__global__ float float * torque
float dr[3]
__shared__ int pos[MAXDEPTH *THREADS5/WARPSIZE]
__shared__ float res[]
__global__ float * energySum
__syncthreads()
This file contains the defaults for ESPResSo.
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, float *torque2, float box_l[3], int periodic[3])
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *energySum)
__device__ void vector_product(float const *a, float const *b, float *out)
__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ void dds_sumReduction(float *input, float *sum)
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:174
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
Definition utils.cuh:73
#define cuda_safe_mem(a)
Definition utils.cuh:71
#define KERNELCALL(_function, _grid, _block,...)
Definition utils.cuh:77