22#ifdef DIPOLAR_DIRECT_SUM
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
39 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
44 out[0] = a[1] * b[2] - a[2] * b[1];
45 out[1] = a[2] * b[0] - a[0] * b[2];
46 out[2] = a[0] * b[1] - a[1] * b[0];
50 float const b[3],
float const box_l[3],
51 int const periodic[3]) {
52 for (
int i = 0; i < 3; i++) {
55 res[i] -= floor(res[i] / box_l[i] + 0.5f) * box_l[i];
60 float const *dip1,
float const *dip2,
float *f1,
61 float *torque1,
float *torque2,
float box_l[3],
69 auto const r_sq_inv = 1.0f / r_sq;
70 auto const r_inv = rsqrtf(r_sq);
71 auto const r3_inv = 1.0f / r_sq * r_inv;
72 auto const r5_inv = r3_inv * r_sq_inv;
73 auto const r7_inv = r5_inv * r_sq_inv;
79 auto const pe4 = 3.0f * r5_inv;
82 auto const aa = pe4 * pe1;
83 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
84 auto const ab = aa + bb;
85 auto const cc = pe4 * pe3;
86 auto const dd = pe4 * pe2;
88 f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0]));
89 f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1]));
90 f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2]));
100 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
101 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
102 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
106 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
107 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
108 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
113 float const *dip1,
float const *dip2,
114 float box_l[3],
int periodic[3]) {
121 auto const r_sq_inv = 1.0f / r_sq;
122 auto const r_inv = rsqrtf(r_sq);
123 auto const r3_inv = 1.0f / r_sq * r_inv;
124 auto const r5_inv = r3_inv * r_sq_inv;
130 auto const pe4 = 3.0f * r5_inv;
133 return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
138 float *pos,
float *dip,
float *f,
139 float *torque,
float box_l[3],
142 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
149 float fi[3], fsum[3], tj[3];
152 float ti[3], tsum[3];
162 for (
unsigned int j = 0; j < 3; j++) {
169 for (
unsigned int j = i + 1; j < n; j++) {
170 dipole_ia_force(pf, pos + 3 * i, pos + 3 * j, dip + 3 * i, dip + 3 * j, fi,
171 ti, tj, box_l, periodic);
172 for (
unsigned int k = 0; k < 3; k++) {
174 atomicAdd(f + 3 * j + k, -fi[k]);
175 atomicAdd((torque + 3 * j + k), tj[k]);
182 for (
int j = 0; j < 3; j++) {
183 atomicAdd(f + 3 * i + j, fsum[j]);
184 atomicAdd(torque + 3 * i + j, tsum[j]);
190 auto const tid =
static_cast<int>(threadIdx.x);
191 for (
auto i =
static_cast<int>(blockDim.x); i > 1; i /= 2) {
194 input[tid] += input[i / 2 + tid];
195 if ((i % 2 == 1) && (tid == 0))
196 input[tid] += input[i - 1];
206 float *pos,
float *dip,
207 float box_l[3],
int periodic[3],
210 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
212 extern __shared__
float res[];
221 for (
unsigned int j = i + 1; j < n; j++) {
223 dip + 3 * j, box_l, periodic);
227 res[threadIdx.x] = sum;
229 res[threadIdx.x] = 0;
237 float const *box_l,
int const *periodic) {
238 auto const s_box = 3u *
sizeof(float);
239 auto const s_per = 3u *
sizeof(int);
240 cuda_safe_mem(cudaMalloc(
reinterpret_cast<void **
>(box_l_gpu), s_box));
241 cuda_safe_mem(cudaMalloc(
reinterpret_cast<void **
>(periodic_gpu), s_per));
242 cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice));
244 cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice));
248 float *dip,
float *f,
float *torque,
249 float box_l[3],
int periodic[3]) {
251 unsigned int const bs = 64;
271 torque, box_l_gpu, periodic_gpu);
273 cudaFree(periodic_gpu);
277 float *dip,
float box_l[3],
278 int periodic[3],
float *E) {
280 unsigned int const bs = 512;
300 cuda_safe_mem(cudaMalloc(&energySum,
sizeof(
float) * grid.x));
304 bs *
sizeof(
float), k, n, pos, dip, box_l_gpu, periodic_gpu,
310 thrust::device_ptr<float> t(energySum);
311 float x = thrust::reduce(t, t + grid.x);
312 cuda_safe_mem(cudaMemcpy(E, &x,
sizeof(
float), cudaMemcpyHostToDevice));
This file contains the defaults for ESPResSo.
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, float *torque2, float box_l[3], int periodic[3])
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *energySum)
__device__ void vector_product(float const *a, float const *b, float *out)
__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ void dds_sumReduction(float *input, float *sum)
static double * block(double *p, std::size_t index, std::size_t size)
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
#define KERNELCALL(_function, _grid, _block,...)