22#ifdef DIPOLAR_DIRECT_SUM
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
38 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
43 out[0] = a[1] * b[2] - a[2] * b[1];
44 out[1] = a[2] * b[0] - a[0] * b[2];
45 out[2] = a[0] * b[1] - a[1] * b[0];
49 float const b[3],
float const box_l[3],
50 int const periodic[3]) {
51 for (
int i = 0; i < 3; i++) {
54 res[i] -= floor(
res[i] / box_l[i] + 0.5f) * box_l[i];
59 float const *dip1,
float const *dip2,
float *f1,
60 float *torque1,
float *torque2,
float box_l[3],
68 auto const r_sq_inv = 1.0f / r_sq;
69 auto const r_inv = rsqrtf(r_sq);
70 auto const r3_inv = 1.0f / r_sq * r_inv;
71 auto const r5_inv = r3_inv * r_sq_inv;
72 auto const r7_inv = r5_inv * r_sq_inv;
78 auto const pe4 = 3.0f * r5_inv;
81 auto const aa = pe4 * pe1;
82 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
83 auto const ab = aa + bb;
84 auto const cc = pe4 * pe3;
85 auto const dd = pe4 * pe2;
87 f1[0] = (pf * (ab *
dr[0] + cc * dip1[0] + dd * dip2[0]));
88 f1[1] = (pf * (ab *
dr[1] + cc * dip1[1] + dd * dip2[1]));
89 f1[2] = (pf * (ab *
dr[2] + cc * dip1[2] + dd * dip2[2]));
99 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
100 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
101 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
105 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
106 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
107 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
112 float const *dip1,
float const *dip2,
113 float box_l[3],
int periodic[3]) {
120 auto const r_sq_inv = 1.0f / r_sq;
121 auto const r_inv = rsqrtf(r_sq);
122 auto const r3_inv = 1.0f / r_sq * r_inv;
123 auto const r5_inv = r3_inv * r_sq_inv;
129 auto const pe4 = 3.0f * r5_inv;
132 return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
136 float *
pos,
float *dip,
float *
f,
137 float *
torque,
float box_l[3],
140 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
147 float fi[3], fsum[3], tj[3];
150 float ti[3], tsum[3];
160 for (
unsigned int j = 0; j < 3; j++) {
167 for (
unsigned int j = i + 1; j < n; j++) {
169 ti, tj, box_l, periodic);
170 for (
unsigned int k = 0; k < 3; k++) {
172 atomicAdd(
f + 3 * j + k, -fi[k]);
173 atomicAdd((
torque + 3 * j + k), tj[k]);
180 for (
int j = 0; j < 3; j++) {
181 atomicAdd(
f + 3 * i + j, fsum[j]);
182 atomicAdd(
torque + 3 * i + j, tsum[j]);
187 auto const tid =
static_cast<int>(threadIdx.x);
188 for (
auto i =
static_cast<int>(blockDim.x); i > 1; i /= 2) {
191 input[tid] += input[i / 2 + tid];
192 if ((i % 2 == 1) && (tid == 0))
193 input[tid] += input[i - 1];
202 float *
pos,
float *dip,
203 float box_l[3],
int periodic[3],
206 auto const i = blockIdx.x * blockDim.x + threadIdx.x;
208 extern __shared__
float res[];
217 for (
unsigned int j = i + 1; j < n; j++) {
219 dip + 3 * j, box_l, periodic);
225 res[threadIdx.x] = 0;
233 float const *box_l,
int const *periodic) {
234 auto const s_box = 3u *
sizeof(float);
235 auto const s_per = 3u *
sizeof(int);
236 cuda_safe_mem(cudaMalloc(
reinterpret_cast<void **
>(box_l_gpu), s_box));
237 cuda_safe_mem(cudaMalloc(
reinterpret_cast<void **
>(periodic_gpu), s_per));
238 cuda_safe_mem(cudaMemcpy(*box_l_gpu, box_l, s_box, cudaMemcpyHostToDevice));
240 cudaMemcpy(*periodic_gpu, periodic, s_per, cudaMemcpyHostToDevice));
244 float *dip,
float *
f,
float *
torque,
245 float box_l[3],
int periodic[3]) {
247 unsigned int const bs = 64;
267 torque, box_l_gpu, periodic_gpu);
269 cudaFree(periodic_gpu);
273 float *dip,
float box_l[3],
274 int periodic[3],
float *E) {
276 unsigned int const bs = 512;
300 bs *
sizeof(
float), k, n,
pos, dip, box_l_gpu, periodic_gpu,
307 float x = thrust::reduce(t, t + grid.x);
308 cuda_safe_mem(cudaMemcpy(E, &x,
sizeof(
float), cudaMemcpyHostToDevice));
__global__ float float * torque
__shared__ int pos[MAXDEPTH *THREADS5/WARPSIZE]
__global__ float * energySum
This file contains the defaults for ESPResSo.
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__device__ void dipole_ia_force(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float *f1, float *torque1, float *torque2, float box_l[3], int periodic[3])
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float *pos, float *dip, float box_l[3], int periodic[3], float *energySum)
__device__ void vector_product(float const *a, float const *b, float *out)
__global__ void DipolarDirectSum_kernel_force(float pf, unsigned int n, float *pos, float *dip, float *f, float *torque, float box_l[3], int periodic[3])
__device__ void dds_sumReduction(float *input, float *sum)
static double * block(double *p, std::size_t index, std::size_t size)
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
#define KERNELCALL(_function, _grid, _block,...)