60 float const *
const dip1,
61 float const *
const dip2,
float *
const f1,
62 float *
const torque1,
float *
const torque2,
63 [[maybe_unused]]
float *
const dip_fld1,
64 [[maybe_unused]]
float *
const dip_fld2) {
68 f1[0] = f1[1] = f1[2] = 0.0f;
69 torque1[0] = torque1[1] = torque1[2] = 0.0f;
70 torque2[0] = torque2[1] = torque2[2] = 0.0f;
71#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
72 dip_fld1[0] = dip_fld1[1] = dip_fld1[2] = 0.0f;
73 dip_fld2[0] = dip_fld2[1] = dip_fld2[2] = 0.0f;
77 auto const r_sq_inv = 1.0f / r_sq;
78 auto const r_inv = rsqrtf(r_sq);
79 auto const r3_inv = 1.0f / r_sq * r_inv;
80 auto const r5_inv = r3_inv * r_sq_inv;
81 auto const r7_inv = r5_inv * r_sq_inv;
87 auto const pe4 = 3.0f * r5_inv;
89#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
90 auto const rep1 = pe3 * pe4;
91 auto const rep2 = pe2 * pe4;
92 dip_fld1[0] = pf * (rep1 * dr[0] - dip2[0] * r3_inv);
93 dip_fld1[1] = pf * (rep1 * dr[1] - dip2[1] * r3_inv);
94 dip_fld1[2] = pf * (rep1 * dr[2] - dip2[2] * r3_inv);
96 dip_fld2[0] = pf * (rep2 * dr[0] - dip1[0] * r3_inv);
97 dip_fld2[1] = pf * (rep2 * dr[1] - dip1[1] * r3_inv);
98 dip_fld2[2] = pf * (rep2 * dr[2] - dip1[2] * r3_inv);
102 auto const aa = pe4 * pe1;
103 auto const bb = -15.0f * pe2 * pe3 * r7_inv;
104 auto const ab = aa + bb;
105 auto const cc = pe4 * pe3;
106 auto const dd = pe4 * pe2;
108 f1[0] = (pf * (ab * dr[0] + cc * dip1[0] + dd * dip2[0]));
109 f1[1] = (pf * (ab * dr[1] + cc * dip1[1] + dd * dip2[1]));
110 f1[2] = (pf * (ab * dr[2] + cc * dip1[2] + dd * dip2[2]));
119 torque1[0] = pf * (-a[0] * r3_inv + b[0] * cc);
120 torque1[1] = pf * (-a[1] * r3_inv + b[1] * cc);
121 torque1[2] = pf * (-a[2] * r3_inv + b[2] * cc);
125 torque2[0] = pf * (a[0] * r3_inv + b[0] * dd);
126 torque2[1] = pf * (a[1] * r3_inv + b[1] * dd);
127 torque2[2] = pf * (a[2] * r3_inv + b[2] * dd);
157 float const *
const pos,
float const *
const dip,
158 [[maybe_unused]]
float *dip_fld,
float *
const f,
159 float *
const torque,
float const box_l[3],
160 int const periodic[3],
int const n_replicas) {
161 unsigned int const i = blockIdx.x * blockDim.x + threadIdx.x;
166 auto const xi{pos[3u * i + 0u]}, yi{pos[3u * i + 1u]}, zi{pos[3u * i + 2u]};
168 auto const *
const mi = dip + 3u * i;
171 float fsum[3] = {0.0f, 0.0f, 0.0f};
172 float tsum[3] = {0.0f, 0.0f, 0.0f};
173#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
174 float dfsum[3] = {0.0f, 0.0f, 0.0f};
177 float fi[3], ti1[3], ti2[3];
178#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
179 float dfi[3], dfj[3];
181 float *
const dfi{
nullptr}, *
const dfj{
nullptr};
185 if (n_replicas == 0) {
186 for (
unsigned int j = i + 1u; j < n; ++j) {
194 for (
unsigned int k = 0u; k < 3u; ++k) {
195 atomicAdd(f + 3u * j + k, -fi[k]);
196 atomicAdd(torque + 3u * j + k, ti2[k]);
197#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
198 atomicAdd(dip_fld + 3u * j + k, dfj[k]);
202 for (
unsigned int k = 0u; k < 3u; ++k) {
205#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
211 for (
unsigned int k = 0u; k < 3u; ++k) {
212 atomicAdd(f + 3u * i + k, fsum[k]);
213 atomicAdd(torque + 3u * i + k, tsum[k]);
214#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
215 atomicAdd(dip_fld + 3u * i + k, dfsum[k]);
221 int const nx = periodic[0] * n_replicas;
222 int const ny = periodic[1] * n_replicas;
223 int const nz = periodic[2] * n_replicas;
226 for (
int dx = -nx; dx <= nx; ++dx) {
227 for (
int dy = -ny; dy <= ny; ++dy) {
228 for (
int dz = -nz; dz <= nz; ++dz) {
229 if (dx == 0 && dy == 0 && dz == 0)
231 float dr[3] = {
static_cast<float>(dx) * box_l[0],
232 static_cast<float>(dy) * box_l[1],
233 static_cast<float>(dz) * box_l[2]};
235 for (
unsigned int k = 0u; k < 3u; ++k) {
238#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
247 for (
unsigned int j = i + 1u; j < n; ++j) {
248 auto const xj{pos[3u * j + 0u]}, yj{pos[3u * j + 1u]}, zj{pos[3u * j + 2u]};
249 auto const *mj = dip + 3u * j;
251 for (
int dx = -nx; dx <= nx; ++dx) {
252 for (
int dy = -ny; dy <= ny; ++dy) {
253 for (
int dz = -nz; dz <= nz; ++dz) {
254 float dr[3] = {(xi - xj) +
static_cast<float>(dx) * box_l[0],
255 (yi - yj) +
static_cast<float>(dy) * box_l[1],
256 (zi - zj) +
static_cast<float>(dz) * box_l[2]};
260 for (
unsigned int k = 0u; k < 3u; ++k) {
261 atomicAdd(f + 3u * j + k, -fi[k]);
262 atomicAdd(torque + 3u * j + k, ti2[k]);
263#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
264 atomicAdd(dip_fld + 3u * j + k, dfj[k]);
268#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
279 for (
unsigned int k = 0u; k < 3u; ++k) {
280 atomicAdd(f + 3u * i + k, fsum[k]);
281 atomicAdd(torque + 3u * i + k, tsum[k]);
282#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
283 atomicAdd(dip_fld + 3u * i + k, dfsum[k]);
380 float const *
const pos,
381 float const *
const dip,
382 float box_l[3],
int periodic[3],
385 unsigned int const bs = 512;
405 cuda_safe_mem(cudaMalloc(&energySum,
sizeof(
float) * grid.x));
409 bs *
sizeof(
float), k, n, pos, dip, box_l_gpu, periodic_gpu,
415 thrust::device_ptr<float> t(energySum);
416 float x = thrust::reduce(t, t + grid.x);
417 cuda_safe_mem(cudaMemcpy(E, &x,
sizeof(
float), cudaMemcpyHostToDevice));
__device__ void dipole_ia_force(float const pf, float const dr[3], float const *const dip1, float const *const dip2, float *const f1, float *const torque1, float *const torque2, float *const dip_fld1, float *const dip_fld2)
__global__ void DipolarDirectSum_kernel_force(float const pf, unsigned int n, float const *const pos, float const *const dip, float *dip_fld, float *const f, float *const torque, float const box_l[3], int const periodic[3], int const n_replicas)