22#if defined(ESPRESSO_DIPOLES) and defined(ESPRESSO_CUDA)
28#include <thrust/device_ptr.h>
29#include <thrust/reduce.h>
33#if defined(OMPI_MPI_H) || defined(_MPI_H)
34#error CU-file includes mpi.h! This should not happen!
39 return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
44 out[0] = a[1] * b[2] - a[2] * b[1];
45 out[1] = a[2] * b[0] - a[0] * b[2];
46 out[2] = a[0] * b[1] - a[1] * b[0];
50 float const b[3],
float const box_l[3],
51 int const periodic[3]) {
52 for (
int i = 0; i < 3; i++) {
60 float const *
const dip1,
61 float const *
const dip2,
float *
const f1,
68 f1[0] =
f1[1] =
f1[2] = 0.0f;
71#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
89#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
131 float const *
dip1,
float const *
dip2,
132 float box_l[3],
int periodic[3]) {
157 float const *
const pos,
float const *
const dip,
159 float *
const torque,
float const box_l[3],
160 int const periodic[3],
int const n_replicas) {
166 auto const xi{pos[3u * i + 0
u]},
yi{pos[3u * i + 1u]},
zi{pos[3u * i + 2u]};
168 auto const *
const mi = dip + 3u * i;
171 float fsum[3] = {0.0f, 0.0f, 0.0f};
172 float tsum[3] = {0.0f, 0.0f, 0.0f};
173#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
174 float dfsum[3] = {0.0f, 0.0f, 0.0f};
178#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
181 float *
const dfi{
nullptr}, *
const dfj{
nullptr};
185 if (n_replicas == 0) {
186 for (
unsigned int j = i + 1u;
j < n; ++
j) {
194 for (
unsigned int k = 0
u; k < 3u; ++k) {
197#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
202 for (
unsigned int k = 0
u; k < 3u; ++k) {
205#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
211 for (
unsigned int k = 0
u; k < 3u; ++k) {
214#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
221 int const nx = periodic[0] * n_replicas;
222 int const ny = periodic[1] * n_replicas;
223 int const nz = periodic[2] * n_replicas;
229 if (
dx == 0 &&
dy == 0 &&
dz == 0)
231 float dr[3] = {
static_cast<float>(
dx) *
box_l[0],
232 static_cast<float>(
dy) *
box_l[1],
233 static_cast<float>(
dz) *
box_l[2]};
235 for (
unsigned int k = 0
u; k < 3u; ++k) {
238#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
247 for (
unsigned int j = i + 1u;
j < n; ++
j) {
248 auto const xj{pos[3u *
j + 0
u]},
yj{pos[3u *
j + 1u]},
zj{pos[3u *
j + 2u]};
249 auto const *
mj = dip + 3u *
j;
254 float dr[3] = {(
xi -
xj) +
static_cast<float>(
dx) *
box_l[0],
260 for (
unsigned int k = 0
u; k < 3u; ++k) {
263#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
268#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
279 for (
unsigned int k = 0
u; k < 3u; ++k) {
282#ifdef ESPRESSO_DIPOLE_FIELD_TRACKING
290 for (
auto i =
static_cast<int>(
blockDim.x); i > 1; i /= 2) {
294 if ((i % 2 == 1) && (
tid == 0))
305 float const *
const pos,
306 float const *
const dip,
307 float box_l[3],
int periodic[3],
321 for (
unsigned int j = i + 1;
j < n;
j++) {
323 dip + 3 *
j,
box_l, periodic);
337 float const *
box_l,
int const *periodic) {
339 auto const s_per = 3u *
sizeof(
int);
348 float const *
const pos,
349 float const *
const dip,
350 float *dip_fld,
float *f,
351 float *torque,
float box_l[3],
352 int periodic[3],
int n_replicas) {
354 unsigned int const bs = 64;
380 float const *
const pos,
381 float const *
const dip,
382 float box_l[3],
int periodic[3],
385 unsigned int const bs = 512;
416 float x = thrust::reduce(t, t +
grid.x);
cudaStream_t stream[1]
CUDA streams for parallel computing on CPU and GPU.
void copy_box_data(float **box_l_gpu, int **periodic_gpu, float const *box_l, int const *periodic)
__device__ void dipole_ia_force(float const pf, float const dr[3], float const *const dip1, float const *const dip2, float *const f1, float *const torque1, float *const torque2, float *const dip_fld1, float *const dip_fld2)
void DipolarDirectSum_kernel_wrapper_energy(float k, unsigned int n, float const *const pos, float const *const dip, float box_l[3], int periodic[3], float *E)
void DipolarDirectSum_kernel_wrapper_force(float k, unsigned int n, float const *const pos, float const *const dip, float *dip_fld, float *f, float *torque, float box_l[3], int periodic[3], int n_replicas)
__device__ float scalar_product(float const *a, float const *b)
__device__ float dipole_ia_energy(float pf, float const *r1, float const *r2, float const *dip1, float const *dip2, float box_l[3], int periodic[3])
__global__ void DipolarDirectSum_kernel_energy(float pf, unsigned int n, float const *const pos, float const *const dip, float box_l[3], int periodic[3], float *energySum)
__device__ void get_mi_vector_dds(float res[3], float const a[3], float const b[3], float const box_l[3], int const periodic[3])
__device__ void vector_product(float const *a, float const *b, float *out)
__device__ void dds_sumReduction(float *input, float *sum)
__global__ void DipolarDirectSum_kernel_force(float const pf, unsigned int n, float const *const pos, float const *const dip, float *dip_fld, float *const f, float *const torque, float const box_l[3], int const periodic[3], int const n_replicas)
static double * block(double *p, std::size_t index, std::size_t size)
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
#define KERNELCALL(_function, _grid, _block,...)