#ifdef DIPOLAR_BARNES_HUT

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
// dds_sumReduction_BH() (excerpt): block-level sum reduction in shared memory
auto tid = static_cast<int>(threadIdx.x);
for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
  // ...
  input[tid] += input[i / 2 + tid];
  if ((i % 2 == 1) && (tid == 0))
    input[tid] += input[i - 1];
}
if (threadIdx.x == 0) {
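The same halving scheme works for any block size, including odd ones. A minimal, hypothetical stand-alone kernel using this reduction pattern (the name blockSumSketch and its launch parameters are illustrative, not part of the file):

// Hedged sketch of the reduction pattern used by dds_sumReduction_BH.
__global__ void blockSumSketch(float const *in, float *out) {
  extern __shared__ float buf[];                 // one float per thread
  auto tid = static_cast<int>(threadIdx.x);
  buf[tid] = in[blockIdx.x * blockDim.x + tid];
  for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
    __syncthreads();
    if (tid < i / 2)
      buf[tid] += buf[i / 2 + tid];              // fold the upper half onto the lower half
    if ((i % 2 == 1) && (tid == 0))
      buf[tid] += buf[i - 1];                    // pick up the odd leftover element
  }
  __syncthreads();
  if (tid == 0)
    out[blockIdx.x] = buf[0];                    // one partial sum per block
}
// launch: blockSumSketch<<<n_blocks, n_threads, n_threads * sizeof(float)>>>(in, out);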
// initializationKernel() (excerpt)
auto const ind = blockDim.x * blockIdx.x + threadIdx.x;
// boundingBoxKernel() (excerpt): per-block min/max reduction of the particle
// positions; the last block combines all partial boxes into the root cell
float minp[3], maxp[3];
for (int l = 0; l < 3; l++) {
  minp[l] = maxp[l] = bhpara->r[l];
}
auto i = static_cast<int>(threadIdx.x);
int const inc = THREADS1 * static_cast<int>(gridDim.x);
for (auto j = i + static_cast<int>(blockIdx.x) * THREADS1; j < bhpara->nbodies;
     j += inc) {
  for (int l = 0; l < 3; l++) {
    auto const val = bhpara->r[3 * j + l];
    minp[l] = min(minp[l], val);
    maxp[l] = max(maxp[l], val);
  }
}
for (int l = 0; l < 3; l++) {
  smin[3 * i + l] = minp[l];
  smax[3 * i + l] = maxp[l];
}
for (int t = THREADS1 / 2; t > 0; t /= 2) {
  // ...
  for (int l = 0; l < 3; l++) {
    smin[3 * i + l] = minp[l] = min(minp[l], smin[3 * k + l]);
    smax[3 * i + l] = maxp[l] = max(maxp[l], smax[3 * k + l]);
  }
}
auto k = static_cast<int>(blockIdx.x);
for (int l = 0; l < 3; l++) {
  // ... (publish this block's partial minp/maxp)
}
auto const n_blocks = static_cast<int>(gridDim.x) - 1;
if (n_blocks == atomicInc((unsigned int *)&blkcntd, n_blocks)) {
  for (int im = 0; im <= n_blocks; im++)
    for (int l = 0; l < 3; l++) {
      minp[l] = min(minp[l], bhpara->minp[3 * im + l]);
      maxp[l] = max(maxp[l], bhpara->maxp[3 * im + l]);
    }
  auto const val = max(maxp[0] - minp[0], maxp[1] - minp[1]);
  radiusd = max(val, maxp[2] - minp[2]) * 0.5f;
  // ...
  for (int l = 0; l < 3; l++)
    bhpara->r[3 * k + l] = (minp[l] + maxp[l]) * 0.5f;
  // ...
  for (i = 0; i < 8; i++)
    // ... (initialize the root cell's 8 child slots)
}
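The guard around blkcntd is the standard "last block finishes up" idiom: every block publishes a partial result, then only the final block to pass the atomic counter combines them. A reduced, hypothetical illustration (blocks_done and lastBlockReduces are made-up names, not from the file):

__device__ unsigned int blocks_done = 0;   // plays the role of blkcntd

__global__ void lastBlockReduces(float const *partial, float *result) {
  // ... each block is assumed to have already written partial[blockIdx.x] ...
  __threadfence();                         // make this block's partial value visible
  if (threadIdx.x == 0) {
    unsigned int const last = gridDim.x - 1;
    if (atomicInc(&blocks_done, last) == last) {
      float acc = 0.0f;                    // only the last arriving block gets here
      for (unsigned int b = 0; b < gridDim.x; b++)
        acc += partial[b];
      *result = acc;                       // blocks_done has wrapped back to 0
    }
  }
}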
// treeBuildingKernel() (excerpt): insert each particle into the octree held in
// the linear child array, creating new cells via atomicCAS/atomicSub as needed
for (int l = 0; l < 3; l++)
  // ...
int localmaxdepth = 1;
auto const inc = static_cast<int>(blockDim.x * gridDim.x);
auto i = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
while (i < bhpara->nbodies) {
  for (int l = 0; l < 3; l++)
    // ...
  // ... (octant of the root cell)
  for (int l = 0; l < 3; l++)
    // ...
    j += static_cast<int>(pow(2, l));
  // ... (descend until an unoccupied child slot is found)
  for (int l = 0; l < 3; l++)
    if (bhpara->r[3 * n + l] < p[l])
      j += static_cast<int>(pow(2, l));
  int const locked = n * 8 + j;
  if (ch == atomicCAS((int *)&bhpara->child[locked], ch, -2)) {
    // ... (slot acquired: create a new cell below the bottom of the node range)
    int const cell = atomicSub((int *)&bottomd, 1) - 1;
    if (cell <= bhpara->nbodies) {
      // ... (tree overflow)
    }
    patch = max(patch, cell);
    for (int l = 0; l < 3; l++)
      pos[l] = static_cast<float>((j >> l) & 1) * r;
    // ...
    for (int l = 0; l < 3; l++)
      // ...
    for (int k = 0; k < 8; k++)
      // ...
    for (int l = 0; l < 3; l++)
      // ...
      j += static_cast<int>(pow(2, l));
    // ...
    for (int l = 0; l < 3; l++)
      // ...
      j += static_cast<int>(pow(2, l));
  }
  localmaxdepth = max(depth, localmaxdepth);
}
atomicMax((int *)&maxdepthd, localmaxdepth);
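The repeated j += static_cast<int>(pow(2, l)) lines build a 3-bit octant index: each axis contributes one bit, set whenever the particle lies above the cell centre on that axis. A hedged helper making that logic explicit (octantIndex is an illustrative name; the kernel inlines this):

__device__ int octantIndex(float const cell_center[3], float const p[3]) {
  int j = 0;
  for (int l = 0; l < 3; l++)
    if (cell_center[l] < p[l])
      j += 1 << l;               // equivalent to static_cast<int>(pow(2, l))
  return j;                      // 0..7: which of the 8 children to descend into
}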
// summarizationKernel() (excerpt): walk the cells bottom-up and accumulate node
// weights, centres and dipole sums; children that are not yet ready are cached
// in shared memory and revisited
for (i = 0; i < 8; i++)
  child[i * THREADS3 + threadIdx.x] = -1;
auto const inc = static_cast<int>(blockDim.x * gridDim.x);
auto k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
while (k <= bhpara->nnodes) {
  for (int l = 0; l < 3; l++) {
    // ...
  }
  for (i = 0; i < 8; i++) {
    // ... (child not ready yet: remember it for a later pass)
    child[missing * THREADS3 + threadIdx.x] = ch;
    // ...
    for (int l = 0; l < 3; l++) {
      p[l] += bhpara->r[3 * ch + l] * m;
    }
  }
  missing_max = missing;
  for (int im = 0; im < missing_max; im++) {
    auto const ch = child[im * THREADS3 + threadIdx.x];
    // ... (retry the cached child; once ready, drop it from the cache)
    child[im * THREADS3 + threadIdx.x] = -1;
    // ...
    for (int l = 0; l < 3; l++) {
      p[l] += bhpara->r[3 * ch + l] * m;
    }
  }
  // ...
  for (int l = 0; l < 3; l++) {
    bhpara->r[3 * k + l] = p[l] * m;
  }
}
// ...
k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
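The two accumulation loops implement a weighted average: every ready child contributes its weight-scaled centre, and the parent cell is finally normalised by the total weight. A hedged sketch of that arithmetic with made-up names (CellAccum, accumulateChild, finishCell), assuming m is the child weight during accumulation and the inverse total weight at the end:

struct CellAccum {
  float cm;      // accumulated node weight ("mass")
  float p[3];    // weight-scaled centre of mass
};

__device__ void accumulateChild(CellAccum &acc, float const *r, int ch, float child_mass) {
  acc.cm += child_mass;
  for (int l = 0; l < 3; l++)
    acc.p[l] += r[3 * ch + l] * child_mass;   // as in p[l] += bhpara->r[3 * ch + l] * m
}

__device__ void finishCell(CellAccum const &acc, float *r, int k) {
  float const m = 1.0f / acc.cm;
  for (int l = 0; l < 3; l++)
    r[3 * k + l] = acc.p[l] * m;              // as in bhpara->r[3 * k + l] = p[l] * m
}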
// sortKernel() (excerpt): walk the cells top-down; each cell hands its children
// a contiguous slice of the sorted particle index array
auto const dec = static_cast<int>(blockDim.x * gridDim.x);
auto k = /* ... */ static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
while (k >= bottom) {
  // ...
  for (int i = 0; i < 8; i++) {
    // ... (if the child is a cell, pass it a start offset)
    } else if (ch >= 0) {
      // ... (the child is a particle: store its index)
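What the excerpt omits is the bookkeeping: an internal cell gives every child cell a contiguous slice of the output, sized by that child's particle count, while leaf children write their particle index directly. A hedged reconstruction of that idea (placeChildren is a made-up helper; indices >= nbodies denote cells, smaller non-negative ones particles, as in the traversal kernels below):

__device__ void placeChildren(int const *child, int const *count, int *start,
                              int *sort, int cell, int nbodies) {
  int offset = start[cell];
  for (int i = 0; i < 8; i++) {
    int const ch = child[cell * 8 + i];
    if (ch >= nbodies) {              // child is a cell: give it its own slice
      start[ch] = offset;
      offset += count[ch];
    } else if (ch >= 0) {             // child is a particle: record it directly
      sort[offset] = ch;
      offset++;
    }                                 // ch < 0: empty slot, skip
  }
}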
// forceCalculationKernel() (excerpt): warp-synchronous tree traversal; every
// particle accumulates the dipolar field h, the force f and the torque N
if (threadIdx.x == 0) {
  // ... (squared acceptance radii per depth; each level is a quarter of the previous)
  dq[i] = dq[i - 1] * 0.25f;
}
auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
// ...
auto const diff = static_cast<int>(threadIdx.x) - sbase;
// ...
dq[diff + j] = dq[diff];
for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
     k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
  for (int l = 0; l < 3; l++) {
    // ... (load the particle position and dipole)
  }
  if (sbase == threadIdx.x) {
    // ... (first thread of the warp initializes the traversal stack)
  }
  while ((t = pos[depth]) < 8) {
    // ...
    if (sbase == threadIdx.x) {
      // ...
    }
    __threadfence_block();
    for (int l = 0; l < 3; l++) {
      // ...
      tmp += dr[l] * dr[l];
    }
    if ((n < bhpara->nbodies) ||
        __all_sync(__activemask(), tmp >= dq[depth])) {
      auto const d1 = sqrtf(tmp);
      auto const dd5 = __fdividef(1.0f, tmp * tmp * d1);
      for (int l = 0; l < 3; l++) {
        // ...
        umd5 += u[l] * uc[l];
      }
      auto const bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp);
      for (int l = 0; l < 3; l++) {
        h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
        f[l] += -dr[l] * (umd5 + bb2d7) +
                3.0f * (b * u[l] + b2 * uc[l]) * dd5;
      }
    }
    // ... (otherwise descend into the cell)
    if (sbase == threadIdx.x) {
      // ...
    }
    __threadfence_block();
    // ...
    depth = max(j, depth - 1);
  }
  N[0] = u[1] * h[2] - u[2] * h[1];
  N[1] = u[2] * h[0] - u[0] * h[2];
  N[2] = u[0] * h[1] - u[1] * h[0];
  for (int l = 0; l < 3; l++) {
    if (f[l] != f[l] || h[l] != h[l]) { // NaN check
      printf("Force Kernel: NAN in particle[%d]\n", i);
      printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0], /* ... */);
      printf("fx = %f, fy = %f, fz = %f,\n", f[0], f[1], f[2]);
      printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
    }
    atomicAdd(force + 3 * i + l, f[l] * pf);
    atomicAdd(torque + 3 * i + l, N[l] * pf);
  }
}
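The arithmetic visible above matches the standard point-dipole pair expressions. Assuming (as the elided lines suggest) that b = u_n·dr, b2 = u_i·dr and umd5 ends up as -3(u_i·u_n)/r^5, with tmp = r^2 and dd5 = 1/r^5, the accumulated field, torque and force correspond, up to the prefactor pf applied at the atomicAdd stage, to

\[
\mathbf{H}_i \mathrel{+}= \frac{3\,(\mathbf{u}_n\cdot\mathbf{r})\,\mathbf{r} - r^2\,\mathbf{u}_n}{r^5},
\qquad
\mathbf{N}_i = \mathbf{u}_i \times \mathbf{H}_i,
\]
\[
\mathbf{F}_i \mathrel{+}= \frac{3\,\big[(\mathbf{u}_i\cdot\mathbf{u}_n)\,\mathbf{r} + (\mathbf{u}_i\cdot\mathbf{r})\,\mathbf{u}_n + (\mathbf{u}_n\cdot\mathbf{r})\,\mathbf{u}_i\big]}{r^5}
 - \frac{15\,(\mathbf{u}_i\cdot\mathbf{r})(\mathbf{u}_n\cdot\mathbf{r})}{r^7}\,\mathbf{r},
\]

where r is the particle-node separation vector dr, u_i the particle dipole and u_n the dipole of the tree node (or of a single distant particle).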
// energyCalculationKernel() (excerpt): same traversal as the force kernel, but
// only the dipolar field h is accumulated; per-block results go into res[]
float dr[3], h[3], u[3], uc[3];
extern __shared__ float res[];
if (threadIdx.x == 0) {
  // ...
  dq[i] = dq[i - 1] * 0.25f;
}
auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
// ...
auto const diff = static_cast<int>(threadIdx.x) - sbase;
// ...
dq[diff + j] = dq[diff];
for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
     k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
  for (int l = 0; l < 3; l++) {
    // ...
  }
  if (sbase == threadIdx.x) {
    // ...
  }
  while ((t = pos[depth]) < 8) {
    // ...
    if (sbase == threadIdx.x) {
      // ...
    }
    __threadfence_block();
    for (int l = 0; l < 3; l++) {
      // ...
      tmp += dr[l] * dr[l];
    }
    if ((n < bhpara->nbodies) ||
        __all_sync(__activemask(), tmp >= dq[depth])) {
      auto const d1 = sqrtf(tmp);
      auto const dd5 = __fdividef(1.0f, tmp * tmp * d1);
      for (int l = 0; l < 3; l++) {
        // ...
      }
      for (int l = 0; l < 3; l++)
        h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
    }
    // ...
    if (sbase == threadIdx.x) {
      // ...
    }
    __threadfence_block();
    // ...
    depth = max(j, depth - 1);
  }
  for (int l = 0; l < 3; l++) {
    // ... (NaN check on the accumulated field)
    printf("Energy Kernel: NAN in particle[%d]\n", i);
    printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0], /* ... */);
    printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
  }
}
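Up to the prefactor and the pair-counting convention handled by the caller, each particle's energy contribution therefore follows the usual dipole-field coupling,

\[
U_i = -\,\mathbf{u}_i \cdot \mathbf{H}_i,
\qquad
\mathbf{H}_i = \sum_{\text{nodes } n} \frac{3\,(\mathbf{u}_n\cdot\mathbf{r}_{in})\,\mathbf{r}_{in} - r_{in}^2\,\mathbf{u}_n}{r_{in}^5},
\]

with the per-block partial sums in res[]/energySum reduced on the host in energyBH() (see the thrust::reduce call below).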
// initBHgpu() (excerpt): trivial launch geometry plus per-kernel cache preferences
dim3 block(1, 1, 1);
cudaFuncSetCacheConfig(boundingBoxKernel, cudaFuncCachePreferShared);
cudaFuncSetCacheConfig(treeBuildingKernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(summarizationKernel, cudaFuncCachePreferShared);
cudaFuncSetCacheConfig(sortKernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(forceCalculationKernel, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(energyCalculationKernel, cudaFuncCachePreferL1);

// buildBoxBH() (excerpt): synchronize before launching the bounding-box kernel
dim3 block(1, 1, 1);
cudaDeviceSynchronize();

// buildTreeBH(), summarizeBH(), sortBH() and forceBH() (excerpts) each start
// from the same trivial launch geometry:
dim3 block(1, 1, 1);
// forceBH() (excerpt): copy the kernel error flag back to the host and check it
// ...
    cudaMemcpyDeviceToHost));
// ...
  throw std::runtime_error("force kernel encountered a functional error");

// energyBH() (excerpt): reduce the per-block partial energies on the device and
// copy the result into the device output buffer E
dim3 block(1, 1, 1);
// ...
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
// ...
    cudaMemcpyDeviceToHost));
// ...
  throw std::runtime_error("force kernel encountered a functional error");
// setBHPrecision() (excerpt): the two precision parameters are copied into the
// __constant__ device globals (epssqd, itolsqd)
// ...
    cudaMemcpyHostToDevice));
// ...
    cudaMemcpyHostToDevice));
// allocBHmemCopy() (excerpt): reset the device pointers, size the node arrays
// and (re)allocate the device buffers
bh_data->err = nullptr;
bh_data->child = nullptr;
bh_data->count = nullptr;
bh_data->start = nullptr;
bh_data->sort = nullptr;
bh_data->mass = nullptr;
bh_data->maxp = nullptr;
bh_data->minp = nullptr;
bh_data->r = nullptr;
bh_data->u = nullptr;
// ...
if (bh_data->nbodies == nbodies)
  // ...
bh_data->blocks = dev.n_cores;
// ...
int const n_total_threads = 1024 * bh_data->blocks;
if (bh_data->nnodes < n_total_threads)
  bh_data->nnodes = n_total_threads;
// ...
bh_data->nnodes = (bh_data->nnodes / n_total_threads) * n_total_threads;
// ... (device allocations; only the size expressions survive in this excerpt,
// the array with 8 entries per node being the child array)
    sizeof(int) * (bh_data->nnodes + 1) * 8));
// ...
    sizeof(int) * (bh_data->nnodes + 1)));
// ...
    sizeof(int) * (bh_data->nnodes + 1)));
// ...
    sizeof(int) * (bh_data->nnodes + 1)));
// ...
    sizeof(float) * (bh_data->nnodes + 1)));
auto *mass_tmp = new float[bh_data->nbodies];
for (int i = 0; i < bh_data->nbodies; i++) {
  // ...
}
// ... (copy mass_tmp to the device)
    sizeof(float) * bh_data->nbodies,
    cudaMemcpyHostToDevice));
// ...
cuda_safe_mem(cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float)));
cuda_safe_mem(cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float)));

// fill_bh_data() (excerpt): copy the Barnes-Hut parameter block and the particle
// positions/dipoles into the device buffers
auto const size = 3 * bh_data->nbodies * sizeof(float);
// ...
    cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(bh_data->r, r, size, cudaMemcpyDeviceToDevice));
cuda_safe_mem(cudaMemcpy(bh_data->u, dip, size, cudaMemcpyDeviceToDevice));
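Taken together, the host entry points documented below are meant to be called in tree-construction order. A hedged sketch of one evaluation pass (the wrapper function and its argument values are illustrative; the real call sites live elsewhere in the code base):

void barnesHutForcesSketch(BHData *bh, int n_part, float const *pos_device,
                           float const *dip_device, float *force, float *torque,
                           float prefactor) {
  allocBHmemCopy(n_part, bh);               // (re)allocate device buffers if the size changed
  fill_bh_data(pos_device, dip_device, bh); // copy particle data and the bhpara block
  initBHgpu(bh->blocks);                    // cache preferences and initialization kernel
  buildBoxBH(bh->blocks);                   // bounding cube of all particles
  buildTreeBH(bh->blocks);                  // octree in the linear child array
  summarizeBH(bh->blocks);                  // cell weights, centres and dipole sums
  sortBH(bh->blocks);                       // particle order for warp-coherent traversal
  forceBH(bh, prefactor, force, torque);    // tree traversal: forces and torques
}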
__shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE]
void buildBoxBH(int blocks)
Build the Barnes-Hut spatial min/max position box.
__device__ volatile int maxdepthd
__device__ void dds_sumReduction_BH(float *input, float *sum)
void setBHPrecision(float epssq, float itolsq)
Set the Barnes-Hut precision parameters epssq and itolsq.
void energyBH(BHData *bh_data, float k, float *E)
Barnes-Hut energy calculation.
__device__ volatile int blkcntd
__global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel()
void forceBH(BHData *bh_data, float k, float *f, float *torque)
Barnes-Hut force calculation.
__global__ void forceCalculationKernel(float pf, float *force, float *torque)
__global__ void initializationKernel()
__constant__ float itolsqd[1]
void initBHgpu(int blocks)
Barnes-Hut CUDA initialization.
__shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE]
__shared__ int node[MAXDEPTH * THREADS5 / WARPSIZE]
__constant__ float epssqd[1]
__device__ volatile float radiusd
void summarizeBH(int blocks)
Calculate octant cell masses and cell index counts. Determine cell centers of mass and total dipole moments.
void deallocBH(BHData *bh_data)
Deallocate the GPU device memory.
__device__ volatile int bottomd
void allocBHmemCopy(int nbodies, BHData *bh_data)
Allocate the GPU device memory and initialize it where needed.
__device__ __constant__ volatile BHData bhpara[1]
__global__ void energyCalculationKernel(float pf, float *energySum)
void sortBH(int blocks)
Sort particle indexes according to the Barnes-Hut tree representation. Crucial for the per-warp performance of the force and energy traversal kernels.
void fill_bh_data(float const *r, float const *dip, BHData const *bh_data)
Copy Barnes-Hut data to bhpara and copy particle data.
void buildTreeBH(int blocks)
Build the Barnes-Hut tree in a linear child-array representation of octant cells and the particles inside them.
#define WARPSIZE
Barnes-Hut warp size.
#define MAXDEPTH
Maximal depth of the Barnes-Hut tree branching.
This file contains the defaults for ESPResSo.
EspressoGpuDevice cuda_get_device_props(int dev)
Get properties of a CUDA device.
int cuda_get_device()
Get the current CUDA device.
float * r
Particle positions on the device, stored as consecutive x, y, z triplets (r[3 * i + l]).
float * mass
Not a real mass. Just a node weight coefficient.
int nbodies
Each node corresponds to a split of the cubic box in 3D space into equal cubic boxes; hence, ...
int * child
The linear representation of the tree: 8 child indices per node.
float * maxp
Maximal position coordinates of the Barnes-Hut box.
int * sort
Indices of particles sorted according to the tree linear representation.
int * count
Supplementary array: per-node counts of the tree contents (division octant cells and the particles inside them).
float * u
Particle dipole moments on the device, stored as consecutive x, y, z triplets (u[3 * i + l]).
float * minp
Minimal position coordinates of the Barnes-Hut box.
int * max_lps
Traces the maximal number of loop iterations used for thread synchronization.
int * start
Start indices for the per-cell sorting.
#define KERNELCALL_shared(_function, _grid, _block, _stream,...)
#define KERNELCALL(_function, _grid, _block,...)
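For reference, a hedged usage sketch of these launch wrappers; the grid/block sizing shown is an assumption that simply mirrors the __launch_bounds__(THREADS1, FACTOR1) declaration above, and the exact macro definitions live elsewhere in the code base:

dim3 grid(1, 1, 1);
dim3 block(1, 1, 1);
grid.x = blocks * FACTOR1;   // assumed sizing, matching __launch_bounds__(THREADS1, FACTOR1)
block.x = THREADS1;
KERNELCALL(boundingBoxKernel, grid, block);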