ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
barnes_hut_gpu_cuda.cu
/*
 * Copyright (C) 2016-2022 The ESPResSo project
 * Copyright (C) 2012 Alexander (Polyakov) Peletskyi
 *
 * This file is part of ESPResSo.
 *
 * ESPResSo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ESPResSo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/** @file
 *  The method is based on @cite burtscher11a.
 */

#include "config/config.hpp"

#ifdef DIPOLAR_BARNES_HUT

// NOTE: assumed header path; the declarations used below (BHData, the
// THREADS*/FACTOR* macros, MAXDEPTH, WARPSIZE) come from this header.
#include "magnetostatics/barnes_hut_gpu_cuda.cuh"

#include "cuda/init.hpp"
#include "cuda/utils.cuh"

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

#include <cuda.h>

#include <algorithm>
#include <cstdio>
#include <stdexcept>

// Method performance/accuracy parameters
__constant__ float epssqd[1], itolsqd[1];
// blkcntd is the actual block count.
// bottomd is the bottom Barnes-Hut node (a division octant cell) in the
// linear array representation. maxdepthd is the largest length of an octree
// "branch", from the root down to a "leaf".
__device__ volatile int bottomd, maxdepthd, blkcntd;
// half the edge length of the BH box
__device__ volatile float radiusd;
// the struct containing all the device pointers
__device__ __constant__ volatile BHData bhpara[1];

// The "half-convolution" multi-thread reduction.
// Threads with lower indices stay active longer, and the
// final result (here: the sum) flows down towards thread zero.
__device__ void dds_sumReduction_BH(float *input, float *sum) {
  auto tid = static_cast<int>(threadIdx.x);
  for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
    __syncthreads();
    if (tid < i / 2)
      input[tid] += input[i / 2 + tid];
    if ((i % 2 == 1) && (tid == 0))
      input[tid] += input[i - 1];
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    sum[0] = input[0];
  }
}
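
// Worked example for blockDim.x == 6: the first pass (i == 6) adds
// input[3..5] into input[0..2]; the second pass (i == 3) adds input[1]
// into input[0] and, since i is odd, thread 0 also adds input[2]; the
// loop then terminates at i == 1 with the total in input[0].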

/******************************************************************************/
/*** initialize memory ********************************************************/
/******************************************************************************/

__global__ void initializationKernel() {
  auto const ind = blockDim.x * blockIdx.x + threadIdx.x;
  if (ind == 0) {
    *bhpara->err = 0;
    *bhpara->max_lps = 0;
    maxdepthd = 1;
    blkcntd = 0;
  }
}

/******************************************************************************/
/*** compute center and radius ************************************************/
/******************************************************************************/

__global__ __launch_bounds__(THREADS1, FACTOR1) void boundingBoxKernel() {
  // min/max positions per thread:
  float minp[3], maxp[3];
  // min/max positions per block:
  __shared__ float smin[3 * THREADS1], smax[3 * THREADS1];
  for (int l = 0; l < 3; l++) {
    minp[l] = maxp[l] = bhpara->r[l];
  }

  // Scan all bodies.
  // In order to iterate over all bodies assigned to the thread,
  // it is necessary to step over all threads in the GPU:
  // inc = [number of blocks: gridDim.x] * [THREADS1 per block within the
  // given kernel]. Hence, this approach can handle an arbitrary number of
  // bodies (particles).
  auto i = static_cast<int>(threadIdx.x);
  int const inc = THREADS1 * static_cast<int>(gridDim.x);
  // j is the absolute index of the particle.
  // It is shifted by the number of threads in the preceding blocks:
  // blockIdx.x * THREADS1. NOTE: this loop searches for the extrema among
  // all particles assigned to the given thread in the present block; it does
  // not yet reduce over all threads of this block.
  for (auto j = i + static_cast<int>(blockIdx.x) * THREADS1;
       j < bhpara->nbodies; j += inc)
    for (int l = 0; l < 3; l++) {
      auto const val = bhpara->r[3 * j + l];
      minp[l] = min(minp[l], val);
      maxp[l] = max(maxp[l], val);
    }

  // As the starting point of the per-block reduction, store the i-th
  // thread's extrema in the block shared memory:
  for (int l = 0; l < 3; l++) {
    smin[3 * i + l] = minp[l];
    smax[3 * i + l] = maxp[l];
  }

  // Now it's time to minimize/maximize among all threads of the given block.
  // Each min/max operation is applied between
  // the shared memory smin/smax and the given thread.
  // The "half-convolution" multi-thread reduction:
  // threads with lower indices stay active longer, and the
  // final result (here: the shared memory extrema)
  // flows down towards thread zero.
  for (int t = THREADS1 / 2; t > 0; t /= 2) {
    __syncthreads();
    if (i < t) {
      auto k = i + t;
      for (int l = 0; l < 3; l++) {
        smin[3 * i + l] = minp[l] = min(minp[l], smin[3 * k + l]);
        smax[3 * i + l] = maxp[l] = max(maxp[l], smax[3 * k + l]);
        // the very last minp/maxp assignment will be made by thread zero
        // (i == 0)
      }
    }
  }

  // Thread i == 0 is responsible for writing the
  // given block's result into global memory
  // and for other per-block operations.
  if (i == 0) {
    // per k-th block
    auto k = static_cast<int>(blockIdx.x);
    for (int l = 0; l < 3; l++) {
      // global memory storage of the per-block extrema
      bhpara->minp[3 * k + l] = minp[l];
      bhpara->maxp[3 * k + l] = maxp[l];
      // note that we are in thread zero and its variables minp/maxp
      // already contain the reduced (see above) shared extrema smin/smax
    }

    auto const n_blocks = static_cast<int>(gridDim.x) - 1;
    // The block counter is incremented by the block's thread zero.
    if (n_blocks == atomicInc((unsigned int *)&blkcntd, n_blocks)) {
      // I'm the (randomly) last block, so combine all other blocks' results
      // over the index im:
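      // NOTE: atomicInc(&blkcntd, n_blocks) returns the old counter value
      // and wraps the counter to zero once it reaches n_blocks, so exactly
      // one block (the last one to finish) passes this test, and blkcntd is
      // automatically reset for the next kernel invocation.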
      for (int im = 0; im <= n_blocks; im++)
        for (int l = 0; l < 3; l++) {
          minp[l] = min(minp[l], bhpara->minp[3 * im + l]);
          maxp[l] = max(maxp[l], bhpara->maxp[3 * im + l]);
        }

      // Compute the 'radius':
      auto const val = max(maxp[0] - minp[0], maxp[1] - minp[1]);
      radiusd = max(val, maxp[2] - minp[2]) * 0.5f;

      // NOTE: now the extrema are global.
      // The present code fragment is executed once: in thread zero of the
      // last block.

      k = bhpara->nnodes;
      // Create the root node of the Barnes-Hut octree.
      // The bottom node is initialized with the maximal possible index just
      // to start. It will be updated during the subsequent tree building in
      // the corresponding kernel.
      bottomd = k;
      // Init the weight of the root node.
      bhpara->mass[k] = -1.0f;
      // Sorting init for the tree root.
      bhpara->start[k] = 0;
      // The position of the root node should be in the center of the just
      // defined BH box:
      for (int l = 0; l < 3; l++)
        bhpara->r[3 * k + l] = (minp[l] + maxp[l]) * 0.5f;
      // For the subsequent tree building, init the root's octant children as
      // absent at the beginning:
      for (i = 0; i < 8; i++)
        bhpara->child[8 * k + i] = -1;
    }
  }
}

/******************************************************************************/
/*** build tree ***************************************************************/
/******************************************************************************/

__global__ __launch_bounds__(THREADS2, FACTOR2) void treeBuildingKernel() {
  int j, depth;
  float r;
  float pos[3];
  float p[3];
  int n;
  float root[3];

  // The radius is determined in boundingBoxKernel.
  auto const radius = radiusd;
  // The root node has been created at the end of the boundingBoxKernel.
  // Cache the root data:
  for (int l = 0; l < 3; l++)
    root[l] = bhpara->r[3 * bhpara->nnodes + l];
  // Maximum tree depth within the given thread.
  int localmaxdepth = 1;
  // Skip the branch following and start from the root.
  int skip = 1;
  // Number of loops for the threads sync algorithm.
  int lps = 0;
  // Increment to move among the bodies assigned to the given thread.
  // Hence, one should step over all other threads in the GPU with
  // a quantity of blockDim.x * gridDim.x.
  auto const inc = static_cast<int>(blockDim.x * gridDim.x);
  // Just a regular 1D GPU index
  auto i = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);

  // Iterate over all bodies assigned to the thread.
  while (i < bhpara->nbodies) {
    if (skip != 0) {
      // New body, so start traversing at the root; clear the skip flag.
      skip = 0;
      // Particle position corresponding to the given thread and block:
      for (int l = 0; l < 3; l++)
        p[l] = bhpara->r[3 * i + l];
      // Start moving through the tree from the root node 8 * nbodiesd:
      n = bhpara->nnodes;
      depth = 1;
      r = radius;
      // Determine which child to follow.
      // j = 0..7 determines the octant in a binary representation.
      j = 0;
      for (int l = 0; l < 3; l++)
        if (root[l] < p[l])
          j += static_cast<int>(pow(2, l));
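      // Octant indexing example: bit l of j is set when p[l] lies above the
      // cell center in dimension l. A body with p[0] > root[0],
      // p[1] <= root[1] and p[2] > root[2] gets j = 1 + 0 + 4 = 5.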
251 }
252
253 // Follow path to leaf cell. Should not happen at the first iteration of
254 // this loop.
255 int ch = bhpara->child[n * 8 + j];
256
257 // Global memory writing related threads sync
258 if (lps++ > THREADS2) {
259 *bhpara->max_lps = lps;
260 }
261 //.. now wait for global memory updates. This impacts on race conditions and
262 // frameworks level optimizations. Further kernels contain a similar code
263 // fragment.
264 __threadfence();
265
266 // The child with the index higher than nbodiesd (number of particles) means
267 // that it is an octant cell, not a body.
268 // Actually, we need nnodesd == 8 * nbodiesd nodes for the cells storage.
269 // Let's iterate this "while" loop before we will reach the particle
270 // or an absence of a child:
271 while (ch >= bhpara->nbodies) {
272 n = ch;
273 depth++;
274 // Going down through the octree depth: radius split corresponds to the
275 // cube split. Corresponding octants are cells, not bodies. Smaller radius
276 // will be used until the next skip == 1
277 r *= 0.5f;
278 j = 0;
279 // Determine which child octant to follow based on body coordinates.
280 // j=0..7 determines the octant in a binary representations.
281 for (int l = 0; l < 3; l++)
282 if (bhpara->r[3 * n + l] < p[l])
283 j += static_cast<int>(pow(2, l));
284 ch = bhpara->child[n * 8 + j];
285 }
286 // Now we are deep enough in the tree, passed all levels of cells and
287 // reached the body (particle).
288 if (ch != -2) { // Skip if child pointer is locked (-2) and try again later.
289 int const locked = n * 8 + j;
290 // Try to lock and iterate towards next body:
291 if (ch == atomicCAS((int *)&bhpara->child[locked], ch, -2)) {
292 // If we are here then child[locked] was equal to "ch" and now is
293 // assigned to -2, it is locked. We will not came here in a case if this
294 // child will be already locked by other threads because other particle
295 // could already do it in other thread. Several particles simultaneously
296 // could force a new octants' split. In this case we will not came here
297 // and "i += inc" will be not executed below. Hence, the present loop
298 // "while (i < nbodiesd)" will mandatory repeat in the given thread for
299 // the given i-th particle until other threads will unlock this cell for
300 // either a body insertion and/or new octant level cells creation.
301 if (ch == -2) {
302 // Cannot be here..
303 *bhpara->err = 1;
304 break;
305 }
306 if (ch == -1) {
307 // If -1 (i.e. no child index) then just insert a new body
308 bhpara->child[locked] = i;
309 } else {
310 int patch = -1;
311 // There already is a body and/or cell(s) in this position.
312 // We should start from a loop of the deepest child cell
313 // determination. Then, we need to create new cells and to distribute
314 // existing and new bodies between these new cells.
315 do {
316 // If we are here then the tree bottom level is moving further:
317 // Note, that the bottomd is not a global tree depth.
318 // It is rather a tree size in 1D representation of nnodesd == 8 *
319 // nbodiesd nodes. These lists will be correctly handled by the
320 // sortKernel later.
321 depth++;
322 int const cell = atomicSub((int *)&bottomd, 1) - 1;
323 if (cell <= bhpara->nbodies) {
324 // This should not happen. A cell cannot have such index. Error.
325 *bhpara->err = 1;
327 }
328 // The "patch" is saving the information about a first cell created
329 // in the current thread before it continues to dive into the tree
330 // depth. Hence, the "patch" defines the new branch inception index.
331 patch = max(patch, cell);
332
333 // Note: "j" is defined by the body position against the parent cell
334 // within above "while (ch >= nbodiesd)" loop.
335 // The new cell will be placed below relatively the center of
336 // corresponding j-th octant:
337 for (int l = 0; l < 3; l++)
338 pos[l] = static_cast<float>((j >> l) & 1) * r;
339 // Note, that negative octants correspond to pos[l] == 0 and
340 // positive octants correspond to pos[l] == r.
341
342 // Going down through the octree depth: radius split corresponds to
343 // the cube split. Corresponding octants are cells, not bodies.
344 // Smaller radius will be used until the next skip == 1
345 r *= 0.5f;
346
347 // Init the node weight coefficients.
348 // Note: particles has mass=1.0 is defined in allocBHmemCopy().
349 bhpara->mass[cell] = -1.0f;
350
351 // The startd array is crucial for the sortKernel.
352 // The original root node already has startd = 0.
353 // Here, we assign -1 to the cells in contrast to bodies.
354 // Bodies do not need this array. They need only array sortd,
355 // which will be defined in the sortKernel for a usage by the force
356 // and energy calculation kernels.
357 bhpara->start[cell] = -1;
358
359 // Now, let's save the cell coordinates locally (pos[l]) and
360 // globally (xd[3 * cell + l]). This location should be shifted from
361 // the octant center defined above (pos[l] before this assignment).
362 // Parent cell coordinates bhpara->r[3 * n + l] will be added.
363 // Parent radius now is equal to 2 * r, where "r" is already updated
364 // above: r *= 0.5f. Hence, the negative octant is defined above by
365 // pos[l] == 0 and positive - by pos[l] == 2 * r. In order to
366 // transform these coordinates into relative octant positions, we
367 // need to add -r to obtain -r and r for negative and positive
368 // octants. Now, the child (cell) octants centers are deriving from
369 // the parent (n) octant center:
370 for (int l = 0; l < 3; l++)
371 pos[l] = bhpara->r[3 * cell + l] =
372 bhpara->r[3 * n + l] - r + pos[l];
373
374 // By default, the new cell has no children in all k-th octants:
375 for (int k = 0; k < 8; k++)
376 bhpara->child[cell * 8 + k] = -1;
377
378 // This condition should always be true cause "patch" is -1 at the
379 // beginning and the bottomd/cell reduces further.
380 if (patch != cell) {
381 // New cell is assigned as a child of previous "n" parent:
382 bhpara->child[n * 8 + j] = cell;
383 }
384
385 // pos[l] already contains the child cell coordinates.
386 // Let's assign "child" then. First the octant should be selected:
387 j = 0;
388 for (int l = 0; l < 3; l++)
389 if (pos[l] < bhpara->r[3 * ch + l])
390 j += static_cast<int>(pow(2, l));
391 // New element just appeared in the chain of cells. Hence, that what
392 // supposed to be a child ("ch") before entering the present
393 // iteration, now will be a child of the new cell (after this
394 // smallest octant split into new octants):
395 bhpara->child[cell * 8 + j] = ch;
396
397 // Now cell is claimed to be a parent of further iteration of the
398 // present loop.
399 n = cell;
400 j = 0;
401 __threadfence();
402 // Let's handle the particle position (p[l]) corresponding to the
403 // given thread and block against new octant cell (pos[l]):
404 for (int l = 0; l < 3; l++)
405 if (pos[l] < p[l])
406 j += static_cast<int>(pow(2, l));
407
408 // Now the current cell's child should be considering in the new
409 // particle new octant:
410 ch = bhpara->child[n * 8 + j];
411 // Repeat until the two bodies will be different children.
412 // Hence, the current "child" should have no children.
413 // It is equivalent to an absence of other particles
414 // in the i-th particle new smallest octant, otherwise we should
415 // split octants further until these two particles will come to
416 // different octants:
417 } while (ch >= 0);
418
419 // i-th particle assignment as a child to the last created cell:
420 bhpara->child[n * 8 + j] = i;
421 // Push out the subtree among the whole grid.
422 // Data setting must be completed after this point.
423 __threadfence();
424 // The final locked child index is defined by a maximal cell index,
425 // i.e. by a beginning of the new tree of cells created within
426 // the loop "while (ch >= 0)".
427 // The "patch" defines the new just created branch inception index:
428 bhpara->child[locked] = patch;
429 }
430
431 localmaxdepth = max(depth, localmaxdepth);
432 // Each thread started from the skip=1 and made the above
433 // tree building loop procedure (while (ch >= 0)).
434 // They should do the same for remaining (each) particles.
435 // Note, that bottomd is a global variable and it is already updated.
436 // This is already taken into account in further sortKernel logic.
437 // Hence, move on to the next body assigned to the given thread:
438 i += inc;
439 skip = 1;
440 lps = 0;
441 }
442 }
443 }
444 // Record maximum tree depth:
445 atomicMax((int *)&maxdepthd, localmaxdepth);
446}

/******************************************************************************/
/*** compute center of mass ***************************************************/
/******************************************************************************/

__global__ __launch_bounds__(THREADS3, FACTOR3) void summarizationKernel() {
  int i, j, cnt;
  // the node "mass" and its count, respectively:
  float m, cm;
  // position of the equivalent total dipole and its magnitude:
  // (like a mass and the center of mass)
  float p[3], u[3];
  // Per-block BH tree caching:
  __shared__ int child[THREADS3 * 8];

  // no children by default:
  for (i = 0; i < 8; i++)
    child[i * THREADS3 + threadIdx.x] = -1;
  auto const bottom = bottomd;
  // Increment towards other particles assigned to the given thread:
  auto const inc = static_cast<int>(blockDim.x * gridDim.x);
  // The node iteration index "k" should start from the "bottomd" level of
  // the cells, which is the minimal index of the last created cell.
  // The starting "k" value should be aligned using the warp size
  // according to the designed threads performance.
  // k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x;
  auto k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  // Threads below the bottom line could proceed to their next cells.
  // if (k < bottom) k += inc;

  // Assume no missing children:
  int missing = 0;
  int missing_max = 0;
  int iteration = 0;
  int repeat_flag = 0;
  __syncthreads(); // throttle
  // threads sync related
  int lps = 0;
  // Iterate over all cells (not particles) assigned to the thread:
  while (k <= bhpara->nnodes) {
    if (lps++ > THREADS3) {
      *bhpara->max_lps = lps;
      __threadfence();
    }
    if (bhpara->mass[k] < 0.) {
      iteration++;
      if (missing == 0) {
        // New cell, so initialize:
        cm = 0.0f;
        for (int l = 0; l < 3; l++) {
          p[l] = 0.0f;
          u[l] = 0.0f;
        }
        cnt = 0;
        j = 0;
        for (i = 0; i < 8; i++) {
          auto const ch = bhpara->child[k * 8 + i];
          if (ch >= 0) {
            if (i != j) {
              // Move the child to the front (needed later for speed only).
              // The child's octant change is incorrect from
              // a tree organization perspective. However, the sum
              // will be the same.
              bhpara->child[k * 8 + i] = -1;
              bhpara->child[k * 8 + j] = ch;
            }
            // Cache a missing child in the block shared memory:
            child[missing * THREADS3 + threadIdx.x] = ch;
            m = bhpara->mass[ch];
            // Is the child a particle? Only particles have a non-negative
            // mass initialized originally. The other option is a cell which
            // has already aggregated the masses of other cells and
            // particles. "missing" means that a non-zero contribution of
            // such kind is missing:
            missing++;
            if (m >= 0.0f) {
              // child is ready
              missing--;
              // The child is a cell, not a body (ch >= nbodiesd).
              // Also, the previous condition (m >= 0.0f) reveals
              // that its children's total mass is already calculated.
              // Hence, the command "countd[k] = cnt" below has already been
              // executed by other threads/blocks and we can add this count.
              if (ch >= bhpara->nbodies) { // count bodies (needed later)
                // Since the child is a cell, its "countd" was already
                // calculated.
                cnt += bhpara->count[ch] - 1;
              }
              // add the child's contribution
              cm += m;
              for (int l = 0; l < 3; l++) {
                p[l] += bhpara->r[3 * ch + l] * m;
                u[l] += bhpara->u[3 * ch + l];
              }
            }
            j++;
          } // if (ch >= 0)
        }
        missing_max = missing;
        // Count of children:
        cnt += j;
      }

      //__syncthreads(); // throttle

      if (missing != 0) {
        for (int im = 0; im < missing_max; im++) {
          // poll the missing child
          auto const ch = child[im * THREADS3 + threadIdx.x];
          if (ch >= 0) {
            m = bhpara->mass[ch];
            // Is the child a particle? Only particles have a non-negative
            // mass initialized originally. The other option is a cell which
            // has already aggregated the masses of other cells and
            // particles.
            if (m >= 0.0f) {
              // the child is now ready
              missing--;
              child[im * THREADS3 + threadIdx.x] = -1;
              // The child is a cell, not a body (ch >= nbodiesd).
              if (ch >= bhpara->nbodies) {
                // count bodies (needed later)
                cnt += bhpara->count[ch] - 1;
              }
              // add the child's contribution
              cm += m;
              for (int l = 0; l < 3; l++) {
                p[l] += bhpara->r[3 * ch + l] * m;
                u[l] += bhpara->u[3 * ch + l];
              }
            } // m >= 0.0f
          } // ch >= 0
        } // missing_max
        // repeat until we are done or the child is not ready
      }

      //__syncthreads(); // throttle

      // (missing == 0) can become true, letting the thread move on to its
      // next cell (k += inc), only if the preceding conditions (m >= 0.0f)
      // hold. That happens only once the cell obtains its mass (only here
      // below: "massd[k] = cm") or once the very last children, the
      // particles, are found. Until then, the polling loop continues.
      if (missing == 0) {
        // all children are ready, so store the computed information
        bhpara->count[k] = cnt;
        m = 1.0f / cm;
        for (int l = 0; l < 3; l++) {
          bhpara->r[3 * k + l] = p[l] * m;
          bhpara->u[3 * k + l] = u[l];
        }
        __threadfence(); // make sure the data are visible before setting
                         // the mass
        bhpara->mass[k] = cm;
        __threadfence();
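        // NOTE: mass[k] >= 0 effectively acts as the "ready" flag polled by
        // other threads above; the fences order the payload (r, u, count)
        // before the flag becomes visible, and the flag before any later
        // writes.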
        k += inc;
        iteration = 0;
        lps = 0;
      }
      //__syncthreads(); // throttle
      if (iteration > THREADS3 + 1) {
        k += inc;
        repeat_flag = 1;
        iteration = 0;
        missing = 0;
      }
    } else {
      k += inc;
    }
    if ((k > bhpara->nnodes) && (repeat_flag)) {
      repeat_flag = 0;
      missing = 0;
      k = bottom + static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
    }
  } // while
}

/******************************************************************************/
/*** sort bodies **************************************************************/
/******************************************************************************/

// This kernel concurrently places the bodies into an array such that the
// bodies appear in the same order in the array as they would during an
// in-order traversal of the octree. This sorting groups spatially close
// bodies (in the same octant cells) together, and these grouped bodies are
// crucial to speed up forceCalculationKernel and energyCalculationKernel.
__global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() {
  auto const bottom = bottomd;
  auto const dec = static_cast<int>(blockDim.x * gridDim.x);
  // Start from the end of the nnodesd == 8 * nbodiesd.
  // Reverse order is required now, because octant cells closer
  // to the root contain a larger count of entities (countd[k]).
  // Particles should be sorted by the total entity count in the tree array
  // representation made by treeBuildingKernel.
  int k = bhpara->nnodes + 1 - dec +
          static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  // threads sync related
  int lps = 0;

  // iterate over all cells assigned to the thread
  while (k >= bottom) {
    int start = bhpara->start[k];
    // Threads sync related
    if (lps++ > THREADS4) {
      *bhpara->max_lps = lps;
      __threadfence();
    }
    // Let's start from the root, which is the only node with startd = 0, as
    // defined in boundingBoxKernel. All other bodies and cells have -1.
    if (start >= 0) {
      for (int i = 0; i < 8; i++) {
        int const ch = bhpara->child[k * 8 + i];
        if (ch >= bhpara->nbodies) {
          // child is a cell
          bhpara->start[ch] = start;  // set start ID of child
          start += bhpara->count[ch]; // add # of bodies in subtree
        } else if (ch >= 0) {
          // The child is a body.
          // This particle should be saved, stepping over
          // the count of particles in the cells.
          // treeBuildingKernel has already ordered the cells in a
          // linear array. The sortKernel merely orders the random particle
          // indices the same way. Hence, they will be accessed much faster
          // by forceCalculationKernel and energyCalculationKernel.
          bhpara->sort[start] = ch; // record the body in the 'sorted' array
          start++;
        }
      }
      k -= dec; // move on to the next cell
      // Threads sync related
      lps = 0;
    }
    //__syncthreads(); // throttle
  }
}

/******************************************************************************/
/*** compute dipole-dipole force and torque ***********************************/
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
    float pf, float *force, float *torque) {
  int i, t;
  // dr is the distance between particles.
  // f, h, and N are the target force, field, and torque, respectively.
  // u and uc are the dipole moments of two particles.
  float dr[3], f[3], h[3], u[3], uc[3], N[3];
  // Shared memory aggregates of the warp-specific stacks,
  // with MAXDEPTH entries per warp:
  // "node" is the BH octant sub-cell in the stack.
  // "pos" = 0..7 is the octant we are examining now in the stack.
  // dq is an array used to determine whether the given BH cell is far
  // enough away.
  __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
      node[MAXDEPTH * THREADS5 / WARPSIZE];
  __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];

  // Thread zero of the block initializes the shared data for all warps.
  if (threadIdx.x == 0) {
    // Precompute values that depend only on the tree level.
    // The method's parameters (an accuracy/performance trade-off),
    // which determine whether the
    // cell is far enough away, are "itolsqd" and
    // "epssqd"; they define a fraction of the octant cell and
    // an additive distance, respectively.
    // Their joint contribution for the given tree depth is
    // calculated within the array dq[i], which will
    // be compared later with the squared distance between the particle
    // and the cell, depending on the cell level.
    // The original tree box edge (2 * radiusd) is halved
    // once per tree level.
    dq[0] = radiusd * radiusd * *itolsqd;
    for (i = 1; i < maxdepthd; i++) {
      dq[i] = dq[i - 1] * 0.25f; // halving of the squared distance
      dq[i - 1] += *epssqd;      // increase the thickness of the previous
                                 // cell
    }
    dq[i - 1] += *epssqd;
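
    // By construction, dq[i] = radiusd^2 * itolsqd / 4^i + epssqd, i.e. the
    // classic Barnes-Hut opening criterion: a cell at depth i is accepted
    // when the squared particle-cell distance satisfies tmp >= dq[i], with
    // epssqd acting as an additive softening term.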

    // Only the maximal Barnes-Hut tree depth is allowed.
    // This error is technically possible; however, most applications
    // are far from the particle counts that would exceed it.
    if (maxdepthd > MAXDEPTH) {
      *bhpara->err = maxdepthd;
    }
  }
  __syncthreads();

  // Only the maximal Barnes-Hut tree depth is allowed:
  if (maxdepthd <= MAXDEPTH) {
    // Which warp the current thread belongs to:
    auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
    // Figure out the first thread in each warp (lane 0):
    auto const sbase = base * WARPSIZE;
    // The initial stack index is the start of the MAXDEPTH portion for the
    // given warp count base:
    auto const j = base * MAXDEPTH;

    // How far the thread is from the warp beginning (its lane):
    auto const diff = static_cast<int>(threadIdx.x) - sbase;
    // Make multiple copies to avoid index calculations later:
    if (diff < MAXDEPTH) {
      // Each thread copies its own dq[] element to the part of the
      // dq array dedicated to the given warp:
      dq[diff + j] = dq[diff];
    }
    __syncthreads();

    // Iterate over all bodies assigned to the thread:
    for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
         k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
      // Sorted body indices assigned to me:
      i = bhpara->sort[k]; // get permuted/sorted index
      // Cache the particle position info:
      for (int l = 0; l < 3; l++) {
        u[l] = bhpara->u[3 * i + l];
        h[l] = 0.0f;
        f[l] = 0.0f;
      }

      // The force will be calculated for the i-th particle.
      // All other space is of interest, hence all cells will be considered.
      // Hence, we should start from the root node (the whole Barnes-Hut
      // cube). Initialize the iteration stack, i.e., push the root node
      // onto the stack, and start from its zero octant.
      auto depth = j;
      if (sbase == threadIdx.x) {
        node[j] = bhpara->nnodes;
        pos[j] = 0;
      }

      while (depth >= j) {
        // The stack is not empty (depth is still higher than the j-level).
        // Hence, there are still some children to consider.
        while ((t = pos[depth]) < 8) {
          // The node on top of the stack has more children to process:
          auto const n = bhpara->child[node[depth] * 8 + t]; // child pointer
          if (sbase == threadIdx.x) {
            // I'm the first thread in the warp.
            // Let me check, for the current depth level of the tree,
            // whether we have more children among the 8 octant cells.
            // Hence, let's go to the next octant in the next iteration only:
            pos[depth] = t + 1;
          }
          __threadfence_block();
          // There is a child (octant cell) with a dipole moment
          // bhpara->u[3 * n + l] and center position bhpara->r[3 * n + l]:
          if (n >= 0) {
            auto tmp = 0.0f; // compute the squared distance
            for (int l = 0; l < 3; l++) {
              dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l];
              tmp += dr[l] * dr[l];
            }

            // NOTE: the i-th particle is specific to the given thread.
            // However, the above-mentioned index "i" has already been
            // sorted by the sortKernel. Hence, the stack-diving operations
            // below will be performed by different threads of the same warp
            // almost synchronously, because adjacent threads have particles
            // located in space not far from each other (adjacent octants of
            // the corresponding local maxdepth). Hence, such particles have
            // a similar stack in the execution below from the perspective
            // of their distance to other Barnes-Hut tree branches. The same
            // global memory fragments (child[]) will be loaded by the same
            // warp access, and no corresponding thread divergence will take
            // place. Only in this way can thread zero control the others:
            // see "if (sbase == threadIdx.x)" above and below.. and this
            // control will be correct. This holds even with a single set of
            // stack arrays per warp: the pos, node and dq array fragments
            // are shared between all threads of the whole warp.

            // Check if all threads agree that the cell is far enough away
            // (or is a body, i.e. n < nbodiesd).
            if ((n < bhpara->nbodies) ||
                __all_sync(__activemask(), tmp >= dq[depth])) {
              if (n != i) {

                auto const d1 = sqrtf(tmp);
                auto const dd5 = __fdividef(1.0f, tmp * tmp * d1);
                auto b = 0.0f;
                auto b2 = 0.0f;
                auto umd5 = 0.0f;
                for (int l = 0; l < 3; l++) {
                  uc[l] = bhpara->u[3 * n + l];
                  b += uc[l] * dr[l];
                  b2 += u[l] * dr[l];
                  umd5 += u[l] * uc[l];
                }
                umd5 *= -3.0f * dd5;

                auto const bb2d7 = 15.0f * b * b2 * __fdividef(dd5, tmp);

                for (int l = 0; l < 3; l++) {
                  h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
                  f[l] += -dr[l] * (umd5 + bb2d7) +
                          3.0f * (b * u[l] + b2 * uc[l]) * dd5;
                }
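                // In vector form, with d = |dr|, dd5 = 1/d^5, b = uc.dr and
                // b2 = u.dr, the loop above accumulates the dipolar field
                //   h += (3 (uc.dr) dr - d^2 uc) / d^5
                // and the dipole-dipole force
                //   f += 3 [(u.uc) dr + (uc.dr) u + (u.dr) uc] / d^5
                //        - 15 (uc.dr) (u.dr) dr / d^7.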
              }
            } else {
              // If it is not, then let's split the octants further (more
              // depth). Push the cell onto the stack:
              depth++;
              if (sbase == threadIdx.x) {
                // The given warp should continue from this child as a root:
                node[depth] = n;
                // Let's start from its zero octant:
                pos[depth] = 0;
              }
              __threadfence_block();
            }
          } else {
            // Early out, because all remaining children are also zero.
            // We should move to the next octant, or to the next depth if
            // other threads have already checked the other octants:
            depth = max(j, depth - 1);
          }
        }
        depth--; // done with this level
      }

      // Torque:
      N[0] = u[1] * h[2] - u[2] * h[1];
      N[1] = u[2] * h[0] - u[0] * h[2];
      N[2] = u[0] * h[1] - u[1] * h[0];

      for (int l = 0; l < 3; l++) {
        if (f[l] != f[l] || h[l] != h[l]) { // nan
          printf("Force Kernel: NAN in particle[%d]\n", i);
          printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0],
                 bhpara->u[3 * i + 1], bhpara->u[3 * i + 2]);
          printf("fx = %f, fy = %f, fz = %f,\n", f[0], f[1], f[2]);
          printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
        }
        atomicAdd(force + 3 * i + l, f[l] * pf);
        atomicAdd(torque + 3 * i + l, N[l] * pf);
      }
    }
  }
}

/******************************************************************************/
/*** compute energy ***********************************************************/
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
    float pf, float *energySum) {
  // NOTE: the algorithm of this kernel is almost identical to
  // @ref forceCalculationKernel. See the comments there.

  int i, n, t;
  float dr[3], h[3], u[3], uc[3];
  __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
      node[MAXDEPTH * THREADS5 / WARPSIZE];
  __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
  float sum = 0.0;
  extern __shared__ float res[];

  if (threadIdx.x == 0) {
    // precompute values that depend only on the tree level
    dq[0] = radiusd * radiusd * *itolsqd;
    for (i = 1; i < maxdepthd; i++) {
      dq[i] = dq[i - 1] * 0.25f;
      dq[i - 1] += *epssqd;
    }
    dq[i - 1] += *epssqd;

    if (maxdepthd > MAXDEPTH) {
      *bhpara->err = maxdepthd;
    }
  }
  __syncthreads();

  if (maxdepthd <= MAXDEPTH) {
    // figure out the first thread in each warp (lane 0)
    auto const base = static_cast<int>(threadIdx.x) / WARPSIZE;
    auto const sbase = base * WARPSIZE;
    auto const j = base * MAXDEPTH;

    auto const diff = static_cast<int>(threadIdx.x) - sbase;
    // make multiple copies to avoid index calculations later
    if (diff < MAXDEPTH) {
      dq[diff + j] = dq[diff];
    }
    __syncthreads();

    // iterate over all bodies assigned to the thread
    for (auto k = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
         k < bhpara->nbodies; k += static_cast<int>(blockDim.x * gridDim.x)) {
      i = bhpara->sort[k]; // get permuted/sorted index
      // cache the position info
      for (int l = 0; l < 3; l++) {
        u[l] = bhpara->u[3 * i + l];
        h[l] = 0.0f;
      }

      // initialize the iteration stack, i.e., push the root node onto the
      // stack
      auto depth = j;
      if (sbase == threadIdx.x) {
        node[j] = bhpara->nnodes;
        pos[j] = 0;
      }

      while (depth >= j) {
        // the stack is not empty
        while ((t = pos[depth]) < 8) {
          // the node on top of the stack has more children to process
          n = bhpara->child[node[depth] * 8 + t]; // load the child pointer
          if (sbase == threadIdx.x) {
            // I'm the first thread in the warp
            pos[depth] = t + 1;
          }
          __threadfence_block();
          if (n >= 0) {
            auto tmp = 0.0f;
            for (int l = 0; l < 3; l++) {
              dr[l] = -bhpara->r[3 * n + l] + bhpara->r[3 * i + l];
              tmp += dr[l] * dr[l];
            }
            // check if all threads agree that the cell is far enough away
            // (or is a body)
            if ((n < bhpara->nbodies) ||
                __all_sync(__activemask(), tmp >= dq[depth])) {
              if (n != i) {
                auto const d1 = sqrtf(tmp);
                auto const dd5 = __fdividef(1.0f, tmp * tmp * d1);
                auto b = 0.0f;
                for (int l = 0; l < 3; l++) {
                  uc[l] = bhpara->u[3 * n + l];
                  b += uc[l] * dr[l];
                }

                for (int l = 0; l < 3; l++)
                  h[l] += (b * 3.0f * dr[l] - tmp * uc[l]) * dd5;
              }
            } else {
              // push the cell onto the stack
              depth++;
              if (sbase == threadIdx.x) {
                node[depth] = n;
                pos[depth] = 0;
              }
              __threadfence_block();
            }
          } else {
            depth = max(j, depth - 1); // early out, because all remaining
                                       // children are also zero
          }
        }
        depth--; // done with this level
      }

      for (int l = 0; l < 3; l++) {
        sum += -u[l] * h[l];
        if (h[l] != h[l]) { // nan
          printf("Energy Kernel: NAN in particle[%d]\n", i);
          printf("x = %f, y = %f, z = %f,\n", bhpara->u[3 * i + 0],
                 bhpara->u[3 * i + 1], bhpara->u[3 * i + 2]);
          printf("hx = %f, hy = %f, hz = %f,\n", h[0], h[1], h[2]);
        }
      }
    }
  }

  sum *= pf;
  // the self-consistent field energy;
  // the Barnes-Hut algorithm probably does not allow one to avoid this
  // division by 2, because it is not symmetric:
  sum /= 2.0f;
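  // At this point each thread holds its share of E = -(pf/2) * sum_i u_i.H_i,
  // the dipolar energy of its particles in the Barnes-Hut field.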
  // Save the per-thread result into the block shared memory
  res[threadIdx.x] = sum;
  // Sum the results within the block
  __syncthreads(); // Wait until all threads in the block are done
  // NOTE: reconstructed call; the per-block sum is written to
  // energySum[blockIdx.x], matching the per-block reduction in energyBH().
  dds_sumReduction_BH(res, &(energySum[blockIdx.x]));
}

// Required BH CUDA init.
void initBHgpu(int blocks) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = blocks * FACTOR5;
  block.x = THREADS5;

  // Reset the error flags, the loop counter and the tree depth:
  KERNELCALL(initializationKernel, grid, block);

  // According to the experimental performance optimization:
  cudaFuncSetCacheConfig(boundingBoxKernel, cudaFuncCachePreferShared);
  cudaFuncSetCacheConfig(treeBuildingKernel, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig(summarizationKernel, cudaFuncCachePreferShared);
  cudaFuncSetCacheConfig(sortKernel, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig(forceCalculationKernel, cudaFuncCachePreferL1);
  cudaFuncSetCacheConfig(energyCalculationKernel, cudaFuncCachePreferL1);

  cudaGetLastError(); // reset error value
}

// Building Barnes-Hut spatial min/max position box
void buildBoxBH(int blocks) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = blocks * FACTOR1;
  block.x = THREADS1;

  cudaDeviceSynchronize();
  KERNELCALL(boundingBoxKernel, grid, block);
  cuda_safe_mem(cudaDeviceSynchronize());
}

// Building Barnes-Hut tree in a linear child array representation
// of octant cells and particles inside.
void buildTreeBH(int blocks) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = blocks * FACTOR2;
  block.x = THREADS2;

  KERNELCALL(treeBuildingKernel, grid, block);
  cuda_safe_mem(cudaDeviceSynchronize());
}

// Calculate octant cells masses and cell index counts.
// Determine cells centers of mass and total dipole moments
// on all possible levels of the BH tree.
void summarizeBH(int blocks) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = blocks * FACTOR3;
  block.x = THREADS3;

  KERNELCALL(summarizationKernel, grid, block);
  cuda_safe_mem(cudaDeviceSynchronize());
}

// Sort particle indexes according to the BH tree representation.
// Crucial for the per-warp performance tuning of forceCalculationKernel and
// energyCalculationKernel.
void sortBH(int blocks) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = blocks * FACTOR4;
  block.x = THREADS4;

  KERNELCALL(sortKernel, grid, block);
  cuda_safe_mem(cudaDeviceSynchronize());
}

// Force calculation.
void forceBH(BHData *bh_data, float k, float *f, float *torque) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = bh_data->blocks * FACTOR5;
  block.x = THREADS5;

  KERNELCALL(forceCalculationKernel, grid, block, k, f, torque);
  cuda_safe_mem(cudaDeviceSynchronize());

  int error_code = 0;
  cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
                           cudaMemcpyDeviceToHost));
  if (error_code) {
    throw std::runtime_error("force kernel encountered a functional error");
  }
}

// Energy calculation.
void energyBH(BHData *bh_data, float k, float *E) {
  dim3 grid(1, 1, 1);
  dim3 block(1, 1, 1);

  grid.x = bh_data->blocks * FACTOR5;
  block.x = THREADS5;

  float *energySum;
  cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));
  // clean up the memory for the energy sum
  cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(float) * grid.x)));

  KERNELCALL_shared(energyCalculationKernel, grid, block,
                    block.x * sizeof(float), k, energySum);
  cuda_safe_mem(cudaDeviceSynchronize());

  // Sum the results of all blocks
  // (one partial energy per block from the previous kernel)
  thrust::device_ptr<float> t(energySum);
  float x = thrust::reduce(t, t + grid.x);
  cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));

  cuda_safe_mem(cudaFree(energySum));

  int error_code = 0;
  cuda_safe_mem(cudaMemcpy(&error_code, bh_data->err, sizeof(int),
                           cudaMemcpyDeviceToHost));
  if (error_code) {
    throw std::runtime_error("energy kernel encountered a functional error");
  }
}

void setBHPrecision(float epssq, float itolsq) {
  cuda_safe_mem(cudaMemcpyToSymbol(epssqd, &epssq, sizeof(float), 0,
                                   cudaMemcpyHostToDevice));
  cuda_safe_mem(cudaMemcpyToSymbol(itolsqd, &itolsq, sizeof(float), 0,
                                   cudaMemcpyHostToDevice));
}

void deallocBH(BHData *bh_data) {
  cuda_safe_mem(cudaFree(bh_data->err));
  cuda_safe_mem(cudaFree(bh_data->max_lps));
  cuda_safe_mem(cudaFree(bh_data->child));
  cuda_safe_mem(cudaFree(bh_data->count));
  cuda_safe_mem(cudaFree(bh_data->start));
  cuda_safe_mem(cudaFree(bh_data->sort));
  cuda_safe_mem(cudaFree(bh_data->mass));
  cuda_safe_mem(cudaFree(bh_data->maxp));
  cuda_safe_mem(cudaFree(bh_data->minp));
  cuda_safe_mem(cudaFree(bh_data->r));
  cuda_safe_mem(cudaFree(bh_data->u));
  bh_data->err = nullptr;
  bh_data->max_lps = nullptr;
  bh_data->child = nullptr;
  bh_data->count = nullptr;
  bh_data->start = nullptr;
  bh_data->sort = nullptr;
  bh_data->mass = nullptr;
  bh_data->maxp = nullptr;
  bh_data->minp = nullptr;
  bh_data->r = nullptr;
  bh_data->u = nullptr;
}

void allocBHmemCopy(int nbodies, BHData *bh_data) {
  if (bh_data->nbodies == nbodies)
    return;

  bh_data->nbodies = nbodies;

  auto const devID = cuda_get_device();
  auto const dev = cuda_get_device_props(devID);

  bh_data->blocks = dev.n_cores;
  // Each node corresponds to a split of the cubic box in 3D space into 8
  // equal cubic boxes; hence, 8 nodes per particle is a theoretical octree
  // limit:
  bh_data->nnodes = bh_data->nbodies * 8;

  int const n_total_threads = 1024 * bh_data->blocks;
  if (bh_data->nnodes < n_total_threads)
    bh_data->nnodes = n_total_threads;
  else
    bh_data->nnodes = (bh_data->nnodes / n_total_threads) * n_total_threads;
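  // The node count is rounded to a multiple of the maximal total thread
  // count (1024 threads per core-block). E.g. with blocks = 2 the total is
  // 2048: nbodies = 100 gives nnodes = 2048, while nbodies = 1000 gives
  // 8000, rounded down to 6144.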

  cuda_safe_mem(cudaFree(bh_data->err));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->err), sizeof(int)));

  cuda_safe_mem(cudaFree(bh_data->max_lps));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->max_lps), sizeof(int)));

  cuda_safe_mem(cudaFree(bh_data->child));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->child),
                           sizeof(int) * (bh_data->nnodes + 1) * 8));

  cuda_safe_mem(cudaFree(bh_data->count));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->count),
                           sizeof(int) * (bh_data->nnodes + 1)));

  cuda_safe_mem(cudaFree(bh_data->start));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->start),
                           sizeof(int) * (bh_data->nnodes + 1)));

  cuda_safe_mem(cudaFree(bh_data->sort));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->sort),
                           sizeof(int) * (bh_data->nnodes + 1)));

  // Weight coefficients of m_bhnnodes nodes: both particles and octant cells
  cuda_safe_mem(cudaFree(bh_data->mass));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->mass),
                           sizeof(float) * (bh_data->nnodes + 1)));

  // The n particles have unitary weight coefficients.
  // Cells will be marked with -1 later.
  auto *mass_tmp = new float[bh_data->nbodies];
  for (int i = 0; i < bh_data->nbodies; i++) {
    mass_tmp[i] = 1.0f;
  }
  cuda_safe_mem(cudaMemcpy(bh_data->mass, mass_tmp,
                           sizeof(float) * bh_data->nbodies,
                           cudaMemcpyHostToDevice));
  delete[] mass_tmp;
  // (max[3*i], max[3*i+1], max[3*i+2])
  // are the octree box dynamical spatial constraints;
  // this array is updated per block at each interaction calculation
  // within the boundingBoxKernel
  cuda_safe_mem(cudaFree(bh_data->maxp));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->maxp),
                           sizeof(float) * bh_data->blocks * FACTOR1 * 3));
  // (min[3*i], min[3*i+1], min[3*i+2])
  // are the octree box dynamical spatial constraints;
  // this array is updated per block at each interaction calculation
  // within the boundingBoxKernel
  cuda_safe_mem(cudaFree(bh_data->minp));
  cuda_safe_mem(cudaMalloc((void **)&(bh_data->minp),
                           sizeof(float) * bh_data->blocks * FACTOR1 * 3));

  cuda_safe_mem(cudaFree(bh_data->r));
  cuda_safe_mem(
      cudaMalloc(&(bh_data->r), 3 * (bh_data->nnodes + 1) * sizeof(float)));

  cuda_safe_mem(cudaFree(bh_data->u));
  cuda_safe_mem(
      cudaMalloc(&(bh_data->u), 3 * (bh_data->nnodes + 1) * sizeof(float)));
}

void fill_bh_data(float const *r, float const *dip, BHData const *bh_data) {
  auto const size = 3 * bh_data->nbodies * sizeof(float);
  cuda_safe_mem(cudaMemcpyToSymbol(bhpara, bh_data, sizeof(BHData), 0,
                                   cudaMemcpyHostToDevice));
  cuda_safe_mem(cudaMemcpy(bh_data->r, r, size, cudaMemcpyDeviceToDevice));
  cuda_safe_mem(cudaMemcpy(bh_data->u, dip, size, cudaMemcpyDeviceToDevice));
}
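
// A typical call sequence, as implied by the kernels above (a sketch only;
// the actual driver lives in the dipolar Barnes-Hut actor, not in this
// file, and the variable names below are illustrative):
//   setBHPrecision(epssq, itolsq);
//   allocBHmemCopy(n_particles, &bh_data);  // (re)allocate device arrays
//   fill_bh_data(r_gpu, dip_gpu, &bh_data); // copy positions/dipoles
//   initBHgpu(bh_data.blocks);              // reset counters and flags
//   buildBoxBH(bh_data.blocks);             // bounding box and root node
//   buildTreeBH(bh_data.blocks);            // octree construction
//   summarizeBH(bh_data.blocks);            // masses and dipole sums
//   sortBH(bh_data.blocks);                 // in-order particle sort
//   forceBH(&bh_data, prefactor, f_gpu, torque_gpu);
//   // and/or: energyBH(&bh_data, prefactor, E_gpu);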

#endif // DIPOLAR_BARNES_HUT