ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
Loading...
Searching...
No Matches
StreamSweepDoublePrecisionAVX.cpp
Go to the documentation of this file.
1//======================================================================================================================
2//
3// This file is part of waLBerla. waLBerla is free software: you can
4// redistribute it and/or modify it under the terms of the GNU General Public
5// License as published by the Free Software Foundation, either version 3 of
6// the License, or (at your option) any later version.
7//
8// waLBerla is distributed in the hope that it will be useful, but WITHOUT
9// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11// for more details.
12//
13// You should have received a copy of the GNU General Public License along
14// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
15//
16//! \file StreamSweepDoublePrecisionAVX.cpp
17//! \ingroup lbm
18//! \author lbmpy
19//======================================================================================================================
20
21// kernel generated with pystencils v1.2, lbmpy v1.2, lbmpy_walberla/pystencils_walberla from waLBerla commit 4d10e7f2358fc4a4f7e99195d0f67f0b759ecb6f
22
23#include <cmath>
24
26#include "core/DataTypes.h"
27#include "core/Macros.h"
28
29#include <immintrin.h>
30
31#define FUNC_PREFIX
32
33#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
34#pragma GCC diagnostic push
35#pragma GCC diagnostic ignored "-Wfloat-equal"
36#pragma GCC diagnostic ignored "-Wshadow"
37#pragma GCC diagnostic ignored "-Wconversion"
38#pragma GCC diagnostic ignored "-Wunused-variable"
39#endif
40
41#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
42#pragma warning push
43#pragma warning(disable : 1599)
44#endif
45
46using namespace std;
47
48namespace walberla {
49namespace pystencils {
50
51namespace internal_91e2c9bdb4c4fa8a405803890749bf98 {
52static FUNC_PREFIX void streamsweepdoubleprecisionavx_streamsweepdoubleprecisionavx(double *RESTRICT const _data_force, double *RESTRICT const _data_pdfs, double *RESTRICT _data_pdfs_tmp, double *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) {
53 for (int64_t ctr_2 = 1; ctr_2 < _size_force_2 - 1; ctr_2 += 1) {
54 double *RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2 * ctr_2;
55 double *RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3;
56 double *RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3;
57 double *RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3;
58 double *RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3;
59 double *RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 5 * _stride_pdfs_3;
60 double *RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 6 * _stride_pdfs_3;
61 double *RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3;
62 double *RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3;
63 double *RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3;
64 double *RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3;
65 double *RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 11 * _stride_pdfs_3;
66 double *RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 12 * _stride_pdfs_3;
67 double *RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 13 * _stride_pdfs_3;
68 double *RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2 * ctr_2 - _stride_pdfs_2 + 14 * _stride_pdfs_3;
69 double *RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 15 * _stride_pdfs_3;
70 double *RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 16 * _stride_pdfs_3;
71 double *RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 17 * _stride_pdfs_3;
72 double *RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2 * ctr_2 + _stride_pdfs_2 + 18 * _stride_pdfs_3;
73 double *RESTRICT _data_force_20_30 = _data_force + _stride_force_2 * ctr_2;
74 double *RESTRICT _data_force_20_31 = _data_force + _stride_force_2 * ctr_2 + _stride_force_3;
75 double *RESTRICT _data_force_20_32 = _data_force + _stride_force_2 * ctr_2 + 2 * _stride_force_3;
76 double *RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2 * ctr_2;
77 double *RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2 * ctr_2 + _stride_velocity_3;
78 double *RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2 * ctr_2 + 2 * _stride_velocity_3;
79 double *RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2;
80 double *RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + _stride_pdfs_tmp_3;
81 double *RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 2 * _stride_pdfs_tmp_3;
82 double *RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 3 * _stride_pdfs_tmp_3;
83 double *RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 4 * _stride_pdfs_tmp_3;
84 double *RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 5 * _stride_pdfs_tmp_3;
85 double *RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 6 * _stride_pdfs_tmp_3;
86 double *RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 7 * _stride_pdfs_tmp_3;
87 double *RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 8 * _stride_pdfs_tmp_3;
88 double *RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 9 * _stride_pdfs_tmp_3;
89 double *RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 10 * _stride_pdfs_tmp_3;
90 double *RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 11 * _stride_pdfs_tmp_3;
91 double *RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 12 * _stride_pdfs_tmp_3;
92 double *RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 13 * _stride_pdfs_tmp_3;
93 double *RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 14 * _stride_pdfs_tmp_3;
94 double *RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 15 * _stride_pdfs_tmp_3;
95 double *RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 16 * _stride_pdfs_tmp_3;
96 double *RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 17 * _stride_pdfs_tmp_3;
97 double *RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2 * ctr_2 + 18 * _stride_pdfs_tmp_3;
98 for (int64_t ctr_1 = 1; ctr_1 < _size_force_1 - 1; ctr_1 += 1) {
99 double *RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_30;
100 double *RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
101 double *RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
102 double *RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_33;
103 double *RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_20_34;
104 double *RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_35;
105 double *RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_36;
106 double *RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
107 double *RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
108 double *RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
109 double *RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
110 double *RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
111 double *RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
112 double *RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_313;
113 double *RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_2m1_314;
114 double *RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1 * ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
115 double *RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1 * ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
116 double *RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_317;
117 double *RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1 * ctr_1 + _data_pdfs_21_318;
118 double *RESTRICT _data_force_20_30_10 = _stride_force_1 * ctr_1 + _data_force_20_30;
119 double *RESTRICT _data_force_20_31_10 = _stride_force_1 * ctr_1 + _data_force_20_31;
120 double *RESTRICT _data_force_20_32_10 = _stride_force_1 * ctr_1 + _data_force_20_32;
121 double *RESTRICT _data_velocity_20_30_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_30;
122 double *RESTRICT _data_velocity_20_31_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_31;
123 double *RESTRICT _data_velocity_20_32_10 = _stride_velocity_1 * ctr_1 + _data_velocity_20_32;
124 double *RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_30;
125 double *RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_31;
126 double *RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_32;
127 double *RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_33;
128 double *RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_34;
129 double *RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_35;
130 double *RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_36;
131 double *RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_37;
132 double *RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_38;
133 double *RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_39;
134 double *RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_310;
135 double *RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_311;
136 double *RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_312;
137 double *RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_313;
138 double *RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_314;
139 double *RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_315;
140 double *RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_316;
141 double *RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_317;
142 double *RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1 * ctr_1 + _data_pdfs_tmp_20_318;
143 {
144 for (int64_t ctr_0 = 1; ctr_0 < (int64_t)((_size_force_0 - 2) / (4)) * (4) + 1; ctr_0 += 4) {
145 const __m256d streamed_0 = _mm256_load_pd(&_data_pdfs_20_30_10[ctr_0]);
146 const __m256d streamed_1 = _mm256_load_pd(&_data_pdfs_20_31_1m1[ctr_0]);
147 const __m256d streamed_2 = _mm256_load_pd(&_data_pdfs_20_32_11[ctr_0]);
148 const __m256d streamed_3 = _mm256_loadu_pd(&_data_pdfs_20_33_10[ctr_0 + 1]);
149 const __m256d streamed_4 = _mm256_loadu_pd(&_data_pdfs_20_34_10[ctr_0 - 1]);
150 const __m256d streamed_5 = _mm256_load_pd(&_data_pdfs_2m1_35_10[ctr_0]);
151 const __m256d streamed_6 = _mm256_load_pd(&_data_pdfs_21_36_10[ctr_0]);
152 const __m256d streamed_7 = _mm256_loadu_pd(&_data_pdfs_20_37_1m1[ctr_0 + 1]);
153 const __m256d streamed_8 = _mm256_loadu_pd(&_data_pdfs_20_38_1m1[ctr_0 - 1]);
154 const __m256d streamed_9 = _mm256_loadu_pd(&_data_pdfs_20_39_11[ctr_0 + 1]);
155 const __m256d streamed_10 = _mm256_loadu_pd(&_data_pdfs_20_310_11[ctr_0 - 1]);
156 const __m256d streamed_11 = _mm256_load_pd(&_data_pdfs_2m1_311_1m1[ctr_0]);
157 const __m256d streamed_12 = _mm256_load_pd(&_data_pdfs_2m1_312_11[ctr_0]);
158 const __m256d streamed_13 = _mm256_loadu_pd(&_data_pdfs_2m1_313_10[ctr_0 + 1]);
159 const __m256d streamed_14 = _mm256_loadu_pd(&_data_pdfs_2m1_314_10[ctr_0 - 1]);
160 const __m256d streamed_15 = _mm256_load_pd(&_data_pdfs_21_315_1m1[ctr_0]);
161 const __m256d streamed_16 = _mm256_load_pd(&_data_pdfs_21_316_11[ctr_0]);
162 const __m256d streamed_17 = _mm256_loadu_pd(&_data_pdfs_21_317_10[ctr_0 + 1]);
163 const __m256d streamed_18 = _mm256_loadu_pd(&_data_pdfs_21_318_10[ctr_0 - 1]);
164 const __m256d vel0Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_10, streamed_14), streamed_18), streamed_4), streamed_8);
165 const __m256d momdensity_0 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(streamed_13, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(streamed_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_3, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_7, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), vel0Term);
166 const __m256d vel1Term = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_1, streamed_11), streamed_15), streamed_7);
167 const __m256d momdensity_1 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(streamed_10, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(streamed_12, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_16, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_2, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_9, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), streamed_8), vel1Term);
168 const __m256d vel2Term = _mm256_add_pd(_mm256_add_pd(streamed_12, streamed_13), streamed_5);
169 const __m256d rho = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(streamed_0, streamed_16), streamed_17), streamed_2), streamed_3), streamed_6), streamed_9), vel0Term), vel1Term), vel2Term);
170 const __m256d momdensity_2 = _mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_add_pd(_mm256_mul_pd(streamed_15, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0)), _mm256_mul_pd(streamed_16, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_17, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_18, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), _mm256_mul_pd(streamed_6, _mm256_set_pd(-1.0, -1.0, -1.0, -1.0))), streamed_11), streamed_14), vel2Term);
171 const __m256d u_0 = _mm256_add_pd(_mm256_mul_pd(momdensity_0, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_30_10[ctr_0])));
172 const __m256d u_1 = _mm256_add_pd(_mm256_mul_pd(momdensity_1, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_31_10[ctr_0])));
173 const __m256d u_2 = _mm256_add_pd(_mm256_mul_pd(momdensity_2, _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_mul_pd(_mm256_mul_pd(_mm256_set_pd(0.5, 0.5, 0.5, 0.5), _mm256_div_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), rho)), _mm256_load_pd(&_data_force_20_32_10[ctr_0])));
174 _mm256_store_pd(&_data_velocity_20_30_10[ctr_0], u_0);
175 _mm256_store_pd(&_data_velocity_20_31_10[ctr_0], u_1);
176 _mm256_store_pd(&_data_velocity_20_32_10[ctr_0], u_2);
177 _mm256_store_pd(&_data_pdfs_tmp_20_30_10[ctr_0], streamed_0);
178 _mm256_store_pd(&_data_pdfs_tmp_20_31_10[ctr_0], streamed_1);
179 _mm256_store_pd(&_data_pdfs_tmp_20_32_10[ctr_0], streamed_2);
180 _mm256_store_pd(&_data_pdfs_tmp_20_33_10[ctr_0], streamed_3);
181 _mm256_store_pd(&_data_pdfs_tmp_20_34_10[ctr_0], streamed_4);
182 _mm256_store_pd(&_data_pdfs_tmp_20_35_10[ctr_0], streamed_5);
183 _mm256_store_pd(&_data_pdfs_tmp_20_36_10[ctr_0], streamed_6);
184 _mm256_store_pd(&_data_pdfs_tmp_20_37_10[ctr_0], streamed_7);
185 _mm256_store_pd(&_data_pdfs_tmp_20_38_10[ctr_0], streamed_8);
186 _mm256_store_pd(&_data_pdfs_tmp_20_39_10[ctr_0], streamed_9);
187 _mm256_store_pd(&_data_pdfs_tmp_20_310_10[ctr_0], streamed_10);
188 _mm256_store_pd(&_data_pdfs_tmp_20_311_10[ctr_0], streamed_11);
189 _mm256_store_pd(&_data_pdfs_tmp_20_312_10[ctr_0], streamed_12);
190 _mm256_store_pd(&_data_pdfs_tmp_20_313_10[ctr_0], streamed_13);
191 _mm256_store_pd(&_data_pdfs_tmp_20_314_10[ctr_0], streamed_14);
192 _mm256_store_pd(&_data_pdfs_tmp_20_315_10[ctr_0], streamed_15);
193 _mm256_store_pd(&_data_pdfs_tmp_20_316_10[ctr_0], streamed_16);
194 _mm256_store_pd(&_data_pdfs_tmp_20_317_10[ctr_0], streamed_17);
195 _mm256_store_pd(&_data_pdfs_tmp_20_318_10[ctr_0], streamed_18);
196 }
197 for (int64_t ctr_0 = (int64_t)((_size_force_0 - 2) / (4)) * (4) + 1; ctr_0 < _size_force_0 - 1; ctr_0 += 1) {
198 const double streamed_0 = _data_pdfs_20_30_10[ctr_0];
199 const double streamed_1 = _data_pdfs_20_31_1m1[ctr_0];
200 const double streamed_2 = _data_pdfs_20_32_11[ctr_0];
201 const double streamed_3 = _data_pdfs_20_33_10[ctr_0 + 1];
202 const double streamed_4 = _data_pdfs_20_34_10[ctr_0 - 1];
203 const double streamed_5 = _data_pdfs_2m1_35_10[ctr_0];
204 const double streamed_6 = _data_pdfs_21_36_10[ctr_0];
205 const double streamed_7 = _data_pdfs_20_37_1m1[ctr_0 + 1];
206 const double streamed_8 = _data_pdfs_20_38_1m1[ctr_0 - 1];
207 const double streamed_9 = _data_pdfs_20_39_11[ctr_0 + 1];
208 const double streamed_10 = _data_pdfs_20_310_11[ctr_0 - 1];
209 const double streamed_11 = _data_pdfs_2m1_311_1m1[ctr_0];
210 const double streamed_12 = _data_pdfs_2m1_312_11[ctr_0];
211 const double streamed_13 = _data_pdfs_2m1_313_10[ctr_0 + 1];
212 const double streamed_14 = _data_pdfs_2m1_314_10[ctr_0 - 1];
213 const double streamed_15 = _data_pdfs_21_315_1m1[ctr_0];
214 const double streamed_16 = _data_pdfs_21_316_11[ctr_0];
215 const double streamed_17 = _data_pdfs_21_317_10[ctr_0 + 1];
216 const double streamed_18 = _data_pdfs_21_318_10[ctr_0 - 1];
217 const double vel0Term = streamed_10 + streamed_14 + streamed_18 + streamed_4 + streamed_8;
218 const double momdensity_0 = streamed_13 * -1.0 + streamed_17 * -1.0 + streamed_3 * -1.0 + streamed_7 * -1.0 + streamed_9 * -1.0 + vel0Term;
219 const double vel1Term = streamed_1 + streamed_11 + streamed_15 + streamed_7;
220 const double momdensity_1 = streamed_10 * -1.0 + streamed_12 * -1.0 + streamed_16 * -1.0 + streamed_2 * -1.0 + streamed_8 + streamed_9 * -1.0 + vel1Term;
221 const double vel2Term = streamed_12 + streamed_13 + streamed_5;
222 const double rho = streamed_0 + streamed_16 + streamed_17 + streamed_2 + streamed_3 + streamed_6 + streamed_9 + vel0Term + vel1Term + vel2Term;
223 const double momdensity_2 = streamed_11 + streamed_14 + streamed_15 * -1.0 + streamed_16 * -1.0 + streamed_17 * -1.0 + streamed_18 * -1.0 + streamed_6 * -1.0 + vel2Term;
224 const double u_0 = momdensity_0 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_30_10[ctr_0];
225 const double u_1 = momdensity_1 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_31_10[ctr_0];
226 const double u_2 = momdensity_2 * ((1.0) / (rho)) + 0.5 * ((1.0) / (rho)) * _data_force_20_32_10[ctr_0];
227 _data_velocity_20_30_10[ctr_0] = u_0;
228 _data_velocity_20_31_10[ctr_0] = u_1;
229 _data_velocity_20_32_10[ctr_0] = u_2;
230 _data_pdfs_tmp_20_30_10[ctr_0] = streamed_0;
231 _data_pdfs_tmp_20_31_10[ctr_0] = streamed_1;
232 _data_pdfs_tmp_20_32_10[ctr_0] = streamed_2;
233 _data_pdfs_tmp_20_33_10[ctr_0] = streamed_3;
234 _data_pdfs_tmp_20_34_10[ctr_0] = streamed_4;
235 _data_pdfs_tmp_20_35_10[ctr_0] = streamed_5;
236 _data_pdfs_tmp_20_36_10[ctr_0] = streamed_6;
237 _data_pdfs_tmp_20_37_10[ctr_0] = streamed_7;
238 _data_pdfs_tmp_20_38_10[ctr_0] = streamed_8;
239 _data_pdfs_tmp_20_39_10[ctr_0] = streamed_9;
240 _data_pdfs_tmp_20_310_10[ctr_0] = streamed_10;
241 _data_pdfs_tmp_20_311_10[ctr_0] = streamed_11;
242 _data_pdfs_tmp_20_312_10[ctr_0] = streamed_12;
243 _data_pdfs_tmp_20_313_10[ctr_0] = streamed_13;
244 _data_pdfs_tmp_20_314_10[ctr_0] = streamed_14;
245 _data_pdfs_tmp_20_315_10[ctr_0] = streamed_15;
246 _data_pdfs_tmp_20_316_10[ctr_0] = streamed_16;
247 _data_pdfs_tmp_20_317_10[ctr_0] = streamed_17;
248 _data_pdfs_tmp_20_318_10[ctr_0] = streamed_18;
249 }
250 }
251 }
252 }
253}
254} // namespace internal_91e2c9bdb4c4fa8a405803890749bf98
255
257 auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
258 auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
259 auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
260 field::GhostLayerField<double, 19> *pdfs_tmp;
261 {
262 // Getting temporary field pdfs_tmp
263 auto it = cache_pdfs_.find(pdfs);
264 if (it != cache_pdfs_.end()) {
265 pdfs_tmp = *it;
266 } else {
267 pdfs_tmp = pdfs->cloneUninitialized();
268 cache_pdfs_.insert(pdfs_tmp);
269 }
270 }
271
272 WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(force->nrOfGhostLayers()));
273 double *RESTRICT const _data_force = force->dataAt(-1, -1, -1, 0);
274 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
275 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
276 WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers()));
277 double *RESTRICT const _data_pdfs = pdfs->dataAt(-1, -1, -1, 0);
278 WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
279 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
280 WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers()));
281 double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0);
282 WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
283 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
284 WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(velocity->nrOfGhostLayers()));
285 double *RESTRICT _data_velocity = velocity->dataAt(-1, -1, -1, 0);
286 WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
287 WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
288 WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(force->xSize()) + 2));
289 const int64_t _size_force_0 = int64_t(cell_idx_c(force->xSize()) + 2);
290 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
291 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
292 WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(force->ySize()) + 2));
293 const int64_t _size_force_1 = int64_t(cell_idx_c(force->ySize()) + 2);
294 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
295 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
296 WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(force->zSize()) + 2));
297 const int64_t _size_force_2 = int64_t(cell_idx_c(force->zSize()) + 2);
298 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
299 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
300 const int64_t _stride_force_1 = int64_t(force->yStride());
301 const int64_t _stride_force_2 = int64_t(force->zStride());
302 const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
303 const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
304 const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
305 const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
306 const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
307 const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
308 const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
309 const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
310 const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
311 const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
312 internal_91e2c9bdb4c4fa8a405803890749bf98::streamsweepdoubleprecisionavx_streamsweepdoubleprecisionavx(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
313 pdfs->swapDataPointers(pdfs_tmp);
314}
315
316void StreamSweepDoublePrecisionAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
317 CellInterval ci = globalCellInterval;
318 CellInterval blockBB = blocks->getBlockCellBB(*block);
319 blockBB.expand(ghostLayers);
320 ci.intersect(blockBB);
321 blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
322 if (ci.empty())
323 return;
324
325 auto force = block->getData<field::GhostLayerField<double, 3>>(forceID);
326 auto pdfs = block->getData<field::GhostLayerField<double, 19>>(pdfsID);
327 auto velocity = block->getData<field::GhostLayerField<double, 3>>(velocityID);
328 field::GhostLayerField<double, 19> *pdfs_tmp;
329 {
330 // Getting temporary field pdfs_tmp
331 auto it = cache_pdfs_.find(pdfs);
332 if (it != cache_pdfs_.end()) {
333 pdfs_tmp = *it;
334 } else {
335 pdfs_tmp = pdfs->cloneUninitialized();
336 cache_pdfs_.insert(pdfs_tmp);
337 }
338 }
339
340 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(force->nrOfGhostLayers()));
341 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(force->nrOfGhostLayers()));
342 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(force->nrOfGhostLayers()));
343 double *RESTRICT const _data_force = force->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
344 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
345 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
346 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
347 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
348 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers()));
349 double *RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
350 WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx);
351 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0);
352 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
353 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
354 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers()));
355 double *RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
356 WALBERLA_ASSERT_EQUAL(pdfs_tmp->layout(), field::fzyx);
357 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs_tmp->dataAt(0, 0, 0, 0) % 32, 0);
358 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(velocity->nrOfGhostLayers()));
359 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(velocity->nrOfGhostLayers()));
360 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(velocity->nrOfGhostLayers()));
361 double *RESTRICT _data_velocity = velocity->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0);
362 WALBERLA_ASSERT_EQUAL(velocity->layout(), field::fzyx);
363 WALBERLA_ASSERT_EQUAL((uintptr_t)velocity->dataAt(0, 0, 0, 0) % 32, 0);
364 WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2));
365 const int64_t _size_force_0 = int64_t(cell_idx_c(ci.xSize()) + 2);
366 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
367 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
368 WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2));
369 const int64_t _size_force_1 = int64_t(cell_idx_c(ci.ySize()) + 2);
370 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
371 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
372 WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2));
373 const int64_t _size_force_2 = int64_t(cell_idx_c(ci.zSize()) + 2);
374 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx);
375 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0);
376 const int64_t _stride_force_1 = int64_t(force->yStride());
377 const int64_t _stride_force_2 = int64_t(force->zStride());
378 const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
379 const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
380 const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
381 const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
382 const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
383 const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
384 const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
385 const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
386 const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
387 const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
388 internal_91e2c9bdb4c4fa8a405803890749bf98::streamsweepdoubleprecisionavx_streamsweepdoubleprecisionavx(_data_force, _data_pdfs, _data_pdfs_tmp, _data_velocity, _size_force_0, _size_force_1, _size_force_2, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
389 pdfs->swapDataPointers(pdfs_tmp);
390}
391
392} // namespace pystencils
393} // namespace walberla
394
395#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
396#pragma GCC diagnostic pop
397#endif
398
399#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
400#pragma warning pop
401#endif
#define FUNC_PREFIX
\file AdvectiveFluxKernel_double_precision.cpp \ingroup lbm \author lbmpy
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
__global__ float * force
void runOnCellInterval(const shared_ptr< StructuredBlockStorage > &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block)
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:174
static FUNC_PREFIX void streamsweepdoubleprecisionavx_streamsweepdoubleprecisionavx(double *RESTRICT const _data_force, double *RESTRICT const _data_pdfs, double *RESTRICT _data_pdfs_tmp, double *RESTRICT _data_velocity, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3)
static Utils::Vector3d velocity(Particle const &p_ref, Particle const &p_vs)
Velocity of the virtual site.
Definition relative.cpp:64