Loading [MathJax]/extensions/TeX/AMSmath.js
ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages Concepts
CollideSweepSinglePrecisionThermalizedAVX.cpp
Go to the documentation of this file.
1//======================================================================================================================
2//
3// This file is part of waLBerla. waLBerla is free software: you can
4// redistribute it and/or modify it under the terms of the GNU General Public
5// License as published by the Free Software Foundation, either version 3 of
6// the License, or (at your option) any later version.
7//
8// waLBerla is distributed in the hope that it will be useful, but WITHOUT
9// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11// for more details.
12//
13// You should have received a copy of the GNU General Public License along
14// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
15//
16//! \\file CollideSweepSinglePrecisionThermalizedAVX.cpp
17//! \\author pystencils
18//======================================================================================================================
19
20// kernel generated with pystencils v1.3.3, lbmpy v1.3.3, lbmpy_walberla/pystencils_walberla from waLBerla commit b0842e1a493ce19ef1bbb8d2cf382fc343970a7f
21
22#include <cmath>
23
25#include "core/DataTypes.h"
26#include "core/Macros.h"
27
28#include "philox_rand.h"
29
30#include <immintrin.h>
31
32#define FUNC_PREFIX
33
34#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
35#pragma GCC diagnostic push
36#pragma GCC diagnostic ignored "-Wfloat-equal"
37#pragma GCC diagnostic ignored "-Wshadow"
38#pragma GCC diagnostic ignored "-Wconversion"
39#pragma GCC diagnostic ignored "-Wunused-variable"
40#endif
41
42#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
43#pragma warning push
44#pragma warning(disable : 1599)
45#endif
46
47using namespace std;
48
49namespace walberla {
50namespace pystencils {
51
52namespace internal_48c9ee502281a70505dce0378c55abd5 {
53static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsingleprecisionthermalizedavx(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, uint32_t block_offset_0, uint32_t block_offset_1, uint32_t block_offset_2, float kT, float omega_bulk, float omega_even, float omega_odd, float omega_shear, uint32_t seed, uint32_t time_step) {
54 const float xi_28 = omega_bulk * 0.5f;
55 const float xi_55 = omega_shear * 0.041666666666666664f;
56 const float xi_60 = omega_bulk * 0.041666666666666664f;
57 const float xi_71 = omega_shear * 0.125f;
58 const float xi_109 = 2.4494897427831779f;
59 const float xi_134 = omega_odd * 0.25f;
60 const float xi_145 = omega_odd * 0.083333333333333329f;
61 const float xi_198 = omega_shear * 0.25f;
62 const float xi_211 = omega_odd * 0.041666666666666664f;
63 const float xi_213 = omega_odd * 0.125f;
64 const float rr_0 = 0.0f;
65 const float xi_53 = rr_0 * 0.041666666666666664f;
66 for (int64_t ctr_2 = 0; ctr_2 < _size_force_2; ctr_2 += 1) {
67 for (int64_t ctr_1 = 0; ctr_1 < _size_force_1; ctr_1 += 1) {
68 {
69 for (int64_t ctr_0 = 0; ctr_0 < (int64_t)((_size_force_0) / (8)) * (8); ctr_0 += 8) {
70 const __m256 xi_244 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0]);
71 const __m256 xi_245 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0]);
72 const __m256 xi_246 = _mm256_load_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0]);
73 const __m256 xi_247 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0]);
74 const __m256 xi_248 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0]);
75 const __m256 xi_249 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0]);
76 const __m256 xi_250 = _mm256_load_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0]);
77 const __m256 xi_251 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0]);
78 const __m256 xi_252 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0]);
79 const __m256 xi_253 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0]);
80 const __m256 xi_254 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0]);
81 const __m256 xi_255 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0]);
82 const __m256 xi_256 = _mm256_loadu_ps(&_data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0]);
83 const __m256 xi_257 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0]);
84 const __m256 xi_258 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0]);
85 const __m256 xi_259 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0]);
86 const __m256 xi_260 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0]);
87 const __m256 xi_261 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0]);
88 const __m256 xi_262 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0]);
89 const __m256 xi_263 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0]);
90 const __m256 xi_264 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0]);
91 const __m256 xi_265 = _mm256_loadu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0]);
92
93 __m256 random_3_0{};
94 __m256 random_3_1{};
95 __m256 random_3_2{};
96 __m256 random_3_3{};
97 if (kT > 0.) {
98 philox_float4(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 3, seed, random_3_0, random_3_1, random_3_2, random_3_3);
99 }
100
101 __m256 random_2_0{};
102 __m256 random_2_1{};
103 __m256 random_2_2{};
104 __m256 random_2_3{};
105 if (kT > 0.) {
106 philox_float4(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 2, seed, random_2_0, random_2_1, random_2_2, random_2_3);
107 }
108
109 __m256 random_1_0{};
110 __m256 random_1_1{};
111 __m256 random_1_2{};
112 __m256 random_1_3{};
113 if (kT > 0.) {
114 philox_float4(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 1, seed, random_1_0, random_1_1, random_1_2, random_1_3);
115 }
116
117 __m256 random_0_0{};
118 __m256 random_0_1{};
119 __m256 random_0_2{};
120 __m256 random_0_3{};
121 if (kT > 0.) {
122 philox_float4(time_step, _mm256_add_epi32(_mm256_add_epi32(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), _mm256_set_epi32(ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0, ctr_0)), _mm256_set_epi32(((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)), ((int64_t)(block_offset_0)))), block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3);
123 }
124 const __m256 xi_2 = _mm256_add_ps(xi_257, xi_262);
125 const __m256 xi_3 = _mm256_add_ps(xi_2, xi_265);
126 const __m256 xi_4 = _mm256_add_ps(_mm256_add_ps(xi_251, xi_254), xi_264);
127 const __m256 xi_5 = _mm256_add_ps(xi_253, xi_263);
128 const __m256 xi_6 = _mm256_add_ps(xi_245, xi_260);
129 const __m256 xi_8 = _mm256_mul_ps(xi_261, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
130 const __m256 xi_9 = _mm256_mul_ps(xi_255, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
131 const __m256 xi_10 = _mm256_mul_ps(xi_245, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
132 const __m256 xi_11 = _mm256_mul_ps(xi_259, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
133 const __m256 xi_12 = _mm256_mul_ps(xi_248, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
134 const __m256 xi_13 = _mm256_add_ps(_mm256_add_ps(xi_10, xi_11), xi_12);
135 const __m256 xi_14 = _mm256_mul_ps(xi_252, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
136 const __m256 xi_15 = _mm256_mul_ps(xi_258, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
137 const __m256 xi_16 = _mm256_add_ps(xi_14, xi_15);
138 const __m256 xi_17 = _mm256_mul_ps(xi_247, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
139 const __m256 xi_18 = _mm256_mul_ps(xi_253, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
140 const __m256 xi_19 = _mm256_add_ps(xi_17, xi_18);
141 const __m256 xi_20 = _mm256_mul_ps(xi_257, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
142 const __m256 xi_21 = _mm256_add_ps(xi_10, xi_20);
143 const __m256 xi_22 = _mm256_mul_ps(xi_254, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
144 const __m256 xi_23 = _mm256_mul_ps(xi_260, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
145 const __m256 xi_24 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_17, xi_22), xi_23), xi_264);
146 const __m256 xi_29 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
147 const __m256 xi_30 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
148 const __m256 xi_42 = _mm256_mul_ps(xi_246, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
149 const __m256 xi_43 = _mm256_mul_ps(xi_246, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
150 const __m256 xi_49 = _mm256_mul_ps(xi_256, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
151 const __m256 xi_50 = _mm256_mul_ps(xi_256, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
152 const __m256 xi_67 = _mm256_mul_ps(xi_244, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
153 const __m256 xi_72 = _mm256_mul_ps(xi_244, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
154 const __m256 xi_114 = _mm256_mul_ps(xi_250, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
155 const __m256 xi_118 = _mm256_mul_ps(xi_264, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
156 const __m256 xi_119 = _mm256_add_ps(xi_118, xi_18);
157 const __m256 xi_120 = _mm256_add_ps(_mm256_mul_ps(xi_249, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_8);
158 const __m256 xi_122 = _mm256_mul_ps(xi_262, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
159 const __m256 xi_123 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_11, xi_122), xi_15), xi_21);
160 const __m256 xi_125 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_247, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_253, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_254, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f))), _mm256_mul_ps(xi_264, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
161 const __m256 xi_126 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_248, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f)), _mm256_mul_ps(xi_265, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), xi_125);
162 const __m256 xi_128 = _mm256_mul_ps(xi_259, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
163 const __m256 xi_129 = _mm256_mul_ps(xi_262, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
164 const __m256 xi_130 = _mm256_add_ps(_mm256_mul_ps(xi_245, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_257, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)));
165 const __m256 xi_132 = _mm256_add_ps(xi_118, xi_253);
166 const __m256 xi_133 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_132, xi_14), xi_22), xi_247), xi_251);
167 const __m256 xi_135 = _mm256_mul_ps(xi_133, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
168 const __m256 xi_136 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_3);
169 const __m256 xi_141 = _mm256_mul_ps(xi_255, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
170 const __m256 xi_142 = _mm256_mul_ps(xi_258, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
171 const __m256 xi_143 = _mm256_add_ps(_mm256_mul_ps(xi_261, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_249, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f)));
172 const __m256 xi_144 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_141, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_14), xi_142), xi_143), xi_19), xi_4);
173 const __m256 xi_146 = _mm256_mul_ps(xi_144, _mm256_set_ps(xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145));
174 const __m256 xi_147 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_1_2);
175 const __m256 xi_152 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_0_1);
176 const __m256 xi_166 = _mm256_add_ps(xi_122, xi_259);
177 const __m256 xi_167 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_12, xi_166), xi_20), xi_245), xi_265);
178 const __m256 xi_168 = _mm256_mul_ps(xi_167, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
179 const __m256 xi_169 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_1);
180 const __m256 xi_171 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_142, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_13), xi_141), xi_143), xi_3);
181 const __m256 xi_172 = _mm256_mul_ps(xi_171, _mm256_set_ps(xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145));
182 const __m256 xi_173 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_0);
183 const __m256 xi_178 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_119, xi_23), xi_247), xi_254), xi_263);
184 const __m256 xi_179 = _mm256_mul_ps(xi_178, _mm256_set_ps(xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134, xi_134));
185 const __m256 xi_180 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_2_2);
186 const __m256 xi_182 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_128, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_129, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_130), xi_24), xi_5);
187 const __m256 xi_183 = _mm256_mul_ps(xi_182, _mm256_set_ps(xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145, xi_145));
188 const __m256 xi_184 = _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_1_3);
189 const __m256 xi_212 = _mm256_mul_ps(xi_182, _mm256_set_ps(xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211));
190 const __m256 xi_214 = _mm256_mul_ps(xi_178, _mm256_set_ps(xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213));
191 const __m256 xi_220 = _mm256_mul_ps(xi_144, _mm256_set_ps(xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211));
192 const __m256 xi_221 = _mm256_mul_ps(xi_133, _mm256_set_ps(xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213));
193 const __m256 xi_235 = _mm256_mul_ps(xi_167, _mm256_set_ps(xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213, xi_213));
194 const __m256 xi_236 = _mm256_mul_ps(xi_171, _mm256_set_ps(xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211, xi_211));
195 const __m256 xi_31 = _mm256_mul_ps(xi_30, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
196 const __m256 xi_44 = _mm256_mul_ps(xi_43, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
197 const __m256 xi_51 = _mm256_mul_ps(xi_50, _mm256_set_ps(rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0, rr_0));
198 const __m256 xi_54 = _mm256_mul_ps(xi_246, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
199 const __m256 xi_59 = _mm256_mul_ps(xi_244, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
200 const __m256 xi_81 = _mm256_mul_ps(xi_256, _mm256_set_ps(xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53, xi_53));
201 const __m256 vel0Term = _mm256_add_ps(_mm256_add_ps(xi_249, xi_258), xi_3);
202 const __m256 vel1Term = _mm256_add_ps(xi_255, xi_4);
203 const __m256 vel2Term = _mm256_add_ps(xi_259, xi_5);
204 const __m256 rho = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, vel1Term), vel2Term), xi_247), xi_248), xi_250), xi_252), xi_261), xi_6);
205 const __m256 xi_105 = _mm256_mul_ps(rho, _mm256_set_ps(kT, kT, kT, kT, kT, kT, kT, kT));
206 const __m256 xi_106 = _mm256_sqrt_ps(_mm256_mul_ps(xi_105, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))));
207 const __m256 xi_107 = _mm256_mul_ps(_mm256_mul_ps(xi_106, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_3_0)), _mm256_set_ps(3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f, 3.7416573867739413f));
208 const __m256 xi_108 = _mm256_mul_ps(_mm256_mul_ps(xi_106, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_3_2)), _mm256_set_ps(5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f, 5.4772255750516612f));
209 const __m256 xi_110 = _mm256_mul_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_1_1), _mm256_set_ps(xi_109, xi_109, xi_109, xi_109, xi_109, xi_109, xi_109, xi_109)), _mm256_sqrt_ps(_mm256_mul_ps(xi_105, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))));
210 const __m256 xi_111 = _mm256_mul_ps(_mm256_mul_ps(xi_106, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_3_1)), _mm256_set_ps(8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f, 8.3666002653407556f));
211 const __m256 xi_137 = _mm256_sqrt_ps(_mm256_mul_ps(xi_105, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd, omega_odd)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))));
212 const __m256 xi_138 = _mm256_mul_ps(xi_137, _mm256_set_ps(1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f, 1.4142135623730951f));
213 const __m256 xi_139 = _mm256_mul_ps(xi_138, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f));
214 const __m256 xi_140 = _mm256_mul_ps(xi_136, xi_139);
215 const __m256 xi_148 = _mm256_mul_ps(xi_137, _mm256_set_ps(xi_109, xi_109, xi_109, xi_109, xi_109, xi_109, xi_109, xi_109));
216 const __m256 xi_149 = _mm256_mul_ps(xi_148, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
217 const __m256 xi_150 = _mm256_mul_ps(xi_147, xi_149);
218 const __m256 xi_151 = _mm256_add_ps(_mm256_mul_ps(xi_146, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_150, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
219 const __m256 xi_153 = _mm256_sqrt_ps(_mm256_mul_ps(xi_105, _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)), _mm256_add_ps(_mm256_mul_ps(_mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear)), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)))), _mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f))));
220 const __m256 xi_154 = _mm256_mul_ps(xi_153, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f));
221 const __m256 xi_155 = _mm256_mul_ps(xi_152, xi_154);
222 const __m256 xi_161 = _mm256_mul_ps(_mm256_mul_ps(xi_153, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_0_0)), _mm256_set_ps(1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f, 1.7320508075688772f));
223 const __m256 xi_165 = _mm256_add_ps(xi_146, xi_150);
224 const __m256 xi_170 = _mm256_mul_ps(xi_139, xi_169);
225 const __m256 xi_174 = _mm256_mul_ps(xi_149, xi_173);
226 const __m256 xi_175 = _mm256_add_ps(xi_172, xi_174);
227 const __m256 xi_177 = _mm256_add_ps(_mm256_mul_ps(xi_172, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_174, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
228 const __m256 xi_181 = _mm256_mul_ps(xi_139, xi_180);
229 const __m256 xi_185 = _mm256_mul_ps(xi_149, xi_184);
230 const __m256 xi_186 = _mm256_add_ps(_mm256_mul_ps(xi_183, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_185, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
231 const __m256 xi_188 = _mm256_add_ps(xi_183, xi_185);
232 const __m256 xi_189 = _mm256_mul_ps(_mm256_mul_ps(xi_152, xi_153), _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
233 const __m256 xi_192 = _mm256_mul_ps(xi_107, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
234 const __m256 xi_196 = _mm256_mul_ps(xi_154, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_0_2));
235 const __m256 xi_203 = _mm256_mul_ps(xi_154, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_1_0));
236 const __m256 xi_207 = _mm256_mul_ps(xi_111, _mm256_set_ps(-0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f, -0.014285714285714285f));
237 const __m256 xi_208 = _mm256_mul_ps(xi_108, _mm256_set_ps(0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f, 0.050000000000000003f));
238 const __m256 xi_215 = _mm256_mul_ps(xi_148, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
239 const __m256 xi_216 = _mm256_mul_ps(xi_184, xi_215);
240 const __m256 xi_217 = _mm256_mul_ps(xi_138, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
241 const __m256 xi_218 = _mm256_mul_ps(xi_180, xi_217);
242 const __m256 xi_219 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_212, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_216, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_214), xi_218);
243 const __m256 xi_222 = _mm256_mul_ps(xi_147, xi_215);
244 const __m256 xi_223 = _mm256_mul_ps(xi_136, xi_217);
245 const __m256 xi_224 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_220, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_222, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_221), xi_223);
246 const __m256 xi_225 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_221, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_223, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_220), xi_222);
247 const __m256 xi_227 = _mm256_mul_ps(xi_189, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
248 const __m256 xi_230 = _mm256_mul_ps(xi_111, _mm256_set_ps(0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f, 0.035714285714285712f));
249 const __m256 xi_232 = _mm256_mul_ps(xi_154, _mm256_add_ps(_mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f), random_0_3));
250 const __m256 xi_237 = _mm256_mul_ps(xi_169, xi_217);
251 const __m256 xi_238 = _mm256_mul_ps(xi_173, xi_215);
252 const __m256 xi_239 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_235, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_237, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_236), xi_238);
253 const __m256 xi_241 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_236, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_238, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_235), xi_237);
254 const __m256 xi_242 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_214, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_218, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_212), xi_216);
255 const __m256 xi_0 = _mm256_div_ps(_mm256_set_ps(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f), rho);
256 const __m256 xi_7 = _mm256_mul_ps(xi_0, _mm256_set_ps(0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f));
257 const __m256 u_0 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel0Term, xi_13), xi_8), xi_9)), _mm256_mul_ps(xi_246, xi_7));
258 const __m256 xi_25 = _mm256_mul_ps(u_0, xi_246);
259 const __m256 xi_37 = _mm256_mul_ps(xi_25, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
260 const __m256 xi_38 = _mm256_mul_ps(xi_25, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
261 const __m256 xi_39 = _mm256_mul_ps(xi_38, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
262 const __m256 xi_40 = _mm256_add_ps(_mm256_mul_ps(xi_37, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_39);
263 const __m256 xi_56 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(xi_25, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55)), xi_37);
264 const __m256 xi_57 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_43, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_54), xi_56);
265 const __m256 xi_61 = _mm256_mul_ps(_mm256_mul_ps(xi_25, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60));
266 const __m256 xi_68 = _mm256_mul_ps(u_0, xi_67);
267 const __m256 xi_73 = _mm256_mul_ps(u_0, xi_72);
268 const __m256 xi_77 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_54, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_43), xi_56);
269 const __m256 xi_84 = _mm256_mul_ps(xi_38, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
270 const __m256 xi_95 = _mm256_mul_ps(u_0, xi_256);
271 const __m256 xi_96 = _mm256_mul_ps(xi_95, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
272 const __m256 xi_99 = _mm256_mul_ps(xi_95, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
273 const __m256 xi_113 = _mm256_mul_ps(rho, _mm256_mul_ps(u_0, u_0));
274 const __m256 u_1 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel1Term, xi_16), xi_19), xi_249), xi_8)), _mm256_mul_ps(xi_244, xi_7));
275 const __m256 xi_26 = _mm256_mul_ps(u_1, xi_244);
276 const __m256 xi_32 = _mm256_mul_ps(xi_26, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
277 const __m256 xi_45 = _mm256_mul_ps(xi_26, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
278 const __m256 xi_46 = _mm256_mul_ps(xi_45, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
279 const __m256 xi_47 = _mm256_add_ps(_mm256_mul_ps(xi_32, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_46);
280 const __m256 xi_62 = _mm256_mul_ps(_mm256_mul_ps(xi_26, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60));
281 const __m256 xi_69 = _mm256_mul_ps(u_1, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
282 const __m256 xi_70 = _mm256_mul_ps(xi_246, xi_69);
283 const __m256 xi_74 = _mm256_mul_ps(u_1, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
284 const __m256 xi_75 = _mm256_mul_ps(xi_246, xi_74);
285 const __m256 xi_76 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_68, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_70, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_73), xi_75);
286 const __m256 xi_78 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_73, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_75, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_68), xi_70);
287 const __m256 xi_86 = _mm256_mul_ps(xi_256, xi_69);
288 const __m256 xi_88 = _mm256_mul_ps(xi_256, xi_74);
289 const __m256 xi_93 = _mm256_mul_ps(xi_45, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
290 const __m256 xi_112 = _mm256_mul_ps(rho, _mm256_mul_ps(u_1, u_1));
291 const __m256 xi_121 = _mm256_add_ps(_mm256_add_ps(xi_112, xi_120), xi_9);
292 const __m256 xi_197 = _mm256_mul_ps(rho, u_1);
293 const __m256 xi_199 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_0, xi_197), xi_120), xi_255), xi_258), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
294 const __m256 xi_200 = _mm256_add_ps(_mm256_mul_ps(xi_196, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_199, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
295 const __m256 xi_201 = _mm256_add_ps(xi_196, xi_199);
296 const __m256 u_2 = _mm256_add_ps(_mm256_mul_ps(xi_0, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vel2Term, xi_21), xi_24), xi_262)), _mm256_mul_ps(xi_256, xi_7));
297 const __m256 xi_27 = _mm256_mul_ps(u_2, xi_256);
298 const __m256 xi_33 = _mm256_mul_ps(xi_27, _mm256_set_ps(0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f, 0.16666666666666666f));
299 const __m256 xi_34 = _mm256_mul_ps(xi_27, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f));
300 const __m256 xi_35 = _mm256_mul_ps(xi_34, _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
301 const __m256 xi_36 = _mm256_add_ps(_mm256_mul_ps(xi_33, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_35);
302 const __m256 xi_41 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_26, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f)), _mm256_mul_ps(_mm256_mul_ps(xi_32, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), xi_36), xi_40);
303 const __m256 xi_48 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_25, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f)), _mm256_mul_ps(_mm256_mul_ps(xi_37, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), xi_36), xi_47);
304 const __m256 xi_52 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_27, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f)), _mm256_mul_ps(_mm256_mul_ps(xi_33, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear))), xi_40), xi_47);
305 const __m256 xi_58 = _mm256_mul_ps(xi_34, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
306 const __m256 xi_63 = _mm256_mul_ps(_mm256_mul_ps(xi_27, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60, xi_60));
307 const __m256 xi_64 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(xi_26, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55)), xi_32), xi_61), xi_62), xi_63);
308 const __m256 xi_65 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_59, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_30), xi_64);
309 const __m256 xi_66 = _mm256_add_ps(_mm256_add_ps(xi_35, xi_58), xi_65);
310 const __m256 xi_79 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_30, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_59), xi_64);
311 const __m256 xi_80 = _mm256_add_ps(_mm256_add_ps(xi_35, xi_58), xi_79);
312 const __m256 xi_82 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(xi_27, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_set_ps(xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55, xi_55)), xi_33);
313 const __m256 xi_83 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_81, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_50), xi_82);
314 const __m256 xi_85 = _mm256_add_ps(_mm256_add_ps(xi_39, xi_65), xi_84);
315 const __m256 xi_87 = _mm256_mul_ps(u_2, xi_67);
316 const __m256 xi_89 = _mm256_mul_ps(u_2, xi_72);
317 const __m256 xi_90 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_88, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_89, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_86), xi_87);
318 const __m256 xi_91 = _mm256_add_ps(_mm256_add_ps(xi_39, xi_79), xi_84);
319 const __m256 xi_92 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_86, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_87, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_88), xi_89);
320 const __m256 xi_94 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_46, xi_61), xi_62), xi_63), xi_83), xi_93);
321 const __m256 xi_97 = _mm256_mul_ps(u_2, xi_246);
322 const __m256 xi_98 = _mm256_mul_ps(xi_97, _mm256_set_ps(0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f, 0.25f));
323 const __m256 xi_100 = _mm256_mul_ps(xi_97, _mm256_set_ps(xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71, xi_71));
324 const __m256 xi_101 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_96, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_98, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_100), xi_99);
325 const __m256 xi_102 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_100, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_99, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_96), xi_98);
326 const __m256 xi_103 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_50, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_81), xi_82);
327 const __m256 xi_104 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_103, xi_46), xi_61), xi_62), xi_63), xi_93);
328 const __m256 xi_115 = _mm256_mul_ps(rho, _mm256_mul_ps(u_2, u_2));
329 const __m256 xi_116 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_260, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_263, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_115, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), xi_114);
330 const __m256 xi_117 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_251, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_252, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f))), _mm256_mul_ps(xi_112, _mm256_set_ps(0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f, 0.66666666666666663f))), _mm256_mul_ps(xi_113, _mm256_set_ps(1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f, 1.6666666666666667f))), _mm256_mul_ps(xi_247, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_253, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_254, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), _mm256_mul_ps(xi_264, _mm256_set_ps(-3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f, -3.0f))), xi_116), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
331 const __m256 xi_124 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_113, xi_115), xi_119), xi_121), xi_123), xi_17), xi_22), xi_250), _mm256_set_ps(omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk, omega_bulk));
332 const __m256 xi_127 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_112, _mm256_set_ps(2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f, 2.3333333333333335f)), _mm256_mul_ps(xi_251, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_252, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_245, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_257, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_259, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), _mm256_mul_ps(xi_262, _mm256_set_ps(-5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f, -5.0f))), xi_116), xi_126), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
333 const __m256 xi_131 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f)), _mm256_mul_ps(xi_251, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_252, _mm256_set_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f))), _mm256_mul_ps(xi_260, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_263, _mm256_set_ps(-4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f))), _mm256_mul_ps(xi_249, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_255, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_258, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), _mm256_mul_ps(xi_261, _mm256_set_ps(-7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f, -7.0f))), xi_114), xi_126), xi_128), xi_129), xi_130), _mm256_set_ps(omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even, omega_even));
334 const __m256 xi_156 = _mm256_add_ps(_mm256_mul_ps(xi_115, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_263);
335 const __m256 xi_157 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_251, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_121), xi_156), xi_16), xi_2), xi_259), xi_6), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
336 const __m256 xi_158 = _mm256_mul_ps(xi_157, _mm256_set_ps(0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f, 0.125f));
337 const __m256 xi_159 = _mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(-0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f, -0.01984126984126984f)), _mm256_mul_ps(xi_107, _mm256_set_ps(-0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f, -0.11904761904761904f)));
338 const __m256 xi_160 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_113, _mm256_set_ps(2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f)), _mm256_mul_ps(xi_112, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_248, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), _mm256_mul_ps(xi_265, _mm256_set_ps(-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f))), xi_120), xi_123), xi_125), xi_156), xi_251), xi_252), xi_260), xi_9), _mm256_set_ps(omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear, omega_shear));
339 const __m256 xi_162 = _mm256_add_ps(_mm256_mul_ps(xi_160, _mm256_set_ps(-0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f, -0.041666666666666664f)), _mm256_mul_ps(xi_161, _mm256_set_ps(-0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f, -0.16666666666666666f)));
340 const __m256 xi_163 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_117, _mm256_set_ps(-0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f, -0.050000000000000003f)), _mm256_mul_ps(xi_108, _mm256_set_ps(-0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f, -0.10000000000000001f))), xi_162);
341 const __m256 xi_164 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_127, _mm256_set_ps(0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f, 0.014285714285714285f)), _mm256_mul_ps(xi_111, _mm256_set_ps(0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f, 0.028571428571428571f))), xi_155), xi_158), xi_159), xi_163);
342 const __m256 xi_176 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_160, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)), _mm256_mul_ps(xi_161, _mm256_set_ps(0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f, 0.33333333333333331f))), _mm256_mul_ps(xi_127, _mm256_set_ps(-0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f, -0.035714285714285712f))), _mm256_mul_ps(xi_111, _mm256_set_ps(-0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f, -0.071428571428571425f))), xi_159);
343 const __m256 xi_187 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f, 0.015873015873015872f)), _mm256_mul_ps(xi_107, _mm256_set_ps(0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f, 0.095238095238095233f))), _mm256_mul_ps(xi_155, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_158, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_127, _mm256_set_ps(-0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f, -0.021428571428571429f))), _mm256_mul_ps(xi_111, _mm256_set_ps(-0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f, -0.042857142857142858f))), xi_163);
344 const __m256 xi_190 = _mm256_mul_ps(xi_157, _mm256_set_ps(0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f, 0.0625f));
345 const __m256 xi_191 = _mm256_mul_ps(xi_131, _mm256_set_ps(0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f, 0.013888888888888888f));
346 const __m256 xi_193 = _mm256_add_ps(_mm256_mul_ps(xi_124, _mm256_set_ps(0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f, 0.041666666666666664f)), _mm256_mul_ps(xi_110, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f)));
347 const __m256 xi_194 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_160, _mm256_set_ps(0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f, 0.020833333333333332f)), _mm256_mul_ps(xi_161, _mm256_set_ps(0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f, 0.083333333333333329f))), xi_193);
348 const __m256 xi_195 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_165, xi_189), xi_190), xi_191), xi_192), xi_194);
349 const __m256 xi_202 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_151, xi_189), xi_190), xi_191), xi_192), xi_194);
350 const __m256 xi_204 = _mm256_mul_ps(xi_127, _mm256_set_ps(-0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f, -0.0071428571428571426f));
351 const __m256 xi_205 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(u_2, xi_197), xi_132), xi_17), xi_254), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
352 const __m256 xi_206 = _mm256_mul_ps(xi_117, _mm256_set_ps(0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f, 0.025000000000000001f));
353 const __m256 xi_209 = _mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(-0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f, -0.003968253968253968f)), _mm256_mul_ps(xi_107, _mm256_set_ps(-0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f, -0.023809523809523808f)));
354 const __m256 xi_210 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_162, xi_193), xi_203), xi_204), xi_205), xi_206), xi_207), xi_208), xi_209);
355 const __m256 xi_226 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_203, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_205, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), xi_162), xi_193), xi_204), xi_206), xi_207), xi_208), xi_209);
356 const __m256 xi_228 = _mm256_mul_ps(xi_190, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
357 const __m256 xi_229 = _mm256_mul_ps(xi_127, _mm256_set_ps(0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f, 0.017857142857142856f));
358 const __m256 xi_231 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_188, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
359 const __m256 xi_233 = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(rho, u_0), u_2), xi_10), xi_166), xi_257), _mm256_set_ps(xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198, xi_198));
360 const __m256 xi_234 = _mm256_add_ps(_mm256_mul_ps(xi_232, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_233, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)));
361 const __m256 xi_240 = _mm256_add_ps(xi_232, xi_233);
362 const __m256 xi_243 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(xi_186, xi_194), xi_209), xi_227), xi_228), xi_229), xi_230);
363 const __m256 forceTerm_0 = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_25, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_26, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_27, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), _mm256_mul_ps(xi_25, _mm256_set_ps(xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28))), _mm256_mul_ps(xi_26, _mm256_set_ps(xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28))), _mm256_mul_ps(xi_27, _mm256_set_ps(xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28, xi_28)));
364 const __m256 forceTerm_1 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_31, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_29), xi_41);
365 const __m256 forceTerm_2 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_29, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_31), xi_41);
366 const __m256 forceTerm_3 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_42, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_44), xi_48);
367 const __m256 forceTerm_4 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_44, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_42), xi_48);
368 const __m256 forceTerm_5 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_51, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_49), xi_52);
369 const __m256 forceTerm_6 = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_49, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), xi_51), xi_52);
370 const __m256 forceTerm_7 = _mm256_add_ps(_mm256_add_ps(xi_57, xi_66), xi_76);
371 const __m256 forceTerm_8 = _mm256_add_ps(_mm256_add_ps(xi_66, xi_77), xi_78);
372 const __m256 forceTerm_9 = _mm256_add_ps(_mm256_add_ps(xi_57, xi_78), xi_80);
373 const __m256 forceTerm_10 = _mm256_add_ps(_mm256_add_ps(xi_76, xi_77), xi_80);
374 const __m256 forceTerm_11 = _mm256_add_ps(_mm256_add_ps(xi_83, xi_85), xi_90);
375 const __m256 forceTerm_12 = _mm256_add_ps(_mm256_add_ps(xi_83, xi_91), xi_92);
376 const __m256 forceTerm_13 = _mm256_add_ps(_mm256_add_ps(xi_101, xi_57), xi_94);
377 const __m256 forceTerm_14 = _mm256_add_ps(_mm256_add_ps(xi_102, xi_77), xi_94);
378 const __m256 forceTerm_15 = _mm256_add_ps(_mm256_add_ps(xi_103, xi_85), xi_92);
379 const __m256 forceTerm_16 = _mm256_add_ps(_mm256_add_ps(xi_103, xi_90), xi_91);
380 const __m256 forceTerm_17 = _mm256_add_ps(_mm256_add_ps(xi_102, xi_104), xi_57);
381 const __m256 forceTerm_18 = _mm256_add_ps(_mm256_add_ps(xi_101, xi_104), xi_77);
382 _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_131, _mm256_set_ps(0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f, 0.023809523809523808f)), _mm256_mul_ps(xi_107, _mm256_set_ps(0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f, 0.14285714285714285f))), _mm256_mul_ps(xi_127, _mm256_set_ps(0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f, 0.042857142857142858f))), _mm256_mul_ps(xi_111, _mm256_set_ps(0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f, 0.085714285714285715f))), _mm256_mul_ps(xi_117, _mm256_set_ps(0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f, 0.10000000000000001f))), _mm256_mul_ps(xi_108, _mm256_set_ps(0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f, 0.20000000000000001f))), _mm256_mul_ps(xi_124, _mm256_set_ps(-0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f))), _mm256_mul_ps(xi_110, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_0), xi_250));
383 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_135, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_140, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_1), xi_151), xi_164), xi_251));
384 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_2, xi_135), xi_140), xi_164), xi_165), xi_252));
385 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_3, xi_168), xi_170), xi_175), xi_176), xi_248));
386 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_168, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_170, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_4), xi_176), xi_177), xi_265));
387 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(xi_179, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)), _mm256_mul_ps(xi_181, _mm256_set_ps(-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))), forceTerm_5), xi_186), xi_187), xi_263));
388 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_6, xi_179), xi_181), xi_187), xi_188), xi_260));
389 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_7, xi_177), xi_195), xi_200), xi_255));
390 _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_8, xi_175), xi_195), xi_201), xi_249));
391 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_9, xi_177), xi_201), xi_202), xi_261));
392 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_10, xi_175), xi_200), xi_202), xi_258));
393 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_11, xi_210), xi_219), xi_224), xi_264));
394 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_12, xi_219), xi_225), xi_226), xi_253));
395 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_13, xi_231), xi_234), xi_239), xi_259));
396 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_14, xi_231), xi_240), xi_241), xi_262));
397 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_15, xi_224), xi_226), xi_242), xi_254));
398 _mm256_store_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_16, xi_210), xi_225), xi_242), xi_247));
399 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_17, xi_239), xi_240), xi_243), xi_245));
400 _mm256_storeu_ps(&_data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0], _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(forceTerm_18, xi_234), xi_241), xi_243), xi_257));
401 }
402 for (int64_t ctr_0 = (int64_t)((_size_force_0) / (8)) * (8); ctr_0 < _size_force_0; ctr_0 += 1) {
403 const float xi_244 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + _stride_force_3 + ctr_0];
404 const float xi_245 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0];
405 const float xi_246 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + ctr_0];
406 const float xi_247 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0];
407 const float xi_248 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0];
408 const float xi_249 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0];
409 const float xi_250 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0];
410 const float xi_251 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0];
411 const float xi_252 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0];
412 const float xi_253 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0];
413 const float xi_254 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0];
414 const float xi_255 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0];
415 const float xi_256 = _data_force[_stride_force_1 * ctr_1 + _stride_force_2 * ctr_2 + 2 * _stride_force_3 + ctr_0];
416 const float xi_257 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0];
417 const float xi_258 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0];
418 const float xi_259 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0];
419 const float xi_260 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0];
420 const float xi_261 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0];
421 const float xi_262 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0];
422 const float xi_263 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0];
423 const float xi_264 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0];
424 const float xi_265 = _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0];
425
426 float random_3_0{};
427 float random_3_1{};
428 float random_3_2{};
429 float random_3_3{};
430 if (kT > 0.) {
431 philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 3, seed, random_3_0, random_3_1, random_3_2, random_3_3);
432 }
433
434 float random_2_0{};
435 float random_2_1{};
436 float random_2_2{};
437 float random_2_3{};
438 if (kT > 0.) {
439 philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 2, seed, random_2_0, random_2_1, random_2_2, random_2_3);
440 }
441
442 float random_1_0{};
443 float random_1_1{};
444 float random_1_2{};
445 float random_1_3{};
446 if (kT > 0.) {
447 philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 1, seed, random_1_0, random_1_1, random_1_2, random_1_3);
448 }
449
450 float random_0_0{};
451 float random_0_1{};
452 float random_0_2{};
453 float random_0_3{};
454 if (kT > 0.) {
455 philox_float4(time_step, block_offset_0 + ctr_0, block_offset_1 + ctr_1, block_offset_2 + ctr_2, 0, seed, random_0_0, random_0_1, random_0_2, random_0_3);
456 }
457 const float xi_2 = xi_257 + xi_262;
458 const float xi_3 = xi_2 + xi_265;
459 const float xi_4 = xi_251 + xi_254 + xi_264;
460 const float xi_5 = xi_253 + xi_263;
461 const float xi_6 = xi_245 + xi_260;
462 const float xi_8 = -xi_261;
463 const float xi_9 = -xi_255;
464 const float xi_10 = -xi_245;
465 const float xi_11 = -xi_259;
466 const float xi_12 = -xi_248;
467 const float xi_13 = xi_10 + xi_11 + xi_12;
468 const float xi_14 = -xi_252;
469 const float xi_15 = -xi_258;
470 const float xi_16 = xi_14 + xi_15;
471 const float xi_17 = -xi_247;
472 const float xi_18 = -xi_253;
473 const float xi_19 = xi_17 + xi_18;
474 const float xi_20 = -xi_257;
475 const float xi_21 = xi_10 + xi_20;
476 const float xi_22 = -xi_254;
477 const float xi_23 = -xi_260;
478 const float xi_24 = xi_17 + xi_22 + xi_23 + xi_264;
479 const float xi_29 = xi_244 * 0.16666666666666666f;
480 const float xi_30 = xi_244 * 0.083333333333333329f;
481 const float xi_42 = xi_246 * 0.16666666666666666f;
482 const float xi_43 = xi_246 * 0.083333333333333329f;
483 const float xi_49 = xi_256 * 0.16666666666666666f;
484 const float xi_50 = xi_256 * 0.083333333333333329f;
485 const float xi_67 = xi_244 * 0.25f;
486 const float xi_72 = xi_244 * xi_71;
487 const float xi_114 = -xi_250;
488 const float xi_118 = -xi_264;
489 const float xi_119 = xi_118 + xi_18;
490 const float xi_120 = -xi_249 + xi_8;
491 const float xi_122 = -xi_262;
492 const float xi_123 = xi_11 + xi_122 + xi_15 + xi_21;
493 const float xi_125 = xi_247 * 2.0f + xi_253 * 2.0f + xi_254 * 2.0f + xi_264 * 2.0f;
494 const float xi_126 = xi_125 + xi_248 * 5.0f + xi_265 * 5.0f;
495 const float xi_128 = xi_259 * 2.0f;
496 const float xi_129 = xi_262 * 2.0f;
497 const float xi_130 = xi_245 * 2.0f + xi_257 * 2.0f;
498 const float xi_132 = xi_118 + xi_253;
499 const float xi_133 = xi_132 + xi_14 + xi_22 + xi_247 + xi_251;
500 const float xi_135 = xi_133 * xi_134;
501 const float xi_136 = random_2_3 - 0.5f;
502 const float xi_141 = xi_255 * 2.0f;
503 const float xi_142 = xi_258 * 2.0f;
504 const float xi_143 = xi_249 * -2.0f + xi_261 * 2.0f;
505 const float xi_144 = xi_14 - xi_141 + xi_142 + xi_143 + xi_19 + xi_4;
506 const float xi_146 = xi_144 * xi_145;
507 const float xi_147 = random_1_2 - 0.5f;
508 const float xi_152 = random_0_1 - 0.5f;
509 const float xi_166 = xi_122 + xi_259;
510 const float xi_167 = xi_12 + xi_166 + xi_20 + xi_245 + xi_265;
511 const float xi_168 = xi_134 * xi_167;
512 const float xi_169 = random_2_1 - 0.5f;
513 const float xi_171 = xi_13 + xi_141 - xi_142 + xi_143 + xi_3;
514 const float xi_172 = xi_145 * xi_171;
515 const float xi_173 = random_2_0 - 0.5f;
516 const float xi_178 = xi_119 + xi_23 + xi_247 + xi_254 + xi_263;
517 const float xi_179 = xi_134 * xi_178;
518 const float xi_180 = random_2_2 - 0.5f;
519 const float xi_182 = -xi_128 - xi_129 + xi_130 + xi_24 + xi_5;
520 const float xi_183 = xi_145 * xi_182;
521 const float xi_184 = random_1_3 - 0.5f;
522 const float xi_212 = xi_182 * xi_211;
523 const float xi_214 = xi_178 * xi_213;
524 const float xi_220 = xi_144 * xi_211;
525 const float xi_221 = xi_133 * xi_213;
526 const float xi_235 = xi_167 * xi_213;
527 const float xi_236 = xi_171 * xi_211;
528 const float xi_31 = rr_0 * xi_30;
529 const float xi_44 = rr_0 * xi_43;
530 const float xi_51 = rr_0 * xi_50;
531 const float xi_54 = xi_246 * xi_53;
532 const float xi_59 = xi_244 * xi_53;
533 const float xi_81 = xi_256 * xi_53;
534 const float vel0Term = xi_249 + xi_258 + xi_3;
535 const float vel1Term = xi_255 + xi_4;
536 const float vel2Term = xi_259 + xi_5;
537 const float rho = vel0Term + vel1Term + vel2Term + xi_247 + xi_248 + xi_250 + xi_252 + xi_261 + xi_6;
538 const float xi_105 = kT * rho;
539 const float xi_106 = powf(xi_105 * (1.0f - ((-omega_even + 1.0f) * (-omega_even + 1.0f))), 0.5f);
540 const float xi_107 = xi_106 * (random_3_0 - 0.5f) * 3.7416573867739413f;
541 const float xi_108 = xi_106 * (random_3_2 - 0.5f) * 5.4772255750516612f;
542 const float xi_110 = xi_109 * (random_1_1 - 0.5f) * powf(xi_105 * (1.0f - ((-omega_bulk + 1.0f) * (-omega_bulk + 1.0f))), 0.5f);
543 const float xi_111 = xi_106 * (random_3_1 - 0.5f) * 8.3666002653407556f;
544 const float xi_137 = powf(xi_105 * (1.0f - ((-omega_odd + 1.0f) * (-omega_odd + 1.0f))), 0.5f);
545 const float xi_138 = xi_137 * 1.4142135623730951f;
546 const float xi_139 = xi_138 * 0.5f;
547 const float xi_140 = xi_136 * xi_139;
548 const float xi_148 = xi_109 * xi_137;
549 const float xi_149 = xi_148 * 0.16666666666666666f;
550 const float xi_150 = xi_147 * xi_149;
551 const float xi_151 = -xi_146 - xi_150;
552 const float xi_153 = powf(xi_105 * (1.0f - ((-omega_shear + 1.0f) * (-omega_shear + 1.0f))), 0.5f);
553 const float xi_154 = xi_153 * 0.5f;
554 const float xi_155 = xi_152 * xi_154;
555 const float xi_161 = xi_153 * (random_0_0 - 0.5f) * 1.7320508075688772f;
556 const float xi_165 = xi_146 + xi_150;
557 const float xi_170 = xi_139 * xi_169;
558 const float xi_174 = xi_149 * xi_173;
559 const float xi_175 = xi_172 + xi_174;
560 const float xi_177 = -xi_172 - xi_174;
561 const float xi_181 = xi_139 * xi_180;
562 const float xi_185 = xi_149 * xi_184;
563 const float xi_186 = -xi_183 - xi_185;
564 const float xi_188 = xi_183 + xi_185;
565 const float xi_189 = xi_152 * xi_153 * 0.25f;
566 const float xi_192 = xi_107 * 0.083333333333333329f;
567 const float xi_196 = xi_154 * (random_0_2 - 0.5f);
568 const float xi_203 = xi_154 * (random_1_0 - 0.5f);
569 const float xi_207 = xi_111 * -0.014285714285714285f;
570 const float xi_208 = xi_108 * 0.050000000000000003f;
571 const float xi_215 = xi_148 * 0.083333333333333329f;
572 const float xi_216 = xi_184 * xi_215;
573 const float xi_217 = xi_138 * 0.25f;
574 const float xi_218 = xi_180 * xi_217;
575 const float xi_219 = -xi_212 + xi_214 - xi_216 + xi_218;
576 const float xi_222 = xi_147 * xi_215;
577 const float xi_223 = xi_136 * xi_217;
578 const float xi_224 = -xi_220 + xi_221 - xi_222 + xi_223;
579 const float xi_225 = xi_220 - xi_221 + xi_222 - xi_223;
580 const float xi_227 = -xi_189;
581 const float xi_230 = xi_111 * 0.035714285714285712f;
582 const float xi_232 = xi_154 * (random_0_3 - 0.5f);
583 const float xi_237 = xi_169 * xi_217;
584 const float xi_238 = xi_173 * xi_215;
585 const float xi_239 = -xi_235 + xi_236 - xi_237 + xi_238;
586 const float xi_241 = xi_235 - xi_236 + xi_237 - xi_238;
587 const float xi_242 = xi_212 - xi_214 + xi_216 - xi_218;
588 const float xi_0 = ((1.0f) / (rho));
589 const float xi_7 = xi_0 * 0.5f;
590 const float u_0 = xi_0 * (vel0Term + xi_13 + xi_8 + xi_9) + xi_246 * xi_7;
591 const float xi_25 = u_0 * xi_246;
592 const float xi_37 = xi_25 * 0.16666666666666666f;
593 const float xi_38 = xi_25 * 0.083333333333333329f;
594 const float xi_39 = omega_shear * xi_38;
595 const float xi_40 = -xi_37 + xi_39;
596 const float xi_56 = -xi_25 * xi_55 + xi_37;
597 const float xi_57 = -xi_43 + xi_54 + xi_56;
598 const float xi_61 = -xi_25 * xi_60;
599 const float xi_68 = u_0 * xi_67;
600 const float xi_73 = u_0 * xi_72;
601 const float xi_77 = xi_43 - xi_54 + xi_56;
602 const float xi_84 = -xi_38;
603 const float xi_95 = u_0 * xi_256;
604 const float xi_96 = xi_95 * 0.25f;
605 const float xi_99 = xi_71 * xi_95;
606 const float xi_113 = rho * (u_0 * u_0);
607 const float u_1 = xi_0 * (vel1Term + xi_16 + xi_19 + xi_249 + xi_8) + xi_244 * xi_7;
608 const float xi_26 = u_1 * xi_244;
609 const float xi_32 = xi_26 * 0.16666666666666666f;
610 const float xi_45 = xi_26 * 0.083333333333333329f;
611 const float xi_46 = omega_shear * xi_45;
612 const float xi_47 = -xi_32 + xi_46;
613 const float xi_62 = -xi_26 * xi_60;
614 const float xi_69 = u_1 * 0.25f;
615 const float xi_70 = xi_246 * xi_69;
616 const float xi_74 = u_1 * xi_71;
617 const float xi_75 = xi_246 * xi_74;
618 const float xi_76 = -xi_68 - xi_70 + xi_73 + xi_75;
619 const float xi_78 = xi_68 + xi_70 - xi_73 - xi_75;
620 const float xi_86 = xi_256 * xi_69;
621 const float xi_88 = xi_256 * xi_74;
622 const float xi_93 = -xi_45;
623 const float xi_112 = rho * (u_1 * u_1);
624 const float xi_121 = xi_112 + xi_120 + xi_9;
625 const float xi_197 = rho * u_1;
626 const float xi_199 = xi_198 * (u_0 * xi_197 + xi_120 + xi_255 + xi_258);
627 const float xi_200 = -xi_196 - xi_199;
628 const float xi_201 = xi_196 + xi_199;
629 const float u_2 = xi_0 * (vel2Term + xi_21 + xi_24 + xi_262) + xi_256 * xi_7;
630 const float xi_27 = u_2 * xi_256;
631 const float xi_33 = xi_27 * 0.16666666666666666f;
632 const float xi_34 = xi_27 * 0.083333333333333329f;
633 const float xi_35 = omega_shear * xi_34;
634 const float xi_36 = -xi_33 + xi_35;
635 const float xi_41 = -omega_shear * xi_32 + xi_26 * 0.33333333333333331f + xi_36 + xi_40;
636 const float xi_48 = -omega_shear * xi_37 + xi_25 * 0.33333333333333331f + xi_36 + xi_47;
637 const float xi_52 = -omega_shear * xi_33 + xi_27 * 0.33333333333333331f + xi_40 + xi_47;
638 const float xi_58 = -xi_34;
639 const float xi_63 = -xi_27 * xi_60;
640 const float xi_64 = -xi_26 * xi_55 + xi_32 + xi_61 + xi_62 + xi_63;
641 const float xi_65 = xi_30 - xi_59 + xi_64;
642 const float xi_66 = xi_35 + xi_58 + xi_65;
643 const float xi_79 = -xi_30 + xi_59 + xi_64;
644 const float xi_80 = xi_35 + xi_58 + xi_79;
645 const float xi_82 = -xi_27 * xi_55 + xi_33;
646 const float xi_83 = xi_50 - xi_81 + xi_82;
647 const float xi_85 = xi_39 + xi_65 + xi_84;
648 const float xi_87 = u_2 * xi_67;
649 const float xi_89 = u_2 * xi_72;
650 const float xi_90 = xi_86 + xi_87 - xi_88 - xi_89;
651 const float xi_91 = xi_39 + xi_79 + xi_84;
652 const float xi_92 = -xi_86 - xi_87 + xi_88 + xi_89;
653 const float xi_94 = xi_46 + xi_61 + xi_62 + xi_63 + xi_83 + xi_93;
654 const float xi_97 = u_2 * xi_246;
655 const float xi_98 = xi_97 * 0.25f;
656 const float xi_100 = xi_71 * xi_97;
657 const float xi_101 = xi_100 - xi_96 - xi_98 + xi_99;
658 const float xi_102 = -xi_100 + xi_96 + xi_98 - xi_99;
659 const float xi_103 = -xi_50 + xi_81 + xi_82;
660 const float xi_104 = xi_103 + xi_46 + xi_61 + xi_62 + xi_63 + xi_93;
661 const float xi_115 = rho * (u_2 * u_2);
662 const float xi_116 = xi_114 + xi_115 * 0.66666666666666663f + xi_260 * 3.0f + xi_263 * 3.0f;
663 const float xi_117 = omega_even * (xi_112 * 0.66666666666666663f + xi_113 * 1.6666666666666667f + xi_116 + xi_247 * -3.0f + xi_251 * 3.0f + xi_252 * 3.0f + xi_253 * -3.0f + xi_254 * -3.0f + xi_264 * -3.0f);
664 const float xi_124 = omega_bulk * (xi_113 + xi_115 + xi_119 + xi_121 + xi_123 + xi_17 + xi_22 + xi_250);
665 const float xi_127 = omega_even * (xi_112 * 2.3333333333333335f + xi_116 + xi_126 + xi_245 * -5.0f + xi_251 * -2.0f + xi_252 * -2.0f + xi_257 * -5.0f + xi_259 * -5.0f + xi_262 * -5.0f);
666 const float xi_131 = omega_even * (xi_114 + xi_115 * 3.0f + xi_126 + xi_128 + xi_129 + xi_130 + xi_249 * -7.0f + xi_251 * 5.0f + xi_252 * 5.0f + xi_255 * -7.0f + xi_258 * -7.0f + xi_260 * -4.0f + xi_261 * -7.0f + xi_263 * -4.0f);
667 const float xi_156 = -xi_115 + xi_263;
668 const float xi_157 = omega_shear * (xi_121 + xi_156 + xi_16 + xi_2 - xi_251 + xi_259 + xi_6);
669 const float xi_158 = xi_157 * 0.125f;
670 const float xi_159 = xi_107 * -0.11904761904761904f + xi_131 * -0.01984126984126984f;
671 const float xi_160 = omega_shear * (-xi_112 + xi_113 * 2.0f + xi_120 + xi_123 + xi_125 + xi_156 + xi_248 * -2.0f + xi_251 + xi_252 + xi_260 + xi_265 * -2.0f + xi_9);
672 const float xi_162 = xi_160 * -0.041666666666666664f + xi_161 * -0.16666666666666666f;
673 const float xi_163 = xi_108 * -0.10000000000000001f + xi_117 * -0.050000000000000003f + xi_162;
674 const float xi_164 = xi_111 * 0.028571428571428571f + xi_127 * 0.014285714285714285f + xi_155 + xi_158 + xi_159 + xi_163;
675 const float xi_176 = xi_111 * -0.071428571428571425f + xi_127 * -0.035714285714285712f + xi_159 + xi_160 * 0.083333333333333329f + xi_161 * 0.33333333333333331f;
676 const float xi_187 = xi_107 * 0.095238095238095233f + xi_111 * -0.042857142857142858f + xi_127 * -0.021428571428571429f + xi_131 * 0.015873015873015872f - xi_155 - xi_158 + xi_163;
677 const float xi_190 = xi_157 * 0.0625f;
678 const float xi_191 = xi_131 * 0.013888888888888888f;
679 const float xi_193 = xi_110 * 0.083333333333333329f + xi_124 * 0.041666666666666664f;
680 const float xi_194 = xi_160 * 0.020833333333333332f + xi_161 * 0.083333333333333329f + xi_193;
681 const float xi_195 = xi_165 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
682 const float xi_202 = xi_151 + xi_189 + xi_190 + xi_191 + xi_192 + xi_194;
683 const float xi_204 = xi_127 * -0.0071428571428571426f;
684 const float xi_205 = xi_198 * (u_2 * xi_197 + xi_132 + xi_17 + xi_254);
685 const float xi_206 = xi_117 * 0.025000000000000001f;
686 const float xi_209 = xi_107 * -0.023809523809523808f + xi_131 * -0.003968253968253968f;
687 const float xi_210 = xi_162 + xi_193 + xi_203 + xi_204 + xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
688 const float xi_226 = xi_162 + xi_193 - xi_203 + xi_204 - xi_205 + xi_206 + xi_207 + xi_208 + xi_209;
689 const float xi_228 = -xi_190;
690 const float xi_229 = xi_127 * 0.017857142857142856f;
691 const float xi_231 = xi_188 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
692 const float xi_233 = xi_198 * (rho * u_0 * u_2 + xi_10 + xi_166 + xi_257);
693 const float xi_234 = -xi_232 - xi_233;
694 const float xi_240 = xi_232 + xi_233;
695 const float xi_243 = xi_186 + xi_194 + xi_209 + xi_227 + xi_228 + xi_229 + xi_230;
696 const float forceTerm_0 = xi_25 * xi_28 - xi_25 + xi_26 * xi_28 - xi_26 + xi_27 * xi_28 - xi_27;
697 const float forceTerm_1 = xi_29 - xi_31 + xi_41;
698 const float forceTerm_2 = -xi_29 + xi_31 + xi_41;
699 const float forceTerm_3 = -xi_42 + xi_44 + xi_48;
700 const float forceTerm_4 = xi_42 - xi_44 + xi_48;
701 const float forceTerm_5 = xi_49 - xi_51 + xi_52;
702 const float forceTerm_6 = -xi_49 + xi_51 + xi_52;
703 const float forceTerm_7 = xi_57 + xi_66 + xi_76;
704 const float forceTerm_8 = xi_66 + xi_77 + xi_78;
705 const float forceTerm_9 = xi_57 + xi_78 + xi_80;
706 const float forceTerm_10 = xi_76 + xi_77 + xi_80;
707 const float forceTerm_11 = xi_83 + xi_85 + xi_90;
708 const float forceTerm_12 = xi_83 + xi_91 + xi_92;
709 const float forceTerm_13 = xi_101 + xi_57 + xi_94;
710 const float forceTerm_14 = xi_102 + xi_77 + xi_94;
711 const float forceTerm_15 = xi_103 + xi_85 + xi_92;
712 const float forceTerm_16 = xi_103 + xi_90 + xi_91;
713 const float forceTerm_17 = xi_102 + xi_104 + xi_57;
714 const float forceTerm_18 = xi_101 + xi_104 + xi_77;
715 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + ctr_0] = forceTerm_0 + xi_107 * 0.14285714285714285f + xi_108 * 0.20000000000000001f - xi_110 + xi_111 * 0.085714285714285715f + xi_117 * 0.10000000000000001f + xi_124 * -0.5f + xi_127 * 0.042857142857142858f + xi_131 * 0.023809523809523808f + xi_250;
716 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + _stride_pdfs_3 + ctr_0] = forceTerm_1 - xi_135 - xi_140 + xi_151 + xi_164 + xi_251;
717 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 2 * _stride_pdfs_3 + ctr_0] = forceTerm_2 + xi_135 + xi_140 + xi_164 + xi_165 + xi_252;
718 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 3 * _stride_pdfs_3 + ctr_0] = forceTerm_3 + xi_168 + xi_170 + xi_175 + xi_176 + xi_248;
719 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 4 * _stride_pdfs_3 + ctr_0] = forceTerm_4 - xi_168 - xi_170 + xi_176 + xi_177 + xi_265;
720 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 5 * _stride_pdfs_3 + ctr_0] = forceTerm_5 - xi_179 - xi_181 + xi_186 + xi_187 + xi_263;
721 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 6 * _stride_pdfs_3 + ctr_0] = forceTerm_6 + xi_179 + xi_181 + xi_187 + xi_188 + xi_260;
722 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 7 * _stride_pdfs_3 + ctr_0] = forceTerm_7 + xi_177 + xi_195 + xi_200 + xi_255;
723 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 8 * _stride_pdfs_3 + ctr_0] = forceTerm_8 + xi_175 + xi_195 + xi_201 + xi_249;
724 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 9 * _stride_pdfs_3 + ctr_0] = forceTerm_9 + xi_177 + xi_201 + xi_202 + xi_261;
725 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 10 * _stride_pdfs_3 + ctr_0] = forceTerm_10 + xi_175 + xi_200 + xi_202 + xi_258;
726 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 11 * _stride_pdfs_3 + ctr_0] = forceTerm_11 + xi_210 + xi_219 + xi_224 + xi_264;
727 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 12 * _stride_pdfs_3 + ctr_0] = forceTerm_12 + xi_219 + xi_225 + xi_226 + xi_253;
728 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 13 * _stride_pdfs_3 + ctr_0] = forceTerm_13 + xi_231 + xi_234 + xi_239 + xi_259;
729 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 14 * _stride_pdfs_3 + ctr_0] = forceTerm_14 + xi_231 + xi_240 + xi_241 + xi_262;
730 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 15 * _stride_pdfs_3 + ctr_0] = forceTerm_15 + xi_224 + xi_226 + xi_242 + xi_254;
731 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 16 * _stride_pdfs_3 + ctr_0] = forceTerm_16 + xi_210 + xi_225 + xi_242 + xi_247;
732 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 17 * _stride_pdfs_3 + ctr_0] = forceTerm_17 + xi_239 + xi_240 + xi_243 + xi_245;
733 _data_pdfs[_stride_pdfs_1 * ctr_1 + _stride_pdfs_2 * ctr_2 + 18 * _stride_pdfs_3 + ctr_0] = forceTerm_18 + xi_234 + xi_241 + xi_243 + xi_257;
734 }
735 }
736 }
737 }
738}
739} // namespace internal_48c9ee502281a70505dce0378c55abd5
740
742 if (!this->configured_)
743 WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
744
745 auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
746 auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
747
748 auto &omega_shear = this->omega_shear_;
749 auto &omega_even = this->omega_even_;
750 auto &block_offset_1 = this->block_offset_1_;
751 auto &kT = this->kT_;
752 auto &seed = this->seed_;
753 auto &time_step = this->time_step_;
754 auto &block_offset_2 = this->block_offset_2_;
755 auto &omega_odd = this->omega_odd_;
756 auto &block_offset_0 = this->block_offset_0_;
757 auto &omega_bulk = this->omega_bulk_;
758 WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(force->nrOfGhostLayers()))
759 float *RESTRICT const _data_force = force->dataAt(0, 0, 0, 0);
760 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
761 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
762 WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
763 float *RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
764 WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
765 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
766 WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(force->xSize()) + 0))
767 const int64_t _size_force_0 = int64_t(int64_c(force->xSize()) + 0);
768 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
769 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
770 WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(force->ySize()) + 0))
771 const int64_t _size_force_1 = int64_t(int64_c(force->ySize()) + 0);
772 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
773 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
774 WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(force->zSize()) + 0))
775 const int64_t _size_force_2 = int64_t(int64_c(force->zSize()) + 0);
776 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
777 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
778 const int64_t _stride_force_1 = int64_t(force->yStride());
779 const int64_t _stride_force_2 = int64_t(force->zStride());
780 const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
781 const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
782 const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
783 const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
784 internal_48c9ee502281a70505dce0378c55abd5::collidesweepsingleprecisionthermalizedavx_collidesweepsingleprecisionthermalizedavx(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0, block_offset_1, block_offset_2, kT, omega_bulk, omega_even, omega_odd, omega_shear, seed, time_step);
785}
786
787void CollideSweepSinglePrecisionThermalizedAVX::runOnCellInterval(const shared_ptr<StructuredBlockStorage> &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block) {
788 if (!this->configured_)
789 WALBERLA_ABORT("This Sweep contains a configure function that needs to be called manually")
790
791 CellInterval ci = globalCellInterval;
792 CellInterval blockBB = blocks->getBlockCellBB(*block);
793 blockBB.expand(ghostLayers);
794 ci.intersect(blockBB);
795 blocks->transformGlobalToBlockLocalCellInterval(ci, *block);
796 if (ci.empty())
797 return;
798
799 auto pdfs = block->getData<field::GhostLayerField<float, 19>>(pdfsID);
800 auto force = block->getData<field::GhostLayerField<float, 3>>(forceID);
801
802 auto &omega_shear = this->omega_shear_;
803 auto &omega_even = this->omega_even_;
804 auto &block_offset_1 = this->block_offset_1_;
805 auto &kT = this->kT_;
806 auto &seed = this->seed_;
807 auto &time_step = this->time_step_;
808 auto &block_offset_2 = this->block_offset_2_;
809 auto &omega_odd = this->omega_odd_;
810 auto &block_offset_0 = this->block_offset_0_;
811 auto &omega_bulk = this->omega_bulk_;
812 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(force->nrOfGhostLayers()))
813 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(force->nrOfGhostLayers()))
814 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(force->nrOfGhostLayers()))
815 float *RESTRICT const _data_force = force->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
816 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
817 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
818 WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
819 WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
820 WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
821 float *RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
822 WALBERLA_ASSERT_EQUAL(pdfs->layout(), field::fzyx)
823 WALBERLA_ASSERT_EQUAL((uintptr_t)pdfs->dataAt(0, 0, 0, 0) % 32, 0)
824 WALBERLA_ASSERT_GREATER_EQUAL(force->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
825 const int64_t _size_force_0 = int64_t(int64_c(ci.xSize()) + 0);
826 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
827 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
828 WALBERLA_ASSERT_GREATER_EQUAL(force->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
829 const int64_t _size_force_1 = int64_t(int64_c(ci.ySize()) + 0);
830 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
831 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
832 WALBERLA_ASSERT_GREATER_EQUAL(force->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
833 const int64_t _size_force_2 = int64_t(int64_c(ci.zSize()) + 0);
834 WALBERLA_ASSERT_EQUAL(force->layout(), field::fzyx)
835 WALBERLA_ASSERT_EQUAL((uintptr_t)force->dataAt(0, 0, 0, 0) % 32, 0)
836 const int64_t _stride_force_1 = int64_t(force->yStride());
837 const int64_t _stride_force_2 = int64_t(force->zStride());
838 const int64_t _stride_force_3 = int64_t(1 * int64_t(force->fStride()));
839 const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
840 const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
841 const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
842 internal_48c9ee502281a70505dce0378c55abd5::collidesweepsingleprecisionthermalizedavx_collidesweepsingleprecisionthermalizedavx(_data_force, _data_pdfs, _size_force_0, _size_force_1, _size_force_2, _stride_force_1, _stride_force_2, _stride_force_3, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, block_offset_0, block_offset_1, block_offset_2, kT, omega_bulk, omega_even, omega_odd, omega_shear, seed, time_step);
843}
844
845} // namespace pystencils
846} // namespace walberla
847
848#if (defined WALBERLA_CXX_COMPILER_IS_GNU) || (defined WALBERLA_CXX_COMPILER_IS_CLANG)
849#pragma GCC diagnostic pop
850#endif
851
852#if (defined WALBERLA_CXX_COMPILER_IS_INTEL)
853#pragma warning pop
854#endif
#define FUNC_PREFIX
\file AdvectiveFluxKernel_double_precision.cpp \author pystencils
#define RESTRICT
\file AdvectiveFluxKernel_double_precision.h \author pystencils
void runOnCellInterval(const shared_ptr< StructuredBlockStorage > &blocks, const CellInterval &globalCellInterval, cell_idx_t ghostLayers, IBlock *block)
static double * block(double *p, std::size_t index, std::size_t size)
Definition elc.cpp:172
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float &rnd1, float &rnd2, float &rnd3, float &rnd4)
static FUNC_PREFIX void collidesweepsingleprecisionthermalizedavx_collidesweepsingleprecisionthermalizedavx(float *RESTRICT const _data_force, float *RESTRICT _data_pdfs, int64_t const _size_force_0, int64_t const _size_force_1, int64_t const _size_force_2, int64_t const _stride_force_1, int64_t const _stride_force_2, int64_t const _stride_force_3, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, uint32_t block_offset_0, uint32_t block_offset_1, uint32_t block_offset_2, float kT, float omega_bulk, float omega_even, float omega_odd, float omega_shear, uint32_t seed, uint32_t time_step)
\file PackInfoPdfDoublePrecision.cpp \author pystencils