ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
lattice_boltzmann/generated_kernels/myintrin.h
// kernel generated with pystencils v1.0+12.g54b91e2, lbmpy
// v1.0+9.g19115d4.dirty, lbmpy_walberla/pystencils_walberla from commit
// e1fe2ad1dcbe8f31ea79d95e8a5a5cc0ee3691f3

#pragma once

#if defined(__SSE2__) || defined(_MSC_VER)
// Convert four unsigned 32-bit integers to single precision. AVX-512VL has a
// native instruction; the SSE2 fallback splits each lane into (v >> 1) and
// the low bit, converts both through the signed int32 conversion (each half
// fits in the int32 range), and recombines as 2 * (v >> 1) + (v & 1).
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) {
#ifdef __AVX512VL__
  return _mm_cvtepu32_ps(v);
#else
  __m128i v2 = _mm_srli_epi32(v, 1);
  __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
  __m128 v2f = _mm_cvtepi32_ps(v2);
  __m128 v1f = _mm_cvtepi32_ps(v1);
  return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}

// In-place 4x4 transpose of four vectors of packed 32-bit integers via the
// classic unpacklo/unpackhi shuffle network.
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i &R0, __m128i &R1, __m128i &R2,
                                     __m128i &R3) {
  __m128i T0, T1, T2, T3;
  T0 = _mm_unpacklo_epi32(R0, R1);
  T1 = _mm_unpacklo_epi32(R2, R3);
  T2 = _mm_unpackhi_epi32(R0, R1);
  T3 = _mm_unpackhi_epi32(R2, R3);
  R0 = _mm_unpacklo_epi64(T0, T1);
  R1 = _mm_unpackhi_epi64(T0, T1);
  R2 = _mm_unpacklo_epi64(T2, T3);
  R3 = _mm_unpackhi_epi64(T2, T3);
}
#endif
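
The following is a minimal sketch of how the SSE2 helpers above could be exercised; it is not part of the generated header. It assumes QUALIFIERS expands to a plain inline qualifier (the header expects the includer to define it) and that the file is reachable as "myintrin.h".

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

#define QUALIFIERS inline
#include "myintrin.h"

int main() {
  // Values above INT32_MAX would come out wrong with the plain signed
  // conversion _mm_cvtepi32_ps; _my_cvtepu32_ps handles them correctly.
  const uint32_t in[4] = {0u, 1u, 2147483648u, 4294967295u};
  __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
  alignas(16) float out[4];
  _mm_store_ps(out, _my_cvtepu32_ps(v));
  for (int i = 0; i < 4; ++i)
    std::printf("%u -> %.1f\n", in[i], out[i]);
}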

#if defined(__SSE4_1__) || defined(_MSC_VER)
// Convert two unsigned 64-bit integers to double precision. AVX-512VL has a
// native instruction and clang can lower the conversion itself; the generic
// fallback plants the high 32 bits of each lane into a double with a 2^84
// exponent and the low 32 bits into one with a 2^52 exponent, then subtracts
// the combined bias so that (hi*2^32 - 2^52) + (2^52 + lo) = hi*2^32 + lo.
// The trick depends on exact evaluation order, so GCC is told not to
// reassociate the subtract/add pair under -ffast-math.
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 &&            \
    !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d
_my_cvtepu64_pd(const __m128i x) {
#ifdef __AVX512VL__
  return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(16))))x, __m128d);
#else
  __m128i xH = _mm_srli_epi64(x, 32);
  xH = _mm_or_si128(
      xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
  __m128i xL = _mm_blend_epi16(
      x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
  __m128d f =
      _mm_sub_pd(_mm_castsi128_pd(xH),
                 _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
  return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
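
For reference, a scalar sketch of the same bias trick (my illustration, not part of the header): the bit patterns 0x453...0 and 0x433...0 are the doubles 2^84 and 2^52 with the integer halves OR-ed into their mantissas.

#include <cstdint>
#include <cstring>

inline double cvtu64_pd_scalar(uint64_t x) {
  // hi becomes the double 2^84 + (x >> 32)*2^32, lo becomes 2^52 + (x & 0xffffffff).
  uint64_t hi_bits = (x >> 32) | 0x4530000000000000ull;           // 2^84 exponent
  uint64_t lo_bits = (x & 0xffffffffull) | 0x4330000000000000ull; // 2^52 exponent
  double hi, lo;
  std::memcpy(&hi, &hi_bits, sizeof hi);
  std::memcpy(&lo, &lo_bits, sizeof lo);
  // Subtracting the combined bias 2^84 + 2^52 leaves hi*2^32 + lo, rounded once.
  return (hi - 19342813118337666422669312.) + lo;
}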

#ifdef __AVX2__
// _mm256_set_m128i / _mm256_set_m128d are only available from GCC 8 on, so
// older GCC releases emulate them with a cast plus a 128-bit lane insert.
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128i(hi, lo);
#else
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}

QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128d(hi, lo);
#else
  return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}

// 256-bit variant of _my_cvtepu32_ps: the same 2 * (v >> 1) + (v & 1) split.
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) {
#ifdef __AVX512VL__
  return _mm256_cvtepu32_ps(v);
#else
  __m256i v2 = _mm256_srli_epi32(v, 1);
  __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
  __m256 v2f = _mm256_cvtepi32_ps(v2);
  __m256 v1f = _mm256_cvtepi32_ps(v1);
  return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}

// 256-bit variant of _my_cvtepu64_pd, using the same 2^84 / 2^52 bias trick.
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 &&            \
    !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d
_my256_cvtepu64_pd(const __m256i x) {
#ifdef __AVX512VL__
  return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(32))))x, __m256d);
#else
  __m256i xH = _mm256_srli_epi64(x, 32);
  xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(
                               19342813113834066795298816.))); // 2^84
  __m256i xL = _mm256_blend_epi16(
      x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)),
      0xcc); // 2^52
  __m256d f = _mm256_sub_pd(
      _mm256_castsi256_pd(xH),
      _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
  return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
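
A short usage sketch for the lane-composition helpers (my example, not from the header), assuming AVX2 is enabled (e.g. compiled with -mavx2) and QUALIFIERS is defined as in the earlier sketch:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

#define QUALIFIERS inline
#include "myintrin.h"

int main() {
  __m128i lo = _mm_set1_epi32(1);
  __m128i hi = _mm_set1_epi32(2);
  // Glue two 128-bit halves into one 256-bit vector: lo fills the low lane,
  // hi the high lane.
  __m256i v = _my256_set_m128i(hi, lo);
  alignas(32) int32_t out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(out), v);
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", out[i]); // prints: 1 1 1 1 2 2 2 2
  std::printf("\n");
}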

#ifdef __AVX512F__
// Compose a 512-bit integer vector from four 128-bit lanes (a is the lowest
// lane, d the highest) and a 512-bit double vector from two 256-bit halves.
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b,
                                    __m128i a) {
  return _mm512_inserti32x4(
      _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c,
                         2),
      d, 3);
}

QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a) {
  return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif