#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
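// Convert four unsigned 32-bit integers to single-precision floats.
// SSE2 only offers the signed conversion _mm_cvtepi32_ps, so when no native
// _mm_cvtepu32_ps is available the value is decomposed as 2 * (v >> 1) + (v & 1),
// where both parts fit into the signed 32-bit range.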
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) {
#if defined(__AVX512VL__) || defined(__AVX10_1__)
  return _mm_cvtepu32_ps(v);
#else
  __m128i v2 = _mm_srli_epi32(v, 1);
  __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
  __m128 v2f = _mm_cvtepi32_ps(v2);
  __m128 v1f = _mm_cvtepi32_ps(v1);
  return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
#endif

#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
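// Convert two unsigned 64-bit integers to double precision. Without a native
// _mm_cvtepu64_pd (AVX-512VL / AVX10.1) or clang's __builtin_convertvector,
// the fallback packs the upper and lower 32-bit halves of each lane into the
// mantissas of the doubles 2^84 and 2^52 and removes the offsets again with
// one subtraction and one addition:
//   19342813113834066795298816. = 2^84
//   19342813118337666422669312. = 2^84 + 2^52
//   0x0010000000000000          = 2^52
// The attribute below (applied on GCC 5+ when the fallback is used) disables
// associative-math optimizations, since reassociating the final subtraction
// and addition would break the exact cancellation the trick relies on.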
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && \
    __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d
_my_cvtepu64_pd(const __m128i x) {
#if defined(__AVX512VL__) || defined(__AVX10_1__)
  return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(16))))x, __m128d);
#else
  __m128i xH = _mm_srli_epi64(x, 32);
  xH = _mm_or_si128(
      xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));  // 2^84
  __m128i xL = _mm_blend_epi16(
      x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);  // 2^52
  __m128d f =
      _mm_sub_pd(_mm_castsi128_pd(xH),
                 _mm_set1_pd(19342813118337666422669312.));  // 2^84 + 2^52
  return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif

#ifdef __AVX2__
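// Build a 256-bit register from two 128-bit halves. The version check below
// assumes _mm256_set_m128i / _mm256_set_m128d are available from GCC 8 on
// (and on clang/MSVC); older GCC falls back to a cast plus insertf128.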
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128i(hi, lo);
#else
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}

QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) {
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
  return _mm256_set_m128d(hi, lo);
#else
  return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}

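// AVX2 counterpart of _my_cvtepu32_ps above: unsigned 32-bit to float via
// 2 * (v >> 1) + (v & 1) when no native _mm256_cvtepu32_ps is available.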
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) {
#if defined(__AVX512VL__) || defined(__AVX10_1__)
  return _mm256_cvtepu32_ps(v);
#else
  __m256i v2 = _mm256_srli_epi32(v, 1);
  __m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
  __m256 v2f = _mm256_cvtepi32_ps(v2);
  __m256 v1f = _mm256_cvtepi32_ps(v1);
  return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}

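// AVX2 counterpart of _my_cvtepu64_pd above: the same 2^84 / 2^52
// mantissa-packing trick, again compiled with optimize("no-associative-math")
// on GCC so that the final subtraction and addition are not reassociated.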
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && \
    __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d
_my256_cvtepu64_pd(const __m256i x) {
#if defined(__AVX512VL__) || defined(__AVX10_1__)
  return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
  return __builtin_convertvector(
      (uint64_t __attribute__((__vector_size__(32))))x, __m256d);
#else
  __m256i xH = _mm256_srli_epi64(x, 32);
  xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(
                               19342813113834066795298816.)));  // 2^84
  __m256i xL = _mm256_blend_epi16(
      x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)),
      0xcc);  // 2^52
  __m256d f = _mm256_sub_pd(
      _mm256_castsi256_pd(xH),
      _mm256_set1_pd(19342813118337666422669312.));  // 2^84 + 2^52
  return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif

#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
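// Assemble a 512-bit register from 128-bit / 256-bit pieces; arguments are
// ordered from the highest lane (d or b) down to the lowest lane (a).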
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b,
                                    __m128i a) {
  return _mm512_inserti32x4(
      _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c,
                         2),
      d, 3);
}

QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a) {
  return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);