#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE
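// __to_masktype
// Reinterprets a vector or _SimdWrapper of element type _Tp as the same data
// with the equally sized signed integer element type
// (__int_for_sizeof_t<_Tp>). This is the canonical element type used for
// vector masks in the implementation below.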
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  {
    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
      __x._M_data);
  }
template <typename _TV,
          typename _TVT
          = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
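// __interleave128_lo
// Interleaves the lower halves of every 128-bit lane of __av and __bv. E.g.
// for two 4 x float SSE vectors {a0,a1,a2,a3} and {b0,b1,b2,b3} the result is
// {a0, b0, a1, b1}; for wider vectors the same pattern is applied per
// 128-bit lane.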
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }
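// __is_zero
// Returns true iff all bits of the vector __a are zero. Uses the (v)ptest
// family where available; during constant evaluation and without SSE4.1 it
// falls back to OR-reducing 64-bit chunks.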
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (__have_avx)
          {
            if constexpr (_TVT::template _S_is<float, 8>)
              return _mm256_testz_ps(__a, __a);
            else if constexpr (_TVT::template _S_is<double, 4>)
              return _mm256_testz_pd(__a, __a);
            else if constexpr (sizeof(_Tp) == 32)
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<float>)
              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<double, 2>)
              return _mm_testz_pd(__a, __a);
            else
              return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
          }
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
                                 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;

    const auto __b = __vector_bitcast<_LLong>(__a);
    if constexpr (sizeof(__b) == 16)
      return (__b[0] | __b[1]) == 0;
    else if constexpr (sizeof(__b) == 32)
      return __is_zero(__lo128(__b) | __hi128(__b));
    else if constexpr (sizeof(__b) == 64)
      return __is_zero(__lo256(__b) | __hi256(__b));
    else
      __assert_unreachable<_Tp>();
  }
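// __movemask
// Collects the most significant bit of every element into an integer bitmask
// (movmskps/movmskpd/pmovmskb for 128- and 256-bit vectors).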
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }
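// __testz / __testc / __testnzc
// Wrappers around the (v)ptest family of instructions (SSE4.1/AVX): __testz
// reports the ZF result (set iff (a & b) is all zeros), __testc the CF result
// (set iff (~a & b) is all zeros), and __testnzc is nonzero iff both flags
// would be clear. Constant evaluation and pre-SSE4.1 targets use
// __is_zero / __movemask based fallbacks instead.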
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
            else
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_testc_ps(__a, __b);
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_testc_pd(__a, __b);
        else
          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testnzc_ps(__a, __b);
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testnzc_pd(__a, __b);
            else
              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          return __movemask(0 == __and(__a, __b)) == 0
                 && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }
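// __xzyw
// Shuffles the complete vector, swapping the inner two quarters, e.g. a
// 4 x double AVX vector {x, y, z, w} becomes {x, z, y, w}. Useful for fixing
// up the element order after lane-crossing pack/unpack operations.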
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 16)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
                                                   __x[5], __x[2], __x[3],
                                                   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }
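// __maskload_epi32 / __maskload_epi64 / __maskload_ps / __maskload_pd
// Thin wrappers that dispatch to the 128- or 256-bit AVX/AVX2 masked-load
// intrinsics depending on the width of the mask argument __k.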
template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi32(const int* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi32(__ptr, __k);
    else
      return _mm256_maskload_epi32(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_epi64(const _LLong* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_epi64(__ptr, __k);
    else
      return _mm256_maskload_epi64(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_ps(const float* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_ps(__ptr, __k);
    else
      return _mm256_maskload_ps(__ptr, __k);
  }

template <typename _Tp>
  _GLIBCXX_SIMD_INTRINSIC auto
  __maskload_pd(const double* __ptr, _Tp __k)
  {
    if constexpr (sizeof(__k) == 16)
      return _mm_maskload_pd(__ptr, __k);
    else
      return _mm256_maskload_pd(__ptr, __k);
  }
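// __movm<_Np, _Tp>
// Expands an AVX512 bitmask __k into a vector mask of _Np elements of
// sizeof(_Tp) bytes each (vpmovm2{b,w,d,q}), choosing the 128/256/512-bit
// variant from _Np and the available AVX512 subsets.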
template <size_t _Np, typename _Tp, typename _Kp>
  _GLIBCXX_SIMD_INTRINSIC constexpr auto
  __movm(_Kp __k) noexcept
  {
    static_assert(is_unsigned_v<_Kp>);
    if constexpr (sizeof(_Tp) == 1 && __have_avx512bw)
      {
        if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b128(__k);
        else if constexpr (_Np <= 32 && __have_avx512vl)
          return __builtin_ia32_cvtmask2b256(__k);
        else
          return __builtin_ia32_cvtmask2b512(__k);
      }
    else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw)
      {
        if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w128(__k);
        else if constexpr (_Np <= 16 && __have_avx512vl)
          return __builtin_ia32_cvtmask2w256(__k);
        else
          return __builtin_ia32_cvtmask2w512(__k);
      }
    else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq)
      {
        if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d128(__k);
        else if constexpr (_Np <= 8 && __have_avx512vl)
          return __builtin_ia32_cvtmask2d256(__k);
        else
          return __builtin_ia32_cvtmask2d512(__k);
      }
    else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq)
      {
        if constexpr (_Np <= 2 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q128(__k);
        else if constexpr (_Np <= 4 && __have_avx512vl)
          return __builtin_ia32_cvtmask2q256(__k);
        else
          return __builtin_ia32_cvtmask2q512(__k);
      }
    else
      __assert_unreachable<_Tp>();
  }
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
#include "simd_x86_conversions.h"
#endif

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_ps()
  {
    return __have_sse
           && is_same_v<_Tp, float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_sse_pd()
  {
    return __have_sse2
           && is_same_v<_Tp, double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_ps()
  {
    return __have_avx
           && is_same_v<_Tp, float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx_pd()
  {
    return __have_avx
           && is_same_v<_Tp, double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_ps()
  {
    return __have_avx512f
           && is_same_v<_Tp, float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }

template <typename _Tp, size_t _Np>
  constexpr bool
  __is_avx512_pd()
  {
    return __have_avx512f
           && is_same_v<_Tp, double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
  }
struct _MaskImplX86Mixin;

struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool
    _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<_From> && is_integral_v<_To>
                    && sizeof(_From) == 8 && _ToSize == 16)
        return (sizeof(_To) == 2 && !__have_ssse3)
               || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
        return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
                && !__have_avx512dq)
               || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
                   && _ToSize == 16);
      else if constexpr (is_integral_v<_From> && is_floating_point_v<_To>
                         && sizeof(_From) == 8 && !__have_avx512dq)
        return (sizeof(_To) == 4 && _ToSize == 16)
               || (sizeof(_To) == 8 && _ToSize < 64);
      else
        return false;
    }

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();
#endif // _GLIBCXX_SIMD_WORKAROUND_PR85048
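  // _S_store
  // Stores _Np elements of __x to __addr. For byte counts that are not a
  // power of two, AVX512BW+VL masked stores avoid writing past the end;
  // otherwise the generic builtin implementation is used.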
  using _CommonImplBuiltin::_S_store;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store(__x, __addr);
      else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
        {
          const auto __v = __to_intrin(__x);

          if constexpr (_Bytes & 1)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
                                     __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
                                        __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi8(__addr,
                                        0xffffffffffffffffull >> (64 - _Bytes),
                                        __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 2)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi16(__addr,
                                         0xffffffffull >> (32 - _Bytes / 2),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 4)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else
            {
              static_assert(
                _Bytes > 16,
                "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
                "- 1)) != 0 is impossible");
              if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
                                         __intrin_bitcast<__m512i>(__v));
            }
        }
      else
        _CommonImplBuiltin::_S_store(__x, __addr);
    }
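  // _S_store_bool_array
  // Expands the bitmask __x into an array of bool (one byte per element) at
  // __mem, using vpmovm2b (AVX512BW+VL), pdep (BMI2), or an SSE2 unpack and
  // compare sequence, with a generic fallback otherwise.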
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if (__builtin_is_constant_evaluated())
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
      else if constexpr (__have_avx512bw_vl)
        _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
                            [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                              if constexpr (_Np <= 16)
                                return _mm_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 32)
                                return _mm256_movm_epi8(__x._M_to_bits());
                              else if constexpr (_Np <= 64)
                                return _mm512_movm_epi8(__x._M_to_bits());
                              else
                                __assert_unreachable<_SizeConstant<_Np>>();
                            }()),
                      __mem);
      else if constexpr (__have_bmi2)
        {
          if constexpr (_Np <= 4)
            _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
          else
            __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
              [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                constexpr size_t __offset = __i * sizeof(size_t);
                constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                if constexpr (__todo == 1)
                  __mem[__offset] = __x[__offset];
                else
                  {
                    const auto __bools =
#ifdef __x86_64__
                      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
                                0x0101010101010101ULL);
#else
                      _pdep_u32(
                        __x.template _M_extract<__offset>()._M_to_bits(),
                        0x01010101U);
#endif
                    _S_store<__todo>(__bools, __mem + __offset);
                  }
              });
        }
      else if constexpr (__have_sse2 && _Np > 7)
        __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          constexpr int __offset = __i * 16;
          constexpr int __todo = std::min(16, int(_Np) - __offset);
          const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
          __vector_type16_t<_UChar> __bools;
          if constexpr (__have_avx512f)
            {
              auto __as32bits
                = _mm512_maskz_mov_epi32(__bits, __to_intrin(
                                                   __vector_broadcast<16>(1)));
              auto __as16bits
                = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
                                            __todo > 8 ? __hi256(__as32bits)
                                                       : __m256i()));
              __bools = __vector_bitcast<_UChar>(
                _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
            }
          else
            {
              using _V = __vector_type_t<_UChar, 16>;
              auto __tmp = _mm_cvtsi32_si128(__bits);
              __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
              _V __tmp2 = reinterpret_cast<_V>(__tmp);
              __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
                           1, 2, 4, 8, 16, 32, 64, 128};
              __bools = (__tmp2 == 0) + 1;
            }
          _S_store<__todo>(__bools, __mem + __offset);
        });
      else
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }
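  // _S_blend_avx512
  // Returns __k ? __b : __a, where __k is an AVX512 bitmask, either via the
  // vpblendm*/vblendm* builtins or by expanding the bitmask with __movm and
  // using the conditional operator.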
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
#ifdef __clang__
      return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
#else
      using _IntT
        = conditional_t<(sizeof(_Tp) > 2),
                        conditional_t<sizeof(_Tp) == 4, int, long long>,
                        conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
        }
#endif
    }
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp
    _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      constexpr struct
      {
        _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
                                                  __m128 __k) const noexcept
        {
          return __builtin_ia32_blendvps(__a, __b, __k);
        }

        _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
                                                   __m128d __k) const noexcept
        {
          return __builtin_ia32_blendvpd(__a, __b, __k);
        }

        _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
                                                   __m128i __k) const noexcept
        {
          return reinterpret_cast<__m128i>(
            __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
                                       reinterpret_cast<__v16qi>(__b),
                                       reinterpret_cast<__v16qi>(__k)));
        }

        _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
                                                  __m256 __k) const noexcept
        {
          return __builtin_ia32_blendvps256(__a, __b, __k);
        }

        _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
                                                   __m256d __k) const noexcept
        {
          return __builtin_ia32_blendvpd256(__a, __b, __k);
        }

        _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
                                                   __m256i __k) const noexcept
        {
          if constexpr (__have_avx2)
            return reinterpret_cast<__m256i>(
              __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
                                         reinterpret_cast<__v32qi>(__b),
                                         reinterpret_cast<__v32qi>(__k)));
          else
            return reinterpret_cast<__m256i>(
              __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
                                         reinterpret_cast<__v8sf>(__b),
                                         reinterpret_cast<__v8sf>(__k)));
        }
      } __eval;
      return __eval(__a, __b, __k);
    }
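  // _S_blend
  // Selects __at1 where the mask is set and __at0 elsewhere. The first
  // overload takes an AVX512 bitmask (_SimdWrapper<bool, _Np>), the second a
  // vector mask of signed integers; both fall back to
  // __or/__andnot/__and bit manipulation when no blend instruction applies.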
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
             _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
          && __at1._M_is_constprop())
        return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
                 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return __k[__i] ? __at1[__i] : __at0[__i];
                 });
      else if constexpr (sizeof(__at0) == 64
                         || (__have_avx512vl && sizeof(__at0) >= 16))
        return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
        {
          static_assert((__have_avx512vl && sizeof(__at0) < 16)
                        || !__have_avx512vl);
          constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
          return __vector_bitcast<_Tp, _Np>(
            _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
                            __vector_bitcast<_Tp, __size>(__at1)));
        }
    }
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
          || (__kk._M_is_constprop() && __at0._M_is_constprop()
              && __at1._M_is_constprop()))
        {
          auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
          if (__r._M_is_constprop())
            return __r;
        }
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
                    && (sizeof(_Tp) >= 4 || __have_avx512bw))
        return _S_blend(
                 _SimdWrapper<bool, _Np>(
                   __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
                     ._M_to_bits()),
                 __at0, __at1);
      else
        {
          if constexpr (__have_sse4_1)
            return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
                                   __to_intrin(__at1));
          else
            return __or(__andnot(__kk, __at0), __and(__kk, __at1));
        }
    }
};
template <typename _Abi, typename>
  struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size
        = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw   ? 64
          : (is_floating_point_v<_Tp> && __have_avx) || __have_avx2 ? 32
                                                                    : 16;

    using _MaskImpl = typename _Abi::_MaskImpl;
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        static_assert(_Np == _S_size<_Tp>);
        if constexpr (is_same_v<_Tp, _Up>
                      || (sizeof(_Tp) == sizeof(_Up)
                          && is_integral_v<_Tp> == is_integral_v<_Up>))
          {
            [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
            if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                          && sizeof(_Tp) == 1)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
                else if constexpr (sizeof(__merge) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
                               && sizeof(_Tp) == 2)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 4 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_ps(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_ps(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_epi32(reinterpret_cast<const int*>(__mem),
                                            __to_intrin(__k))));
              }
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_ps(reinterpret_cast<const float*>(__mem),
                                         __to_intrin(__k))));
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 8 && is_integral_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
                               && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
              {
                const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
                if constexpr (sizeof(__intrin) == 16)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 32)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm256_mask_loadu_pd(__intrin, __kk, __mem));
                else if constexpr (sizeof(__intrin) == 64)
                  __merge = __vector_bitcast<_Tp, _Np>(
                    _mm512_mask_loadu_pd(__intrin, __kk, __mem));
                else
                  __assert_unreachable<_Tp>();
              }
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Up>)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(__maskload_epi64(
                           reinterpret_cast<const _LLong*>(__mem),
                           __to_intrin(__k))));
              }
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              {
                static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
                __merge
                  = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
                         __vector_bitcast<_Tp, _Np>(
                           __maskload_pd(reinterpret_cast<const double*>(__mem),
                                         __to_intrin(__k))));
              }
            else
              _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
                                        [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                                          __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
                                        });
          }
        else
          __merge = _Base::_S_masked_load(__merge, __k, __mem);
        return __merge;
      }
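    // _S_masked_store_nocvt
    // Stores the elements of __v selected by __k to __mem without any type
    // conversion. One overload takes an AVX512 bitmask, the other a vector
    // mask; they use AVX512 masked stores or AVX/AVX2 vmaskmov and defer to
    // the generic implementation otherwise.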
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<bool, _Np> __k)
      {
        [[maybe_unused]] const auto __vi = __to_intrin(__v);
        if constexpr (sizeof(__vi) == 64)
          {
            static_assert(sizeof(__v) == 64 && __have_avx512f);
            if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
              _mm512_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
              _mm512_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm512_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm512_mask_storeu_pd(__mem, __k, __vi);
              }
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (sizeof(__vi) == 32)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm256_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm256_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                               && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else if constexpr (sizeof(__vi) == 16)
          {
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, __k, __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, __k, __vi);
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi32(__mem, __k, __vi);
                else
                  _mm_mask_storeu_ps(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
              {
                if constexpr (is_integral_v<_Tp>)
                  _mm_mask_storeu_epi64(__mem, __k, __vi);
                else
                  _mm_mask_storeu_pd(__mem, __k, __vi);
              }
            else if constexpr (__have_avx512f
                               && (sizeof(_Tp) >= 4 || __have_avx512bw))
              {
                _S_masked_store_nocvt(
                  _SimdWrapper64<_Tp>(
                    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
                  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
              }
            else
              _S_masked_store_nocvt(__v, __mem,
                                    _MaskImpl::template _S_to_maskvector<
                                      __int_for_sizeof_t<_Tp>, _Np>(__k));
          }
        else
          __assert_unreachable<_Tp>();
      }
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
      {
        if constexpr (sizeof(__v) <= 16)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m128i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m128i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Tp>)
              _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 4)
              _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                               __vector_bitcast<float>(__vi));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Tp>)
              _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                               __vector_bitcast<double>(__vi));
            else
              _Base::_S_masked_store_nocvt(__v, __mem, __k);
          }
        else if constexpr (sizeof(__v) == 32)
          {
            [[maybe_unused]] const auto __vi
              = __intrin_bitcast<__m256i>(__as_vector(__v));
            [[maybe_unused]] const auto __ki
              = __intrin_bitcast<__m256i>(__as_vector(__k));
            if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
              _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
            else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
              _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
            else if constexpr (__have_avx2 && sizeof(_Tp) == 4
                               && is_integral_v<_Tp>)
              _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
            else if constexpr (sizeof(_Tp) == 4)
              _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
                                  __vector_bitcast<float>(__v));
            else if constexpr (__have_avx2 && sizeof(_Tp) == 8
                               && is_integral_v<_Tp>)
              _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
                                     __vi);
            else if constexpr (__have_avx && sizeof(_Tp) == 8)
              _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
                                  __vector_bitcast<double>(__v));
            else
              _Base::_S_masked_store_nocvt(__v, __mem, __k);
          }
        else
          __assert_unreachable<_Tp>();
      }
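    // _S_masked_store
    // Masked store with narrowing conversion: when _Tp and _Up are integers
    // of different width and AVX512 (with BW/VL as required) is available,
    // the down-converting vpmov* masked stores handle conversion and masking
    // in one instruction; otherwise the generic base implementation is used.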
    template <typename _Tp, size_t _Np, typename _Up>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
                      const _MaskMember<_Tp> __k) noexcept
      {
        if constexpr (is_integral_v<_Tp> && is_integral_v<_Up>
                      && sizeof(_Tp) > sizeof(_Up)
                      && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
                      && (sizeof(__v) == 64 || __have_avx512vl))
          {
            const auto __vi = __to_intrin(__v);
            const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
            if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                          && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 64)
              _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 32)
              _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
                               && sizeof(__vi) == 16)
              _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }
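    // _S_multiplies
    // Element-wise multiplication. The interesting case is sizeof(_Tp) == 1:
    // x86 has no byte multiply, so the even and odd bytes are multiplied as
    // 16-bit lanes and recombined with a blend (or bitwise ops before
    // SSE4.1).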
    template <typename _V, typename _VVT = _VectorTraits<_V>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _V
      _S_multiplies(_V __x, _V __y)
      {
        using _Tp = typename _VVT::value_type;
        if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
            || __y._M_is_constprop())
          return __as_vector(__x) * __as_vector(__y);
        else if constexpr (sizeof(_Tp) == 1)
          {
            if constexpr (sizeof(_V) == 2)
              {
                const auto __xs = reinterpret_cast<short>(__x._M_data);
                const auto __ys = reinterpret_cast<short>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
                  ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
              }
            else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 3>>(
                  ((__xi * __yi) & 0xff)
                  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                  | ((__xi >> 16) * (__yi & 0xff0000)));
              }
            else if constexpr (sizeof(_V) == 4)
              {
                const auto __xi = reinterpret_cast<int>(__x._M_data);
                const auto __yi = reinterpret_cast<int>(__y._M_data);
                return reinterpret_cast<__vector_type_t<_Tp, 4>>(
                  ((__xi * __yi) & 0xff)
                  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
                  | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
                  | ((__xi >> 24) * (__yi & 0xff000000u)));
              }
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                               && is_signed_v<_Tp>)
              return __convert<typename _VVT::type>(
                __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
                * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
            else if constexpr (sizeof(_V) == 8 && __have_avx2
                               && is_unsigned_v<_Tp>)
              return __convert<typename _VVT::type>(
                __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
                * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
            else
              {
                constexpr size_t __full_size = _VVT::_S_full_size;
                constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
                using _ShortW = _SimdWrapper<short, _Np>;
                const _ShortW __even = __vector_bitcast<short, _Np>(__x)
                                       * __vector_bitcast<short, _Np>(__y);
                _ShortW __high_byte = _ShortW()._M_data - 256;
                const _ShortW __odd
                  = (__vector_bitcast<short, _Np>(__x) >> 8)
                    * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
                if constexpr (__have_avx512bw && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_avx512(
                    0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
                    __vector_bitcast<_Tp>(__odd));
                else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
                  return _CommonImplX86::_S_blend_intrin(
                    __to_intrin(__high_byte), __to_intrin(__even),
                    __to_intrin(__odd));
                else
                  return __to_intrin(
                    __or(__andnot(__high_byte, __even), __odd));
              }
          }
        else
          return _Base::_S_multiplies(__x, __y);
      }
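    // _S_divides (_GLIBCXX_SIMD_WORKAROUND_PR90993)
    // Integer division for element types of at most 4 bytes is carried out in
    // float/double, which is exact for these value ranges and considerably
    // faster than scalarized integer division.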
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if (!__builtin_is_constant_evaluated()
            && !__builtin_constant_p(__y._M_data))
          if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
            {
              using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
              constexpr size_t __n_intermediate
                = std::min(_Np, (__have_avx512f ? 64
                                 : __have_avx   ? 32
                                                : 16) / sizeof(_Float));
              using _FloatV = __vector_type_t<_Float, __n_intermediate>;
              constexpr size_t __n_floatv
                = __div_roundup(_Np, __n_intermediate);
              using _R = __vector_type_t<_Tp, _Np>;
              const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
              const auto __yf = __convert_all<_FloatV, __n_floatv>(
                _Abi::__make_padding_nonzero(__as_vector(__y)));
              return __call_with_n_evaluations<__n_floatv>(
                [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                  return __vector_convert<_R>(__quotients...);
                },
                [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
                  -> _SimdWrapper<_Float, __n_intermediate>
                {
1446 #if __RECIPROCAL_MATH__ 1452 if constexpr (__have_avx)
1456 if constexpr (
sizeof(_Tp) == 4)
1457 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}
" 1459 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1461 asm("vdivps\t{%2, %1, %0|%0, %1, %2}
" 1463 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1468 if constexpr (sizeof(_Tp) == 4) 1469 asm("divpd\t{%1, %0|%0, %1}
" 1473 asm("divps\t{%1, %0|%0, %1}
" 1479 return __xf[__i] / __yf[__i]; 1483 /* 64-bit int division is potentially optimizable via double division if 1484 * the value in __x is small enough and the conversion between 1485 * int<->double is efficient enough: 1486 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1489 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1491 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1492 0xffe0'0000'0000'0000ull})) 1494 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1499 return _Base::_S_divides(__x, __y); 1502 using _Base::_S_divides; 1503 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1507 template <typename _Tp, size_t _Np> 1508 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1509 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1511 if (__builtin_is_constant_evaluated() 1512 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1513 return _Base::_S_modulus(__x, __y); 1515 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1519 // _S_bit_shift_left {{{ 1520 // Notes on UB. C++2a [expr.shift] says: 1521 // -1- [...] The operands shall be of integral or unscoped enumeration type 1522 // and integral promotions are performed. The type of the result is that 1523 // of the promoted left operand. The behavior is undefined if the right 1524 // operand is negative, or greater than or equal to the width of the 1525 // promoted left operand. 1526 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1527 // 2^N, where N is the width of the type of the result. 1529 // C++17 [expr.shift] says: 1530 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1531 // bits are zero-filled. If E1 has an unsigned type, the value of the 1532 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1533 // representable in the result type. Otherwise, if E1 has a signed type 1534 // and non-negative value, and E1 × 2^E2 is representable in the 1535 // corresponding unsigned type of the result type, then that value, 1536 // converted to the result type, is the resulting value; otherwise, the 1537 // behavior is undefined. 1540 // With C++2a signed and unsigned types have the same UB 1542 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1544 // With C++17 there's little room for optimizations because the standard 1545 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1546 // short and char shifts must assume shifts affect bits of neighboring 1548 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1549 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1550 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1551 _S_bit_shift_left(_Tp __xx, int __y) 1553 using _V = typename _TVT::type; 1554 using _Up = typename _TVT::value_type; 1556 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1557 if (__builtin_is_constant_evaluated()) 1559 #if __cplusplus > 201703 1560 // after C++17, signed shifts have no UB, and behave just like unsigned 1562 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1563 return __vector_bitcast<_Up>( 1564 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1567 else if constexpr (sizeof(_Up) == 1) 1569 // (cf. 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1570 if (__builtin_constant_p(__y)) 1581 else if (__y > 2 && __y < 8) 1583 if constexpr (sizeof(__x) > sizeof(unsigned)) 1585 const _UChar __mask = 0xff << __y; // precomputed vector 1586 return __vector_bitcast<_Up>( 1587 __vector_bitcast<_UChar>( 1588 __vector_bitcast<unsigned>(__x) << __y) 1593 const unsigned __mask 1594 = (0xff & (0xff << __y)) * 0x01010101u; 1595 return reinterpret_cast<_V>( 1596 static_cast<__int_for_sizeof_t<_V>>( 1598 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1603 else if (__y >= 8 && __y < 32) 1606 __builtin_unreachable(); 1608 // general strategy in the following: use an sllv instead of sll 1609 // instruction, because it's 2 to 4 times faster: 1610 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1611 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1612 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1613 _mm256_set1_epi16(__y)))); 1614 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1615 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1616 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1617 _mm512_set1_epi16(__y)))); 1618 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1620 const auto __shift = _mm512_set1_epi16(__y); 1621 return __vector_bitcast<_Up>( 1622 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1623 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1624 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1625 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1627 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1630 const auto __shift = _mm_cvtsi32_si128(__y); 1632 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1633 __k |= _mm256_srli_epi16(__k, 8); 1634 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1637 const _Up __k = 0xff << __y; 1638 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) 1644 const auto __shift = _mm_cvtsi32_si128(__y); 1646 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1647 __k |= _mm_srli_epi16(__k, 8); 1648 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1654 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1655 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1656 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1658 using _V = typename _TVT::type; 1659 using _Up = typename _TVT::value_type; 1661 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1662 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1663 if (__builtin_is_constant_evaluated()) 1665 #if __cplusplus > 201703 1666 // after C++17, signed shifts have no UB, and behave just like unsigned 1668 else if constexpr (is_signed_v<_Up>) 1669 return __vector_bitcast<_Up>( 1670 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1671 __vector_bitcast<make_unsigned_t<_Up>>(__y))); 1673 else if constexpr (sizeof(_Up) == 1) 1675 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1676 return __vector_bitcast<_Up>(__concat( 1677 _mm512_cvtepi16_epi8( 1678 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1679 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1680 _mm512_cvtepi16_epi8( 1681 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1682 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1683 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1684 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1685 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1686 _mm512_cvtepu8_epi16(__iy)))); 1687 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1688 return __intrin_bitcast<_V>( 1689 
_mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1690 _mm_cvtepu8_epi16(__iy)))); 1691 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1692 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1693 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1694 _mm256_cvtepu8_epi16(__iy)))); 1695 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1696 return __intrin_bitcast<_V>( 1697 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1698 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1699 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1700 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1703 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); 1705 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1707 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1708 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1711 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1713 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1714 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1716 auto __x1 = __x + __x; 1717 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1718 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1720 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1722 else if constexpr (sizeof(__x) == 16) 1725 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); 1727 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1729 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1732 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1734 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1736 auto __x1 = __x + __x; 1737 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1739 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1744 else if constexpr (sizeof(_Up) == 2) 1746 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1747 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1748 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1749 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1750 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1751 return __vector_bitcast<_Up>( 1752 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1753 _mm512_castsi256_si512(__iy)))); 1754 else if constexpr (sizeof __ix == 32 && __have_avx2) 1756 const auto __ux = __vector_bitcast<unsigned>(__x); 1757 const auto __uy = __vector_bitcast<unsigned>(__y); 1758 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1759 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1760 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1762 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1763 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1764 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1765 return __intrin_bitcast<_V>( 1766 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1767 _mm512_castsi128_si512(__iy)))); 1768 else if constexpr (sizeof __ix == 16 && __have_avx2) 1770 const auto __ux = __vector_bitcast<unsigned>(__ix); 1771 const auto __uy = __vector_bitcast<unsigned>(__iy); 1772 return __intrin_bitcast<_V>(_mm_blend_epi16( 1773 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1774 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1776 else if constexpr (sizeof __ix == 16) 1778 using _Float4 = __vector_type_t<float, 4>; 1779 using _Int4 = __vector_type_t<int, 4>; 1780 using _UInt4 = __vector_type_t<unsigned, 4>; 1782 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 
3))); 1784 * __intrin_bitcast<_V>( 1785 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1786 reinterpret_cast<_Float4>(__yu << 23))) 1787 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>( 1788 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1792 __assert_unreachable<_Tp>(); 1794 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1796 // latency is suboptimal, but throughput is at full speedup 1797 return __intrin_bitcast<_V>( 1798 __vector_bitcast<unsigned>(__ix) 1799 * __vector_convert<__vector_type16_t<int>>( 1800 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1801 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); 1802 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1805 const auto __lo = _mm_sll_epi64(__ix, __iy); 1807 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1808 if constexpr (__have_sse4_1) 1809 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1811 return __vector_bitcast<_Up>( 1812 _mm_move_sd(__vector_bitcast<double>(__hi), 1813 __vector_bitcast<double>(__lo))); 1818 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1821 // _S_bit_shift_right {{{ 1822 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1823 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1824 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1825 _S_bit_shift_right(_Tp __xx, int __y) 1827 using _V = typename _TVT::type; 1828 using _Up = typename _TVT::value_type; 1830 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1831 if (__builtin_is_constant_evaluated()) 1833 else if (__builtin_constant_p(__y) 1835 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1837 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1838 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1841 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1842 return __intrin_bitcast<_V>( 1843 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) 1846 | (__vector_bitcast<_UShort>( 1847 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) 1851 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1852 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1855 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) 1856 & _Up(0xffff'ffff'0000'0000ull)) 1857 | __vector_bitcast<_Up>( 1858 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) 1862 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1864 | __vector_bitcast<_Up>( 1865 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) 1873 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1874 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1875 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1877 using _V = typename _TVT::type; 1878 using _Up = typename _TVT::value_type; 1880 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1881 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1882 if (__builtin_is_constant_evaluated() 1883 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1885 else if constexpr (sizeof(_Up) == 1) //{{{ 1887 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1888 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1889 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1890 _mm_cvtepi8_epi16(__iy)) 1891 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1892 _mm_cvtepu8_epi16(__iy)))); 1893 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1894 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1896 ? 
_mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1897 _mm256_cvtepi8_epi16(__iy)) 1898 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1899 _mm256_cvtepu8_epi16(__iy)))); 1900 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1901 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1903 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1904 _mm512_cvtepi8_epi16(__iy)) 1905 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1906 _mm512_cvtepu8_epi16(__iy)))); 1907 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1908 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1909 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1910 0x5555'5555'5555'5555ull, 1912 _mm512_slli_epi16(__ix, 8), 1913 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1914 _mm512_set1_epi16(8))))); 1915 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1916 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1917 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1918 0x5555'5555'5555'5555ull, 1920 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1921 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1922 /* This has better throughput but higher latency than the impl below 1923 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1926 const auto __shorts = __to_intrin(_S_bit_shift_right( 1927 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1928 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1929 return __vector_bitcast<_Up>( 1930 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1933 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1934 // the following uses vpsr[al]vd, which requires AVX2 1935 if constexpr (is_signed_v<_Up>) 1937 const auto r3 = __vector_bitcast<_UInt>( 1938 (__vector_bitcast<int>(__x) 1939 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1942 = __vector_bitcast<_UInt>( 1943 ((__vector_bitcast<int>(__x) << 8) 1944 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1947 = __vector_bitcast<_UInt>( 1948 ((__vector_bitcast<int>(__x) << 16) 1949 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1951 const auto r0 = __vector_bitcast<_UInt>( 1952 (__vector_bitcast<int>(__x) << 24) 1953 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1954 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1959 const auto r3 = (__vector_bitcast<_UInt>(__x) 1960 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1963 = ((__vector_bitcast<_UInt>(__x) << 8) 1964 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1967 = ((__vector_bitcast<_UInt>(__x) << 16) 1968 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1971 = (__vector_bitcast<_UInt>(__x) << 24) 1972 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 1973 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1976 else if constexpr (__have_sse4_1 1977 && is_unsigned_v<_Up> && sizeof(__x) > 2) 1979 auto __x128 = __vector_bitcast<_Up>(__ix); 1981 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 1982 auto __x4 = __vector_bitcast<_Up>( 1983 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 1984 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1985 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 1987 auto __x2 = __vector_bitcast<_Up>( 1988 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 1989 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1990 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 1992 auto __x1 = __vector_bitcast<_Up>( 1993 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 1994 __x128 = 
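// The __have_avx2 branch above emulates per-byte right shifts with the
// 32-bit variable shifts (vpsravd/vpsrlvd): each byte of a 32-bit lane is
// first moved to the top of the lane (<< 24, << 16, << 8, unshifted),
// shifted by its own count taken from the matching byte of __y, only the
// top byte of each partial result is kept, and the four partial results are
// then moved back to their byte positions and combined with |. Placing the
// byte at the top of the lane means the bits shifted in (zeros or sign
// copies) cannot spill into a neighbouring byte. A scalar model for one
// unsigned byte (illustrative only; ad-hoc name):
//
//   #include <cstdint>
//   inline std::uint8_t __byte_srl_via_u32(std::uint8_t __x, unsigned __y) // __y <= 7
//   {
//     const std::uint32_t __lane = std::uint32_t(__x) << 24; // byte at the top
//     return std::uint8_t((__lane >> __y) >> 24);            // shift, move back down
//   }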
__vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1995 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 1996 return __intrin_bitcast<_V>( 1998 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 1999 == 0)); // y > 7 nulls the result 2001 else if constexpr (__have_sse4_1 2002 && is_signed_v<_Up> && sizeof(__x) > 2) 2004 auto __mask = __vector_bitcast<_UChar>( 2005 __vector_bitcast<_UShort>(__iy) << 5); 2006 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2007 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2009 auto __xh = __vector_bitcast<short>(__ix); 2010 auto __xl = __vector_bitcast<short>(__ix) << 8; 2011 auto __xh4 = __xh >> 4; 2012 auto __xl4 = __xl >> 4; 2013 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2014 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2015 __xl = __vector_bitcast<short>( 2016 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2017 __to_intrin(__xl4))); 2019 auto __xh2 = __xh >> 2; 2020 auto __xl2 = __xl >> 2; 2021 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2022 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2023 __xl = __vector_bitcast<short>( 2024 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2025 __to_intrin(__xl2))); 2027 auto __xh1 = __xh >> 1; 2028 auto __xl1 = __xl >> 1; 2029 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2030 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2031 __xl = __vector_bitcast<short>( 2032 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2033 __to_intrin(__xl1))); 2034 return __intrin_bitcast<_V>( 2035 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2036 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2038 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2039 == 0)); // y > 7 nulls the result 2041 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2044 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2045 auto __x4 = __vector_bitcast<_Up>( 2046 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2047 __x = __mask > 0x7f ? __x4 : __x; 2049 auto __x2 = __vector_bitcast<_Up>( 2050 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2051 __x = __mask > 0x7f ? __x2 : __x; 2053 auto __x1 = __vector_bitcast<_Up>( 2054 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2055 __x = __mask > 0x7f ? __x1 : __x; 2057 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2059 else if constexpr (sizeof(__x) > 2) // signed SSE2 2061 static_assert(is_signed_v<_Up>); 2062 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2063 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2064 auto __xh = __vector_bitcast<short>(__x); 2065 auto __xl = __vector_bitcast<short>(__x) << 8; 2066 auto __xh4 = __xh >> 4; 2067 auto __xl4 = __xl >> 4; 2068 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2069 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2072 auto __xh2 = __xh >> 2; 2073 auto __xl2 = __xl >> 2; 2074 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2075 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2078 auto __xh1 = __xh >> 1; 2079 auto __xl1 = __xl >> 1; 2080 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2081 __xl = __maskl > 0x7fff ? 
__xl1 : __xl; 2082 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2083 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2086 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2091 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2093 [[maybe_unused]] auto __blend_0xaa 2094 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2095 if constexpr (sizeof(__a) == 16) 2096 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2098 else if constexpr (sizeof(__a) == 32) 2099 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2101 else if constexpr (sizeof(__a) == 64) 2102 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2105 __assert_unreachable<decltype(__a)>(); 2107 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2108 return __intrin_bitcast<_V>(is_signed_v<_Up> 2109 ? _mm_srav_epi16(__ix, __iy) 2110 : _mm_srlv_epi16(__ix, __iy)); 2111 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2112 return __vector_bitcast<_Up>(is_signed_v<_Up> 2113 ? _mm256_srav_epi16(__ix, __iy) 2114 : _mm256_srlv_epi16(__ix, __iy)); 2115 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2116 return __vector_bitcast<_Up>(is_signed_v<_Up> 2117 ? _mm512_srav_epi16(__ix, __iy) 2118 : _mm512_srlv_epi16(__ix, __iy)); 2119 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2120 return __intrin_bitcast<_V>( 2121 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) 2122 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2124 __vector_bitcast<int>(__ix) 2125 >> (__vector_bitcast<int>(__iy) >> 16))); 2126 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2127 return __intrin_bitcast<_V>( 2128 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2129 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2130 __vector_bitcast<_UInt>(__ix) 2131 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2132 else if constexpr (__have_sse4_1) 2134 auto __mask = __vector_bitcast<_UShort>(__iy); 2135 auto __x128 = __vector_bitcast<_Up>(__ix); 2137 __mask = (__mask << 3) | (__mask << 11); 2138 // do __x128 = 0 where __y[4] is set 2139 __x128 = __vector_bitcast<_Up>( 2140 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2141 __to_intrin(__mask))); 2142 // do __x128 =>> 8 where __y[3] is set 2143 __x128 = __vector_bitcast<_Up>( 2144 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2145 __to_intrin(__mask += __mask))); 2146 // do __x128 =>> 4 where __y[2] is set 2147 __x128 = __vector_bitcast<_Up>( 2148 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2149 __to_intrin(__mask += __mask))); 2150 // do __x128 =>> 2 where __y[1] is set 2151 __x128 = __vector_bitcast<_Up>( 2152 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2153 __to_intrin(__mask += __mask))); 2154 // do __x128 =>> 1 where __y[0] is set 2155 return __intrin_bitcast<_V>( 2156 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2157 __to_intrin(__mask + __mask))); 2161 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2162 auto __x128 = __vector_bitcast<_Up>(__ix); 2164 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2165 return __vector_bitcast<short>(__kk) < 0; 2167 // do __x128 = 0 where __y[4] is set 2168 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2169 // do __x128 =>> 8 where __y[3] is set 2170 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2171 // do __x128 =>> 4 where __y[2] is set 2172 __x128 = __mask(__k += __k) ? 
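// Without per-element 8-bit (and, before AVX-512BW, 16-bit) shift
// instructions, the paths above process the shift count one bit at a time,
// highest bit first: each step conditionally replaces an element by the
// element shifted by 8, 4, 2 or 1 (or by zero for out-of-range counts), and
// the selection mask is doubled between steps so that the next count bit
// moves into the sign-bit position tested by pblendvb / the `< 0` compare.
// A scalar model of the 16-bit right-shift variant (illustrative only;
// ad-hoc name, count taken modulo 32 as in the code above):
//
//   #include <cstdint>
//   inline std::uint16_t __srl16_bitwise(std::uint16_t __x, unsigned __y)
//   {
//     if (__y & 16) __x = 0;      // count >= 16 clears the element
//     if (__y & 8)  __x >>= 8;
//     if (__y & 4)  __x >>= 4;
//     if (__y & 2)  __x >>= 2;
//     if (__y & 1)  __x >>= 1;
//     return __x;
//   }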
__x128 >> 4 : __x128; 2173 // do __x128 =>> 2 where __y[1] is set 2174 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2175 // do __x128 =>> 1 where __y[0] is set 2176 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2180 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2182 if constexpr (is_unsigned_v<_Up>) 2184 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2185 const __m128 __factor_f = reinterpret_cast<__m128>( 2186 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); 2187 const __m128i __factor 2188 = __builtin_constant_p(__factor_f) 2190 __make_vector<unsigned>(__factor_f[0], __factor_f[1], 2191 __factor_f[2], __factor_f[3])) 2192 : _mm_cvttps_epi32(__factor_f); 2194 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2195 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2196 _mm_srli_si128(__factor, 4)); 2197 if constexpr (__have_sse4_1) 2198 return __intrin_bitcast<_V>( 2199 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2201 return __intrin_bitcast<_V>( 2202 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2206 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2207 if constexpr (is_signed_v<_Up>) 2208 return _mm_sra_epi32(__a, __b); 2210 return _mm_srl_epi32(__a, __b); 2213 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2214 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2216 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2217 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2218 if constexpr (__have_sse4_1) 2219 return __intrin_bitcast<_V>( 2220 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2221 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2223 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2224 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2225 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2231 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2236 template <typename _Tp, size_t _Np> 2237 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2238 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2240 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2242 if (__builtin_is_constant_evaluated() 2243 || (__x._M_is_constprop() && __y._M_is_constprop())) 2244 return _MaskImpl::_S_to_bits( 2245 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2247 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2248 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2249 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2250 if constexpr (is_floating_point_v<_Tp>) 2252 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2253 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2254 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2255 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2257 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2258 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2259 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2261 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2263 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2265 __assert_unreachable<_Tp>(); 2267 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2268 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2269 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 
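// The is_unsigned_v<_Up> 32-bit branch above uses the identity stated in its
// comment, x >> y == x * 2^-y == (x * 2^(31-y)) >> 31: 2^(31-y) is produced
// by subtracting y from the exponent field of the float 2^31 (0x4f00'0000),
// converting to integer, and the widening even/odd-lane multiplies
// (_mm_mul_epu32) then deliver the 64-bit products whose bits [62:31] are
// the result. A scalar model (illustrative only; ad-hoc name, 0 <= y <= 31):
//
//   #include <cstdint>
//   inline std::uint32_t __srl_via_mul(std::uint32_t __x, unsigned __y)
//   {
//     const std::uint64_t __pow2 = std::uint64_t(1) << (31 - __y); // 2^(31-y)
//     return std::uint32_t((std::uint64_t(__x) * __pow2) >> 31);   // == __x >> __y
//   }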
2270 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2271 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2272 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2273 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2274 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2275 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2276 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2278 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2280 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2281 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2282 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2284 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2285 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2286 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2287 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2288 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2289 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2290 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2292 __assert_unreachable<_Tp>(); 2294 else if (__builtin_is_constant_evaluated()) 2295 return _Base::_S_equal_to(__x, __y); 2296 else if constexpr (sizeof(__x) == 8) 2298 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2299 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2300 _MaskMember<_Tp> __r64{}; 2301 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2305 return _Base::_S_equal_to(__x, __y); 2309 // _S_not_equal_to {{{ 2310 template <typename _Tp, size_t _Np> 2311 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2312 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2314 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2316 if (__builtin_is_constant_evaluated() 2317 || (__x._M_is_constprop() && __y._M_is_constprop())) 2318 return _MaskImpl::_S_to_bits( 2319 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2321 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2322 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2323 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2324 if constexpr (is_floating_point_v<_Tp>) 2326 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2327 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2328 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2329 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2331 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2332 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2333 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2335 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2337 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2339 __assert_unreachable<_Tp>(); 2341 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2342 return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2343 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2344 return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2345 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2346 return 
_mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2347 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2348 return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2349 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2350 return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2352 return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2353 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2354 return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2355 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2356 return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2357 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2358 return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2359 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2360 return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2361 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2362 return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2363 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2364 return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2366 __assert_unreachable<_Tp>(); 2368 else if (__builtin_is_constant_evaluated()) 2369 return _Base::_S_not_equal_to(__x, __y); 2370 else if constexpr (sizeof(__x) == 8) 2372 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2373 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2374 _MaskMember<_Tp> __r64{}; 2375 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2379 return _Base::_S_not_equal_to(__x, __y); 2384 template <typename _Tp, size_t _Np> 2385 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2386 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2388 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2390 if (__builtin_is_constant_evaluated() 2391 || (__x._M_is_constprop() && __y._M_is_constprop())) 2392 return _MaskImpl::_S_to_bits( 2393 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2395 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2396 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2397 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2398 if constexpr (sizeof(__xi) == 64) 2400 if constexpr (is_same_v<_Tp, float>) 2401 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2402 else if constexpr (is_same_v<_Tp, double>) 2403 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2404 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2405 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2406 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2407 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2408 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2409 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2410 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2411 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2412 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2413 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2414 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2415 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2416 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2417 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2418 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2419 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2421 __assert_unreachable<_Tp>(); 2423 else if constexpr (sizeof(__xi) == 32) 2425 if constexpr (is_same_v<_Tp, 
float>) 2426 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2427 else if constexpr (is_same_v<_Tp, double>) 2428 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2429 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2430 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2431 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2432 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2433 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2434 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2435 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2436 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2437 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2438 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2439 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2440 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2441 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2442 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2443 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2444 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2446 __assert_unreachable<_Tp>(); 2448 else if constexpr (sizeof(__xi) == 16) 2450 if constexpr (is_same_v<_Tp, float>) 2451 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2452 else if constexpr (is_same_v<_Tp, double>) 2453 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2454 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2455 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2456 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2457 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2458 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2459 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2460 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2461 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2462 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2463 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2464 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2465 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2466 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2467 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2468 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2469 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2471 __assert_unreachable<_Tp>(); 2474 __assert_unreachable<_Tp>(); 2476 else if (__builtin_is_constant_evaluated()) 2477 return _Base::_S_less(__x, __y); 2478 else if constexpr (sizeof(__x) == 8) 2480 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2481 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2482 _MaskMember<_Tp> __r64{}; 2483 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2487 return _Base::_S_less(__x, __y); 2491 // _S_less_equal {{{ 2492 template <typename _Tp, size_t _Np> 2493 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2494 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2496 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2498 if (__builtin_is_constant_evaluated() 2499 || (__x._M_is_constprop() && __y._M_is_constprop())) 2500 return _MaskImpl::_S_to_bits( 2501 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2503 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2504 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2505 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2506 if constexpr (sizeof(__xi) == 64) 
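// All AVX-512 comparison branches in this section pass the ABI's implicit
// element mask (__k1) to the masked compare intrinsics. __k1 has one set bit
// per element that actually belongs to the simd object, so padding lanes of
// a partially used register can never set bits in the returned bitmask; the
// masked compares write zero at masked-off positions. For example
// (illustrative values): with _Np == 3 and 4-byte elements, __k1 == 0b0111,
// and _mm_mask_cmplt_epi32_mask(0b0111, __xi, __yi) can only ever report
// results in bits 0..2.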
2508 if constexpr (is_same_v<_Tp, float>) 2509 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2510 else if constexpr (is_same_v<_Tp, double>) 2511 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2512 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2513 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2514 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2515 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2517 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2518 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2519 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2520 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2521 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2522 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2523 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2525 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2526 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2527 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2529 __assert_unreachable<_Tp>(); 2531 else if constexpr (sizeof(__xi) == 32) 2533 if constexpr (is_same_v<_Tp, float>) 2534 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2535 else if constexpr (is_same_v<_Tp, double>) 2536 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2538 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2540 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2542 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2543 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2544 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2545 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2546 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2548 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2550 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2551 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2552 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2554 __assert_unreachable<_Tp>(); 2556 else if constexpr (sizeof(__xi) == 16) 2558 if constexpr (is_same_v<_Tp, float>) 2559 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2560 else if constexpr (is_same_v<_Tp, double>) 2561 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2562 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2563 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2564 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2565 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2566 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2567 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2568 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2569 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2570 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2571 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2572 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2573 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2574 else if constexpr (is_unsigned_v<_Tp> && 
sizeof(_Tp) == 4) 2575 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2576 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2577 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2579 __assert_unreachable<_Tp>(); 2582 __assert_unreachable<_Tp>(); 2584 else if (__builtin_is_constant_evaluated()) 2585 return _Base::_S_less_equal(__x, __y); 2586 else if constexpr (sizeof(__x) == 8) 2588 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2589 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2590 _MaskMember<_Tp> __r64{}; 2591 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2595 return _Base::_S_less_equal(__x, __y); 2600 template <typename _Tp, size_t _Np> 2601 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2602 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2604 if constexpr (__is_avx512_abi<_Abi>()) 2605 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2607 return _Base::_S_negate(__x); 2612 using _Base::_S_abs; 2615 template <typename _Tp, size_t _Np> 2616 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2617 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2619 if constexpr (__is_sse_ps<_Tp, _Np>()) 2620 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2621 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2622 return _mm_sqrt_pd(__x); 2623 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2624 return _mm256_sqrt_ps(__x); 2625 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2626 return _mm256_sqrt_pd(__x); 2627 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2628 return _mm512_sqrt_ps(__x); 2629 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2630 return _mm512_sqrt_pd(__x); 2632 __assert_unreachable<_Tp>(); 2637 template <typename _Tp, size_t _Np> 2638 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2639 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2640 __fixed_size_storage_t<int, _Np> __exp) 2642 if constexpr (sizeof(__x) == 64 || __have_avx512vl) 2644 const auto __xi = __to_intrin(__x); 2645 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi> 2647 const auto __expi = __to_intrin(__cvt(__exp)); 2648 using _Up = __bool_storage_member_type_t<_Np>; 2649 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? 
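// This _S_ldexp implementation maps std::ldexp(x, exp) == x * 2^exp onto
// vscalefps/vscalefpd, which compute x[i] * 2^floor(y[i]) with the exponents
// given as a floating-point vector; __exp is therefore converted to _Tp via
// _SimdConverter first. The zero-masking (maskz) forms, together with __k1
// (the low _Np bits set, or all bits when the mask type has no spare bits),
// clear any padding lanes. A scalar model, ignoring NaN/inf/overflow
// handling (illustrative only; ad-hoc name):
//
//   #include <cmath>
//   inline double __scalef_model(double __x, double __y)
//   { return __x * std::exp2(std::floor(__y)); }
//   // std::ldexp(__x, __e) == __scalef_model(__x, double(__e)) for integral __e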
		    _Up((1ULL << _Np) - 1) : ~_Up();
	    if constexpr (sizeof(__xi) == 16)
	      {
		if constexpr (sizeof(_Tp) == 8)
		  return _mm_maskz_scalef_pd(__k1, __xi, __expi);
		else
		  return _mm_maskz_scalef_ps(__k1, __xi, __expi);
	      }
	    else if constexpr (sizeof(__xi) == 32)
	      {
		if constexpr (sizeof(_Tp) == 8)
		  return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
		else
		  return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
	      }
	    else
	      {
		static_assert(sizeof(__xi) == 64);
		if constexpr (sizeof(_Tp) == 8)
		  return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
		else
		  return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
	      }
	  }
	else
	  return _Base::_S_ldexp(__x, __exp);
      }

    // }}}
    // _S_trunc {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_trunc(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  return _mm512_roundscale_ps(__x, 0x0b);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  return _mm512_roundscale_pd(__x, 0x0b);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  return _mm256_round_ps(__x, 0xb);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  return _mm256_round_pd(__x, 0xb);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  return _mm_round_pd(__x, 0xb);
	else if constexpr (__is_sse_ps<_Tp, _Np>())
	  {
	    auto __truncated
	      = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
	    const auto __no_fractional_values
	      = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
				      & 0x7f800000u)
		< 0x4b000000; // the exponent is so large that no mantissa bits
			      // signify fractional values (0x3f8 + 23*8 = 0x4b0)
	    return __no_fractional_values ? __truncated : __to_intrin(__x);
	  }
	else
	  return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
	// Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
	// from zero as required by std::round. Therefore this function is more
	// complicated.
	using _V = __vector_type_t<_Tp, _Np>;
	_V __truncated;
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  __truncated = _mm256_round_ps(__x._M_data,
					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  __truncated = _mm256_round_pd(__x._M_data,
					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  __truncated = __auto_bitcast(
	    _mm_round_ps(__to_intrin(__x),
			 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  __truncated
	    = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__is_sse_ps<_Tp, _Np>())
	  __truncated = __auto_bitcast(
	    _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
	else
	  return _Base::_S_round(__x);

	// x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
	// x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0
	const _V __rounded
	  = __truncated
	    + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
		 ?
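// The ternary being formed here supplies std::round's ties-away-from-zero
// behaviour on top of the truncation computed above: where
// |x - trunc(x)| >= 0.5 the truncated value is adjusted by copysign(1, x),
// otherwise it is left unchanged. A scalar model (illustrative only; ad-hoc
// name, assumes |x| is small enough for the truncation to be exact):
//
//   #include <cmath>
//   inline float __round_model(float __x)
//   {
//     const float __t = std::trunc(__x);
//     return std::fabs(__x - __t) >= 0.5f ? __t + std::copysign(1.0f, __x) : __t;
//   }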
__or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2753 if constexpr (__have_sse4_1) 2755 else // adjust for missing range in cvttps_epi32 2756 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2762 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2763 _GLIBCXX_SIMD_INTRINSIC static _Tp 2764 _S_nearbyint(_Tp __x) noexcept 2766 if constexpr (_TVT::template _S_is<float, 16>) 2767 return _mm512_roundscale_ps(__x, 0x0c); 2768 else if constexpr (_TVT::template _S_is<double, 8>) 2769 return _mm512_roundscale_pd(__x, 0x0c); 2770 else if constexpr (_TVT::template _S_is<float, 8>) 2771 return _mm256_round_ps(__x, 2772 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2773 else if constexpr (_TVT::template _S_is<double, 4>) 2774 return _mm256_round_pd(__x, 2775 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2776 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2777 return _mm_round_ps(__x, 2778 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2779 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2780 return _mm_round_pd(__x, 2781 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2783 return _Base::_S_nearbyint(__x); 2788 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2789 _GLIBCXX_SIMD_INTRINSIC static _Tp 2790 _S_rint(_Tp __x) noexcept 2792 if constexpr (_TVT::template _S_is<float, 16>) 2793 return _mm512_roundscale_ps(__x, 0x04); 2794 else if constexpr (_TVT::template _S_is<double, 8>) 2795 return _mm512_roundscale_pd(__x, 0x04); 2796 else if constexpr (_TVT::template _S_is<float, 8>) 2797 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2798 else if constexpr (_TVT::template _S_is<double, 4>) 2799 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2800 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2801 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2802 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2803 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2805 return _Base::_S_rint(__x); 2810 template <typename _Tp, size_t _Np> 2811 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2812 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2814 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2815 return _mm512_roundscale_ps(__x, 0x09); 2816 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2817 return _mm512_roundscale_pd(__x, 0x09); 2818 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2819 return _mm256_round_ps(__x, 0x9); 2820 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2821 return _mm256_round_pd(__x, 0x9); 2822 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2823 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9)); 2824 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2825 return _mm_round_pd(__x, 0x9); 2827 return _Base::_S_floor(__x); 2832 template <typename _Tp, size_t _Np> 2833 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2834 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2836 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2837 return _mm512_roundscale_ps(__x, 0x0a); 2838 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2839 return _mm512_roundscale_pd(__x, 0x0a); 2840 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2841 return _mm256_round_ps(__x, 0xa); 2842 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2843 return _mm256_round_pd(__x, 0xa); 2844 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2845 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa)); 2846 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2847 return _mm_round_pd(__x, 
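// The rounding immediates used by _S_trunc/_S_round/_S_nearbyint/_S_rint/
// _S_floor/_S_ceil decode as follows (low two bits: rounding mode; 0x4:
// follow the current MXCSR mode; 0x8 == _MM_FROUND_NO_EXC: suppress the
// inexact exception; for vrndscale the high nibble is the number of fraction
// bits to keep, 0 here, i.e. round to integer):
//
//   0x04  current mode, may raise inexact   -> rint
//   0x0c  current mode, inexact suppressed  -> nearbyint
//   0x09  toward -inf,  inexact suppressed  -> floor
//   0x0a  toward +inf,  inexact suppressed  -> ceil
//   0x0b  toward zero,  inexact suppressed  -> trunc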
0xa); 2849 return _Base::_S_ceil(__x); 2854 template <typename _Tp, size_t _Np> 2855 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2856 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2858 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2860 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2861 return _mm512_movepi32_mask( 2862 __intrin_bitcast<__m512i>(__x._M_data)); 2863 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2864 return _mm512_movepi64_mask( 2865 __intrin_bitcast<__m512i>(__x._M_data)); 2866 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2867 return _mm256_movepi32_mask( 2868 __intrin_bitcast<__m256i>(__x._M_data)); 2869 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2870 return _mm256_movepi64_mask( 2871 __intrin_bitcast<__m256i>(__x._M_data)); 2872 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2873 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2874 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2875 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2877 else if constexpr (__is_avx512_abi<_Abi>()) 2879 const auto __xi = __to_intrin(__x); 2880 [[maybe_unused]] constexpr auto __k1 2881 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2882 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2883 return _mm_movemask_ps(__xi); 2884 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2885 return _mm_movemask_pd(__xi); 2886 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2887 return _mm256_movemask_ps(__xi); 2888 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2889 return _mm256_movemask_pd(__xi); 2890 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2891 return _mm512_mask_cmplt_epi32_mask( 2892 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2893 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2894 return _mm512_mask_cmplt_epi64_mask( 2895 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2897 __assert_unreachable<_Tp>(); 2900 return _Base::_S_signbit(__x); 2902 using _I = __int_for_sizeof_t<_Tp>; 2903 if constexpr (sizeof(__x) == 64) 2904 return _S_less(__vector_bitcast<_I>(__x), _I()); 2907 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2908 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2909 if constexpr ((sizeof(_Tp) == 4 && 2910 (__have_avx2 || sizeof(__x) == 16)) || 2913 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2915 else if constexpr ((__have_avx2 || 2916 (__have_ssse3 && sizeof(__x) == 16))) 2918 return __vector_bitcast<_Tp>((__xx & __signmask) == 2922 { // SSE2/3 or AVX (w/o AVX2) 2923 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2924 return __vector_bitcast<_Tp>( 2925 __vector_bitcast<_Tp>( 2926 (__xx & __signmask) | 2927 __vector_bitcast<_I>(__one)) // -1 or 1 2935 // _S_isnonzerovalue_mask {{{ 2936 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2937 template <typename _Tp> 2938 _GLIBCXX_SIMD_INTRINSIC static auto 2939 _S_isnonzerovalue_mask(_Tp __x) 2941 using _Traits = _VectorTraits<_Tp>; 2942 if constexpr (__have_avx512dq_vl) 2944 if constexpr (_Traits::template _S_is< 2945 float, 2> || _Traits::template _S_is<float, 4>) 2946 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2947 else if constexpr (_Traits::template _S_is<float, 8>) 2948 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2949 else if constexpr (_Traits::template _S_is<float, 16>) 2950 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2951 else if constexpr 
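// _S_signbit only inspects the IEEE sign bit and therefore never issues a
// floating-point compare: with AVX-512DQ, vpmovd2m/vpmovq2m copy each
// element's most significant bit straight into a mask register; plain
// AVX-512 uses movmskps/movmskpd (16/32 bytes) or a masked signed compare
// against zero (64 bytes); the pre-AVX-512 fallback reinterprets the
// elements as signed integers and either smears the sign bit across the
// lane with an arithmetic right shift or tests it against a sign mask.
// A scalar model (illustrative only; ad-hoc name):
//
//   #include <cstdint>
//   #include <cstring>
//   inline bool __signbit_model(float __f)
//   {
//     std::uint32_t __bits;
//     std::memcpy(&__bits, &__f, sizeof __bits);
//     return __bits >> 31;   // also true for -0.0f and negative NaNs
//   }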
(_Traits::template _S_is<double, 2>) 2952 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2953 else if constexpr (_Traits::template _S_is<double, 4>) 2954 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2955 else if constexpr (_Traits::template _S_is<double, 8>) 2956 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2958 __assert_unreachable<_Tp>(); 2962 using _Up = typename _Traits::value_type; 2963 constexpr size_t _Np = _Traits::_S_full_size; 2964 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2965 const auto __b = __x * _Up(); // NaN if __x == inf 2966 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2967 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2969 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2971 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 2972 __auto_bitcast(__b), 2974 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 2975 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2976 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 2978 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2979 __auto_bitcast(__b), 2981 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 2982 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2983 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 2984 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 2985 __auto_bitcast(__b), 2987 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 2988 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2989 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 2991 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2992 __auto_bitcast(__b), 2994 else if constexpr (__is_avx512_ps<_Up, _Np>()) 2995 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2996 else if constexpr (__is_avx512_pd<_Up, _Np>()) 2997 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2999 __assert_unreachable<_Tp>(); 3005 template <typename _Tp, size_t _Np> 3006 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3007 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3009 static_assert(is_floating_point_v<_Tp>); 3010 #if !__FINITE_MATH_ONLY__ 3011 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3013 const auto __xi = __to_intrin(__x); 3014 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3015 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3016 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3017 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3018 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3019 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3020 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3021 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3022 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3023 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3024 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3025 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3026 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3028 else if constexpr (__is_avx512_abi<_Abi>()) 3030 // if all exponent bits are set, __x is either inf or NaN 3031 using _I = __int_for_sizeof_t<_Tp>; 3032 const auto __inf = __vector_bitcast<_I>( 3033 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3034 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3038 return _Base::_S_isfinite(__x); 3043 template <typename _Tp, size_t _Np> 3044 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3045 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 
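// Two tricks carry this region when AVX-512DQ's vfpclass is not available.
// _S_isnonzerovalue_mask relies on NaN propagation: x * inf is NaN exactly
// when x is zero or NaN, and x * 0 is NaN exactly when x is infinite or NaN,
// so an ordered compare (_CMP_ORD_Q) of the two products is true precisely
// for finite, nonzero x. _S_isfinite instead tests the exponent field: a
// value is non-finite iff all exponent bits are set, which the code
// expresses as (bits(x) & bits(inf)) < bits(inf) on the integer
// representation. A scalar model of the latter (illustrative only; ad-hoc
// name):
//
//   #include <cstdint>
//   #include <cstring>
//   inline bool __isfinite_model(float __x)
//   {
//     std::uint32_t __bits;
//     std::memcpy(&__bits, &__x, sizeof __bits);
//     return (__bits & 0x7f80'0000u) != 0x7f80'0000u; // exponent not all ones
//   }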
3047 #if !__FINITE_MATH_ONLY__ 3048 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3050 const auto __xi = __to_intrin(__x); 3051 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3052 return _mm512_fpclass_ps_mask(__xi, 0x18); 3053 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3054 return _mm512_fpclass_pd_mask(__xi, 0x18); 3055 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3056 return _mm256_fpclass_ps_mask(__xi, 0x18); 3057 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3058 return _mm256_fpclass_pd_mask(__xi, 0x18); 3059 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3060 return _mm_fpclass_ps_mask(__xi, 0x18); 3061 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3062 return _mm_fpclass_pd_mask(__xi, 0x18); 3064 __assert_unreachable<_Tp>(); 3066 else if constexpr (__have_avx512dq_vl) 3068 if constexpr (__is_sse_pd<_Tp, _Np>()) 3069 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3070 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3071 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3072 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3073 return _mm_movm_epi32( 3074 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3075 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3076 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3078 __assert_unreachable<_Tp>(); 3082 return _Base::_S_isinf(__x); 3087 template <typename _Tp, size_t _Np> 3088 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3089 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3091 #if __FINITE_MATH_ONLY__ 3092 [[maybe_unused]] constexpr int __mode = 0x26; 3094 [[maybe_unused]] constexpr int __mode = 0xbf; 3096 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3098 const auto __xi = __to_intrin(__x); 3099 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3100 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3101 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3102 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3103 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3104 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3105 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3106 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3107 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3108 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3109 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3110 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3111 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3113 __assert_unreachable<_Tp>(); 3115 else if constexpr (__have_avx512dq) 3117 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3118 return _mm_movm_epi32( 3119 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3120 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3121 return _mm256_movm_epi32( 3122 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3123 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3124 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3125 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3126 return _mm_movm_epi64( 3127 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3128 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3129 return _mm256_movm_epi64( 3130 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3131 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3132 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3134 
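// The vfpclass immediates used above and in _S_isnonzerovalue_mask select
// which categories to report; each bit of the immediate enables one class
// (0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 denormal,
// 0x40 negative finite, 0x80 SNaN). Hence 0x18 matches +/-inf (isinf),
// 0x99 matches NaN or inf (XORed with __k1 this yields isfinite), 0x9f adds
// +/-0 (negated: nonzero finite), and __mode 0xbf (or 0x26 under
// __FINITE_MATH_ONLY__) matches everything that is not a normal number, so
// __k1 ^ fpclass(__x, __mode) is isnormal.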
__assert_unreachable<_Tp>(); 3136 else if constexpr (__is_avx512_abi<_Abi>()) 3138 using _I = __int_for_sizeof_t<_Tp>; 3139 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3140 const auto minn = __vector_bitcast<_I>( 3141 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3142 #if __FINITE_MATH_ONLY__ 3143 return _S_less_equal<_I, _Np>(minn, absn); 3146 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3147 return __and(_S_less_equal<_I, _Np>(minn, absn), 3148 _S_less<_I, _Np>(absn, infn)); 3152 return _Base::_S_isnormal(__x); 3157 template <typename _Tp, size_t _Np> 3158 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3159 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3160 { return _S_isunordered(__x, __x); } 3163 // _S_isunordered {{{ 3164 template <typename _Tp, size_t _Np> 3165 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3166 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3167 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3169 #if __FINITE_MATH_ONLY__ 3172 const auto __xi = __to_intrin(__x); 3173 const auto __yi = __to_intrin(__y); 3174 if constexpr (__is_avx512_abi<_Abi>()) 3176 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3177 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3178 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3179 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3180 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3181 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3182 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3183 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3184 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3185 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3186 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3187 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3188 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3190 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3191 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3192 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3193 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3194 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3195 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3196 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3197 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3199 __assert_unreachable<_Tp>(); 3205 template <typename _Tp, size_t _Np> 3206 static constexpr _MaskMember<_Tp> 3207 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3209 const auto __xi = __to_intrin(__x); 3210 const auto __yi = __to_intrin(__y); 3211 if constexpr (__is_avx512_abi<_Abi>()) 3213 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3214 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3215 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3216 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3217 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3218 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3219 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3220 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3221 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3222 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3223 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3224 else if constexpr (sizeof(__xi) 
== 16 && sizeof(_Tp) == 8) 3225 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3227 __assert_unreachable<_Tp>(); 3229 else if constexpr (__have_avx) 3231 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3232 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3233 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3234 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3235 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3236 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3237 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3238 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3240 __assert_unreachable<_Tp>(); 3242 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3243 && sizeof(_Tp) == 4) 3245 const auto __xn = __vector_bitcast<int>(__xi); 3246 const auto __yn = __vector_bitcast<int>(__yi); 3247 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3248 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3249 return __auto_bitcast( 3250 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3252 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3253 && sizeof(_Tp) == 8) 3254 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3255 -_mm_ucomigt_sd(__xi, __yi), 3256 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3257 _mm_unpackhi_pd(__yi, __yi))}; 3259 return _Base::_S_isgreater(__x, __y); 3263 // _S_isgreaterequal {{{ 3264 template <typename _Tp, size_t _Np> 3265 static constexpr _MaskMember<_Tp> 3266 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3268 const auto __xi = __to_intrin(__x); 3269 const auto __yi = __to_intrin(__y); 3270 if constexpr (__is_avx512_abi<_Abi>()) 3272 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3273 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3274 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3275 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3276 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3278 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3280 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3281 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3282 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3284 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3286 __assert_unreachable<_Tp>(); 3288 else if constexpr (__have_avx) 3290 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3291 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3292 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3293 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3294 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3295 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3296 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3297 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3299 __assert_unreachable<_Tp>(); 3301 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3302 && sizeof(_Tp) == 4) 3304 const auto __xn = __vector_bitcast<int>(__xi); 3305 const auto __yn = __vector_bitcast<int>(__yi); 3306 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3307 const auto __yp = __yn < 0 ? 
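// The C99 comparison macros implemented here (_S_isgreater and the functions
// that follow) must not raise FE_INVALID on quiet NaNs, hence the quiet,
// ordered predicates (_CMP_GT_OQ, _CMP_GE_OQ, ...) on AVX/AVX-512. The
// SSE2-only float path instead maps each bit pattern to a signed integer
// that orders like the float, compares those with integer instructions, and
// ANDs the result with _mm_cmpord_ps so that NaN operands compare false; the
// double path falls back to the scalar ucomisd-based intrinsics on both
// halves. A scalar model of the integer key (illustrative only; ad-hoc name):
//
//   #include <cstdint>
//   #include <cstring>
//   inline std::int32_t __order_key(float __f)  // monotone for non-NaN input
//   {
//     std::int32_t __i;
//     std::memcpy(&__i, &__f, sizeof __i);
//     return __i < 0 ? -(__i & 0x7fff'ffff) : __i;
//   }
//   // for non-NaN __a, __b:  (__a > __b) == (__order_key(__a) > __order_key(__b))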
-(__yn & 0x7fff'ffff) : __yn; 3308 return __auto_bitcast( 3309 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3311 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3312 && sizeof(_Tp) == 8) 3313 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3314 -_mm_ucomige_sd(__xi, __yi), 3315 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3316 _mm_unpackhi_pd(__yi, __yi))}; 3318 return _Base::_S_isgreaterequal(__x, __y); 3323 template <typename _Tp, size_t _Np> 3324 static constexpr _MaskMember<_Tp> 3325 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3327 const auto __xi = __to_intrin(__x); 3328 const auto __yi = __to_intrin(__y); 3329 if constexpr (__is_avx512_abi<_Abi>()) 3331 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3332 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3333 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3334 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3335 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3336 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3337 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3338 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3339 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3340 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3341 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3342 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3343 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3345 __assert_unreachable<_Tp>(); 3347 else if constexpr (__have_avx) 3349 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3350 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3352 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3353 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3354 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3355 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3356 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3358 __assert_unreachable<_Tp>(); 3360 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3361 && sizeof(_Tp) == 4) 3363 const auto __xn = __vector_bitcast<int>(__xi); 3364 const auto __yn = __vector_bitcast<int>(__yi); 3365 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3366 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3367 return __auto_bitcast( 3368 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3370 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3371 && sizeof(_Tp) == 8) 3372 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3373 -_mm_ucomigt_sd(__yi, __xi), 3374 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3375 _mm_unpackhi_pd(__xi, __xi))}; 3377 return _Base::_S_isless(__x, __y); 3381 // _S_islessequal {{{ 3382 template <typename _Tp, size_t _Np> 3383 static constexpr _MaskMember<_Tp> 3384 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3386 const auto __xi = __to_intrin(__x); 3387 const auto __yi = __to_intrin(__y); 3388 if constexpr (__is_avx512_abi<_Abi>()) 3390 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3391 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3392 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3393 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3394 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3395 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3396 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3397 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3398 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3399 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3400 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3401 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3402 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3404 __assert_unreachable<_Tp>(); 3406 else if constexpr (__have_avx) 3408 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3409 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3410 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3411 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3412 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3413 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3414 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3415 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3417 __assert_unreachable<_Tp>(); 3419 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3420 && sizeof(_Tp) == 4) 3422 const auto __xn = __vector_bitcast<int>(__xi); 3423 const auto __yn = __vector_bitcast<int>(__yi); 3424 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3425 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3426 return __auto_bitcast( 3427 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3429 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3430 && sizeof(_Tp) == 8) 3431 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3432 -_mm_ucomige_sd(__yi, __xi), 3433 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3434 _mm_unpackhi_pd(__xi, __xi))}; 3436 return _Base::_S_islessequal(__x, __y); 3440 // _S_islessgreater {{{ 3441 template <typename _Tp, size_t _Np> 3442 static constexpr _MaskMember<_Tp> 3443 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3445 const auto __xi = __to_intrin(__x); 3446 const auto __yi = __to_intrin(__y); 3447 if constexpr (__is_avx512_abi<_Abi>()) 3449 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3450 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3451 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3452 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3453 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3454 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3455 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3456 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3457 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3458 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3459 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3460 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3461 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3463 __assert_unreachable<_Tp>(); 3465 else if constexpr (__have_avx) 3467 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3468 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3469 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3470 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3471 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3472 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3473 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3474 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3476 __assert_unreachable<_Tp>(); 3478 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3479 return __auto_bitcast( 3480 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3481 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3482 return __to_masktype( 3483 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3485 __assert_unreachable<_Tp>(); 3489 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np> 3490 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3491 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3493 if (__k._M_is_constprop_none_of()) 3495 else if (__k._M_is_constprop_all_of()) 3497 auto __vv = _Base::_M_make_simd(__v); 3498 _Op<decltype(__vv)> __op; 3499 return __data(__op(__vv)); 3501 else if constexpr (__is_bitmask_v<decltype(__k)> 3502 && (is_same_v<_Op<void>, __increment<void>> 3503 || is_same_v<_Op<void>, __decrement<void>>)) 3505 // optimize masked unary increment and decrement as masked sub +/-1 3506 constexpr int __pm_one 3507 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1; 3509 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data; 3511 using _TV = __vector_type_t<_Tp, _Np>; 3512 constexpr size_t __bytes = sizeof(__v) < 16 ? 
16 : sizeof(__v); 3513 constexpr size_t __width = __bytes / sizeof(_Tp); 3514 if constexpr (is_integral_v<_Tp>) 3516 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3517 using _Ip = std::make_signed_t<_Tp>; 3518 using _Up = std::conditional_t< 3519 std::is_same_v<_Ip, long>, 3520 std::conditional_t<__lp64, long long, int>, 3522 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3523 const auto __value = __intrin_bitcast<__vector_type_t<_Up, __width>>(__v._M_data); 3524 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3525 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3526 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask(__value, \ 3527 __vector_broadcast<__width>(_Up(__pm_one)), __value, __k._M_data)) 3528 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3529 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3530 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3531 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3532 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3533 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3534 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3535 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3536 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3537 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3538 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3539 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3540 #undef _GLIBCXX_SIMD_MASK_SUB 3544 const auto __value = __intrin_bitcast<__vector_type_t<_Tp, __width>>(__v._M_data); 3545 #define _GLIBCXX_SIMD_MASK_SUB_512(_Sizeof, _Width, _Instr) \ 3546 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3547 return __builtin_ia32_##_Instr##_mask( \ 3548 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3549 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3550 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3551 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3552 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask( \ 3553 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3555 _GLIBCXX_SIMD_MASK_SUB_512(4, 64, subps512); 3556 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3557 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3558 _GLIBCXX_SIMD_MASK_SUB_512(8, 64, subpd512); 3559 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3560 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3561 #undef _GLIBCXX_SIMD_MASK_SUB_512 3562 #undef _GLIBCXX_SIMD_MASK_SUB 3567 return _Base::template _S_masked_unary<_Op>(__k, __v); 3572 // _MaskImplX86Mixin {{{ 3573 struct _MaskImplX86Mixin 3575 template <typename _Tp> 3576 using _TypeTag = _Tp*; 3578 using _Base = _MaskImplBuiltinMixin; 3580 // _S_to_maskvector(bool) {{{ 3581 template <typename _Up, size_t _ToN = 1, typename _Tp> 3582 _GLIBCXX_SIMD_INTRINSIC static constexpr 3583 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> 3584 _S_to_maskvector(_Tp __x) 3586 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3587 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3588 : __vector_type_t<_Up, _ToN>(); 3592 // _S_to_maskvector(_SanitizedBitMask) {{{ 3593 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 3594 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3595 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3597 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3598 using _UV = __vector_type_t<_Up, _ToN>; 3599 using _UI = __intrinsic_type_t<_Up, _ToN>; 3600 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3601 if constexpr (_Np == 1) 3602 return _S_to_maskvector<_Up, _ToN>(__k); 3603 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3604 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( 3605 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3606 else if constexpr (sizeof(_Up) == 1) 3608 if constexpr (sizeof(_UI) == 16) 3610 if constexpr (__have_avx512bw_vl) 3611 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3612 else if constexpr (__have_avx512bw) 3613 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3614 else if constexpr (__have_avx512f) 3616 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3618 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3619 __hi256(__as32bits))); 3620 return __intrin_bitcast<_UV>( 3621 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3623 else if constexpr (__have_ssse3) 3625 const auto __bitmask = __to_intrin( 3626 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3627 8, 16, 32, 64, 128)); 3628 return __intrin_bitcast<_UV>( 3629 __vector_bitcast<_Up>( 3630 _mm_shuffle_epi8(__to_intrin( 3631 __vector_type_t<_ULLong, 2>{__k}), 3632 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3633 1, 1, 1, 1, 1, 1, 1)) 3637 // else fall through 3639 else if constexpr (sizeof(_UI) == 32) 3641 if constexpr (__have_avx512bw_vl) 3642 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3643 else if constexpr (__have_avx512bw) 3644 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3645 else if constexpr (__have_avx512f) 3647 auto __as16bits = // 0 16 1 17 ... 15 31 3648 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3650 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3653 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3654 __lo256(__as16bits), 3655 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3658 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3659 __0_16_1_17, // 0 16 1 17 2 ... 
3660 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3661 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3663 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3664 // 0-3 8-11 16-19 24-27 3665 // 4-7 12-15 20-23 28-31 3667 else if constexpr (__have_avx2) 3669 const auto __bitmask 3670 = _mm256_broadcastsi128_si256(__to_intrin( 3671 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3672 4, 8, 16, 32, 64, 128))); 3673 return __vector_bitcast<_Up>( 3674 __vector_bitcast<_Up>( 3675 _mm256_shuffle_epi8( 3676 _mm256_broadcastsi128_si256( 3677 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3678 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3679 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3684 // else fall through 3686 else if constexpr (sizeof(_UI) == 64) 3687 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3688 if constexpr (std::min(_ToN, _Np) <= 4) 3690 if constexpr (_Np > 7) // avoid overflow 3691 __x &= _SanitizedBitMask<_Np>(0x0f); 3692 const _UInt __char_mask 3693 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3696 __builtin_memcpy(&__r, &__char_mask, 3697 std::min(sizeof(__r), sizeof(__char_mask))); 3700 else if constexpr (std::min(_ToN, _Np) <= 7) 3702 if constexpr (_Np > 7) // avoid overflow 3703 __x &= _SanitizedBitMask<_Np>(0x7f); 3704 const _ULLong __char_mask 3705 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3708 __builtin_memcpy(&__r, &__char_mask, 3709 std::min(sizeof(__r), sizeof(__char_mask))); 3713 else if constexpr (sizeof(_Up) == 2) 3715 if constexpr (sizeof(_UI) == 16) 3717 if constexpr (__have_avx512bw_vl) 3718 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3719 else if constexpr (__have_avx512bw) 3720 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3721 else if constexpr (__have_avx512f) 3723 __m256i __as32bits = {}; 3724 if constexpr (__have_avx512vl) 3725 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3728 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3729 return __intrin_bitcast<_UV>( 3730 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3732 // else fall through 3734 else if constexpr (sizeof(_UI) == 32) 3736 if constexpr (__have_avx512bw_vl) 3737 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3738 else if constexpr (__have_avx512bw) 3739 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3740 else if constexpr (__have_avx512f) 3742 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3743 return __vector_bitcast<_Up>( 3744 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3745 __hi256(__as32bits)))); 3747 // else fall through 3749 else if constexpr (sizeof(_UI) == 64) 3750 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3752 else if constexpr (sizeof(_Up) == 4) 3754 if constexpr (sizeof(_UI) == 16) 3756 if constexpr (__have_avx512dq_vl) 3757 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3758 else if constexpr (__have_avx512dq) 3759 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3760 else if constexpr (__have_avx512vl) 3761 return __intrin_bitcast<_UV>( 3762 _mm_maskz_mov_epi32(__k, ~__m128i())); 3763 else if constexpr (__have_avx512f) 3764 return __intrin_bitcast<_UV>( 3765 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3766 // else fall through 3768 else if constexpr (sizeof(_UI) == 32) 3770 if constexpr (__have_avx512dq_vl) 3771 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3772 else if constexpr (__have_avx512dq) 3773 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3774 else if constexpr 
(__have_avx512vl) 3775 return __vector_bitcast<_Up>( 3776 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3777 else if constexpr (__have_avx512f) 3778 return __vector_bitcast<_Up>( 3779 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3780 // else fall through 3782 else if constexpr (sizeof(_UI) == 64) 3783 return __vector_bitcast<_Up>( 3784 __have_avx512dq ? _mm512_movm_epi32(__k) 3785 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3787 else if constexpr (sizeof(_Up) == 8) 3789 if constexpr (sizeof(_UI) == 16) 3791 if constexpr (__have_avx512dq_vl) 3792 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3793 else if constexpr (__have_avx512dq) 3794 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3795 else if constexpr (__have_avx512vl) 3796 return __vector_bitcast<_Up>( 3797 _mm_maskz_mov_epi64(__k, ~__m128i())); 3798 else if constexpr (__have_avx512f) 3799 return __vector_bitcast<_Up>( 3800 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3801 // else fall through 3803 else if constexpr (sizeof(_UI) == 32) 3805 if constexpr (__have_avx512dq_vl) 3806 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3807 else if constexpr (__have_avx512dq) 3808 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3809 else if constexpr (__have_avx512vl) 3810 return __vector_bitcast<_Up>( 3811 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3812 else if constexpr (__have_avx512f) 3813 return __vector_bitcast<_Up>( 3814 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3815 // else fall through 3817 else if constexpr (sizeof(_UI) == 64) 3818 return __vector_bitcast<_Up>( 3819 __have_avx512dq ? _mm512_movm_epi64(__k) 3820 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3823 using _UpUInt = make_unsigned_t<_Up>; 3824 using _V = __vector_type_t<_UpUInt, _ToN>; 3825 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3826 if constexpr (_ToN == 2) 3828 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3830 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3832 if constexpr (sizeof(_Up) == 4) 3833 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3834 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3835 _mm256_castsi256_ps(_mm256_setr_epi32( 3836 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3837 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3838 else if constexpr (sizeof(_Up) == 8) 3839 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3840 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3841 _mm256_castsi256_pd( 3842 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3843 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3845 __assert_unreachable<_Up>(); 3847 else if constexpr (__bits_per_element >= _ToN) 3849 constexpr auto __bitmask 3850 = __generate_vector<_V>([](auto __i) 3851 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3852 { return __i < _ToN ? 
1ull << __i : 0; }); 3854 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3855 if constexpr (__bits_per_element > _ToN) 3856 return __vector_bitcast<_Up>(__bits) > 0; 3858 return __vector_bitcast<_Up>(__bits != 0); 3863 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3864 return static_cast<_UpUInt>( 3865 __k >> (__bits_per_element * (__i / __bits_per_element))); 3867 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3868 return static_cast<_UpUInt>(1ull 3869 << (__i % __bits_per_element)); 3870 }); // mask bit index 3871 return __intrin_bitcast<_UV>(__tmp != _V()); 3876 // _S_to_maskvector(_SimdWrapper) {{{ 3877 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3878 size_t _ToN = _UpN == 0 ? _Np : _UpN> 3879 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3880 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3882 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3883 using _TW = _SimdWrapper<_Tp, _Np>; 3884 using _UW = _SimdWrapper<_Up, _ToN>; 3885 using _UI = __intrinsic_type_t<_Up, _ToN>; 3886 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3887 return _S_to_maskvector<_Up, _ToN>( 3888 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3889 // vector -> vector bitcast 3890 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3891 && sizeof(_TW) == sizeof(_UW)) 3892 return __wrapper_bitcast<_Up, _ToN>( 3895 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); 3896 else // vector -> vector {{{ 3898 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3900 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3901 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3902 __vector_type_t<_Up, _ToN>>( 3903 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3905 using _To = __vector_type_t<_Up, _ToN>; 3906 [[maybe_unused]] constexpr size_t _FromN = _Np; 3907 constexpr int _FromBytes = sizeof(_Tp); 3908 constexpr int _ToBytes = sizeof(_Up); 3909 const auto __k = __x._M_data; 3911 if constexpr (_FromBytes == _ToBytes) 3912 return __intrin_bitcast<_To>(__k); 3913 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3915 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3916 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3917 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3920 = __vector_bitcast<int>(__interleave128_lo(__k, __k)); 3921 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3923 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3926 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3928 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3929 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3931 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3933 return __intrin_bitcast<_To>( 3934 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3935 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3936 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3938 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3939 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3940 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3943 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3944 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3946 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3948 if constexpr (__have_sse2 && !__have_ssse3) 3949 return __intrin_bitcast<_To>(_mm_packs_epi32( 3950 
_mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3953 return __intrin_bitcast<_To>( 3954 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3955 __vector_bitcast<_Up>(__k))); 3957 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3958 return __intrin_bitcast<_To>( 3959 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3960 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3961 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3962 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3964 return __intrin_bitcast<_To>( 3965 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3966 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3967 -1, -1, -1, -1, -1, -1, -1, 3969 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3972 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3973 __y = _mm_packs_epi32(__y, __m128i()); 3974 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3976 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3978 return __intrin_bitcast<_To>( 3979 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3980 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 3981 -1, -1, -1, -1, -1, -1, -1, 3983 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 3986 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3987 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3989 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 3990 return __intrin_bitcast<_To>( 3991 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 3993 __assert_unreachable<_Tp>(); 3995 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 3997 if constexpr (_FromBytes == _ToBytes) 3998 __assert_unreachable<_Tp>(); 3999 else if constexpr (_FromBytes == _ToBytes * 2) 4001 const auto __y = __vector_bitcast<_LLong>(__k); 4002 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4003 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4005 else if constexpr (_FromBytes == _ToBytes * 4) 4007 const auto __y = __vector_bitcast<_LLong>(__k); 4008 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4009 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4012 else if constexpr (_FromBytes == _ToBytes * 8) 4014 const auto __y = __vector_bitcast<_LLong>(__k); 4015 return __intrin_bitcast<_To>( 4016 _mm256_castsi128_si256(_mm_shuffle_epi8( 4017 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4018 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4019 -1, -1, -1, -1, -1)))); 4021 else if constexpr (_FromBytes * 2 == _ToBytes) 4023 auto __y = __xzyw(__to_intrin(__k)); 4024 if constexpr (is_floating_point_v< 4025 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4027 const auto __yy = __vector_bitcast<float>(__y); 4028 return __intrin_bitcast<_To>( 4029 _mm256_unpacklo_ps(__yy, __yy)); 4032 return __intrin_bitcast<_To>( 4033 _mm256_unpacklo_epi8(__y, __y)); 4035 else if constexpr (_FromBytes * 4 == _ToBytes) 4038 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4039 __lo128(__vector_bitcast<_LLong>( 4040 __k))); // drops 3/4 of input 4041 return __intrin_bitcast<_To>( 4042 __concat(_mm_unpacklo_epi16(__y, __y), 4043 _mm_unpackhi_epi16(__y, __y))); 4045 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4048 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4049 __lo128(__vector_bitcast<_LLong>( 4050 __k))); // drops 3/4 of input 4052 = _mm_unpacklo_epi16(__y, 4053 __y); // drops another 1/2 => 7/8 total 4054 return __intrin_bitcast<_To>( 4055 __concat(_mm_unpacklo_epi32(__y, __y), 4056 _mm_unpackhi_epi32(__y, __y))); 4059 
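              // Added note (not in the upstream header): the vector->vector
              // conversion branches above and below rely on the invariant that
              // every mask element is either all-zero or all-one bits.
              // Widening may therefore duplicate an element instead of
              // sign-extending it, e.g. unpacking a mask with itself for an
              // 8-bit -> 16-bit conversion:
              //   __k                         = {-1,  0, -1, -1, ...}  (epi8)
              //   _mm_unpacklo_epi8(__k, __k) = {-1, -1,  0,  0, -1, -1, -1, -1, ...}
              //                               = {-1,  0, -1, -1, ...}  (epi16 view)
              // and narrowing may use the saturating pack instructions, since
              // signed saturation maps 0 -> 0 and -1 -> -1 unchanged.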
__assert_unreachable<_Tp>(); 4061 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4063 if constexpr (_FromBytes == _ToBytes) 4064 return __intrin_bitcast<_To>( 4065 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4066 __zero_extend(__to_intrin(__k)))); 4067 else if constexpr (_FromBytes * 2 == _ToBytes) 4069 return __intrin_bitcast<_To>( 4070 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4071 __vector_bitcast<_LLong>(__k)), 4072 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4073 __vector_bitcast<_LLong>(__k)))); 4075 else if constexpr (_FromBytes * 4 == _ToBytes) 4077 if constexpr (__have_avx2) 4079 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4080 __concat(__vector_bitcast<_LLong>(__k), 4081 __vector_bitcast<_LLong>(__k)), 4082 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4083 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4084 6, 6, 7, 7, 7, 7))); 4088 return __intrin_bitcast<_To>(__concat( 4089 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4090 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4091 2, 2, 2, 2, 3, 3, 3, 3)), 4092 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4093 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4094 6, 6, 6, 6, 7, 7, 7, 4098 else if constexpr (_FromBytes * 8 == _ToBytes) 4100 if constexpr (__have_avx2) 4102 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4103 __concat(__vector_bitcast<_LLong>(__k), 4104 __vector_bitcast<_LLong>(__k)), 4105 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4106 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4107 3, 3, 3, 3, 3, 3))); 4111 return __intrin_bitcast<_To>(__concat( 4112 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4113 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4114 1, 1, 1, 1, 1, 1, 1, 1)), 4115 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4116 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4117 3, 3, 3, 3, 3, 3, 3, 4121 else if constexpr (_FromBytes == _ToBytes * 2) 4122 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4123 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4124 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4126 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4127 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4128 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4129 -1, -1, -1, -1, -1, -1, -1, 4132 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4134 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4135 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4136 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4137 -1, -1, -1, -1, -1, -1, -1, 4140 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4142 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4143 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4144 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4145 -1, -1, -1, -1, -1, -1, -1, 4149 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4151 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4153 if constexpr (_FromBytes == _ToBytes) 4155 return __intrin_bitcast<_To>(__lo128(__k)); 4157 else if constexpr (_FromBytes == _ToBytes * 2) 4159 auto __y = __vector_bitcast<_LLong>(__k); 4160 return __intrin_bitcast<_To>( 4161 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4163 else if constexpr (_FromBytes == _ToBytes * 4) 4165 auto __y = __vector_bitcast<_LLong>(__k); 4166 return __intrin_bitcast<_To>( 4167 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4170 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4172 auto __y = __vector_bitcast<_LLong>(__k); 4173 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4174 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4175 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4178 else if constexpr (_FromBytes * 2 == _ToBytes) 4180 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4181 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4183 else if constexpr (_FromBytes * 4 == _ToBytes) 4185 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4186 __y = _mm_unpacklo_epi8(__y, __y); 4187 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4189 else if constexpr (_FromBytes * 8 == _ToBytes) 4191 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4192 __y = _mm_unpacklo_epi8(__y, __y); 4193 __y = _mm_unpacklo_epi8(__y, __y); 4194 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4197 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4200 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4202 if constexpr (_FromBytes > _ToBytes) { 4203 const _To __y = __vector_bitcast<_Up>(__k); 4204 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4205 constexpr int _Stride = _FromBytes / _ToBytes; 4206 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4207 }(make_index_sequence<std::min(_ToN, _FromN)>()); 4209 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4210 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4211 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4213 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4214 constexpr int __dup = _ToBytes / _FromBytes; 4215 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4216 }(make_index_sequence<_FromN>()); 4224 template <typename _Tp, size_t _Np> 4225 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4226 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4228 if constexpr (is_same_v<_Tp, bool>) 4229 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4232 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4233 if (__builtin_is_constant_evaluated() 4234 || __builtin_constant_p(__x._M_data)) 4236 const auto __bools = -__x._M_data; 4237 const _ULLong __k = __call_with_n_evaluations<_Np>( 4238 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4239 return (__bits | ...); 4240 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4241 return _ULLong(__bools[+__i]) << __i; 4243 if (__builtin_is_constant_evaluated() 4244 || __builtin_constant_p(__k)) 4247 const auto __xi = __to_intrin(__x); 4248 if constexpr (sizeof(_Tp) == 1) 4249 if constexpr (sizeof(__xi) == 16) 4250 if constexpr (__have_avx512bw_vl) 4251 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4252 else // implies SSE2 4253 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4254 else if constexpr (sizeof(__xi) == 32) 4255 if constexpr (__have_avx512bw_vl) 4256 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4257 else // implies AVX2 4258 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4259 else // implies AVX512BW 4260 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4262 else if constexpr (sizeof(_Tp) == 2) 4263 if constexpr (sizeof(__xi) == 16) 4264 if constexpr (__have_avx512bw_vl) 4265 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4266 else if constexpr (__have_avx512bw) 4267 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4268 else // implies SSE2 4269 return _BitMask<_Np>( 4270 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4271 else if constexpr (sizeof(__xi) == 32) 4272 if constexpr (__have_avx512bw_vl) 4273 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4274 else if constexpr (__have_avx512bw) 4275 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4276 else // implies SSE2 4277 return _BitMask<_Np>(_mm_movemask_epi8( 4278 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4279 else // implies AVX512BW 4280 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4282 else if constexpr (sizeof(_Tp) == 4) 4283 if constexpr (sizeof(__xi) == 16) 4284 if constexpr (__have_avx512dq_vl) 4285 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4286 else if constexpr (__have_avx512vl) 4287 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4288 else if constexpr (__have_avx512dq) 4289 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4290 else if constexpr (__have_avx512f) 4291 return _BitMask<_Np>( 4292 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4294 return _BitMask<_Np>( 4295 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4296 else if 
constexpr (sizeof(__xi) == 32) 4297 if constexpr (__have_avx512dq_vl) 4298 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4299 else if constexpr (__have_avx512dq) 4300 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4301 else if constexpr (__have_avx512vl) 4302 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4303 else if constexpr (__have_avx512f) 4304 return _BitMask<_Np>( 4305 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4307 return _BitMask<_Np>( 4308 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4309 else // implies AVX512?? 4310 if constexpr (__have_avx512dq) 4311 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4312 else // implies AVX512F 4313 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4315 else if constexpr (sizeof(_Tp) == 8) 4316 if constexpr (sizeof(__xi) == 16) 4317 if constexpr (__have_avx512dq_vl) 4318 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4319 else if constexpr (__have_avx512dq) 4320 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4321 else if constexpr (__have_avx512vl) 4322 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4323 else if constexpr (__have_avx512f) 4324 return _BitMask<_Np>( 4325 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4326 else // implies SSE2 4327 return _BitMask<_Np>( 4328 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4329 else if constexpr (sizeof(__xi) == 32) 4330 if constexpr (__have_avx512dq_vl) 4331 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4332 else if constexpr (__have_avx512dq) 4333 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4334 else if constexpr (__have_avx512vl) 4335 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4336 else if constexpr (__have_avx512f) 4337 return _BitMask<_Np>( 4338 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4340 return _BitMask<_Np>( 4341 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4342 else // implies AVX512?? 4343 if constexpr (__have_avx512dq) 4344 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4345 else // implies AVX512F 4346 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4349 __assert_unreachable<_Tp>(); 4357 template <typename _Abi, typename> 4358 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4360 using _MaskImplX86Mixin::_S_to_bits; 4361 using _MaskImplX86Mixin::_S_to_maskvector; 4362 using _MaskImplBuiltin<_Abi>::_S_convert; 4365 template <typename _Tp> 4366 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4368 template <typename _Tp> 4369 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4371 template <typename _Tp> 4372 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4374 using _Base = _MaskImplBuiltin<_Abi>; 4378 template <typename _Tp> 4379 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4380 _S_broadcast(bool __x) 4382 if constexpr (__is_avx512_abi<_Abi>()) 4383 return __x ? 
_Abi::_S_masked(_MaskMember<_Tp>(-1)) 4384 : _MaskMember<_Tp>(); 4386 return _Base::template _S_broadcast<_Tp>(__x); 4391 template <typename _Tp> 4392 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4393 _S_load(const bool* __mem) 4395 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4396 if (__builtin_is_constant_evaluated()) 4398 if constexpr (__is_avx512_abi<_Abi>()) 4400 _MaskMember<_Tp> __r{}; 4401 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) 4402 __r._M_data |= _ULLong(__mem[__i]) << __i; 4406 return _Base::template _S_load<_Tp>(__mem); 4408 else if constexpr (__have_avx512bw) 4410 const auto __to_vec_or_bits 4411 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { 4412 if constexpr (__is_avx512_abi<_Abi>()) 4415 return _S_to_maskvector<_Tp>( 4416 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4419 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4422 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4423 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4425 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4428 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4429 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4431 else if constexpr (_S_size<_Tp> <= 64) 4434 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4435 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4438 else if constexpr (__is_avx512_abi<_Abi>()) 4440 if constexpr (_S_size<_Tp> <= 8) 4443 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4444 const auto __b = _mm512_cvtepi8_epi64(__a); 4445 return _mm512_test_epi64_mask(__b, __b); 4447 else if constexpr (_S_size<_Tp> <= 16) 4450 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4451 const auto __b = _mm512_cvtepi8_epi32(__a); 4452 return _mm512_test_epi32_mask(__b, __b); 4454 else if constexpr (_S_size<_Tp> <= 32) 4457 __builtin_memcpy(&__a, __mem, 16); 4458 const auto __b = _mm512_cvtepi8_epi32(__a); 4459 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4460 const auto __c = _mm512_cvtepi8_epi32(__a); 4461 return _mm512_test_epi32_mask(__b, __b) 4462 | (_mm512_test_epi32_mask(__c, __c) << 16); 4464 else if constexpr (_S_size<_Tp> <= 64) 4467 __builtin_memcpy(&__a, __mem, 16); 4468 const auto __b = _mm512_cvtepi8_epi32(__a); 4469 __builtin_memcpy(&__a, __mem + 16, 16); 4470 const auto __c = _mm512_cvtepi8_epi32(__a); 4471 if constexpr (_S_size<_Tp> <= 48) 4473 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4474 const auto __d = _mm512_cvtepi8_epi32(__a); 4475 return _mm512_test_epi32_mask(__b, __b) 4476 | (_mm512_test_epi32_mask(__c, __c) << 16) 4477 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4481 __builtin_memcpy(&__a, __mem + 16, 16); 4482 const auto __d = _mm512_cvtepi8_epi32(__a); 4483 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4484 const auto __e = _mm512_cvtepi8_epi32(__a); 4485 return _mm512_test_epi32_mask(__b, __b) 4486 | (_mm512_test_epi32_mask(__c, __c) << 16) 4487 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4488 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4492 __assert_unreachable<_Tp>(); 4494 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4495 return __vector_bitcast<_Tp>( 4496 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), 4497 -int(__mem[1]), -int(__mem[1])}); 4498 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4501 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4502 const auto __k = __to_intrin( 4503 (__vector_broadcast<4>(__bool4) 4504 & __make_vector<int>(0x1, 0x100, 0x10000, 4505 _S_size<_Tp> == 4 ? 
0x1000000 : 0)) 4507 return __vector_bitcast<_Tp>( 4508 __concat(_mm_unpacklo_epi32(__k, __k), 4509 _mm_unpackhi_epi32(__k, __k))); 4511 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) 4514 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); 4515 if constexpr (__have_sse2) 4517 __m128i __k = _mm_cvtsi32_si128(__bools); 4518 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4519 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4520 _mm_unpacklo_epi16(__k, __k)); 4524 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools)); 4526 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4527 _mm_cmpgt_ps(__k, __m128())); 4530 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) 4533 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4534 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4535 return __vector_bitcast<_Tp>( 4536 __concat(_mm_unpacklo_epi16(__k, __k), 4537 _mm_unpackhi_epi16(__k, __k))); 4539 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) 4542 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4543 __k = _mm_cmpgt_epi8(__k, __m128i()); 4544 if constexpr (_S_size<_Tp> <= 8) 4545 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4546 _mm_unpacklo_epi8(__k, __k)); 4548 return __concat(_mm_unpacklo_epi8(__k, __k), 4549 _mm_unpackhi_epi8(__k, __k)); 4552 return _Base::template _S_load<_Tp>(__mem); 4556 // _S_from_bitmask{{{ 4557 template <size_t _Np, typename _Tp> 4558 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 4559 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 4561 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4562 if constexpr (__is_avx512_abi<_Abi>()) 4563 return __bits._M_to_bits(); 4565 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); 4569 // _S_masked_load {{{2 4570 template <typename _Tp, size_t _Np> 4571 static inline _SimdWrapper<_Tp, _Np> 4572 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 4573 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 4575 if constexpr (__is_avx512_abi<_Abi>()) 4577 if constexpr (__have_avx512bw_vl) 4579 if constexpr (_Np <= 16) 4582 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); 4583 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); 4585 else if constexpr (_Np <= 32) 4588 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); 4589 return (__merge & ~__mask) 4590 | _mm256_test_epi8_mask(__a, __a); 4592 else if constexpr (_Np <= 64) 4595 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); 4596 return (__merge & ~__mask) 4597 | _mm512_test_epi8_mask(__a, __a); 4600 __assert_unreachable<_Tp>(); 4604 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4605 __merge._M_set(__i, __mem[__i]); 4610 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) 4612 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4613 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(), 4614 _mm256_mask_loadu_epi8(__m256i(), 4617 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1) 4619 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4621 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k, 4623 _mm_mask_loadu_epi8(__m128i(), __k, __mem)); 4625 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2) 4627 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4628 __merge = _mm256_mask_sub_epi16( 4629 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4630 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4632 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2) 4634 const auto __k = 
_S_to_bits(__mask)._M_to_bits(); 4635 __merge = _mm_mask_sub_epi16( 4636 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4637 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4639 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4) 4641 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4642 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32( 4643 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4644 _mm256_cvtepi8_epi32( 4645 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4647 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4) 4649 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4650 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32( 4651 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4652 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4654 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8) 4656 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4657 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64( 4658 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4659 _mm256_cvtepi8_epi64( 4660 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4662 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8) 4664 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4665 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64( 4666 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4667 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4670 return _Base::_S_masked_load(__merge, __mask, __mem); 4675 template <typename _Tp, size_t _Np> 4676 _GLIBCXX_SIMD_INTRINSIC static constexpr void 4677 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept 4679 if (__builtin_is_constant_evaluated()) 4680 _Base::_S_store(__v, __mem); 4681 else if constexpr (__is_avx512_abi<_Abi>()) 4683 if constexpr (__have_avx512bw_vl) 4684 _CommonImplX86::_S_store<_Np>( 4685 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4686 if constexpr (_Np <= 16) 4687 return _mm_maskz_set1_epi8(__data, 1); 4688 else if constexpr (_Np <= 32) 4689 return _mm256_maskz_set1_epi8(__data, 1); 4691 return _mm512_maskz_set1_epi8(__data, 1); 4694 else if constexpr (_Np <= 8) 4695 _CommonImplX86::_S_store<_Np>( 4696 __vector_bitcast<char>( 4697 #if defined __x86_64__ 4698 __make_wrapper<_ULLong>( 4699 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull) 4701 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U), 4702 _pdep_u32(__v._M_data >> 4, 4707 else if constexpr (_Np <= 16) 4708 _mm512_mask_cvtepi32_storeu_epi8( 4709 __mem, 0xffffu >> (16 - _Np), 4710 _mm512_maskz_set1_epi32(__v._M_data, 1)); 4712 __assert_unreachable<_Tp>(); 4714 else if constexpr (__is_sse_abi<_Abi>()) //{{{ 4716 if constexpr (_Np == 2 && sizeof(_Tp) == 8) 4718 const auto __k = __vector_bitcast<int>(__v); 4722 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 4724 if constexpr (__have_sse2) 4726 const unsigned __bool4 4727 = __vector_bitcast<_UInt>(_mm_packs_epi16( 4728 _mm_packs_epi32(__intrin_bitcast<__m128i>( 4733 __builtin_memcpy(__mem, &__bool4, _Np); 4735 else if constexpr (__have_mmx) 4737 const __m64 __k = _mm_cvtps_pi8( 4738 __and(__to_intrin(__v), _mm_set1_ps(1.f))); 4739 __builtin_memcpy(__mem, &__k, _Np); 4743 return _Base::_S_store(__v, __mem); 4745 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 4747 _CommonImplX86::_S_store<_Np>( 4748 __vector_bitcast<char>(_mm_packs_epi16( 4749 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15), 4753 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 4754 
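          // Added note (not in the upstream header): vector mask elements are
          // 0 or ~0 (all bits set), while a bool array expects the byte values
          // 0 or 1.  The stores in this function therefore reduce each true
          // element to 1 first, either by masking with 1 (as in the
          // byte-element case below) or by shifting the sign bit down (the
          // `>> 15` used for the 16-bit elements above).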
_CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem); 4756 __assert_unreachable<_Tp>(); 4758 else if constexpr (__is_avx_abi<_Abi>()) // {{{ 4760 if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 4762 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4764 if constexpr (__have_avx2) 4765 __bool4 = _mm256_movemask_epi8(__k); 4767 __bool4 = (_mm_movemask_epi8(__lo128(__k)) 4768 | (_mm_movemask_epi8(__hi128(__k)) << 16)); 4769 __bool4 &= 0x01010101; 4770 __builtin_memcpy(__mem, &__bool4, _Np); 4772 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4) 4774 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4776 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)), 4779 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i())); 4780 _CommonImplX86::_S_store<_Np>(__k3, __mem); 4782 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2) 4784 if constexpr (__have_avx2) 4786 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15); 4787 const auto __bools = __vector_bitcast<char>( 4788 _mm_packs_epi16(__lo128(__x), __hi128(__x))); 4789 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4795 & __vector_bitcast<_UChar>( 4796 _mm_packs_epi16(__lo128(__to_intrin(__v)), 4797 __hi128(__to_intrin(__v)))); 4798 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4801 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1) 4802 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem); 4804 __assert_unreachable<_Tp>(); 4807 __assert_unreachable<_Tp>(); 4810 // _S_masked_store {{{2 4811 template <typename _Tp, size_t _Np> 4813 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 4814 const _SimdWrapper<_Tp, _Np> __k) noexcept 4816 if constexpr (__is_avx512_abi<_Abi>()) 4818 static_assert(is_same_v<_Tp, bool>); 4819 if constexpr (_Np <= 16 && __have_avx512bw_vl) 4820 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1)); 4821 else if constexpr (_Np <= 16) 4822 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k, 4823 _mm512_maskz_set1_epi32(__v, 1)); 4824 else if constexpr (_Np <= 32 && __have_avx512bw_vl) 4825 _mm256_mask_storeu_epi8(__mem, __k, 4826 _mm256_maskz_set1_epi8(__v, 1)); 4827 else if constexpr (_Np <= 32 && __have_avx512bw) 4828 _mm256_mask_storeu_epi8(__mem, __k, 4829 __lo256(_mm512_maskz_set1_epi8(__v, 1))); 4830 else if constexpr (_Np <= 64 && __have_avx512bw) 4831 _mm512_mask_storeu_epi8(__mem, __k, 4832 _mm512_maskz_set1_epi8(__v, 1)); 4834 __assert_unreachable<_Tp>(); 4837 _Base::_S_masked_store(__v, __mem, __k); 4840 // logical and bitwise operators {{{2 4841 template <typename _Tp, size_t _Np> 4842 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4843 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4845 if constexpr (is_same_v<_Tp, bool>) 4847 if (__builtin_is_constant_evaluated()) 4848 return __x._M_data & __y._M_data; 4849 else if constexpr (__have_avx512dq && _Np <= 8) 4850 return _kand_mask8(__x._M_data, __y._M_data); 4851 else if constexpr (_Np <= 16) 4852 return _kand_mask16(__x._M_data, __y._M_data); 4853 else if constexpr (__have_avx512bw && _Np <= 32) 4854 return _kand_mask32(__x._M_data, __y._M_data); 4855 else if constexpr (__have_avx512bw && _Np <= 64) 4856 return _kand_mask64(__x._M_data, __y._M_data); 4858 __assert_unreachable<_Tp>(); 4861 return _Base::_S_logical_and(__x, __y); 4864 template <typename _Tp, size_t _Np> 4865 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4866 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4868 if constexpr (is_same_v<_Tp, bool>) 4870 if 
(__builtin_is_constant_evaluated())
          return __x._M_data | __y._M_data;
        else if constexpr (__have_avx512dq && _Np <= 8)
          return _kor_mask8(__x._M_data, __y._M_data);
        else if constexpr (_Np <= 16)
          return _kor_mask16(__x._M_data, __y._M_data);
        else if constexpr (__have_avx512bw && _Np <= 32)
          return _kor_mask32(__x._M_data, __y._M_data);
        else if constexpr (__have_avx512bw && _Np <= 64)
          return _kor_mask64(__x._M_data, __y._M_data);
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_logical_or(__x, __y);
  }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
    {
      if constexpr (is_same_v<_Tp, bool>)
        {
          if (__builtin_is_constant_evaluated())
            return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>();
          else if constexpr (__have_avx512dq && _Np <= 8)
            return _kandn_mask8(__x._M_data,
                                _Abi::template __implicit_mask_n<_Np>());
          else if constexpr (_Np <= 16)
            return _kandn_mask16(__x._M_data,
                                 _Abi::template __implicit_mask_n<_Np>());
          else if constexpr (__have_avx512bw && _Np <= 32)
            return _kandn_mask32(__x._M_data,
                                 _Abi::template __implicit_mask_n<_Np>());
          else if constexpr (__have_avx512bw && _Np <= 64)
            return _kandn_mask64(__x._M_data,
                                 _Abi::template __implicit_mask_n<_Np>());
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_bit_not(__x);
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
               const _SimdWrapper<_Tp, _Np>& __y)
    {
      if constexpr (is_same_v<_Tp, bool>)
        {
          if (__builtin_is_constant_evaluated())
            return __x._M_data & __y._M_data;
          else if constexpr (__have_avx512dq && _Np <= 8)
            return _kand_mask8(__x._M_data, __y._M_data);
          else if constexpr (_Np <= 16)
            return _kand_mask16(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 32)
            return _kand_mask32(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 64)
            return _kand_mask64(__x._M_data, __y._M_data);
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_bit_and(__x, __y);
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
              const _SimdWrapper<_Tp, _Np>& __y)
    {
      if constexpr (is_same_v<_Tp, bool>)
        {
          if (__builtin_is_constant_evaluated())
            return __x._M_data | __y._M_data;
          else if constexpr (__have_avx512dq && _Np <= 8)
            return _kor_mask8(__x._M_data, __y._M_data);
          else if constexpr (_Np <= 16)
            return _kor_mask16(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 32)
            return _kor_mask32(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 64)
            return _kor_mask64(__x._M_data, __y._M_data);
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_bit_or(__x, __y);
    }

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
               const _SimdWrapper<_Tp, _Np>& __y)
    {
      if constexpr (is_same_v<_Tp, bool>)
        {
          if (__builtin_is_constant_evaluated())
            return __x._M_data ^ __y._M_data;
          else if constexpr (__have_avx512dq && _Np <= 8)
            return _kxor_mask8(__x._M_data, __y._M_data);
          else if constexpr (_Np <= 16)
            return _kxor_mask16(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 32)
            return _kxor_mask32(__x._M_data, __y._M_data);
          else if constexpr (__have_avx512bw && _Np <= 64)
            return _kxor_mask64(__x._M_data, __y._M_data);
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_bit_xor(__x, __y);
    }

  // _S_masked_assign{{{
  template <size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                     _SimdWrapper<bool, _Np>& __lhs,
                     _SimdWrapper<bool, _Np> __rhs)
    {
      __lhs._M_data
        = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
    }

  template <size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void
    _S_masked_assign(_SimdWrapper<bool, _Np> __k,
                     _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
    {
      if (__rhs)
        __lhs._M_data = __k._M_data | __lhs._M_data;
      else
        __lhs._M_data = ~__k._M_data & __lhs._M_data;
    }

  using _MaskImplBuiltin<_Abi>::_S_masked_assign;

  // }}}
  // _S_all_of {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static bool
    _S_all_of(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
        {
          constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
          using _TI = __intrinsic_type_t<_Tp, _Np>;
          const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
          if constexpr (__have_sse4_1)
            {
              _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                = _Abi::template _S_implicit_mask_intrin<_Tp>();
              return 0 != __testc(__a, __b);
            }
          else if constexpr (is_same_v<_Tp, float>)
            return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
                   == (1 << _Np) - 1;
          else if constexpr (is_same_v<_Tp, double>)
            return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
                   == (1 << _Np) - 1;
          else
            return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                   == (1 << (_Np * sizeof(_Tp))) - 1;
        }
      else if constexpr (__is_avx512_abi<_Abi>())
        {
          constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
          const auto __kk = __k._M_data._M_data;
          if constexpr (sizeof(__kk) == 1)
            {
              if constexpr (__have_avx512dq)
                return _kortestc_mask8_u8(__kk, _Mask == 0xff
                                                  ? __kk
                                                  : __mmask8(~_Mask));
              else
                return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
            }
          else if constexpr (sizeof(__kk) == 2)
            return _kortestc_mask16_u8(__kk, _Mask == 0xffff
                                               ? __kk
                                               : __mmask16(~_Mask));
          else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
            return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
                                               ? __kk
                                               : __mmask32(~_Mask));
          else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
            return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
                                               ? __kk
                                               : __mmask64(~_Mask));
          else
            __assert_unreachable<_Tp>();
        }
    }

  // }}}
  // _S_any_of {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static bool
    _S_any_of(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
        {
          constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
          using _TI = __intrinsic_type_t<_Tp, _Np>;
          const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
          if constexpr (__have_sse4_1)
            {
              if constexpr (_Abi::template _S_is_partial<_Tp>
                              || sizeof(__k) < 16)
                {
                  _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                    = _Abi::template _S_implicit_mask_intrin<_Tp>();
                  return 0 == __testz(__a, __b);
                }
              else
                return 0 == __testz(__a, __a);
            }
          else if constexpr (is_same_v<_Tp, float>)
            return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
          else if constexpr (is_same_v<_Tp, double>)
            return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
          else
            return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                   != 0;
        }
      else if constexpr (__is_avx512_abi<_Abi>())
        return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
               != 0;
    }

  // }}}
  // _S_none_of {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static bool
    _S_none_of(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
        {
          constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
          using _TI = __intrinsic_type_t<_Tp, _Np>;
          const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
          if constexpr (__have_sse4_1)
            {
              if constexpr (_Abi::template _S_is_partial<_Tp>
                              || sizeof(__k) < 16)
                {
                  _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                    = _Abi::template _S_implicit_mask_intrin<_Tp>();
                  return 0 != __testz(__a, __b);
                }
              else
                return 0 != __testz(__a, __a);
            }
          else if constexpr (is_same_v<_Tp, float>)
            return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
          else if constexpr (is_same_v<_Tp, double>)
            return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
          else
            return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
                   == 0;
        }
      else if constexpr (__is_avx512_abi<_Abi>())
        return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
               == 0;
    }

  // }}}
  // _S_some_of {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static bool
    _S_some_of(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
        {
          constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
          using _TI = __intrinsic_type_t<_Tp, _Np>;
          const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
          if constexpr (__have_sse4_1)
            {
              _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                = _Abi::template _S_implicit_mask_intrin<_Tp>();
              return 0 != __testnzc(__a, __b);
            }
          else if constexpr (is_same_v<_Tp, float>)
            {
              constexpr int __allbits = (1 << _Np) - 1;
              const auto __tmp = _mm_movemask_ps(__a) & __allbits;
              return __tmp > 0 && __tmp < __allbits;
            }
          else if constexpr (is_same_v<_Tp, double>)
            {
              constexpr int __allbits = (1 << _Np) - 1;
              const auto __tmp = _mm_movemask_pd(__a) & __allbits;
              return __tmp > 0 && __tmp < __allbits;
            }
          else
            {
              constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
              const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
              return __tmp > 0 && __tmp < __allbits;
            }
        }
      else if constexpr (__is_avx512_abi<_Abi>())
        return _S_any_of(__k) && !_S_all_of(__k);
      else
        __assert_unreachable<_Tp>();
    }

  // }}}
  // _S_popcount {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static int
    _S_popcount(simd_mask<_Tp, _Abi> __k)
    {
      constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
      const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
      if constexpr (__is_avx512_abi<_Abi>())
        {
          if constexpr (_Np > 32)
            return __builtin_popcountll(__kk);
          else
            return __builtin_popcount(__kk);
        }
      else
        {
          if constexpr (__have_popcnt)
            {
              int __bits
                = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
              const int __count = __builtin_popcount(__bits);
              return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
            }
          else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
            {
              const int mask = _mm_movemask_pd(__auto_bitcast(__kk));
              return mask - (mask >> 1);
            }
          else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
            {
              auto __x = -(__lo128(__kk) + __hi128(__kk));
              return __x[0] + __x[1];
            }
          else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
            {
              if constexpr (__have_sse2)
                {
                  __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
                  __x = _mm_add_epi32(
                    __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
                  __x = _mm_add_epi32(
                    __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
                  return -_mm_cvtsi128_si32(__x);
                }
              else
                return __builtin_popcount(
                  _mm_movemask_ps(__auto_bitcast(__kk)));
            }
          else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
            {
              auto __x = __to_intrin(__kk);
              __x = _mm_add_epi16(
                __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
              __x = _mm_add_epi16(
                __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
              __x = _mm_add_epi16(
                __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
              return -short(_mm_extract_epi16(__x, 0));
            }
          else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
            {
              auto __x = __to_intrin(__kk);
              __x = _mm_add_epi8(
                __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
              __x = _mm_add_epi8(
                __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
              __x = _mm_add_epi8(
                __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
              auto __y = -__vector_bitcast<_UChar>(__x);
              if constexpr (__have_sse4_1)
                return __y[0] + __y[1];
              else
                {
                  unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
                  return (__z & 0xff) + (__z >> 8);
                }
            }
          else if constexpr (sizeof(__kk) == 32)
            {
              // The following works only as long as the implementations above
              // use a summation.
              using _I = __int_for_sizeof_t<_Tp>;
              const auto __as_int = __vector_bitcast<_I>(__kk);
              return _MaskImplX86<simd_abi::__sse>::_S_popcount(
                simd_mask<_I, simd_abi::__sse>(__private_init,
                                               __lo128(__as_int)
                                                 + __hi128(__as_int)));
            }
          else
            __assert_unreachable<_Tp>();
        }
    }

  // }}}
  // _S_find_first_set {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static int
    _S_find_first_set(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_avx512_abi<_Abi>())
        return std::__countr_zero(__k._M_data._M_data);
      else
        return _Base::_S_find_first_set(__k);
    }

  // }}}
  // _S_find_last_set {{{
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static int
    _S_find_last_set(simd_mask<_Tp, _Abi> __k)
    {
      if constexpr (__is_avx512_abi<_Abi>())
        return std::__bit_width(_Abi::_S_masked(__k._M_data)._M_data) - 1;
      else
        return _Base::_S_find_last_set(__k);
    }

  // }}}
};

// }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
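For orientation, here is a small, self-contained usage sketch (not part of this header) showing how the mask reductions implemented above surface through the public <experimental/simd> interface: all_of, any_of, none_of, some_of, popcount, find_first_set and find_last_set dispatch, via the active ABI tag, to the _S_all_of/_S_any_of/_S_popcount/... members listed in this file. The build flags named in the comment are only one plausible way to compile it.

// Hedged usage sketch; assumes a compiler with <experimental/simd>
// (e.g. GCC's libstdc++) built as C++17 with x86 SIMD enabled,
// for example: g++ -std=c++17 -march=native example.cpp
#include <experimental/simd>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  using floatv = stdx::native_simd<float>;

  // Generator constructor: element i gets the value float(i).
  floatv x([](int i) { return float(i); });

  // Element-wise comparison yields a simd_mask.
  stdx::native_simd_mask<float> k = x > 1.5f;

  // These free functions forward to the ABI-specific mask implementation
  // (on x86, the hooks shown above).
  std::printf("all_of:   %d\n", int(stdx::all_of(k)));
  std::printf("any_of:   %d\n", int(stdx::any_of(k)));
  std::printf("none_of:  %d\n", int(stdx::none_of(k)));
  std::printf("some_of:  %d\n", int(stdx::some_of(k)));
  std::printf("popcount: %d\n", stdx::popcount(k));

  // find_first_set/find_last_set require at least one set element.
  if (stdx::any_of(k))
    std::printf("first/last set: %d/%d\n",
                stdx::find_first_set(k), stdx::find_last_set(k));
}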