This implements basic_mask for all element types except complex.
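
For reference, a minimal usage sketch of the interface provided here, assuming the <simd>
umbrella header and the vec/mask alias templates introduced by earlier patches in this
series:

  using floatm = std::simd::mask<float, 8>;   // i.e. basic_mask<4, Abi> for a deduced Abi

  floatm a(true);                             // broadcast constructor
  floatm b([](int i) { return i % 3 == 0; }); // generator constructor (callable must return bool)
  floatm c(0b1010'1010u);                     // unsigned-integer constructor, bit i -> element i

  bool x = b[2];                              // element access
  unsigned long long bits = c.to_ullong();    // named conversions; also to_bitset()
  auto ints = -b;                             // unary minus yields a vec of 4-byte integers (-1/0)
  floatm d = (a && !c) | b;                   // element-wise logical and bitwise operators

The complex-interleaved ABI variant is still excluded via the requires clauses on both
specializations.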

libstdc++-v3/ChangeLog:

	* include/bits/simd_mask.h: New file.

Signed-off-by: Matthias Kretz <[email protected]>
---
libstdc++-v3/include/bits/simd_mask.h | 1732 +++++++++++++++++++++++++
1 file changed, 1732 insertions(+)
create mode 100644 libstdc++-v3/include/bits/simd_mask.h
--
──────────────────────────────────────────────────────────────────────────
Dr. Matthias Kretz https://mattkretz.github.io
GSI Helmholtz Center for Heavy Ion Research https://gsi.de
std::simd
──────────────────────────────────────────────────────────────────────────

diff --git a/libstdc++-v3/include/bits/simd_mask.h b/libstdc++-v3/include/bits/simd_mask.h
new file mode 100644
index 00000000000..7bb8598ca4c
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_mask.h
@@ -0,0 +1,1732 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ * Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_MASK_H
+#define _GLIBCXX_SIMD_MASK_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_iterator.h"
+#include "vec_ops.h"
+#if _GLIBCXX_X86
+#include "simd_x86.h"
+#endif
+
+#include <bit>
+#include <bitset>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
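+  // Index-permutation functor that swaps each group of _Np adjacent elements with the
+  // neighboring group of _Np elements (used with _S_static_permute, e.g. for the pairwise
+  // OR steps in _M_to_uint).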
+ template <unsigned _Np>
+ struct _SwapNeighbors
+ {
+ consteval unsigned
+ operator()(unsigned __i, unsigned __size) const
+ {
+ if (__size % (2 * _Np) != 0)
+ __builtin_abort(); // swap_neighbors<N> permutation requires a multiple of 2N elements
+ else if (std::has_single_bit(_Np))
+ return __i ^ _Np;
+ else if (__i % (2 * _Np) >= _Np)
+ return __i - _Np;
+ else
+ return __i + _Np;
+ }
+ };
+
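+  // Split a bitset<_Mp> into a low part of _Np bits and a high part of the remaining
+  // _Mp - _Np bits, in the form consumed by the two sub-masks of a multi-register
+  // basic_mask: bitset halves if _Np is a multiple of the word size, otherwise a pair of
+  // _Bitmask integers (the bitset must then fit into an unsigned long long).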
+ template <size_t _Np, size_t _Mp>
+ constexpr auto
+ __bitset_split(const bitset<_Mp>& __b)
+ {
+ constexpr auto __bits_per_word = __CHAR_BIT__ * __SIZEOF_LONG__;
+ if constexpr (_Np % __bits_per_word == 0)
+ {
+ struct _Tmp
+ {
+ bitset<_Np> _M_lo;
+ bitset<_Mp - _Np> _M_hi;
+ };
+ return __builtin_bit_cast(_Tmp, __b);
+ }
+ else
+ {
+ constexpr auto __bits_per_ullong = __CHAR_BIT__ * __SIZEOF_LONG_LONG__;
+ static_assert(_Mp <= __bits_per_ullong);
+ using _Lo = _Bitmask<_Np>;
+ using _Hi = _Bitmask<_Mp - _Np>;
+ struct _Tmp
+ {
+ _Lo _M_lo;
+ _Hi _M_hi;
+ };
+ return _Tmp {static_cast<_Lo>(__b.to_ullong()), static_cast<_Hi>(__b.to_ullong() >> _Np)};
+ }
+ }
+
+ // [simd.traits]
+ // --- rebind ---
+ template <typename _Tp, typename _Vp, _ArchTraits _Traits = {}>
+ struct rebind
+ {};
+
+ template <__vectorizable _Tp, __simd_vec_type _Vp, _ArchTraits _Traits>
+ //requires requires { typename __deduce_abi_t<_Tp, _Vp::size()>; }
+ struct rebind<_Tp, _Vp, _Traits>
+ { using type = __similar_vec<_Tp, _Vp::size(), typename _Vp::abi_type>; };
+
+ template <__vectorizable _Tp, __simd_mask_type _Mp, _ArchTraits _Traits>
+ //requires requires { typename __deduce_abi_t<_Tp, _Mp::size()>; }
+ struct rebind<_Tp, _Mp, _Traits>
+ { using type = __similar_mask<_Tp, _Mp::size(), typename _Mp::abi_type>; };
+
+ template <typename _Tp, typename _Vp>
+ using rebind_t = typename rebind<_Tp, _Vp>::type;
+
+ // --- resize ---
+ template <__simd_size_type _Np, typename _Vp, _ArchTraits _Traits = {}>
+ struct resize
+ {};
+
+ template <__simd_size_type _Np, __data_parallel_type _Vp, _ArchTraits _Traits>
+ requires requires { typename _Vp::mask_type; }
+ //requires requires { typename __deduce_abi_t<typename _Vp::value_type, _Np>; }
+ struct resize<_Np, _Vp, _Traits>
+ { using type = __similar_vec<typename _Vp::value_type, _Np, typename _Vp::abi_type>; };
+
+ template <__simd_size_type _Np, __simd_mask_type _Mp, _ArchTraits _Traits>
+ //requires requires { typename __deduce_abi_t<typename _Mp::value_type, _Np>; }
+ struct resize<_Np, _Mp, _Traits>
+ {
+ using _A1 = decltype(__abi_rebind<__mask_element_size<_Mp>, _Np, typename _Mp::abi_type,
+ true>());
+
+ static_assert(_Mp::abi_type::_S_variant == _A1::_S_variant or is_same_v<_A1, _ScalarAbi<_Np>>
+ or is_same_v<typename _Mp::abi_type, _ScalarAbi<_Mp::size()>>);
+
+ using type = basic_mask<__mask_element_size<_Mp>, _A1>;
+ };
+
+ template <__simd_size_type _Np, typename _Vp>
+ using resize_t = typename resize<_Np, _Vp>::type;
+
+ // [simd.syn]
+ static constexpr __simd_size_type zero_element = -1 << (sizeof(int) * __CHAR_BIT__ - 1);
+
+ static constexpr __simd_size_type uninit_element = zero_element + 1;
+
+ // [simd.permute.static]
+ template<__simd_size_type _Np = 0, __simd_vec_or_mask_type _Vp,
+ __index_permutation_function<_Vp> _IdxMap>
+ [[__gnu__::__always_inline__]]
+ constexpr resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>
+ permute(const _Vp& __v, _IdxMap&& __idxmap)
+ { return resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>::_S_static_permute(__v, __idxmap); }
+
+ // [simd.permute.dynamic]
+ template<__simd_vec_or_mask_type _Vp, __simd_integral _Ip>
+ [[__gnu__::__always_inline__]]
+ constexpr resize_t<_Ip::size(), _Vp>
+ permute(const _Vp& __v, const _Ip& __indices)
+ { return __v[__indices]; }
+
+ // [simd.creation] ----------------------------------------------------------
+ template<__simd_vec_type _Vp, typename _Ap>
+ constexpr auto
+ chunk(const basic_vec<typename _Vp::value_type, _Ap>& __x) noexcept
+ { return __x.template _M_chunk<_Vp>(); }
+
+ template<__simd_mask_type _Mp, typename _Ap>
+ constexpr auto
+ chunk(const basic_mask<__mask_element_size<_Mp>, _Ap>& __x) noexcept
+ { return __x.template _M_chunk<_Mp>(); }
+
+ template<__simd_size_type _Np, typename _Tp, typename _Ap>
+ constexpr auto
+ chunk(const basic_vec<_Tp, _Ap>& __x) noexcept
+ { return chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x); }
+
+ template<__simd_size_type _Np, size_t _Bytes, typename _Ap>
+ constexpr auto
+ chunk(const basic_mask<_Bytes, _Ap>& __x) noexcept
+ { return chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x); }
+
+ template<typename _Tp, typename... _Abis>
+ constexpr basic_vec<_Tp, __deduce_abi_t<_Tp, (_Abis::_S_size + ...)>>
+ cat(const basic_vec<_Tp, _Abis>&... __xs) noexcept
+ { return basic_vec<_Tp, __deduce_abi_t<_Tp, (_Abis::_S_size + ...)>>::_S_concat(__xs...); }
+
+ template<size_t _Bytes, typename... _Abis>
+ constexpr basic_mask<_Bytes, __deduce_abi_t<__integer_from<_Bytes>, (_Abis::_S_size + ...)>>
+ cat(const basic_mask<_Bytes, _Abis>&... __xs) noexcept
+ { static_assert(false, "TODO: cat"); }
+
+ // [simd.mask] --------------------------------------------------------------
+ template <size_t _Bytes, typename _Abi>
+ class basic_mask
+ {
+ public:
+ using value_type = bool;
+
+ using abi_type = _Abi;
+
+#define _GLIBCXX_DELETE_SIMD \
+ _GLIBCXX_DELETE_MSG("This specialization is disabled because of an invalid combination " \
+ "of template arguments to basic_mask.")
+
+ basic_mask() = _GLIBCXX_DELETE_SIMD;
+
+ ~basic_mask() = _GLIBCXX_DELETE_SIMD;
+
+ basic_mask(const basic_mask&) = _GLIBCXX_DELETE_SIMD;
+
+ basic_mask& operator=(const basic_mask&) = _GLIBCXX_DELETE_SIMD;
+
+#undef _GLIBCXX_DELETE_SIMD
+ };
+
+ template <size_t _Bytes, __abi_tag _Ap>
+ requires (_Ap::_S_nreg == 1) and (not _Ap::_S_is_cx_ileav)
+ class basic_mask<_Bytes, _Ap>
+ {
+ template <size_t, typename>
+ friend class basic_mask;
+
+ template <typename, typename>
+ friend class basic_vec;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
+ static constexpr bool _S_is_scalar = is_same_v<_Ap, _ScalarAbi<_Ap::_S_size>>;
+
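+    // True if the ABI stores the mask with one bit per element (e.g. in AVX-512 mask
+    // registers) instead of a vector-mask with one integer element per mask element.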
+ static constexpr bool _S_use_bitmask = [] {
+ if constexpr (_S_is_scalar)
+ return false;
+ else
+ return __flags_test(_Ap::_S_variant, _AbiVariant::_BitMask);
+ }();
+
+ static constexpr int _S_full_size = [] {
+ if constexpr (_S_is_scalar)
+ return _S_size;
+ else if constexpr (_S_use_bitmask and _S_size < __CHAR_BIT__)
+ return __CHAR_BIT__;
+ else
+ return __bit_ceil(unsigned(_S_size));
+ }();
+
+ static constexpr bool _S_is_partial = _S_size != _S_full_size;
+
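+    // Stored representation of the mask: bool for the scalar ABI, an unsigned integer
+    // holding one bit per element for bit-masks, otherwise a vector builtin with one
+    // integer element per mask element.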
+ using _DataType = typename _Ap::template _MaskDataType<_Bytes>;
+
+ static constexpr _DataType _S_implicit_mask = [] {
+ if constexpr (_S_is_scalar)
+ return true;
+ else if (not _S_is_partial)
+ return _DataType(~_DataType());
+ else if constexpr (_S_use_bitmask)
+ return _DataType((_DataType(1) << _S_size) - 1);
+ else
+ {
+ constexpr auto [...__is] = __iota<int[_S_full_size]>;
+ return _DataType{ (__is < _S_size ? -1 : 0)... };
+ }
+ }();
+
+ using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
+
+ static_assert(destructible<_VecType>);
+
+ static constexpr bool _S_has_bool_member = _S_is_scalar;
+
+ // Actual padding bytes, not padding elements.
+ // => _S_padding_bytes is 0 even if _S_is_partial is true.
+ static constexpr size_t _S_padding_bytes = 0;
+
+ _DataType _M_data;
+
+ public:
+ using value_type = bool;
+
+ using abi_type = _Ap;
+
+ using iterator = __iterator<basic_mask>;
+
+ using const_iterator = __iterator<const basic_mask>;
+
+ constexpr iterator
+ begin() noexcept
+ { return {*this, 0}; }
+
+ constexpr const_iterator
+ begin() const noexcept
+ { return {*this, 0}; }
+
+ constexpr const_iterator
+ cbegin() const noexcept
+ { return {*this, 0}; }
+
+ constexpr default_sentinel_t
+ end() const noexcept
+ { return {}; }
+
+ constexpr default_sentinel_t
+ cend() const noexcept
+ { return {}; }
+
+ static constexpr auto size = __simd_size_constant<_S_size>;
+
+ // internal but public API ----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(_DataType __x)
+ {
+ basic_mask __r;
+ __r._M_data = __x;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(unsigned_integral auto __bits)
+ { return basic_mask(__bits); }
+
+ /** \internal
+ * Bit-cast the given object \p __x to basic_mask.
+ *
+ * This is necessary for _S_nreg > 1 where the last element can be bool or when the sizeof
+ * doesn't match because of different alignment requirements of the sub-masks.
+ */
+ template <size_t _UBytes, typename _UAbi>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+ { return __builtin_bit_cast(basic_mask, __x._M_concat_data()); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data() const
+ {
+ if constexpr (_S_is_scalar)
+ return __vec_builtin_type<__integer_from<_Bytes>, 1>{__integer_from<_Bytes>(-_M_data)};
+ else if constexpr (_S_is_partial)
+ return _M_data & _S_implicit_mask;
+ else
+ return _M_data;
+ }
+
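+    // Return a mask with the first __n elements true and the remaining elements false
+    // (precondition: 0 < __n < _S_size).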
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_partial_mask_of_n(int __n)
+ {
+ // assume __n > 0 and __n < _S_size
+ static_assert(not _S_is_scalar);
+ if constexpr (not _S_use_bitmask)
+ return _VecType([&](__integer_from<_Bytes> __i) { return __i; })
+ < __integer_from<_Bytes>(__n);
+ else
+ {
+#if __has_builtin(__builtin_ia32_bzhi_si)
+ if constexpr (_S_size <= 32 and _Traits._M_have_bmi2())
+ return __builtin_ia32_bzhi_si(~0u >> (32 - _S_size), unsigned(__n));
+#endif
+#if __has_builtin(__builtin_ia32_bzhi_di)
+ if constexpr (_S_size <= 64 and _Traits._M_have_bmi2())
+ return __builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n));
+#endif
+ if constexpr (_S_size <= 32)
+ return (1u << unsigned(__n)) - 1;
+ else if constexpr (_S_size <= 64)
+ return (1ull << unsigned(__n)) - 1;
+ else
+ static_assert(false);
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_and_neighbors()
+ {
+ if constexpr (_S_use_bitmask)
+ _M_data &= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+ | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+ else
+ _M_data &= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_or_neighbors()
+ {
+ if constexpr (_S_use_bitmask)
+ _M_data |= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+ | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+ else
+ _M_data |= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+ return *this;
+ }
+
+ template <typename _Mp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Mp::_S_size;
+ constexpr int __rem = _S_size % _Mp::_S_size;
+ constexpr int __stride = _Mp::_S_size;
+ constexpr auto [...__is] = __iota<int[__n]>;
+ if constexpr (_S_is_scalar)
+ {
+ if constexpr (__n == 0)
+ return array<_Mp, 1> {*this};
+ else
+ return tuple<basic_mask> {*this};
+ }
+ else if constexpr (_S_use_bitmask != _Mp::_S_use_bitmask)
+ // convert to whatever _Mp uses first and then recurse into _M_chunk
+ return resize_t<_S_size, _Mp>(*this).template _M_chunk<_Mp>();
+ else if constexpr (_S_use_bitmask and _Mp::_S_use_bitmask)
+ {
+ static_assert(is_unsigned_v<_DataType>);
+ if constexpr (__rem == 0)
+ return array<_Mp, __n> {_Mp::_S_init(_M_data >> (__is * __stride))...};
+ else
+ {
+ using _Rest = resize_t<__rem, _Mp>;
+ return tuple {_Mp::_S_init(_M_data >> (__is * __stride))...,
+ _Rest::_S_init([&] [[__gnu__::__always_inline__]]() {
+ if constexpr (is_same_v<typename _Rest::_DataType, bool>)
+ return operator[](__n * _Mp::_S_size);
+ else
+ return _M_data >> (__n * __stride);
+ }())};
+ }
+ }
+ else if constexpr (__rem == 0)
+ {
+ if constexpr (_Mp::_S_size == 1)
+ return array<_Mp, __n> {_Mp(operator[](__is))...};
+ else
+ {
+ static_assert(is_same_v<__vec_value_type<typename _Mp::_DataType>,
+ __vec_value_type<_DataType>>);
+ return array<_Mp, __n> {
+ _Mp::_S_init(
+ _VecOps<typename _Mp::_DataType>::_S_extract(
+ _M_data, integral_constant<int, __is * __stride>()))...};
+ }
+ }
+ else
+ {
+ using _Rest = resize_t<__rem, _Mp>;
+ return tuple {
+ _Mp::_S_init(
+ _VecOps<typename _Mp::_DataType>::_S_extract(
+ _M_data, integral_constant<int, __is * __stride>()))...,
+ _Rest::_S_init([&] [[__gnu__::__always_inline__]]() {
+ if constexpr (is_same_v<typename _Rest::_DataType, bool>)
+ return operator[](__n * _Mp::_S_size);
+ else
+ return _VecOps<typename _Rest::_DataType>::_S_extract(
+ _M_data, integral_constant<int, __n * __stride>());
+ }())
+ };
+ }
+ }
+
+ // [simd.mask.overview] default constructor -----------------------------
+ basic_mask() = default;
+
+ // [simd.mask.overview] conversion extensions ---------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(_DataType __x) requires(not _S_is_scalar and not _S_use_bitmask)
+ : _M_data(__x)
+ {}
+
+ [[__gnu__::__always_inline__]]
+ constexpr
+ operator _DataType() requires(not _S_is_scalar and not _S_use_bitmask)
+ { return _M_data; }
+
+ // [simd.mask.ctor] broadcast constructor -------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(same_as<bool> auto __x) noexcept
+ : _M_data(__x ? _S_implicit_mask : _DataType())
+ {}
+
+ // [simd.mask.ctor] conversion constructor ------------------------------
+ template <size_t _UBytes, typename _UAbi>
+ requires (_S_size == _UAbi::_S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+ basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ using _UV = basic_mask<_UBytes, _UAbi>;
+ // bool to bool
+ if constexpr (_S_is_scalar)
+ return __x[0];
+
+ // converting from an "array of bool"
+ else if constexpr (_UV::_S_is_scalar)
+ {
+ constexpr auto [...__is] = __iota<int[_S_size]>;
+ return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
+ }
+
+ // vec-/bit-mask to bit-mask | bit-mask to vec-mask
+ else if constexpr (_S_use_bitmask or _UV::_S_use_bitmask)
+ return basic_mask(__x.to_bitset())._M_data;
+
+ // vec-mask to vec-mask
+ // 2-mask-elements wrapper to plain mask
+ else if constexpr (_UAbi::_S_is_cx_ileav)
+ {
+ if constexpr (sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+ and _UV::_S_padding_bytes == 0)
+ {
+ static_assert(not _S_has_bool_member and not _UV::_S_has_bool_member);
+ return __builtin_bit_cast(_DataType, __x);
+ }
+ else if (not __builtin_is_constant_evaluated()
+ and sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes)
+ {
+ _DataType __tmp = {};
+ __builtin_memcpy(&__tmp, &__x, sizeof(__x) - _UV::_S_padding_bytes);
+ return __tmp;
+ }
+ else if constexpr (_UBytes / _Bytes == 16) // ughh
+ {
+ constexpr auto [...__is] = __iota<int[_S_size]>;
+ return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
+ }
+ else if constexpr (_Bytes > 1)
+ {
+ return reinterpret_cast<_DataType>(
+ __vec_mask_cast<__vec_builtin_type_bytes<
+ __integer_from<_Bytes / 2>, sizeof(_M_data)>>(
+ __x._M_data._M_concat_data()));
+ }
+ else if constexpr (_UBytes <= 8)
+ {
+ const auto __xv = __x._M_data._M_concat_data();
+ return __vec_mask_cast<_DataType>(
+ reinterpret_cast<__vec_builtin_type_bytes<
+ __integer_from<_UBytes>, sizeof(__xv)>>(__xv));
+ }
+ }
+ else if constexpr (sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+ and not _S_has_bool_member and not _UV::_S_has_bool_member
+ and not _UV::_S_use_bitmask and _UV::_S_padding_bytes == 0)
+ return __builtin_bit_cast(_DataType, __x);
+ else if (not __builtin_is_constant_evaluated()
+ and sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+ and not _S_has_bool_member and not _UV::_S_has_bool_member
+ and not _UV::_S_use_bitmask and _UV::_S_padding_bytes != 0)
+ {
+ _DataType __tmp = {};
+ __builtin_memcpy(&__tmp, &__x, sizeof(__x) - _UV::_S_padding_bytes);
+ return __tmp;
+ }
+ else
+ {
+#if _GLIBCXX_X86
+ // TODO: turn this into a __vec_mask_cast overload in simd_x86.h
+ if constexpr (_Bytes == 1 and _UBytes == 2)
+ if (not __builtin_is_constant_evaluated() and not __x._M_is_constprop())
+ {
+ if constexpr (_UAbi::_S_nreg == 1)
+ return __x86_cvt_vecmask<_DataType>(__x._M_data);
+ else if constexpr (_UAbi::_S_nreg == 2)
+ {
+ auto __lo = __x._M_data0._M_data;
+ auto __hi = __vec_zero_pad_to<sizeof(__lo)>(
+ __x._M_data1._M_concat_data());
+ return __x86_cvt_vecmask<_DataType>(__lo, __hi);
+ }
+ }
+#endif
+ return __vec_mask_cast<_DataType>(__x._M_concat_data());
+ }
+ }())
+ {}
+
+ // [simd.mask.ctor] generator constructor -------------------------------
+ template <__simd_generator_invokable<bool, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Fp&& __gen)
+ : _M_data([&] [[__gnu__::__always_inline__]] {
+ constexpr auto [...__is] = __iota<int[_S_size]>;
+ if constexpr (_S_is_scalar)
+ return __gen(__simd_size_constant<0>);
+ else if constexpr (_S_use_bitmask)
+ return _DataType(((_DataType(__gen(__simd_size_constant<__is>)) << __is)
+ | ...));
+ else
+ return _DataType{__vec_value_type<_DataType>(
+ __gen(__simd_size_constant<__is>) ? -1 : 0)...};
+ }())
+ {}
+
+ template <__almost_simd_generator_invokable<bool, _S_size> _Fp>
+ constexpr explicit
+ basic_mask(_Fp&&)
+ = _GLIBCXX_DELETE_MSG("Invalid return type of the mask generator function: "
+ "Needs to be 'bool'.");
+
+ // [simd.mask.ctor] bitset constructor ----------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(const same_as<bitset<size()>> auto& __b) noexcept
+ : basic_mask(static_cast<_Bitmask<_S_size>>(__b.to_ullong()))
+ {
+ static_assert(_S_size <= 64); // more than 64 elements in one register? not yet.
+ }
+
+ // [simd.mask.ctor] uint constructor ------------------------------------
+ template <unsigned_integral _Tp>
+ requires (not same_as<_Tp, bool>)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Tp __val) noexcept
+ : _M_data([&] [[__gnu__::__always_inline__]] () {
+ if constexpr (_S_use_bitmask)
+ return __val;
+ else if constexpr (_S_is_scalar)
+ return bool(__val & 1);
+ else if (__builtin_is_constant_evaluated() or __builtin_constant_p(__val))
+ {
+ constexpr auto [...__is] = __iota<int[_S_size]>;
+ return _DataType {__vec_value_type<_DataType>((__val & (1ull << __is)) == 0
+ ? 0 : -1)...};
+ }
+ else
+ {
+ using _Ip = typename _VecType::value_type;
+ _VecType __v0 = _Ip(__val);
+ constexpr int __bits_per_element = sizeof(_Ip) * __CHAR_BIT__;
+ constexpr _VecType __pow2 = _VecType(1) << (__iota<_VecType> % __bits_per_element);
+ if constexpr (_S_size < __bits_per_element)
+ return ((__v0 & __pow2) > 0)._M_concat_data();
+ else if constexpr (_S_size == __bits_per_element)
+ return ((__v0 & __pow2) != 0)._M_concat_data();
+ else
+ {
+ static_assert(_Bytes == 1);
+ static_assert(sizeof(_Ip) == 1);
+ _Bitmask<_S_size> __bits = __val;
+ static_assert(sizeof(_VecType) % sizeof(__bits) == 0);
+ if constexpr (sizeof(_DataType) == 32)
+ {
+ __vec_builtin_type<_UInt<8>, 4> __v1 = {
+ 0xffu & (__bits >> (0 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (1 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (2 * __CHAR_BIT__)),
+ 0xffu & (__bits >> (3 * __CHAR_BIT__)),
+ };
+ __v1 *= 0x0101'0101'0101'0101ull;
+ __v0 = __builtin_bit_cast(_VecType, __v1);
+ return ((__v0 & __pow2) != 0)._M_data;
+ }
+ else
+ {
+ using _V1 = vec<_Ip, sizeof(__bits)>;
+ _V1 __v1 = __builtin_bit_cast(_V1, __bits);
+ __v0 = _VecType::_S_static_permute(__v1, [](int __i) {
+ return __i / __CHAR_BIT__;
+ });
+ return ((__v0 & __pow2) != 0)._M_data;
+ }
+ }
+ }
+ }())
+ {}
+
+      // Effects: Initializes the first M elements to the corresponding bit values in __val,
+      // where M is the smaller of size() and the number of bits in the value representation
+      // ([basic.types.general]) of the type of __val. If M is less than size(), the remaining
+      // elements are initialized to zero. E.g. for an 8-element mask, the value 0b0101u yields
+      // {true, false, true, false, false, false, false, false}.
+
+ // [simd.mask.subscr] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ return bool((_M_data >> __i) & 1);
+ else
+ return _M_data[__i] & 1;
+ }
+
+ // [simd.mask.unary] ----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask
+ operator!() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _S_init(!_M_data);
+ else
+ return _S_init(~_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator+() const noexcept requires destructible<_VecType>
+ { return operator _VecType(); }
+
+ constexpr _VecType
+ operator+() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator-() const noexcept requires destructible<_VecType>
+ {
+ using _Ip = typename _VecType::value_type;
+ if constexpr (_S_is_scalar)
+ return -_Ip(_M_data);
+ else if constexpr (_S_use_bitmask)
+ return __select_impl(*this, _Ip(-1), _Ip());
+ else
+ {
+ static_assert(sizeof(_VecType) == sizeof(_M_data));
+ return __builtin_bit_cast(_VecType, _M_data);
+ }
+ }
+
+ constexpr _VecType
+ operator-() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator~() const noexcept requires destructible<_VecType>
+ {
+ using _Ip = typename _VecType::value_type;
+ if constexpr (_S_is_scalar)
+ return ~_Ip(_M_data);
+ else if constexpr (_S_use_bitmask)
+ return __select_impl(*this, _Ip(-2), _Ip(-1));
+ else
+ {
+ static_assert(sizeof(_VecType) == sizeof(_M_data));
+ return __builtin_bit_cast(_VecType, _M_data) - _Ip(1);
+ }
+ }
+
+ constexpr _VecType
+ operator~() const noexcept = delete;
+
+ // [simd.mask.conv] -----------------------------------------------------
+ template <typename _Up, typename _UAbi>
+ requires (__simd_size_v<_Up, _UAbi> == _S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(sizeof(_Up) != _Bytes)
+ operator basic_vec<_Up, _UAbi>() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _Up(_M_data);
+ else
+ return __select_impl(*this, _Up(1), _Up(0));
+ }
+
+ // [simd.mask.namedconv] ------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bitset<_S_size>
+ to_bitset() const noexcept
+ {
+ static_assert(_S_size <= 64); // more than 64 elements in one register? not yet.
+ return to_ullong();
+ }
+
+ /** \internal
+ * Return the mask as the smallest possible unsigned integer (up to 64 bits).
+ *
+ * \tparam _Offset Adjust the return type & value to start at bit \p _Offset.
+ * \tparam _Use_2_for_1 Store the value of every second element into one bit of the result.
+ * (precondition: each even/odd pair stores the same value)
+ */
+ template <int _Offset = 0, bool _Use_2_for_1 = false, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr _Bitmask<_S_size / (_Use_2_for_1 + 1) + _Offset>
+ _M_to_uint() const
+ {
+ constexpr int __nbits = _S_size / (_Use_2_for_1 + 1);
+ static_assert(__nbits + _Offset <= 64);
+ static_assert(not (_S_is_scalar and _Use_2_for_1));
+ // before shifting
+ using _U0 = _Bitmask<__nbits>;
+ // potentially wider type needed for shift by _Offset
+ using _Ur = _Bitmask<__nbits + _Offset>;
+ if constexpr (_S_is_scalar or _S_use_bitmask)
+ {
+ auto __bits = _M_data;
+ if constexpr (_S_is_partial)
+ __bits &= _S_implicit_mask;
+ if constexpr (_Use_2_for_1)
+ __bits = __bit_extract_even<__nbits>(__bits);
+ return _Ur(__bits) << _Offset;
+ }
+ else if constexpr (_Bytes == 8 and _Use_2_for_1)
+ {
+ const auto __u32 = __vec_bit_cast<unsigned>(_M_data);
+ if constexpr (sizeof(_M_data) == 16)
+ {
+ if constexpr (_Offset < 32)
+ return __u32[0] & (1u << _Offset);
+ else
+ return _M_data[0] & (1ull << _Offset);
+ }
+ else if constexpr (sizeof(_M_data) == 32)
+ {
+ if constexpr (_Offset < 31)
+ return (__u32[4] & (2u << _Offset)) | (__u32[0] & (1u << _Offset));
+ else
+ return (_M_data[2] & (2ull << _Offset)) | (_M_data[0] & (1ull << _Offset));
+ }
+ else
+ static_assert(false);
+ }
+ else if constexpr (_Use_2_for_1 and __nbits == 1)
+ return _Ur(operator[](0)) << _Offset;
+ else
+ {
+#if _GLIBCXX_X86
+ if (not __builtin_is_constant_evaluated() and not _M_is_constprop())
+ {
+ _U0 __uint;
+ if constexpr (_Use_2_for_1)
+ {
+ static_assert(_Bytes * 2 != 2); // because of missing movmskw
+ __uint = __x86_movmsk(__vec_bit_cast<__integer_from<_Bytes * 2>>(_M_data));
+ }
+ else if constexpr (_Bytes != 2) // movmskb would duplicate each bit
+ __uint = __x86_movmsk(_M_data);
+ else if constexpr (_Bytes == 2 and _Traits._M_have_bmi2())
+ __uint = __bit_extract_even<__nbits>(__x86_movmsk(_M_data));
+ else if constexpr (_Bytes == 2)
+ return __similar_mask<char, __nbits, _Ap>(*this).template _M_to_uint<_Offset>();
+ else
+ static_assert(false);
+ return _Ur(__uint) << _Offset;
+ // TODO: with AVX512 use __builtin_ia32_cvt[bwdq]2mask(128|256|512)
+ // TODO: Ask for compiler builtin to do the best of the above. This should also
+ // combine with a preceding vector-mask compare to produce a bit-mask compare (on
+ // AVX512)
+ }
+#endif
+ using _IV = conditional_t<_Use_2_for_1,
+ __similar_vec<__integer_from<_Bytes * 2>, __nbits, _Ap>,
+ _VecType>;
+ static_assert(destructible<_IV>);
+ const typename _IV::mask_type& __k = [&] [[__gnu__::__always_inline__]] () {
+ if constexpr (_Use_2_for_1)
+ return typename _IV::mask_type(__to_cx_ileav(*this));
+ else if constexpr (is_same_v<typename _IV::mask_type, basic_mask>)
+ return *this;
+ else
+ return typename _IV::mask_type(*this);
+ }();
+ constexpr int __n = _IV::size();
+ if constexpr (_Bytes * __CHAR_BIT__ >= __n) // '1 << __iota' cannot overflow
+ {
+ constexpr _IV __pow2 = _IV(1) << __iota<_IV>;
+ return _Ur(_U0(__select_impl(__k, __pow2, _IV())
+ ._M_reduce(bit_or<>()))) << _Offset;
+ }
+ else if constexpr (__n % 8 != 0)
+ {
+ constexpr int __n_lo = __n - __n % 8;
+ const auto [__lo, __hi] = chunk<__n_lo>(__k);
+ _Ur __bits = __hi.template _M_to_uint<_Offset + __n_lo, _Use_2_for_1>();
+ return __bits | __lo.template _M_to_uint<_Offset, _Use_2_for_1>();
+ }
+ else
+ {
+ constexpr _IV __pow2 = _IV(1) << __iota<_IV> % _IV(8);
+ _IV __x = __select_impl(__k, __pow2, _IV());
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<4>());
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<2>());
+ __x |= _IV::_S_static_permute(__x, _SwapNeighbors<1>());
+ __x = _IV::_S_static_permute(__x, [](int __i) {
+ return __i * 8 < __n ? __i * 8 : uninit_element;
+ });
+ _U0 __bits = __builtin_bit_cast(
+ __similar_vec<_U0, __n * _Bytes / sizeof(_U0), _Ap>, __x)[0];
+ if constexpr (not __has_single_bit(unsigned(__nbits)))
+ __bits &= (_U0(1) << __nbits) - 1;
+ return _Ur(__bits) << _Offset;
+ }
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr unsigned long long
+ to_ullong() const
+ { return _M_to_uint(); }
+
+ // [simd.mask.binary] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data & __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data | __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data & __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data | __y._M_data); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data ^ __y._M_data); }
+
+ // [simd.mask.cassign] --------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data &= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data |= __y._M_data;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data ^= __y._M_data;
+ return __x;
+ }
+
+ // [simd.mask.comparison] -----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !(__x ^ __y); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x ^ __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x || !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x || __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x && !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x && __y; }
+
+ // [simd.mask.cond] -----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+ {
+ if constexpr (not _S_use_bitmask)
+ {
+#if _GLIBCXX_X86
+            // This works around bad code-gen when the compiler can't see that __k is a
+            // vector-mask: the pattern is recognized to match the x86 blend instructions,
+            // which only consider the sign bit of the mask register. Also, without SSE4, if
+            // the compiler knows that __k is a vector-mask, the '< 0' is elided.
+ return __k._M_data < 0 ? __t._M_data : __f._M_data;
+#endif
+ return __k._M_data ? __t._M_data : __f._M_data;
+ }
+ else
+ return (__k._M_data & __t._M_data) | (~__k._M_data & __f._M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+ {
+ if (__t == __f)
+ return basic_mask(__t);
+ else
+ return __t ? __k : !__k;
+ }
+
+ template <__vectorizable _T0, same_as<_T0> _T1>
+ requires (sizeof(_T0) == _Bytes)
+ [[__gnu__::__always_inline__]]
+ friend constexpr vec<_T0, _S_size>
+ __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return __k._M_data ? __t : __f;
+ else
+ {
+ using _Vp = vec<_T0, _S_size>;
+ using _Mp = typename _Vp::mask_type;
+ return __select_impl(_Mp(__k), _Vp(__t), _Vp(__f));
+ }
+ }
+
+ // [simd.mask.reductions] implementation --------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_all_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) == _S_implicit_mask;
+ else
+ return _M_data == _S_implicit_mask;
+ }
+#if _GLIBCXX_X86
+ else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+ return __x86_vecmask_all<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_all_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_any_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return _M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) != 0;
+ else
+ return _M_data != 0;
+ }
+#if _GLIBCXX_X86
+ else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+ return __x86_vecmask_any<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_any_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_none_of() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return not _M_data;
+ else if constexpr (_S_use_bitmask)
+ {
+ if constexpr (_S_is_partial)
+ // PR120925 (partial kortest pattern not recognized)
+ return (_M_data & _S_implicit_mask) == 0;
+ else
+ return _M_data == 0;
+ }
+#if _GLIBCXX_X86
+ else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+ return __x86_vecmask_none<_S_size>(_M_data);
+#endif
+ else
+ return _VecOps<_DataType, _S_size>::_S_none_of(_M_data);
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_count() const noexcept
+ {
+ if constexpr (_S_is_scalar)
+ return int(_M_data);
+ else if constexpr (_S_size <= sizeof(int) * __CHAR_BIT__)
+ return __builtin_popcount(_M_to_uint());
+ else
+ return __builtin_popcountll(to_ullong());
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_min_index() const
+ {
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __lowest_bit(_M_to_uint());
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ _M_reduce_max_index() const
+ {
+ if constexpr (_S_size == 1)
+ return 0;
+ else
+ return __highest_bit(_M_to_uint());
+ }
+
+ [[__gnu__::__always_inline__]]
+ bool
+ _M_is_constprop() const
+ { return __builtin_constant_p(_M_data); }
+ };
+
+ template <size_t _Bytes, __abi_tag _Ap>
+ requires (_Ap::_S_nreg > 1) and (not _Ap::_S_is_cx_ileav)
+ class basic_mask<_Bytes, _Ap>
+ {
+ template <size_t, typename>
+ friend class basic_mask;
+
+ template <typename, typename>
+ friend class basic_vec;
+
+ static constexpr int _S_size = _Ap::_S_size;
+
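+    // Recursive composition: the lower _N0 elements (a power of two) are stored in a
+    // sub-mask using _Nreg0 registers, the remaining _N1 elements in a sub-mask using
+    // _Nreg1 registers.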
+ static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
+
+ static constexpr int _N1 = _S_size - _N0;
+
+ static constexpr int _Nreg0 = __bit_ceil(unsigned(_Ap::_S_nreg)) / 2;
+
+ static constexpr int _Nreg1 = _Ap::_S_nreg - _Nreg0;
+
+ using _Abi0 = conditional_t<_N0 == _Nreg0 or is_same_v<_Ap, _ScalarAbi<_S_size>>,
+ _ScalarAbi<_N0>, _Abi<_N0, _Nreg0, _Ap::_S_variant>>;
+
+ using _Abi1 = conditional_t<_N1 == _Nreg1 or is_same_v<_Ap, _ScalarAbi<_S_size>>,
+ _ScalarAbi<_N1>, _Abi<_N1, _Nreg1, _Ap::_S_variant>>;
+
+ using _Mask0 = basic_mask<_Bytes, _Abi0>;
+
+ using _Mask1 = basic_mask<_Bytes, _Abi1>;
+
+ // _Ap::_S_nreg determines how deep the recursion goes. E.g. basic_mask<4, _Abi<8, 4>> cannot
+ // use basic_mask<4, _Abi<4, 1>> as _Mask0/1 types.
+ static_assert(_Mask0::abi_type::_S_nreg + _Mask1::abi_type::_S_nreg == _Ap::_S_nreg);
+
+ static constexpr bool _S_use_bitmask = _Mask0::_S_use_bitmask;
+
+ static constexpr bool _S_is_scalar = _Mask0::_S_is_scalar;
+
+ _Mask0 _M_data0;
+
+ _Mask1 _M_data1;
+
+ using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
+
+ static constexpr bool _S_has_bool_member = _Mask1::_S_has_bool_member;
+
+ static constexpr size_t _S_padding_bytes
+ = __alignof__(_Mask0) - sizeof(_Mask1) + _Mask1::_S_padding_bytes;
+
+ public:
+ using value_type = bool;
+
+ using abi_type = _Ap;
+
+ using iterator = __iterator<basic_mask>;
+
+ using const_iterator = __iterator<const basic_mask>;
+
+ constexpr iterator
+ begin() noexcept
+ { return {*this, 0}; }
+
+ constexpr const_iterator
+ begin() const noexcept
+ { return {*this, 0}; }
+
+ constexpr const_iterator
+ cbegin() const noexcept
+ { return {*this, 0}; }
+
+ constexpr default_sentinel_t
+ end() const noexcept
+ { return {}; }
+
+ constexpr default_sentinel_t
+ cend() const noexcept
+ { return {}; }
+
+ static constexpr auto size = __simd_size_constant<_S_size>;
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(const _Mask0& __x, const _Mask1& __y)
+ {
+ basic_mask __r;
+ __r._M_data0 = __x;
+ __r._M_data1 = __y;
+ return __r;
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(unsigned_integral auto __bits)
+ { return basic_mask(__bits); }
+
+ template <typename _U0, typename _U1>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_init(const __trivial_pair<_U0, _U1>& __bits)
+ {
+ if constexpr (is_unsigned_v<_U0>)
+ {
+ static_assert(is_unsigned_v<_U1>);
+ return _S_init(_Mask0(__bits._M_first), _Mask1(__bits._M_second));
+ }
+ else if constexpr (is_unsigned_v<_U1>)
+ return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1(__bits._M_second));
+ else
+ return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1::_S_init(__bits._M_second));
+ }
+
+ template <size_t _UBytes, typename _UAbi>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+ {
+ using _Mp = basic_mask<_UBytes, _UAbi>;
+ if constexpr (_Mp::_S_has_bool_member or sizeof(basic_mask) > sizeof(__x))
+ return _S_init(__builtin_bit_cast(_Mask0, __x._M_data0),
+ _Mask1::_S_recursive_bit_cast(__x._M_data1));
+ else if constexpr (sizeof(basic_mask) == sizeof(__x))
+ return __builtin_bit_cast(basic_mask, __x);
+ else
+ { // e.g. on IvyBridge (different alignment => different sizeof)
+ struct _Tmp { alignas(_Mp) basic_mask _M_data; };
+ return __builtin_bit_cast(_Tmp, __x)._M_data;
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_concat_data() const
+ {
+ if constexpr (_S_use_bitmask)
+ {
+ static_assert(_S_size <= sizeof(0ull) * __CHAR_BIT__, "cannot concat more than 64 bits");
+ using _Up = _Bitmask<_S_size>;
+ return _Up(_M_data0._M_concat_data() | (_Up(_M_data1._M_concat_data()) << _N0));
+ }
+ else
+ {
+ auto __lo = _M_data0._M_concat_data();
+ auto __hi = __vec_zero_pad_to<sizeof(__lo)>(_M_data1._M_concat_data());
+ return __vec_concat(__lo, __hi);
+ }
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr basic_mask
+ _S_partial_mask_of_n(int __n)
+ {
+#if __has_builtin(__builtin_ia32_bzhi_di)
+ if constexpr (_S_use_bitmask and _S_size <= 64 and _Traits._M_have_bmi2())
+ return __builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n));
+#endif
+ if (__n < _N0)
+ return _S_init(_Mask0::_S_partial_mask_of_n(__n), _Mask1(false));
+ else if (__n == _N0)
+ return _S_init(_Mask0(true), _Mask1(false));
+ else
+ return _S_init(_Mask0(true), _Mask1::_S_partial_mask_of_n(__n - _N0));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_and_neighbors()
+ {
+ _M_data0._M_and_neighbors();
+ _M_data1._M_and_neighbors();
+ return *this;
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask&
+ _M_or_neighbors()
+ {
+ _M_data0._M_or_neighbors();
+ _M_data1._M_or_neighbors();
+ return *this;
+ }
+
+ template <typename _Mp>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_chunk() const noexcept
+ {
+ constexpr int __n = _S_size / _Mp::_S_size;
+ constexpr int __rem = _S_size % _Mp::_S_size;
+ [[maybe_unused]] constexpr auto [...__is] = __iota<int[__n]>;
+ if constexpr (_N0 == _Mp::_S_size)
+ {
+ if constexpr (__rem == 0 and is_same_v<_Mp, _Mask0>)
+ return array<_Mp, __n> {_M_data0, _M_data1};
+ else if constexpr (__rem == 0)
+ return array<_Mp, __n> {_Mp(_M_data0), _Mp(_M_data1)};
+ else
+ return tuple<_Mp, resize_t<__rem, _Mp>> {_M_data0, _M_data1};
+ }
+ else if constexpr (__rem == 0)
+ {
+ using _Rp = array<_Mp, __n>;
+ if constexpr (sizeof(_Rp) == sizeof(*this))
+ {
+ static_assert(not _Mp::_S_is_partial);
+ return __builtin_bit_cast(_Rp, *this);
+ }
+ else
+ {
+ return _Rp {_Mp([&](int __i) { return (*this)[__i + __is * _Mp::_S_size]; })...};
+ }
+ }
+ else
+ {
+ using _Rest = resize_t<__rem, _Mp>;
+ // can't bit-cast because the member order of tuple is reversed
+ return tuple {
+ _Mp ([&](int __i) { return (*this)[__i + __is * _Mp::_S_size]; })...,
+ _Rest([&](int __i) { return (*this)[__i + __n * _Mp::_S_size]; })
+ };
+ }
+ }
+
+ // [simd.mask.overview] default constructor -----------------------------
+ basic_mask() = default;
+
+ // [simd.mask.overview] conversion extensions ---------------------------
+ // TODO: any?
+
+ // [simd.mask.ctor] broadcast constructor -------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(same_as<bool> auto __x) noexcept
+ : _M_data0(__x), _M_data1(__x)
+ {}
+
+ // [simd.mask.ctor] conversion constructor ------------------------------
+ template <size_t _UBytes, typename _UAbi>
+ requires (_S_size == _UAbi::_S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+ basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+ : _M_data0([&] {
+ if constexpr (_UAbi::_S_nreg > 1)
+ {
+ if constexpr (_UAbi::_S_is_cx_ileav)
+ return __to_cx_ileav(__x._M_data._M_data0);
+ else
+ return __x._M_data0;
+ }
+ else
+ return get<0>(chunk<_N0>(__x));
+ }()),
+ _M_data1([&] {
+ if constexpr (_UAbi::_S_nreg > 1)
+ {
+ if constexpr (_UAbi::_S_is_cx_ileav)
+ return __to_cx_ileav(__x._M_data._M_data1);
+ else
+ return __x._M_data1;
+ }
+ else
+ return get<1>(chunk<_N0>(__x));
+ }())
+ {}
+
+ // [simd.mask.ctor] generator constructor -------------------------------
+ template <__simd_generator_invokable<bool, _S_size> _Fp>
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Fp&& __gen)
+ : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
+ return __gen(__simd_size_constant<__i + _N0>);
+ })
+ {}
+
+ template <__almost_simd_generator_invokable<bool, _S_size> _Fp>
+ constexpr explicit
+ basic_mask(_Fp&&)
+ = _GLIBCXX_DELETE_MSG("Invalid return type of the mask generator function: "
+ "Needs to be 'bool'.");
+
+ // [simd.mask.ctor] bitset constructor ----------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr
+ basic_mask(const same_as<bitset<size()>> auto& __b) noexcept
+ : _M_data0(__bitset_split<_N0>(__b)._M_lo), _M_data1(__bitset_split<_N0>(__b)._M_hi)
+ {}
+
+ // [simd.mask.ctor] uint constructor ------------------------------------------
+ template <unsigned_integral _Tp>
+ requires (not same_as<_Tp, bool>)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit
+ basic_mask(_Tp __val) noexcept
+ : _M_data0(static_cast<_Bitmask<_N0>>(__val)),
+ _M_data1(sizeof(_Tp) * __CHAR_BIT__ > _N0
+ ? static_cast<_Bitmask<_N1>>(__val >> _N0) : _Bitmask<_N1>())
+ {}
+
+ // [simd.mask.subscr] ---------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr value_type
+ operator[](__simd_size_type __i) const
+ {
+ // in some cases the last element can be 'bool' instead of bit-/vector-mask;
+ // e.g. mask<short, 17> is {mask<short, 16>, mask<short, 1>}, where the latter uses
+ // _ScalarAbi<1>, which is stored as 'bool'
+ if constexpr (_M_data1._S_has_bool_member)
+ {
+ if (__i < _N0)
+ return _M_data0[__i];
+ else
+ return _M_data1[__i - _N0];
+ }
+ else if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_CxIleav))
+ {
+ // values are duplicated
+ if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_BitMask))
+ {
+ struct _Tmp
+ {
+ alignas(basic_mask) unsigned char _M_bytes[__div_ceil(2 * _S_size, __CHAR_BIT__)];
+ };
+ return bool((__builtin_bit_cast(_Tmp, *this)
+ ._M_bytes[2 * __i / __CHAR_BIT__] >> (2 * __i % __CHAR_BIT__)) & 1);
+ }
+ else
+ {
+ struct _Tmp
+ {
+ alignas(basic_mask) __integer_from<_Bytes / 2> _M_values[2 * _S_size];
+ };
+ return __builtin_bit_cast(_Tmp, *this)._M_values[2 * __i] != 0;
+ }
+ }
+ else if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_BitMask))
+ {
+ struct _Tmp
+ {
+ alignas(basic_mask) unsigned char _M_bytes[__div_ceil(_S_size, __CHAR_BIT__)];
+ };
+ return bool((__builtin_bit_cast(_Tmp, *this)
+ ._M_bytes[__i / __CHAR_BIT__] >> (__i % __CHAR_BIT__)) & 1);
+ }
+ else
+ {
+ struct _Tmp
+ {
+ alignas(basic_mask) __integer_from<_Bytes> _M_values[_S_size];
+ };
+ return __builtin_bit_cast(_Tmp, *this)._M_values[__i] != 0;
+ }
+ }
+
+ // [simd.mask.unary] ----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr basic_mask
+ operator!() const noexcept
+ { return _S_init(!_M_data0, !_M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator+() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(+_M_data0, +_M_data1); }
+
+ constexpr _VecType
+ operator+() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator-() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(-_M_data0, -_M_data1); }
+
+ constexpr _VecType
+ operator-() const noexcept = delete;
+
+ [[__gnu__::__always_inline__]]
+ constexpr _VecType
+ operator~() const noexcept requires destructible<_VecType>
+ { return _VecType::_S_concat(~_M_data0, ~_M_data1); }
+
+ constexpr _VecType
+ operator~() const noexcept = delete;
+
+ // [simd.mask.conv] -----------------------------------------------------
+ template <typename _Up, typename _UAbi>
+ requires (__simd_size_v<_Up, _UAbi> == _S_size)
+ [[__gnu__::__always_inline__]]
+ constexpr explicit(sizeof(_Up) != _Bytes)
+ operator basic_vec<_Up, _UAbi>() const noexcept
+ {
+ using _Rp = basic_vec<_Up, _UAbi>;
+ return _Rp::_S_init(_M_data0, _M_data1);
+ }
+
+ // [simd.mask.namedconv] ------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ constexpr bitset<_S_size>
+ to_bitset() const noexcept
+ {
+ if constexpr (_S_size <= 64)
+ return to_ullong();
+ else
+ {
+ static_assert(_N0 % 64 == 0);
+ struct _Tmp
+ {
+ bitset<_N0> _M_lo;
+ bitset<_N1> _M_hi;
+ } __tmp = {_M_data0.to_bitset(), _M_data1.to_bitset()};
+ return __builtin_bit_cast(bitset<_S_size>, __tmp);
+ }
+ }
+
+ template <int _Offset = 0, bool _Use_2_for_1 = false, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ _M_to_uint() const
+ {
+ constexpr int _N0x = _Use_2_for_1 ? _N0 / 2 : _N0;
+ if constexpr (_N0x >= 64)
+ {
+ static_assert(_Offset == 0);
+ return __trivial_pair {
+ _M_data0.template _M_to_uint<0, _Use_2_for_1>(),
+ _M_data1.template _M_to_uint<0, _Use_2_for_1>()
+ };
+ }
+ else
+ {
+#if _GLIBCXX_X86
+ if constexpr (_Bytes == 2 and not _Traits._M_have_bmi2() and _Ap::_S_nreg == 2
+ and not _S_use_bitmask and not _Use_2_for_1)
+ return __similar_mask<char, _S_size, _Ap>(*this).template _M_to_uint<_Offset>();
+#endif
+ auto __uint = _M_data1.template _M_to_uint<_N0x + _Offset, _Use_2_for_1>();
+ __uint |= _M_data0.template _M_to_uint<_Offset, _Use_2_for_1>();
+ return __uint;
+ }
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr unsigned long long
+ to_ullong() const
+ {
+ if constexpr (_S_size <= 64)
+ return _M_to_uint();
+ else
+ {
+ __glibcxx_simd_precondition(_M_data1.to_ullong() == 0,
+                                        "to_ullong called on mask with 'true' elements at "
+                                        "indices 64 and higher");
+ return _M_data0.to_ullong();
+ }
+ }
+
+ // [simd.mask.binary]
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 && __y._M_data0, __x._M_data1 && __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 || __y._M_data0, __x._M_data1 || __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 & __y._M_data0, __x._M_data1 & __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 | __y._M_data0, __x._M_data1 | __y._M_data1); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return _S_init(__x._M_data0 ^ __y._M_data0, __x._M_data1 ^ __y._M_data1); }
+
+ // [simd.mask.cassign]
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 &= __y._M_data0;
+ __x._M_data1 &= __y._M_data1;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 |= __y._M_data0;
+ __x._M_data1 |= __y._M_data1;
+ return __x;
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask&
+ operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+ {
+ __x._M_data0 ^= __y._M_data0;
+ __x._M_data1 ^= __y._M_data1;
+ return __x;
+ }
+
+ // [simd.mask.comparison] -----------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !(__x ^ __y); }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x ^ __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x || !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x || __y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return __x && !__y; }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+ { return !__x && __y; }
+
+ // [simd.mask.cond] -----------------------------------------------------
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+ {
+ return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
+ __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
+ }
+
+ [[__gnu__::__always_inline__]]
+ friend constexpr basic_mask
+ __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+ {
+ if (__t == __f)
+ return basic_mask(__t);
+ else
+ return __t ? __k : !__k;
+ }
+
+ template <__vectorizable _T0, same_as<_T0> _T1>
+ requires (sizeof(_T0) == _Bytes)
+ [[__gnu__::__always_inline__]]
+ friend constexpr vec<_T0, _S_size>
+ __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+ {
+ using _Vp = vec<_T0, _S_size>;
+ if constexpr (__complex_like<_T0>)
+ return _Vp::_S_concat(__select_impl(__k._M_data0, __t, __f),
+ __select_impl(__k._M_data1, __t, __f));
+ else
+ return _Vp::_S_init(__select_impl(__k._M_data0, __t, __f),
+ __select_impl(__k._M_data1, __t, __f));
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_all_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 and _M_data1)._M_all_of();
+ else
+ return _M_data0._M_all_of() and _M_data1._M_all_of();
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_any_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 or _M_data1)._M_any_of();
+ else
+ return _M_data0._M_any_of() or _M_data1._M_any_of();
+ }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ _M_none_of() const
+ {
+ if constexpr (_N0 == _N1)
+ return (_M_data0 or _M_data1)._M_none_of();
+ else
+ return _M_data0._M_none_of() and _M_data1._M_none_of();
+ }
+
+ [[__gnu__::__always_inline__]]
+ bool
+ _M_is_constprop() const
+ { return _M_data0._M_is_constprop() and _M_data1._M_is_constprop(); }
+ };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_MASK_H