This implements basic_mask for everything but complex, i.e. the complex-interleaved ABI variants are not covered yet.

libstdc++-v3/ChangeLog:

        * include/bits/simd_mask.h: New file.

Signed-off-by: Matthias Kretz <[email protected]>
---
 libstdc++-v3/include/bits/simd_mask.h | 1732 +++++++++++++++++++++++++
 1 file changed, 1732 insertions(+)
 create mode 100644 libstdc++-v3/include/bits/simd_mask.h


--
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Center for Heavy Ion Research               https://gsi.de
 std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_mask.h b/libstdc++-v3/include/bits/simd_mask.h
new file mode 100644
index 00000000000..7bb8598ca4c
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_mask.h
@@ -0,0 +1,1732 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025      GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ *                       Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_MASK_H
+#define _GLIBCXX_SIMD_MASK_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_iterator.h"
+#include "vec_ops.h"
+#if _GLIBCXX_X86
+#include "simd_x86.h"
+#endif
+
+#include <bit>
+#include <bitset>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
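+  // Index-permutation function object that swaps adjacent blocks of _Np elements: output element
+  // __i is taken from input element __i ^ _Np for power-of-two _Np,
+  // e.g. _SwapNeighbors<1> turns {a, b, c, d} into {b, a, d, c}.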
+  template <unsigned _Np>
+    struct _SwapNeighbors
+    {
+      consteval unsigned
+      operator()(unsigned __i, unsigned __size) const
+      {
+        if (__size % (2 * _Np) != 0)
+          __builtin_abort(); // swap_neighbors<N> permutation requires a multiple of 2N elements
+        else if (std::has_single_bit(_Np))
+          return __i ^ _Np;
+        else if (__i % (2 * _Np) >= _Np)
+          return __i - _Np;
+        else
+          return __i + _Np;
+      }
+    };
+
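+  // Split a bitset<_Mp> into its low _Np bits and the remaining _Mp - _Np high bits,
+  // returned as a struct with _M_lo and _M_hi members.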
+  template <size_t _Np, size_t _Mp>
+    constexpr auto
+    __bitset_split(const bitset<_Mp>& __b)
+    {
+      constexpr auto __bits_per_word = __CHAR_BIT__ * __SIZEOF_LONG__;
+      if constexpr (_Np % __bits_per_word == 0)
+        {
+          struct _Tmp
+          {
+            bitset<_Np> _M_lo;
+            bitset<_Mp - _Np> _M_hi;
+          };
+          return __builtin_bit_cast(_Tmp, __b);
+        }
+      else
+        {
+          constexpr auto __bits_per_ullong = __CHAR_BIT__ * __SIZEOF_LONG_LONG__;
+          static_assert(_Mp <= __bits_per_ullong);
+          using _Lo = _Bitmask<_Np>;
+          using _Hi = _Bitmask<_Mp - _Np>;
+          struct _Tmp
+          {
+            _Lo _M_lo;
+            _Hi _M_hi;
+          };
+          return _Tmp {static_cast<_Lo>(__b.to_ullong()), static_cast<_Hi>(__b.to_ullong() >> _Np)};
+        }
+    }
+
+  // [simd.traits]
+  // --- rebind ---
+  template <typename _Tp, typename _Vp, _ArchTraits _Traits = {}>
+    struct rebind
+    {};
+
+  template <__vectorizable _Tp, __simd_vec_type _Vp, _ArchTraits _Traits>
+    //requires requires { typename __deduce_abi_t<_Tp, _Vp::size()>; }
+    struct rebind<_Tp, _Vp, _Traits>
+    { using type = __similar_vec<_Tp, _Vp::size(), typename _Vp::abi_type>; };
+
+  template <__vectorizable _Tp, __simd_mask_type _Mp, _ArchTraits _Traits>
+    //requires requires { typename __deduce_abi_t<_Tp, _Mp::size()>; }
+    struct rebind<_Tp, _Mp, _Traits>
+    { using type = __similar_mask<_Tp, _Mp::size(), typename _Mp::abi_type>; };
+
+  template <typename _Tp, typename _Vp>
+    using rebind_t = typename rebind<_Tp, _Vp>::type;
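+  // e.g. rebind_t<float, vec<int, 4>> names a 4-element float vec with a matching ABI.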
+
+  // --- resize ---
+  template <__simd_size_type _Np, typename _Vp, _ArchTraits _Traits = {}>
+    struct resize
+    {};
+
+  template <__simd_size_type _Np, __data_parallel_type _Vp, _ArchTraits _Traits>
+    requires requires { typename _Vp::mask_type; }
+    //requires requires { typename __deduce_abi_t<typename _Vp::value_type, _Np>; }
+    struct resize<_Np, _Vp, _Traits>
+    { using type = __similar_vec<typename _Vp::value_type, _Np, typename _Vp::abi_type>; };
+
+  template <__simd_size_type _Np, __simd_mask_type _Mp, _ArchTraits _Traits>
+    //requires requires { typename __deduce_abi_t<typename _Mp::value_type, _Np>; }
+    struct resize<_Np, _Mp, _Traits>
+    {
+      using _A1 = decltype(__abi_rebind<__mask_element_size<_Mp>, _Np, typename _Mp::abi_type,
+                                        true>());
+
+      static_assert(_Mp::abi_type::_S_variant == _A1::_S_variant or is_same_v<_A1, _ScalarAbi<_Np>>
+                      or is_same_v<typename _Mp::abi_type, _ScalarAbi<_Mp::size()>>);
+
+      using type = basic_mask<__mask_element_size<_Mp>, _A1>;
+    };
+
+  template <__simd_size_type _Np, typename _Vp>
+    using resize_t = typename resize<_Np, _Vp>::type;
+
+  // [simd.syn]
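+  // Special index values for permutations: zero_element produces a zero-initialized element,
+  // uninit_element leaves it unspecified ([simd.permute.static]).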
+  static constexpr __simd_size_type zero_element   = -1 << (sizeof(int) * __CHAR_BIT__ - 1);
+
+  static constexpr __simd_size_type uninit_element = zero_element + 1;
+
+  // [simd.permute.static]
+  template<__simd_size_type _Np = 0, __simd_vec_or_mask_type _Vp,
+           __index_permutation_function<_Vp> _IdxMap>
+    [[__gnu__::__always_inline__]]
+    constexpr resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>
+    permute(const _Vp& __v, _IdxMap&& __idxmap)
+    { return resize_t<_Np == 0 ? _Vp::size() : _Np, _Vp>::_S_static_permute(__v, __idxmap); }
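+  // e.g. permute(__v, _SwapNeighbors<1>()) swaps each pair of adjacent elements of __v.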
+
+  // [simd.permute.dynamic]
+  template<__simd_vec_or_mask_type _Vp, __simd_integral _Ip>
+    [[__gnu__::__always_inline__]]
+    constexpr resize_t<_Ip::size(), _Vp>
+    permute(const _Vp& __v, const _Ip& __indices)
+    { return __v[__indices]; }
+
+  // [simd.creation] ----------------------------------------------------------
+  template<__simd_vec_type _Vp, typename _Ap>
+    constexpr auto
+    chunk(const basic_vec<typename _Vp::value_type, _Ap>& __x) noexcept
+    { return __x.template _M_chunk<_Vp>(); }
+
+  template<__simd_mask_type _Mp, typename _Ap>
+    constexpr auto
+    chunk(const basic_mask<__mask_element_size<_Mp>, _Ap>& __x) noexcept
+    { return __x.template _M_chunk<_Mp>(); }
+
+  template<__simd_size_type _Np, typename _Tp, typename _Ap>
+    constexpr auto
+    chunk(const basic_vec<_Tp, _Ap>& __x) noexcept
+    { return chunk<resize_t<_Np, basic_vec<_Tp, _Ap>>>(__x); }
+
+  template<__simd_size_type _Np, size_t _Bytes, typename _Ap>
+    constexpr auto
+    chunk(const basic_mask<_Bytes, _Ap>& __x) noexcept
+    { return chunk<resize_t<_Np, basic_mask<_Bytes, _Ap>>>(__x); }
+
+  template<typename _Tp, typename... _Abis>
+    constexpr basic_vec<_Tp, __deduce_abi_t<_Tp, (_Abis::_S_size + ...)>>
+    cat(const basic_vec<_Tp, _Abis>&... __xs) noexcept
+    { return basic_vec<_Tp, __deduce_abi_t<_Tp, (_Abis::_S_size + ...)>>::_S_concat(__xs...); }
+
+  template<size_t _Bytes, typename... _Abis>
+    constexpr basic_mask<_Bytes, __deduce_abi_t<__integer_from<_Bytes>, (_Abis::_S_size + ...)>>
+    cat(const basic_mask<_Bytes, _Abis>&... __xs) noexcept
+    { static_assert(false, "TODO: cat"); }
+
+  // [simd.mask] --------------------------------------------------------------
+  template <size_t _Bytes, typename _Abi>
+    class basic_mask
+    {
+    public:
+      using value_type = bool;
+
+      using abi_type = _Abi;
+
+#define _GLIBCXX_DELETE_SIMD                                                                    \
+      _GLIBCXX_DELETE_MSG("This specialization is disabled because of an invalid combination "  \
+          "of template arguments to basic_mask.")
+
+      basic_mask() = _GLIBCXX_DELETE_SIMD;
+
+      ~basic_mask() = _GLIBCXX_DELETE_SIMD;
+
+      basic_mask(const basic_mask&) = _GLIBCXX_DELETE_SIMD;
+
+      basic_mask& operator=(const basic_mask&) = _GLIBCXX_DELETE_SIMD;
+
+#undef _GLIBCXX_DELETE_SIMD
+    };
+
+  template <size_t _Bytes, __abi_tag _Ap>
+    requires (_Ap::_S_nreg == 1) and (not _Ap::_S_is_cx_ileav)
+    class basic_mask<_Bytes, _Ap>
+    {
+      template <size_t, typename>
+        friend class basic_mask;
+
+      template <typename, typename>
+        friend class basic_vec;
+
+      static constexpr int _S_size = _Ap::_S_size;
+
+      static constexpr bool _S_is_scalar = is_same_v<_Ap, _ScalarAbi<_Ap::_S_size>>;
+
+      static constexpr bool _S_use_bitmask = [] {
+        if constexpr (_S_is_scalar)
+          return false;
+        else
+          return __flags_test(_Ap::_S_variant, _AbiVariant::_BitMask);
+      }();
+
+      static constexpr int _S_full_size = [] {
+        if constexpr (_S_is_scalar)
+          return _S_size;
+        else if constexpr (_S_use_bitmask and _S_size < __CHAR_BIT__)
+          return __CHAR_BIT__;
+        else
+          return __bit_ceil(unsigned(_S_size));
+      }();
+
+      static constexpr bool _S_is_partial = _S_size != _S_full_size;
+
+      using _DataType = typename _Ap::template _MaskDataType<_Bytes>;
+
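+      // All elements below _S_size are set (all-ones lanes or bits), padding elements are zero.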
+      static constexpr _DataType _S_implicit_mask = [] {
+        if constexpr (_S_is_scalar)
+          return true;
+        else if (not _S_is_partial)
+          return _DataType(~_DataType());
+        else if constexpr (_S_use_bitmask)
+          return _DataType((_DataType(1) << _S_size) - 1);
+        else
+          {
+            constexpr auto [...__is] = __iota<int[_S_full_size]>;
+            return _DataType{ (__is < _S_size ? -1 : 0)... };
+          }
+      }();
+
+      using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
+
+      static_assert(destructible<_VecType>);
+
+      static constexpr bool _S_has_bool_member = _S_is_scalar;
+
+      // Actual padding bytes, not padding elements.
+      // => _S_padding_bytes is 0 even if _S_is_partial is true.
+      static constexpr size_t _S_padding_bytes = 0;
+
+      _DataType _M_data;
+
+    public:
+      using value_type = bool;
+
+      using abi_type = _Ap;
+
+      using iterator = __iterator<basic_mask>;
+
+      using const_iterator = __iterator<const basic_mask>;
+
+      constexpr iterator
+      begin() noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      begin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      cbegin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr default_sentinel_t
+      end() const noexcept
+      { return {}; }
+
+      constexpr default_sentinel_t
+      cend() const noexcept
+      { return {}; }
+
+      static constexpr auto size = __simd_size_constant<_S_size>;
+
+      // internal but public API ----------------------------------------------
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_mask
+      _S_init(_DataType __x)
+      {
+        basic_mask __r;
+        __r._M_data = __x;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_mask
+      _S_init(unsigned_integral auto __bits)
+      { return basic_mask(__bits); }
+
+      /** \internal
+       * Bit-cast the given object \p __x to basic_mask.
+       *
+       * This is necessary for _S_nreg > 1, where the last element can be a plain bool, or when
+       * the sizeof doesn't match because the sub-masks have different alignment requirements.
+       */
+      template <size_t _UBytes, typename _UAbi>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_mask
+        _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+        { return __builtin_bit_cast(basic_mask, __x._M_concat_data()); }
+
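+      // Return the underlying data (vector-builtin or bit-mask) with padding elements cleared;
+      // a scalar mask is widened to a one-element vector-builtin.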
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_concat_data() const
+      {
+        if constexpr (_S_is_scalar)
+          return __vec_builtin_type<__integer_from<_Bytes>, 1>{__integer_from<_Bytes>(-_M_data)};
+        else if constexpr (_S_is_partial)
+          return _M_data & _S_implicit_mask;
+        else
+          return _M_data;
+      }
+
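+      // Return a mask with the first __n elements set and the remaining elements unset
+      // (callers guarantee 0 < __n < _S_size).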
+      template <_ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_mask
+        _S_partial_mask_of_n(int __n)
+        {
+          // assume __n > 0 and __n < _S_size
+          static_assert(not _S_is_scalar);
+          if constexpr (not _S_use_bitmask)
+            return _VecType([&](__integer_from<_Bytes> __i) { return __i; })
+                     < __integer_from<_Bytes>(__n);
+          else
+            {
+#if __has_builtin(__builtin_ia32_bzhi_si)
+              if constexpr (_S_size <= 32 and _Traits._M_have_bmi2())
+                return __builtin_ia32_bzhi_si(~0u >> (32 - _S_size), unsigned(__n));
+#endif
+#if __has_builtin(__builtin_ia32_bzhi_di)
+              if constexpr (_S_size <= 64 and _Traits._M_have_bmi2())
+                return __builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n));
+#endif
+              if constexpr (_S_size <= 32)
+                return (1u << unsigned(__n)) - 1;
+              else if constexpr (_S_size <= 64)
+                return (1ull << unsigned(__n)) - 1;
+              else
+                static_assert(false);
+            }
+        }
+
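+      // _M_and_neighbors / _M_or_neighbors combine each element with its even/odd neighbor,
+      // i.e. element __i becomes _M_data[__i] AND/OR _M_data[__i ^ 1].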
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask&
+      _M_and_neighbors()
+      {
+        if constexpr (_S_use_bitmask)
+          _M_data &= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+                       | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+        else
+          _M_data &= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+        return *this;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask&
+      _M_or_neighbors()
+      {
+        if constexpr (_S_use_bitmask)
+          _M_data |= ((_M_data >> 1) & 0x5555'5555'5555'5555ull)
+                       | ((_M_data << 1) & ~0x5555'5555'5555'5555ull);
+        else
+          _M_data |= _VecOps<_DataType>::_S_swap_neighbors(_M_data);
+        return *this;
+      }
+
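+      // Split *this into chunks of type _Mp: generally an array<_Mp, N> when _S_size is a
+      // multiple of _Mp::_S_size, otherwise a tuple whose last element is a resize_t of the
+      // remainder.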
+      template <typename _Mp>
+        [[__gnu__::__always_inline__]]
+        constexpr auto _M_chunk() const noexcept
+        {
+          constexpr int __n = _S_size / _Mp::_S_size;
+          constexpr int __rem = _S_size % _Mp::_S_size;
+          constexpr int __stride = _Mp::_S_size;
+          constexpr auto [...__is] = __iota<int[__n]>;
+          if constexpr (_S_is_scalar)
+            {
+              if constexpr (__n == 0)
+                return array<_Mp, 1> {*this};
+              else
+                return tuple<basic_mask> {*this};
+            }
+          else if constexpr (_S_use_bitmask != _Mp::_S_use_bitmask)
+            // convert to whatever _Mp uses first and then recurse into _M_chunk
+            return resize_t<_S_size, _Mp>(*this).template _M_chunk<_Mp>();
+          else if constexpr (_S_use_bitmask and _Mp::_S_use_bitmask)
+            {
+              static_assert(is_unsigned_v<_DataType>);
+              if constexpr (__rem == 0)
+                return array<_Mp, __n> {_Mp::_S_init(_M_data >> (__is * __stride))...};
+              else
+                {
+                  using _Rest = resize_t<__rem, _Mp>;
+                  return tuple {_Mp::_S_init(_M_data >> (__is * __stride))...,
+                                _Rest::_S_init([&] [[__gnu__::__always_inline__]]() {
+                                  if constexpr (is_same_v<typename _Rest::_DataType, bool>)
+                                    return operator[](__n * _Mp::_S_size);
+                                  else
+                                    return _M_data >> (__n * __stride);
+                                }())};
+                }
+            }
+          else if constexpr (__rem == 0)
+            {
+              if constexpr (_Mp::_S_size == 1)
+                return array<_Mp, __n> {_Mp(operator[](__is))...};
+              else
+                {
+                  static_assert(is_same_v<__vec_value_type<typename _Mp::_DataType>,
+                                          __vec_value_type<_DataType>>);
+                  return array<_Mp, __n> {
+                    _Mp::_S_init(
+                      _VecOps<typename _Mp::_DataType>::_S_extract(
+                        _M_data, integral_constant<int, __is * __stride>()))...};
+                }
+            }
+          else
+            {
+              using _Rest = resize_t<__rem, _Mp>;
+              return tuple {
+                _Mp::_S_init(
+                  _VecOps<typename _Mp::_DataType>::_S_extract(
+                    _M_data, integral_constant<int, __is * __stride>()))...,
+                _Rest::_S_init([&] [[__gnu__::__always_inline__]]() {
+                  if constexpr (is_same_v<typename _Rest::_DataType, bool>)
+                    return operator[](__n * _Mp::_S_size);
+                  else
+                    return _VecOps<typename _Rest::_DataType>::_S_extract(
+                             _M_data, integral_constant<int, __n * __stride>());
+                }())
+              };
+            }
+        }
+
+      // [simd.mask.overview] default constructor -----------------------------
+      basic_mask() = default;
+
+      // [simd.mask.overview] conversion extensions ---------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr
+      basic_mask(_DataType __x) requires(not _S_is_scalar and not _S_use_bitmask)
+        : _M_data(__x)
+      {}
+
+      [[__gnu__::__always_inline__]]
+      constexpr
+      operator _DataType() requires(not _S_is_scalar and not _S_use_bitmask)
+      { return _M_data; }
+
+      // [simd.mask.ctor] broadcast constructor -------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr explicit
+      basic_mask(same_as<bool> auto __x) noexcept
+        : _M_data(__x ? _S_implicit_mask : _DataType())
+      {}
+
+      // [simd.mask.ctor] conversion constructor ------------------------------
+      template <size_t _UBytes, typename _UAbi>
+        requires (_S_size == _UAbi::_S_size)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+        basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+          : _M_data([&] [[__gnu__::__always_inline__]] {
+              using _UV = basic_mask<_UBytes, _UAbi>;
+              // bool to bool
+              if constexpr (_S_is_scalar)
+                return __x[0];
+
+              // converting from an "array of bool"
+              else if constexpr (_UV::_S_is_scalar)
+                {
+                  constexpr auto [...__is] = __iota<int[_S_size]>;
+                  return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
+                }
+
+              // vec-/bit-mask to bit-mask | bit-mask to vec-mask
+              else if constexpr (_S_use_bitmask or _UV::_S_use_bitmask)
+                return basic_mask(__x.to_bitset())._M_data;
+
+              // vec-mask to vec-mask
+              // 2-mask-elements wrapper to plain mask
+              else if constexpr (_UAbi::_S_is_cx_ileav)
+                {
+                  if constexpr (sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+                                    and _UV::_S_padding_bytes == 0)
+                    {
+                      static_assert(not _S_has_bool_member and not _UV::_S_has_bool_member);
+                      return __builtin_bit_cast(_DataType, __x);
+                    }
+                  else if (not __builtin_is_constant_evaluated()
+                             and sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes)
+                    {
+                      _DataType __tmp = {};
+                      __builtin_memcpy(&__tmp, &__x, sizeof(__x) - _UV::_S_padding_bytes);
+                      return __tmp;
+                    }
+                  else if constexpr (_UBytes / _Bytes == 16) // ughh: extreme size ratio, construct element-wise
+                    {
+                      constexpr auto [...__is] = __iota<int[_S_size]>;
+                      return _DataType{__vec_value_type<_DataType>(-__x[__is])...};
+                    }
+                  else if constexpr (_Bytes > 1)
+                    {
+                      return reinterpret_cast<_DataType>(
+                               __vec_mask_cast<__vec_builtin_type_bytes<
+                                                 __integer_from<_Bytes / 2>, sizeof(_M_data)>>(
+                                 __x._M_data._M_concat_data()));
+                    }
+                  else if constexpr (_UBytes <= 8)
+                    {
+                      const auto __xv = __x._M_data._M_concat_data();
+                      return __vec_mask_cast<_DataType>(
+                               reinterpret_cast<__vec_builtin_type_bytes<
+                                                  __integer_from<_UBytes>, sizeof(__xv)>>(__xv));
+                    }
+                }
+              else if constexpr (sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+                                   and not _S_has_bool_member and not _UV::_S_has_bool_member
+                                   and not _UV::_S_use_bitmask and _UV::_S_padding_bytes == 0)
+                return __builtin_bit_cast(_DataType, __x);
+              else if (not __builtin_is_constant_evaluated()
+                         and sizeof(__x) == sizeof(_M_data) and _Bytes == _UBytes
+                         and not _S_has_bool_member and not _UV::_S_has_bool_member
+                         and not _UV::_S_use_bitmask and _UV::_S_padding_bytes != 0)
+                {
+                  _DataType __tmp = {};
+                  __builtin_memcpy(&__tmp, &__x, sizeof(__x) - _UV::_S_padding_bytes);
+                  return __tmp;
+                }
+              else
+                {
+#if _GLIBCXX_X86
+                  // TODO: turn this into a __vec_mask_cast overload in simd_x86.h
+                  if constexpr (_Bytes == 1 and _UBytes == 2)
+                    if (not __builtin_is_constant_evaluated() and not __x._M_is_constprop())
+                      {
+                        if constexpr (_UAbi::_S_nreg == 1)
+                          return __x86_cvt_vecmask<_DataType>(__x._M_data);
+                        else if constexpr (_UAbi::_S_nreg == 2)
+                          {
+                            auto __lo = __x._M_data0._M_data;
+                            auto __hi = __vec_zero_pad_to<sizeof(__lo)>(
+                                          __x._M_data1._M_concat_data());
+                            return __x86_cvt_vecmask<_DataType>(__lo, __hi);
+                          }
+                      }
+#endif
+                  return __vec_mask_cast<_DataType>(__x._M_concat_data());
+                }
+          }())
+        {}
+
+      // [simd.mask.ctor] generator constructor -------------------------------
+      template <__simd_generator_invokable<bool, _S_size> _Fp>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_mask(_Fp&& __gen)
+          : _M_data([&] [[__gnu__::__always_inline__]] {
+              constexpr auto [...__is] = __iota<int[_S_size]>;
+              if constexpr (_S_is_scalar)
+                return __gen(__simd_size_constant<0>);
+              else if constexpr (_S_use_bitmask)
+                return _DataType(((_DataType(__gen(__simd_size_constant<__is>)) << __is)
+                                    | ...));
+              else
+                return _DataType{__vec_value_type<_DataType>(
+                                   __gen(__simd_size_constant<__is>) ? -1 : 0)...};
+            }())
+        {}
+
+      template <__almost_simd_generator_invokable<bool, _S_size> _Fp>
+        constexpr explicit
+        basic_mask(_Fp&&)
+          = _GLIBCXX_DELETE_MSG("Invalid return type of the mask generator function: "
+                                "Needs to be 'bool'.");
+
+      // [simd.mask.ctor] bitset constructor ----------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr
+      basic_mask(const same_as<bitset<size()>> auto& __b) noexcept
+      : basic_mask(static_cast<_Bitmask<_S_size>>(__b.to_ullong()))
+      {
+        static_assert(_S_size <= 64); // more than 64 elements in one register? not yet.
+      }
+
+      // [simd.mask.ctor] uint constructor ------------------------------------
+      template <unsigned_integral _Tp>
+        requires (not same_as<_Tp, bool>)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_mask(_Tp __val) noexcept
+        : _M_data([&] [[__gnu__::__always_inline__]] () {
+            if constexpr (_S_use_bitmask)
+              return __val;
+            else if constexpr (_S_is_scalar)
+              return bool(__val & 1);
+            else if (__builtin_is_constant_evaluated() or __builtin_constant_p(__val))
+              {
+                constexpr auto [...__is] = __iota<int[_S_size]>;
+                return _DataType {__vec_value_type<_DataType>((__val & (1ull << __is)) == 0
+                                                                ? 0 : -1)...};
+              }
+            else
+              {
+                using _Ip = typename _VecType::value_type;
+                _VecType __v0 = _Ip(__val);
+                constexpr int __bits_per_element = sizeof(_Ip) * __CHAR_BIT__;
+                constexpr _VecType __pow2 = _VecType(1) << (__iota<_VecType> % __bits_per_element);
+                if constexpr (_S_size < __bits_per_element)
+                  return ((__v0 & __pow2) > 0)._M_concat_data();
+                else if constexpr (_S_size == __bits_per_element)
+                  return ((__v0 & __pow2) != 0)._M_concat_data();
+                else
+                  {
+                    static_assert(_Bytes == 1);
+                    static_assert(sizeof(_Ip) == 1);
+                    _Bitmask<_S_size> __bits = __val;
+                    static_assert(sizeof(_VecType) % sizeof(__bits) == 0);
+                    if constexpr (sizeof(_DataType) == 32)
+                      {
+                        __vec_builtin_type<_UInt<8>, 4> __v1 = {
+                          0xffu & (__bits >> (0 * __CHAR_BIT__)),
+                          0xffu & (__bits >> (1 * __CHAR_BIT__)),
+                          0xffu & (__bits >> (2 * __CHAR_BIT__)),
+                          0xffu & (__bits >> (3 * __CHAR_BIT__)),
+                        };
+                        __v1 *= 0x0101'0101'0101'0101ull;
+                        __v0 = __builtin_bit_cast(_VecType, __v1);
+                        return ((__v0 & __pow2) != 0)._M_data;
+                      }
+                    else
+                      {
+                        using _V1 = vec<_Ip, sizeof(__bits)>;
+                        _V1 __v1 = __builtin_bit_cast(_V1, __bits);
+                        __v0 = _VecType::_S_static_permute(__v1, [](int __i) {
+                                 return __i / __CHAR_BIT__;
+                               });
+                        return ((__v0 & __pow2) != 0)._M_data;
+                      }
+                  }
+              }
+          }())
+        {}
+
+      // Effects (of the uint constructor above): initializes the first M elements to the
+      // corresponding bit values in __val, where M is the smaller of size() and the number of
+      // bits in the value representation ([basic.types.general]) of the type of __val. If M is
+      // less than size(), the remaining elements are initialized to zero.
+
+      // [simd.mask.subscr] ---------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      operator[](__simd_size_type __i) const
+      {
+        if constexpr (_S_is_scalar)
+          return _M_data;
+        else if constexpr (_S_use_bitmask)
+          return bool((_M_data >> __i) & 1);
+        else
+          return _M_data[__i] & 1;
+      }
+
+      // [simd.mask.unary] ----------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask
+      operator!() const noexcept
+      {
+        if constexpr (_S_is_scalar)
+          return _S_init(!_M_data);
+        else
+          return _S_init(~_M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator+() const noexcept requires destructible<_VecType>
+      { return operator _VecType(); }
+
+      constexpr _VecType
+      operator+() const noexcept = delete;
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator-() const noexcept requires destructible<_VecType>
+      {
+        using _Ip = typename _VecType::value_type;
+        if constexpr (_S_is_scalar)
+          return -_Ip(_M_data);
+        else if constexpr (_S_use_bitmask)
+          return __select_impl(*this, _Ip(-1), _Ip());
+        else
+          {
+            static_assert(sizeof(_VecType) == sizeof(_M_data));
+            return __builtin_bit_cast(_VecType, _M_data);
+          }
+      }
+
+      constexpr _VecType
+      operator-() const noexcept = delete;
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator~() const noexcept requires destructible<_VecType>
+      {
+        using _Ip = typename _VecType::value_type;
+        if constexpr (_S_is_scalar)
+          return ~_Ip(_M_data);
+        else if constexpr (_S_use_bitmask)
+          return __select_impl(*this, _Ip(-2), _Ip(-1));
+        else
+          {
+            static_assert(sizeof(_VecType) == sizeof(_M_data));
+            return __builtin_bit_cast(_VecType, _M_data) - _Ip(1);
+          }
+      }
+
+      constexpr _VecType
+      operator~() const noexcept = delete;
+
+      // [simd.mask.conv] -----------------------------------------------------
+      template <typename _Up, typename _UAbi>
+        requires (__simd_size_v<_Up, _UAbi> == _S_size)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(sizeof(_Up) != _Bytes)
+        operator basic_vec<_Up, _UAbi>() const noexcept
+        {
+          if constexpr (_S_is_scalar)
+            return _Up(_M_data);
+          else
+            return __select_impl(*this, _Up(1), _Up(0));
+        }
+
+      // [simd.mask.namedconv] ------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr bitset<_S_size>
+      to_bitset() const noexcept
+      {
+        static_assert(_S_size <= 64); // more than 64 elements in one register? not yet.
+        return to_ullong();
+      }
+
+      /** \internal
+       * Return the mask as the smallest possible unsigned integer (up to 64 bits).
+       *
+       * \tparam _Offset       Adjust the return type & value to start at bit \p _Offset.
+       * \tparam _Use_2_for_1  Store the value of every second element into one bit of the result.
+       *                       (precondition: each even/odd pair stores the same value)
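+       *
+       * Element i maps to bit i + _Offset (with _Use_2_for_1, element 2i maps to bit i + _Offset);
+       * e.g. a 4-element mask {true, false, true, true} yields 0b1101 with the defaults.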
+       */
+      template <int _Offset = 0, bool _Use_2_for_1 = false, _ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr _Bitmask<_S_size / (_Use_2_for_1 + 1) + _Offset>
+        _M_to_uint() const
+        {
+          constexpr int __nbits = _S_size / (_Use_2_for_1 + 1);
+          static_assert(__nbits + _Offset <= 64);
+          static_assert(not (_S_is_scalar and _Use_2_for_1));
+          // before shifting
+          using _U0 = _Bitmask<__nbits>;
+          // potentially wider type needed for shift by _Offset
+          using _Ur = _Bitmask<__nbits + _Offset>;
+          if constexpr (_S_is_scalar or _S_use_bitmask)
+            {
+              auto __bits = _M_data;
+              if constexpr (_S_is_partial)
+                __bits &= _S_implicit_mask;
+              if constexpr (_Use_2_for_1)
+                __bits = __bit_extract_even<__nbits>(__bits);
+              return _Ur(__bits) << _Offset;
+            }
+          else if constexpr (_Bytes == 8 and _Use_2_for_1)
+            {
+              const auto __u32 = __vec_bit_cast<unsigned>(_M_data);
+              if constexpr (sizeof(_M_data) == 16)
+                {
+                  if constexpr (_Offset < 32)
+                    return __u32[0] & (1u << _Offset);
+                  else
+                    return _M_data[0] & (1ull << _Offset);
+                }
+              else if constexpr (sizeof(_M_data) == 32)
+                {
+                  if constexpr (_Offset < 31)
+                    return (__u32[4] & (2u << _Offset)) | (__u32[0] & (1u << _Offset));
+                  else
+                    return (_M_data[2] & (2ull << _Offset)) | (_M_data[0] & (1ull << _Offset));
+                }
+              else
+                static_assert(false);
+            }
+          else if constexpr (_Use_2_for_1 and __nbits == 1)
+            return _Ur(operator[](0)) << _Offset;
+          else
+            {
+#if _GLIBCXX_X86
+              if (not __builtin_is_constant_evaluated() and not _M_is_constprop())
+                {
+                  _U0 __uint;
+                  if constexpr (_Use_2_for_1)
+                    {
+                      static_assert(_Bytes * 2 != 2); // because of missing movmskw
+                      __uint = __x86_movmsk(__vec_bit_cast<__integer_from<_Bytes * 2>>(_M_data));
+                    }
+                  else if constexpr (_Bytes != 2) // movmskb would duplicate each bit
+                    __uint = __x86_movmsk(_M_data);
+                  else if constexpr (_Bytes == 2 and _Traits._M_have_bmi2())
+                    __uint = __bit_extract_even<__nbits>(__x86_movmsk(_M_data));
+                  else if constexpr (_Bytes == 2)
+                    return __similar_mask<char, __nbits, _Ap>(*this).template _M_to_uint<_Offset>();
+                  else
+                    static_assert(false);
+                  return _Ur(__uint) << _Offset;
+                  // TODO: with AVX512 use __builtin_ia32_cvt[bwdq]2mask(128|256|512)
+                  // TODO: Ask for compiler builtin to do the best of the above. This should also
+                  // combine with a preceding vector-mask compare to produce a bit-mask compare (on
+                  // AVX512)
+                }
+#endif
+              using _IV = conditional_t<_Use_2_for_1,
+                                        __similar_vec<__integer_from<_Bytes * 2>, __nbits, _Ap>,
+                                        _VecType>;
+              static_assert(destructible<_IV>);
+              const typename _IV::mask_type& __k = [&] [[__gnu__::__always_inline__]] () {
+                if constexpr (_Use_2_for_1)
+                  return typename _IV::mask_type(__to_cx_ileav(*this));
+                else if constexpr (is_same_v<typename _IV::mask_type, basic_mask>)
+                  return *this;
+                else
+                  return typename _IV::mask_type(*this);
+              }();
+              constexpr int __n = _IV::size();
+              if constexpr (_Bytes * __CHAR_BIT__ >= __n) // '1 << __iota' cannot overflow
+                {
+                  constexpr _IV __pow2 = _IV(1) << __iota<_IV>;
+                  return _Ur(_U0(__select_impl(__k, __pow2, _IV())
+                                   ._M_reduce(bit_or<>()))) << _Offset;
+                }
+              else if constexpr (__n % 8 != 0)
+                {
+                  constexpr int __n_lo = __n - __n % 8;
+                  const auto [__lo, __hi] = chunk<__n_lo>(__k);
+                  _Ur __bits = __hi.template _M_to_uint<_Offset + __n_lo, _Use_2_for_1>();
+                  return __bits | __lo.template _M_to_uint<_Offset, _Use_2_for_1>();
+                }
+              else
+                {
+                  constexpr _IV __pow2 = _IV(1) << __iota<_IV> % _IV(8);
+                  _IV __x = __select_impl(__k, __pow2, _IV());
+                  __x |= _IV::_S_static_permute(__x, _SwapNeighbors<4>());
+                  __x |= _IV::_S_static_permute(__x, _SwapNeighbors<2>());
+                  __x |= _IV::_S_static_permute(__x, _SwapNeighbors<1>());
+                  __x = _IV::_S_static_permute(__x, [](int __i) {
+                          return __i * 8 < __n ? __i * 8 : uninit_element;
+                        });
+                  _U0 __bits = __builtin_bit_cast(
+                                 __similar_vec<_U0, __n * _Bytes / sizeof(_U0), _Ap>, __x)[0];
+                  if constexpr (not __has_single_bit(unsigned(__nbits)))
+                    __bits &= (_U0(1) << __nbits) - 1;
+                  return _Ur(__bits) << _Offset;
+                }
+            }
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr unsigned long long
+      to_ullong() const
+      { return _M_to_uint(); }
+
+      // [simd.mask.binary] ---------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data & __y._M_data); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data | __y._M_data); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data & __y._M_data); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data | __y._M_data); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data ^ __y._M_data); }
+
+      // [simd.mask.cassign] --------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data &= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data |= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data ^= __y._M_data;
+        return __x;
+      }
+
+      // [simd.mask.comparison] -----------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !(__x ^ __y); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x ^ __y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x || !__y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !__x || __y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x && !__y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !__x && __y; }
+
+      // [simd.mask.cond] -----------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+      {
+        if constexpr (not _S_use_bitmask)
+          {
+#if _GLIBCXX_X86
+            // This works around bad code-gen when the compiler can't see that __k is a
+            // vector-mask: the pattern is recognized and matched to the x86 blend instructions,
+            // which only consider the sign bit of the mask register. Also, without SSE4, if the
+            // compiler knows that __k is a vector-mask, the '< 0' is elided.
+            return __k._M_data < 0 ? __t._M_data : __f._M_data;
+#endif
+            return __k._M_data ? __t._M_data : __f._M_data;
+          }
+        else
+          return (__k._M_data & __t._M_data) | (~__k._M_data & __f._M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+      {
+        if (__t == __f)
+          return basic_mask(__t);
+        else
+          return __t ? __k : !__k;
+      }
+
+      template <__vectorizable _T0, same_as<_T0> _T1>
+        requires (sizeof(_T0) == _Bytes)
+        [[__gnu__::__always_inline__]]
+        friend constexpr vec<_T0, _S_size>
+        __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+        {
+          if constexpr (_S_is_scalar)
+            return __k._M_data ? __t : __f;
+          else
+            {
+              using _Vp = vec<_T0, _S_size>;
+              using _Mp = typename _Vp::mask_type;
+              return __select_impl(_Mp(__k), _Vp(__t), _Vp(__f));
+            }
+        }
+
+      // [simd.mask.reductions] implementation --------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr bool
+      _M_all_of() const noexcept
+      {
+        if constexpr (_S_is_scalar)
+          return _M_data;
+        else if constexpr (_S_use_bitmask)
+          {
+            if constexpr (_S_is_partial)
+              // PR120925 (partial kortest pattern not recognized)
+              return (_M_data & _S_implicit_mask) == _S_implicit_mask;
+            else
+              return _M_data == _S_implicit_mask;
+          }
+#if _GLIBCXX_X86
+        else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+          return __x86_vecmask_all<_S_size>(_M_data);
+#endif
+        else
+          return _VecOps<_DataType, _S_size>::_S_all_of(_M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr bool
+      _M_any_of() const noexcept
+      {
+        if constexpr (_S_is_scalar)
+          return _M_data;
+        else if constexpr (_S_use_bitmask)
+          {
+            if constexpr (_S_is_partial)
+              // PR120925 (partial kortest pattern not recognized)
+              return (_M_data & _S_implicit_mask) != 0;
+            else
+              return _M_data != 0;
+          }
+#if _GLIBCXX_X86
+        else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+          return __x86_vecmask_any<_S_size>(_M_data);
+#endif
+        else
+          return _VecOps<_DataType, _S_size>::_S_any_of(_M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr bool
+      _M_none_of() const noexcept
+      {
+        if constexpr (_S_is_scalar)
+          return not _M_data;
+        else if constexpr (_S_use_bitmask)
+          {
+            if constexpr (_S_is_partial)
+              // PR120925 (partial kortest pattern not recognized)
+              return (_M_data & _S_implicit_mask) == 0;
+            else
+              return _M_data == 0;
+          }
+#if _GLIBCXX_X86
+        else if (not (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data)))
+          return __x86_vecmask_none<_S_size>(_M_data);
+#endif
+        else
+          return _VecOps<_DataType, _S_size>::_S_none_of(_M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr __simd_size_type
+      _M_reduce_count() const noexcept
+      {
+        if constexpr (_S_is_scalar)
+          return int(_M_data);
+        else if constexpr (_S_size <= sizeof(int) * __CHAR_BIT__)
+          return __builtin_popcount(_M_to_uint());
+        else
+          return __builtin_popcountll(to_ullong());
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr __simd_size_type
+      _M_reduce_min_index() const
+      {
+        if constexpr (_S_size == 1)
+          return 0;
+        else
+          return __lowest_bit(_M_to_uint());
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr __simd_size_type
+      _M_reduce_max_index() const
+      {
+        if constexpr (_S_size == 1)
+          return 0;
+        else
+          return __highest_bit(_M_to_uint());
+      }
+
+      [[__gnu__::__always_inline__]]
+      bool
+      _M_is_constprop() const
+      { return __builtin_constant_p(_M_data); }
+    };
+
+  template <size_t _Bytes, __abi_tag _Ap>
+    requires (_Ap::_S_nreg > 1) and (not _Ap::_S_is_cx_ileav)
+    class basic_mask<_Bytes, _Ap>
+    {
+      template <size_t, typename>
+        friend class basic_mask;
+
+      template <typename, typename>
+        friend class basic_vec;
+
+      static constexpr int _S_size = _Ap::_S_size;
+
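+      // Split into a power-of-two-sized lower part and the remainder,
+      // e.g. _S_size == 17 gives _N0 == 16 and _N1 == 1.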
+      static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
+
+      static constexpr int _N1 = _S_size - _N0;
+
+      static constexpr int _Nreg0 = __bit_ceil(unsigned(_Ap::_S_nreg)) / 2;
+
+      static constexpr int _Nreg1 = _Ap::_S_nreg - _Nreg0;
+
+      using _Abi0 = conditional_t<_N0 == _Nreg0 or is_same_v<_Ap, _ScalarAbi<_S_size>>,
+                                  _ScalarAbi<_N0>, _Abi<_N0, _Nreg0, _Ap::_S_variant>>;
+
+      using _Abi1 = conditional_t<_N1 == _Nreg1 or is_same_v<_Ap, _ScalarAbi<_S_size>>,
+                                  _ScalarAbi<_N1>, _Abi<_N1, _Nreg1, _Ap::_S_variant>>;
+
+      using _Mask0 = basic_mask<_Bytes, _Abi0>;
+
+      using _Mask1 = basic_mask<_Bytes, _Abi1>;
+
+      // _Ap::_S_nreg determines how deep the recursion goes. E.g. basic_mask<4, _Abi<8, 4>> cannot
+      // use basic_mask<4, _Abi<4, 1>> as _Mask0/1 types.
+      static_assert(_Mask0::abi_type::_S_nreg + _Mask1::abi_type::_S_nreg == _Ap::_S_nreg);
+
+      static constexpr bool _S_use_bitmask = _Mask0::_S_use_bitmask;
+
+      static constexpr bool _S_is_scalar = _Mask0::_S_is_scalar;
+
+      _Mask0 _M_data0;
+
+      _Mask1 _M_data1;
+
+      using _VecType = __simd_vec_from_mask_t<_Bytes, _Ap>;
+
+      static constexpr bool _S_has_bool_member = _Mask1::_S_has_bool_member;
+
+      static constexpr size_t _S_padding_bytes
+        = __alignof__(_Mask0) - sizeof(_Mask1) + _Mask1::_S_padding_bytes;
+
+    public:
+      using value_type = bool;
+
+      using abi_type = _Ap;
+
+      using iterator = __iterator<basic_mask>;
+
+      using const_iterator = __iterator<const basic_mask>;
+
+      constexpr iterator
+      begin() noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      begin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      cbegin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr default_sentinel_t
+      end() const noexcept
+      { return {}; }
+
+      constexpr default_sentinel_t
+      cend() const noexcept
+      { return {}; }
+
+      static constexpr auto size = __simd_size_constant<_S_size>;
+
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_mask
+      _S_init(const _Mask0& __x, const _Mask1& __y)
+      {
+        basic_mask __r;
+        __r._M_data0 = __x;
+        __r._M_data1 = __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_mask
+      _S_init(unsigned_integral auto __bits)
+      { return basic_mask(__bits); }
+
+      template <typename _U0, typename _U1>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_mask
+        _S_init(const __trivial_pair<_U0, _U1>& __bits)
+        {
+          if constexpr (is_unsigned_v<_U0>)
+            {
+              static_assert(is_unsigned_v<_U1>);
+              return _S_init(_Mask0(__bits._M_first), _Mask1(__bits._M_second));
+            }
+          else if constexpr (is_unsigned_v<_U1>)
+            return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1(__bits._M_second));
+          else
+            return _S_init(_Mask0::_S_init(__bits._M_first), _Mask1::_S_init(__bits._M_second));
+        }
+
+      template <size_t _UBytes, typename _UAbi>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_mask
+        _S_recursive_bit_cast(const basic_mask<_UBytes, _UAbi>& __x)
+        {
+          using _Mp = basic_mask<_UBytes, _UAbi>;
+          if constexpr (_Mp::_S_has_bool_member or sizeof(basic_mask) > sizeof(__x))
+            return _S_init(__builtin_bit_cast(_Mask0, __x._M_data0),
+                           _Mask1::_S_recursive_bit_cast(__x._M_data1));
+          else if constexpr (sizeof(basic_mask) == sizeof(__x))
+            return __builtin_bit_cast(basic_mask, __x);
+          else
+            { // e.g. on IvyBridge (different alignment => different sizeof)
+              struct _Tmp { alignas(_Mp) basic_mask _M_data; };
+              return __builtin_bit_cast(_Tmp, __x)._M_data;
+            }
+        }
+
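+      // Concatenate the two halves into a single bit-mask integer or vector-builtin,
+      // zero-padding the upper half to the size of the lower one in the vector case.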
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_concat_data() const
+      {
+        if constexpr (_S_use_bitmask)
+          {
+            static_assert(_S_size <= sizeof(0ull) * __CHAR_BIT__, "cannot concat more than 64 bits");
+            using _Up = _Bitmask<_S_size>;
+            return _Up(_M_data0._M_concat_data() | (_Up(_M_data1._M_concat_data()) << _N0));
+          }
+        else
+          {
+            auto __lo = _M_data0._M_concat_data();
+            auto __hi = __vec_zero_pad_to<sizeof(__lo)>(_M_data1._M_concat_data());
+            return __vec_concat(__lo, __hi);
+          }
+      }
+
+      template <_ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_mask
+        _S_partial_mask_of_n(int __n)
+        {
+#if __has_builtin(__builtin_ia32_bzhi_di)
+          if constexpr (_S_use_bitmask and _S_size <= 64 and _Traits._M_have_bmi2())
+            return __builtin_ia32_bzhi_di(~0ull >> (64 - _S_size), unsigned(__n));
+#endif
+          if (__n < _N0)
+            return _S_init(_Mask0::_S_partial_mask_of_n(__n), _Mask1(false));
+          else if (__n == _N0)
+            return _S_init(_Mask0(true), _Mask1(false));
+          else
+            return _S_init(_Mask0(true), _Mask1::_S_partial_mask_of_n(__n - _N0));
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask&
+      _M_and_neighbors()
+      {
+        _M_data0._M_and_neighbors();
+        _M_data1._M_and_neighbors();
+        return *this;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask&
+      _M_or_neighbors()
+      {
+        _M_data0._M_or_neighbors();
+        _M_data1._M_or_neighbors();
+        return *this;
+      }
+
+      template <typename _Mp>
+        [[__gnu__::__always_inline__]]
+        constexpr auto
+        _M_chunk() const noexcept
+        {
+          constexpr int __n = _S_size / _Mp::_S_size;
+          constexpr int __rem = _S_size % _Mp::_S_size;
+          [[maybe_unused]] constexpr auto [...__is] = __iota<int[__n]>;
+          if constexpr (_N0 == _Mp::_S_size)
+            {
+              if constexpr (__rem == 0 and is_same_v<_Mp, _Mask0>)
+                return array<_Mp, __n> {_M_data0, _M_data1};
+              else if constexpr (__rem == 0)
+                return array<_Mp, __n> {_Mp(_M_data0), _Mp(_M_data1)};
+              else
+                return tuple<_Mp, resize_t<__rem, _Mp>> {_M_data0, _M_data1};
+            }
+          else if constexpr (__rem == 0)
+            {
+              using _Rp = array<_Mp, __n>;
+              if constexpr (sizeof(_Rp) == sizeof(*this))
+                {
+                  static_assert(not _Mp::_S_is_partial);
+                  return __builtin_bit_cast(_Rp, *this);
+                }
+              else
+                {
+                  return _Rp {_Mp([&](int __i) { return (*this)[__i + __is * _Mp::_S_size]; })...};
+                }
+            }
+          else
+            {
+              using _Rest = resize_t<__rem, _Mp>;
+              // can't bit-cast because the member order of tuple is reversed
+              return tuple {
+                _Mp  ([&](int __i) { return (*this)[__i + __is * _Mp::_S_size]; })...,
+                _Rest([&](int __i) { return (*this)[__i + __n * _Mp::_S_size]; })
+              };
+            }
+        }
+
+      // [simd.mask.overview] default constructor -----------------------------
+      basic_mask() = default;
+
+      // [simd.mask.overview] conversion extensions ---------------------------
+      // TODO: any?
+
+      // [simd.mask.ctor] broadcast constructor -------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr explicit
+      basic_mask(same_as<bool> auto __x) noexcept
+        : _M_data0(__x), _M_data1(__x)
+      {}
+
+      // [simd.mask.ctor] conversion constructor ------------------------------
+      template <size_t _UBytes, typename _UAbi>
+        requires (_S_size == _UAbi::_S_size)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(__is_mask_conversion_explicit<_Ap, _UAbi>(_Bytes, _UBytes))
+        basic_mask(const basic_mask<_UBytes, _UAbi>& __x) noexcept
+          : _M_data0([&] {
+              if constexpr (_UAbi::_S_nreg > 1)
+                {
+                  if constexpr (_UAbi::_S_is_cx_ileav)
+                    return __to_cx_ileav(__x._M_data._M_data0);
+                  else
+                    return __x._M_data0;
+                }
+              else
+                return get<0>(chunk<_N0>(__x));
+            }()),
+            _M_data1([&] {
+              if constexpr (_UAbi::_S_nreg > 1)
+                {
+                  if constexpr (_UAbi::_S_is_cx_ileav)
+                    return __to_cx_ileav(__x._M_data._M_data1);
+                  else
+                    return __x._M_data1;
+                }
+              else
+                return get<1>(chunk<_N0>(__x));
+            }())
+        {}
+
+      // [simd.mask.ctor] generator constructor -------------------------------
+      template <__simd_generator_invokable<bool, _S_size> _Fp>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_mask(_Fp&& __gen)
+          : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
+                               return __gen(__simd_size_constant<__i + _N0>);
+                             })
+        {}
+
+      template <__almost_simd_generator_invokable<bool, _S_size> _Fp>
+        constexpr explicit
+        basic_mask(_Fp&&)
+          = _GLIBCXX_DELETE_MSG("Invalid return type of the mask generator function: "
+                                "Needs to be 'bool'.");
+
+      // [simd.mask.ctor] bitset constructor ----------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr
+      basic_mask(const same_as<bitset<size()>> auto& __b) noexcept
+      : _M_data0(__bitset_split<_N0>(__b)._M_lo), _M_data1(__bitset_split<_N0>(__b)._M_hi)
+      {}
+
+      // [simd.mask.ctor] uint constructor ------------------------------------
+      template <unsigned_integral _Tp>
+        requires (not same_as<_Tp, bool>)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_mask(_Tp __val) noexcept
+        : _M_data0(static_cast<_Bitmask<_N0>>(__val)),
+          _M_data1(sizeof(_Tp) * __CHAR_BIT__ > _N0
+                     ? static_cast<_Bitmask<_N1>>(__val >> _N0) : _Bitmask<_N1>())
+        {}
+
+      // [simd.mask.subscr] ---------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      operator[](__simd_size_type __i) const
+      {
+        // in some cases the last element can be 'bool' instead of bit-/vector-mask;
+        // e.g. mask<short, 17> is {mask<short, 16>, mask<short, 1>}, where the latter uses
+        // _ScalarAbi<1>, which is stored as 'bool'
+        if constexpr (_M_data1._S_has_bool_member)
+          {
+            if (__i < _N0)
+              return _M_data0[__i];
+            else
+              return _M_data1[__i - _N0];
+          }
+        else if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_CxIleav))
+          {
+            // values are duplicated
+            if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_BitMask))
+              {
+                struct _Tmp
+                {
+                  alignas(basic_mask) unsigned char _M_bytes[__div_ceil(2 * _S_size, __CHAR_BIT__)];
+                };
+                return bool((__builtin_bit_cast(_Tmp, *this)
+                               ._M_bytes[2 * __i / __CHAR_BIT__] >> (2 * __i % __CHAR_BIT__)) & 1);
+              }
+            else
+              {
+                struct _Tmp
+                {
+                  alignas(basic_mask) __integer_from<_Bytes / 2> _M_values[2 * _S_size];
+                };
+                return __builtin_bit_cast(_Tmp, *this)._M_values[2 * __i] != 0;
+              }
+          }
+        else if constexpr (__flags_test(abi_type::_S_variant, _AbiVariant::_BitMask))
+          {
+            struct _Tmp
+            {
+              alignas(basic_mask) unsigned char _M_bytes[__div_ceil(_S_size, __CHAR_BIT__)];
+            };
+            return bool((__builtin_bit_cast(_Tmp, *this)
+                           ._M_bytes[__i / __CHAR_BIT__] >> (__i % __CHAR_BIT__)) & 1);
+          }
+        else
+          {
+            struct _Tmp
+            {
+              alignas(basic_mask) __integer_from<_Bytes> _M_values[_S_size];
+            };
+            return __builtin_bit_cast(_Tmp, *this)._M_values[__i] != 0;
+          }
+      }
+
+      // [simd.mask.unary] ----------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr basic_mask
+      operator!() const noexcept
+      { return _S_init(!_M_data0, !_M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator+() const noexcept requires destructible<_VecType>
+      { return _VecType::_S_concat(+_M_data0, +_M_data1); }
+
+      constexpr _VecType
+      operator+() const noexcept = delete;
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator-() const noexcept requires destructible<_VecType>
+      { return _VecType::_S_concat(-_M_data0, -_M_data1); }
+
+      constexpr _VecType
+      operator-() const noexcept = delete;
+
+      [[__gnu__::__always_inline__]]
+      constexpr _VecType
+      operator~() const noexcept requires destructible<_VecType>
+      { return _VecType::_S_concat(~_M_data0, ~_M_data1); }
+
+      constexpr _VecType
+      operator~() const noexcept = delete;
+
+      // [simd.mask.conv] -----------------------------------------------------
+      template <typename _Up, typename _UAbi>
+        requires (__simd_size_v<_Up, _UAbi> == _S_size)
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(sizeof(_Up) != _Bytes)
+        operator basic_vec<_Up, _UAbi>() const noexcept
+        {
+          using _Rp = basic_vec<_Up, _UAbi>;
+          return _Rp::_S_init(_M_data0, _M_data1);
+        }
+
+      // [simd.mask.namedconv] ------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr bitset<_S_size>
+      to_bitset() const noexcept
+      {
+        if constexpr (_S_size <= 64)
+          return to_ullong();
+        else
+          {
+            static_assert(_N0 % 64 == 0);
+            struct _Tmp
+            {
+              bitset<_N0> _M_lo;
+              bitset<_N1> _M_hi;
+            } __tmp = {_M_data0.to_bitset(), _M_data1.to_bitset()};
+            return __builtin_bit_cast(bitset<_S_size>, __tmp);
+          }
+      }
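+      // Illustrative sketch: bit __i of the bitset holds element __i, so to_bitset()
+      // round-trips through the bitset constructor above, e.g. (with the assumed mask alias)
+      //   std::simd::mask<int, 4> __k(0b0011u);
+      //   // __k.to_bitset().to_ulong() == 0b0011; mask<int, 4>(__k.to_bitset()) reconstructs __k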
+
+      template <int _Offset = 0, bool _Use_2_for_1 = false, _ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr auto
+        _M_to_uint() const
+        {
+          constexpr int _N0x = _Use_2_for_1 ? _N0 / 2 : _N0;
+          if constexpr (_N0x >= 64)
+            {
+              static_assert(_Offset == 0);
+              return __trivial_pair {
+                _M_data0.template _M_to_uint<0, _Use_2_for_1>(),
+                _M_data1.template _M_to_uint<0, _Use_2_for_1>()
+              };
+            }
+          else
+            {
+#if _GLIBCXX_X86
+              if constexpr (_Bytes == 2 and not _Traits._M_have_bmi2() and _Ap::_S_nreg == 2
+                              and not _S_use_bitmask and not _Use_2_for_1)
+                return __similar_mask<char, _S_size, _Ap>(*this).template _M_to_uint<_Offset>();
+#endif
+              auto __uint = _M_data1.template _M_to_uint<_N0x + _Offset, _Use_2_for_1>();
+              __uint |= _M_data0.template _M_to_uint<_Offset, _Use_2_for_1>();
+              return __uint;
+            }
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr unsigned long long
+      to_ullong() const
+      {
+        if constexpr (_S_size <= 64)
+          return _M_to_uint();
+        else
+          {
+            __glibcxx_simd_precondition(_M_data1.to_ullong() == 0,
+                                        "to_ullong called on mask with 'true' elements at "
+                                        "indices 64 and higher");
+            return _M_data0.to_ullong();
+          }
+      }
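+      // Illustrative sketch: bit __i of the result holds element __i; wider masks require all
+      // elements beyond the first 64 to be false (see the precondition above), e.g.
+      //   std::simd::mask<int, 3> __k(0b101u);
+      //   // __k.to_ullong() == 0b101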
+
+      // [simd.mask.binary] ---------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator&&(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data0 && __y._M_data0, __x._M_data1 && __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator||(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data0 || __y._M_data0, __x._M_data1 || __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator&(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data0 & __y._M_data0, __x._M_data1 & __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator|(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data0 | __y._M_data0, __x._M_data1 | __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator^(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return _S_init(__x._M_data0 ^ __y._M_data0, __x._M_data1 ^ __y._M_data1); }
+
+      // [simd.mask.cassign] --------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator&=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data0 &= __y._M_data0;
+        __x._M_data1 &= __y._M_data1;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator|=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data0 |= __y._M_data0;
+        __x._M_data1 |= __y._M_data1;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask&
+      operator^=(basic_mask& __x, const basic_mask& __y) noexcept
+      {
+        __x._M_data0 ^= __y._M_data0;
+        __x._M_data1 ^= __y._M_data1;
+        return __x;
+      }
+
+      // [simd.mask.comparison] -----------------------------------------------
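+      // Element-wise bool comparisons with true > false; e.g. (__x >= __y) is computed as
+      // (__x || !__y), i.e. per element __y implies __x.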
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator==(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !(__x ^ __y); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator!=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x ^ __y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator>=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x || !__y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator<=(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !__x || __y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator>(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return __x && !__y; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      operator<(const basic_mask& __x, const basic_mask& __y) noexcept
+      { return !__x && __y; }
+
+      // [simd.mask.cond] -----------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      __select_impl(const basic_mask& __k, const basic_mask& __t, const basic_mask& __f) noexcept
+      {
+        return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
+                       __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
+      }
+
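+      // With two bool arguments the blend degenerates: equal values broadcast that value,
+      // (true, false) yields __k itself, and (false, true) yields !__k.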
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_mask
+      __select_impl(const basic_mask& __k, same_as<bool> auto __t, same_as<bool> auto __f) noexcept
+      {
+        if (__t == __f)
+          return basic_mask(__t);
+        else
+          return __t ? __k : !__k;
+      }
+
+      template <__vectorizable _T0, same_as<_T0> _T1>
+        requires (sizeof(_T0) == _Bytes)
+        [[__gnu__::__always_inline__]]
+        friend constexpr vec<_T0, _S_size>
+        __select_impl(const basic_mask& __k, const _T0& __t, const _T1& __f) noexcept
+        {
+          using _Vp = vec<_T0, _S_size>;
+          if constexpr (__complex_like<_T0>)
+            return _Vp::_S_concat(__select_impl(__k._M_data0, __t, __f),
+                                  __select_impl(__k._M_data1, __t, __f));
+          else
+            return _Vp::_S_init(__select_impl(__k._M_data0, __t, __f),
+                                __select_impl(__k._M_data1, __t, __f));
+        }
+
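+      // Horizontal reductions over both chunks: when the chunks have equal size they are
+      // first combined element-wise so only a single reduction is needed; otherwise each
+      // chunk is reduced separately.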
+      template <_ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr bool
+        _M_all_of() const
+        {
+          if constexpr (_N0 == _N1)
+            return (_M_data0 and _M_data1)._M_all_of();
+          else
+            return _M_data0._M_all_of() and _M_data1._M_all_of();
+        }
+
+      template <_ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr bool
+        _M_any_of() const
+        {
+          if constexpr (_N0 == _N1)
+            return (_M_data0 or _M_data1)._M_any_of();
+          else
+            return _M_data0._M_any_of() or _M_data1._M_any_of();
+        }
+
+      template <_ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr bool
+        _M_none_of() const
+        {
+          if constexpr (_N0 == _N1)
+            return (_M_data0 or _M_data1)._M_none_of();
+          else
+            return _M_data0._M_none_of() and _M_data1._M_none_of();
+        }
+
+      [[__gnu__::__always_inline__]]
+      bool
+      _M_is_constprop() const
+      { return _M_data0._M_is_constprop() and _M_data1._M_is_constprop(); }
+    };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_MASK_H
