[PATCH] libstdc++: Fix up std::generate_canonical for 32-bit arches

Jakub Jelinek Wed, 17 Dec 2025 06:50:15 -0800

Hi!

Actually, the
FAIL: 26_numerics/random/uniform_real_distribution/operators/64351.cc  
-std=gnu++20 execution test                                                     
                               
FAIL: 26_numerics/random/uniform_real_distribution/operators/gencanon.cc  
-std=gnu++20 execution test                                                     
                            
errors are gone when testing with --target_board=unix/-m32/-msse2/-mfpmath=sse
or when the tests are compiled with -O0, which means that it isn't buggy
unsigned __int128 emulation in that case, but rather either those tests or
something in random.tcc not being extended precision clean.


So, here is the full patch.  It uses what Tomasz was asking for, i.e. for
#ifdef __SIZEOF_INT128__ uses __extension__ using __rng_uint128 = unsigned 
__int128;
and otherwise the new class.  Tested on x86_64-linux and i686-linux, ok for
trunk?

2025-12-17  Jakub Jelinek  <[email protected]>

        * include/bits/random.tcc (std::__detail::__rng_uint128): For
        #ifdef __SIZEOF_INT128__ new using directive with __extension__,
        otherwise new class to emulate unsigned __int128 on 32-bit arches.
        (std::__detail::operator+, std::__detail::operator<<,
        std::__detail::operator>>, std::__detail::operator|,
        std::__detail::operator*, std::__detail::operator/,
        std::__detail::operator<): New overloaded operators for __rng_uint128.
        (std::__generate_canonical_pow2): Add __log2_uint_max template
        argument, remove it as automatic variable, remove __uint_range_less_1
        variable, cast 2.0 to _RealT.
        (std::__generate_canonical_any): Remove static_assert.  Formatting
        fix.  Use __sum = __sum + ...; instead of __sum += ...;.  Cast __rd
        to _FloatT before division.
        (std::generate_canonical): Remove unused _Rng alias.  Add extra
        template argument for __generate_canonical_pow2 calls.  For 128-bit
        calls remove __extension__ and use __detail::__rng_uint128 instead of
        unsigned __int128 and call it regardless of whether __SIZEOF_INT128__
        is defined or not.

--- libstdc++-v3/include/bits/random.tcc.jj     2025-12-17 15:21:21.216716403 
+0100
+++ libstdc++-v3/include/bits/random.tcc        2025-12-17 15:22:32.497121999 
+0100
@@ -3481,6 +3481,213 @@ namespace __detail
 #pragma GCC diagnostic ignored "-Wc++14-extensions" // for variable templates
 #pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
 
+  namespace __detail
+  {
+#ifdef __SIZEOF_INT128__
+    __extension__ using __rng_uint128 = unsigned __int128;
+#else
+    // Note, this is not a full unsigned __int128 emulation, just
+    // enough for __generate_* purposes below.
+    struct __rng_uint128 {
+      uint64_t _M_l, _M_h;
+
+      constexpr __rng_uint128(uint64_t __x) noexcept
+      : _M_l (__x), _M_h (0) {}
+
+      constexpr __rng_uint128(const __rng_uint128& __x) noexcept = default;
+
+      constexpr __rng_uint128(uint64_t __h, uint64_t __l) noexcept
+      : _M_l (__l), _M_h (__h) {}
+
+      template<typename _RealT>
+      constexpr operator _RealT() const noexcept {
+       static_assert(std::is_floating_point<_RealT>::value,
+         "template argument must be a floating point type");
+       return _M_h == 0 ? _RealT(_M_l)
+                        : _RealT(_M_h) * _RealT(18446744073709551616.0)
+                          + _RealT(_M_l);
+      }
+
+      __rng_uint128& operator*=(const __rng_uint128& __x) noexcept
+      {
+       uint64_t __a[12] = {
+         uint32_t(_M_l), _M_l >> 32,
+         uint32_t(_M_h), _M_h >> 32,
+         uint32_t(__x._M_l), __x._M_l >> 32,
+         uint32_t(__x._M_h), __x._M_h >> 32,
+         0, 0,
+         0, 0 };
+       for (int __i = 0; __i < 4; ++__i)
+         {
+           uint64_t __c = 0;
+           for (int __j = __i; __j < 4; ++__j)
+             {
+               __c += __a[__i] * __a[4 + __j - __i] + __a[8 + __j];
+               __a[8 + __j] = uint32_t(__c);
+               __c >>= 32;
+             }
+         }
+       _M_l = __a[8] + (__a[9] << 32);
+       _M_h = __a[10] + (__a[11] << 32);
+       return *this;
+      }
+    };
+
+    constexpr __rng_uint128
+    operator+(const __rng_uint128& __x, const __rng_uint128& __y) noexcept
+    {
+      return __rng_uint128(__x._M_h + __y._M_h
+                          + (__x._M_l + __y._M_l < __x._M_l),
+                          __x._M_l + __y._M_l);
+    }
+
+    constexpr __rng_uint128
+    operator<<(const __rng_uint128& __x, size_t __y) noexcept
+    {
+      return (__y >= 64
+             ? __rng_uint128(__x._M_l << (__y - 64), 0)
+             : __y
+             ? __rng_uint128((__x._M_h << __y) | (__x._M_l >> (64 - __y)),
+                             __x._M_l << __y)
+             : __x);
+    }
+
+    constexpr __rng_uint128
+    operator>>(const __rng_uint128& __x, size_t __y) noexcept
+    {
+      return (__y >= 64
+             ? __rng_uint128(0, __x._M_h >> (__y - 64))
+             : __y
+             ? __rng_uint128(__x._M_h >> __y,
+                             (__x._M_l >> __y) | (__x._M_h << (64 - __y)))
+             : __x);
+    }
+
+    constexpr __rng_uint128
+    operator|(const __rng_uint128& __x, const __rng_uint128& __y) noexcept
+    {
+      return __rng_uint128(__x._M_h | __y._M_h, __x._M_l | __y._M_l);
+    }
+
+    __rng_uint128
+    operator*(const __rng_uint128& __x, const __rng_uint128& __y) noexcept
+    {
+      __rng_uint128 __ret(__x);
+      __ret *= __y;
+      return __ret;
+    }
+
+    __rng_uint128
+    operator/(const __rng_uint128& __x, const __rng_uint128& __y) noexcept
+    {
+      uint64_t __l, __h;
+      if (!__x._M_h)
+       {
+         if (!__y._M_h)
+           __l = __x._M_l / __y._M_l;
+         else
+           __l = 0;
+         __h = 0;
+       }
+      else
+       {
+         uint64_t __a[13] = {
+           uint32_t(__x._M_l), __x._M_l >> 32,
+           uint32_t(__x._M_h), __x._M_h >> 32,
+           0,
+           uint32_t(__y._M_l), __y._M_l >> 32,
+           uint32_t(__y._M_h), __y._M_h >> 32,
+           0, 0,
+           0, 0 };
+         uint64_t __c = 0, __w = 0;
+         if (!__y._M_h && __y._M_l <= ~uint32_t(0))
+           for (int __i = 3; ; --__i)
+             {
+               __w = __a[__i] + (__c << 32);
+               __a[9 + __i] = __w / __y._M_l;
+               if (__i == 0)
+                 break;
+               __c = __w % __y._M_l;
+             }
+         else
+           {
+             // See Donald E. Knuth's "Seminumerical Algorithms".
+             int __n = 0, __d = 0;
+             uint64_t __q = 0, __s = 0;
+             for (__d = 3; __a[5 + __d] == 0; --__d)
+               ;
+             __s = (uint64_t(1) << 32) / (__a[5 + __d] + 1);
+             if (__s > 1)
+               {
+                 for (int __i = 0; __i <= 3; ++__i)
+                   {
+                     __w = __a[__i] * __s + __c;
+                     __a[__i] = uint32_t(__w);
+                     __c = __w >> 32;
+                   }
+                 __a[4] = __c;
+                 __c = 0;
+                 for (int __i = 0; __i <= 3; ++__i)
+                   {
+                     __w = __a[5 + __i] * __s + __c;
+                     __a[5 + __i] = uint32_t(__w);
+                     __c = __w >> 32;
+                     if (__a[5 + __i])
+                       __d = __i;
+                   }
+               }
+             __n = 4;
+             for (int __i = __n - __d - 1; __i >= 0; --__i)
+               {
+                 __n = __i + __d + 1;
+                 __w = (__a[__n] << 32) + __a[__n - 1];
+                 if (__a[__n] != __a[5 + __d])
+                   __q = __w / __a[5 + __d];
+                 else
+                   __q = ~uint32_t(0);
+                 uint64_t __t = __w - __q * __a[5 + __d];
+                 if (__t <= ~uint32_t(0)
+                     && __a[4 + __d] * __q > (__t << 32) + __a[__n - 2])
+                   --__q;
+                 __c = 0;
+                 for (int __j = 0; __j <= __d; ++__j)
+                   {
+                     __w = __q * __a[5 + __j] + __c;
+                     __c = __w >> 32;
+                     __w = __a[__i + __j] - uint32_t(__w);
+                     __a[__i + __j] = uint32_t(__w);
+                     __c += (__w >> 32) != 0;
+                   }
+                 if (int64_t(__a[__n]) < int64_t(__c))
+                   {
+                     --__q;
+                     __c = 0;
+                     for (int __j = 0; __j <= __d; ++__j)
+                       {
+                         __w = __a[__i + __j] + __a[5 + __j] + __c;
+                         __c = __w >> 32;
+                         __a[__i + __j] = uint32_t(__w);
+                       }
+                     __a[__n] += __c;
+                   }
+                 __a[9 + __i] = __q;
+               }
+           }
+         __l = __a[9] + (__a[10] << 32);
+         __h = __a[11] + (__a[12] << 32);
+       }
+      return __rng_uint128(__h, __l);
+    }
+
+    constexpr bool
+    operator<(const __rng_uint128 __x, const __rng_uint128 __y)
+    {
+      return (__x._M_h < __y._M_h
+             || (__x._M_h == __y._M_h && __x._M_l < __y._M_l));
+    }
+#endif  
+  }
+
   // This version is used when Urbg::max()-Urbg::min() is a power of
   // two less 1, the norm in real programs. It works by calling urng()
   // as many times as needed to fill the target mantissa, accumulating
@@ -3511,7 +3718,8 @@ namespace __detail
   // This implementation works with std::bfloat16, which can exactly
   // represent 2^32, but not with std::float16_t, limited to 2^15.
 
-  template<typename _RealT, typename _UInt, size_t __d, typename _Urbg>
+  template<typename _RealT, typename _UInt, size_t __d, int __log2_uint_max,
+          typename _Urbg>
     _RealT
     __generate_canonical_pow2(_Urbg& __urng)
     {
@@ -3521,10 +3729,8 @@ namespace __detail
       // r = 2;  // Redundant, we only support radix 2.
       using _Rng = decltype(_Urbg::max());
       const _Rng __rng_range_less_1 = _Urbg::max() - _Urbg::min();
-      const _UInt __uint_range_less_1 = ~_UInt(0);
       // R = _UInt(__rng_range_less_1) + 1;  // May wrap to 0.
       const auto __log2_R = __builtin_popcountg(__rng_range_less_1);
-      const auto __log2_uint_max = __builtin_popcountg(__uint_range_less_1);
       // rd = _UInt(1) << __d;  // Could overflow, UB.
       const unsigned __k = (__d + __log2_R - 1) / __log2_R;
       const unsigned __log2_Rk_max = __k * __log2_R;
@@ -3532,7 +3738,8 @@ namespace __detail
        __log2_uint_max < __log2_Rk_max ? __log2_uint_max : __log2_Rk_max;
       static_assert(__log2_Rk >= __d);
       // Rk = _UInt(1) << __log2_Rk;  // Likely overflows, UB.
-      constexpr _RealT __Rk = _RealT(_UInt(1) << (__log2_Rk - 1)) * 2.0;
+      constexpr _RealT __Rk
+       = _RealT(_UInt(1) << (__log2_Rk - 1)) * _RealT(2.0);
 #if defined(_GLIBCXX_GENERATE_CANONICAL_STRICT)
       const unsigned __log2_x = __log2_Rk - __d; // # of spare entropy bits.
 #else
@@ -3585,7 +3792,6 @@ namespace __detail
     _RealT
     __generate_canonical_any(_Urbg& __urng)
     {
-      static_assert(__d < __builtin_popcountg(~_UInt(0)));
       // Names below are chosen to match the description in the Standard.
       // Parameter d is the actual target number of bits.
       const _UInt __r = 2;
@@ -3593,7 +3799,7 @@ namespace __detail
       const _UInt __rd = _UInt(1) << __d;
       const unsigned __k = __gen_can_rng_calls_needed(__R, __rd);
       const _UInt __Rk = __gen_can_pow(__R, __k);
-      const _UInt __x =  __Rk/__rd;
+      const _UInt __x =  __Rk / __rd;
 
       while (true)
        {
@@ -3601,9 +3807,9 @@ namespace __detail
          for (int __i = __k - 1; __i > 0; --__i)
            {
              __Ri *= __R;
-             __sum += (__urng() - _Urbg::min()) * __Ri;
+             __sum = __sum + (__urng() - _Urbg::min()) * __Ri;
            }
-         const _RealT __ret = _RealT(__sum / __x) / __rd;
+         const _RealT __ret = _RealT(__sum / __x) / _RealT(__rd);
          if (__ret < _RealT(1.0))
            return __ret;
        }
@@ -3659,7 +3865,6 @@ _GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
        "float16_t type is not supported, consider using bfloat16_t");
 #endif
 
-      using _Rng = decltype(_Urbg::max());
       const unsigned __d_max = std::numeric_limits<_RealT>::digits;
       const unsigned __d = __digits > __d_max ? __d_max : __digits;
 
@@ -3668,20 +3873,15 @@ _GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
       if constexpr (__is_power_of_2_less_1(_Urbg::max() - _Urbg::min()))
        {
          if constexpr (__d <= 32)
-           return __generate_canonical_pow2<_RealT, uint32_t, __d>(__urng);
+           return __generate_canonical_pow2<_RealT, uint32_t, __d,
+                                            32>(__urng);
          else if constexpr (__d <= 64)
-           return __generate_canonical_pow2<_RealT, uint64_t, __d>(__urng);
+           return __generate_canonical_pow2<_RealT, uint64_t, __d,
+                                            64>(__urng);
          else
-           {
-#if defined(__SIZEOF_INT128__)
-             // Accommodate double double or float128.
-             return __extension__ __generate_canonical_pow2<
-               _RealT, unsigned __int128, __d>(__urng);
-#else
-             static_assert(false,
-               "float precision >64 requires __int128 support");
-#endif
-           }
+           // Accommodate double double or float128.
+           return __generate_canonical_pow2<_RealT, __detail::__rng_uint128,
+                                            __d, 128>(__urng);
        }
       else // Need up to 2x bits.
        {
@@ -3689,15 +3890,10 @@ _GLIBCXX_BEGIN_INLINE_ABI_NAMESPACE(_V2)
            return __generate_canonical_any<_RealT, uint64_t, __d>(__urng);
          else
            {
-#if defined(__SIZEOF_INT128__)
              static_assert(__d <= 64,
                "irregular RNG with float precision >64 is not supported");
-             return __extension__ __generate_canonical_any<
-               _RealT, unsigned __int128, __d>(__urng);
-#else
-             static_assert(false, "irregular RNG with float precision"
-                " >32 requires __int128 support");
-#endif
+             return __generate_canonical_any<_RealT, __detail::__rng_uint128,
+                                             __d>(__urng);
            }
        }
     }

        Jakub

[PATCH] libstdc++: Fix up std::generate_canonical for 32-bit arches

Reply via email to