Changes in v11:
 * Add doxygen entry for generate_canonical.

Changes in v10:
 * Rewrite entirely after consultation with P0952 authors.
 * Require radix2 for all float types.
 * Perform all intermediate calculations on integers.
 * Use one implementation for all C++11 to C++26.
 * Optimize for each digits resolution.
 * Test also with irregular URBG returning 0..999999.
 * Work under Clang, for -std=c++14 and up.
 * By default and where possible (which is usually), preserve more
   entropy than the naive Standard prescription.

Changes in v9:
 * Split out implementations from public pre-26 and 26 APIs.
 * Add remaining support for non-radix2 float types in C++26.
 * Handle pow() and floor() compatibly with non-built-in types, and Clang.
 * Add testing for a non-radix2 float type.
 * Iron out gratuitous differences between radix2 and p0952 impls.

Changes in v8:
 * Apply LWG 2524 resolution to C++11..C++23 implementations, too.
 * In those, eliminate std::log called twice per entry, and do all
  setup calculations at compile time.
 * "if constexpr" the C++26 version to call the slightly-faster
  older version when conditions permit: actually almost always.

Change in v7:
 * Fix local consteval floor() meant for Clang.

Changes in v6:
 * Implement also for Clang, using local consteval substitutes for its
  non-constexpr __builtin_pow and __builtin_floor.
 * Add a test for long double.

Changes in v5:
 * Static-assert movable RNG object correctly.
 * Add a more comprehensive test gencanon.cc

Changes in v4:
 * Static-assert arg is floating-point, coercible from bigger unsigned.
 * Static-assert arg satisfies uniform_random_bit_generator, movable.
 * Include uniform_int_dist.h for concept uniform_random_bit_generator
 * Coerce floating consts from unsigned literals, matching other usage.
 * Remove redundant type decorations static and -> .
 * Use __builtin_floor for broader applicability than std::floor.
 * Use __builtin_pow in place of immediate lambda.
 * Rename, rearrange local objects for clarity.
 * Add redundant "if constexpr (k > 1)" for clarity.
 * #if-guard new impl against Clang, which lacks constexpr floor() etc.

Changes in v3:
 * Implement exactly as specified in the WP, for C++26 and up.

Implement P0952R2 "A new specification for std::generate_canonical".

It has us start over, using new entropy, if the naïve generation of
a value in 0..1 comes up not-less-than 1, which occurs too rarely
for the change to measurably affect performance.

The fix is extended to all of C++11 to 26. It dispatches to variations
optimized for each bit depth, using minimal integer sizes for exact
intermediate calculations, and prefers shift operations over
multiply/divide where practical.

This patch adds thorough tests for statistical properties. It adds new
static asserts on template argument requirements where supported.
It adds a test using a non-optimal RNG that yields values 0..999999.

A test for the case addressed in P0952 already appears in 64351.cc.
This change amounts to a different resolution for PR64351 and LWG 2524.

libstdc++-v3/ChangeLog:
        PR libstdc++/119739
        * include/bits/random.tcc: Add generate_canonical impl for C++26.
        * include/std/random: Include uniform_int_dist.h for concept.
        * 
testsuite/26_numerics/random/uniform_real_distribution/operators/64351.cc:
        Adapt test for both pre- and post- C++26.
        * 
testsuite/26_numerics/random/uniform_real_distribution/operators/gencanon.cc:
        Test for float and double from 32-bit RNG, including 1.0 do-over.
---
 libstdc++-v3/include/bits/random.tcc          | 243 +++++++++++++++---
 libstdc++-v3/include/std/random               |   1 +
 .../operators/64351.cc                        |  25 +-
 .../operators/gencanon.cc                     | 165 ++++++++++++
 4 files changed, 382 insertions(+), 52 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/gencanon.cc

diff --git a/libstdc++-v3/include/bits/random.tcc 
b/libstdc++-v3/include/bits/random.tcc
index 53ccacb2e38..ff2577e1141 100644
--- a/libstdc++-v3/include/bits/random.tcc
+++ b/libstdc++-v3/include/bits/random.tcc
@@ -31,6 +31,7 @@
 #define _RANDOM_TCC 1
 
 #include <numeric> // std::accumulate and std::partial_sum
+#include <cstdint>
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -3349,43 +3350,221 @@ namespace __detail
        }
     }
 
-  template<typename _RealType, size_t __bits,
-          typename _UniformRandomNumberGenerator>
-    _RealType
-    generate_canonical(_UniformRandomNumberGenerator& __urng)
-    {
-      static_assert(std::is_floating_point<_RealType>::value,
-                   "template argument must be a floating point type");
-
-      const size_t __b
-       = std::min(static_cast<size_t>(std::numeric_limits<_RealType>::digits),
-                   __bits);
-      const long double __r = static_cast<long double>(__urng.max())
-                           - static_cast<long double>(__urng.min()) + 1.0L;
-      const size_t __log2r = std::log(__r) / std::log(2.0L);
-      const size_t __m = std::max<size_t>(1UL,
-                                         (__b + __log2r - 1UL) / __log2r);
-      _RealType __ret;
-      _RealType __sum = _RealType(0);
-      _RealType __tmp = _RealType(1);
-      for (size_t __k = __m; __k != 0; --__k)
-       {
-         __sum += _RealType(__urng() - __urng.min()) * __tmp;
-         __tmp *= __r;
-       }
-      __ret = __sum / __tmp;
-      if (__builtin_expect(__ret >= _RealType(1), 0))
-       {
-#if _GLIBCXX_USE_C99_MATH_FUNCS
-         __ret = std::nextafter(_RealType(1), _RealType(0));
+// [rand.util.canonical]
+// generate_canonical(RNG&)
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wc++17-extensions"  // if constexpr
+
+  constexpr unsigned
+  __gen_con_popcount(unsigned i) { return __builtin_popcount(i); }
+  constexpr unsigned
+  __gen_con_popcount(unsigned long i) { return __builtin_popcountl(i); }
+  constexpr unsigned
+  __gen_con_popcount(unsigned long long i) { return __builtin_popcountll(i); }
+  constexpr unsigned
+  __gen_con_popcount(unsigned __int128 i)
+    {
+      return __gen_con_popcount(uint64_t(i)) +
+       __gen_con_popcount(uint64_t(i >> 64));
+    }
+
+  // Call this version when _URBG::max()-_URBG::min()+1 is a power of
+  // two, the norm in real programs. It works by calling urng() as many
+  // times as needed to fill the target mantissa, accumulating entropy
+  // into an integer value, converting that to the float type, and then
+  // dividing by the range of the integer value (a power of 2, so just
+  // adjusts the exponent) to produce a result in [0..1]. In case of an
+  // exact 1.0 result, we will re-try.
+  //
+  // It needs to work when the integer type used is only as big as the
+  // float mantissa, such as uint64_t for long double. So, commented-out
+  // lines below represent computations the Standard prescribes, but
+  // cannot be performed as described, or are not used.
+  //
+  // When the result is close to zero, the naive calculation would leave
+  // low-order zeroes in the mantissa; where spare entropy has been
+  // extracted, as is usual for float and double, some or all of the
+  // spare entropy can be pulled into the result for better randomness.
+  //
+  // When two or more calls to urng() yield more bits of entropy than fit
+  // into _Int, we end up discarding some of it by overflowing, which is
+  // OK. When converting the integer representation of the sample to the
+  // float value, we must divide out the truncated size.
+
+  template<typename _RealT, typename _Int, size_t __d, typename _URBG>
+    _RealT
+    __generate_canonical_ok_rng(_URBG& __urng)
+    {
+      // Names below are chosen to match the description in the Standard.
+      // Parameter __d is the actual target number of digits.
+      // const _Int __r = 2;
+      const auto __rng_range = _URBG::max() - _URBG::min();
+      const auto __int_range = ~_Int(0);
+      // const _Int __R = _Int(__rng_range) + 1;  // may wrap to 0
+      const auto __log2_R = __gen_con_popcount(__rng_range);
+      const auto __log2_int_max = __gen_con_popcount(__int_range);
+      // const _Int  __rd = _Int(1) << __d;  // could overflow, UB
+      const unsigned __k = (__d + __log2_R - 1) / __log2_R;
+      const unsigned __log2_Rk_max = __k * __log2_R;
+      const unsigned __log2_Rk =  // bits of entropy actually obtained
+       __log2_int_max < __log2_Rk_max ? __log2_int_max : __log2_Rk_max;
+      static_assert(__log2_Rk >= __d);
+      // const _Int __Rk = _Int(1) << __log2_Rk;  // likely overflows, UB
+      constexpr _RealT __Rk = _RealT(_Int(1) << (__log2_Rk - 1)) * 2.0;
+#if defined(__STRICT_ANSI__)
+      const unsigned __log2_x = __log2_Rk - __d; // # of spare entropy bits.
+      const _Int __x = _Int(1) << __log2_x;
+      constexpr _RealT __rd = __Rk / __x;
+#endif
+      // const _Int __xrd = __x << __d;  // could overflow.
+
+      while (true)
+      {
+       _Int __sum;
+       unsigned __shift = 0;
+       __sum = _Int(__urng() - _URBG::min());
+       for (int __i = __k - 1; __i > 0; --__i)
+         __shift += __log2_R,
+           __sum |= _Int(__urng() - _URBG::min()) << __shift;
+#if defined(_STRICT_ANSI__)
+       const _RealT __ret = _RealT(__sum >> __log2_x) / __rd;
+#else
+       const _RealT __ret = _RealT(__sum) / __Rk;
+#endif
+       if (__ret < _RealT(1.0)) return __ret;
+      }
+    }
+
+  template <typename _Int>
+    constexpr _Int
+    __gen_con_pow(_Int __base, size_t __exponent)
+    {
+      _Int __prod = __base;
+      while (--__exponent != 0) __prod *= __base;
+      return __prod;
+    }
+
+  template <typename _Int>
+    constexpr unsigned
+    __gen_con_rng_calls_needed(_Int __R, _Int __rd)
+    {
+      unsigned __i = 1;
+      for (auto __Ri = __R; __Ri < __rd; __Ri *= __R)
+       ++__i;
+      return __i;
+    }
+
+  // This version must be used when the range of possible RNG
+  // results, _URBG::max()-_URBG::min()+1, is not a power of two.
+  // The Int type passed must be big enough to represent Rk, R^k,
+  // a power of R (the range of values produced by the rng) up to
+  // twice the length of the mantissa.
+
+  template<typename _RealT, typename _Int, size_t __d, typename _URBG>
+    _RealT
+    __generate_canonical_ill_rng(_URBG& __urng)
+    {
+      // Names below are chosen to match the description in the Standard.
+      // Parameter __d is the actual target number of digits.
+      const _Int __r = 2;
+      const _Int __R = _Int(_URBG::max() - _URBG::min()) + 1;
+      const _Int __rd = __gen_con_pow(__r, __d);
+      const unsigned __k = __gen_con_rng_calls_needed(__R, __rd);
+      const _Int __Rk = __gen_con_pow(__R, __k);
+      const _Int __x =  __Rk/__rd;
+
+      while (true)
+      {
+       _Int __Ri = 1, __sum = __urng() - _URBG::min();
+       for (int __i = __k - 1; __i > 0; --__i)
+         __Ri *= __R, __sum += (__urng() - _URBG::min()) * __Ri;
+       const _RealT __ret = _RealT(__sum / __x) / __rd;
+       if (__ret < _RealT(1.0)) return __ret;
+      }
+    }
+
+#if !defined(_STRICT_ANSI__)
+  template <typename _Tp> struct __gen_con_is_float
+    { static const bool __value = is_floating_point<_Tp>::value; };
 #else
-         __ret = _RealType(1)
-           - std::numeric_limits<_RealType>::epsilon() / _RealType(2);
+  template <typename _Tp> struct __gen_con_is_float
+    { static const bool __value = false; };
+  template <> struct __gen_con_is_float<float>
+    { static const bool __value = true; };
+  template <> struct __gen_con_is_float<double>
+    { static const bool __value = true; };
+  template <> struct __gen_con_is_float<long double>
+    { static const bool __value = true; };
 #endif
+
+  /** Produce a random floating-point value in the range [0..1)
+   *
+   * The result of `std::generate_canonical<RealT,digits>(rng)` is a
+   * random floating-point value of type `RealT` in the range [0..1),
+   * using entropy provided by the uniform random bit generator `rng`.
+   * A non-negative value for `digits` may be used to limit the
+   * precision of the result to so many bits, but commonly -1 is
+   * passed to get the native precision of `RealT`. As many `rng()`
+   * calls are made as needed to obtain the required entropy. On rare
+   * occasions one or more additional `rng()` calls are used. It is
+   * fastest when the value of `rng.max()` is a power of two less one,
+   * such as from a `std::mt19937` (for `float`) or `mt19937_64 (for
+   * `double`).
+   *
+   *  @since C++11
+   */
+  template<typename _RealT, size_t __digits, typename _URBG>
+    _RealT
+    generate_canonical(_URBG& __urng)
+    {
+#if __cpp_lib_concepts >= 201907
+      static_assert(requires { [](uniform_random_bit_generator auto) {}(
+           static_cast<std::remove_cv_t<_URBG>&&>(__urng)); },
+       "argument must meet uniform_random_bit_generator requirements");
+#endif
+#if __cpp_lib_concepts >= 201806
+      static_assert(requires { _RealT(_URBG::max()); },
+       "template float argument must be coercible from unsigned");
+#endif
+      static_assert(__gen_con_is_float<_RealT>::__value,
+       "template argument must be floating point");
+      static_assert(__digits != 0 && _URBG::max() > _URBG::min(),
+       "random samples with 0 digits are not meaningful");
+      static_assert(std::numeric_limits<_RealT>::radix == 2,
+       "only base-2 floats are supported");
+
+      const unsigned __d_max = std::numeric_limits<_RealT>::digits;
+      const unsigned __d = __digits > __d_max ? __d_max : __digits;
+      const auto __range = _URBG::max() - _URBG::min();
+
+      // If the RNG's range + 1 is a power of 2, the float type's mantissa
+      // is enough bits. If not, you need more.
+      if constexpr (((__range + 1) & __range) == 0)
+      //   (Note, the range check works even when (__range + 1) overflows.)
+      {
+       if constexpr (__d <= 32)
+         return __generate_canonical_ok_rng<_RealT, uint32_t, __d>(__urng);
+       else // accommodate double double or float128
+         return __generate_canonical_ok_rng<
+           _RealT, unsigned __int128, __d>(__urng);
+      }
+      else // need 2x bits
+      {
+       if constexpr (__d <= 32)
+         return __generate_canonical_ill_rng<_RealT, uint64_t, __d>(__urng);
+       else
+       {
+         static_assert(__d <= 64,
+           "irregular RNG with float precision > 64 is not supported");
+         return __generate_canonical_ill_rng<
+           _RealT, unsigned __int128, __d>(__urng);
        }
-      return __ret;
+      }
     }
 
+#pragma GCC diagnostic pop
+
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
 
diff --git a/libstdc++-v3/include/std/random b/libstdc++-v3/include/std/random
index 0e058a04bd9..86d4f0a8380 100644
--- a/libstdc++-v3/include/std/random
+++ b/libstdc++-v3/include/std/random
@@ -49,6 +49,7 @@
 #include <type_traits>
 #include <bits/random.h>
 #include <bits/opt_random.h>
+#include <bits/uniform_int_dist.h>
 #include <bits/random.tcc>
 
 #endif // C++11
diff --git 
a/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/64351.cc
 
b/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/64351.cc
index 3c0cb8bbd49..a5c8302f7b3 100644
--- 
a/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/64351.cc
+++ 
b/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/64351.cc
@@ -21,21 +21,6 @@
 #include <random>
 #include <testsuite_hooks.h>
 
-// libstdc++/64351
-void
-test01()
-{
-  std::mt19937 rng(8890);
-  std::uniform_real_distribution<float> dist;
-
-  rng.discard(30e6);
-  for (long i = 0; i < 10e6; ++i)
-    {
-      auto n = dist(rng);
-      VERIFY( n != 1.f );
-    }
-}
-
 // libstdc++/63176
 void
 test02()
@@ -45,21 +30,21 @@ test02()
   rng.seed(sequence);
   rng.discard(12 * 629143);
   std::mt19937 rng2{rng};
+  int mismatch = 0;
   for (int i = 0; i < 20; ++i)
   {
-    float n =
-      std::generate_canonical<float, std::numeric_limits<float>::digits>(rng);
+    float n = std::generate_canonical<float, 100>(rng);
     VERIFY( n != 1.f );
 
-    // PR libstdc++/80137
     rng2.discard(1);
-    VERIFY( rng == rng2 );
   }
+  // PR libstdc++/80137
+  rng2.discard(1);  // account for a 1.0 generated and discarded.
+  VERIFY( rng == rng2 );
 }
 
 int
 main()
 {
-  test01();
   test02();
 }
diff --git 
a/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/gencanon.cc
 
b/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/gencanon.cc
new file mode 100644
index 00000000000..041a4466c90
--- /dev/null
+++ 
b/libstdc++-v3/testsuite/26_numerics/random/uniform_real_distribution/operators/gencanon.cc
@@ -0,0 +1,165 @@
+// { dg-do run { target { c++11 && { ! simulator } } } }
+
+#include <random>
+#include <limits>
+#include <type_traits>
+#include <cmath>
+#include <testsuite_hooks.h>
+#include <array>
+
+struct local_rng : std::mt19937
+{
+  static constexpr std::uint64_t min() { return 0; }
+  static constexpr std::uint64_t max() { return 999999; }
+  std::uint64_t operator()()
+    { return static_cast<std::mt19937&>(*this)() % (max() + 1); }
+  local_rng(std::mt19937 const& arg) : std::mt19937(arg) {}
+};
+
+// Verify P0952R9 implementation requiring a second round-trip
+// if first yields exactly 1. In this test, the RNG delivering
+// 32 bits per call is seeded such that this occurs once on the
+// sixth iteration for float, and not at all for double.
+// However, each double iteration requires two calls to the RNG.
+
+template <typename T, typename RNG>
+void
+test01(RNG& rng, RNG& rng2,
+  int& deviation, int& max, int& rms, int& zero, int& skips)
+{
+  const auto size = 1000000, buckets = 100;
+  std::array<int, buckets> histo{};
+  for (auto i = 0; i != size; ++i) {
+    T sample = std::generate_canonical<T, 100>(rng);
+    VERIFY(sample >= T(0.0));
+    VERIFY(sample < T(1.0));  // libstdc++/64351
+    if (sample == T(0.0)) {
+      ++zero;
+    }
+    auto bucket = static_cast<int>(std::floor(sample * buckets));
+    ++histo[bucket];
+    rng2.discard(1);
+    if (rng != rng2) {
+      ++skips;
+      rng2.discard(1);
+      VERIFY(rng == rng2);
+    }
+  }
+  int devsquare = 0;
+  for (int i = 0; i < buckets; ++i) {
+    const auto expected = size / buckets;
+    auto count = histo[i];
+    auto diff = count - expected;
+    if (diff < 0) diff = -diff;
+    deviation += diff;
+    devsquare += diff * diff;
+    if (diff > max) max = diff;
+  }
+  rms = std::sqrt(devsquare);
+}
+
+// This one is for use with local_rng
+template <typename T, typename RNG>
+void
+test02(RNG& rng, RNG& rng2,
+  int& deviation, int& max, int& rms, int& zero, int& skips)
+{
+  const auto size = 1000000, buckets = 100;
+  std::array<int, buckets> histo{};
+  for (auto i = 0; i != size; ++i) {
+    T sample = std::generate_canonical<T, 100>(rng);
+    VERIFY(sample >= T(0.0));
+    VERIFY(sample < T(1.0));  // libstdc++/64351
+    if (sample == T(0.0)) {
+      ++zero;
+    }
+    auto bucket = static_cast<int>(std::floor(sample * buckets));
+    ++histo[bucket];
+    rng2.discard(2);
+    if (rng != rng2) {
+      ++skips;
+      rng2.discard(2);
+      VERIFY(rng == rng2);
+    }
+  }
+  int devsquare = 0;
+  for (int i = 0; i < buckets; ++i) {
+    const auto expected = size / buckets;
+    auto count = histo[i];
+    auto diff = count - expected;
+    if (diff < 0) diff = -diff;
+    deviation += diff;
+    devsquare += diff * diff;
+    if (diff > max) max = diff;
+  }
+  rms = std::sqrt(devsquare);
+}
+
+int
+main()
+{
+  std::mt19937 rng(8890);
+  std::seed_seq sequence{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  rng.seed(sequence);
+  rng.discard(12 * 629143);
+
+  { // float
+    int deviation{}, max{}, rms{}, zero{}, skips{};
+    auto rng2{rng};
+    auto rng3{rng};
+    test01<float>(rng2, rng3, deviation, max, rms, zero, skips);
+
+    if (std::numeric_limits<float>::is_iec559) {
+      VERIFY(deviation == 7032);
+      VERIFY(max == 276);
+      VERIFY(rms == 906);
+      VERIFY(zero == 0);
+    }
+    VERIFY(skips == 1);
+  }
+  { // double
+    int deviation{}, max{}, rms{}, zero{}, skips{};
+    auto rng2{rng};
+    auto rng3{rng};
+    test01<double>(rng2, rng3, deviation, max, rms, zero, skips);
+
+    if (std::numeric_limits<double>::is_iec559) {
+      VERIFY(deviation == 7650);
+      VERIFY(max == 259);
+      VERIFY(rms == 975);
+      VERIFY(zero == 0);
+    }
+    VERIFY(skips == 1000000);
+  }
+  { // long double, same answers as double
+    int deviation{}, max{}, rms{}, zero{}, skips{};
+    auto rng2{rng};
+    auto rng3{rng};
+    test01<long double>(rng2, rng3, deviation, max, rms, zero, skips);
+
+    if (std::numeric_limits<double>::is_iec559) {
+      VERIFY(deviation == 7650);
+      VERIFY(max == 259);
+      VERIFY(rms == 975);
+      VERIFY(zero == 0);
+    }
+    VERIFY(skips == 1000000);
+  }
+
+  { // local RNG, returns [0..1000000)
+    int deviation{}, max{}, rms{}, zero{}, skips{};
+    local_rng rng2{rng};
+    local_rng rng3{rng};
+    test02<float>(rng2, rng3, deviation, max, rms, zero, skips);
+
+    if (std::numeric_limits<float>::is_iec559)
+    {
+      VERIFY(deviation == 8146);
+      VERIFY(max == 250);
+      VERIFY(rms == 1021);
+      VERIFY(zero == 0);
+    }
+    VERIFY(skips == 18);
+  }
+  return 0;
+}
-- 
2.50.1

Reply via email to