On Wed, Nov 26, 2025 at 5:10 PM Tomasz Kaminski <[email protected]> wrote:

>
>
> On Wed, Nov 26, 2025 at 4:45 PM Jonathan Wakely <[email protected]>
> wrote:
>
>> This defines __platform_wait, __platform_notify, and
>> __platform_wait_until for FreeBSD, making use of the _umtx_op syscall.
>>
>> The Linux versions of those functions only support 32-bit integers, but
>> the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
>> as the _umtx_op supports both.
>>
>> libstdc++-v3/ChangeLog:
>>
>>         PR libstdc++/120527
>>         * include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
>>         Define typedef.
>>         [__FreeBSD__] (__platform_wait_uses_type): Define variable
>>         template.
>>         * src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
>>         (__platform_wait, __platform_notify, __platform_wait_until):
>>         Define.
>> ---
>>
>> Tested x86_64-linux and x86_64-freebsd14.
>>
>> v3:
>> - new wait_op function to determine the OP_WAIT constant from the obj_sz
>>   parameter.
>>
> LGTM, thanks for including a benchmark.
>
I have also checked the later (3+) patches; they also look good to me.
But, as you mentioned, they are not intended to be committed.

>
>> I wrote a simple benchmark that calls notify in a loop or calls wait(1)
>> in a loop (without blocking because the value isn't equal to 1), for
>> 16/32/64-bit integers.
>>
>> Before this change (so that all types use a proxy wait and the proxy
>> wait is done using a condition_variable):
>>
>> -------------------------------------------------------
>> Benchmark             Time             CPU   Iterations
>> -------------------------------------------------------
>> BM_notify_16       28.8 ns         28.6 ns     24470204
>> BM_notify_32       28.7 ns         28.6 ns     24460713
>> BM_notify_64       28.8 ns         28.6 ns     24478476
>> BM_wait_16         5.13 ns         5.11 ns    136500107
>> BM_wait_32         5.12 ns         5.10 ns    136588487
>> BM_wait_64         5.12 ns         5.10 ns    136646629
>>
>> After this change, so that 32-bit and 64-bit types don't use proxy
>> waits, and the proxy wait (for the 16-bit type) uses the _umtx_op
>> syscall on _M_wait_state->_M_ver instead of a condition_variable:
>>
>> -------------------------------------------------------
>> Benchmark             Time             CPU   Iterations
>> -------------------------------------------------------
>> BM_notify_16       6.26 ns         6.23 ns    111360766
>> BM_notify_32       2.87 ns         2.86 ns    243363965
>> BM_notify_64       2.90 ns         2.89 ns    242195074
>> BM_wait_16         4.92 ns         4.90 ns    137517425
>> BM_wait_32         1.34 ns         1.33 ns    518316605
>> BM_wait_64         1.34 ns         1.33 ns    517155683
>>
>> So all notify calls are faster now, and a no-op wait is faster for
>> non-proxy types (which is because we don't need a PLT call to
>> _M_setup_proxy_wait, it's just inline in the headers).
>>
>> For a wait that actually needs to block I would expect the times to also
>> improve due to not using a condvar, but that's harder to benchmark
>> because it would need a second thread that's modifying the atomic and
>> doing a notify_one() call, so that the wait(n) call doesn't block
>> forever.
>>
>> I tried to check a real wait that actually waits by using a
>> std::counting_semaphore and calling try_acquire_for(1ns), so that we
>> block for 1ns in a loop and then time out. That shows a huge regression
>> in performance when waiting on an unavailable semaphore:
>>
>> Before:
>> BM_counting_sema        413 ns          411 ns      1701349
>>
>> After:
>> BM_counting_sema       8198 ns         3616 ns       193253
>>
>> I don't know exactly where that cost comes from.
>>
>> For an uncontended semaphore where try_acquire doesn't need to block,
>> there's no difference in performance before and after this patch (as
>> expected).
>>
>>  libstdc++-v3/include/bits/atomic_wait.h | 11 +++++
>>  libstdc++-v3/src/c++20/atomic.cc        | 54 ++++++++++++++++++++++++-
>>  2 files changed, 64 insertions(+), 1 deletion(-)
>>
>> diff --git a/libstdc++-v3/include/bits/atomic_wait.h
>> b/libstdc++-v3/include/bits/atomic_wait.h
>> index 6d8c0de4af68..fcdd83c89fef 100644
>> --- a/libstdc++-v3/include/bits/atomic_wait.h
>> +++ b/libstdc++-v3/include/bits/atomic_wait.h
>> @@ -69,6 +69,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>      inline constexpr bool __platform_wait_uses_type
>>        = __detail::__waitable<_Tp>
>>           && sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4;
>> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
>> +  namespace __detail
>> +  {
>> +    using __platform_wait_t = __UINT64_TYPE__;
>> +    inline constexpr size_t __platform_wait_alignment = 8;
>> +  }
>> +  template<typename _Tp>
>> +    inline constexpr bool __platform_wait_uses_type
>> +      = __detail::__waitable<_Tp>
>> +         && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4)
>> +               || (sizeof(_Tp) == 8 && alignof(_Tp) >= 8));
>>  #else
>>  // define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait()
>>  // and __platform_notify() if there is a more efficient primitive
>> supported
>> diff --git a/libstdc++-v3/src/c++20/atomic.cc
>> b/libstdc++-v3/src/c++20/atomic.cc
>> index 80915617f0bf..05260a60392c 100644
>> --- a/libstdc++-v3/src/c++20/atomic.cc
>> +++ b/libstdc++-v3/src/c++20/atomic.cc
>> @@ -27,7 +27,7 @@
>>  #if __glibcxx_atomic_wait
>>  #include <atomic>
>>  #include <bits/atomic_timed_wait.h>
>> -#include <cstdint> // uint32_t, uint64_t
>> +#include <cstdint> // uint32_t, uint64_t, uintptr_t
>>  #include <climits> // INT_MAX
>>  #include <cerrno>  // errno, ETIMEDOUT, etc.
>>  #include <bits/std_mutex.h>  // std::mutex, std::__condvar
>> @@ -39,6 +39,11 @@
>>  # include <syscall.h>
>>  # include <sys/time.h> // timespec
>>  # define _GLIBCXX_HAVE_PLATFORM_WAIT 1
>> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
>> +# include <sys/types.h>
>> +# include <sys/umtx.h>
>> +# include <sys/time.h>
>> +# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
>>  #endif
>>
>>  #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
>> @@ -108,6 +113,53 @@ namespace
>>        }
>>      return true;
>>    }
>> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
>> +  [[gnu::always_inline]]
>> +  inline int
>> +  wait_op(int obj_sz) noexcept
>> +  { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT :
>> UMTX_OP_WAIT; }
>> +
>> +  void
>> +  __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept
>> +  {
>> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
>> +                nullptr, nullptr))
>> +      if (errno != EINTR)
>> +       __throw_system_error(errno);
>> +  }
>> +
>> +  void
>> +  __platform_notify(const void* addr, bool all, int /* obj_sz */)
>> noexcept
>> +  {
>> +    const int count = all ? INT_MAX : 1;
>> +    _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr,
>> nullptr);
>> +  }
>> +
>> +  // returns true if wait ended before timeout
>> +  bool
>> +  __platform_wait_until(const void* addr, uint64_t val,
>> +                       const __wait_clock_t::time_point& atime,
>> +                       int obj_sz) noexcept
>> +  {
>> +    struct _umtx_time timeout = {
>> +      ._timeout = chrono::__to_timeout_timespec(atime),
>> +      ._flags = UMTX_ABSTIME,
>> +      ._clockid = CLOCK_MONOTONIC
>> +    };
>> +    // _umtx_op hangs if timeout._timeout is {0, 0}
>> +    if (atime.time_since_epoch() < chrono::nanoseconds(1))
>> +      return false;
>> +    constexpr uintptr_t timeout_sz = sizeof(timeout);
>> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
>> +                (void*)timeout_sz, &timeout))
>> +      {
>> +       if (errno == ETIMEDOUT)
>> +         return false;
>> +       if (errno != EINTR)
>> +         __throw_system_error(errno);
>> +      }
>> +    return true;
>> +  }
>>  #endif // HAVE_LINUX_FUTEX
>>
>>    // The state used by atomic waiting and notifying functions.
>> --
>> 2.51.1
>>
>>

Reply via email to