Andreas, do you have any opinion on this change? Original thread: https://gcc.gnu.org/pipermail/gcc-patches/2025-November/702053.html More benchmarks: https://gcc.gnu.org/pipermail/gcc-patches/2025-November/702061.html The semaphore regressions were fixed with: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122878
On Wed, 26 Nov 2025 at 15:44, Jonathan Wakely <[email protected]> wrote: > > This defines __platform_wait, __platform_notify, and > __platform_wait_until for FreeBSD, making use of the _umtx_op syscall. > > The Linux versions of those functions only support 32-bit integers, but > the FreeBSD versions use the syscall for both 32-bit and 64-bit types, > as the _umtx_op supports both. > > libstdc++-v3/ChangeLog: > > PR libstdc++/120527 > * include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t): > Define typedef. > [__FreeBSD__] (__platform_wait_uses_type): Define variable > template. > * src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT) > (__platform_wait, __platform_notify, __platform_wait_until): > Define. > --- > > Tested x86_64-linux and x86_64-freebsd14. > > v3: > - new wait_op function to determine the OP_WAIT constant from the obj_sz > parameter. > > I wrote a simple benchmark that calls notify in a loop or calls wait(1) > in a loop (without blocking because the value isn't equal to 1), for > 16/32/64-bit integers. 
> > Before this change (so that all types use a proxy wait and the proxy > wait is done using a condition_variable): > > ------------------------------------------------------- > Benchmark Time CPU Iterations > ------------------------------------------------------- > BM_notify_16 28.8 ns 28.6 ns 24470204 > BM_notify_32 28.7 ns 28.6 ns 24460713 > BM_notify_64 28.8 ns 28.6 ns 24478476 > BM_wait_16 5.13 ns 5.11 ns 136500107 > BM_wait_32 5.12 ns 5.10 ns 136588487 > BM_wait_64 5.12 ns 5.10 ns 136646629 > > After this change, so that 32-bit and 64-bit types don't use proxy > waits, and the proxy wait (for the 16-bit type) uses the _umtx_op > syscall on _M_wait_state->_M_ver instead of a condition_variable: > > ------------------------------------------------------- > Benchmark Time CPU Iterations > ------------------------------------------------------- > BM_notify_16 6.26 ns 6.23 ns 111360766 > BM_notify_32 2.87 ns 2.86 ns 243363965 > BM_notify_64 2.90 ns 2.89 ns 242195074 > BM_wait_16 4.92 ns 4.90 ns 137517425 > BM_wait_32 1.34 ns 1.33 ns 518316605 > BM_wait_64 1.34 ns 1.33 ns 517155683 > > So all notify calls are faster now, and a no-op wait is faster for > non-proxy types (which is because we don't need a PLT call to > _M_setup_proxy_wait, it's just inline in the headers). > > For a wait that actually needs to block I would expect the times to also > improve due to not using a condvar, but that's harder to benchmark > because it would need a second thread that's modifying the atomic and > doing a notify_one() call, so that the wait(n) call doesn't block > forever. > > I tried to check a real wait that actually waits by using a > std::counting_semaphore and calling try_acquire_for(1ns), so that we > block for 1ns in a loop and then timeout. 
That shows a huge regression > in performance when waiting on an unavailable semaphore: > > Before: > BM_counting_sema 413 ns 411 ns 1701349 > > After: > BM_counting_sema 8198 ns 3616 ns 193253 > > I don't know exactly where that cost comes from. > > For an uncontended semaphore where try_acquire doesn't need to block, > there's no difference in performance before and after this patch (as > expected). > > libstdc++-v3/include/bits/atomic_wait.h | 11 +++++ > libstdc++-v3/src/c++20/atomic.cc | 54 ++++++++++++++++++++++++- > 2 files changed, 64 insertions(+), 1 deletion(-) > > diff --git a/libstdc++-v3/include/bits/atomic_wait.h > b/libstdc++-v3/include/bits/atomic_wait.h > index 6d8c0de4af68..fcdd83c89fef 100644 > --- a/libstdc++-v3/include/bits/atomic_wait.h > +++ b/libstdc++-v3/include/bits/atomic_wait.h > @@ -69,6 +69,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > inline constexpr bool __platform_wait_uses_type > = __detail::__waitable<_Tp> > && sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4; > +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8 > + namespace __detail > + { > + using __platform_wait_t = __UINT64_TYPE__; > + inline constexpr size_t __platform_wait_alignment = 8; > + } > + template<typename _Tp> > + inline constexpr bool __platform_wait_uses_type > + = __detail::__waitable<_Tp> > + && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4) > + || (sizeof(_Tp) == 8 && alignof(_Tp) >= 8)); > #else > // define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait() > // and __platform_notify() if there is a more efficient primitive supported > diff --git a/libstdc++-v3/src/c++20/atomic.cc > b/libstdc++-v3/src/c++20/atomic.cc > index 80915617f0bf..05260a60392c 100644 > --- a/libstdc++-v3/src/c++20/atomic.cc > +++ b/libstdc++-v3/src/c++20/atomic.cc > @@ -27,7 +27,7 @@ > #if __glibcxx_atomic_wait > #include <atomic> > #include <bits/atomic_timed_wait.h> > -#include <cstdint> // uint32_t, uint64_t > +#include <cstdint> // uint32_t, uint64_t, uintptr_t > #include <climits> 
// INT_MAX > #include <cerrno> // errno, ETIMEDOUT, etc. > #include <bits/std_mutex.h> // std::mutex, std::__condvar > @@ -39,6 +39,11 @@ > # include <syscall.h> > # include <sys/time.h> // timespec > # define _GLIBCXX_HAVE_PLATFORM_WAIT 1 > +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8 > +# include <sys/types.h> > +# include <sys/umtx.h> > +# include <sys/time.h> > +# define _GLIBCXX_HAVE_PLATFORM_WAIT 1 > #endif > > #pragma GCC diagnostic ignored "-Wmissing-field-initializers" > @@ -108,6 +113,53 @@ namespace > } > return true; > } > +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8 > + [[gnu::always_inline]] > + inline int > + wait_op(int obj_sz) noexcept > + { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT; } > + > + void > + __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept > + { > + if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val, > + nullptr, nullptr)) > + if (errno != EINTR) > + __throw_system_error(errno); > + } > + > + void > + __platform_notify(const void* addr, bool all, int /* obj_sz */) noexcept > + { > + const int count = all ? 
INT_MAX : 1; > + _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr, nullptr); > + } > + > + // returns true if wait ended before timeout > + bool > + __platform_wait_until(const void* addr, uint64_t val, > + const __wait_clock_t::time_point& atime, > + int obj_sz) noexcept > + { > + struct _umtx_time timeout = { > + ._timeout = chrono::__to_timeout_timespec(atime), > + ._flags = UMTX_ABSTIME, > + ._clockid = CLOCK_MONOTONIC > + }; > + // _umtx_op hangs if timeout._timeout is {0, 0} > + if (atime.time_since_epoch() < chrono::nanoseconds(1)) > + return false; > + constexpr uintptr_t timeout_sz = sizeof(timeout); > + if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val, > + (void*)timeout_sz, &timeout)) > + { > + if (errno == ETIMEDOUT) > + return false; > + if (errno != EINTR) > + __throw_system_error(errno); > + } > + return true; > + } > #endif // HAVE_LINUX_FUTEX > > // The state used by atomic waiting and notifying functions. > -- > 2.51.1 >
