This defines __platform_wait, __platform_notify, and
__platform_wait_until for FreeBSD, making use of the _umtx_op syscall.
The Linux versions of those functions only support 32-bit integers, but
the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
as _umtx_op supports both.
libstdc++-v3/ChangeLog:
PR libstdc++/120527
* include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
Define typedef.
[__FreeBSD__] (__platform_wait_uses_type): Define variable
template.
* src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
(__platform_wait, __platform_notify, __platform_wait_until):
Define.
---
Tested x86_64-linux and x86_64-freebsd14.
v3:
- new wait_op function to determine the OP_WAIT constant from the obj_sz
parameter.
I wrote a simple benchmark that calls notify in a loop or calls wait(1)
in a loop (without blocking because the value isn't equal to 1), for
16/32/64-bit integers.
Before this change (so that all types use a proxy wait and the proxy
wait is done using a condition_variable):
-------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------
BM_notify_16 28.8 ns 28.6 ns 24470204
BM_notify_32 28.7 ns 28.6 ns 24460713
BM_notify_64 28.8 ns 28.6 ns 24478476
BM_wait_16 5.13 ns 5.11 ns 136500107
BM_wait_32 5.12 ns 5.10 ns 136588487
BM_wait_64 5.12 ns 5.10 ns 136646629
After this change, so that 32-bit and 64-bit types don't use proxy
waits, and the proxy wait (for the 16-bit type) uses the _umtx_op
syscall on _M_wait_state->_M_ver instead of a condition_variable:
-------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------
BM_notify_16 6.26 ns 6.23 ns 111360766
BM_notify_32 2.87 ns 2.86 ns 243363965
BM_notify_64 2.90 ns 2.89 ns 242195074
BM_wait_16 4.92 ns 4.90 ns 137517425
BM_wait_32 1.34 ns 1.33 ns 518316605
BM_wait_64 1.34 ns 1.33 ns 517155683
So all notify calls are faster now, and a no-op wait is faster for
non-proxy types (which is because we don't need a PLT call to
_M_setup_proxy_wait, it's just inline in the headers).
For a wait that actually needs to block I would expect the times to also
improve due to not using a condvar, but that's harder to benchmark
because it would need a second thread that's modifying the atomic and
doing a notify_one() call, so that the wait(n) call doesn't block
forever.
I tried to check a real wait that actually waits by using a
std::counting_semaphore and calling try_acquire_for(1ns), so that we
block for 1ns in a loop and then time out. That shows a huge regression
in performance when waiting on an unavailable semaphore:
Before:
BM_counting_sema 413 ns 411 ns 1701349
After:
BM_counting_sema 8198 ns 3616 ns 193253
I don't know exactly where that cost comes from.
For an uncontended semaphore where try_acquire doesn't need to block,
there's no difference in performance before and after this patch (as
expected).
libstdc++-v3/include/bits/atomic_wait.h | 11 +++++
libstdc++-v3/src/c++20/atomic.cc | 54 ++++++++++++++++++++++++-
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/libstdc++-v3/include/bits/atomic_wait.h
b/libstdc++-v3/include/bits/atomic_wait.h
index 6d8c0de4af68..fcdd83c89fef 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -69,6 +69,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
inline constexpr bool __platform_wait_uses_type
= __detail::__waitable<_Tp>
&& sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4;
+#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
+ namespace __detail
+ {
+ using __platform_wait_t = __UINT64_TYPE__; // 64-bit unsigned integer type used for platform waits on this target
+ inline constexpr size_t __platform_wait_alignment = 8; // presumably the byte alignment required of a __platform_wait_t object - TODO confirm
+ }
+ template<typename _Tp>
+ inline constexpr bool __platform_wait_uses_type // true for waitable types that are 4 or 8 bytes and aligned to at least their size
+ = __detail::__waitable<_Tp>
+ && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4) // 4-byte case
+ || (sizeof(_Tp) == 8 && alignof(_Tp) >= 8)); // 8-byte case
#else
// define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait()
// and __platform_notify() if there is a more efficient primitive supported
diff --git a/libstdc++-v3/src/c++20/atomic.cc b/libstdc++-v3/src/c++20/atomic.cc
index 80915617f0bf..05260a60392c 100644
--- a/libstdc++-v3/src/c++20/atomic.cc
+++ b/libstdc++-v3/src/c++20/atomic.cc
@@ -27,7 +27,7 @@
#if __glibcxx_atomic_wait
#include <atomic>
#include <bits/atomic_timed_wait.h>
-#include <cstdint> // uint32_t, uint64_t
+#include <cstdint> // uint32_t, uint64_t, uintptr_t
#include <climits> // INT_MAX
#include <cerrno> // errno, ETIMEDOUT, etc.
#include <bits/std_mutex.h> // std::mutex, std::__condvar
@@ -39,6 +39,11 @@
# include <syscall.h>
# include <sys/time.h> // timespec
# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
+#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
+# include <sys/types.h>
+# include <sys/umtx.h>
+# include <sys/time.h>
+# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
#endif
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
@@ -108,6 +113,53 @@ namespace
}
return true;
}
+#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
+ [[gnu::always_inline]] // selects the _umtx_op wait operation from the object size in bytes
+ inline int
+ wait_op(int obj_sz) noexcept
+ { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT; } // unsigned-sized objects: UMTX_OP_WAIT_UINT; any other size: UMTX_OP_WAIT
+
+ void
+ __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept // untimed wait on addr through _umtx_op, using the op chosen by wait_op(obj_sz)
+ {
+ if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
+ nullptr, nullptr)) // both trailing arguments are null: no timeout is supplied
+ if (errno != EINTR) // a non-zero result with EINTR is not treated as an error; just return
+ __throw_system_error(errno); // NOTE(review): this function is noexcept, so a throw here cannot propagate - confirm intended
+ }
+
+ void
+ __platform_notify(const void* addr, bool all, int /* obj_sz */) noexcept // issues UMTX_OP_WAKE on addr; the object size is not used
+ {
+ const int count = all ? INT_MAX : 1; // wake count: INT_MAX when all is set, otherwise 1
+ _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr, nullptr); // the result is not checked
+ }
+
+ // Timed wait on addr. Returns false if atime is not after the clock's epoch or _umtx_op fails with ETIMEDOUT; returns true if _umtx_op succeeds or fails with EINTR.
+ bool
+ __platform_wait_until(const void* addr, uint64_t val,
+ const __wait_clock_t::time_point& atime,
+ int obj_sz) noexcept
+ {
+ struct _umtx_time timeout = { // deadline description handed to _umtx_op below
+ ._timeout = chrono::__to_timeout_timespec(atime),
+ ._flags = UMTX_ABSTIME, // presumably marks _timeout as an absolute time rather than a duration - see _umtx_op(2)
+ ._clockid = CLOCK_MONOTONIC
+ };
+ // _umtx_op hangs if timeout._timeout is {0, 0}, so return false without calling it
+ if (atime.time_since_epoch() < chrono::nanoseconds(1))
+ return false;
+ constexpr uintptr_t timeout_sz = sizeof(timeout); // size of the struct, passed below through a pointer-typed argument
+ if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
+ (void*)timeout_sz, &timeout))
+ {
+ if (errno == ETIMEDOUT) // the wait timed out
+ return false;
+ if (errno != EINTR) // EINTR falls through to the final return true
+ __throw_system_error(errno); // NOTE(review): this function is noexcept, so a throw here cannot propagate - confirm intended
+ }
+ return true;
+ }
#endif // HAVE_LINUX_FUTEX
// The state used by atomic waiting and notifying functions.
--
2.51.1