Rename __semaphore_base to __semaphore_impl, because it's not used as a base class. Replace the three identical lambda expressions with a named class, __semaphore_impl::_Can_acquire, which stores the most recent value of the counter as a data member.
Add a new __platform_semaphore_impl class template which uses __platform_wait_t for the counter and uses more efficient atomic waits for the acquire functions, when __platform_wait is available. For a binary semaphore some members are further optimized because we know the counter can only be zero or one. Also add a bare wait flag to __atomic_wait_address_v, for consistency with __atomic_wait_address_until_v and __atomic_wait_address_for_v and to allow semaphores to use it without the redundant overhead of tracking waiters. libstdc++-v3/ChangeLog: * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare wait flag. * include/bits/semaphore_base.h (__semaphore_base): Rename to __semaphore_impl. Replace local variable and predicate lambda with _Can_acquire struct. (__platform_semaphore_impl): New class template. (__semaphore_impl): Remove alias template. * include/std/semaphore: --- This replaces the two patches: [PATCH 3/4] libstdc++: Optimize std::binary_semaphore [PATCH 4/4] libstdc++: Optimize std::counting_semaphore for futex path The problem with those patches was that the "optimized" implementations were used if _Max <= numeric_limits<__platform_wait_t>::max(), which is always true for many non-futex targets because we do: # if ATOMIC_LONG_LOCK_FREE == 2 using __platform_wait_t = unsigned long; # else using __platform_wait_t = unsigned int; # endif The earlier patches should have been checking either _GLIBCXX_HAVE_PLATFORM_WAIT or the __platform_wait_uses_type variable template to see if an efficient (i.e. not condition_variable-based) implementation is actually supported. It's not useful to check if the semaphore's maximum value fits in the __platform_wait_t type if that type doesn't have fast wait/wake operations like a futex. The new proposal in this patch only selects the optimized implementations if they can be implemented using __platform_wait. The __semaphore_impl always uses the portable atomic wait API (using an accessor function and predicate). As Tomasz suggested, a new class template provides the optimized implementation for both counting_semaphore and binary_semaphore, differing only in _M_get_current and _M_try_acquire. Tested x86_64-linux and sparc-solaris. libstdc++-v3/include/bits/atomic_wait.h | 6 +- libstdc++-v3/include/bits/semaphore_base.h | 241 +++++++++++++++------ libstdc++-v3/include/std/semaphore | 5 +- 3 files changed, 186 insertions(+), 66 deletions(-) diff --git a/libstdc++-v3/include/bits/atomic_wait.h b/libstdc++-v3/include/bits/atomic_wait.h index 815726c16ccb..d32fda1c2435 100644 --- a/libstdc++-v3/include/bits/atomic_wait.h +++ b/libstdc++-v3/include/bits/atomic_wait.h @@ -249,12 +249,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // C++26 will return __val } + // Wait on __addr while *__addr == __old is true. + // Cannot be used for proxy waits. inline void __atomic_wait_address_v(const __detail::__platform_wait_t* __addr, __detail::__platform_wait_t __old, - int __order) + int __order, bool __bare_wait = false) { - __detail::__wait_args __args{ __addr, __old, __order }; + __detail::__wait_args __args{ __addr, __old, __order, __bare_wait }; // C++26 will not ignore the return value here __detail::__wait_impl(__addr, __args); } diff --git a/libstdc++-v3/include/bits/semaphore_base.h b/libstdc++-v3/include/bits/semaphore_base.h index 3f7a33ccd51a..a1311e8690c7 100644 --- a/libstdc++-v3/include/bits/semaphore_base.h +++ b/libstdc++-v3/include/bits/semaphore_base.h @@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<bool _Platform_wait> - struct __semaphore_base + struct __semaphore_impl { - using __count_type = __conditional_t<_Platform_wait, - __detail::__platform_wait_t, - ptrdiff_t>; + using __count_type = ptrdiff_t; static constexpr ptrdiff_t _S_max = __gnu_cxx::__int_traits<__count_type>::__max; constexpr explicit - __semaphore_base(__count_type __count) noexcept + __semaphore_impl(__count_type __count) noexcept : _M_counter(__count) { } - __semaphore_base(const __semaphore_base&) = delete; - __semaphore_base& operator=(const __semaphore_base&) = delete; + __semaphore_impl(const __semaphore_impl&) = delete; + __semaphore_impl& operator=(const __semaphore_impl&) = delete; // Load the current counter value. _GLIBCXX_ALWAYS_INLINE __count_type @@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Try to acquire the semaphore (i.e. decrement the counter). // Returns false if the current counter is zero, or if another thread - // decrements the value first. In the latter case, __cur is set to the - // new value. + // changes the value first. In the latter case, __cur is set to the new + // value. _GLIBCXX_ALWAYS_INLINE bool _M_do_try_acquire(__count_type& __cur) noexcept { @@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION memory_order::relaxed); } + // Keep trying to acquire the semaphore in a loop until it succeeds. void _M_acquire() noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } - return false; - }; - while (!_M_do_try_acquire(__val)) - if (__val == 0) + auto __vfn = [this]{ return _M_get_current(); }; + _Can_acquire __pred{__vfn()}; + while (!_M_do_try_acquire(__pred._M_val)) + if (__pred._M_val == 0) std::__atomic_wait_address(&_M_counter, __pred, __vfn, true); } + // Try to acquire the semaphore, retrying a small number of times + // in case of contention. bool _M_try_acquire() noexcept { @@ -116,23 +108,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& __atime) noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } - return false; - }; - while (!_M_do_try_acquire(__val)) - if (__val == 0) - { - if (!std::__atomic_wait_address_until(&_M_counter, __pred, - __vfn, __atime, true)) - return false; // timed out - } + auto __vfn = [this]{ return _M_get_current(); }; + _Can_acquire __pred{__vfn()}; + while (!_M_do_try_acquire(__pred._M_val)) + if (__pred._M_val == 0) + if (!std::__atomic_wait_address_until(&_M_counter, __pred, + __vfn, __atime, true)) + return false; // timed out return true; } @@ -140,23 +122,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } - return false; - }; - while (!_M_do_try_acquire(__val)) - if (__val == 0) - { - if (!std::__atomic_wait_address_for(&_M_counter, __pred, - __vfn, __rtime, true)) - return false; // timed out - } + auto __vfn = [this]{ return _M_get_current(); }; + _Can_acquire __pred{__vfn()}; + while (!_M_do_try_acquire(__pred._M_val)) + if (__pred._M_val == 0) + if (!std::__atomic_wait_address_for(&_M_counter, __pred, + __vfn, __rtime, true)) + return false; // timed out return true; } @@ -171,14 +143,161 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } private: - alignas(_Platform_wait ? __detail::__platform_wait_alignment - : __alignof__(__count_type)) - __count_type _M_counter; + struct _Can_acquire + { + __count_type _M_val; + + bool operator()(__count_type __cur) noexcept + { + if (__cur == 0) + return false; + _M_val = __cur; + return true; + } + }; + + alignas(__atomic_ref<__count_type>::required_alignment) + __count_type _M_counter; + }; + + // Optimized specialization using __platform_wait (if available) + template<bool _Binary> + struct __platform_semaphore_impl + { + using __count_type = __detail::__platform_wait_t; + + static constexpr ptrdiff_t _S_max + = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max; + + constexpr explicit + __platform_semaphore_impl(__count_type __count) noexcept + : _M_counter(__count) + { } + + __platform_semaphore_impl(__platform_semaphore_impl&) = delete; + __platform_semaphore_impl& operator=(const __platform_semaphore_impl&) = delete; + + // Load the current counter value. + _GLIBCXX_ALWAYS_INLINE __count_type + _M_get_current() const noexcept + { + if constexpr (_Binary) + return 1; // Not necessarily true, but optimistically assume it is. + else + return __atomic_impl::load(&_M_counter, memory_order::acquire); + } + + // Try to acquire the semaphore (i.e. decrement the counter). + // Returns false if the current counter is zero, or if another thread + // changes the value first. In the latter case, __cur is set to the new + // value. + _GLIBCXX_ALWAYS_INLINE bool + _M_do_try_acquire(__count_type& __cur) noexcept + { + if (__cur == 0) + return false; // Cannot decrement when it's already zero. + + return __atomic_impl::compare_exchange_strong(&_M_counter, + __cur, __cur - 1, + memory_order::acquire, + memory_order::relaxed); + } + + // Keep trying to acquire the semaphore in a loop until it succeeds. + void + _M_acquire() noexcept + { + auto __val = _M_get_current(); + while (!_M_do_try_acquire(__val)) + if (__val == 0) + { + std::__atomic_wait_address_v(&_M_counter, __val, __ATOMIC_ACQUIRE, + true); + __val = _M_get_current(); + } + } + + // Try to acquire the semaphore. + bool + _M_try_acquire() noexcept + { + if constexpr (_Binary) + { + __count_type __val = 1; + // Do not expect much contention on binary semaphore, only try once. + return _M_do_try_acquire(__val); + } + else + // Fastest implementation of this function is just _M_do_try_acquire + // but that can fail under contention even when _M_count > 0. + // Using _M_try_acquire_for(0ns) will retry a few times in a loop. + return _M_try_acquire_for(__detail::__wait_clock_t::duration{}); + } + + template<typename _Clock, typename _Duration> + bool + _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& __atime) noexcept + { + auto __val = _M_get_current(); + while (!_M_do_try_acquire(__val)) + if (__val == 0) + { + if (!std::__atomic_wait_address_until_v(&_M_counter, 0, + __ATOMIC_ACQUIRE, + __atime, true)) + return false; // timed out + __val = _M_get_current(); + } + return true; + } + + template<typename _Rep, typename _Period> + bool + _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) noexcept + { + auto __val = _M_get_current(); + while (!_M_do_try_acquire(__val)) + if (__val == 0) + { + if (!std::__atomic_wait_address_for_v(&_M_counter, 0, + __ATOMIC_ACQUIRE, + __rtime, true)) + return false; // timed out + __val = _M_get_current(); + } + return true; + } + + _GLIBCXX_ALWAYS_INLINE ptrdiff_t + _M_release(ptrdiff_t __update) noexcept + { + auto __old = __atomic_impl::fetch_add(&_M_counter, __update, + memory_order::release); + if (__old == 0 && __update > 0) + __atomic_notify_address(&_M_counter, true, true); + return __old; + } + + protected: + alignas(__detail::__platform_wait_alignment) __count_type _M_counter; }; template<ptrdiff_t _Max> - using __semaphore_impl - = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>; + using _Select_semaphore_impl = typename decltype([] + { + using namespace __detail; + if constexpr (__platform_wait_uses_type<__platform_wait_t>) + { + if constexpr (_Max <= 1) + return type_identity<__platform_semaphore_impl<true>>{}; + else if constexpr (_Max <= __platform_semaphore_impl<false>::_S_max) + return type_identity<__platform_semaphore_impl<false>>{}; + else + return type_identity<__semaphore_impl>{}; + } + else + return type_identity<__semaphore_impl>{}; + }())::type; _GLIBCXX_END_NAMESPACE_VERSION } // namespace std diff --git a/libstdc++-v3/include/std/semaphore b/libstdc++-v3/include/std/semaphore index ca1bffe371a0..8f49188563e8 100644 --- a/libstdc++-v3/include/std/semaphore +++ b/libstdc++-v3/include/std/semaphore @@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max> + template<ptrdiff_t __least_max_value = _Select_semaphore_impl<2>::_S_max> class counting_semaphore { static_assert(__least_max_value >= 0); - using _Impl = __semaphore_impl<__least_max_value>; - _Impl _M_sem; + _Select_semaphore_impl<__least_max_value> _M_sem; public: constexpr explicit -- 2.49.0