On Thu, Jun 5, 2025 at 12:06 AM Jonathan Wakely <jwak...@redhat.com> wrote:
> Rename __semaphore_base to __semaphore_impl, because it's not used as a > base class. Replace the three identical lambda expressions with a named > class, __semaphore_impl::_Can_acquire, which stores the most recent > value of the counter as a data member. > > Add a new __platform_semaphore_impl class template which uses > __platform_wait_t for the counter and uses more efficient atomic waits > for the acquire functions, when __platform_wait is available. For a > binary semaphore some members are further optimized because we know the > counter can only be zero or one. > > Also add a bare wait flag to __atomic_wait_address_v, for consistency > with __atomic_wait_address_until_v and __atomic_wait_address_for_v and > to allow semaphores to use it without the redundant overhead of tracking > waiters. > > libstdc++-v3/ChangeLog: > > * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare > wait flag. > * include/bits/semaphore_base.h (__semaphore_base): Rename to > __semaphore_impl. Replace local variable and predicate lambda > with _Can_acquire struct. > (__platform_semaphore_impl): New class template. > (__semaphore_impl): Remove alias template. > * include/std/semaphore: > --- > > This replaces the two patches: > > [PATCH 3/4] libstdc++: Optimize std::binary_semaphore > [PATCH 4/4] libstdc++: Optimize std::counting_semaphore for futex path > > The problem with those patches was that the "optimized" implementations > were used if _Max <= numeric_limits<__platform_wait_t>::max(), which is > always true for many non-futex targets because we do: > > # if ATOMIC_LONG_LOCK_FREE == 2 > using __platform_wait_t = unsigned long; > # else > using __platform_wait_t = unsigned int; > # endif > > The earlier patches should have been checking either > _GLIBCXX_HAVE_PLATFORM_WAIT or the __platform_wait_uses_type variable > template to see if an efficient (i.e. not condition_variable-based) > implementation is actually supported. It's not useful to check if the > semaphore's maximum value fits in the __platform_wait_t type if that > type doesn't have fast wait/wake operations like a futex. > > The new proposal in this patch only selects the optimized > implementations if they can be implemented using __platform_wait. The > __semaphore_impl always uses the portable atomic wait API (using an > accessor function and predicate). As Tomasz suggested, a new class > template provides the optimized implementation for both > counting_semaphore and binary_semaphore, differing only in > _M_get_current and _M_try_acquire. > > Tested x86_64-linux and sparc-solaris. > LGTM, one major suggestion to encapsulate the __pred._M_cur == 0 into named member function. I have also checked that __atomic_wait_address_until takes __pred by reference, and does not make copies, so encapsulating __cur there is OK. > > libstdc++-v3/include/bits/atomic_wait.h | 6 +- > libstdc++-v3/include/bits/semaphore_base.h | 241 +++++++++++++++------ > libstdc++-v3/include/std/semaphore | 5 +- > 3 files changed, 186 insertions(+), 66 deletions(-) > > diff --git a/libstdc++-v3/include/bits/atomic_wait.h > b/libstdc++-v3/include/bits/atomic_wait.h > index 815726c16ccb..d32fda1c2435 100644 > --- a/libstdc++-v3/include/bits/atomic_wait.h > +++ b/libstdc++-v3/include/bits/atomic_wait.h > @@ -249,12 +249,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > // C++26 will return __val > } > > + // Wait on __addr while *__addr == __old is true. > + // Cannot be used for proxy waits. > inline void > __atomic_wait_address_v(const __detail::__platform_wait_t* __addr, > __detail::__platform_wait_t __old, > - int __order) > + int __order, bool __bare_wait = false) > { > - __detail::__wait_args __args{ __addr, __old, __order }; > + __detail::__wait_args __args{ __addr, __old, __order, __bare_wait }; > // C++26 will not ignore the return value here > __detail::__wait_impl(__addr, __args); > } > diff --git a/libstdc++-v3/include/bits/semaphore_base.h > b/libstdc++-v3/include/bits/semaphore_base.h > index 3f7a33ccd51a..a1311e8690c7 100644 > --- a/libstdc++-v3/include/bits/semaphore_base.h > +++ b/libstdc++-v3/include/bits/semaphore_base.h > @@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default) > { > _GLIBCXX_BEGIN_NAMESPACE_VERSION > > - template<bool _Platform_wait> > - struct __semaphore_base > + struct __semaphore_impl > { > - using __count_type = __conditional_t<_Platform_wait, > - __detail::__platform_wait_t, > - ptrdiff_t>; > + using __count_type = ptrdiff_t; > > static constexpr ptrdiff_t _S_max > = __gnu_cxx::__int_traits<__count_type>::__max; > > constexpr explicit > - __semaphore_base(__count_type __count) noexcept > + __semaphore_impl(__count_type __count) noexcept > : _M_counter(__count) > { } > > - __semaphore_base(const __semaphore_base&) = delete; > - __semaphore_base& operator=(const __semaphore_base&) = delete; > + __semaphore_impl(const __semaphore_impl&) = delete; > + __semaphore_impl& operator=(const __semaphore_impl&) = delete; > > // Load the current counter value. > _GLIBCXX_ALWAYS_INLINE __count_type > @@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > > // Try to acquire the semaphore (i.e. decrement the counter). > // Returns false if the current counter is zero, or if another thread > - // decrements the value first. In the latter case, __cur is set to the > - // new value. > + // changes the value first. In the latter case, __cur is set to the > new > + // value. > _GLIBCXX_ALWAYS_INLINE bool > _M_do_try_acquire(__count_type& __cur) noexcept > { > @@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > memory_order::relaxed); > } > > + // Keep trying to acquire the semaphore in a loop until it succeeds. > void > _M_acquire() noexcept > { > - auto const __vfn = [this]{ return _M_get_current(); }; > - auto __val = __vfn(); > - auto const __pred = [&__val](__count_type __cur) { > - if (__cur > 0) > - { > - __val = __cur; > - return true; > - } > - return false; > - }; > - while (!_M_do_try_acquire(__val)) > - if (__val == 0) > + auto __vfn = [this]{ return _M_get_current(); }; > + _Can_acquire __pred{__vfn()}; > + while (!_M_do_try_acquire(__pred._M_val)) > + if (__pred._M_val == 0) > std::__atomic_wait_address(&_M_counter, __pred, __vfn, true); > } > > + // Try to acquire the semaphore, retrying a small number of times > + // in case of contention. > bool > _M_try_acquire() noexcept > { > @@ -116,23 +108,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > bool > _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& > __atime) noexcept > { > - auto const __vfn = [this]{ return _M_get_current(); }; > - auto __val = __vfn(); > - auto const __pred = [&__val](__count_type __cur) { > - if (__cur > 0) > - { > - __val = __cur; > - return true; > - } > - return false; > - }; > - while (!_M_do_try_acquire(__val)) > - if (__val == 0) > - { > - if (!std::__atomic_wait_address_until(&_M_counter, __pred, > - __vfn, __atime, true)) > - return false; // timed out > - } > + auto __vfn = [this]{ return _M_get_current(); }; > + _Can_acquire __pred{__vfn()}; > + while (!_M_do_try_acquire(__pred._M_val)) > + if (__pred._M_val == 0) > + if (!std::__atomic_wait_address_until(&_M_counter, __pred, > + __vfn, __atime, true)) > + return false; // timed out > return true; > } > > @@ -140,23 +122,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > bool > _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) > noexcept > { > - auto const __vfn = [this]{ return _M_get_current(); }; > - auto __val = __vfn(); > - auto const __pred = [&__val](__count_type __cur) { > - if (__cur > 0) > - { > - __val = __cur; > - return true; > - } > - return false; > - }; > - while (!_M_do_try_acquire(__val)) > - if (__val == 0) > - { > - if (!std::__atomic_wait_address_for(&_M_counter, __pred, > - __vfn, __rtime, true)) > - return false; // timed out > - } > + auto __vfn = [this]{ return _M_get_current(); }; > + _Can_acquire __pred{__vfn()}; > + while (!_M_do_try_acquire(__pred._M_val)) > + if (__pred._M_val == 0) > I think we could have some named member funciton here, like __pred.should_wait(). But not strictly necessary. > + if (!std::__atomic_wait_address_for(&_M_counter, __pred, > + __vfn, __rtime, true)) > + return false; // timed out > return true; > } > > @@ -171,14 +143,161 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > } > > private: > - alignas(_Platform_wait ? __detail::__platform_wait_alignment > - : __alignof__(__count_type)) > - __count_type _M_counter; > + struct _Can_acquire > + { > + __count_type _M_val; > + > + bool operator()(__count_type __cur) noexcept > + { > + if (__cur == 0) > + return false; > + _M_val = __cur; > + return true; > + } > + }; > + > + alignas(__atomic_ref<__count_type>::required_alignment) > + __count_type _M_counter; > + }; > + > + // Optimized specialization using __platform_wait (if available) > + template<bool _Binary> > + struct __platform_semaphore_impl > + { > + using __count_type = __detail::__platform_wait_t; > + > + static constexpr ptrdiff_t _S_max > + = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max; > + > + constexpr explicit > + __platform_semaphore_impl(__count_type __count) noexcept > + : _M_counter(__count) > + { } > + > + __platform_semaphore_impl(__platform_semaphore_impl&) = delete; > + __platform_semaphore_impl& operator=(const > __platform_semaphore_impl&) = delete; > + > + // Load the current counter value. > + _GLIBCXX_ALWAYS_INLINE __count_type > + _M_get_current() const noexcept > + { > + if constexpr (_Binary) > + return 1; // Not necessarily true, but optimistically assume it is. > + else > + return __atomic_impl::load(&_M_counter, memory_order::acquire); > + } > + > + // Try to acquire the semaphore (i.e. decrement the counter). > + // Returns false if the current counter is zero, or if another thread > + // changes the value first. In the latter case, __cur is set to the > new > + // value. > + _GLIBCXX_ALWAYS_INLINE bool > + _M_do_try_acquire(__count_type& __cur) noexcept > + { > + if (__cur == 0) > + return false; // Cannot decrement when it's already zero. > + > + return __atomic_impl::compare_exchange_strong(&_M_counter, > + __cur, __cur - 1, > + memory_order::acquire, > + memory_order::relaxed); > + } > + > + // Keep trying to acquire the semaphore in a loop until it succeeds. > + void > + _M_acquire() noexcept > + { > + auto __val = _M_get_current(); > + while (!_M_do_try_acquire(__val)) > + if (__val == 0) > + { > + std::__atomic_wait_address_v(&_M_counter, __val, > __ATOMIC_ACQUIRE, > + true); > + __val = _M_get_current(); > + } > + } > + > + // Try to acquire the semaphore. > + bool > + _M_try_acquire() noexcept > + { > + if constexpr (_Binary) > + { > + __count_type __val = 1; > + // Do not expect much contention on binary semaphore, only try > once. > + return _M_do_try_acquire(__val); > + } > + else > + // Fastest implementation of this function is just > _M_do_try_acquire > + // but that can fail under contention even when _M_count > 0. > + // Using _M_try_acquire_for(0ns) will retry a few times in a loop. > + return _M_try_acquire_for(__detail::__wait_clock_t::duration{}); > + } > + > + template<typename _Clock, typename _Duration> > + bool > + _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& > __atime) noexcept > + { > + auto __val = _M_get_current(); > + while (!_M_do_try_acquire(__val)) > + if (__val == 0) > + { > + if (!std::__atomic_wait_address_until_v(&_M_counter, 0, > + __ATOMIC_ACQUIRE, > + __atime, true)) > + return false; // timed out > + __val = _M_get_current(); > + } > + return true; > + } > + > + template<typename _Rep, typename _Period> > + bool > + _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) > noexcept > + { > + auto __val = _M_get_current(); > + while (!_M_do_try_acquire(__val)) > + if (__val == 0) > + { > + if (!std::__atomic_wait_address_for_v(&_M_counter, 0, > + __ATOMIC_ACQUIRE, > + __rtime, true)) > + return false; // timed out > + __val = _M_get_current(); > + } > + return true; > + } > + > + _GLIBCXX_ALWAYS_INLINE ptrdiff_t > + _M_release(ptrdiff_t __update) noexcept > + { > + auto __old = __atomic_impl::fetch_add(&_M_counter, __update, > + memory_order::release); > + if (__old == 0 && __update > 0) > + __atomic_notify_address(&_M_counter, true, true); > + return __old; > + } > + > + protected: > + alignas(__detail::__platform_wait_alignment) __count_type _M_counter; > }; > > template<ptrdiff_t _Max> > - using __semaphore_impl > - = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>; > + using _Select_semaphore_impl = typename decltype([] > + { > + using namespace __detail; > + if constexpr (__platform_wait_uses_type<__platform_wait_t>) > + { > + if constexpr (_Max <= 1) > + return type_identity<__platform_semaphore_impl<true>>{}; > + else if constexpr (_Max <= > __platform_semaphore_impl<false>::_S_max) > + return type_identity<__platform_semaphore_impl<false>>{}; > + else > + return type_identity<__semaphore_impl>{}; > + } > + else > + return type_identity<__semaphore_impl>{}; > + }())::type; > > _GLIBCXX_END_NAMESPACE_VERSION > } // namespace std > diff --git a/libstdc++-v3/include/std/semaphore > b/libstdc++-v3/include/std/semaphore > index ca1bffe371a0..8f49188563e8 100644 > --- a/libstdc++-v3/include/std/semaphore > +++ b/libstdc++-v3/include/std/semaphore > @@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default) > { > _GLIBCXX_BEGIN_NAMESPACE_VERSION > > - template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max> > + template<ptrdiff_t __least_max_value = > _Select_semaphore_impl<2>::_S_max> > class counting_semaphore > { > static_assert(__least_max_value >= 0); > > - using _Impl = __semaphore_impl<__least_max_value>; > - _Impl _M_sem; > + _Select_semaphore_impl<__least_max_value> _M_sem; > > public: > constexpr explicit > -- > 2.49.0 > >