On Thu, Jun 5, 2025 at 12:06 AM Jonathan Wakely <jwak...@redhat.com> wrote:

> Rename __semaphore_base to __semaphore_impl, because it's not used as a
> base class. Replace the three identical lambda expressions with a named
> class, __semaphore_impl::_Can_acquire, which stores the most recent
> value of the counter as a data member.
>
> Add a new __platform_semaphore_impl class template which uses
> __platform_wait_t for the counter and uses more efficient atomic waits
> for the acquire functions, when __platform_wait is available. For a
> binary semaphore some members are further optimized because we know the
> counter can only be zero or one.
>
> Also add a bare wait flag to __atomic_wait_address_v, for consistency
> with __atomic_wait_address_until_v and __atomic_wait_address_for_v and
> to allow semaphores to use it without the redundant overhead of tracking
> waiters.
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare
>         wait flag.
>         * include/bits/semaphore_base.h (__semaphore_base): Rename to
>         __semaphore_impl. Replace local variable and predicate lambda
>         with _Can_acquire struct.
>         (__platform_semaphore_impl): New class template.
>         (__semaphore_impl): Remove alias template.
>         * include/std/semaphore:
> ---
>
> This replaces the two patches:
>
> [PATCH 3/4] libstdc++: Optimize std::binary_semaphore
> [PATCH 4/4] libstdc++: Optimize std::counting_semaphore for futex path
>
> The problem with those patches was that the "optimized" implementations
> were used if _Max <= numeric_limits<__platform_wait_t>::max(), which is
> always true for many non-futex targets because we do:
>
> # if ATOMIC_LONG_LOCK_FREE == 2
>     using __platform_wait_t = unsigned long;
> # else
>     using __platform_wait_t = unsigned int;
> # endif
>
> The earlier patches should have been checking either
> _GLIBCXX_HAVE_PLATFORM_WAIT or the __platform_wait_uses_type variable
> template to see if an efficient (i.e. not condition_variable-based)
> implementation is actually supported. It's not useful to check if the
> semaphore's maximum value fits in the __platform_wait_t type if that
> type doesn't have fast wait/wake operations like a futex.
>
> The new proposal in this patch only selects the optimized
> implementations if they can be implemented using __platform_wait.  The
> __semaphore_impl always uses the portable atomic wait API (using an
> accessor function and predicate). As Tomasz suggested, a new class
> template provides the optimized implementation for both
> counting_semaphore and binary_semaphore, differing only in
> _M_get_current and _M_try_acquire.
>
> Tested x86_64-linux and sparc-solaris.
>
LGTM, one major suggestion to encapsulate the __pred._M_cur == 0 into named
member
function.
I have also checked that  __atomic_wait_address_until takes __pred by
reference,
and does not make copies, so encapsulating __cur there is OK.

>
>  libstdc++-v3/include/bits/atomic_wait.h    |   6 +-
>  libstdc++-v3/include/bits/semaphore_base.h | 241 +++++++++++++++------
>  libstdc++-v3/include/std/semaphore         |   5 +-
>  3 files changed, 186 insertions(+), 66 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/atomic_wait.h
> b/libstdc++-v3/include/bits/atomic_wait.h
> index 815726c16ccb..d32fda1c2435 100644
> --- a/libstdc++-v3/include/bits/atomic_wait.h
> +++ b/libstdc++-v3/include/bits/atomic_wait.h
> @@ -249,12 +249,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        // C++26 will return __val
>      }
>
> +  // Wait on __addr while *__addr == __old is true.
> +  // Cannot be used for proxy waits.
>    inline void
>    __atomic_wait_address_v(const __detail::__platform_wait_t* __addr,
>                           __detail::__platform_wait_t __old,
> -                         int __order)
> +                         int __order, bool __bare_wait = false)
>    {
> -    __detail::__wait_args __args{ __addr, __old, __order };
> +    __detail::__wait_args __args{ __addr, __old, __order, __bare_wait };
>      // C++26 will not ignore the return value here
>      __detail::__wait_impl(__addr, __args);
>    }
> diff --git a/libstdc++-v3/include/bits/semaphore_base.h
> b/libstdc++-v3/include/bits/semaphore_base.h
> index 3f7a33ccd51a..a1311e8690c7 100644
> --- a/libstdc++-v3/include/bits/semaphore_base.h
> +++ b/libstdc++-v3/include/bits/semaphore_base.h
> @@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default)
>  {
>  _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
> -  template<bool _Platform_wait>
> -  struct __semaphore_base
> +  struct __semaphore_impl
>    {
> -    using __count_type = __conditional_t<_Platform_wait,
> -                                        __detail::__platform_wait_t,
> -                                        ptrdiff_t>;
> +    using __count_type = ptrdiff_t;
>
>      static constexpr ptrdiff_t _S_max
>        = __gnu_cxx::__int_traits<__count_type>::__max;
>
>      constexpr explicit
> -    __semaphore_base(__count_type __count) noexcept
> +    __semaphore_impl(__count_type __count) noexcept
>      : _M_counter(__count)
>      { }
>
> -    __semaphore_base(const __semaphore_base&) = delete;
> -    __semaphore_base& operator=(const __semaphore_base&) = delete;
> +    __semaphore_impl(const __semaphore_impl&) = delete;
> +    __semaphore_impl& operator=(const __semaphore_impl&) = delete;
>
>      // Load the current counter value.
>      _GLIBCXX_ALWAYS_INLINE __count_type
> @@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
>      // Try to acquire the semaphore (i.e. decrement the counter).
>      // Returns false if the current counter is zero, or if another thread
> -    // decrements the value first. In the latter case, __cur is set to the
> -    // new value.
> +    // changes the value first. In the latter case, __cur is set to the
> new
> +    // value.
>      _GLIBCXX_ALWAYS_INLINE bool
>      _M_do_try_acquire(__count_type& __cur) noexcept
>      {
> @@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>                                                     memory_order::relaxed);
>      }
>
> +    // Keep trying to acquire the semaphore in a loop until it succeeds.
>      void
>      _M_acquire() noexcept
>      {
> -      auto const __vfn = [this]{ return _M_get_current(); };
> -      auto __val = __vfn();
> -      auto const __pred = [&__val](__count_type __cur) {
> -       if (__cur > 0)
> -         {
> -           __val = __cur;
> -           return true;
> -         }
> -       return false;
> -      };
> -      while (!_M_do_try_acquire(__val))
> -       if (__val == 0)
> +      auto __vfn = [this]{ return _M_get_current(); };
> +      _Can_acquire __pred{__vfn()};
> +      while (!_M_do_try_acquire(__pred._M_val))
> +       if (__pred._M_val == 0)
>           std::__atomic_wait_address(&_M_counter, __pred, __vfn, true);
>      }
>
> +    // Try to acquire the semaphore, retrying a small number of times
> +    // in case of contention.
>      bool
>      _M_try_acquire() noexcept
>      {
> @@ -116,23 +108,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        bool
>        _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>&
> __atime) noexcept
>        {
> -       auto const __vfn = [this]{ return _M_get_current(); };
> -       auto __val = __vfn();
> -       auto const __pred = [&__val](__count_type __cur) {
> -         if (__cur > 0)
> -           {
> -             __val = __cur;
> -             return true;
> -           }
> -         return false;
> -       };
> -       while (!_M_do_try_acquire(__val))
> -         if (__val == 0)
> -           {
> -             if (!std::__atomic_wait_address_until(&_M_counter, __pred,
> -                                                   __vfn, __atime, true))
> -               return false; // timed out
> -           }
> +       auto __vfn = [this]{ return _M_get_current(); };
> +       _Can_acquire __pred{__vfn()};
> +       while (!_M_do_try_acquire(__pred._M_val))
> +         if (__pred._M_val == 0)
> +           if (!std::__atomic_wait_address_until(&_M_counter, __pred,
> +                                                 __vfn, __atime, true))
> +             return false; // timed out
>         return true;
>        }
>
> @@ -140,23 +122,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        bool
>        _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime)
> noexcept
>        {
> -       auto const __vfn = [this]{ return _M_get_current(); };
> -       auto __val = __vfn();
> -       auto const __pred = [&__val](__count_type __cur) {
> -         if (__cur > 0)
> -           {
> -             __val = __cur;
> -             return true;
> -           }
> -         return false;
> -       };
> -       while (!_M_do_try_acquire(__val))
> -         if (__val == 0)
> -           {
> -             if (!std::__atomic_wait_address_for(&_M_counter, __pred,
> -                                                 __vfn, __rtime, true))
> -               return false; // timed out
> -           }
> +       auto __vfn = [this]{ return _M_get_current(); };
> +       _Can_acquire __pred{__vfn()};
> +       while (!_M_do_try_acquire(__pred._M_val))
> +         if (__pred._M_val == 0)
>
I think we could have some named member funciton here,
like __pred.should_wait(). But not strictly necessary.

> +           if (!std::__atomic_wait_address_for(&_M_counter, __pred,
> +                                               __vfn, __rtime, true))
> +             return false; // timed out
>         return true;
>        }
>
> @@ -171,14 +143,161 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>      }
>
>    private:
> -    alignas(_Platform_wait ? __detail::__platform_wait_alignment
> -                          : __alignof__(__count_type))
> -    __count_type _M_counter;
> +    struct _Can_acquire
> +    {
> +      __count_type _M_val;
> +
> +      bool operator()(__count_type __cur) noexcept
> +      {
> +       if (__cur == 0)
> +         return false;
> +       _M_val = __cur;
> +       return true;
> +      }
> +    };
> +
> +    alignas(__atomic_ref<__count_type>::required_alignment)
> +      __count_type _M_counter;
> +  };
> +
> +  // Optimized specialization using __platform_wait (if available)
> +  template<bool _Binary>
> +  struct __platform_semaphore_impl
> +  {
> +    using __count_type = __detail::__platform_wait_t;
> +
> +    static constexpr ptrdiff_t _S_max
> +      = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max;
> +
> +    constexpr explicit
> +    __platform_semaphore_impl(__count_type __count) noexcept
> +    : _M_counter(__count)
> +    { }
> +
> +    __platform_semaphore_impl(__platform_semaphore_impl&) = delete;
> +    __platform_semaphore_impl& operator=(const
> __platform_semaphore_impl&) = delete;
> +
> +    // Load the current counter value.
> +    _GLIBCXX_ALWAYS_INLINE __count_type
> +    _M_get_current() const noexcept
> +    {
> +      if constexpr (_Binary)
> +       return 1; // Not necessarily true, but optimistically assume it is.
> +      else
> +       return __atomic_impl::load(&_M_counter, memory_order::acquire);
> +    }
> +
> +    // Try to acquire the semaphore (i.e. decrement the counter).
> +    // Returns false if the current counter is zero, or if another thread
> +    // changes the value first. In the latter case, __cur is set to the
> new
> +    // value.
> +    _GLIBCXX_ALWAYS_INLINE bool
> +    _M_do_try_acquire(__count_type& __cur) noexcept
> +    {
> +      if (__cur == 0)
> +       return false; // Cannot decrement when it's already zero.
> +
> +      return __atomic_impl::compare_exchange_strong(&_M_counter,
> +                                                   __cur, __cur - 1,
> +                                                   memory_order::acquire,
> +                                                   memory_order::relaxed);
> +    }
> +
> +    // Keep trying to acquire the semaphore in a loop until it succeeds.
> +    void
> +    _M_acquire() noexcept
> +    {
> +      auto __val = _M_get_current();
> +      while (!_M_do_try_acquire(__val))
> +       if (__val == 0)
> +         {
> +           std::__atomic_wait_address_v(&_M_counter, __val,
> __ATOMIC_ACQUIRE,
> +                                        true);
> +           __val = _M_get_current();
> +         }
> +    }
> +
> +    // Try to acquire the semaphore.
> +    bool
> +    _M_try_acquire() noexcept
> +    {
> +      if constexpr (_Binary)
> +       {
> +         __count_type __val = 1;
> +         // Do not expect much contention on binary semaphore, only try
> once.
> +         return _M_do_try_acquire(__val);
> +       }
> +      else
> +       // Fastest implementation of this function is just
> _M_do_try_acquire
> +       // but that can fail under contention even when _M_count > 0.
> +       // Using _M_try_acquire_for(0ns) will retry a few times in a loop.
> +       return _M_try_acquire_for(__detail::__wait_clock_t::duration{});
> +    }
> +
> +    template<typename _Clock, typename _Duration>
> +      bool
> +      _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>&
> __atime) noexcept
> +      {
> +       auto __val = _M_get_current();
> +       while (!_M_do_try_acquire(__val))
> +         if (__val == 0)
> +           {
> +             if (!std::__atomic_wait_address_until_v(&_M_counter, 0,
> +                                                     __ATOMIC_ACQUIRE,
> +                                                     __atime, true))
> +               return false; // timed out
> +             __val = _M_get_current();
> +           }
> +       return true;
> +      }
> +
> +    template<typename _Rep, typename _Period>
> +      bool
> +      _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime)
> noexcept
> +      {
> +       auto __val = _M_get_current();
> +       while (!_M_do_try_acquire(__val))
> +         if (__val == 0)
> +           {
> +             if (!std::__atomic_wait_address_for_v(&_M_counter, 0,
> +                                                   __ATOMIC_ACQUIRE,
> +                                                   __rtime, true))
> +               return false; // timed out
> +             __val = _M_get_current();
> +           }
> +       return true;
> +      }
> +
> +    _GLIBCXX_ALWAYS_INLINE ptrdiff_t
> +    _M_release(ptrdiff_t __update) noexcept
> +    {
> +      auto __old = __atomic_impl::fetch_add(&_M_counter, __update,
> +                                           memory_order::release);
> +      if (__old == 0 && __update > 0)
> +       __atomic_notify_address(&_M_counter, true, true);
> +      return __old;
> +    }
> +
> +  protected:
> +    alignas(__detail::__platform_wait_alignment) __count_type _M_counter;
>    };
>
>    template<ptrdiff_t _Max>
> -    using __semaphore_impl
> -      = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>;
> +    using _Select_semaphore_impl = typename decltype([]
> +    {
> +      using namespace __detail;
> +      if constexpr (__platform_wait_uses_type<__platform_wait_t>)
> +       {
> +         if constexpr (_Max <= 1)
> +           return type_identity<__platform_semaphore_impl<true>>{};
> +         else if constexpr (_Max <=
> __platform_semaphore_impl<false>::_S_max)
> +           return type_identity<__platform_semaphore_impl<false>>{};
> +         else
> +           return type_identity<__semaphore_impl>{};
> +       }
> +      else
> +       return type_identity<__semaphore_impl>{};
> +    }())::type;
>
>  _GLIBCXX_END_NAMESPACE_VERSION
>  } // namespace std
> diff --git a/libstdc++-v3/include/std/semaphore
> b/libstdc++-v3/include/std/semaphore
> index ca1bffe371a0..8f49188563e8 100644
> --- a/libstdc++-v3/include/std/semaphore
> +++ b/libstdc++-v3/include/std/semaphore
> @@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default)
>  {
>  _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
> -  template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max>
> +  template<ptrdiff_t __least_max_value =
> _Select_semaphore_impl<2>::_S_max>
>      class counting_semaphore
>      {
>        static_assert(__least_max_value >= 0);
>
> -      using _Impl = __semaphore_impl<__least_max_value>;
> -      _Impl _M_sem;
> +      _Select_semaphore_impl<__least_max_value> _M_sem;
>
>      public:
>        constexpr explicit
> --
> 2.49.0
>
>

Reply via email to