Rename __semaphore_base to __semaphore_impl, because it's not used as a
base class. Replace the three identical lambda expressions with a named
class, __semaphore_impl::_Can_acquire, which stores the most recent
value of the counter as a data member.

Add a new __platform_semaphore_impl class template which uses
__platform_wait_t for the counter and uses more efficient atomic waits
for the acquire functions, when __platform_wait is available. For a
binary semaphore some members are further optimized because we know the
counter can only be zero or one.

Also add a bare wait flag to __atomic_wait_address_v, for consistency
with __atomic_wait_address_until_v and __atomic_wait_address_for_v and
to allow semaphores to use it without the redundant overhead of tracking
waiters.

libstdc++-v3/ChangeLog:

        * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare
        wait flag.
        * include/bits/semaphore_base.h (__semaphore_base): Rename to
        __semaphore_impl. Replace local variable and predicate lambda
        with _Can_acquire struct.
        (__platform_semaphore_impl): New class template.
        (__semaphore_impl): Remove alias template.
        * include/std/semaphore:
---

This replaces the two patches:

[PATCH 3/4] libstdc++: Optimize std::binary_semaphore
[PATCH 4/4] libstdc++: Optimize std::counting_semaphore for futex path

The problem with those patches was that the "optimized" implementations
were used if _Max <= numeric_limits<__platform_wait_t>::max(), which is
always true for many non-futex targets because we do:

# if ATOMIC_LONG_LOCK_FREE == 2
    using __platform_wait_t = unsigned long;
# else
    using __platform_wait_t = unsigned int;
# endif

The earlier patches should have been checking either
_GLIBCXX_HAVE_PLATFORM_WAIT or the __platform_wait_uses_type variable
template to see if an efficient (i.e. not condition_variable-based)
implementation is actually supported. It's not useful to check if the
semaphore's maximum value fits in the __platform_wait_t type if that
type doesn't have fast wait/wake operations like a futex.

The new proposal in this patch only selects the optimized
implementations if they can be implemented using __platform_wait.  The
__semaphore_impl always uses the portable atomic wait API (using an
accessor function and predicate). As Tomasz suggested, a new class
template provides the optimized implementation for both
counting_semaphore and binary_semaphore, differing only in
_M_get_current and _M_try_acquire.

Tested x86_64-linux and sparc-solaris.

 libstdc++-v3/include/bits/atomic_wait.h    |   6 +-
 libstdc++-v3/include/bits/semaphore_base.h | 241 +++++++++++++++------
 libstdc++-v3/include/std/semaphore         |   5 +-
 3 files changed, 186 insertions(+), 66 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_wait.h 
b/libstdc++-v3/include/bits/atomic_wait.h
index 815726c16ccb..d32fda1c2435 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -249,12 +249,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // C++26 will return __val
     }
 
+  // Wait on __addr while *__addr == __old is true.
+  // Cannot be used for proxy waits.
   inline void
   __atomic_wait_address_v(const __detail::__platform_wait_t* __addr,
                          __detail::__platform_wait_t __old,
-                         int __order)
+                         int __order, bool __bare_wait = false)
   {
-    __detail::__wait_args __args{ __addr, __old, __order };
+    __detail::__wait_args __args{ __addr, __old, __order, __bare_wait };
     // C++26 will not ignore the return value here
     __detail::__wait_impl(__addr, __args);
   }
diff --git a/libstdc++-v3/include/bits/semaphore_base.h 
b/libstdc++-v3/include/bits/semaphore_base.h
index 3f7a33ccd51a..a1311e8690c7 100644
--- a/libstdc++-v3/include/bits/semaphore_base.h
+++ b/libstdc++-v3/include/bits/semaphore_base.h
@@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<bool _Platform_wait>
-  struct __semaphore_base
+  struct __semaphore_impl
   {
-    using __count_type = __conditional_t<_Platform_wait,
-                                        __detail::__platform_wait_t,
-                                        ptrdiff_t>;
+    using __count_type = ptrdiff_t;
 
     static constexpr ptrdiff_t _S_max
       = __gnu_cxx::__int_traits<__count_type>::__max;
 
     constexpr explicit
-    __semaphore_base(__count_type __count) noexcept
+    __semaphore_impl(__count_type __count) noexcept
     : _M_counter(__count)
     { }
 
-    __semaphore_base(const __semaphore_base&) = delete;
-    __semaphore_base& operator=(const __semaphore_base&) = delete;
+    __semaphore_impl(const __semaphore_impl&) = delete;
+    __semaphore_impl& operator=(const __semaphore_impl&) = delete;
 
     // Load the current counter value.
     _GLIBCXX_ALWAYS_INLINE __count_type
@@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
     // Try to acquire the semaphore (i.e. decrement the counter).
     // Returns false if the current counter is zero, or if another thread
-    // decrements the value first. In the latter case, __cur is set to the
-    // new value.
+    // changes the value first. In the latter case, __cur is set to the new
+    // value.
     _GLIBCXX_ALWAYS_INLINE bool
     _M_do_try_acquire(__count_type& __cur) noexcept
     {
@@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                                                    memory_order::relaxed);
     }
 
+    // Keep trying to acquire the semaphore in a loop until it succeeds.
     void
     _M_acquire() noexcept
     {
-      auto const __vfn = [this]{ return _M_get_current(); };
-      auto __val = __vfn();
-      auto const __pred = [&__val](__count_type __cur) {
-       if (__cur > 0)
-         {
-           __val = __cur;
-           return true;
-         }
-       return false;
-      };
-      while (!_M_do_try_acquire(__val))
-       if (__val == 0)
+      auto __vfn = [this]{ return _M_get_current(); };
+      _Can_acquire __pred{__vfn()};
+      while (!_M_do_try_acquire(__pred._M_val))
+       if (__pred._M_val == 0)
          std::__atomic_wait_address(&_M_counter, __pred, __vfn, true);
     }
 
+    // Try to acquire the semaphore, retrying a small number of times
+    // in case of contention.
     bool
     _M_try_acquire() noexcept
     {
@@ -116,23 +108,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       bool
       _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& 
__atime) noexcept
       {
-       auto const __vfn = [this]{ return _M_get_current(); };
-       auto __val = __vfn();
-       auto const __pred = [&__val](__count_type __cur) {
-         if (__cur > 0)
-           {
-             __val = __cur;
-             return true;
-           }
-         return false;
-       };
-       while (!_M_do_try_acquire(__val))
-         if (__val == 0)
-           {
-             if (!std::__atomic_wait_address_until(&_M_counter, __pred,
-                                                   __vfn, __atime, true))
-               return false; // timed out
-           }
+       auto __vfn = [this]{ return _M_get_current(); };
+       _Can_acquire __pred{__vfn()};
+       while (!_M_do_try_acquire(__pred._M_val))
+         if (__pred._M_val == 0)
+           if (!std::__atomic_wait_address_until(&_M_counter, __pred,
+                                                 __vfn, __atime, true))
+             return false; // timed out
        return true;
       }
 
@@ -140,23 +122,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       bool
       _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) 
noexcept
       {
-       auto const __vfn = [this]{ return _M_get_current(); };
-       auto __val = __vfn();
-       auto const __pred = [&__val](__count_type __cur) {
-         if (__cur > 0)
-           {
-             __val = __cur;
-             return true;
-           }
-         return false;
-       };
-       while (!_M_do_try_acquire(__val))
-         if (__val == 0)
-           {
-             if (!std::__atomic_wait_address_for(&_M_counter, __pred,
-                                                 __vfn, __rtime, true))
-               return false; // timed out
-           }
+       auto __vfn = [this]{ return _M_get_current(); };
+       _Can_acquire __pred{__vfn()};
+       while (!_M_do_try_acquire(__pred._M_val))
+         if (__pred._M_val == 0)
+           if (!std::__atomic_wait_address_for(&_M_counter, __pred,
+                                               __vfn, __rtime, true))
+             return false; // timed out
        return true;
       }
 
@@ -171,14 +143,161 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     }
 
   private:
-    alignas(_Platform_wait ? __detail::__platform_wait_alignment
-                          : __alignof__(__count_type))
-    __count_type _M_counter;
+    struct _Can_acquire
+    {
+      __count_type _M_val;
+
+      bool operator()(__count_type __cur) noexcept
+      {
+       if (__cur == 0)
+         return false;
+       _M_val = __cur;
+       return true;
+      }
+    };
+
+    alignas(__atomic_ref<__count_type>::required_alignment)
+      __count_type _M_counter;
+  };
+
+  // Optimized specialization using __platform_wait (if available)
+  template<bool _Binary>
+  struct __platform_semaphore_impl
+  {
+    using __count_type = __detail::__platform_wait_t;
+
+    static constexpr ptrdiff_t _S_max
+      = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max;
+
+    constexpr explicit
+    __platform_semaphore_impl(__count_type __count) noexcept
+    : _M_counter(__count)
+    { }
+
+    __platform_semaphore_impl(__platform_semaphore_impl&) = delete;
+    __platform_semaphore_impl& operator=(const __platform_semaphore_impl&) = 
delete;
+
+    // Load the current counter value.
+    _GLIBCXX_ALWAYS_INLINE __count_type
+    _M_get_current() const noexcept
+    {
+      if constexpr (_Binary)
+       return 1; // Not necessarily true, but optimistically assume it is.
+      else
+       return __atomic_impl::load(&_M_counter, memory_order::acquire);
+    }
+
+    // Try to acquire the semaphore (i.e. decrement the counter).
+    // Returns false if the current counter is zero, or if another thread
+    // changes the value first. In the latter case, __cur is set to the new
+    // value.
+    _GLIBCXX_ALWAYS_INLINE bool
+    _M_do_try_acquire(__count_type& __cur) noexcept
+    {
+      if (__cur == 0)
+       return false; // Cannot decrement when it's already zero.
+
+      return __atomic_impl::compare_exchange_strong(&_M_counter,
+                                                   __cur, __cur - 1,
+                                                   memory_order::acquire,
+                                                   memory_order::relaxed);
+    }
+
+    // Keep trying to acquire the semaphore in a loop until it succeeds.
+    void
+    _M_acquire() noexcept
+    {
+      auto __val = _M_get_current();
+      while (!_M_do_try_acquire(__val))
+       if (__val == 0)
+         {
+           std::__atomic_wait_address_v(&_M_counter, __val, __ATOMIC_ACQUIRE,
+                                        true);
+           __val = _M_get_current();
+         }
+    }
+
+    // Try to acquire the semaphore.
+    bool
+    _M_try_acquire() noexcept
+    {
+      if constexpr (_Binary)
+       {
+         __count_type __val = 1;
+         // Do not expect much contention on binary semaphore, only try once.
+         return _M_do_try_acquire(__val);
+       }
+      else
+       // Fastest implementation of this function is just _M_do_try_acquire
+       // but that can fail under contention even when _M_count > 0.
+       // Using _M_try_acquire_for(0ns) will retry a few times in a loop.
+       return _M_try_acquire_for(__detail::__wait_clock_t::duration{});
+    }
+
+    template<typename _Clock, typename _Duration>
+      bool
+      _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& 
__atime) noexcept
+      {
+       auto __val = _M_get_current();
+       while (!_M_do_try_acquire(__val))
+         if (__val == 0)
+           {
+             if (!std::__atomic_wait_address_until_v(&_M_counter, 0,
+                                                     __ATOMIC_ACQUIRE,
+                                                     __atime, true))
+               return false; // timed out
+             __val = _M_get_current();
+           }
+       return true;
+      }
+
+    template<typename _Rep, typename _Period>
+      bool
+      _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) 
noexcept
+      {
+       auto __val = _M_get_current();
+       while (!_M_do_try_acquire(__val))
+         if (__val == 0)
+           {
+             if (!std::__atomic_wait_address_for_v(&_M_counter, 0,
+                                                   __ATOMIC_ACQUIRE,
+                                                   __rtime, true))
+               return false; // timed out
+             __val = _M_get_current();
+           }
+       return true;
+      }
+
+    _GLIBCXX_ALWAYS_INLINE ptrdiff_t
+    _M_release(ptrdiff_t __update) noexcept
+    {
+      auto __old = __atomic_impl::fetch_add(&_M_counter, __update,
+                                           memory_order::release);
+      if (__old == 0 && __update > 0)
+       __atomic_notify_address(&_M_counter, true, true);
+      return __old;
+    }
+
+  protected:
+    alignas(__detail::__platform_wait_alignment) __count_type _M_counter;
   };
 
   template<ptrdiff_t _Max>
-    using __semaphore_impl
-      = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>;
+    using _Select_semaphore_impl = typename decltype([]
+    {
+      using namespace __detail;
+      if constexpr (__platform_wait_uses_type<__platform_wait_t>)
+       {
+         if constexpr (_Max <= 1)
+           return type_identity<__platform_semaphore_impl<true>>{};
+         else if constexpr (_Max <= __platform_semaphore_impl<false>::_S_max)
+           return type_identity<__platform_semaphore_impl<false>>{};
+         else
+           return type_identity<__semaphore_impl>{};
+       }
+      else
+       return type_identity<__semaphore_impl>{};
+    }())::type;
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
diff --git a/libstdc++-v3/include/std/semaphore 
b/libstdc++-v3/include/std/semaphore
index ca1bffe371a0..8f49188563e8 100644
--- a/libstdc++-v3/include/std/semaphore
+++ b/libstdc++-v3/include/std/semaphore
@@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max>
+  template<ptrdiff_t __least_max_value = _Select_semaphore_impl<2>::_S_max>
     class counting_semaphore
     {
       static_assert(__least_max_value >= 0);
 
-      using _Impl = __semaphore_impl<__least_max_value>;
-      _Impl _M_sem;
+      _Select_semaphore_impl<__least_max_value> _M_sem;
 
     public:
       constexpr explicit
-- 
2.49.0

Reply via email to