On Tue, 27 Jan 2026 at 07:29, Jay Feldblum <[email protected]> wrote:
>
> Only some operations on std::shared_ptr and std::weak_ptr must
> synchronize with each other, namely, destructions and calls to member
> reset(). In the atomic implementation, synchronization is accomplished
> with release/acquire memory-ordering operations on refcount decrements.
> Copies, involving refcount increments, do not synchronize.
>
> On TSO architectures like x86-64, there is little observable difference.
> But on architectures with weaker memory models like aarch64, the
> compiler may emit faster instruction sequences for atomic increments
> without memory ordering than with.

Thanks for working on this. I think it's a useful optimization,
although it's too late to make the cut for GCC 16 now.
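
For the archive, the aarch64 point in the description is easy to see
with a standalone sketch (plain std::atomic rather than the __gnu_cxx
helpers; the ldadd/ldaddal detail is from memory of the LSE code
generation, so worth double-checking against a current compiler):

  #include <atomic>

  // Relaxed increment: with LSE this can become a bare "ldadd",
  // which does not order surrounding loads and stores.
  void add_ref_relaxed(std::atomic<int>& c)
  { c.fetch_add(1, std::memory_order_relaxed); }

  // Acq_rel increment: becomes "ldaddal" (or an ldaxr/stlxr loop
  // without LSE), which acts as an ordering point for the core.
  void add_ref_acq_rel(std::atomic<int>& c)
  { c.fetch_add(1, std::memory_order_acq_rel); }

On x86-64 both forms compile to the same lock-prefixed instruction,
which is the TSO point made in the description.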

>
>         PR libstdc++/111589
>
> libstdc++-v3/ChangeLog:
>
>         PR libstdc++/111589
>         * include/bits/shared_ptr_base.h (_Sp_counted_base::_M_add_ref_copy):
>         Use __exchange_and_add_relaxed_dispatch instead of
>         __exchange_and_add_dispatch.
>         (_Sp_counted_base::_M_weak_add_ref): Likewise.
>         (_Sp_counted_base::_M_add_ref_lock_nothrow): Use __ATOMIC_RELAXED
>         instead of __ATOMIC_ACQ_REL for the success ordering.
>         * include/ext/atomicity.h (__exchange_and_add_relaxed): New function.
>         (__atomic_add_relaxed): New function.
>         (__exchange_and_add_relaxed_dispatch): New function.
>         (__atomic_add_relaxed_dispatch): New function.
> ---
>  libstdc++-v3/include/bits/shared_ptr_base.h | 11 ++++--
>  libstdc++-v3/include/ext/atomicity.h        | 43 +++++++++++++++++++++
>  2 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/shared_ptr_base.h b/libstdc++-v3/include/bits/shared_ptr_base.h
> index 88e0f4d58c6..1aa0d4faa85 100644
> --- a/libstdc++-v3/include/bits/shared_ptr_base.h
> +++ b/libstdc++-v3/include/bits/shared_ptr_base.h
> @@ -148,7 +148,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        // Increment the use count (used when the count is greater than zero).
>        void
>        _M_add_ref_copy()
> -      { _S_chk(__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, 1)); }
> +      {
> +       _S_chk(__gnu_cxx::__exchange_and_add_relaxed_dispatch(&_M_use_count,
> +                                                             1));
> +      }
>
>        // Increment the use count if it is non-zero, throw otherwise.
>        void
> @@ -204,7 +207,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>         // _M_weak_count can always use negative values because it cannot be
>         // observed by users (unlike _M_use_count). See _S_chk for details.
>         constexpr _Atomic_word __max = -1;
> -       if (__gnu_cxx::__exchange_and_add_dispatch(&_M_weak_count, 1) == __max
> +       auto __r
> +         = __gnu_cxx::__exchange_and_add_relaxed_dispatch(&_M_weak_count, 1);
> +       if (__r == __max)
>           [[__unlikely__]] __builtin_trap();
>        }
>
> @@ -358,7 +363,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           // long as it's not changed meanwhile.
>         }
>        while (!__atomic_compare_exchange_n(&_M_use_count, &__count, __count + 1,
> -                                         true, __ATOMIC_ACQ_REL,
> +                                         true, __ATOMIC_RELAXED,

Can this really be relaxed?

I guess it already has no ordering guarantees for the case where we
return false, and a successful increment of a non-zero count is
equivalent to _M_add_ref_copy(), so relaxed must be fine. OK, I'm
happy.
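
To spell out that reasoning in plain std::atomic terms (my own sketch
of the invariant, not the actual _Sp_counted_base code): the only
synchronization the count has to provide is that whoever performs the
final decrement sees everything the other owners wrote, and that is
carried entirely by the decrement.

  #include <atomic>

  struct counted
  {
    std::atomic<int> use_count{1};

    // Copying an owner publishes nothing, so the increment needs no
    // ordering. The CAS in _M_add_ref_lock_nothrow only ever moves a
    // non-zero count upwards, so it is the same case.
    void add_ref()
    { use_count.fetch_add(1, std::memory_order_relaxed); }

    // Dropping an owner is what synchronizes: release publishes this
    // owner's writes, and the acquire on the final decrement makes
    // them visible before the object is destroyed.
    void release()
    {
      if (use_count.fetch_sub(1, std::memory_order_acq_rel) == 1)
        delete this;
    }
  };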


>                                           __ATOMIC_RELAXED));
>        _S_chk(__count);
>        return true;
> diff --git a/libstdc++-v3/include/ext/atomicity.h b/libstdc++-v3/include/ext/atomicity.h
> index ecbe376a687..6ecda307990 100644
> --- a/libstdc++-v3/include/ext/atomicity.h
> +++ b/libstdc++-v3/include/ext/atomicity.h
> @@ -74,12 +74,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>    __attribute__((__always_inline__))
>    __atomic_add(volatile _Atomic_word* __mem, int __val)
>    { __atomic_fetch_add(__mem, __val, __ATOMIC_ACQ_REL); }
> +
> +  inline _Atomic_word
> +  __attribute__((__always_inline__))
> +  __exchange_and_add_relaxed(volatile _Atomic_word* __mem, int __val)
> +  { return __atomic_fetch_add(__mem, __val, __ATOMIC_RELAXED); }
> +
> +  inline void
> +  __attribute__((__always_inline__))
> +  __atomic_add_relaxed(volatile _Atomic_word* __mem, int __val)
> +  { __atomic_fetch_add(__mem, __val, __ATOMIC_RELAXED); }

Do we need __atomic_add_relaxed? You don't seem to use it above.

>  #else // Defined in config/cpu/.../atomicity.h
>    _Atomic_word
>    __exchange_and_add(volatile _Atomic_word*, int) _GLIBCXX_NOTHROW;
>
>    void
>    __atomic_add(volatile _Atomic_word*, int) _GLIBCXX_NOTHROW;
> +
> +  // Fallback to acq_rel when builtins are not available.
> +  inline _Atomic_word
> +  __attribute__((__always_inline__))
> +  __exchange_and_add_relaxed(volatile _Atomic_word* __mem, int __val)
> +  { return __exchange_and_add(__mem, __val); }
> +
> +  inline void
> +  __attribute__((__always_inline__))
> +  __atomic_add_relaxed(volatile _Atomic_word* __mem, int __val)
> +  { __atomic_add(__mem, __val); }
>  #endif
>
>  #if __cplusplus < 201103L
> @@ -154,6 +175,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        __atomic_add(__mem, __val);
>    }
>
> +  // Relaxed variants for reference count increments where acquire/release
> +  // semantics are not required. Decrements still need stronger ordering.
> +  inline _Atomic_word
> +  __attribute__ ((__always_inline__))
> +  __exchange_and_add_relaxed_dispatch(_Atomic_word* __mem, int __val)
> +  {
> +    if (__is_single_threaded())
> +      return __exchange_and_add_single(__mem, __val);
> +    else
> +      return __exchange_and_add_relaxed(__mem, __val);
> +  }

I wonder if we really need the new dispatch function, or should just
use __exchange_and_add_relaxed directly in the shared_ptr code.

The point of the dispatch functions is to use unsynchronized
operations when we know the program is single-threaded, to avoid the
additional overhead of atomics with acq_rel ordering. But for some
targets, a relaxed atomic is almost as cheap as the non-atomic op
anyway. At what point does the cost of the __is_single_threaded()
check outweigh the cost of a relaxed RMW? The check certainly
increases code size.

It depends on the processor; for some arches I expect it's worth
keeping the dispatch, but maybe not for all of them.
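
To make the trade-off concrete, the two shapes are roughly the
following (my own sketch against the existing <ext/atomicity.h>
helpers; the use_count_inc_* names are made up for illustration):

  #include <ext/atomicity.h>

  // Dispatched: a check and branch on every increment, but a plain
  // non-atomic add when the program is single-threaded.
  inline _Atomic_word
  use_count_inc_dispatched(_Atomic_word* __mem)
  {
    if (__gnu_cxx::__is_single_threaded())
      return __gnu_cxx::__exchange_and_add_single(__mem, 1);
    return __atomic_fetch_add(__mem, 1, __ATOMIC_RELAXED);
  }

  // Direct: branch-free and smaller, but always pays for the atomic
  // RMW even in single-threaded programs.
  inline _Atomic_word
  use_count_inc_direct(_Atomic_word* __mem)
  { return __atomic_fetch_add(__mem, 1, __ATOMIC_RELAXED); }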


> +
> +  inline void
> +  __attribute__ ((__always_inline__))
> +  __atomic_add_relaxed_dispatch(_Atomic_word* __mem, int __val)
> +  {
> +    if (__is_single_threaded())
> +      __atomic_add_single(__mem, __val);
> +    else
> +      __atomic_add_relaxed(__mem, __val);
> +  }
> +
>  _GLIBCXX_END_NAMESPACE_VERSION
>  } // namespace
>
> --
> 2.47.3
>
