Re: [PATCH 0/3] Uncontroversial improvements to C++20 wait-related implementation

2021-03-23 Thread Thiago Macieira via Gcc-patches
On Tuesday, 23 March 2021 08:39:43 PDT Thomas Rodgers wrote:
> I will be submitting a new patch for the
> atomic.wait/barrier/latch/semaphore functionality a bit later today that
> subsumes the changes to atomic_wait and latch, and includes the changes
> to barrier.

Thanks, Thomas

Is that meant to be part of GCC 11's release?

If not, what do we do about preventing the future BC break and potential 
heisenbugs?

 1) do nothing, accept they will happen silently
 2) cause non-silent BC breaks
 3) disable the code for now (unless explicitly opted-in)

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





[PATCH 0/3] Uncontroversial improvements to C++20 wait-related implementation

2021-03-22 Thread Thiago Macieira via Gcc-patches
> Discussion at:
> https://gcc.gnu.org/pipermail/libstdc++/2021-February/052043.html
> 
> This patch set includes the uncontroversial parts that improve
> performance but don't otherwise change ABI.
> 
> Please note we still need to decide on how to deal with the future ABI
> break.
> 
> Thiago Macieira (3):
>   Atomic __platform_wait: accept any 32-bit type, not just int
>   std::latch: reduce internal implementation from ptrdiff_t to int
>   barrier: optimise by not having the hasher in a loop
>  
>  libstdc++-v3/include/bits/atomic_wait.h |  7 ---
>  libstdc++-v3/include/std/barrier| 10 +-
>  libstdc++-v3/include/std/latch  |  4 ++--
>  3 files changed, 11 insertions(+), 10 deletions(-)

ping?
-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





[PATCH 2/3] std::latch: reduce internal implementation from ptrdiff_t to int

2021-03-05 Thread Thiago Macieira via Gcc-patches
ints can be used as futex on Linux. ptrdiff_t on 64-bit Linux can't.

libstdc++-v3/ChangeLog:

* include/std/latch: Use int instead of ptrdiff_t
---
 libstdc++-v3/include/std/latch | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/std/latch b/libstdc++-v3/include/std/latch
index ef8c301e5e9..f3f73360618 100644
--- a/libstdc++-v3/include/std/latch
+++ b/libstdc++-v3/include/std/latch
@@ -48,7 +48,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   public:
 static constexpr ptrdiff_t
 max() noexcept
-{ return __gnu_cxx::__int_traits::__max; }
+{ return __gnu_cxx::__int_traits::__max; }
 
 constexpr explicit latch(ptrdiff_t __expected) noexcept
   : _M_a(__expected) { }
@@ -85,7 +85,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 }
 
   private:
-alignas(__alignof__(ptrdiff_t)) ptrdiff_t _M_a;
+alignas(4) int _M_a;   // kernel requires 4-byte alignment
   };
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
-- 
2.30.1



[PATCH 3/3] barrier: optimise by not having the hasher in a loop

2021-03-05 Thread Thiago Macieira via Gcc-patches
Our thread's ID does not change so we don't have to get it every time
and hash it every time.

libstdc++-v3/ChangeLog:

* include/std/barrier(arrive): move hasher one level up in the
  stack.
---
 libstdc++-v3/include/std/barrier | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libstdc++-v3/include/std/barrier b/libstdc++-v3/include/std/barrier
index e09212dfcb9..4bb5642c164 100644
--- a/libstdc++-v3/include/std/barrier
+++ b/libstdc++-v3/include/std/barrier
@@ -94,7 +94,7 @@ It looks different from literature pseudocode for two main 
reasons:
   alignas(__phase_alignment) __barrier_phase_t  _M_phase;
 
   bool
-  _M_arrive(__barrier_phase_t __old_phase)
+  _M_arrive(__barrier_phase_t __old_phase, size_t __current)
   {
const auto __old_phase_val = static_cast(__old_phase);
const auto __half_step =
@@ -103,9 +103,7 @@ It looks different from literature pseudocode for two main 
reasons:
   static_cast<__barrier_phase_t>(__old_phase_val + 2);
 
size_t __current_expected = _M_expected;
-   std::hash __hasher;
-   size_t __current = __hasher(std::this_thread::get_id())
- % ((_M_expected + 1) >> 1);
+__current %= ((__current_expected + 1) >> 1);
 
for (int __round = 0; ; ++__round)
  {
@@ -163,12 +161,14 @@ It looks different from literature pseudocode for two 
main reasons:
   [[nodiscard]] arrival_token
   arrive(ptrdiff_t __update)
   {
+   std::hash __hasher;
+   size_t __current = __hasher(std::this_thread::get_id());
__atomic_phase_ref_t __phase(_M_phase);
const auto __old_phase = __phase.load(memory_order_relaxed);
const auto __cur = static_cast(__old_phase);
for(; __update; --__update)
  {
-   if(_M_arrive(__old_phase))
+   if(_M_arrive(__old_phase, __current))
  {
_M_completion();
_M_expected += 
_M_expected_adjustment.load(memory_order_relaxed);
-- 
2.30.1



[PATCH 0/3] Uncontroversial improvements to C++20 wait-related implementation

2021-03-05 Thread Thiago Macieira via Gcc-patches
Discussion at:
https://gcc.gnu.org/pipermail/libstdc++/2021-February/052043.html

This patch set includes the uncontroversial parts that improve
performance but don't otherwise change ABI.

Please note we still need to decide on how to deal with the future ABI
break.

Thiago Macieira (3):
  Atomic __platform_wait: accept any 32-bit type, not just int
  std::latch: reduce internal implementation from ptrdiff_t to int
  barrier: optimise by not having the hasher in a loop

 libstdc++-v3/include/bits/atomic_wait.h |  7 ---
 libstdc++-v3/include/std/barrier| 10 +-
 libstdc++-v3/include/std/latch  |  4 ++--
 3 files changed, 11 insertions(+), 10 deletions(-)

-- 
2.30.1



[PATCH 1/3] Atomic __platform_wait: accept any 32-bit type, not just int

2021-03-05 Thread Thiago Macieira via Gcc-patches
The kernel doesn't care what we store in those 32 bits, only that they are
comparable. This commit adds:
 * pointers and long on 32-bit architectures
 * unsigned
 * untyped enums and typed enums on int & unsigned int
 * float

We're not using FUTEX_OP anywhere today. The kernel reserves 4 bits for
this field but has only used 6 values so far, so it can be extended to
unsigned compares in the future, if needed.

libstdc++-v3/ChangeLog:

* include/bits/atomic_wait.h: Update __platform_wait_uses_type
---
 libstdc++-v3/include/bits/atomic_wait.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_wait.h 
b/libstdc++-v3/include/bits/atomic_wait.h
index 424fccbe4c5..4b4573df691 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -65,7 +65,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 template
   inline constexpr bool __platform_wait_uses_type
 #ifdef _GLIBCXX_HAVE_LINUX_FUTEX
-   = is_same_v, __platform_wait_t>;
+   = sizeof(_Tp) == sizeof(__platform_wait_t) &&
+ alignof(_Tp) >= alignof(__platform_wait_t);
 #else
= false;
 #endif
@@ -91,13 +92,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 template
   void
-  __platform_wait(const _Tp* __addr, __platform_wait_t __val) noexcept
+  __platform_wait(const _Tp* __addr, _Tp __val) noexcept
   {
for(;;)
  {
auto __e = syscall (SYS_futex, static_cast(__addr),
  
static_cast(__futex_wait_flags::__wait_private),
-   __val, nullptr);
+ static_cast<__platform_wait_t>(__val), 
nullptr);
if (!__e || errno == EAGAIN)
  break;
else if (errno != EINTR)
-- 
2.30.1



Re: [PATCH 2/5] Atomic __platform_wait: accept any 32-bit type, not just int

2021-03-03 Thread Thiago Macieira via Gcc-patches
On Wednesday, 3 March 2021 08:21:51 PST Jonathan Wakely wrote:
> >>-   = is_same_v, __platform_wait_t>;
> >>+   = is_scalar_v> && sizeof(_Tp) ==
> >>sizeof(__platform_wait_t)
> Oh, except that is_scalar is surprisingly expensive to instantiate
> (its defined in a really expensive way) and since we control all uses
> of this constant, I don't think we need it. It's only ever used from
> atomic waiting functions which are only defined for scalar types.

Thanks. Will update and re-submit.

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 1/5] std::latch: reduce internal implementation from ptrdiff_t to int

2021-03-03 Thread Thiago Macieira via Gcc-patches
On Wednesday, 3 March 2021 06:34:26 PST Jonathan Wakely wrote:
> On 26/02/21 07:59 -0800, Thiago Macieira via Libstdc++ wrote:
> >ints can be used as futex on Linux. ptrdiff_t on 64-bit Linux can't.
> 
> Yes, we should do this for GCC 11.

Want me to re-submit this one alone, with the "alignas(4)" with a commend 
indicating it's what the kernel requires?

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Monday, 1 March 2021 14:31:09 PST Ville Voutilainen wrote:
> On Tue, 2 Mar 2021 at 00:21, Thiago Macieira  
wrote:
> > But the code I posted, if people are careful to use write like I did,
> > would
> > allow us to have the experimental "we're not sure this is right"
> > implementation of atomic waits, latches, barriers and semaphores right
> > now.
> 
> The code assumes that as soon as __cplusplus bumps and a header is
> present, things
> are stable.

Not exactly. Re-posting the code for reference:

#if __cplusplus >= 202002L && __has_include()
#  include 
#endif
#if __cpp_lib_latch < 201907L
#  error "Please upgrade your Standard Library"
#endif

The first section simply assumes that  can be #included. The 
__cplusplus check is necessary because MSVC STL's headers are present but 
can't be #include'd otherwise (they cause #error).

It's the second check that is authoritative.

> I don't think that's a safe assumption to make.
> Furthermore, the second  #if
> tries to check a feature-testing macro without including either the
> corresponding header
> or . That doesn't work. You need to either include 
> and check a macro,
> or check that  exists, then include  and then check the macro.

 would work, but why can't you check the macro anyway? It may trigger 
a warning with GCC's -Wundef, but it's just a warning. The preprocessor is 
defined to replace any undefined identifier with 0. If you want to avoid the 
warning, you could write:

#if defined(__cpp_lib_latch) && __cpp_lib_latch < 201907L

Is there anything I'm not seeing?

> But other than that, sure, as QoI, vendors could already provide the
> standard macros with
> numbers that are lower than the standard ever specified. Going
> forward, if existing facilities
> are changed, this stops working because now you'd have to track the
> WPs to see which
> values are "vendor-bogus". I find it better to just change the macros
> whose facilities changed
> during a standard cycle, and in those cases bump the IS macro, that'll
> then work going forward.
> In the meanwhile, when vendors can, they could use the technique you
> describe, but that's
> barely worth proposing as an SG10 guideline because they would've
> needed to do it already, but didn't. :P

I am not opposed to that. Your solution is better.

But this solution is less work on the standard and still works, even if it 
creates a little more work on the users of said features. It's not 
unsurmountable, since we often need to check which C++ standard edition it 
came with anyway. So it doesn't matter what value it assumes: we'll be 
consulting a table anyway.

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Monday, 1 March 2021 14:04:34 PST Ville Voutilainen wrote:
> Well, this would be different. What I'm suggesting is not quite that;
> for any *new* facility, we'd make sure
> that its draft macro and the final IS macro are different, but the
> minimum value is the first draft version,
> not anything below it. I don't care too much, that approach and yours
> would work the same way. Things that already
> had an IS value for a macro and haven't changed since don't need to be
> changed. And we don't
> need to bump all values of existing facilities either, just for those
> that got changes, so some existing macros
> would be considered lost causes. Like the ones we're talking about,
> because the cats are already out of the
> bag.

But the code I posted, if people are careful to use write like I did, would 
allow us to have the experimental "we're not sure this is right" 
implementation of atomic waits, latches, barriers and semaphores right now.

It would simply require that we decrement the macros by 1 in the libstdc++ 
headers.

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Monday, 1 March 2021 12:35:05 PST Ville Voutilainen wrote:
> I can't make the above code work, in any reasonable manner, because
> it's doing feature detection incorrectly. :)
> What I can imagine doing, however, is this:
> 
> 1) a published IS always bumps feature-macro values (this won't help
> now, it's a future fix, we don't currently do this in WG21)

This is mostly already the case. I haven't seen any need to bump the values.

> 2) an implementation uses the pre-IS draft values before it deems itself
> stable 

Before the IS is stable? From what I've seen so far, the macros are always 
around the month of the latest draft of a given feature. So if an 
implementation implemented an earlier draft, the macro version would increase 
in a new draft coming before the IS.

I think we just need to be careful of updates done after the latest draft is 
adopted, usually by NBs. Those have to bump the macro too. I think you're in 
good speaking terms with the Finland NB :-)

> 3) users who want stability should require the feature-testing macro
> to have at least the IS value

True. That's not the stability I was talking about, but it could be done.

> 4) adventurous users can either use the macro without checking its
> value, or use at least the value that gives them
> whatever shiny toy they're interested in

Fair enough.

Please note I am not talking about the stability of the feature as described 
in the standard or a proposal or draft. I am talking about the stability of 
the implementation. C++20 is out and the atomic-wait feature is stable, as 
defined by the standard. What isn't stable is GCC's implementation of it.

But your suggestion does work. We don't need to apply them to all macros, only 
those that are new in a given version, like __cpp_lib_atomic_wait or 
__cpp_lib_latch in this case. Alternatively, implementations can set the macro 
to a given value below the standard-required one (incl. 1) to indicate that 
they haven't achieved stability.

That would make my check:

#if __cplusplus >= 202002L && __has_include()
#  include 
#endif
#if __cpp_lib_latch < 201907L
#  error "Please upgrade your Standard Library"
#endif

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Monday, 1 March 2021 10:12:35 PST Ville Voutilainen wrote:
> I do have a question about the intent/concern here, regardless of what
> your patch technically
> does. The ABI break _is_ your concern, and the "heisenbugs" you were
> worried about would
> in fact be caused by the ABI break? So if you embed these things in
> your QAtomicThing, the ABI
> break may mess it up(*)? Is that a correct understanding of the concern
> here?

Let me clarify. I am stating that the current implementation is inefficient 
because Linux currently only implements 32-bit futexes. With that in mind, I 
am arguing we need to change the implementation in a way that will break ABI 
in a new release.

The concern is that such a change, despite being in experimental code for 
which ABI stability has never been promised, is still troublesome. The problem 
there is that even people who have read the documentation and waited until 
2024 to write code using the feature may still be affected. This isn't about 
Qt because I have no plans to use wait and notify in inline API.

But you can see someone doing:

#if __cplusplus >= 202002L && __has_include()
#  include 
#else
#  error "Please upgrade your compiler & standard library"
#endif

and using  in their inline code. And as you say, if they then mix DSOs 
compiled with different versions of GCC, one of them might be the old, 
experimental and binary-incompatible code. Remember that before April 2024, 
the most recent Ubuntu LTS will be 22.04, which will be released before GCC 
12, which means it will contain GCC 11.

So, wholly aside from how we fix the inefficiencies, we must decide how to 
deal with the ABI break. We can:

a) not have an ABI break in the first place, by having the above code not 
   compile with GCC until we promise ABI compatibility
b) cause a non-silent ABI break
c) accept the silent ABI break

I advocate for (a) and vehemently disagree with (c). Meanwhile, (b) can be 
something like (untested!):

 _GLIBCXX_BEGIN_NAMESPACE_VERSION
   namespace __detail
   {
+struct __waiters;
+inline namespace __experimental
+{
+  // from atomic_wait.cc
+  __waiters &__get_waiter(const void *);
+}
 using __platform_wait_t = int;
 
 constexpr auto __atomic_spin_count_1 = 16;
@@ -187,11 +193,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static __waiters&
   _S_for(const void* __t)
   {
-   const unsigned char __mask = 0xf;
-   static __waiters __w[__mask + 1];
-
-   auto __key = _Hash_impl::hash(__t) & __mask;
-   return __w[__key];
+   return __get_waiter(__t);
   }
 };

That way, any DSO compiled using GCC 11 will fail to load when using GCC 12's 
libstdc++. And probably put "std::__detail::__experimental" in the 
GLIBCXX_EXPERIMENTAL ELF version so it's even clearer that it's experimental, 
thus helping Linux distributions and other binary artefact distributors 
realise they've made a mistake.

I still don't like (b) because it pushes the problem to the wrong people: the 
packagers. But it's far better than (c).

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Monday, 1 March 2021 09:38:58 PST Thomas Rodgers wrote:
> > And _M_phase, despite being non-atomic, is never accessed without the
> > atomic_ref, aside from the constructor. Both arrive() and wait() start
> > off by
> > creating the atomic_ref.
> 
> If it's non-atomic, then how is wait() supposed to wait on it,
> atomically?

Hey, it's your code :-)

It is using atomics to operate on the value. It's just that the type on the 
structure isn't an atomic by itself.

Why, I don't know. Olivier's original code did use atomics 
:
std::atomic expected_adjustment;
std::atomic<__phase_t> phase;


> And I am not disagreeing with that. I am, however saying, that I know
> this particular implementation (well the upstream one it is based on)
> has been extensively tested by the original author (ogiroux) including
> time on Summit. If we are going to start changing his design decisions
> (beyond the largely cosmetic, not algorithmic, ones that I have made as
> per Jonathan's request), they should be motivated by more than a 'well
> we feel int's would be better here because Futex' justification.

That's a reasonable request.

I'd like to see the benchmark results of using a directly-futexable type vs 
using unsigned char. But even the timing results need to be weighed against 
the increased memory use for std::barrier, which means it's not a directly-
objective conclusion. And given we *may* get 8-bit futexes, it might be worth 
keeping them this way and just tell people with large contentions to upgrade.

That of course rests on the contended atomic_wait being out-of-line.

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-03-01 Thread Thiago Macieira via Gcc-patches
On Sunday, 28 February 2021 07:05:47 PST Hans-Peter Nilsson wrote:
> On Fri, 26 Feb 2021, Thiago Macieira via Gcc-patches wrote:
> > ints can be used in futexes. chars can't.
> 
> Shouldn't that be an atomic type instead of a bare int then?

There are a couple of atomic_refs:

  using __atomic_phase_ref_t = std::__atomic_ref<__barrier_phase_t>;
  using __atomic_phase_const_ref_t = std::__atomic_ref;

And _M_phase, despite being non-atomic, is never accessed without the 
atomic_ref, aside from the constructor. Both arrive() and wait() start off by 
creating the atomic_ref.

But I confess I don't understand this code sufficiently to say it is correct. 
I'm simply saying that waiting on unsigned chars will not use a futex, at 
least until https://lkml.org/lkml/2019/12/4/1373 is merged into the kernel.

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 1/5] std::latch: reduce internal implementation from ptrdiff_t to int

2021-02-26 Thread Thiago Macieira via Gcc-patches
On Friday, 26 February 2021 11:31:00 PST Andreas Schwab wrote:
> On Feb 26 2021, Thiago Macieira wrote:
> > On Friday, 26 February 2021 10:14:42 PST Andreas Schwab wrote:
> >> On Feb 26 2021, Thiago Macieira via Gcc-patches wrote:
> >> > -alignas(__alignof__(ptrdiff_t)) ptrdiff_t _M_a;
> >> > +alignas(__alignof__(int)) int _M_a;
> >> 
> >> Futexes must be aligned to 4 bytes.
> > 
> > Agreed, but doesn't this accomplish that?
> 
> No.  It uses whatever alignment the type already has, and is an
> elaborate no-op.

I thought so too when I read the original line. But I expected it was written 
like that for a reason, especially since the same pattern appears in other 
places.

I can change to "alignas(4)" (which is a GCC extension, I believe). Is that 
the correct solution?


-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





Re: [PATCH 1/5] std::latch: reduce internal implementation from ptrdiff_t to int

2021-02-26 Thread Thiago Macieira via Gcc-patches
On Friday, 26 February 2021 10:14:42 PST Andreas Schwab wrote:
> On Feb 26 2021, Thiago Macieira via Gcc-patches wrote:
> > @@ -85,7 +85,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > }
> > 
> > private:
> > -alignas(__alignof__(ptrdiff_t)) ptrdiff_t _M_a;
> > +alignas(__alignof__(int)) int _M_a;
> 
> Futexes must be aligned to 4 bytes.

Agreed, but doesn't this accomplish that?

-- 
Thiago Macieira - thiago.macieira (AT) intel.com
  Software Architect - Intel DPG Cloud Engineering





[PATCH 5/5] barrier: optimise by not having the hasher in a loop

2021-02-26 Thread Thiago Macieira via Gcc-patches
Our thread's ID does not change so we don't have to get it every time
and hash it every time.
---
 libstdc++-v3/include/std/barrier | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libstdc++-v3/include/std/barrier b/libstdc++-v3/include/std/barrier
index ae058bd3dc3..eb31a89b175 100644
--- a/libstdc++-v3/include/std/barrier
+++ b/libstdc++-v3/include/std/barrier
@@ -101,12 +101,10 @@ It looks different from literature pseudocode for two 
main reasons:
   }
 
   bool
-  _M_arrive(__barrier_phase_t __old_phase)
+  _M_arrive(__barrier_phase_t __old_phase, size_t __current)
   {
size_t __current_expected = _M_expected;
-   std::hash __hasher;
-   size_t __current = __hasher(std::this_thread::get_id())
- % ((_M_expected + 1) >> 1);
+__current %= ((__current_expected + 1) >> 1);
 
const auto __half_step = _S_add_to_phase(__old_phase, 1);
const auto __full_step = _S_add_to_phase(__old_phase, 2);
@@ -167,11 +165,13 @@ It looks different from literature pseudocode for two 
main reasons:
   [[nodiscard]] arrival_token
   arrive(ptrdiff_t __update)
   {
+   std::hash __hasher;
+   size_t __current = __hasher(std::this_thread::get_id());
__atomic_phase_ref_t __phase(_M_phase);
const auto __old_phase = __phase.load(memory_order_relaxed);
for(; __update; --__update)
  {
-   if(_M_arrive(__old_phase))
+   if(_M_arrive(__old_phase, __current))
  {
_M_completion();
_M_expected += 
_M_expected_adjustment.load(memory_order_relaxed);
-- 
2.30.1



[PATCH 4/5] barrier: use int instead of unsigned char for the phase state

2021-02-26 Thread Thiago Macieira via Gcc-patches
ints can be used in futexes. chars can't.
---
 libstdc++-v3/include/std/barrier | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libstdc++-v3/include/std/barrier b/libstdc++-v3/include/std/barrier
index e09212dfcb9..ae058bd3dc3 100644
--- a/libstdc++-v3/include/std/barrier
+++ b/libstdc++-v3/include/std/barrier
@@ -70,7 +70,7 @@ It looks different from literature pseudocode for two main 
reasons:
 
 */
 
-  enum class __barrier_phase_t : unsigned char { };
+  enum class __barrier_phase_t : int { };
 
   template
 class __tree_barrier
@@ -93,20 +93,24 @@ It looks different from literature pseudocode for two main 
reasons:
 
   alignas(__phase_alignment) __barrier_phase_t  _M_phase;
 
+  static __barrier_phase_t
+  _S_add_to_phase(__barrier_phase_t __phase, unsigned char __n)
+  {
+__n += static_cast(__phase);
+return static_cast<__barrier_phase_t>(__n);
+  }
+
   bool
   _M_arrive(__barrier_phase_t __old_phase)
   {
-   const auto __old_phase_val = static_cast(__old_phase);
-   const auto __half_step =
-  static_cast<__barrier_phase_t>(__old_phase_val + 1);
-   const auto __full_step =
-  static_cast<__barrier_phase_t>(__old_phase_val + 2);
-
size_t __current_expected = _M_expected;
std::hash __hasher;
size_t __current = __hasher(std::this_thread::get_id())
  % ((_M_expected + 1) >> 1);
 
+   const auto __half_step = _S_add_to_phase(__old_phase, 1);
+   const auto __full_step = _S_add_to_phase(__old_phase, 2);
+
for (int __round = 0; ; ++__round)
  {
if (__current_expected <= 1)
@@ -165,7 +169,6 @@ It looks different from literature pseudocode for two main 
reasons:
   {
__atomic_phase_ref_t __phase(_M_phase);
const auto __old_phase = __phase.load(memory_order_relaxed);
-   const auto __cur = static_cast(__old_phase);
for(; __update; --__update)
  {
if(_M_arrive(__old_phase))
@@ -173,7 +176,7 @@ It looks different from literature pseudocode for two main 
reasons:
_M_completion();
_M_expected += 
_M_expected_adjustment.load(memory_order_relaxed);
_M_expected_adjustment.store(0, memory_order_relaxed);
-   auto __new_phase = static_cast<__barrier_phase_t>(__cur + 2);
+   auto __new_phase = _S_add_to_phase(__old_phase, 2);
__phase.store(__new_phase, memory_order_release);
__phase.notify_all();
  }
-- 
2.30.1



[PATCH 3/5] std::__atomic_wait: don't use __detail::__waiter with futex

2021-02-26 Thread Thiago Macieira via Gcc-patches
That violates "don't pay for what you don't need" rule of C++. Users of
std::atomic::wait will often have some bit in their atomic indicate
whether the value is contended or not, so we don't need libstdc++ to do
double book-keeping for us.
---
 libstdc++-v3/include/bits/atomic_wait.h | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_wait.h 
b/libstdc++-v3/include/bits/atomic_wait.h
index 1c6bda2e2b6..4d240f44faf 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -258,14 +258,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   if (std::__atomic_spin(__pred))
return;
 
-  __waiter __w(__addr);
-  while (!__pred())
+  if constexpr (__platform_wait_uses_type<_Tp>)
{
- if constexpr (__platform_wait_uses_type<_Tp>)
-   {
- __platform_wait(__addr, __old);
-   }
- else
+ __platform_wait(__addr, __old);
+   }
+  else
+   {
+ __waiter __w(__addr);
+ while (!__pred())
{
  // TODO support timed backoff when this can be moved into the lib
  __w._M_do_wait();
@@ -274,13 +274,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 }
 
   template
-void
+inline void
 __atomic_notify(const _Tp* __addr, bool __all) noexcept
 {
   using namespace __detail;
-  auto& __w = __waiters::_S_for((void*)__addr);
-  if (!__w._M_waiting())
-   return;
 
 #ifdef _GLIBCXX_HAVE_LINUX_FUTEX
   if constexpr (__platform_wait_uses_type<_Tp>)
@@ -290,6 +287,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   else
 #endif
{
+ auto& __w = __waiters::_S_for((void*)__addr);
+ if (!__w._M_waiting())
+   return;
  __w._M_notify(__all);
}
 }
-- 
2.30.1



[PATCH 2/5] Atomic __platform_wait: accept any 32-bit type, not just int

2021-02-26 Thread Thiago Macieira via Gcc-patches
The kernel doesn't care what we store in those 32 bits, only that they are
comparable. This commit adds:
 * pointers and long on 32-bit architectures
 * unsigned
 * untyped enums and typed enums on int & unsigned int
 * float

We're not using FUTEX_OP anywhere today. The kernel reserves 4 bits for
this field but has only used 6 values so far, so it can be extended to
unsigned compares in the future, if needed.
---
 libstdc++-v3/include/bits/atomic_wait.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_wait.h 
b/libstdc++-v3/include/bits/atomic_wait.h
index 424fccbe4c5..1c6bda2e2b6 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -65,7 +65,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 template
   inline constexpr bool __platform_wait_uses_type
 #ifdef _GLIBCXX_HAVE_LINUX_FUTEX
-   = is_same_v, __platform_wait_t>;
+   = is_scalar_v> && sizeof(_Tp) == 
sizeof(__platform_wait_t)
+ && alignof(_Tp) >= alignof(__platform_wait_t);
 #else
= false;
 #endif
@@ -91,13 +92,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 template
   void
-  __platform_wait(const _Tp* __addr, __platform_wait_t __val) noexcept
+  __platform_wait(const _Tp* __addr, _Tp __val) noexcept
   {
for(;;)
  {
auto __e = syscall (SYS_futex, static_cast(__addr),
  
static_cast(__futex_wait_flags::__wait_private),
-   __val, nullptr);
+ static_cast<__platform_wait_t>(__val), 
nullptr);
if (!__e || errno == EAGAIN)
  break;
else if (errno != EINTR)
-- 
2.30.1



[PATCH 1/5] std::latch: reduce internal implementation from ptrdiff_t to int

2021-02-26 Thread Thiago Macieira via Gcc-patches
ints can be used as futex on Linux. ptrdiff_t on 64-bit Linux can't.
---
 libstdc++-v3/include/std/latch | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/std/latch b/libstdc++-v3/include/std/latch
index ef8c301e5e9..156aea5c5e5 100644
--- a/libstdc++-v3/include/std/latch
+++ b/libstdc++-v3/include/std/latch
@@ -48,7 +48,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   public:
 static constexpr ptrdiff_t
 max() noexcept
-{ return __gnu_cxx::__int_traits::__max; }
+{ return __gnu_cxx::__int_traits::__max; }
 
 constexpr explicit latch(ptrdiff_t __expected) noexcept
   : _M_a(__expected) { }
@@ -85,7 +85,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 }
 
   private:
-alignas(__alignof__(ptrdiff_t)) ptrdiff_t _M_a;
+alignas(__alignof__(int)) int _M_a;
   };
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
-- 
2.30.1