On Wed, 26 Nov 2025 at 15:44, Jonathan Wakely <[email protected]> wrote:
>
> This defines __platform_wait, __platform_notify, and
> __platform_wait_until for FreeBSD, making use of the _umtx_op syscall.
>
> The Linux versions of those functions only support 32-bit integers, but
> the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
> as the _umtx_op syscall supports both.
>
> libstdc++-v3/ChangeLog:
>
>         PR libstdc++/120527
>         * include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
>         Define typedef.
>         [__FreeBSD__] (__platform_wait_uses_type): Define variable
>         template.
>         * src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
>         (__platform_wait, __platform_notify, __platform_wait_until):
>         Define.
> ---
>
> Tested x86_64-linux and x86_64-freebsd14.
>
> v3:
> - new wait_op function to determine the OP_WAIT constant from the obj_sz
>   parameter.
>
> I wrote a simple benchmark that calls notify in a loop or calls wait(1)
> in a loop (without blocking because the value isn't equal to 1), for
> 16/32/64-bit integers.
>
> Before this change (so that all types use a proxy wait and the proxy
> wait is done using a condition_variable):
>
> -------------------------------------------------------
> Benchmark             Time             CPU   Iterations
> -------------------------------------------------------
> BM_notify_16       28.8 ns         28.6 ns     24470204
> BM_notify_32       28.7 ns         28.6 ns     24460713
> BM_notify_64       28.8 ns         28.6 ns     24478476
> BM_wait_16         5.13 ns         5.11 ns    136500107
> BM_wait_32         5.12 ns         5.10 ns    136588487
> BM_wait_64         5.12 ns         5.10 ns    136646629
>
> After this change (so that 32-bit and 64-bit types don't use proxy
> waits, and the proxy wait for the 16-bit type uses the _umtx_op
> syscall on _M_wait_state->_M_ver instead of a condition_variable):
>
> -------------------------------------------------------
> Benchmark             Time             CPU   Iterations
> -------------------------------------------------------
> BM_notify_16       6.26 ns         6.23 ns    111360766
> BM_notify_32       2.87 ns         2.86 ns    243363965
> BM_notify_64       2.90 ns         2.89 ns    242195074
> BM_wait_16         4.92 ns         4.90 ns    137517425
> BM_wait_32         1.34 ns         1.33 ns    518316605
> BM_wait_64         1.34 ns         1.33 ns    517155683
>
> So all notify calls are faster now, and a no-op wait is faster for
> non-proxy types (because we no longer need a PLT call to
> _M_setup_proxy_wait; it's just inline in the headers).
>
> For a wait that actually needs to block I would expect the times to also
> improve due to not using a condvar, but that's harder to benchmark
> because it would need a second thread that's modifying the atomic and
> doing a notify_one() call, so that the wait(n) call doesn't block
> forever.
>
> I tried to check a wait that actually blocks by using a
> std::counting_semaphore and calling try_acquire_for(1ns), so that we
> block for 1ns in a loop and then time out. That shows a huge regression
> in performance when waiting on an unavailable semaphore:
>
> Before:
> BM_counting_sema        413 ns          411 ns      1701349
>
> After:
> BM_counting_sema       8198 ns         3616 ns       193253
>
> I don't know exactly where that cost comes from.
>
> For an uncontended semaphore where try_acquire doesn't need to block,
> there's no difference in performance before and after this patch (as
> expected).

Some additional benchmarks for x86_64-freebsd14:

Before this patch:
-----------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-----------------------------------------------------------------
BM_notify_one_16             28.7 ns         28.6 ns     24384953
BM_notify_one_32             28.8 ns         28.7 ns     24355548
BM_notify_one_64             28.7 ns         28.6 ns     24433491
BM_notify_all_16             28.8 ns         28.6 ns     24451212
BM_notify_all_32             28.7 ns         28.6 ns     24443313
BM_notify_all_64             28.7 ns         28.6 ns     24447805
BM_noop_wait_16              5.34 ns         5.32 ns    131032527
BM_noop_wait_32              5.17 ns         5.14 ns    136753729
BM_noop_wait_64              5.25 ns         5.23 ns    132361525
BM_wait_16                   81.3 ns         80.9 ns      7957712
BM_wait_32                   78.2 ns         77.8 ns      7781455
BM_wait_64                   87.6 ns         87.0 ns      8682610
BM_binary_sema_fail           413 ns          411 ns      1699271
BM_binary_sema_success       11.4 ns         11.4 ns     61460175
BM_counting_sema              410 ns          409 ns      1711535

After:
-----------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-----------------------------------------------------------------
BM_notify_one_16             6.24 ns         6.22 ns    111823939
BM_notify_one_32             2.89 ns         2.88 ns    241753104
BM_notify_one_64             2.90 ns         2.89 ns    242854084
BM_notify_all_16             6.23 ns         6.21 ns    112720780
BM_notify_all_32             2.23 ns         2.22 ns    315718398
BM_notify_all_64             2.26 ns         2.25 ns    311893736
BM_noop_wait_16              4.90 ns         4.88 ns    143253055
BM_noop_wait_32              3.12 ns         3.11 ns    224913762
BM_noop_wait_64              3.12 ns         3.10 ns    225542226
BM_wait_16                   62.7 ns         62.4 ns     10317760
BM_wait_32                   66.5 ns         66.2 ns      9522859
BM_wait_64                   69.7 ns         69.4 ns     10127298
BM_binary_sema_fail          8229 ns         3644 ns       191770
BM_binary_sema_success       12.0 ns         12.0 ns     58493426
BM_counting_sema             8234 ns         3635 ns       193129

"noop_wait" is the same test as the previous "wait" benchmarks, where
the wait returns immediately because the atomic object's value is
already not equal to the argument.

The "wait" ones in these tables are for actual waits, with a second
thread modifying the variable and calling notify_one to unblock the
wait. This shows a small but significant speedup for the new code.

binary_sema_fail calls try_acquire_for(1ns) on an unavailable
binary_semaphore, and counting_sema does the same for a
std::counting_semaphore<>.
binary_sema_success calls try_acquire_for(1ns) on an available
binary_semaphore, so there's no need to wait and time out.

I see comparable numbers for x86_64-linux on the same hardware (but on
the host OS, not in a VM):

-----------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-----------------------------------------------------------------
BM_notify_one_16             5.77 ns         5.77 ns    121394659
BM_notify_one_32             1.56 ns         1.55 ns    450840169
BM_notify_one_64             5.77 ns         5.77 ns    121397512
BM_notify_all_16             5.78 ns         5.77 ns    121390164
BM_notify_all_32             1.56 ns         1.55 ns    449443030
BM_notify_all_64             5.77 ns         5.77 ns    121379109
BM_noop_wait_16              4.46 ns         4.46 ns    157774493
BM_noop_wait_32              2.66 ns         2.66 ns    263003179
BM_noop_wait_64              4.45 ns         4.45 ns    157778198
BM_wait_16                   72.4 ns         72.3 ns      9995380
BM_wait_32                   76.6 ns         76.5 ns      9300501
BM_wait_64                   69.1 ns         69.0 ns      9111327
BM_binary_sema_fail         61880 ns         8483 ns        82735
BM_binary_sema_success       12.0 ns         12.0 ns     56591123
BM_counting_sema            62034 ns         8541 ns        82855

The contended semaphore figures are even worse on Linux, and as
expected the 64-bit types perform the same as the 16-bit types
(because the Linux futex syscall only supports 32-bit words).
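
Here is roughly what the underlying futex calls look like, to show why
anything wider than 32 bits has to use a proxy wait on Linux (again just
a sketch, not the libstdc++ code; the futex_wait/futex_wake_all names
are illustrative):

// The futex word the kernel waits on is always a 32-bit int.
#include <linux/futex.h>   // FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE
#include <sys/syscall.h>   // SYS_futex
#include <unistd.h>        // syscall
#include <climits>

// Sleep while *addr still equals old; woken by a FUTEX_WAKE (or spuriously).
inline void futex_wait(const int* addr, int old)
{ ::syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, old, nullptr, nullptr, 0); }

// Wake every waiter blocked on addr.
inline void futex_wake_all(const int* addr)
{ ::syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); }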

Comparing this to GCC 15 (on the same linux host):

-----------------------------------------------------------------
Benchmark                       Time             CPU   Iterations
-----------------------------------------------------------------
BM_notify_one_16             4.01 ns         3.99 ns    175259586
BM_notify_one_32            0.223 ns        0.222 ns   1000000000
BM_notify_one_64             4.02 ns         4.01 ns    175245594
BM_notify_all_16             4.00 ns         3.99 ns    175335113
BM_notify_all_32            0.444 ns        0.444 ns   1000000000
BM_notify_all_64             4.00 ns         4.00 ns    175336331
BM_noop_wait_16              12.0 ns         12.0 ns     58435328
BM_noop_wait_32              11.8 ns         11.8 ns     59546873
BM_noop_wait_64              12.0 ns         12.0 ns     58442306
BM_wait_16                    643 ns          641 ns      1086782
BM_wait_32                    666 ns          665 ns      1059993
BM_wait_64                    655 ns          654 ns      1029185
BM_binary_sema_fail          2764 ns         2761 ns       253847
BM_binary_sema_success       10.7 ns         10.6 ns     65728548
BM_counting_sema             2766 ns         2763 ns       256073

So a non-proxy notify was much faster with GCC 15, because everything
was inline, but with trunk it has to call __notify_impl in the shared
library. But the non-inline call is still only a few nanoseconds, so
that's acceptable.

The wait tests (both the noop and contended ones) are considerably
better on trunk.

But the contended semaphores are really bad on trunk. I'll have to
investigate that.

Benchmark code attached, using Google benchmark.
// -I ~/src/benchmark/include/ -L ~/src/benchmark/obj/src/ -lbenchmark -pthread -O2
#include "benchmark/benchmark.h"
#include <atomic>
#include <semaphore>
#include <chrono>
#include <future>
#include <stop_token>

std::atomic<short> a16{0};
std::atomic<int> a32{0};
std::atomic<long> a64{0};

// call atomic::notify_one()

template<class A>
void do_notify_one(A& a, benchmark::State& state) {
  for (auto _ : state)
    a.notify_one();
}

void BM_notify_one_16(benchmark::State& state) {
  do_notify_one(a16, state);
}
BENCHMARK(BM_notify_one_16);

void BM_notify_one_32(benchmark::State& state) {
  do_notify_one(a32, state);
}
BENCHMARK(BM_notify_one_32);

void BM_notify_one_64(benchmark::State& state) {
  do_notify_one(a64, state);
}
BENCHMARK(BM_notify_one_64);

// call atomic::notify_all()

template<class A>
void do_notify_all(A& a, benchmark::State& state) {
  for (auto _ : state)
    a.notify_all();
}

void BM_notify_all_16(benchmark::State& state) {
  do_notify_all(a16, state);
}
BENCHMARK(BM_notify_all_16);

void BM_notify_all_32(benchmark::State& state) {
  do_notify_all(a32, state);
}
BENCHMARK(BM_notify_all_32);

void BM_notify_all_64(benchmark::State& state) {
  do_notify_all(a64, state);
}
BENCHMARK(BM_notify_all_64);


// call atomic::wait(1) when it returns immediately

template<class A>
void do_noop_wait(A& a, benchmark::State& state) {
  for (auto _ : state)
    a.wait(1);
}

void BM_noop_wait_16(benchmark::State& state) {
  do_noop_wait(a16, state);
}
BENCHMARK(BM_noop_wait_16);

void BM_noop_wait_32(benchmark::State& state) {
  do_noop_wait(a32, state);
}
BENCHMARK(BM_noop_wait_32);

void BM_noop_wait_64(benchmark::State& state) {
  do_noop_wait(a64, state);
}
BENCHMARK(BM_noop_wait_64);

// call atomic::wait(1) when it might have to wait for another thread to set it

template<class A>
void do_wait(A& a, benchmark::State& state) {
  std::stop_source stop;
  auto fut = std::async(std::launch::async, [&] (std::stop_token tok) {
    while (!tok.stop_requested())
      for (int i = 0; i < 1000; ++i) // so we don't check stop_token too often
      {
        a = 0;
        a.notify_one();
      }
  }, stop.get_token());

  for (auto _ : state)
  {
    a = 1;
    a.wait(1);
  }

  stop.request_stop();
}

void BM_wait_16(benchmark::State& state) {
  do_wait(a16, state);
}
BENCHMARK(BM_wait_16);

void BM_wait_32(benchmark::State& state) {
  do_wait(a32, state);
}
BENCHMARK(BM_wait_32);

void BM_wait_64(benchmark::State& state) {
  do_wait(a64, state);
}
BENCHMARK(BM_wait_64);

std::binary_semaphore bsem(0);

void BM_binary_sema_fail(benchmark::State& state) {
  using namespace std::chrono;
  for (auto _ : state)
  {
    benchmark::DoNotOptimize(bsem.try_acquire_for(1ns));
  }
}
BENCHMARK(BM_binary_sema_fail);

void BM_binary_sema_success(benchmark::State& state) {
  using namespace std::chrono;
  for (auto _ : state)
  {
    // Release before each acquire so the semaphore is available and the
    // acquire never has to block. (An extra release before the loop would
    // push the binary_semaphore past its maximum count of 1.)
    bsem.release();
    benchmark::DoNotOptimize(bsem.try_acquire_for(1ns));
  }
}
BENCHMARK(BM_binary_sema_success);


std::counting_semaphore<> csem(0);

void BM_counting_sema(benchmark::State& state) {
  using namespace std::chrono;
  for (auto _ : state)
  {
    benchmark::DoNotOptimize(csem.try_acquire_for(1ns));
  }
}
BENCHMARK(BM_counting_sema);

BENCHMARK_MAIN();
