On Mon, Jan 26, 2026 at 12:53 PM Tomasz Kamiński <[email protected]>
wrote:

> This patch implements the changes suggested in PL007 NB comment
> for C++26. This is separated into two patches:
> * PATCH 1/2 - introduces range_slice type, without modifying
>   strided_slices. This allows comparison of behavior before
>   and after changes.
> * PATCH 2/2 - modifies the meaning of strided_slice::extents,
>   and expands support for decomposable types
>
> Using a simple benchmark (attached) that selects a subset of elements
of extents<int, 10> with a specific stride, we get the following results, where:
* range - use range_slice
* strided_old - use strided_slice before the patch (extents = 10)
* strided_new - use strided_slice after the patch (extents = 1 + 9 / stride)
The dN suffix indicates a stride with a dynamic value, and sN a stride with
a static value.

With the patch series:
---------------------------------------------------------------
bench_range_d1             1.52 ns         1.51 ns    449490447
bench_range_s1             1.01 ns         1.01 ns    696388065
bench_range_d3             3.03 ns         3.02 ns    230834059
bench_range_s3             1.01 ns         1.00 ns    695836227
bench_strided_new_d1       1.51 ns         1.51 ns    459057841
bench_strided_new_s1       1.02 ns         1.02 ns    697311175
bench_strided_new_d3       1.52 ns         1.51 ns    462984578
bench_strided_new_s3       1.01 ns         1.01 ns    695589296

The trunk results:
---------------------------------------------------------------
bench_range_d1             3.03 ns         3.03 ns    228128195
bench_range_s1             1.01 ns         1.01 ns    697117732
bench_range_d3             3.03 ns         3.02 ns    232435749
bench_range_s3             1.01 ns         1.01 ns    692646429
bench_strided_old_d1       3.03 ns         3.02 ns    231937322
bench_strided_old_s1       1.01 ns         1.01 ns    674428159
bench_strided_old_d3       3.03 ns         3.02 ns    231232866
bench_strided_old_s3       1.01 ns         1.01 ns    696073131

As expected, eliminating the need for division for strided_new_dN
speeds up the submdspan computation. The speed-up of bench_range_d1
is the result of this patch series' choice to avoid division when
stride == 1.
#include <benchmark/benchmark.h>
#include <mdspan>
#include <sstream>

// ~/gcc/16/bin/g++ benchmark.cpp  -std=c++26 -lbenchmark -O2 -DUSE_PROXY -o a.proxy

// Zero-initialized backing storage for the benchmarked mdspans; the benchmark
// functions in this file view it through std::extents<int, 10>.
int elems[100] {};

// Benchmarks std::submdspan on a one-dimensional mdspan of static extent 10,
// using a slice specifier brace-initialized as Range{0, 10, stride}.
//
// Range is a class template with three type parameters; its arguments are
// deduced from the initializer.  `stride` is passed through unchanged, so the
// caller decides whether it is a plain integer (run-time value) or a
// std::cw<N> constant (compile-time value).
template<template<class, class, class> class Range>
void bench_generic(benchmark::State& s, auto stride)
{
  std::mdspan<int, std::extents<int, 10>> md(elems);
  // Range-based iteration is the form Google Benchmark recommends over
  // KeepRunning(): it has lower per-iteration overhead, which matters when
  // the measured body only takes about a nanosecond.
  for (auto _ : s)
  {
    auto smd = std::submdspan(md, Range{0, 10, stride});
    // Make the result observable so the slicing work is not optimized away.
    benchmark::DoNotOptimize(smd);
  }
}

// Runs the generic submdspan benchmark with std::range_slice as the slice
// specifier template, forwarding the given stride.
void bench_range(benchmark::State& s, auto stride)
{
  bench_generic<std::range_slice>(s, stride);
}

// range_slice benchmark with a run-time (plain int) stride of 1.
void bench_range_d1(benchmark::State& s)
{
  const int stride = 1;
  bench_range(s, stride);
}
BENCHMARK(bench_range_d1);

// range_slice benchmark with a compile-time (std::cw) stride of 1.
void bench_range_s1(benchmark::State& s)
{
  constexpr auto stride = std::cw<1>;
  bench_range(s, stride);
}
BENCHMARK(bench_range_s1);

// range_slice benchmark with a run-time (plain int) stride of 3.
void bench_range_d3(benchmark::State& s)
{
  const int stride = 3;
  bench_range(s, stride);
}
BENCHMARK(bench_range_d3);

// range_slice benchmark with a compile-time (std::cw) stride of 3.
void bench_range_s3(benchmark::State& s)
{
  constexpr auto stride = std::cw<3>;
  bench_range(s, stride);
}
BENCHMARK(bench_range_s3);

#if __cpp_lib_submdspan > 202411L
// Benchmarks std::submdspan on a one-dimensional mdspan of static extent 10,
// using std::strided_slice{0, extent, stride} as the slice specifier.
//
// The extent is computed once, before the timed loop, as 1 + 9 / stride, and
// is always passed as a run-time int.  `stride` may be a plain integer
// (run-time value) or a std::cw<N> constant (compile-time value).
void bench_strided_new(benchmark::State& s, auto stride)
{
  std::mdspan<int, std::extents<int, 10>> md(elems);
  int extent = 1 + (9 / stride);
  // Range-based iteration is the form Google Benchmark recommends over
  // KeepRunning(): it has lower per-iteration overhead, which matters when
  // the measured body only takes about a nanosecond.
  for (auto _ : s)
  {
    auto smd = std::submdspan(md, std::strided_slice{0, extent, stride});
    // Make the result observable so the slicing work is not optimized away.
    benchmark::DoNotOptimize(smd);
  }
}

// strided_slice benchmark (bench_strided_new) with a run-time stride of 1.
void bench_strided_new_d1(benchmark::State& s)
{
  const int stride = 1;
  bench_strided_new(s, stride);
}
BENCHMARK(bench_strided_new_d1);

// strided_slice benchmark (bench_strided_new) with a compile-time stride of 1.
void bench_strided_new_s1(benchmark::State& s)
{
  constexpr auto stride = std::cw<1>;
  bench_strided_new(s, stride);
}
BENCHMARK(bench_strided_new_s1);

// strided_slice benchmark (bench_strided_new) with a run-time stride of 3.
void bench_strided_new_d3(benchmark::State& s)
{
  const int stride = 3;
  bench_strided_new(s, stride);
}
BENCHMARK(bench_strided_new_d3);

// strided_slice benchmark (bench_strided_new) with a compile-time stride of 3.
void bench_strided_new_s3(benchmark::State& s)
{
  constexpr auto stride = std::cw<3>;
  bench_strided_new(s, stride);
}
BENCHMARK(bench_strided_new_s3);
#else
// Runs the generic submdspan benchmark with std::strided_slice as the slice
// specifier template, forwarding the given stride.
void bench_strided_old(benchmark::State& s, auto stride)
{
  bench_generic<std::strided_slice>(s, stride);
}

// strided_slice benchmark (bench_strided_old) with a run-time stride of 1.
void bench_strided_old_d1(benchmark::State& s)
{
  const int stride = 1;
  bench_strided_old(s, stride);
}
BENCHMARK(bench_strided_old_d1);

// strided_slice benchmark (bench_strided_old) with a compile-time stride of 1.
void bench_strided_old_s1(benchmark::State& s)
{
  constexpr auto stride = std::cw<1>;
  bench_strided_old(s, stride);
}
BENCHMARK(bench_strided_old_s1);

// strided_slice benchmark (bench_strided_old) with a run-time stride of 3.
void bench_strided_old_d3(benchmark::State& s)
{
  const int stride = 3;
  bench_strided_old(s, stride);
}
BENCHMARK(bench_strided_old_d3);

// strided_slice benchmark (bench_strided_old) with a compile-time stride of 3.
void bench_strided_old_s3(benchmark::State& s)
{
  constexpr auto stride = std::cw<3>;
  bench_strided_old(s, stride);
}
BENCHMARK(bench_strided_old_s3);
#endif


// Google Benchmark macro that supplies the program's entry point and runs the
// benchmarks registered with BENCHMARK().
BENCHMARK_MAIN();

Reply via email to