On Mon, Jan 26, 2026 at 12:53 PM Tomasz Kamiński <[email protected]>
wrote:
> This patch implements the changes suggested in the PL007 NB comment
> for C++26. It is split into two patches:
> * PATCH 1/2 - introduces the range_slice type, without modifying
> strided_slice. This allows comparison of behavior before
> and after the changes.
> * PATCH 2/2 - modifies the meaning of strided_slice::extent,
> and expands support for decomposable types
>
> Using a simple benchmark (attached) that selects a subset of the elements
of extents<int, 10> with a specific stride, we get the following results, where:
* range - uses range_slice
* strided_old - uses strided_slice before the patch (extent = 10)
* strided_new - uses strided_slice after the patch (extent = 1 + 9 / stride)
dN indicates a stride with a dynamic (run-time) value, and sN a stride with
a static (compile-time) value.
With the patch series:
---------------------------------------------------------------
bench_range_d1 1.52 ns 1.51 ns 449490447
bench_range_s1 1.01 ns 1.01 ns 696388065
bench_range_d3 3.03 ns 3.02 ns 230834059
bench_range_s3 1.01 ns 1.00 ns 695836227
bench_strided_new_d1 1.51 ns 1.51 ns 459057841
bench_strided_new_s1 1.02 ns 1.02 ns 697311175
bench_strided_new_d3 1.52 ns 1.51 ns 462984578
bench_strided_new_s3 1.01 ns 1.01 ns 695589296
The trunk results:
---------------------------------------------------------------
bench_range_d1 3.03 ns 3.03 ns 228128195
bench_range_s1 1.01 ns 1.01 ns 697117732
bench_range_d3 3.03 ns 3.02 ns 232435749
bench_range_s3 1.01 ns 1.01 ns 692646429
bench_strided_old_d1 3.03 ns 3.02 ns 231937322
bench_strided_old_s1 1.01 ns 1.01 ns 674428159
bench_strided_old_d3 3.03 ns 3.02 ns 231232866
bench_strided_old_s3 1.01 ns 1.01 ns 696073131
As expected, eliminating the need for division in strided_new_dN
speeds up the submdspan computation. The speed-up of bench_range_d1
between the two runs is the result of this patch series' choice to avoid
the division when stride == 1.
#include <benchmark/benchmark.h>
#include <mdspan>
#include <sstream>
// ~/gcc/16/bin/g++ benchmark.cpp -std=c++26 -lbenchmark -O2 -DUSE_PROXY -o a.proxy
// Zero-initialized backing storage for the benchmarked mdspan. The
// benchmarks view it through std::extents<int, 10>, so only the first
// 10 of the 100 elements are addressed.
int elems[100] {};
// Times std::submdspan on a rank-1, 10-element mdspan over `elems`, using
// the slice Range{0, 10, stride}.
//
// Range  - slice class template taking three type parameters; its template
//          arguments are deduced from the initializer {0, 10, stride}.
// s      - Google Benchmark state that drives the timing loop.
// stride - stride of the slice; callers pass either a plain int or a
//          std::cw<N> constant.
template<template<class, class, class> class Range>
void bench_generic(benchmark::State& s, auto stride)
{
  std::mdspan<int, std::extents<int, 10>> md(elems);
  // The range-based for loop is the form Google Benchmark recommends: it has
  // lower per-iteration overhead than calling KeepRunning() each iteration.
  for (auto _ : s)
  {
    auto smd = std::submdspan(md, Range{0, 10, stride});
    // Keep the result observable so the computation is not optimized away.
    benchmark::DoNotOptimize(smd);
  }
}
// Runs the generic submdspan benchmark with std::range_slice as the slice
// template, forwarding the benchmark state and the stride unchanged.
void bench_range(benchmark::State& s, auto stride)
{
  bench_generic<std::range_slice>(s, stride);
}
void bench_range_d1(benchmark::State& s) {
bench_range(s, 1);
}
BENCHMARK(bench_range_d1);
// range_slice, static stride 1: the stride is passed as the constant
// std::cw<1> instead of a run-time int.
void bench_range_s1(benchmark::State& state)
{
  bench_range(state, std::cw<1>);
}
BENCHMARK(bench_range_s1);
void bench_range_d3(benchmark::State& s) {
bench_range(s, 3);
}
BENCHMARK(bench_range_d3);
// range_slice, static stride 3: the stride is passed as the constant
// std::cw<3> instead of a run-time int.
void bench_range_s3(benchmark::State& state)
{
  bench_range(state, std::cw<3>);
}
BENCHMARK(bench_range_s3);
#if __cpp_lib_submdspan > 202411L
// Times std::submdspan on a rank-1, 10-element mdspan over `elems`, using
// std::strided_slice{0, extent, stride} with extent = 1 + 9 / stride.
//
// s      - Google Benchmark state that drives the timing loop.
// stride - stride of the slice; callers pass either a plain int or a
//          std::cw<N> constant.
void bench_strided_new(benchmark::State& s, auto stride)
{
  std::mdspan<int, std::extents<int, 10>> md(elems);
  // Number of indices 0, stride, 2*stride, ... that are below 10. It is
  // computed once, outside the timed loop, and passed as a run-time int.
  int extent = 1 + (9 / stride);
  // The range-based for loop is the form Google Benchmark recommends: it has
  // lower per-iteration overhead than calling KeepRunning() each iteration.
  for (auto _ : s)
  {
    auto smd = std::submdspan(md, std::strided_slice{0, extent, stride});
    // Keep the result observable so the computation is not optimized away.
    benchmark::DoNotOptimize(smd);
  }
}
void bench_strided_new_d1(benchmark::State& s) {
bench_strided_new(s, 1);
}
BENCHMARK(bench_strided_new_d1);
// strided_slice (new extent meaning), static stride 1: the stride is
// passed as the constant std::cw<1> instead of a run-time int.
void bench_strided_new_s1(benchmark::State& state)
{
  bench_strided_new(state, std::cw<1>);
}
BENCHMARK(bench_strided_new_s1);
void bench_strided_new_d3(benchmark::State& s) {
bench_strided_new(s, 3);
}
BENCHMARK(bench_strided_new_d3);
// strided_slice (new extent meaning), static stride 3: the stride is
// passed as the constant std::cw<3> instead of a run-time int.
void bench_strided_new_s3(benchmark::State& state)
{
  bench_strided_new(state, std::cw<3>);
}
BENCHMARK(bench_strided_new_s3);
#else
// Runs the generic submdspan benchmark with std::strided_slice as the slice
// template, i.e. with the slice {0, 10, stride}. Only compiled when
// __cpp_lib_submdspan is not greater than 202411L.
void bench_strided_old(benchmark::State& s, auto stride)
{
  bench_generic<std::strided_slice>(s, stride);
}
void bench_strided_old_d1(benchmark::State& s) {
bench_strided_old(s, 1);
}
BENCHMARK(bench_strided_old_d1);
// strided_slice (old extent meaning), static stride 1: the stride is
// passed as the constant std::cw<1> instead of a run-time int.
void bench_strided_old_s1(benchmark::State& state)
{
  bench_strided_old(state, std::cw<1>);
}
BENCHMARK(bench_strided_old_s1);
void bench_strided_old_d3(benchmark::State& s) {
bench_strided_old(s, 3);
}
BENCHMARK(bench_strided_old_d3);
// strided_slice (old extent meaning), static stride 3: the stride is
// passed as the constant std::cw<3> instead of a run-time int.
void bench_strided_old_s3(benchmark::State& state)
{
  bench_strided_old(state, std::cw<3>);
}
BENCHMARK(bench_strided_old_s3);
#endif
// Google Benchmark's entry-point macro: supplies the program's main(),
// which runs the benchmarks registered above with BENCHMARK().
BENCHMARK_MAIN();