https://gcc.gnu.org/g:db563993b69bab00e40ebed567aa7422d77ed016

commit r16-3313-gdb563993b69bab00e40ebed567aa7422d77ed016
Author: Luc Grosheintz <luc.groshei...@gmail.com>
Date:   Sun Aug 3 22:57:25 2025 +0200

    libstdc++: Improve low-rank layout_{left,right}::stride.
    
    The methods layout_{left,right}::mapping::stride are defined
    as
    
      \prod_{i = 0}^r E[i]
      \prod_{i = r+1}^n E[i]
    
    This is computed as the product of a precomputed static product and the
    product of the required dynamic extents.
    
    Disassembly shows that even for low-rank extents, i.e. rank == 1 and
    rank == 2, with at least one dynamic extent, the generated code loads
    two values; and then runs the loop over at most one element, e.g. for
    stride_left_d5 defined below the generated code is:
    
     220:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
     227:  00
     228:  31 d2                  xor    edx,edx
     22a:  48 85 c0               test   rax,rax
     22d:  74 23                  je     252 <stride_left_d5+0x32>
     22f:  48 8b 0c f5 00 00 00   mov    rcx,QWORD PTR [rsi*8+0x0]
     236:  00
     237:  48 c1 e1 02            shl    rcx,0x2
     23b:  74 13                  je     250 <stride_left_d5+0x30>
     23d:  48 01 f9               add    rcx,rdi
     240:  48 63 17               movsxd rdx,DWORD PTR [rdi]
     243:  48 83 c7 04            add    rdi,0x4
     247:  48 0f af c2            imul   rax,rdx
     24b:  48 39 f9               cmp    rcx,rdi
     24e:  75 f0                  jne    240 <stride_left_d5+0x20>
     250:  89 c2                  mov    edx,eax
     252:  89 d0                  mov    eax,edx
     254:  c3                     ret
    
    If there's no dynamic extents, it simply loads the precomputed product
    of static extents.
    
    For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1 or
    extents.extent(k), with k == 0 for layout_left and k == 1 for
    layout_right.
    
    Consider,
    
      using Ed = std::extents<int, dyn>;
      int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
      { return m.stride(r); }
    
      using E3d = std::extents<int, 3, dyn>;
      int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
      { return m.stride(r); }
    
      using Ed5 = std::extents<int, dyn, 5>;
      int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
      { return m.stride(r); }
    
    The optimized code for these three cases is:
    
      0000000000000060 <stride_left_d>:
      60:  b8 01 00 00 00         mov    eax,0x1
      65:  c3                     ret
    
      0000000000000090 <stride_left_3d>:
      90:  48 83 fe 01            cmp    rsi,0x1
      94:  19 c0                  sbb    eax,eax
      96:  83 e0 fe               and    eax,0xfffffffe
      99:  83 c0 03               add    eax,0x3
      9c:  c3                     ret
    
      00000000000000a0 <stride_left_d5>:
      a0:  b8 01 00 00 00         mov    eax,0x1
      a5:  48 85 f6               test   rsi,rsi
      a8:  74 02                  je     ac <stride_left_d5+0xc>
      aa:  8b 07                  mov    eax,DWORD PTR [rdi]
      ac:  c3                     ret
    
    For rank == 1 it simply returns 1 (as expected). For rank == 2, it
    either implements a branchless formula, or conditionally loads one
    value. In all cases involving a dynamic extent this seems like it's
    always doing clearly less work, both in terms of computation and loads.
    In cases not involving a dynamic extent, it replaces loading one value
    with a branchless sequence of four instructions.
    
    This commit also refactors __size to no use any of the precomputed
    arrays. This prevents instantiating __{fwd,rev}_partial_prods for
    low-rank extents. This results in a further size reduction of a
    reference object file (described two commits prior) by 9% from 46.0kB to
    41.9kB.
    
    In a prior commit we optimized __size to produce better object code by
    precomputing the static products. This refactor enables the optimizer to
    generate the same optimized code.
    
    libstdc++-v3/ChangeLog:
    
            * include/std/mdspan (__mdspan::__fwd_prod): Optimize
            for rank <= 2.
            (__mdspan::__rev_prod): Ditto.
            (__mdspan::__size): Refactor to use a pre-computed product, not
            a partial product.
    
    Reviewed-by: Tomasz KamiƄski <tkami...@redhat.com>
    Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>

Diff:
---
 libstdc++-v3/include/std/mdspan | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index dd53e60237b7..3cbf922e78d2 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -419,29 +419,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        return static_cast<typename _Extents::index_type>(__ret);
       }
 
+    // Preconditions: _r < _Extents::rank()
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __fwd_prod(const _Extents& __exts, size_t __r) noexcept
       {
+       constexpr size_t __rank = _Extents::rank();
        constexpr auto& __sta_exts = __static_extents<_Extents>();
-       size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
-       return __extents_prod(__exts, __sta_prod, 0, __r);
+       if constexpr (__rank == 1)
+         return 1;
+       else if constexpr (__rank == 2)
+         return __r == 0 ? 1 : __exts.extent(0);
+       else
+         {
+           size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
+           return __extents_prod(__exts, __sta_prod, 0, __r);
+         }
       }
 
+    // Preconditions: _r < _Extents::rank()
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __rev_prod(const _Extents& __exts, size_t __r) noexcept
       {
-       constexpr auto& __sta_exts = __static_extents<_Extents>();
        constexpr size_t __rank = _Extents::rank();
-       size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
-       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       if constexpr (__rank == 1)
+         return 1;
+       else if constexpr (__rank == 2)
+         return __r == 0 ? __exts.extent(1) : 1;
+       else
+         {
+           size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
+           return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+         }
       }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __size(const _Extents& __exts) noexcept
-      { return __fwd_prod(__exts, __exts.rank()); }
+      {
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       constexpr size_t __rank = _Extents::rank();
+       constexpr size_t __sta_prod = __static_prod<__sta_exts>(0, __rank);
+       return __extents_prod(__exts, __sta_prod, 0, __rank);
+      }
 
     template<typename _IndexType, size_t... _Counts>
       auto __build_dextents_type(integer_sequence<size_t, _Counts...>)

Reply via email to