https://gcc.gnu.org/g:3134742307f1133a449da9590e2294e9a7179226

commit r16-3312-g3134742307f1133a449da9590e2294e9a7179226
Author: Luc Grosheintz <luc.groshei...@gmail.com>
Date:   Sun Aug 3 22:57:24 2025 +0200

    libstdc++: Precompute products of static extents.
    
    Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
    n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
    dynamic extents.
    
    The two partial products for r = 0, ..., n:
    
      \prod_{i = 0}^r E[i]     (fwd)
      \prod_{i = r+1}^n E[i]   (rev)
    
    can be computed as the product of static and dynamic extents. The static
    fwd and rev product can be computed at compile time for all values of r.
    
    Three methods are directly affected by this optimization:
    
      layout_left::mapping::stride
      layout_right::mapping::stride
      mdspan::size
    
    We'll check the generated code (-O2) for all three methods for a generic
    (artificially) high-dimensional multi-dimensional extents.
    
    Consider a generic case:
    
      using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
    
      int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
      { return m.stride(r); }
    
    The code generated prior to this commit:
    
      4f0:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 
4f8
      4f7:  00
      4f8:  48 83 c6 01            add    rsi,0x1
      4fc:  48 c7 44 24 e8 ff ff   mov    QWORD PTR 
[rsp-0x18],0xffffffffffffffff
      503:  ff ff
      505:  48 8d 04 f5 00 00 00   lea    rax,[rsi*8+0x0]
      50c:  00
      50d:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
      512:  66 0f 76 c0            pcmpeqd xmm0,xmm0
      516:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
      51b:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 
523
      522:  00
      523:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
      528:  48 83 f8 38            cmp    rax,0x38
      52c:  74 72                  je     5a0 <stride_right_E1+0xb0>
      52e:  48 8d 54 04 b8         lea    rdx,[rsp+rax*1-0x48]
      533:  4c 8d 4c 24 f0         lea    r9,[rsp-0x10]
      538:  b8 01 00 00 00         mov    eax,0x1
      53d:  0f 1f 00               nop    DWORD PTR [rax]
      540:  48 8b 0a               mov    rcx,QWORD PTR [rdx]
      543:  49 89 c0               mov    r8,rax
      546:  4c 0f af c1            imul   r8,rcx
      54a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
      54e:  49 0f 45 c0            cmovne rax,r8
      552:  48 83 c2 08            add    rdx,0x8
      556:  49 39 d1               cmp    r9,rdx
      559:  75 e5                  jne    540 <stride_right_E1+0x50>
      55b:  48 85 c0               test   rax,rax
      55e:  74 38                  je     598 <stride_right_E1+0xa8>
      560:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
      567:  00
      568:  48 c1 e2 02            shl    rdx,0x2
      56c:  48 83 fa 10            cmp    rdx,0x10
      570:  74 1e                  je     590 <stride_right_E1+0xa0>
      572:  48 8d 4f 10            lea    rcx,[rdi+0x10]
      576:  48 01 d7               add    rdi,rdx
      579:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
      580:  48 63 17               movsxd rdx,DWORD PTR [rdi]
      583:  48 83 c7 04            add    rdi,0x4
      587:  48 0f af c2            imul   rax,rdx
      58b:  48 39 f9               cmp    rcx,rdi
      58e:  75 f0                  jne    580 <stride_right_E1+0x90>
      590:  c3                     ret
      591:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
      598:  c3                     ret
      599:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
      5a0:  b8 01 00 00 00         mov    eax,0x1
      5a5:  eb b9                  jmp    560 <stride_right_E1+0x70>
      5a7:  66 0f 1f 84 00 00 00   nop    WORD PTR [rax+rax*1+0x0]
      5ae:  00 00
    
    which seems to be performing:
    
      preparatory_work();
      ret = 1
      for(i = 0; i < rank; ++i)
        tmp = ret * E[i]
        if E[i] != -1
          ret = tmp
      for(i = 0; i < rank_dynamic; ++i)
        ret *= D[i]
    
    This commit reduces it down to:
    
      270:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
      277:  00
      278:  31 d2                  xor    edx,edx
      27a:  48 85 c0               test   rax,rax
      27d:  74 33                  je     2b2 <stride_right_E1+0x42>
      27f:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
      286:  00
      287:  48 c1 e2 02            shl    rdx,0x2
      28b:  48 83 fa 10            cmp    rdx,0x10
      28f:  74 1f                  je     2b0 <stride_right_E1+0x40>
      291:  48 8d 4f 10            lea    rcx,[rdi+0x10]
      295:  48 01 d7               add    rdi,rdx
      298:  0f 1f 84 00 00 00 00   nop    DWORD PTR [rax+rax*1+0x0]
      29f:  00
      2a0:  48 63 17               movsxd rdx,DWORD PTR [rdi]
      2a3:  48 83 c7 04            add    rdi,0x4
      2a7:  48 0f af c2            imul   rax,rdx
      2ab:  48 39 f9               cmp    rcx,rdi
      2ae:  75 f0                  jne    2a0 <stride_right_E1+0x30>
      2b0:  89 c2                  mov    edx,eax
      2b2:  89 d0                  mov    eax,edx
      2b4:  c3                     ret
    
    Loosely speaking this does the following:
    
      1. Load the starting position k in the array of dynamic extents; and
         return if possible.
      2. Load the partial product of static extents.
      3. Computes the \prod_{i = k}^d D[i] where d is the number of
      dynamic extents in a loop.
    
    It shows that the span used for passing in the dynamic extents is
    completely eliminated; and the fact that the product always runs to the
    end of the array of dynamic extents is used by the compiler to eliminate
    one indirection to determine the end position in the array of dynamic
    extents.
    
    The analogous code is generated for layout_left.
    
    Next, consider
    
      using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
      int size2(const std::mdspan<double, E2>& md)
      { return md.size(); }
    
    on immediately preceding commit the generated code is
    
      10:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 18
      17:  00
      18:  49 89 f8               mov    r8,rdi
      1b:  48 8d 44 24 b8         lea    rax,[rsp-0x48]
      20:  48 c7 44 24 e8 0b 00   mov    QWORD PTR [rsp-0x18],0xb
      27:  00 00
      29:  48 8d 7c 24 f0         lea    rdi,[rsp-0x10]
      2e:  ba 01 00 00 00         mov    edx,0x1
      33:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
      38:  66 0f 76 c0            pcmpeqd xmm0,xmm0
      3c:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
      41:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 49
      48:  00
      49:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
      4e:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
      55:  00 00 00 00
      59:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
      60:  48 8b 08               mov    rcx,QWORD PTR [rax]
      63:  48 89 d6               mov    rsi,rdx
      66:  48 0f af f1            imul   rsi,rcx
      6a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
      6e:  48 0f 45 d6            cmovne rdx,rsi
      72:  48 83 c0 08            add    rax,0x8
      76:  48 39 c7               cmp    rdi,rax
      79:  75 e5                  jne    60 <size2+0x50>
      7b:  48 85 d2               test   rdx,rdx
      7e:  74 18                  je     98 <size2+0x88>
      80:  49 63 00               movsxd rax,DWORD PTR [r8]
      83:  49 63 48 04            movsxd rcx,DWORD PTR [r8+0x4]
      87:  48 0f af c1            imul   rax,rcx
      8b:  41 0f af 40 08         imul   eax,DWORD PTR [r8+0x8]
      90:  0f af c2               imul   eax,edx
      93:  c3                     ret
      94:  0f 1f 40 00            nop    DWORD PTR [rax+0x0]
      98:  31 c0                  xor    eax,eax
      9a:  c3                     ret
    
    which is needlessly long. The current commit reduces it down to:
    
      10:  48 63 07               movsxd rax,DWORD PTR [rdi]
      13:  48 63 57 04            movsxd rdx,DWORD PTR [rdi+0x4]
      17:  48 0f af c2            imul   rax,rdx
      1b:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
      1f:  69 c0 83 04 00 00      imul   eax,eax,0x483
      25:  c3                     ret
    
    Which simply computes the product:
    
      D[0] * D[1] * D[2] * const
    
    where const is the product of all static extents. Meaning the loop to
    compute the product of dynamic extents has been fully unrolled and
    all constants are perfectly precomputed.
    
    The size of the object file described in the previous commit reduces
    by 17% from 55.8kB to 46.0kB.
    
    libstdc++-v3/ChangeLog:
    
            * include/std/mdspan (__mdspan::__static_prod): New function.
            (__mdspan::__fwd_partial_prods): Constexpr array of partial
            forward products.
            (__mdspan::__fwd_partial_prods): Same for reverse partial
            products.
            (__mdspan::__static_extents_prod): Delete function.
            (__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
            include/std/mdspan (__mdspan::__fwd_prod): Compute as the
            product of pre-computed static static and the product of dynamic
            extents.
            (__mdspan::__rev_prod): Ditto.
    
    Reviewed-by: Tomasz KamiƄski <tkami...@redhat.com>
    Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>

Diff:
---
 libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 25 deletions(-)

diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index 08e6b8ff5183..dd53e60237b7 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       __static_extents() noexcept
       { return _Extents::_S_storage::_S_static_extents(); }
 
+    template<array _Extents>
+      consteval size_t
+      __static_prod(size_t __begin, size_t __end)
+      {
+       size_t __prod = 1;
+       for(size_t __i = __begin; __i < __end; ++__i)
+         {
+           auto __ext = _Extents[__i];
+           __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
+         }
+       return __prod;
+      }
+
+    // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n (exclusive)
+    template<array _Extents>
+      constexpr auto __fwd_partial_prods = [] consteval
+       {
+         constexpr size_t __rank = _Extents.size();
+         std::array<size_t, __rank + 1> __ret;
+         for(size_t __r = 0; __r < __rank + 1; ++__r)
+           __ret[__r] = __static_prod<_Extents>(0, __r);
+         return __ret;
+       }();
+
+    // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
+    template<array _Extents>
+      constexpr auto __rev_partial_prods = [] consteval
+       {
+         constexpr size_t __rank = _Extents.size();
+         std::array<size_t, __rank> __ret;
+         for(size_t __r = 0; __r < __rank; ++__r)
+           __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
+         return __ret;
+       }();
+
     template<typename _Extents>
       constexpr span<const typename _Extents::index_type>
       __dynamic_extents(const _Extents& __exts, size_t __begin = 0,
@@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          return false;
       }
 
-    constexpr size_t
-    __static_extents_prod(const auto& __sta_exts) noexcept
-    {
-      size_t __ret = 1;
-      for (auto __factor : __sta_exts)
-       if (__factor != dynamic_extent)
-         __ret *= __factor;
-      return __ret;
-    }
-
     template<typename _Extents>
       constexpr typename _Extents::index_type
-      __exts_prod(const _Extents& __exts, size_t __begin, size_t __end) 
noexcept
+      __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin,
+                    size_t __end) noexcept
       {
-       using _IndexType = typename _Extents::index_type;
-
-       size_t __ret = 1;
-       if constexpr (_Extents::rank_dynamic() != _Extents::rank())
-         {
-           std::span<const size_t> __sta_exts = __static_extents<_Extents>();
-           __ret = __static_extents_prod(__sta_exts.subspan(__begin,
-                                                            __end - __begin));
-           if (__ret == 0)
-             return 0;
-         }
+       if (__sta_prod == 0)
+         return 0;
 
+       size_t __ret = __sta_prod;
        if constexpr (_Extents::rank_dynamic() > 0)
          for (auto __factor : __dynamic_extents(__exts, __begin, __end))
            __ret *= size_t(__factor);
-       return _IndexType(__ret);
+       return static_cast<typename _Extents::index_type>(__ret);
       }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __fwd_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, 0, __r); }
+      {
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
+       return __extents_prod(__exts, __sta_prod, 0, __r);
+      }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __rev_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, __r + 1, __exts.rank()); }
+      {
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       constexpr size_t __rank = _Extents::rank();
+       size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
+       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+      }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type

Reply via email to