https://gcc.gnu.org/g:867ca749704d1d7bd042f1a4d8403801df8cc172

commit r17-495-g867ca749704d1d7bd042f1a4d8403801df8cc172
Author: Tamar Christina <[email protected]>
Date:   Wed May 13 12:36:07 2026 +0100

    scev: maintain affine CHRECs in the presence of type conversions
    
    The example
    
    float *e;
    void f (float *f, float *g, char *h, int n,
            int b, int c, int d)
    {
      float a = 0;
      for (int i = 0; i < n; ++i) {
        int j = b + i, k = c + i * d;
        float l = g[j], m = h[i] ? g[k] : l;
        a += f[i] * m;
      }
      *e = a;
    }
    
    gets vectorized using gathers for the access to g:
    
    .L5:
            ld1b    z4.s, p7/z, [x2, x6]
            cmpne   p6.b, p7/z, z4.b, #0
            ld1w    z2.s, p7/z, [x0, x6, lsl 2]
            add     z7.s, z30.s, z16.s
            add     z6.s, z16.s, z18.s
            add     x6, x6, x7
            ld1w    z5.s, p7/z, [x1, z6.s, sxtw 2]
            ld1w    z3.s, p6/z, [x1, z7.s, sxtw 2]
            incw    z16.s
            sel     z3.s, p6, z3.s, z5.s
            fmla    z17.s, p7/m, z2.s, z3.s
            whilelo p7.s, w6, w3
            b.any   .L5
    
    however the first g is g[b+i] and second is g[c + i*d];
    
    since b is loop invariant the access to g[b+i] is actually linear, and
    since c is loop invariant the base of the second access g[c + i*d] can be
    simplified by recognizing the base as g + c.
    
    Today however SCEV fails to analyze these accesses as affine and as a
    consequence we end up with gathers:
    
    : missed:  failed: evolution of base is not affine.
            base_address:
            offset from base address:
            constant offset from base address:
            step:
            base alignment: 0
            base misalignment: 0
            offset alignment: 0
            step alignment: 0
            base_object: *_63
    
    Looking at SCEV this is because of an outer cast around the CHREC:
    
    )
    (set_scalar_evolution
      instantiated_below = 25
      (scalar = _65)
      (scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2))
    )
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = (long unsigned int) {b_22(D), +, 1}_2)
    
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = g_27(D))
      (res = g_27(D)))
    
      which corresponds to
    
      j_66 = b_22(D) + i_67;
      _65 = (long unsigned int) j_66;
      _64 = _65 * 4;
      _63 = g_27(D) + _64;
      l_62 = *_63;
    
    and the _64 is deemed to not be affine:
    
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = _64)
    (analyze_scalar_evolution
      (loop_nb = 2)
      (scalar = _64)
    (get_scalar_evolution
      (scalar = _64)
      (scalar_evolution = _64))
    )
      (res = scev_not_known))
    
    This patch fixes it by (very carefully) pushing the conversion to the
    unsigned type into the operands of the affine CHREC, so that the multiply
    can then be folded into the CHREC itself.
    
    which results in
    
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = 4)
      (res = 4))
    (set_scalar_evolution
      instantiated_below = 25
      (scalar = _64)
      (scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2))
    )
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = g_27(D))
      (res = g_27(D)))
    (instantiate_scev
      (instantiate_below = 25 -> 12)
      (evolution_loop = 2)
      (chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2)
      (res = {(long unsigned int) b_22(D) * 4, +, 4}_2))
    (set_scalar_evolution
      instantiated_below = 25
      (scalar = _63)
      (scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2))
    )
    
    and dataref now correctly analyzes the base
    
            base_address: g_27(D) + (sizetype) b_22(D) * 4
            offset from base address: 0
            constant offset from base address: 0
            step: 4
            base alignment: 4
            base misalignment: 0
            offset alignment: 128
            step alignment: 4
            base_object: *g_27(D) + (sizetype) b_22(D) * 4
            Access function 0: {0B, +, 4}_2
    
    producing the final codegen:
    
    .L7:
            ld1b    z4.s, p7/z, [x2, x6]
            cmpne   p6.b, p7/z, z4.b, #0
            ld1w    z29.s, p7/z, [x4, x6, lsl 2]
            ld1w    z2.s, p7/z, [x0, x6, lsl 2]
            ld1w    z3.s, p6/z, [x5]
            add     x6, x6, x7
            sel     z3.s, p6, z3.s, z29.s
            add     x5, x5, x1
            fmla    z30.s, p7/m, z2.s, z3.s
            whilelo p7.s, w6, w3
            b.any   .L7
            faddv   s31, p5, z30.s
    
    gcc/ChangeLog:
    
            * tree-chrec.cc (chrec_convert_1): Fold unsigned CHREC converts.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/vect/vect-scev-affine_1.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c | 17 +++++++++++++++++
 gcc/tree-chrec.cc                              | 25 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
new file mode 100644
index 000000000000..929012184e0a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float *e;
+void f (float *f, float *g, char *h, int n,
+        int b, int c, int d)
+{
+  float a = 0;
+  for (int i = 0; i < n; ++i) {
+    int j = b + i, k = c + i * d;
+    float l = g[j], m = h[i] ? g[k] : l;
+    a += f[i] * m;
+  }
+  *e = a;
+}
+
+/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
index 20beaeb09ec0..bad0396d0407 100644
--- a/gcc/tree-chrec.cc
+++ b/gcc/tree-chrec.cc
@@ -1598,6 +1598,31 @@ keep_cast:
                                                  CHREC_RIGHT (chrec)));
       res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from);
     }
+  /* Similarly, perform the trick that (unsigned T)(base + step) can be
+     folded to ((unsigned T)base + (unsigned T)step).  */
+  else if (use_overflow_semantics
+          && TREE_CODE (chrec) == POLYNOMIAL_CHREC
+          && INTEGRAL_TYPE_P (ct)
+          && INTEGRAL_TYPE_P (type)
+          && TYPE_OVERFLOW_UNDEFINED (type)
+          /* Must be unsigned so we don't introduce any UB.  */
+          && TYPE_UNSIGNED (type)
+          /* The outer type must be at least as wide as the inner type so we
+                don't truncate when we fold, and the inner CHREC must be
+                non-wrapping so we don't change the behavior when folding to
+                a wider type.  */
+         && TYPE_PRECISION (type) >= TYPE_PRECISION (ct)
+         && (!TYPE_UNSIGNED (ct)
+             || TYPE_PRECISION (type) == TYPE_PRECISION (ct)
+             || nonwrapping_chrec_p (chrec)))
+    {
+      res = build_polynomial_chrec (CHREC_VARIABLE (chrec),
+                                   fold_convert (type,
+                                                 CHREC_LEFT (chrec)),
+                                   fold_convert (type,
+                                                 CHREC_RIGHT (chrec)));
+      res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from);
+    }
   else
     res = fold_convert (type, chrec);

Reply via email to