On Wed, 13 May 2026, Tamar Christina wrote:

> The example
> 
> float *e;
> void f (float *f, float *g, char *h, int n,
>         int b, int c, int d)
> {
>   float a = 0;
>   for (int i = 0; i < n; ++i) {
>     int j = b + i, k = c + i * d;
>     float l = g[j], m = h[i] ? g[k] : l;
>     a += f[i] * m;
>   }
>   *e = a;
> }
> 
> gets vectorized using gathers for the access to g:
> 
> .L5:
>         ld1b    z4.s, p7/z, [x2, x6]
>         cmpne   p6.b, p7/z, z4.b, #0
>         ld1w    z2.s, p7/z, [x0, x6, lsl 2]
>         add     z7.s, z30.s, z16.s
>         add     z6.s, z16.s, z18.s
>         add     x6, x6, x7
>         ld1w    z5.s, p7/z, [x1, z6.s, sxtw 2]
>         ld1w    z3.s, p6/z, [x1, z7.s, sxtw 2]
>         incw    z16.s
>         sel     z3.s, p6, z3.s, z5.s
>         fmla    z17.s, p7/m, z2.s, z3.s
>         whilelo p7.s, w6, w3
>         b.any   .L5
> 
> however the first g is g[b+i] and second is g[c + i*d];
> 
> since b is loop invariant the access to g[b+i] is actually linear and since c
> is loop invariant, then the base of the second access g[c + i *d] can be
> simplified by recognizing the base as g + c.
> 
> Today however SCEV fails to analyze these accesses as affine and as a
> consequence we end up with gathers:
> 
> : missed:  failed: evolution of base is not affine.
>         base_address:
>         offset from base address:
>         constant offset from base address:
>         step:
>         base alignment: 0
>         base misalignment: 0
>         offset alignment: 0
>         step alignment: 0
>         base_object: *_63
> 
> Looking at SCEV this is because of an outer cast around the CHREC:
> 
> )
> (set_scalar_evolution
>   instantiated_below = 25
>   (scalar = _65)
>   (scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2))
> )
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = (long unsigned int) {b_22(D), +, 1}_2)
> 
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = g_27(D))
>   (res = g_27(D)))
> 
>   which corresponds to
> 
>   j_66 = b_22(D) + i_67;
>   _65 = (long unsigned int) j_66;
>   _64 = _65 * 4;
>   _63 = g_27(D) + _64;
>   l_62 = *_63;
> 
> and the _64 is deemed to not be affine:
> 
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = _64)
> (analyze_scalar_evolution
>   (loop_nb = 2)
>   (scalar = _64)
> (get_scalar_evolution
>   (scalar = _64)
>   (scalar_evolution = _64))
> )
>   (res = scev_not_known))
> 
> This patch fixes it by (very carefully) folding a multiply on an unsigned
> affine CHREC into the CHREC itself.
> 
> which results in
> 
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = 4)
>   (res = 4))
> (set_scalar_evolution
>   instantiated_below = 25
>   (scalar = _64)
>   (scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2))
> )
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = g_27(D))
>   (res = g_27(D)))
> (instantiate_scev
>   (instantiate_below = 25 -> 12)
>   (evolution_loop = 2)
>   (chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2)
>   (res = {(long unsigned int) b_22(D) * 4, +, 4}_2))
> (set_scalar_evolution
>   instantiated_below = 25
>   (scalar = _63)
>   (scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2))
> )
> 
> and dataref now correctly analyzes the base
> 
>         base_address: g_27(D) + (sizetype) b_22(D) * 4
>         offset from base address: 0
>         constant offset from base address: 0
>         step: 4
>         base alignment: 4
>         base misalignment: 0
>         offset alignment: 128
>         step alignment: 4
>         base_object: *g_27(D) + (sizetype) b_22(D) * 4
>         Access function 0: {0B, +, 4}_2
> 
> producing the final codegen:
> 
> .L7:
>         ld1b    z4.s, p7/z, [x2, x6]
>         cmpne   p6.b, p7/z, z4.b, #0
>         ld1w    z29.s, p7/z, [x4, x6, lsl 2]
>         ld1w    z2.s, p7/z, [x0, x6, lsl 2]
>         ld1w    z3.s, p6/z, [x5]
>         add     x6, x6, x7
>         sel     z3.s, p6, z3.s, z29.s
>         add     x5, x5, x1
>         fmla    z30.s, p7/m, z2.s, z3.s
>         whilelo p7.s, w6, w3
>         b.any   .L7
>         faddv   s31, p5, z30.s
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues.
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       * tree-chrec.cc (chrec_convert_1): Fold unsigned CHREC converts.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/vect/vect-scev-affine_1.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c 
> b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..929012184e0a2595af826d3d06284d0a6a510119
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_float } */
> +
> +float *e;
> +void f (float *f, float *g, char *h, int n,
> +        int b, int c, int d)
> +{
> +  float a = 0;
> +  for (int i = 0; i < n; ++i) {
> +    int j = b + i, k = c + i * d;
> +    float l = g[j], m = h[i] ? g[k] : l;
> +    a += f[i] * m;
> +  }
> +  *e = a;
> +}
> +
> +/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
> index 
> 20beaeb09ec0cf1b7ffbcefa55789beded5e3869..514b10fa27974253bc56391f4f89651222e34181
>  100644
> --- a/gcc/tree-chrec.cc
> +++ b/gcc/tree-chrec.cc
> @@ -1598,6 +1598,31 @@ keep_cast:
>                                                 CHREC_RIGHT (chrec)));
>        res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, 
> from);
>      }
> +  /* Similar perform the trick that (unsigned T)(base + step) can be
> +     folded to ((unsigned T)x + (unsigned T)2).  */

(unsigned T)step

> +  /* The CHREC we're trying to distribute the cast into must be
> +     affine already.  */
> +  else if (evolution_function_is_affine_p (chrec)

The previous case got along with && TREE_CODE (chrec) == POLYNOMIAL_CHREC

> +        && INTEGRAL_TYPE_P (ct)
> +        && INTEGRAL_TYPE_P (type)
> +        /* Must be unsigned so we don't introduce any UB.  */
> +        && TYPE_UNSIGNED (type)
> +        /* The outer type must be at least as wide as the inner type so we
> +              don't truncate when we fold, and the inner CHREC must be
> +              non-wrapping so we don't change the behavior when folding to
> +              a wider type.  */
> +       && TYPE_PRECISION (type) >= TYPE_PRECISION (ct)
> +       && (!TYPE_UNSIGNED (ct)

TYPE_OVERFLOW_UNDEFINED (ct) (beware of -fwrapv)

> +           || TYPE_PRECISION (type) == TYPE_PRECISION (ct)
> +           || nonwrapping_chrec_p (chrec)))

Previous cases all gate those foldings on use_overflow_semantics; I
think you should do that as well (even if I don't exactly remember
what this was about - at least TYPE_OVERFLOW_UNDEFINED cannot be
relied on without it, IIRC).

OK with those changes.

Thanks,
Richard.

> +    {
> +      res = build_polynomial_chrec (CHREC_VARIABLE (chrec),
> +                                 fold_convert (type,
> +                                               CHREC_LEFT (chrec)),
> +                                 fold_convert (type,
> +                                               CHREC_RIGHT (chrec)));
> +      res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, 
> from);
> +    }
>    else
>      res = fold_convert (type, chrec);
>  
> 
> 
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to