On Thu, 7 May 2026, Tamar Christina wrote:
> The example
>
> float *e;
> void f (float *f, float *g, char *h, int n,
> int b, int c, int d)
> {
> float a = 0;
> for (int i = 0; i < n; ++i) {
> int j = b + i, k = c + i * d;
> float l = g[j], m = h[i] ? g[k] : l;
> a += f[i] * m;
> }
> *e = a;
> }
>
> gets vectorized using gathers for the access to g:
>
> .L5:
> ld1b z4.s, p7/z, [x2, x6]
> cmpne p6.b, p7/z, z4.b, #0
> ld1w z2.s, p7/z, [x0, x6, lsl 2]
> add z7.s, z30.s, z16.s
> add z6.s, z16.s, z18.s
> add x6, x6, x7
> ld1w z5.s, p7/z, [x1, z6.s, sxtw 2]
> ld1w z3.s, p6/z, [x1, z7.s, sxtw 2]
> incw z16.s
> sel z3.s, p6, z3.s, z5.s
> fmla z17.s, p7/m, z2.s, z3.s
> whilelo p7.s, w6, w3
> b.any .L5
>
> however the first g is g[b+i] and second is g[c + i*d];
>
> since b is loop invariant the access to g[b+i] is actually linear and since c
> is loop invariant, then the base of the second access g[c + i *d] can be
> simplified by recognizing the base as g + c.
>
> Today however SCEV fails to analyze these accesses as affine and as a
> consequence we end up with gathers:
>
> : missed: failed: evolution of base is not affine.
> base_address:
> offset from base address:
> constant offset from base address:
> step:
> base alignment: 0
> base misalignment: 0
> offset alignment: 0
> step alignment: 0
> base_object: *_63
>
> Looking at SCEV this is because of an outer cast around the CHREC:
>
> )
> (set_scalar_evolution
> instantiated_below = 25
> (scalar = _65)
> (scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2))
> )
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = (long unsigned int) {b_22(D), +, 1}_2)
>
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = g_27(D))
> (res = g_27(D)))
>
> which corresponds to
>
> j_66 = b_22(D) + i_67;
> _65 = (long unsigned int) j_66;
> _64 = _65 * 4;
> _63 = g_27(D) + _64;
> l_62 = *_63;
>
> and the _64 is deemed to not be affine:
>
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = _64)
> (analyze_scalar_evolution
> (loop_nb = 2)
> (scalar = _64)
> (get_scalar_evolution
> (scalar = _64)
> (scalar_evolution = _64))
> )
> (res = scev_not_known))
>
> This patch fixes it by (very carefully) folding a multiply on an unsigned
> affine CHREC into the CHREC itself.
>
> which results in
>
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = 4)
> (res = 4))
> (set_scalar_evolution
> instantiated_below = 25
> (scalar = _64)
> (scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2))
> )
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = g_27(D))
> (res = g_27(D)))
> (instantiate_scev
> (instantiate_below = 25 -> 12)
> (evolution_loop = 2)
> (chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2)
> (res = {(long unsigned int) b_22(D) * 4, +, 4}_2))
> (set_scalar_evolution
> instantiated_below = 25
> (scalar = _63)
> (scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2))
> )
>
> and dataref now correctly analyzes the base
>
> base_address: g_27(D) + (sizetype) b_22(D) * 4
> offset from base address: 0
> constant offset from base address: 0
> step: 4
> base alignment: 4
> base misalignment: 0
> offset alignment: 128
> step alignment: 4
> base_object: *g_27(D) + (sizetype) b_22(D) * 4
> Access function 0: {0B, +, 4}_2
>
> producing the final codegen:
>
> .L7:
> ld1b z4.s, p7/z, [x2, x6]
> cmpne p6.b, p7/z, z4.b, #0
> ld1w z29.s, p7/z, [x4, x6, lsl 2]
> ld1w z2.s, p7/z, [x0, x6, lsl 2]
> ld1w z3.s, p6/z, [x5]
> add x6, x6, x7
> sel z3.s, p6, z3.s, z29.s
> add x5, x5, x1
> fmla z30.s, p7/m, z2.s, z3.s
> whilelo p7.s, w6, w3
> b.any .L7
> faddv s31, p5, z30.s
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 with no issues.
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * tree-chrec.cc (chrec_fold_multiply): Fold unsigned CHREC mult.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-scev-affine_1.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
> b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..929012184e0a2595af826d3d06284d0a6a510119
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-scev-affine_1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_float } */
> +
> +float *e;
> +void f (float *f, float *g, char *h, int n,
> + int b, int c, int d)
> +{
> + float a = 0;
> + for (int i = 0; i < n; ++i) {
> + int j = b + i, k = c + i * d;
> + float l = g[j], m = h[i] ? g[k] : l;
> + a += f[i] * m;
> + }
> + *e = a;
> +}
> +
> +/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine}
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
> index
> 09dd81900bce70138f975c68b77c4ba6d0e45fc3..ff77c7a6c2397f65f3ee17a386408b5ceec4676d
> 100644
> --- a/gcc/tree-chrec.cc
> +++ b/gcc/tree-chrec.cc
> @@ -508,9 +508,52 @@ chrec_fold_multiply (tree type,
> CASE_CONVERT:
> if (tree_contains_chrecs (op0, NULL))
> {
> + tree inner = TREE_OPERAND (op0, 0);
> + tree inner_type = TREE_TYPE (inner);
> +
> + /* Keep widening unsigned multiplies of affine CHRECs affine.
> + This handles byte-offset computations such as
> + (unsigned T) {base, +, step} * C and folds these into
> + {(unsigned T) base * C, +, (unsigned T) step * C}. */
> + if (evolution_function_is_affine_p (inner)
> + /* The CHREC we're trying to distribute the cast into must be
> + affine already. */
> + && tree_does_not_contain_chrecs (op1)
> + && INTEGRAL_TYPE_P (type)
> + && INTEGRAL_TYPE_P (inner_type)
> + /* Must be unsigned so we don't introduce any UB. */
> + && TYPE_UNSIGNED (type)
> + /* The outer type must be at least as wide as the inner type so we
> + don't truncate when we fold, and the inner CHREC must be
> + non-wrapping so we don't change the behavior when folding to
> + a wider type. */
> + && TYPE_PRECISION (type) >= TYPE_PRECISION (inner_type)
> + && (!TYPE_UNSIGNED (inner_type)
> + || TYPE_PRECISION (type) == TYPE_PRECISION (inner_type)
> + || nonwrapping_chrec_p (inner))
> + /* The component we are multiplying must be loop invariant
> + otherwise the base expression can't be simplified and the
> + resulting CHREC won't be affine. */
> + && evolution_function_is_invariant_p (op1,
> + CHREC_VARIABLE (inner)))
> + {
> + tree top1 = chrec_convert (type, op1, NULL);
> + tree left
> + = chrec_fold_multiply (type,
> + chrec_convert (type, CHREC_LEFT (inner),
> + NULL), top1);
> + tree right
> + = chrec_fold_multiply (type,
> + chrec_convert_rhs (type,
> + CHREC_RIGHT (inner),
> + NULL), top1);
So what you are basically doing is selectively (only if present
as multiplication operand) simplifying (unsigned T){x, +, s} to
{(unsigned T)x, +, (unsigned T)s}.
chrec_convert_1 has some similar "tricks" below keep_cast:,
specifically this is in the class of us not generally widening
operations because of costs, but for SCEV analysis it's better
than giving up.
So I think this is better done in chrec_convert_1.
Richard.
> + return build_polynomial_chrec (CHREC_VARIABLE (inner),
> + left, right);
> + }
> +
> /* We can strip sign-conversions to signed by performing the
> operation in unsigned. */
> - tree optype = TREE_TYPE (TREE_OPERAND (op0, 0));
> + tree optype = inner_type;
> if (INTEGRAL_TYPE_P (type)
> && INTEGRAL_TYPE_P (optype)
> && tree_nop_conversion_p (type, optype)
>
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)