On Tue, 24 Jun 2025, Alfie Richards wrote:

> Hi all,
> 
> This is a small change to ivopts to expand SSA variables enabling ivopts to
> correctly work out when an address IV step is set to be a multiple on index
> step in the loop header (ie, not constant, not calculated each loop.)
> 
> Seems like this might have compile speed costs that need to be considered, but
> I believe should be worth it.
> 
> This is also required for some upcoming work for vectorization of VLA loops 
> with
> iteration data dependencies.
> 
> Bootstrapped and reg tested on aarch64-linux-gnu and x86_64-unknown-linux-gnu.

OK.

Thanks,
Richard.

> Thanks,
> Alfie
> 
> -- >8 --
> 
> This changes the calls to tree_to_aff_combination in constant_multiple_of to
> tree_to_aff_combination_expand along with associated plumbing of ivopts_data
> and required cache.
> 
> This improves cases such as:
> 
> ```c
> void f(int *p1, int *p2, unsigned long step, unsigned long end, svbool_t pg) {
>     for (unsigned long i = 0; i < end; i += step) {
>         svst1(pg, p1, svld1_s32(pg, p2));
>         p1 += step;
>         p2 += step;
>     }
> }
> ```
> 
> Where ivopts previously didn't expand the SSA variables for the step 
> increements
> and so lacked the ability to group all the IV's and ended up with:
> 
> ```
> f:
>       cbz     x3, .L1
>       mov     x4, 0
> .L3:
>       ld1w    z31.s, p0/z, [x1]
>       add     x4, x4, x2
>       st1w    z31.s, p0, [x0]
>       add     x1, x1, x2, lsl 2
>       add     x0, x0, x2, lsl 2
>       cmp     x3, x4
>       bhi     .L3
> .L1:
>       ret
> ```
> 
> After this change we end up with:
> 
> ```
> f:
>       cbz     x3, .L1
>       mov     x4, 0
> .L3:
>       ld1w    z31.s, p0/z, [x1, x4, lsl 2]
>       st1w    z31.s, p0, [x0, x4, lsl 2]
>       add     x4, x4, x2
>       cmp     x3, x4
>       bhi     .L3
> .L1:
>       ret
> ```
> 
> gcc/ChangeLog:
> 
>       * tree-ssa-loop-ivopts.cc (constant_multiple_of): Change
>       tree_to_aff_combination to tree_to_aff_combination_expand and add
>       parameter to take ivopts_data.
>       (get_computation_aff_1): Change parameters and calls to include
>       ivopts_data.
>       (get_computation_aff): Ditto.
>       (get_computation_at) Ditto.:
>       (get_debug_computation_at) Ditto.:
>       (get_computation_cost) Ditto.:
>       (rewrite_use_nonlinear_expr) Ditto.:
>       (rewrite_use_address) Ditto.:
>       (rewrite_use_compare) Ditto.:
>       (remove_unused_ivs) Ditto.:
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/sve/adr_7.c: New test.
> ---
>  gcc/testsuite/gcc.target/aarch64/sve/adr_7.c | 19 ++++++
>  gcc/tree-ssa-loop-ivopts.cc                  | 65 +++++++++++---------
>  2 files changed, 54 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/adr_7.c
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c
> new file mode 100644
> index 00000000000..61e23bbf182
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ftree-vectorize" } */
> +
> +#include <arm_sve.h>
> +
> +void f(int *p1, int *p2, unsigned long step, unsigned long end, svbool_t pg) 
> {
> +    for (unsigned long i = 0; i < end; i += step) {
> +        svst1(pg, p1, svld1_s32(pg, p2));
> +        p1 += step;
> +        p2 += step;
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-not {\tld1w\tz[0-9]+\.d, 
> p[0-9]+/z\[x[0-9]+\.d\]} } } */
> +/* { dg-final { scan-assembler-not {\tst1w\tz[0-9]+\.d, 
> p[0-9]+/z\[x[0-9]+\.d\]} } } */
> +
> +/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+} 1 } 
> } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-9]+/z, 
> \[x[0-9]+, x[0-9]+, lsl 2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-9]+, 
> \[x[0-9]+, x[0-9]+, lsl 2\]} 1 } } */
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 8a6726f1988..544a946ff89 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -2117,11 +2117,15 @@ idx_record_use (tree base, tree *idx,
>     signedness of TOP and BOT.  */
>  
>  static bool
> -constant_multiple_of (tree top, tree bot, widest_int *mul)
> +constant_multiple_of (tree top, tree bot, widest_int *mul,
> +                   struct ivopts_data *data)
>  {
>    aff_tree aff_top, aff_bot;
> -  tree_to_aff_combination (top, TREE_TYPE (top), &aff_top);
> -  tree_to_aff_combination (bot, TREE_TYPE (bot), &aff_bot);
> +  tree_to_aff_combination_expand (top, TREE_TYPE (top), &aff_top,
> +                               &data->name_expansion_cache);
> +  tree_to_aff_combination_expand (bot, TREE_TYPE (bot), &aff_bot,
> +                               &data->name_expansion_cache);
> +
>    poly_widest_int poly_mul;
>    if (aff_combination_constant_multiple_p (&aff_top, &aff_bot, &poly_mul)
>        && poly_mul.is_constant (mul))
> @@ -3945,13 +3949,14 @@ determine_common_wider_type (tree *a, tree *b)
>  }
>  
>  /* Determines the expression by that USE is expressed from induction variable
> -   CAND at statement AT in LOOP.  The expression is stored in two parts in a
> -   decomposed form.  The invariant part is stored in AFF_INV; while variant
> -   part in AFF_VAR.  Store ratio of CAND.step over USE.step in PRAT if it's
> -   non-null.  Returns false if USE cannot be expressed using CAND.  */
> +   CAND at statement AT in DATA's current loop.  The expression is stored in
> +   two parts in a decomposed form.  The invariant part is stored in AFF_INV;
> +   while variant part in AFF_VAR.  Store ratio of CAND.step over USE.step in
> +   PRAT if it's non-null.  Returns false if USE cannot be expressed using
> +   CAND.  */
>  
>  static bool
> -get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use,
> +get_computation_aff_1 (struct ivopts_data *data, gimple *at, struct iv_use 
> *use,
>                      struct iv_cand *cand, class aff_tree *aff_inv,
>                      class aff_tree *aff_var, widest_int *prat = NULL)
>  {
> @@ -3966,7 +3971,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, 
> struct iv_use *use,
>    if (TYPE_PRECISION (utype) > TYPE_PRECISION (ctype))
>      return false;
>  
> -  var = var_at_stmt (loop, cand, at);
> +  var = var_at_stmt (data->current_loop, cand, at);
>    uutype = unsigned_type_for (utype);
>  
>    /* If the conversion is not noop, perform it.  */
> @@ -4011,7 +4016,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, 
> struct iv_use *use,
>        gcc_assert (gimple_assign_lhs (use->stmt) == cand->var_after);
>        rat = 1;
>      }
> -  else if (!constant_multiple_of (ustep, cstep, &rat))
> +  else if (!constant_multiple_of (ustep, cstep, &rat, data))
>      return false;
>  
>    if (prat)
> @@ -4030,7 +4035,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, 
> struct iv_use *use,
>    tree_to_aff_combination (var, uutype, aff_var);
>  
>    /* We need to shift the value if we are after the increment.  */
> -  if (stmt_after_increment (loop, cand, at))
> +  if (stmt_after_increment (data->current_loop, cand, at))
>      {
>        aff_tree cstep_aff;
>  
> @@ -4053,16 +4058,17 @@ get_computation_aff_1 (class loop *loop, gimple *at, 
> struct iv_use *use,
>  }
>  
>  /* Determines the expression by that USE is expressed from induction variable
> -   CAND at statement AT in LOOP.  The expression is stored in a decomposed
> -   form into AFF.  Returns false if USE cannot be expressed using CAND.  */
> +   CAND at statement AT in DATA's current loop.  The expression is stored in 
> a
> +   decomposed form into AFF.  Returns false if USE cannot be expressed using
> +   CAND.  */
>  
>  static bool
> -get_computation_aff (class loop *loop, gimple *at, struct iv_use *use,
> +get_computation_aff (struct ivopts_data *data, gimple *at, struct iv_use 
> *use,
>                    struct iv_cand *cand, class aff_tree *aff)
>  {
>    aff_tree aff_var;
>  
> -  if (!get_computation_aff_1 (loop, at, use, cand, aff, &aff_var))
> +  if (!get_computation_aff_1 (data, at, use, cand, aff, &aff_var))
>      return false;
>  
>    aff_combination_add (aff, &aff_var);
> @@ -4092,16 +4098,17 @@ get_use_type (struct iv_use *use)
>  }
>  
>  /* Determines the expression by that USE is expressed from induction variable
> -   CAND at statement AT in LOOP.  The computation is unshared.  */
> +   CAND at statement AT in DATA's current loop.  The computation is
> +   unshared.  */
>  
>  static tree
> -get_computation_at (class loop *loop, gimple *at,
> +get_computation_at (struct ivopts_data *data, gimple *at,
>                   struct iv_use *use, struct iv_cand *cand)
>  {
>    aff_tree aff;
>    tree type = get_use_type (use);
>  
> -  if (!get_computation_aff (loop, at, use, cand, &aff))
> +  if (!get_computation_aff (data, at, use, cand, &aff))
>      return NULL_TREE;
>    unshare_aff_combination (&aff);
>    return fold_convert (type, aff_combination_to_tree (&aff));
> @@ -4111,10 +4118,10 @@ get_computation_at (class loop *loop, gimple *at,
>     is more expensive.  Intended for debug stmts.  */
>  
>  static tree
> -get_debug_computation_at (class loop *loop, gimple *at,
> +get_debug_computation_at (struct ivopts_data *data, gimple *at,
>                         struct iv_use *use, struct iv_cand *cand)
>  {
> -  if (tree ret = get_computation_at (loop, at, use, cand))
> +  if (tree ret = get_computation_at (data, at, use, cand))
>      return ret;
>  
>    tree ubase = use->iv->base, ustep = use->iv->step;
> @@ -4131,7 +4138,7 @@ get_debug_computation_at (class loop *loop, gimple *at,
>       try to express
>       use = ubase + (var - cbase) / ratio.  */
>    if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep),
> -                          &rat))
> +                          &rat, data))
>      return NULL_TREE;
>  
>    bool neg_p = false;
> @@ -4160,7 +4167,7 @@ get_debug_computation_at (class loop *loop, gimple *at,
>        && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype))
>      return NULL_TREE;
>  
> -  var = var_at_stmt (loop, cand, at);
> +  var = var_at_stmt (data->current_loop, cand, at);
>  
>    if (POINTER_TYPE_P (ctype))
>      {
> @@ -4170,7 +4177,7 @@ get_debug_computation_at (class loop *loop, gimple *at,
>        var = fold_convert (ctype, var);
>      }
>  
> -  if (stmt_after_increment (loop, cand, at))
> +  if (stmt_after_increment (data->current_loop, cand, at))
>      var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var,
>                      unshare_expr (cstep));
>  
> @@ -4873,8 +4880,7 @@ get_computation_cost (struct ivopts_data *data, struct 
> iv_use *use,
>       return infinite_cost;
>      }
>  
> -  if (!get_computation_aff_1 (data->current_loop, at, use,
> -                           cand, &aff_inv, &aff_var, &rat)
> +  if (!get_computation_aff_1 (data, at, use, cand, &aff_inv, &aff_var, &rat)
>        || !wi::fits_shwi_p (rat))
>      return infinite_cost;
>  
> @@ -7358,8 +7364,7 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
>      }
>  
>    aff_tree aff_inv, aff_var;
> -  if (!get_computation_aff_1 (data->current_loop, use->stmt,
> -                           use, cand, &aff_inv, &aff_var))
> +  if (!get_computation_aff_1 (data, use->stmt, use, cand, &aff_inv, 
> &aff_var))
>      gcc_unreachable ();
>  
>    unshare_aff_combination (&aff_inv);
> @@ -7556,7 +7561,7 @@ rewrite_use_address (struct ivopts_data *data,
>    bool ok;
>  
>    adjust_iv_update_pos (cand, use);
> -  ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
> +  ok = get_computation_aff (data, use->stmt, use, cand, &aff);
>    gcc_assert (ok);
>    unshare_aff_combination (&aff);
>  
> @@ -7659,7 +7664,7 @@ rewrite_use_compare (struct ivopts_data *data,
>  
>    /* The induction variable elimination failed; just express the original
>       giv.  */
> -  comp = get_computation_at (data->current_loop, use->stmt, use, cand);
> +  comp = get_computation_at (data, use->stmt, use, cand);
>    gcc_assert (comp != NULL_TREE);
>    gcc_assert (use->op_p != NULL);
>    *use->op_p = force_gimple_operand_gsi (&bsi, comp, true,
> @@ -7790,7 +7795,7 @@ remove_unused_ivs (struct ivopts_data *data, bitmap 
> toremove)
>                 if (best_cand == NULL || best_pref < cand_pref)
>                   {
>                     tree this_comp
> -                     = get_debug_computation_at (data->current_loop,
> +                     = get_debug_computation_at (data,
>                                                   SSA_NAME_DEF_STMT (def),
>                                                   &dummy_use, cand);
>                     if (this_comp)
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to