Hi all, This is a small change to ivopts to expand SSA variables enabling ivopts to correctly work out when an address IV step is set to be a multiple on index step in the loop header (ie, not constant, not calculated each loop.)
Seems like this might have compile speed costs that need to be considered, but I believe should be worth it. This is also required for some upcoming work for vectorization of VLA loops with iteration data dependencies. Bootstrapped and reg tested on aarch64-linux-gnu and x86_64-unknown-linux-gnu. Thanks, Alfie -- >8 -- This changes the calls to tree_to_aff_combination in constant_multiple_of to tree_to_aff_combination_expand along with associated plumbing of ivopts_data and required cache. This improves cases such as: ```c void f(int *p1, int *p2, unsigned long step, unsigned long end, svbool_t pg) { for (unsigned long i = 0; i < end; i += step) { svst1(pg, p1, svld1_s32(pg, p2)); p1 += step; p2 += step; } } ``` Where ivopts previously didn't expand the SSA variables for the step increements and so lacked the ability to group all the IV's and ended up with: ``` f: cbz x3, .L1 mov x4, 0 .L3: ld1w z31.s, p0/z, [x1] add x4, x4, x2 st1w z31.s, p0, [x0] add x1, x1, x2, lsl 2 add x0, x0, x2, lsl 2 cmp x3, x4 bhi .L3 .L1: ret ``` After this change we end up with: ``` f: cbz x3, .L1 mov x4, 0 .L3: ld1w z31.s, p0/z, [x1, x4, lsl 2] st1w z31.s, p0, [x0, x4, lsl 2] add x4, x4, x2 cmp x3, x4 bhi .L3 .L1: ret ``` gcc/ChangeLog: * tree-ssa-loop-ivopts.cc (constant_multiple_of): Change tree_to_aff_combination to tree_to_aff_combination_expand and add parameter to take ivopts_data. (get_computation_aff_1): Change parameters and calls to include ivopts_data. (get_computation_aff): Ditto. (get_computation_at) Ditto.: (get_debug_computation_at) Ditto.: (get_computation_cost) Ditto.: (rewrite_use_nonlinear_expr) Ditto.: (rewrite_use_address) Ditto.: (rewrite_use_compare) Ditto.: (remove_unused_ivs) Ditto.: gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/adr_7.c: New test. --- gcc/testsuite/gcc.target/aarch64/sve/adr_7.c | 19 ++++++ gcc/tree-ssa-loop-ivopts.cc | 65 +++++++++++--------- 2 files changed, 54 insertions(+), 30 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/adr_7.c diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c new file mode 100644 index 00000000000..61e23bbf182 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_7.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -ftree-vectorize" } */ + +#include <arm_sve.h> + +void f(int *p1, int *p2, unsigned long step, unsigned long end, svbool_t pg) { + for (unsigned long i = 0; i < end; i += step) { + svst1(pg, p1, svld1_s32(pg, p2)); + p1 += step; + p2 += step; + } +} + +/* { dg-final { scan-assembler-not {\tld1w\tz[0-9]+\.d, p[0-9]+/z\[x[0-9]+\.d\]} } } */ +/* { dg-final { scan-assembler-not {\tst1w\tz[0-9]+\.d, p[0-9]+/z\[x[0-9]+\.d\]} } } */ + +/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-9]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-9]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 1 } } */ diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc index 8a6726f1988..544a946ff89 100644 --- a/gcc/tree-ssa-loop-ivopts.cc +++ b/gcc/tree-ssa-loop-ivopts.cc @@ -2117,11 +2117,15 @@ idx_record_use (tree base, tree *idx, signedness of TOP and BOT. */ static bool -constant_multiple_of (tree top, tree bot, widest_int *mul) +constant_multiple_of (tree top, tree bot, widest_int *mul, + struct ivopts_data *data) { aff_tree aff_top, aff_bot; - tree_to_aff_combination (top, TREE_TYPE (top), &aff_top); - tree_to_aff_combination (bot, TREE_TYPE (bot), &aff_bot); + tree_to_aff_combination_expand (top, TREE_TYPE (top), &aff_top, + &data->name_expansion_cache); + tree_to_aff_combination_expand (bot, TREE_TYPE (bot), &aff_bot, + &data->name_expansion_cache); + poly_widest_int poly_mul; if (aff_combination_constant_multiple_p (&aff_top, &aff_bot, &poly_mul) && poly_mul.is_constant (mul)) @@ -3945,13 +3949,14 @@ determine_common_wider_type (tree *a, tree *b) } /* Determines the expression by that USE is expressed from induction variable - CAND at statement AT in LOOP. The expression is stored in two parts in a - decomposed form. The invariant part is stored in AFF_INV; while variant - part in AFF_VAR. Store ratio of CAND.step over USE.step in PRAT if it's - non-null. Returns false if USE cannot be expressed using CAND. */ + CAND at statement AT in DATA's current loop. The expression is stored in + two parts in a decomposed form. The invariant part is stored in AFF_INV; + while variant part in AFF_VAR. Store ratio of CAND.step over USE.step in + PRAT if it's non-null. Returns false if USE cannot be expressed using + CAND. */ static bool -get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use, +get_computation_aff_1 (struct ivopts_data *data, gimple *at, struct iv_use *use, struct iv_cand *cand, class aff_tree *aff_inv, class aff_tree *aff_var, widest_int *prat = NULL) { @@ -3966,7 +3971,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use, if (TYPE_PRECISION (utype) > TYPE_PRECISION (ctype)) return false; - var = var_at_stmt (loop, cand, at); + var = var_at_stmt (data->current_loop, cand, at); uutype = unsigned_type_for (utype); /* If the conversion is not noop, perform it. */ @@ -4011,7 +4016,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use, gcc_assert (gimple_assign_lhs (use->stmt) == cand->var_after); rat = 1; } - else if (!constant_multiple_of (ustep, cstep, &rat)) + else if (!constant_multiple_of (ustep, cstep, &rat, data)) return false; if (prat) @@ -4030,7 +4035,7 @@ get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use, tree_to_aff_combination (var, uutype, aff_var); /* We need to shift the value if we are after the increment. */ - if (stmt_after_increment (loop, cand, at)) + if (stmt_after_increment (data->current_loop, cand, at)) { aff_tree cstep_aff; @@ -4053,16 +4058,17 @@ get_computation_aff_1 (class loop *loop, gimple *at, struct iv_use *use, } /* Determines the expression by that USE is expressed from induction variable - CAND at statement AT in LOOP. The expression is stored in a decomposed - form into AFF. Returns false if USE cannot be expressed using CAND. */ + CAND at statement AT in DATA's current loop. The expression is stored in a + decomposed form into AFF. Returns false if USE cannot be expressed using + CAND. */ static bool -get_computation_aff (class loop *loop, gimple *at, struct iv_use *use, +get_computation_aff (struct ivopts_data *data, gimple *at, struct iv_use *use, struct iv_cand *cand, class aff_tree *aff) { aff_tree aff_var; - if (!get_computation_aff_1 (loop, at, use, cand, aff, &aff_var)) + if (!get_computation_aff_1 (data, at, use, cand, aff, &aff_var)) return false; aff_combination_add (aff, &aff_var); @@ -4092,16 +4098,17 @@ get_use_type (struct iv_use *use) } /* Determines the expression by that USE is expressed from induction variable - CAND at statement AT in LOOP. The computation is unshared. */ + CAND at statement AT in DATA's current loop. The computation is + unshared. */ static tree -get_computation_at (class loop *loop, gimple *at, +get_computation_at (struct ivopts_data *data, gimple *at, struct iv_use *use, struct iv_cand *cand) { aff_tree aff; tree type = get_use_type (use); - if (!get_computation_aff (loop, at, use, cand, &aff)) + if (!get_computation_aff (data, at, use, cand, &aff)) return NULL_TREE; unshare_aff_combination (&aff); return fold_convert (type, aff_combination_to_tree (&aff)); @@ -4111,10 +4118,10 @@ get_computation_at (class loop *loop, gimple *at, is more expensive. Intended for debug stmts. */ static tree -get_debug_computation_at (class loop *loop, gimple *at, +get_debug_computation_at (struct ivopts_data *data, gimple *at, struct iv_use *use, struct iv_cand *cand) { - if (tree ret = get_computation_at (loop, at, use, cand)) + if (tree ret = get_computation_at (data, at, use, cand)) return ret; tree ubase = use->iv->base, ustep = use->iv->step; @@ -4131,7 +4138,7 @@ get_debug_computation_at (class loop *loop, gimple *at, try to express use = ubase + (var - cbase) / ratio. */ if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep), - &rat)) + &rat, data)) return NULL_TREE; bool neg_p = false; @@ -4160,7 +4167,7 @@ get_debug_computation_at (class loop *loop, gimple *at, && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype)) return NULL_TREE; - var = var_at_stmt (loop, cand, at); + var = var_at_stmt (data->current_loop, cand, at); if (POINTER_TYPE_P (ctype)) { @@ -4170,7 +4177,7 @@ get_debug_computation_at (class loop *loop, gimple *at, var = fold_convert (ctype, var); } - if (stmt_after_increment (loop, cand, at)) + if (stmt_after_increment (data->current_loop, cand, at)) var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, unshare_expr (cstep)); @@ -4873,8 +4880,7 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use, return infinite_cost; } - if (!get_computation_aff_1 (data->current_loop, at, use, - cand, &aff_inv, &aff_var, &rat) + if (!get_computation_aff_1 (data, at, use, cand, &aff_inv, &aff_var, &rat) || !wi::fits_shwi_p (rat)) return infinite_cost; @@ -7358,8 +7364,7 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data, } aff_tree aff_inv, aff_var; - if (!get_computation_aff_1 (data->current_loop, use->stmt, - use, cand, &aff_inv, &aff_var)) + if (!get_computation_aff_1 (data, use->stmt, use, cand, &aff_inv, &aff_var)) gcc_unreachable (); unshare_aff_combination (&aff_inv); @@ -7556,7 +7561,7 @@ rewrite_use_address (struct ivopts_data *data, bool ok; adjust_iv_update_pos (cand, use); - ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff); + ok = get_computation_aff (data, use->stmt, use, cand, &aff); gcc_assert (ok); unshare_aff_combination (&aff); @@ -7659,7 +7664,7 @@ rewrite_use_compare (struct ivopts_data *data, /* The induction variable elimination failed; just express the original giv. */ - comp = get_computation_at (data->current_loop, use->stmt, use, cand); + comp = get_computation_at (data, use->stmt, use, cand); gcc_assert (comp != NULL_TREE); gcc_assert (use->op_p != NULL); *use->op_p = force_gimple_operand_gsi (&bsi, comp, true, @@ -7790,7 +7795,7 @@ remove_unused_ivs (struct ivopts_data *data, bitmap toremove) if (best_cand == NULL || best_pref < cand_pref) { tree this_comp - = get_debug_computation_at (data->current_loop, + = get_debug_computation_at (data, SSA_NAME_DEF_STMT (def), &dummy_use, cand); if (this_comp) -- 2.34.1