With SLP now being a graph whose nodes can be shared across instances, we have to make sure to compute the load permutation of each node only once, so that analyzing a later instance does not overwrite the result of an earlier analysis.
Bootstrapped and tested on x86_64-unknown-linux-gnu, applied. Richard. 2020-01-28 Richard Biener <rguent...@suse.de> PR tree-optimization/93428 * tree-vect-slp.c (vect_build_slp_tree_2): Compute the load permutation when the load node is created. (vect_analyze_slp_instance): Re-use it here. * gcc.dg/torture/pr93428.c: New testcase. --- gcc/testsuite/gcc.dg/torture/pr93428.c | 27 +++++++++++++++++++++++++ gcc/tree-vect-slp.c | 37 +++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/torture/pr93428.c diff --git a/gcc/testsuite/gcc.dg/torture/pr93428.c b/gcc/testsuite/gcc.dg/torture/pr93428.c new file mode 100644 index 00000000000..b24f651e5a6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr93428.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-slp-vectorize" } */ + +int ai[2][8]; +void bar (int *); +void +br (void) +{ + int qp[9]; + bar (qp); + ai[0][0] = qp[0] + qp[1] + 1 >> 1; + ai[0][1] = qp[1] + qp[2] + 1 >> 1; + ai[0][2] = qp[2] + qp[3] + 1 >> 1; + ai[0][3] = qp[3] + qp[4] + 1 >> 1; + ai[0][4] = qp[4] + qp[5] + 1 >> 1; + ai[0][5] = qp[5] + qp[6] + 1 >> 1; + ai[0][6] = qp[6] + qp[7] + 1 >> 1; + ai[0][7] = qp[7] + qp[8] + 1 >> 1; + ai[1][0] = qp[0] + qp[1] + 2 * qp[0] + 1 >> 2; + ai[1][1] = qp[0] + qp[2] + 2 * qp[1] + 1 >> 2; + ai[1][2] = qp[1] + qp[3] + 2 * qp[2] + 1 >> 2; + ai[1][3] = qp[2] + qp[4] + 2 * qp[3] + 1 >> 2; + ai[1][4] = qp[3] + qp[5] + 2 * qp[4] + 1 >> 2; + ai[1][5] = qp[4] + qp[6] + 2 * qp[5] + 1 >> 2; + ai[1][6] = qp[5] + qp[7] + 2 * qp[6] + 1 >> 2; + ai[1][7] = qp[6] + qp[8] + 2 * qp[7] + 1 >> 2; +} diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index b13beeb3689..71a24b78cf4 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1353,6 +1353,23 @@ vect_build_slp_tree_2 (vec_info *vinfo, *max_nunits = this_max_nunits; (*tree_size)++; node = vect_create_new_slp_node (stmts); + /* And compute the load permutation. 
Whether it is actually + a permutation depends on the unrolling factor which is + decided later. */ + vec<unsigned> load_permutation; + int j; + stmt_vec_info load_info; + load_permutation.create (group_size); + stmt_vec_info first_stmt_info + = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) + { + int load_place = vect_get_place_in_interleaving_chain + (load_info, first_stmt_info); + gcc_assert (load_place != -1); + load_permutation.safe_push (load_place); + } + SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; return node; } } @@ -2254,22 +2271,19 @@ vect_analyze_slp_instance (vec_info *vinfo, bool loads_permuted = false; FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) { - vec<unsigned> load_permutation; - int j; + if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) + continue; + unsigned j; stmt_vec_info load_info; bool this_load_permuted = false; - load_permutation.create (group_size); stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (load_node)[0]); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) - { - int load_place = vect_get_place_in_interleaving_chain - (load_info, first_stmt_info); - gcc_assert (load_place != -1); - if (load_place != j) + if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) + { this_load_permuted = true; - load_permutation.safe_push (load_place); - } + break; + } if (!this_load_permuted /* The load requires permutation when unrolling exposes a gap either because the group is larger than the SLP @@ -2278,10 +2292,9 @@ vect_analyze_slp_instance (vec_info *vinfo, || (group_size == DR_GROUP_SIZE (first_stmt_info) && DR_GROUP_GAP (first_stmt_info) == 0))) { - load_permutation.release (); + SLP_TREE_LOAD_PERMUTATION (load_node).release (); continue; } - SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; loads_permuted = true; } -- 2.16.4