https://gcc.gnu.org/g:51ba233fe2db562390a6e0a3618420889761bc77
commit r16-305-g51ba233fe2db562390a6e0a3618420889761bc77 Author: Richard Biener <rguent...@suse.de> Date: Tue Apr 29 13:23:41 2025 +0200 tree-optimization/119960 - failed external SLP promotion The following addresses a too conservative sanity check of SLP nodes we want to promote external. The issue lies in code generation for such external which relies on get_later_stmt to figure an insert location. But get_later_stmt relies on the ability to totally order stmts, specifically implementation-wise that they are all from the same BB, which is what is verified at the moment. The patch changes this to require stmts to be orderable by dominance queries. For simplicity and seemingly enough for the testcase in PR119960, this handles the case of two distinct BBs. PR tree-optimization/119960 * tree-vect-slp.cc (vect_slp_can_convert_to_external): Handle cases where defs from multiple BBs are ordered by their dominance relation. * gcc.dg/vect/bb-slp-pr119960-1.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c | 15 +++++++ gcc/tree-vect-slp.cc | 63 ++++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c new file mode 100644 index 000000000000..955fc7e32208 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double foo (double *dst, double *src, int b) +{ + double y = src[1]; + if (b) + { + dst[0] = src[0]; + dst[1] = y; + } + return y; +} + +/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp2" { target vect_double } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 19beeed8a3a9..ea30795f0647 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -7832,21 +7832,70 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, node, node_instance, cost_vec); } +static int +sort_ints (const void *a_, const void *b_) +{ + int a = *(const int *)a_; + int b = *(const int *)b_; + return a - b; +} + /* Verify if we can externalize a set of internal defs. */ static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts) { + /* Constant generation uses get_later_stmt which can only handle + defs from the same BB or a set of defs that can be ordered + with a dominance query. */ basic_block bb = NULL; + bool all_same = true; + auto_vec<int> bbs; + bbs.reserve_exact (stmts.length ()); for (stmt_vec_info stmt : stmts) - if (!stmt) - return false; - /* Constant generation uses get_later_stmt which can only handle - defs from the same BB. */ - else if (!bb) - bb = gimple_bb (stmt->stmt); - else if (gimple_bb (stmt->stmt) != bb) + { + if (!stmt) + return false; + else if (!bb) + bb = gimple_bb (stmt->stmt); + else if (gimple_bb (stmt->stmt) != bb) + all_same = false; + bbs.quick_push (gimple_bb (stmt->stmt)->index); + } + if (all_same) + return true; + + /* Produce a vector of unique BB indexes for the defs. */ + bbs.qsort (sort_ints); + unsigned i, j; + for (i = 1, j = 1; i < bbs.length (); ++i) + if (bbs[i] != bbs[j-1]) + bbs[j++] = bbs[i]; + gcc_assert (j >= 2); + bbs.truncate (j); + + if (bbs.length () == 2) + return (dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[0]), + BASIC_BLOCK_FOR_FN (cfun, bbs[1])) + || dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[1]), + BASIC_BLOCK_FOR_FN (cfun, bbs[0]))); + + /* ??? For more than two BBs we can sort the vector and verify the + result is a total order. But we can't use vec::qsort with a + compare function using a dominance query since there's no way to + signal failure and any fallback for an unordered pair would + fail qsort_chk later. + For now simply hope that ordering after BB index provides the + best candidate total order. If required we can implement our + own mergesort or export an entry without checking. */ + for (unsigned i = 1; i < bbs.length (); ++i) + if (!dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[i]), + BASIC_BLOCK_FOR_FN (cfun, bbs[i-1]))) return false; + return true; }