[gcc r12-10503] tree-optimization/111070 - fix ICE with recent ifcombine fix
https://gcc.gnu.org/g:d73137ab352d654f50b703925bd92e021dce1cab commit r12-10503-gd73137ab352d654f50b703925bd92e021dce1cab Author: Richard Biener Date: Mon Aug 21 09:01:00 2023 +0200 tree-optimization/111070 - fix ICE with recent ifcombine fix We now got test coverage for non-SSA name bits so the following amends the SSA_NAME_OCCURS_IN_ABNORMAL_PHI checks. PR tree-optimization/111070 * tree-ssa-ifcombine.cc (ifcombine_ifandif): Check we have an SSA name before checking SSA_NAME_OCCURS_IN_ABNORMAL_PHI. * gcc.dg/pr111070.c: New testcase. (cherry picked from commit 966b0a96523fb7adbf498ac71df5e033c70dc546) Diff: --- gcc/testsuite/gcc.dg/pr111070.c | 20 gcc/tree-ssa-ifcombine.cc | 9 ++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr111070.c b/gcc/testsuite/gcc.dg/pr111070.c new file mode 100644 index 000..1ebc7adf782 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr111070.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +/* common */ +char c; +/* arrays must be 8 byte aligned, regardless of size */ +char c_ary[1]; + +/* data */ +char d = 1; +char d_ary[1] = {1}; + +int main () +{ + if (((unsigned long)&c_ary[0] & 7) != 0) +return 1; + if (((unsigned long)&d_ary[0] & 7) != 0) +return 1; + return 0; +} diff --git a/gcc/tree-ssa-ifcombine.cc b/gcc/tree-ssa-ifcombine.cc index b139328af22..dcfa92c0c82 100644 --- a/gcc/tree-ssa-ifcombine.cc +++ b/gcc/tree-ssa-ifcombine.cc @@ -415,7 +415,8 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv, { tree t, t2; - if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)) + if (TREE_CODE (name1) == SSA_NAME + && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)) return false; /* Do it. 
*/ @@ -468,8 +469,10 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv, gimple_stmt_iterator gsi; tree t; - if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1) - || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2)) + if ((TREE_CODE (name1) == SSA_NAME + && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)) + || (TREE_CODE (name2) == SSA_NAME + && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2))) return false; /* Find the common name which is bit-tested. */
[gcc r15-1163] tree-optimization/115388 - wrong DSE in irreducible regions
https://gcc.gnu.org/g:818e760528d436ea8f6c28ef620e2bb82d456ea1 commit r15-1163-g818e760528d436ea8f6c28ef620e2bb82d456ea1 Author: Richard Biener Date: Mon Jun 10 11:29:43 2024 +0200 tree-optimization/115388 - wrong DSE in irreductible regions The following fixes a latent bug in DSE with regarding to variant array accesses where the code avoiding bogus DSE in loops fails to handle irreducible regions. For those we need to make sure backedges are marked and discover a header for the irreducible region to check invariantness. PR tree-optimization/115388 * tree-ssa-dse.cc (dse_classify_store): Handle irreducible regions. (pass_dse::execute): Make sure to mark backedges. * gcc.dg/torture/pr115388.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115388.c | 34 ++ gcc/tree-ssa-dse.cc | 61 + 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr115388.c b/gcc/testsuite/gcc.dg/torture/pr115388.c new file mode 100644 index 000..c7c902888da --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115388.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ + +int printf(const char *, ...); +int a[10], b, c, d[0], h, i, j, k, l; +char e = -1, g; +volatile int f; +static void n() { + while (e >= 0) +while (1) + ; + for (b = 2; b >= 0; b--) { +for (k = 0; k < 4; k++) { + if (e || i) +continue; + for (h = 0; h < 2; h++) +f; +} +for (l = 2; l >= 0; l--) + g = 0; +for (; g < 1; g++) + if (c) +d[l] = 1; +a[9] = 0; +a[b] = 1; +while (j) + printf("\n"); + } +} +int main() { + n(); + if (a[1] != 1) +__builtin_abort(); + return 0; +} diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc index 9252ca34050..63bf4491cf6 100644 --- a/gcc/tree-ssa-dse.cc +++ b/gcc/tree-ssa-dse.cc @@ -1018,8 +1018,11 @@ dse_classify_store (ao_ref *ref, gimple *stmt, if (defvar == stop_at_vuse) return DSE_STORE_LIVE; - FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar) + use_operand_p usep; + FOR_EACH_IMM_USE_FAST (usep, ui, defvar) { + use_stmt = USE_STMT (usep); + /* Limit stmt 
walking. */ if (++cnt > param_dse_max_alias_queries_per_store) { @@ -1031,31 +1034,43 @@ dse_classify_store (ao_ref *ref, gimple *stmt, have to be careful with loops and with memory references containing operands that are also operands of PHI nodes. See gcc.c-torture/execute/20051110-*.c. */ - if (gimple_code (use_stmt) == GIMPLE_PHI) + if (gphi *phi = dyn_cast (use_stmt)) { /* Look through single-argument PHIs. */ - if (gimple_phi_num_args (use_stmt) == 1) - worklist.safe_push (gimple_phi_result (use_stmt)); - - /* If we already visited this PHI ignore it for further -processing. */ - else if (!bitmap_bit_p (visited, - SSA_NAME_VERSION - (PHI_RESULT (use_stmt + if (gimple_phi_num_args (phi) == 1) + worklist.safe_push (gimple_phi_result (phi)); + else { /* If we visit this PHI by following a backedge then we have to make sure ref->ref only refers to SSA names that are invariant with respect to the loop -represented by this PHI node. */ - if (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt), - gimple_bb (use_stmt)) - && !for_each_index (ref->ref ? >ref : >base, - check_name, gimple_bb (use_stmt))) - return DSE_STORE_LIVE; - defs.safe_push (use_stmt); - if (!first_phi_def) - first_phi_def = as_a (use_stmt); - last_phi_def = as_a (use_stmt); +represented by this PHI node. We handle irreducible +regions by relying on backedge marking and identifying +the head of the (sub-)region. */ + edge e = gimple_phi_arg_edge +(phi, PHI_ARG_INDEX_FROM_USE (usep)); + if (e->flags & EDGE_DFS_BACK) + { + basic_block rgn_head + = nearest_common_dominator (CDI_DOMINATORS, + gimple_bb (phi), +
[gcc r15-1160] tree-optimization/115395 - wrong-code with SLP reduction in epilog
https://gcc.gnu.org/g:4ed9c5df7efeb98e190573cca42a4fd40666c45f commit r15-1160-g4ed9c5df7efeb98e190573cca42a4fd40666c45f Author: Richard Biener Date: Mon Jun 10 10:12:52 2024 +0200 tree-optimization/115395 - wrong-code with SLP reduction in epilog When we continue a non-SLP reduction from the main loop in the epilog with a SLP reduction we currently fail to handle an adjustment by the initial value because that's not a thing with SLP. As long as we have the possibility to mix SLP and non-SLP we have to handle it though. PR tree-optimization/115395 * tree-vect-loop.cc (vect_create_epilog_for_reduction): Handle STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT also for SLP reductions of group_size one. * gcc.dg/vect/pr115395.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/pr115395.c | 27 +++ gcc/tree-vect-loop.cc| 27 --- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr115395.c b/gcc/testsuite/gcc.dg/vect/pr115395.c new file mode 100644 index 000..cd1cee9f3df --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr115395.c @@ -0,0 +1,27 @@ +/* { dg-additional-options "-mavx2" { target avx2_runtime } } */ + +#include "tree-vect.h" + +struct { + long header_size; + long start_offset; + long end_offset; +} myrar_dbo[5] = {{0, 87, 6980}, {0, 7087, 13980}, {0, 14087, 0}}; + +int i; +long offset; + +int main() +{ + check_vect (); + + offset += myrar_dbo[0].start_offset; + while (i < 2) { +i++; +offset += myrar_dbo[i].start_offset - myrar_dbo[i - 1].end_offset; + } + if (offset != 301) +abort(); + + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 028692614bb..c471f1564a7 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6030,25 +6030,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree induc_val = NULL_TREE; tree adjustment_def = NULL; - if (slp_node) -{ - /* Optimize: for induction condition reduction, if we can't use zero -for induc_val, use initial_def. 
*/ - if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) - induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); - /* ??? Coverage for 'else' isn't clear. */ -} + /* Optimize: for induction condition reduction, if we can't use zero + for induc_val, use initial_def. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) +induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); + else if (double_reduc) +; else -{ - /* Optimize: for induction condition reduction, if we can't use zero - for induc_val, use initial_def. */ - if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) - induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); - else if (double_reduc) - ; - else - adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); -} +adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); stmt_vec_info single_live_out_stmt[] = { stmt_info }; array_slice live_out_stmts = single_live_out_stmt; @@ -6873,7 +6862,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (adjustment_def) { - gcc_assert (!slp_reduc); + gcc_assert (!slp_reduc || group_size == 1); gimple_seq stmts = NULL; if (double_reduc) {
[gcc r15-1126] tree-optimization/115383 - EXTRACT_LAST_REDUCTION with multiple stmt copies
https://gcc.gnu.org/g:c1429e3a8da0cdfe9391e1e9b2c7228d896a3a87 commit r15-1126-gc1429e3a8da0cdfe9391e1e9b2c7228d896a3a87 Author: Richard Biener Date: Fri Jun 7 12:15:31 2024 +0200 tree-optimization/115383 - EXTRACT_LAST_REDUCTION with multiple stmt copies The EXTRACT_LAST_REDUCTION code isn't ready to deal with multiple stmt copies but SLP no longer checks for this. The following adjusts code generation to handle the situation. PR tree-optimization/115383 * tree-vect-stmts.cc (vectorizable_condition): Handle generating a chain of .FOLD_EXTRACT_LAST. * gcc.dg/vect/pr115383.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/pr115383.c | 20 gcc/tree-vect-stmts.cc | 20 +++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr115383.c b/gcc/testsuite/gcc.dg/vect/pr115383.c new file mode 100644 index 000..92c24699146 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr115383.c @@ -0,0 +1,20 @@ +#include "tree-vect.h" + +int __attribute__((noipa)) +s331 (int i, int n) +{ + int j = 0; + for (; i < n; i++) +if ((float)i < 0.) + j = i; + return j; +} + +int main() +{ + check_vect (); + int j = s331(-13, 17); + if (j != -1) +abort (); + return 0; +} diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 5098b7fab6a..05a169ecb2d 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12415,6 +12415,9 @@ vectorizable_condition (vec_info *vinfo, reduction_type != EXTRACT_LAST_REDUCTION ? else_clause : NULL, vectype, _oprnds3); + if (reduction_type == EXTRACT_LAST_REDUCTION) +vec_else_clause = else_clause; + /* Arguments are ready. Create the new vector stmt. 
*/ FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_cond_lhs) { @@ -12557,17 +12560,24 @@ vectorizable_condition (vec_info *vinfo, { gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt; tree lhs = gimple_get_lhs (old_stmt); + if ((unsigned)i != vec_oprnds0.length () - 1) + lhs = copy_ssa_name (lhs); if (len) new_stmt = gimple_build_call_internal - (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare, -vec_then_clause, len, bias); + (IFN_LEN_FOLD_EXTRACT_LAST, 5, vec_else_clause, vec_compare, +vec_then_clause, len, bias); else new_stmt = gimple_build_call_internal - (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, -vec_then_clause); + (IFN_FOLD_EXTRACT_LAST, 3, vec_else_clause, vec_compare, +vec_then_clause); gimple_call_set_lhs (new_stmt, lhs); SSA_NAME_DEF_STMT (lhs) = new_stmt; - if (old_stmt == gsi_stmt (*gsi)) + if ((unsigned)i != vec_oprnds0.length () - 1) + { + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + vec_else_clause = lhs; + } + else if (old_stmt == gsi_stmt (*gsi)) vect_finish_replace_stmt (vinfo, stmt_info, new_stmt); else {
[gcc r15-1097] Fix fold-left reduction vectorization with multiple stmt copies
https://gcc.gnu.org/g:dd6f942c266533b2f72610f354bc9184f8276beb commit r15-1097-gdd6f942c266533b2f72610f354bc9184f8276beb Author: Richard Biener Date: Fri Jun 7 09:41:11 2024 +0200 Fix fold-left reduction vectorization with multiple stmt copies There's a typo when code generating the mask operand for conditional fold-left reductions in the case we have multiple stmt copies. The latter is now allowed for SLP and possibly disabled for non-SLP by accident. This fixes the observed run-FAIL for gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c with AVX512 and 256bit sized vectors. * tree-vect-loop.cc (vectorize_fold_left_reduction): Fix mask vector operand indexing. Diff: --- gcc/tree-vect-loop.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index ceb92156b58..028692614bb 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7217,7 +7217,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo, if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i); else if (is_cond_op) - mask = vec_opmask[0]; + mask = vec_opmask[i]; if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) { len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
[gcc r15-1056] Allow single-lane SLP in-order reductions
https://gcc.gnu.org/g:4653b682ef161c3c2fc7bf8462b8f9206a1349e6 commit r15-1056-g4653b682ef161c3c2fc7bf8462b8f9206a1349e6 Author: Richard Biener Date: Tue Mar 5 15:46:24 2024 +0100 Allow single-lane SLP in-order reductions The single-lane case isn't different from non-SLP, no re-association implied. But the transform stage cannot handle a conditional reduction op which isn't checked during analysis - this makes it work, exercised with a single-lane non-reduction-chain by gcc.target/i386/pr112464.c * tree-vect-loop.cc (vectorizable_reduction): Allow single-lane SLP in-order reductions. (vectorize_fold_left_reduction): Handle SLP reduction with conditional reduction op. Diff: --- gcc/tree-vect-loop.cc | 48 +++- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index b9e8e9b5559..ceb92156b58 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7139,56 +7139,46 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo, gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op); if (slp_node) -{ - if (is_cond_op) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, -"fold-left reduction on SLP not supported.\n"); - return false; - } - - gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), - TYPE_VECTOR_SUBPARTS (vectype_in))); -} +gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), + TYPE_VECTOR_SUBPARTS (vectype_in))); /* The operands either come from a binary operation or an IFN_COND operation. The former is a gimple assign with binary rhs and the latter is a gimple call with four arguments. 
*/ gcc_assert (num_ops == 2 || num_ops == 4); - tree op0, opmask; - if (!is_cond_op) -op0 = ops[1 - reduc_index]; - else -{ - op0 = ops[2 + (1 - reduc_index)]; - opmask = ops[0]; - gcc_assert (!slp_node); -} int group_size = 1; stmt_vec_info scalar_dest_def_info; auto_vec vec_oprnds0, vec_opmask; if (slp_node) { - auto_vec > vec_defs (2); - vect_get_slp_defs (loop_vinfo, slp_node, _defs); - vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); - vec_defs[0].release (); - vec_defs[1].release (); + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0) + + (1 - reduc_index)], + _oprnds0); group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; + /* For an IFN_COND_OP we also need the vector mask operand. */ + if (is_cond_op) + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], _opmask); } else { + tree op0, opmask; + if (!is_cond_op) + op0 = ops[1 - reduc_index]; + else + { + op0 = ops[2 + (1 - reduc_index)]; + opmask = ops[0]; + } vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, op0, _oprnds0); scalar_dest_def_info = stmt_info; /* For an IFN_COND_OP we also need the vector mask operand. */ if (is_cond_op) - vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, -opmask, _opmask); + vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1, + opmask, _opmask); } gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt; @@ -8210,7 +8200,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } if (reduction_type == FOLD_LEFT_REDUCTION - && slp_node + && (slp_node && SLP_TREE_LANES (slp_node) > 1) && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) { /* We cannot use in-order reductions in this case because there is
[gcc r15-1054] Allow single-lane COND_REDUCTION vectorization
https://gcc.gnu.org/g:202a9c8fe7db9dd94e5a77f42e54ef3d966f88e8 commit r15-1054-g202a9c8fe7db9dd94e5a77f42e54ef3d966f88e8 Author: Richard Biener Date: Fri Mar 1 14:39:08 2024 +0100 Allow single-lane COND_REDUCTION vectorization The following enables single-lane COND_REDUCTION vectorization. * tree-vect-loop.cc (vect_create_epilog_for_reduction): Adjust for single-lane COND_REDUCTION SLP vectorization. (vectorizable_reduction): Likewise. (vect_transform_cycle_phi): Likewise. Diff: --- gcc/tree-vect-loop.cc | 97 ++- 1 file changed, 81 insertions(+), 16 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 06292ed8bbe..ccd6acef5c5 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6030,7 +6030,13 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) -; +{ + /* Optimize: for induction condition reduction, if we can't use zero +for induc_val, use initial_def. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); + /* ??? Coverage for double_reduc and 'else' isn't clear. 
*/ +} else { /* Optimize: for induction condition reduction, if we can't use zero @@ -6075,23 +6081,46 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) { auto_vec, 2> ccompares; - stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); - cond_info = vect_stmt_to_vectorize (cond_info); - while (cond_info != reduc_info) + if (slp_node) { - if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + slp_tree cond_node = slp_node_instance->root; + while (cond_node != slp_node_instance->reduc_phis) { - gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; - gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); - ccompares.safe_push - (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), -STMT_VINFO_REDUC_IDX (cond_info) == 2)); + stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node); + if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + { + gimple *vec_stmt + = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]); + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + ccompares.safe_push + (std::make_pair (gimple_assign_rhs1 (vec_stmt), +STMT_VINFO_REDUC_IDX (cond_info) == 2)); + } + /* ??? We probably want to have REDUC_IDX on the SLP node? 
*/ + cond_node = SLP_TREE_CHILDREN + (cond_node)[STMT_VINFO_REDUC_IDX (cond_info)]; } - cond_info - = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, -1 + STMT_VINFO_REDUC_IDX - (cond_info))); + } + else + { + stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); cond_info = vect_stmt_to_vectorize (cond_info); + while (cond_info != reduc_info) + { + if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + { + gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + ccompares.safe_push + (std::make_pair (gimple_assign_rhs1 (vec_stmt), +STMT_VINFO_REDUC_IDX (cond_info) == 2)); + } + cond_info + = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, +1 + STMT_VINFO_REDUC_IDX +(cond_info))); + cond_info = vect_stmt_to_vectorize (cond_info); + } } gcc_assert (ccompares.length () != 0); @@ -7844,7 +7873,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, /* If we have a condition reduction, see if we can simplify it further. */ if (v_reduc_type == COND_REDUCTION) { - if (slp_node) + if (slp_node && SLP_TREE_LANES (slp_node) != 1) return false; /* When the condition uses the reduction value in the condition, fail. */ @@ -8050,6 +8079,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } } + if ((reduction_type == COND_REDUCTION + || reduction_type == INTEGER_INDUC_COND_REDUCTION +
[gcc r15-1055] Add double reduction support for SLP vectorization
https://gcc.gnu.org/g:2ee41ef76a99ef5a8b62b351e2c01dad93f51b18 commit r15-1055-g2ee41ef76a99ef5a8b62b351e2c01dad93f51b18 Author: Richard Biener Date: Tue Mar 5 15:28:58 2024 +0100 Add double reduction support for SLP vectorization The following makes double reduction vectorization work when using (single-lane) SLP vectorization. * tree-vect-loop.cc (vect_analyze_scalar_cycles_1): Queue double reductions in LOOP_VINFO_REDUCTIONS. (vect_create_epilog_for_reduction): Remove asserts disabling SLP for double reductions. (vectorizable_reduction): Analyze SLP double reductions only once and start off the correct places. * tree-vect-slp.cc (vect_get_and_check_slp_defs): Allow vect_double_reduction_def. (vect_build_slp_tree_2): Fix condition for the ignored reduction initial values. * tree-vect-stmts.cc (vect_analyze_stmt): Allow vect_double_reduction_def. Diff: --- gcc/tree-vect-loop.cc | 35 +-- gcc/tree-vect-slp.cc | 3 ++- gcc/tree-vect-stmts.cc | 4 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index ccd6acef5c5..b9e8e9b5559 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -685,6 +685,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; + /* Make it accessible for SLP vectorization. 
*/ + LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info); } else { @@ -5975,7 +5977,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, stmt_vec_info rdef_info = stmt_info; if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) { - gcc_assert (!slp_node); double_reduc = true; stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def (stmt_info->stmt, 0)); @@ -6020,7 +6021,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { outer_loop = loop; loop = loop->inner; - gcc_assert (!slp_node && double_reduc); + gcc_assert (double_reduc); } vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); @@ -6035,7 +6036,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for induc_val, use initial_def. */ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); - /* ??? Coverage for double_reduc and 'else' isn't clear. */ + /* ??? Coverage for 'else' isn't clear. */ } else { @@ -7605,15 +7606,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; return true; } - if (slp_node) -{ - slp_node_instance->reduc_phis = slp_node; - /* ??? We're leaving slp_node to point to the PHIs, we only -need it to get at the number of vector stmts which wasn't -yet initialized for the instance root. */ -} if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) { + if (gimple_bb (stmt_info->stmt) != loop->header) + { + /* For SLP we arrive here for both the inner loop LC PHI and +the outer loop PHI. The latter is what we want to analyze +the reduction with. */ + gcc_assert (slp_node); + return true; + } use_operand_p use_p; gimple *use_stmt; bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), @@ -7622,6 +7624,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, phi_info = loop_vinfo->lookup_stmt (use_stmt); } + if (slp_node) +{ + slp_node_instance->reduc_phis = slp_node; + /* ??? 
We're leaving slp_node to point to the PHIs, we only +need it to get at the number of vector stmts which wasn't +yet initialized for the instance root. */ +} + /* PHIs should not participate in patterns. */ gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); gphi *reduc_def_phi = as_a (phi_info->stmt); @@ -7637,6 +7647,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool only_slp_reduc_chain = true; stmt_info = NULL; slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; + /* For double-reductions we start SLP analysis at the inner loop LC PHI + which is the def of the outer loop live stmt. */ + if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def + && slp_node) +slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; while (reduc_def != PHI_RESULT (reduc_def_phi)) { stmt_vec_info def =
[gcc r15-1053] Relax COND_EXPR reduction vectorization SLP restriction
https://gcc.gnu.org/g:28edeb1409a7b839407ec06031899b933390bff3 commit r15-1053-g28edeb1409a7b839407ec06031899b933390bff3 Author: Richard Biener Date: Fri Feb 23 16:16:38 2024 +0100 Relax COND_EXPR reduction vectorization SLP restriction Allow one-lane SLP but for the case where we need to swap the arms. * tree-vect-stmts.cc (vectorizable_condition): Allow single-lane SLP, but not when we need to swap then and else clause. Diff: --- gcc/tree-vect-stmts.cc | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index b26cc74f417..c82381e799e 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12116,7 +12116,7 @@ vectorizable_condition (vec_info *vinfo, = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; if (for_reduction) { - if (slp_node) + if (slp_node && SLP_TREE_LANES (slp_node) > 1) return false; reduc_info = info_for_reduction (vinfo, stmt_info); reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); @@ -12205,6 +12205,10 @@ vectorizable_condition (vec_info *vinfo, cond_expr = NULL_TREE; } } + /* ??? The vectorized operand query below doesn't allow swapping +this way for SLP. */ + if (slp_node) + return false; std::swap (then_clause, else_clause); }
[gcc r15-1006] Do single-lane SLP discovery for reductions
https://gcc.gnu.org/g:d93353e6423ecaaae9fa47d0935caafd9abfe4de commit r15-1006-gd93353e6423ecaaae9fa47d0935caafd9abfe4de Author: Richard Biener Date: Fri Feb 23 11:45:50 2024 +0100 Do single-lane SLP discovery for reductions The following performs single-lane SLP discovery for reductions. It requires a fixup for outer loop vectorization where a check for multiple types needs adjustments as otherwise bogus pointer IV increments happen when there are multiple copies of vector stmts in the inner loop. For the reduction epilog handling this extends the optimized path to cover the trivial single-lane SLP reduction case. The fix for PR65518 implemented in vect_grouped_load_supported for non-SLP needs a SLP counterpart that I put in get_group_load_store_type. I've decided to adjust three testcases for appearing single-lane SLP instances instead of not dumping "vectorizing stmts using SLP" for single-lane instances as that also requires testsuite adjustments. * tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane discoveries are reduction chains and need special backedge treatment. (vect_analyze_slp): Fall back to single-lane SLP discovery for reductions. Make sure to try single-lane SLP reduction for all reductions as fallback. (vectorizable_load): Avoid outer loop SLP vectorization with multi-copy vector stmts in the inner loop. (vectorizable_store): Likewise. * tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow direct opcode and shift reduction also for SLP reductions with a single lane. * tree-vect-stmts.cc (get_group_load_store_type): For SLP also check for the PR65518 single-element interleaving case as done in vect_grouped_load_supported. * gcc.dg/vect/slp-24.c: Expect another SLP instance for the reduction. * gcc.dg/vect/slp-24-big-array.c: Likewise. * gcc.dg/vect/slp-reduc-6.c: Remove scan for zero SLP instances. 
Diff: --- gcc/testsuite/gcc.dg/vect/slp-24-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/slp-24.c | 2 +- gcc/testsuite/gcc.dg/vect/slp-reduc-6.c | 1 - gcc/tree-vect-loop.cc| 4 +- gcc/tree-vect-slp.cc | 71 +--- gcc/tree-vect-stmts.cc | 24 +- 6 files changed, 80 insertions(+), 24 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c index 5eaea9600ac..63f744338a1 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c @@ -92,4 +92,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c b/gcc/testsuite/gcc.dg/vect/slp-24.c index 59178f2c0f2..7814d7c324e 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-24.c +++ b/gcc/testsuite/gcc.dg/vect/slp-24.c @@ -78,4 +78,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c index 1fd15aa3c87..5566705a704 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c @@ -45,6 +45,5 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_int_add || { ! 
{ vect_unpack || vect_strided2 } } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ /* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index a08357acc11..06292ed8bbe 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6504,7 +6504,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* 2.3 Create the reduction code, using one of the three schemes described above. In SLP we simply need to extract all the elements from the vector (without reducing them), so we use scalar shifts. */ - else if (reduc_fn != IFN_LAST && !slp_reduc) + else if
[gcc r15-1005] Avoid inserting after a GIMPLE_COND with SLP and early break
https://gcc.gnu.org/g:0592000aeed84d47040946a125154b3c46d7c84f commit r15-1005-g0592000aeed84d47040946a125154b3c46d7c84f Author: Richard Biener Date: Mon May 27 14:40:27 2024 +0200 Avoid inserting after a GIMPLE_COND with SLP and early break When vectorizing an early break loop with LENs (do we miss some check here to disallow this?) we can end up deciding to insert stmts after a GIMPLE_COND when doing SLP scheduling and trying to be conservative with placing of stmts only dependent on the implicit loop mask/len. The following avoids this, I guess it's not perfect but it does the job fixing some observed RISC-V regression. * tree-vect-slp.cc (vect_schedule_slp_node): For mask/len loops make sure to not advance the insertion iterator beyond a GIMPLE_COND. Diff: --- gcc/tree-vect-slp.cc | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index bf1f467f53f..11ec82086fc 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -9650,7 +9650,12 @@ vect_schedule_slp_node (vec_info *vinfo, else { si = gsi_for_stmt (last_stmt); - gsi_next (); + /* When we're getting gsi_after_labels from the starting +condition of a fully masked/len loop avoid insertion +after a GIMPLE_COND that can appear as the only header +stmt with early break vectorization. */ + if (gimple_code (last_stmt) != GIMPLE_COND) + gsi_next (); } }
[gcc r12-10493] c++: Add testcase for this PR [PR97990]
https://gcc.gnu.org/g:c7627054b9ee2ded8a22340a6a09bf9786afcafa commit r12-10493-gc7627054b9ee2ded8a22340a6a09bf9786afcafa Author: Andrew Pinski Date: Fri Feb 16 10:55:43 2024 -0800 c++: Add testcase for this PR [PR97990] This testcase was fixed by r14-5934-gf26d68d5d128c8 but we should add one to make sure it does not regress again. Committed as obvious after a quick test on the testcase. PR c++/97990 gcc/testsuite/ChangeLog: * g++.dg/torture/vector-struct-1.C: New test. Signed-off-by: Andrew Pinski (cherry picked from commit 5f1438db419c9eb8901d1d1d7f98fb69082aec8e) Diff: --- gcc/testsuite/g++.dg/torture/vector-struct-1.C | 18 ++ 1 file changed, 18 insertions(+) diff --git a/gcc/testsuite/g++.dg/torture/vector-struct-1.C b/gcc/testsuite/g++.dg/torture/vector-struct-1.C new file mode 100644 index 000..e2747417e2d --- /dev/null +++ b/gcc/testsuite/g++.dg/torture/vector-struct-1.C @@ -0,0 +1,18 @@ +/* PR c++/97990 */ +/* This used to crash with lto and strict aliasing enabled as the + vector type variant still had TYPE_ALIAS_SET set on it. */ + +typedef __attribute__((__vector_size__(sizeof(short)))) short TSimd; +TSimd hh(int); +struct y6 +{ + TSimd VALUE; + ~y6(); +}; +template<typename T1, typename T2> +auto f2(T1 p1, T2){ + return hh(p1) <= 0; +} +void f1(){ + f2(0, y6{}); +}
[gcc r12-10492] middle-end/112732 - stray TYPE_ALIAS_SET in type variant
https://gcc.gnu.org/g:b46486ef0316240eb3c173bda062b52333507e03 commit r12-10492-gb46486ef0316240eb3c173bda062b52333507e03 Author: Richard Biener Date: Tue Nov 28 12:36:21 2023 +0100 middle-end/112732 - stray TYPE_ALIAS_SET in type variant The following fixes a stray TYPE_ALIAS_SET in a type variant built by build_opaque_vector_type which is diagnosed by type checking enabled with -flto. PR middle-end/112732 * tree.cc (build_opaque_vector_type): Reset TYPE_ALIAS_SET of the newly built type. (cherry picked from commit f26d68d5d128c86faaceeb81b1e8f22254ad53df) Diff: --- gcc/tree.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gcc/tree.cc b/gcc/tree.cc index ead4c1421cd..6b28eb9f10d 100644 --- a/gcc/tree.cc +++ b/gcc/tree.cc @@ -10124,6 +10124,8 @@ build_opaque_vector_type (tree innertype, poly_int64 nunits) TYPE_NEXT_VARIANT (cand) = TYPE_NEXT_VARIANT (t); TYPE_NEXT_VARIANT (t) = cand; TYPE_MAIN_VARIANT (cand) = TYPE_MAIN_VARIANT (t); + /* Type variants have no alias set defined. */ + TYPE_ALIAS_SET (cand) = -1; return cand; }
[gcc r12-10491] tree-optimization/110381 - preserve SLP permutation with in-order reductions
https://gcc.gnu.org/g:8f6d889a8e609710ecfd555778fbff602b2c7d74 commit r12-10491-g8f6d889a8e609710ecfd555778fbff602b2c7d74 Author: Richard Biener Date: Mon Jun 26 12:51:37 2023 +0200 tree-optimization/110381 - preserve SLP permutation with in-order reductions The following fixes a bug that manifests itself during fold-left reduction transform in picking not the last scalar def to replace and thus double-counting some elements. But the underlying issue is that we merge a load permutation into the in-order reduction which is of course wrong. Now, reduction analysis has not yet been performend when optimizing permutations so we have to resort to check that ourselves. PR tree-optimization/110381 * tree-vect-slp.cc (vect_optimize_slp_pass::start_choosing_layouts): Materialize permutes before fold-left reductions. * gcc.dg/vect/pr110381.c: New testcase. (cherry picked from commit 53d6f57c1b20c6da52aefce737fb7d5263686ba3) Diff: --- gcc/testsuite/gcc.dg/vect/pr110381.c | 44 gcc/tree-vect-slp.cc | 19 +--- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr110381.c b/gcc/testsuite/gcc.dg/vect/pr110381.c new file mode 100644 index 000..278f4426c29 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr110381.c @@ -0,0 +1,44 @@ +/* { dg-require-effective-target vect_float_strict } */ + +#include "tree-vect.h" + +struct FOO { + double a; + double b; + double c; +}; + +double __attribute__((noipa)) +sum_8_foos(const struct FOO* foos) +{ + double sum = 0; + + for (int i = 0; i < 8; ++i) +{ + struct FOO foo = foos[i]; + + /* Need to use an in-order reduction here, preserving + the load permutation. 
*/ + sum += foo.a; + sum += foo.c; + sum += foo.b; +} + + return sum; +} + +int main() +{ + struct FOO foos[8]; + + check_vect (); + + __builtin_memset (foos, 0, sizeof (foos)); + foos[0].a = __DBL_MAX__; + foos[0].b = 5; + foos[0].c = -__DBL_MAX__; + + if (sum_8_foos (foos) != 5) +__builtin_abort (); + return 0; +} diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 54e6a9e4224..19cab93761c 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3733,9 +3733,8 @@ vect_optimize_slp (vec_info *vinfo) vertices[idx].perm_out = perms.length () - 1; } - /* In addition to the above we have to mark outgoing permutes facing - non-reduction graph entries that are not represented as to be - materialized. */ + /* We have to mark outgoing permutations facing non-associating-reduction + graph entries that are not represented as to be materialized. */ for (slp_instance instance : vinfo->slp_instances) if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor) { @@ -3744,6 +3743,20 @@ vect_optimize_slp (vec_info *vinfo) vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0; vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0; } +else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain) + { + stmt_vec_info stmt_info + = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance)); + stmt_vec_info reduc_info = info_for_reduction (vinfo, stmt_info); + if (needs_fold_left_reduction_p (TREE_TYPE + (gimple_get_lhs (stmt_info->stmt)), +STMT_VINFO_REDUC_CODE (reduc_info))) + { + unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex; + vertices[node_i].perm_in = 0; + vertices[node_i].perm_out = 0; + } + } /* Propagate permutes along the graph and compute materialization points. */ bool changed;
[gcc r12-10490] tree-optimization/113910 - huge compile time during PTA
https://gcc.gnu.org/g:db0f236aa1c30f703ff564960bd9f3dbd747ea7b commit r12-10490-gdb0f236aa1c30f703ff564960bd9f3dbd747ea7b Author: Richard Biener Date: Wed Feb 14 12:33:13 2024 +0100 tree-optimization/113910 - huge compile time during PTA For the testcase in PR113910 we spend a lot of time in PTA comparing bitmaps for looking up equivalence class members. This points to the very weak bitmap_hash function which effectively hashes set and a subset of not set bits. The major problem with it is that it simply truncates the BITMAP_WORD sized intermediate hash to hashval_t which is unsigned int, effectively not hashing half of the bits. This reduces the compile-time for the testcase from tens of minutes to 42 seconds and PTA time from 99% to 46%. PR tree-optimization/113910 * bitmap.cc (bitmap_hash): Mix the full element "hash" to the hashval_t hash. (cherry picked from commit ad7a365aaccecd23ea287c7faaab9c7bd50b944a) Diff: --- gcc/bitmap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/bitmap.cc b/gcc/bitmap.cc index 88c329f9325..601c04e2e13 100644 --- a/gcc/bitmap.cc +++ b/gcc/bitmap.cc @@ -2673,7 +2673,7 @@ bitmap_hash (const_bitmap head) for (ix = 0; ix != BITMAP_ELEMENT_WORDS; ix++) hash ^= ptr->bits[ix]; } - return (hashval_t)hash; + return iterative_hash (&hash, sizeof (hash), 0); }
[gcc r15-991] testsuite/115304 - properly guard gcc.dg/vect/slp-gap-1.c
https://gcc.gnu.org/g:ed8ba88074f3663f810ef2f07d79c3fcde5d9697 commit r15-991-ged8ba88074f3663f810ef2f07d79c3fcde5d9697 Author: Richard Biener Date: Mon Jun 3 14:43:42 2024 +0200 testsuite/115304 - properly guard gcc.dg/vect/slp-gap-1.c Testing on sparc shows we need vect_unpack and vect_perm. This isn't enough to resolve the GCN fail which ends up using interleaving. PR testsuite/115304 * gcc.dg/vect/slp-gap-1.c: Require vect_unpack and vect_perm. Diff: --- gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c index 36463ca22c5..9856da7a7f4 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c +++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c @@ -15,4 +15,4 @@ void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) { /* We can vectorize this without peeling for gaps and thus without epilogue, but the only thing we can reliably scan is the zero-padding trick for the partial loads. */ -/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target vect64 } } } */ +/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target { vect64 && { vect_unpack && vect_perm } } } } } */
[gcc r15-986] Adjust vector dump scans
https://gcc.gnu.org/g:5b52517e22540874bac07e2499e9650a9e8278a4 commit r15-986-g5b52517e22540874bac07e2499e9650a9e8278a4 Author: Richard Biener Date: Fri May 31 15:38:29 2024 +0200 Adjust vector dump scans The following adjusts dump scanning for something followed by successful vector analysis to more specifically look for 'Analysis succeeded' and not 'Analysis failed' because the previous look for just 'succeeded' or 'failed' is easily confused by SLP discovery dumping those words. * tree-vect-loop.cc (vect_analyze_loop_1): Avoid extra space before 'failed'. * gcc.dg/vect/no-scevccp-outer-7.c: Adjust scanning for succeeded analysis not interrupted by failure. * gcc.dg/vect/no-scevccp-vect-iv-3.c: Likewise. * gcc.dg/vect/vect-cond-reduc-4.c: Likewise. * gcc.dg/vect/vect-live-2.c: Likewise. * gcc.dg/vect/vect-outer-4c-big-array.c: Likewise. * gcc.dg/vect/vect-reduc-dot-s16a.c: Likewise. * gcc.dg/vect/vect-reduc-dot-s8a.c: Likewise. * gcc.dg/vect/vect-reduc-dot-s8b.c: Likewise. * gcc.dg/vect/vect-reduc-dot-u16a.c: Likewise. * gcc.dg/vect/vect-reduc-dot-u16b.c: Likewise. * gcc.dg/vect/vect-reduc-dot-u8a.c: Likewise. * gcc.dg/vect/vect-reduc-dot-u8b.c: Likewise. * gcc.dg/vect/vect-reduc-pattern-1a.c: Likewise. * gcc.dg/vect/vect-reduc-pattern-1b-big-array.c: Likewise. * gcc.dg/vect/vect-reduc-pattern-1c-big-array.c: Likewise. * gcc.dg/vect/vect-reduc-pattern-2a.c: Likewise. * gcc.dg/vect/vect-reduc-pattern-2b-big-array.c: Likewise. * gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c: Likewise. 
Diff: --- gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c | 2 +- gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c| 2 +- gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-live-2.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-outer-4c-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c | 4 ++-- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c | 4 ++-- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16a.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c| 4 ++-- gcc/tree-vect-loop.cc | 2 +- 19 files changed, 22 insertions(+), 22 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c index 87048422013..e796e6ba216 100644 --- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c @@ -77,4 +77,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 
1 "vect" { target vect_widen_mult_hi_to_si } } } */ -/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected(?:(?!failed)(?!Re-trying).)*succeeded" 1 "vect" { target vect_widen_mult_hi_to_si } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected(?:(?!Analysis failed).)*Analysis succeeded" 1 "vect" { target vect_widen_mult_hi_to_si } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c index 6f2b2210b11..f268d4a5131 100644 --- a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c @@ -30,4 +30,4 @@ unsigned int main1 () } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_widen_sum_hi_to_si } } } */ -/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected(?:(?!failed)(?!Re-trying).)*succeeded" 1 "vect" { target vect_widen_sum_hi_to_si } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected(?:(?!Analysis failed).)*Analysis succeeded" 1 "vect" { target vect_widen_sum_hi_to_si } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c index 27f18dc5bda..e9d414287e8 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c @@ -42,6 +42,6 @@ main (void) } /* { dg-final {
[gcc r15-985] Avoid ICE with pointer reduction
https://gcc.gnu.org/g:a1810364cd0c36d6408a4c386bdc504a021d68c7 commit r15-985-ga1810364cd0c36d6408a4c386bdc504a021d68c7 Author: Richard Biener Date: Fri May 31 15:17:10 2024 +0200 Avoid ICE with pointer reduction There's another case where we can refer to neutral_op before eventually converting it from pointer to integer so simply do that unconditionally. * tree-vect-loop.cc (get_initial_defs_for_reduction): Always convert neutral_op. Diff: --- gcc/tree-vect-loop.cc | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 5b85cffb37f..b6e0b9616d5 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5606,6 +5606,12 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo, tree_vector_builder elts (vector_type, nunits, 1); elts.quick_grow (nunits); gimple_seq ctor_seq = NULL; + if (neutral_op + && !useless_type_conversion_p (TREE_TYPE (vector_type), +TREE_TYPE (neutral_op))) +neutral_op = gimple_convert (&ctor_seq, +TREE_TYPE (vector_type), +neutral_op); for (j = 0; j < nunits * number_of_vectors; ++j) { tree op; @@ -5614,14 +5620,7 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo, /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ if (i >= initial_values.length () || (j > i && neutral_op)) - { - if (!useless_type_conversion_p (TREE_TYPE (vector_type), - TREE_TYPE (neutral_op))) - neutral_op = gimple_convert (&ctor_seq, -TREE_TYPE (vector_type), -neutral_op); - op = neutral_op; - } + op = neutral_op; else { if (!useless_type_conversion_p (TREE_TYPE (vector_type),
[gcc r15-941] tree-optimization/115278 - fix DSE in if-conversion wrt volatiles
https://gcc.gnu.org/g:65dbe0ab7cdaf2aa84b09a74e594f0faacf1945c commit r15-941-g65dbe0ab7cdaf2aa84b09a74e594f0faacf1945c Author: Richard Biener Date: Fri May 31 10:14:25 2024 +0200 tree-optimization/115278 - fix DSE in if-conversion wrt volatiles The following adds the missing guard for volatile stores to the embedded DSE in the loop if-conversion pass. PR tree-optimization/115278 * tree-if-conv.cc (ifcvt_local_dce): Do not DSE volatile stores. * g++.dg/vect/pr115278.cc: New testcase. Diff: --- gcc/testsuite/g++.dg/vect/pr115278.cc | 38 +++ gcc/tree-if-conv.cc | 4 +++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/g++.dg/vect/pr115278.cc b/gcc/testsuite/g++.dg/vect/pr115278.cc new file mode 100644 index 000..331075fb278 --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/pr115278.cc @@ -0,0 +1,38 @@ +// { dg-do compile } +// { dg-require-effective-target c++11 } +// { dg-additional-options "-fdump-tree-optimized" } + +#include <cstdint> + +const int runs = 92; + +union BitfieldStructUnion { +struct { +uint64_t a : 17; +uint64_t padding: 39; +uint64_t b : 8; +} __attribute__((packed)); + +struct { +uint32_t value_low; +uint32_t value_high; +} __attribute__((packed)); + +BitfieldStructUnion(uint32_t value_low, uint32_t value_high) : value_low(value_low), value_high(value_high) {} +}; + +volatile uint32_t *WRITE = (volatile unsigned*)0x42; + +void buggy() { +for (int i = 0; i < runs; i++) { +BitfieldStructUnion rt{*WRITE, *WRITE}; + +rt.a = 99; +rt.b = 1; + +*WRITE = rt.value_low; +*WRITE = rt.value_high; +} +} + +// { dg-final { scan-tree-dump-times "\\\*WRITE\[^\r\n\]* ={v} " 2 "optimized" } } diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index 09d99fb9dda..c4c3ed41a44 100644 --- a/gcc/tree-if-conv.cc +++ b/gcc/tree-if-conv.cc @@ -3381,7 +3381,9 @@ ifcvt_local_dce (class loop *loop) gimple_stmt_iterator gsiprev = gsi; gsi_prev (&gsiprev); stmt = gsi_stmt (gsi); - if (gimple_store_p (stmt) && gimple_vdef (stmt)) + if (!gimple_has_volatile_ops (stmt) + 
&& gimple_store_p (stmt) + && gimple_vdef (stmt)) { tree lhs = gimple_get_lhs (stmt); ao_ref write;
[gcc r15-896] tree-optimization/115252 - enhance peeling for gaps avoidance
https://gcc.gnu.org/g:f46eaad445e680034df51bd0dec4e6c7b1f372a4 commit r15-896-gf46eaad445e680034df51bd0dec4e6c7b1f372a4 Author: Richard Biener Date: Mon May 27 16:04:35 2024 +0200 tree-optimization/115252 - enhance peeling for gaps avoidance Code generation for contiguous load vectorization can already deal with generalized avoidance of loading from a gap. The following extends detection of peeling for gaps requirement with that, gets rid of the old special casing of a half load and makes sure when we do access the gap we have peeling for gaps enabled. PR tree-optimization/115252 * tree-vect-stmts.cc (get_group_load_store_type): Enhance detecting the number of cases where we can avoid accessing a gap during code generation. (vectorizable_load): Remove old half-vector peeling for gap avoidance which is now redundant. Add gap-aligned case where it's OK to access the gap. Add assert that we have peeling for gaps enabled when we access a gap. * gcc.dg/vect/slp-gap-1.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 18 +++ gcc/tree-vect-stmts.cc| 58 +-- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c new file mode 100644 index 000..36463ca22c5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3" } */ + +typedef unsigned char uint8_t; +typedef short int16_t; +void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) { + for (int y = 0; y < 4; y++) { +for (int x = 0; x < 4; x++) + diff[x + y * 4] = pix1[x] - pix2[x]; +pix1 += 16; +pix2 += 32; + } +} + +/* We can vectorize this without peeling for gaps and thus without epilogue, + but the only thing we can reliably scan is the zero-padding trick for the + partial loads. 
*/ +/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target vect64 } } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 4219ad832db..935d80f0e1b 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2072,16 +2072,22 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, dr_alignment_support alss; int misalign = dr_misalignment (first_dr_info, vectype); tree half_vtype; + poly_uint64 remain; + unsigned HOST_WIDE_INT tem, num; if (overrun_p && !masked_p && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info, vectype, misalign))) == dr_aligned || alss == dr_unaligned_supported) - && known_eq (nunits, (group_size - gap) * 2) - && known_eq (nunits, group_size) - && (vector_vector_composition_type (vectype, 2, _vtype) - != NULL_TREE)) + && can_div_trunc_p (group_size + * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap, + nunits, , ) + && (known_eq (remain, 0u) + || (constant_multiple_p (nunits, remain, ) + && (vector_vector_composition_type (vectype, num, + _vtype) + != NULL_TREE overrun_p = false; if (overrun_p && !can_overrun_p) @@ -11513,33 +11519,14 @@ vectorizable_load (vec_info *vinfo, unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info); unsigned int vect_align = vect_known_alignment_in_bytes (first_dr_info, vectype); - unsigned int scalar_dr_size - = vect_get_scalar_dr_size (first_dr_info); - /* If there's no peeling for gaps but we have a gap - with slp loads then load the lower half of the - vector only. See get_group_load_store_type for - when we apply this optimization. */ - if (slp - && loop_vinfo - && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0 - && known_eq (nunits, (group_size - gap) * 2) - && known_eq (nunits, group_size) - && gap >= (vect_align / scalar_dr_size)) - { - tree half_vtype; - new_vtype - = vector_vector_composition_type (vectype, 2, - _vtype); - if (new_vtype != NULL_TREE) - ltype =
[gcc r15-895] tree-optimization/114435 - pcom left around copies confusing SLP
https://gcc.gnu.org/g:1065a7db6f2a69770a85b4d53b9123b090dd1771 commit r15-895-g1065a7db6f2a69770a85b4d53b9123b090dd1771 Author: Richard Biener Date: Wed May 29 10:41:51 2024 +0200 tree-optimization/114435 - pcom left around copies confusing SLP The following arranges for the pre-SLP vectorization scalar cleanup to be run when predictive commoning was applied to a loop in the function. This is similar to the complete unroll situation and facilitating SLP vectorization. Avoiding the SSA copies in predictive commoning itself isn't easy (and predcom also sometimes unrolls, asking for scalar cleanup). PR tree-optimization/114435 * tree-predcom.cc (tree_predictive_commoning): Queue the next scalar cleanup sub-pipeline to be run when we did something. * gcc.dg/vect/bb-slp-pr114435.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c | 37 + gcc/tree-predcom.cc | 3 +++ 2 files changed, 40 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c new file mode 100644 index 000..d1eecf7979a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ +/* Predictive commining is supposed to happen. 
*/ +/* { dg-additional-options "-O3 -fdump-tree-pcom" } */ + +struct res { +double r0; +double r1; +double r2; +double r3; +}; + +struct pxl { +double v0; +double v1; +double v2; +double v3; +}; + +#define IS_NAN(x) ((x) == (x)) + +void fold(struct res *r, struct pxl *in, double k, int sz) +{ + int i; + + for (i = 0; i < sz; i++) { + if (IS_NAN(k)) continue; + r->r0 += in[i].v0 * k; + r->r1 += in[i].v1 * k; + r->r2 += in[i].v2 * k; + r->r3 += in[i].v3 * k; + } +} + +/* { dg-final { scan-tree-dump "# r__r0_lsm\[^\r\n\]* = PHI" "pcom" } } */ +/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp1" } } */ +/* { dg-final { scan-tree-dump "# vect\[^\r\n\]* = PHI" "slp1" } } */ diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc index 75a4c85164c..9844fee1e97 100644 --- a/gcc/tree-predcom.cc +++ b/gcc/tree-predcom.cc @@ -3522,6 +3522,9 @@ tree_predictive_commoning (bool allow_unroll_p) } } + if (ret != 0) +cfun->pending_TODOs |= PENDING_TODO_force_next_scalar_cleanup; + return ret; }
[gcc r14-10256] tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution
https://gcc.gnu.org/g:2a1fdd5fd0f6bc02d25da192c8fa6487d93d2d50 commit r14-10256-g2a1fdd5fd0f6bc02d25da192c8fa6487d93d2d50 Author: Richard Biener Date: Thu May 23 14:36:39 2024 +0200 tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution Forgot a check for an SSA name before trying to replace a PHI arg with its current definition. PR tree-optimization/115197 * tree-loop-distribution.cc (copy_loop_before): Constant PHI args remain the same. * gcc.dg/pr115197.c: New testcase. (cherry picked from commit 2b2476d4d18c92b8aba3567ebccd2100c2f7c258) Diff: --- gcc/testsuite/gcc.dg/pr115197.c | 14 ++ gcc/tree-loop-distribution.cc | 7 +-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr115197.c b/gcc/testsuite/gcc.dg/pr115197.c new file mode 100644 index 000..00d674b3bd9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr115197.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -fno-tree-scev-cprop -ftree-pre -ftree-loop-distribute-patterns" } */ + +int a, b[2], c, d, e, f[2]; +int main() { + while (a) +if (d) { + if (e) +return 0; + for (; c; c++) +f[c] = 0 < (b[c] = ~(f[c + 1] < a)); +} + return 0; +} diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 45932bae5e7..c5a05ee151d 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -977,8 +977,11 @@ copy_loop_before (class loop *loop, bool redirect_lc_phi_defs) if (virtual_operand_p (gimple_phi_result (phi))) continue; use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit); - tree new_def = get_current_def (USE_FROM_PTR (use_p)); - SET_USE (use_p, new_def); + if (TREE_CODE (USE_FROM_PTR (use_p)) == SSA_NAME) + { + tree new_def = get_current_def (USE_FROM_PTR (use_p)); + SET_USE (use_p, new_def); + } } }
[gcc r14-10257] tree-optimization/115149 - VOP live and missing PHIs
https://gcc.gnu.org/g:90a447677a2abb934b683a012b477e6c52088e35 commit r14-10257-g90a447677a2abb934b683a012b477e6c52088e35 Author: Richard Biener Date: Tue May 21 09:48:04 2024 +0200 tree-optimization/115149 - VOP live and missing PHIs The following fixes a bug in vop-live get_live_in which was using NULL to indicate the first processed edge but at the same time using it for the case the live-in virtual operand cannot be computed. The following fixes this, avoiding sinking a load to a place where we'd have to insert virtual PHIs to make the virtual operand SSA web OK. PR tree-optimization/115149 * tree-ssa-live.cc (virtual_operand_live::get_live_in): Explicitly track the first processed edge. * gcc.dg/pr115149.c: New testcase. (cherry picked from commit ec9b8bafe20755d13ab9a1b834b5da79ae972c0e) Diff: --- gcc/testsuite/gcc.dg/pr115149.c | 16 gcc/tree-ssa-live.cc| 8 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr115149.c b/gcc/testsuite/gcc.dg/pr115149.c new file mode 100644 index 000..9f6bc97dbe6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr115149.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -fno-tree-vrp -fno-ipa-sra -fno-tree-dce -fno-tree-ch" } */ + +int a, c, e, f, g, h[1], i; +static int j(int b) { return 0; } +static void k(int d) {} +int main() +{ + if (h[0]) +while (1) { + k(f && j(i && (h[g] = e))); + while (a) + c ^= 1; +} + return 0; +} diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc index d94e94eb3bc..122d8e245dd 100644 --- a/gcc/tree-ssa-live.cc +++ b/gcc/tree-ssa-live.cc @@ -1684,14 +1684,18 @@ virtual_operand_live::get_live_in (basic_block bb) edge_iterator ei; edge e; tree livein = NULL_TREE; + bool first = true; FOR_EACH_EDGE (e, ei, bb->preds) if (e->flags & EDGE_DFS_BACK) /* We can ignore backedges since if there's a def there it would have forced a PHI in the source because it also acts as use downstream. 
*/ continue; -else if (!livein) - livein = get_live_out (e->src); +else if (first) + { + livein = get_live_out (e->src); + first = false; + } else if (get_live_out (e->src) != livein) /* When there's no virtual use downstream this indicates a point where we'd insert a PHI merging the different live virtual
[gcc r14-10255] tree-optimization/114921 - _Float16 -> __bf16 isn't noop fixup
https://gcc.gnu.org/g:9e971c671ded9647beb0a1c5b9430b4e64060862 commit r14-10255-g9e971c671ded9647beb0a1c5b9430b4e64060862 Author: Richard Biener Date: Mon May 6 12:03:09 2024 +0200 tree-optimization/114921 - _Float16 -> __bf16 isn't noop fixup The following further strengthens the check which convert expressions we allow to vectorize as simple copy by resorting to tree_nop_conversion_p on the vector components. PR tree-optimization/114921 * tree-vect-stmts.cc (vectorizable_assignment): Use tree_nop_conversion_p to identify converts we can vectorize with a simple assignment. (cherry picked from commit d0d6dcc019cd32eebf85d625f56e0f7573938319) Diff: --- gcc/tree-vect-stmts.cc | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index f8d8636b139..21e8fe98e44 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5955,14 +5955,17 @@ vectorizable_assignment (vec_info *vinfo, if (!vectype_in) vectype_in = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), slp_node); - /* We can handle NOP_EXPR conversions that do not change the number - of elements or the vector size. */ - if ((CONVERT_EXPR_CODE_P (code) - || code == VIEW_CONVERT_EXPR) - && (!vectype_in - || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits) - || maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), - GET_MODE_SIZE (TYPE_MODE (vectype_in) + /* We can handle VIEW_CONVERT conversions that do not change the number + of elements or the vector size or other conversions when the component + types are nop-convertible. */ + if (!vectype_in + || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits) + || (code == VIEW_CONVERT_EXPR + && maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), + GET_MODE_SIZE (TYPE_MODE (vectype_in + || (CONVERT_EXPR_CODE_P (code) + && !tree_nop_conversion_p (TREE_TYPE (vectype), +TREE_TYPE (vectype_in return false; if (VECTOR_BOOLEAN_TYPE_P (vectype) != VECTOR_BOOLEAN_TYPE_P (vectype_in))
[gcc r15-862] target/115254 - fix gcc.dg/vect/vect-gather-4.c dump scanning
https://gcc.gnu.org/g:d8d70b783765361a8acef70fc9b54db526cd6ff5 commit r15-862-gd8d70b783765361a8acef70fc9b54db526cd6ff5 Author: Richard Biener Date: Tue May 28 15:55:59 2024 +0200 target/115254 - fix gcc.dg/vect/vect-gather-4.c dump scanning The dump scanning is supposed to check that we do not merge two slightly different gathers into one SLP node but since we now SLP the store scanning for "vectorizing stmts using SLP" is no longer good. Instead the following makes us look for "stmt 1 .* = .MASK" which would be how the second lane of an SLP node looks like. We have to handle both .MASK_GATHER_LOAD (for targets with ifun mask gathers) and .MASK_LOAD (for ones without). Tested on x86_64-linux with and without native gather and on GCN where this now avoids a FAIL. PR target/115254 * gcc.dg/vect/vect-gather-4.c: Adjust dump scan. Diff: --- gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c index d18094d6982..edd9a6783c2 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c @@ -45,4 +45,7 @@ f3 (int *restrict y, int *restrict x, int *restrict indices) } } -/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" vect } } */ +/* We do not want to see a two-lane .MASK_LOAD or .MASK_GATHER_LOAD since + the gathers are different on each lane. This is a bit fragile and + should possibly be turned into a runtime test. */ +/* { dg-final { scan-tree-dump-not "stmt 1 \[^\r\n\]* = .MASK" vect } } */
[gcc r15-861] tree-optimization/115236 - more points-to *ANYTHING = x fixes
https://gcc.gnu.org/g:c08b0d3f7b3539b26031de31d88dea6b94474577 commit r15-861-gc08b0d3f7b3539b26031de31d88dea6b94474577 Author: Richard Biener Date: Mon May 27 10:41:02 2024 +0200 tree-optimization/115236 - more points-to *ANYTHING = x fixes The stored-to ANYTHING handling has more holes, uncovered by treating volatile accesses as ANYTHING. We fail to properly build the pred and succ graphs, in particular we may not elide direct nodes from receiving from STOREDANYTHING. PR tree-optimization/115236 * tree-ssa-structalias.cc (build_pred_graph): Properly handle *ANYTHING = X. (build_succ_graph): Likewise. Do not elide direct nodes from receiving from STOREDANYTHING. * gcc.dg/pr115236.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/pr115236.c | 12 gcc/tree-ssa-structalias.cc | 20 ++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr115236.c b/gcc/testsuite/gcc.dg/pr115236.c new file mode 100644 index 000..91edfab957a --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr115236.c @@ -0,0 +1,12 @@ +/* { dg-do run } */ +/* { dg-options "-O -fno-tree-fre" } */ + +int a, *b = &a; +int main() +{ + int *c, *volatile *d = &c; + *d = b; + if (c != &a) +__builtin_abort(); + return 0; +} diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index 9cec2c6cfd9..330e64e65da 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -1312,7 +1312,12 @@ build_pred_graph (void) { /* *x = y.
*/ if (rhs.offset == 0 && lhs.offset == 0 && rhs.type == SCALAR) - add_pred_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar); + { + if (lhs.var == anything_id) + add_pred_graph_edge (graph, storedanything_id, rhsvar); + else + add_pred_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar); + } } else if (rhs.type == DEREF) { @@ -1398,7 +1403,12 @@ build_succ_graph (void) if (lhs.type == DEREF) { if (rhs.offset == 0 && lhs.offset == 0 && rhs.type == SCALAR) - add_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar); + { + if (lhs.var == anything_id) + add_graph_edge (graph, storedanything_id, rhsvar); + else + add_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar); + } } else if (rhs.type == DEREF) { @@ -1418,13 +1428,11 @@ build_succ_graph (void) } } - /* Add edges from STOREDANYTHING to all non-direct nodes that can - receive pointers. */ + /* Add edges from STOREDANYTHING to all nodes that can receive pointers. */ t = find (storedanything_id); for (i = integer_id + 1; i < FIRST_REF_NODE; ++i) { - if (!bitmap_bit_p (graph->direct_nodes, i) - && get_varinfo (i)->may_have_pointers) + if (get_varinfo (i)->may_have_pointers) add_graph_edge (graph, find (i), t); }
[gcc r15-860] Avoid pessimistic constraints for asm memory constraints
https://gcc.gnu.org/g:19cc6120087619b496732b249b48b1fbe27e commit r15-860-g19cc6120087619b496732b249b48b1fbe27e Author: Richard Biener Date: Tue May 28 13:29:30 2024 +0200 Avoid pessimistic constraints for asm memory constraints We process asm memory input/outputs with constraints to ESCAPED but for this temporarily build an ADDR_EXPR. The issue is that the used build_fold_addr_expr ends up wrapping the ADDR_EXPR in a conversion which ends up producing constraints which is quite bad. The following uses get_constraint_for_address_of instead, avoiding the temporary tree and the unhandled conversion. This avoids a gcc.dg/tree-ssa/restrict-9.c FAIL with the fix for PR115236. * tree-ssa-structalias.cc (find_func_aliases): Use get_constraint_for_address_of to build escape constraints for asm inputs and outputs. Diff: --- gcc/tree-ssa-structalias.cc | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index f93c5df0767..9cec2c6cfd9 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -5269,7 +5269,11 @@ find_func_aliases (struct function *fn, gimple *origt) /* A memory constraint makes the address of the operand escape. */ if (!allows_reg && allows_mem) - make_escape_constraint (build_fold_addr_expr (op)); + { + auto_vec tmpc; + get_constraint_for_address_of (op, ); + make_constraints_to (escaped_id, tmpc); + } /* The asm may read global memory, so outputs may point to any global memory. */ @@ -5298,7 +5302,11 @@ find_func_aliases (struct function *fn, gimple *origt) /* A memory constraint makes the address of the operand escape. 
*/ if (!allows_reg && allows_mem) - make_escape_constraint (build_fold_addr_expr (op)); + { + auto_vec<ce_s> tmpc; + get_constraint_for_address_of (op, &tmpc); + make_constraints_to (escaped_id, tmpc); + } /* Strictly we'd only need the constraint to ESCAPED if the asm clobbers memory, otherwise using something along the lines of per-call clobbers/uses would be enough. */
[gcc r15-859] tree-optimization/115254 - don't account single-lane SLP against discovery limit
https://gcc.gnu.org/g:eaaa4b88038d4d6eda1b20ab662f1568fd9be31f commit r15-859-geaaa4b88038d4d6eda1b20ab662f1568fd9be31f Author: Richard Biener Date: Fri Sep 29 15:12:54 2023 +0200 tree-optimization/115254 - don't account single-lane SLP against discovery limit The following avoids accounting single-lane SLP to the discovery limit. As the two testcases show this makes discovery fail, unfortunately even not the same across targets. The following should fix two FAILs for GCN as a side-effect. PR tree-optimization/115254 * tree-vect-slp.cc (vect_build_slp_tree): Only account multi-lane SLP to limit. * gcc.dg/vect/slp-cond-2-big-array.c: Expect 4 times SLP. * gcc.dg/vect/slp-cond-2.c: Likewise. Diff: --- gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/slp-cond-2.c | 2 +- gcc/tree-vect-slp.cc | 31 ++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c index cb7eb94b3a3..9a9f63c0b8d 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c @@ -128,4 +128,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-2.c b/gcc/testsuite/gcc.dg/vect/slp-cond-2.c index 1dcee46cd95..08bbb3dbec6 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-cond-2.c +++ b/gcc/testsuite/gcc.dg/vect/slp-cond-2.c @@ -128,4 +128,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index c7ed520b629..7a963e28063 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1725,21 +1725,26 @@ vect_build_slp_tree (vec_info 
*vinfo, SLP_TREE_SCALAR_STMTS (res) = stmts; bst_map->put (stmts.copy (), res); - if (*limit == 0) + /* Single-lane SLP doesn't have the chance of run-away, do not account + it to the limit. */ + if (stmts.length () > 1) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, -"SLP discovery limit exceeded\n"); - /* Mark the node invalid so we can detect those when still in use -as backedge destinations. */ - SLP_TREE_SCALAR_STMTS (res) = vNULL; - SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; - res->failed = XNEWVEC (bool, group_size); - memset (res->failed, 0, sizeof (bool) * group_size); - memset (matches, 0, sizeof (bool) * group_size); - return NULL; + if (*limit == 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +"SLP discovery limit exceeded\n"); + /* Mark the node invalid so we can detect those when still in use +as backedge destinations. */ + SLP_TREE_SCALAR_STMTS (res) = vNULL; + SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; + res->failed = XNEWVEC (bool, group_size); + memset (res->failed, 0, sizeof (bool) * group_size); + memset (matches, 0, sizeof (bool) * group_size); + return NULL; + } + --*limit; } - --*limit; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location,
[gcc r15-858] Fix SLP reduction neutral op value for pointer reductions
https://gcc.gnu.org/g:65aa46ffc3b06bba3d49b9b8315610c706a1215b commit r15-858-g65aa46ffc3b06bba3d49b9b8315610c706a1215b Author: Richard Biener Date: Mon May 27 11:38:11 2024 +0200 Fix SLP reduction neutral op value for pointer reductions When the neutral op is the initial value we might need to convert it from pointer to integer. * tree-vect-loop.cc (get_initial_defs_for_reduction): Convert neutral op to the vector component type. Diff: --- gcc/tree-vect-loop.cc | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 83c0544b6aa..3b94bb13a8b 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5616,7 +5616,14 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo, /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ if (i >= initial_values.length () || (j > i && neutral_op)) - op = neutral_op; + { + if (!useless_type_conversion_p (TREE_TYPE (vector_type), + TREE_TYPE (neutral_op))) + neutral_op = gimple_convert (_seq, +TREE_TYPE (vector_type), +neutral_op); + op = neutral_op; + } else { if (!useless_type_conversion_p (TREE_TYPE (vector_type),
[gcc r15-851] Fix points-to SCC collapsing bug
https://gcc.gnu.org/g:07cdba6294756af350198fbb01ea8c8efeac54dd commit r15-851-g07cdba6294756af350198fbb01ea8c8efeac54dd Author: Richard Biener Date: Mon May 27 13:50:14 2024 +0200 Fix points-to SCC collapsing bug When points-to analysis finds SCCs it marks the wrong node as being part of a found cycle. It only wants to mark the node it collapses to but marked the entry node found rather than the one it collapses to. This causes fallout in the patch for PR115236 but generally weakens the points-to solution by collapsing too many nodes. Note that this fix might slow down points-to solving. * tree-ssa-structalias.cc (scc_visit): Mark the node we collapse to as being in a component. Diff: --- gcc/tree-ssa-structalias.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index a39b36c146e..f93c5df0767 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -1534,8 +1534,10 @@ scc_visit (constraint_graph_t graph, class scc_info *si, unsigned int n) graph->indirect_cycles[i - FIRST_REF_NODE] = lowest_node; } } + bitmap_set_bit (si->deleted, lowest_node); } - bitmap_set_bit (si->deleted, n); + else + bitmap_set_bit (si->deleted, n); } else si->scc_stack.safe_push (n);
[gcc r15-850] tree-optimization/115220 - fix store sinking virtual operand constraints
https://gcc.gnu.org/g:f9fbb47987efc8b5261e4cc36613c928a8693493 commit r15-850-gf9fbb47987efc8b5261e4cc36613c928a8693493 Author: Richard Biener Date: Mon May 27 09:40:19 2024 +0200 tree-optimization/115220 - fix store sinking virtual operand constraints The following makes sure the virtual operand updating when sinking stores works for the case we ignore paths to kills. The final sink location might not post-dominate the original stmt location which would require inserting of a virtual PHI which we do not support. PR tree-optimization/115220 PR tree-optimization/115226 * tree-ssa-sink.cc (statement_sink_location): When ignoring paths to kills when sinking stores make sure the final sink location is still post-dominated by the original one. Otherwise we'd need to insert a PHI node to merge virtual operands. * gcc.dg/torture/pr115220.c: New testcase. * gcc.dg/torture/pr115226.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115220.c | 14 ++ gcc/testsuite/gcc.dg/torture/pr115226.c | 15 +++ gcc/tree-ssa-sink.cc| 12 +--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr115220.c b/gcc/testsuite/gcc.dg/torture/pr115220.c new file mode 100644 index 000..e7b5da6ba42 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115220.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-additional-options "--param logical-op-non-short-circuit=0" } */ + +extern char **environ; +static char ***p_environ = +int +_setenv_r (const char *name, const char *value) +{ + register char *C; + int offset; + for (C = (*p_environ)[offset]; (*C = *name++) && *C != '='; ++C); + for (*C++ = '='; (*C++ = *value++) != 0;); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr115226.c b/gcc/testsuite/gcc.dg/torture/pr115226.c new file mode 100644 index 000..9a0bc7c9b6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115226.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +extern void c(); +int a, b; +int main() { + while (b) { +int d, e = 0, *f = 
+*f = 1; +e = 1 >> d ? : 1 << d; +if (e) + a = 0; +c(); + } + return 0; +} diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc index b0fe871cf1e..8c551e42a4d 100644 --- a/gcc/tree-ssa-sink.cc +++ b/gcc/tree-ssa-sink.cc @@ -467,11 +467,17 @@ statement_sink_location (gimple *stmt, basic_block frombb, if (!sinkbb) return false; - sinkbb = select_best_block (frombb, sinkbb, stmt); - if (sinkbb == frombb) + basic_block bestbb = select_best_block (frombb, sinkbb, stmt); + if (bestbb == frombb + /* When we sink a store make sure there's not a path to any of +the possibly skipped killing defs as that wrecks the virtual +operand update, requiring inserting of a PHI node. */ + || (gimple_vdef (stmt) + && bestbb != sinkbb + && !dominated_by_p (CDI_POST_DOMINATORS, bestbb, sinkbb))) return false; - *togsi = gsi_after_labels (sinkbb); + *togsi = gsi_after_labels (bestbb); return true; }
[gcc r14-10247] tree-optimization/115232 - demangle failure during -Waccess
https://gcc.gnu.org/g:4790076933ef9337553c3fbbc52a93cce78c584f commit r14-10247-g4790076933ef9337553c3fbbc52a93cce78c584f Author: Richard Biener Date: Mon May 27 09:13:11 2024 +0200 tree-optimization/115232 - demangle failure during -Waccess For the following testcase we fail to demangle _ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnernwEm and _ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnerdlEPv and in turn end up building NULL references. The following puts in a safeguard for faile demangling into -Waccess. PR tree-optimization/115232 * gimple-ssa-warn-access.cc (new_delete_mismatch_p): Handle failure to demangle gracefully. * g++.dg/pr115232.C: New testcase. (cherry picked from commit 311d7f5c17b8969c7ed8e4f23178d6ec4752e33f) Diff: --- gcc/gimple-ssa-warn-access.cc | 2 +- gcc/testsuite/g++.dg/pr115232.C | 25 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/gcc/gimple-ssa-warn-access.cc b/gcc/gimple-ssa-warn-access.cc index dedaae27b31..194d1a2c02a 100644 --- a/gcc/gimple-ssa-warn-access.cc +++ b/gcc/gimple-ssa-warn-access.cc @@ -1762,7 +1762,7 @@ new_delete_mismatch_p (tree new_decl, tree delete_decl) void *np = NULL, *dp = NULL; demangle_component *ndc = cplus_demangle_v3_components (new_str, 0, ); demangle_component *ddc = cplus_demangle_v3_components (del_str, 0, ); - bool mismatch = new_delete_mismatch_p (*ndc, *ddc); + bool mismatch = ndc && ddc && new_delete_mismatch_p (*ndc, *ddc); free (np); free (dp); return mismatch; diff --git a/gcc/testsuite/g++.dg/pr115232.C b/gcc/testsuite/g++.dg/pr115232.C new file mode 100644 index 000..e1d96d8f899 --- /dev/null +++ b/gcc/testsuite/g++.dg/pr115232.C @@ -0,0 +1,25 @@ +// { dg-do compile } +// { dg-require-effective-target c++20 } + +using size_t = decltype(sizeof(0)); +template +static constexpr bool cst = true; +template +struct Outer +{ +Outer(); +template void method() requires cst +{ +struct Inner +{ +static void* operator new(size_t){return new char;} +static void operator delete(void*){} +Outer t; 
+}; +new Inner; +} +}; +void f() +{ +Outer{}.method(); +}
[gcc r15-848] tree-optimization/115232 - demangle failure during -Waccess
https://gcc.gnu.org/g:311d7f5c17b8969c7ed8e4f23178d6ec4752e33f commit r15-848-g311d7f5c17b8969c7ed8e4f23178d6ec4752e33f Author: Richard Biener Date: Mon May 27 09:13:11 2024 +0200 tree-optimization/115232 - demangle failure during -Waccess For the following testcase we fail to demangle _ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnernwEm and _ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnerdlEPv and in turn end up building NULL references. The following puts in a safeguard for faile demangling into -Waccess. PR tree-optimization/115232 * gimple-ssa-warn-access.cc (new_delete_mismatch_p): Handle failure to demangle gracefully. * g++.dg/pr115232.C: New testcase. Diff: --- gcc/gimple-ssa-warn-access.cc | 2 +- gcc/testsuite/g++.dg/pr115232.C | 25 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/gcc/gimple-ssa-warn-access.cc b/gcc/gimple-ssa-warn-access.cc index 0cd5b6d6ef4..61f9f0f3d31 100644 --- a/gcc/gimple-ssa-warn-access.cc +++ b/gcc/gimple-ssa-warn-access.cc @@ -1762,7 +1762,7 @@ new_delete_mismatch_p (tree new_decl, tree delete_decl) void *np = NULL, *dp = NULL; demangle_component *ndc = cplus_demangle_v3_components (new_str, 0, ); demangle_component *ddc = cplus_demangle_v3_components (del_str, 0, ); - bool mismatch = new_delete_mismatch_p (*ndc, *ddc); + bool mismatch = ndc && ddc && new_delete_mismatch_p (*ndc, *ddc); free (np); free (dp); return mismatch; diff --git a/gcc/testsuite/g++.dg/pr115232.C b/gcc/testsuite/g++.dg/pr115232.C new file mode 100644 index 000..e1d96d8f899 --- /dev/null +++ b/gcc/testsuite/g++.dg/pr115232.C @@ -0,0 +1,25 @@ +// { dg-do compile } +// { dg-require-effective-target c++20 } + +using size_t = decltype(sizeof(0)); +template +static constexpr bool cst = true; +template +struct Outer +{ +Outer(); +template void method() requires cst +{ +struct Inner +{ +static void* operator new(size_t){return new char;} +static void operator delete(void*){} +Outer t; +}; +new Inner; +} +}; +void f() +{ +Outer{}.method(); +}
[gcc r15-816] Fix gcc.dg/vect/vect-gather-4.c for cascadelake
https://gcc.gnu.org/g:85e2ce10f76aee93e43aab6558cf8e39cec911e4 commit r15-816-g85e2ce10f76aee93e43aab6558cf8e39cec911e4 Author: Richard Biener Date: Fri May 24 13:15:38 2024 +0200 Fix gcc.dg/vect/vect-gather-4.c for cascadelake There's not really a good way to test what the testcase wants to test, the following exchanges one dump scan for another (imperfect) one. * gcc.dg/vect/vect-gather-4.c: Scan for not vectorizing using SLP. Diff: --- gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c index 1ce63e69199..d18094d6982 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c @@ -45,4 +45,4 @@ f3 (int *restrict y, int *restrict x, int *restrict indices) } } -/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */ +/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" vect } } */
[gcc r15-815] tree-optimization/115144 - improve sinking destination choice
https://gcc.gnu.org/g:5b9b3bae33cae7fca2e3c3e3028be6b8bee9b698 commit r15-815-g5b9b3bae33cae7fca2e3c3e3028be6b8bee9b698 Author: Richard Biener Date: Wed May 22 09:16:51 2024 +0200 tree-optimization/115144 - improve sinking destination choice When sinking code closer to its uses we already try to minimize the distance we move by inserting at the start of the basic-block. The following makes sure to sink closest to the control dependence check of the region we want to sink to as well as make sure to ignore control dependences that are only guarding exceptional code. This restores somewhat the old profile check but without requiring nearly even probabilities. The patch also makes sure to not give up completely when the best sink location is one we do not want to sink to but possibly then choose the next best one. PR tree-optimization/115144 * tree-ssa-sink.cc (do_not_sink): New function, split out from ... (select_best_block): Here. First pick valid block to sink to. From that search for the best valid block, avoiding sinking across conditions to exceptional code. (sink_code_in_bb): When updating vuses of stores in paths we do not sink a store to make sure we didn't pick a dominating sink location. * gcc.dg/tree-ssa/ssa-sink-22.c: New testcase. 
Diff: --- gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c | 14 gcc/tree-ssa-sink.cc| 106 +++- 2 files changed, 86 insertions(+), 34 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c new file mode 100644 index 000..e35626d4070 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-sink1-details" } */ + +extern void abort (void); + +int foo (int x, int y, int f) +{ + int tem = x / y; + if (f) +abort (); + return tem; +} + +/* { dg-final { scan-tree-dump-not "Sinking" "sink1" } } */ diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc index 2188b7523c7..b0fe871cf1e 100644 --- a/gcc/tree-ssa-sink.cc +++ b/gcc/tree-ssa-sink.cc @@ -172,6 +172,39 @@ nearest_common_dominator_of_uses (def_operand_p def_p, bool *debug_stmts) return commondom; } +/* Return whether sinking STMT from EARLY_BB to BEST_BB should be avoided. */ + +static bool +do_not_sink (gimple *stmt, basic_block early_bb, basic_block best_bb) +{ + /* Placing a statement before a setjmp-like function would be invalid + (it cannot be reevaluated when execution follows an abnormal edge). + If we selected a block with abnormal predecessors, just punt. */ + if (bb_has_abnormal_pred (best_bb)) +return true; + + /* If the latch block is empty, don't make it non-empty by sinking + something into it. */ + if (best_bb == early_bb->loop_father->latch + && empty_block_p (best_bb)) +return true; + + /* Avoid turning an unconditional read into a conditional one when we + still might want to perform vectorization. 
*/ + if (best_bb->loop_father == early_bb->loop_father + && loop_outer (best_bb->loop_father) + && !best_bb->loop_father->inner + && gimple_vuse (stmt) + && !gimple_vdef (stmt) + && flag_tree_loop_vectorize + && !(cfun->curr_properties & PROP_loop_opts_done) + && dominated_by_p (CDI_DOMINATORS, best_bb->loop_father->latch, early_bb) + && !dominated_by_p (CDI_DOMINATORS, best_bb->loop_father->latch, best_bb)) +return true; + + return false; +} + /* Given EARLY_BB and LATE_BB, two blocks in a path through the dominator tree, return the best basic block between them (inclusive) to place statements. @@ -185,54 +218,57 @@ select_best_block (basic_block early_bb, basic_block late_bb, gimple *stmt) { + /* First pick a block we do not disqualify. */ + while (late_bb != early_bb +&& do_not_sink (stmt, early_bb, late_bb)) +late_bb = get_immediate_dominator (CDI_DOMINATORS, late_bb); + basic_block best_bb = late_bb; basic_block temp_bb = late_bb; - while (temp_bb != early_bb) { /* Walk up the dominator tree, hopefully we'll find a shallower loop nest. */ temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb); + /* Do not consider blocks we do not want to sink to. */ + if (temp_bb != early_bb && do_not_sink (stmt, early_bb, temp_bb)) + ; + /* If we've moved into a lower loop nest, then that becomes our best block. */ - if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb)) + else if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb)) best_bb = temp_bb; -} - /* Placing a statement before a setjmp-like function would be invalid - (it cannot be reevaluated when execution
[gcc r15-812] Avoid splitting store dataref groups during SLP discovery
https://gcc.gnu.org/g:c71886f2ca2e46ce1449c7064d6f1b447d02fcba commit r15-812-gc71886f2ca2e46ce1449c7064d6f1b447d02fcba Author: Richard Biener Date: Fri Sep 29 13:13:16 2023 +0200 Avoid splitting store dataref groups during SLP discovery The following avoids splitting store dataref groups during SLP discovery but instead forces (eventually single-lane) consecutive lane SLP discovery for all lanes of the group, creating VEC_PERM SLP nodes merging them so the store will always cover the whole group. With this for example int x[1024], y[1024], z[1024], w[1024]; void foo (void) { for (int i = 0; i < 256; i++) { x[4*i+0] = y[2*i+0]; x[4*i+1] = y[2*i+1]; x[4*i+2] = z[i]; x[4*i+3] = w[i]; } } which was previously using hybrid SLP can now be fully SLPed and SSE code generated looks better (but of course you never know, I didn't actually benchmark). We of course need a VF of four here. .L2: movdqa z(%rax), %xmm0 movdqa w(%rax), %xmm4 movdqa y(%rax,%rax), %xmm2 movdqa y+16(%rax,%rax), %xmm1 movdqa %xmm0, %xmm3 punpckhdq %xmm4, %xmm0 punpckldq %xmm4, %xmm3 movdqa %xmm2, %xmm4 shufps $238, %xmm3, %xmm2 movaps %xmm2, x+16(,%rax,4) movdqa %xmm1, %xmm2 shufps $68, %xmm3, %xmm4 shufps $68, %xmm0, %xmm2 movaps %xmm4, x(,%rax,4) shufps $238, %xmm0, %xmm1 movaps %xmm2, x+32(,%rax,4) movaps %xmm1, x+48(,%rax,4) addq$16, %rax cmpq$1024, %rax jne .L2 The extra permute nodes merging distinct branches of the SLP tree might be unexpected for some code, esp. since SLP_TREE_REPRESENTATIVE cannot be meaningfully set and we cannot populate SLP_TREE_SCALAR_STMTS or SLP_TREE_SCALAR_OPS consistently as we can have a mix of both. The patch keeps the sub-trees form consecutive lanes but that's in principle not necessary if we for example have an even/odd split which now would result in N single-lane sub-trees. That's left for future improvements. The interesting part is how VLA vector ISAs handle merging of two vectors that's not trivial even/odd merging. 
The strathegy of how to build the permute tree might need adjustments for that (in the end splitting each branch to single lanes and then doing even/odd merging would be the brute-force fallback). Not sure how much we can or should rely on the SLP optimize pass to handle this. The gcc.dg/vect/slp-12a.c case is interesting as we currently split the 8 store group into lanes 0-5 which we SLP with an unroll factor of two (on x86-64 with SSE) and the remaining two lanes are using interleaving vectorization with a final unroll factor of four. Thus we're using hybrid SLP within a single store group. After the change we discover the same 0-5 lane SLP part as well as two single-lane parts feeding the full store group. But that results in a load permutation that isn't supported (I have WIP patchs to rectify that). So we end up cancelling SLP and vectorizing the whole loop with interleaving which is IMO good and results in better code. This is similar for gcc.target/i386/pr52252-atom.c where interleaving generates much better code than hybrid SLP. I'm unsure how to update the testcase though. gcc.dg/vect/slp-21.c runs into similar situations. Note that when when analyzing SLP operations we discard an instance we currently force the full loop to have no SLP because hybrid detection is broken. It's probably not worth fixing this at this moment. For gcc.dg/vect/pr97428.c we are not splitting the 16 store group into two but merge the two 8 lane loads into one before doing the store and thus have only a single SLP instance. A similar situation happens in gcc.dg/vect/slp-11c.c but the branches feeding the single SLP store only have a single lane. Likewise for gcc.dg/vect/vect-complex-5.c and gcc.dg/vect/vect-gather-2.c. gcc.dg/vect/slp-cond-1.c has an additional SLP vectorization with a SLP store group of size two but two single-lane branches. 
* tree-vect-slp.cc (vect_build_slp_instance): Do not split store dataref groups on loop SLP discovery failure but create a single SLP instance for the stores but branch to SLP sub-trees and merge with a series of VEC_PERM nodes. * gcc.dg/vect/pr97428.c: Expect a single store SLP group. * gcc.dg/vect/slp-11c.c: Likewise, if !vect_load_lanes. * gcc.dg/vect/vect-complex-5.c: Likewise. * gcc.dg/vect/slp-12a.c: Do not expect SLP.
[gcc r15-793] tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution
https://gcc.gnu.org/g:2b2476d4d18c92b8aba3567ebccd2100c2f7c258 commit r15-793-g2b2476d4d18c92b8aba3567ebccd2100c2f7c258 Author: Richard Biener Date: Thu May 23 14:36:39 2024 +0200 tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution Forgot a check for an SSA name before trying to replace a PHI arg with its current definition. PR tree-optimization/115197 * tree-loop-distribution.cc (copy_loop_before): Constant PHI args remain the same. * gcc.dg/pr115197.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/pr115197.c | 14 ++ gcc/tree-loop-distribution.cc | 7 +-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr115197.c b/gcc/testsuite/gcc.dg/pr115197.c new file mode 100644 index 000..00d674b3bd9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr115197.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -fno-tree-scev-cprop -ftree-pre -ftree-loop-distribute-patterns" } */ + +int a, b[2], c, d, e, f[2]; +int main() { + while (a) +if (d) { + if (e) +return 0; + for (; c; c++) +f[c] = 0 < (b[c] = ~(f[c + 1] < a)); +} + return 0; +} diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 668dc420449..4d1ed234fcb 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -977,8 +977,11 @@ copy_loop_before (class loop *loop, bool redirect_lc_phi_defs) if (virtual_operand_p (gimple_phi_result (phi))) continue; use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit); - tree new_def = get_current_def (USE_FROM_PTR (use_p)); - SET_USE (use_p, new_def); + if (TREE_CODE (USE_FROM_PTR (use_p)) == SSA_NAME) + { + tree new_def = get_current_def (USE_FROM_PTR (use_p)); + SET_USE (use_p, new_def); + } } }
[gcc r15-792] tree-optimization/115199 - fix PTA constraint processing for LHS
https://gcc.gnu.org/g:f0a02467bbc35a478eb82f5a8a7e8870827b51fc commit r15-792-gf0a02467bbc35a478eb82f5a8a7e8870827b51fc Author: Richard Biener Date: Thu May 23 13:33:15 2024 +0200 tree-optimization/115199 - fix PTA constraint processing for LHS When processing a = X constraint we treat it as *ANYTHING = X during constraint processing but then end up recording it as = X anyway, breaking constraint graph building. This is because we only update the local copy of the LHS and not the constraint itself. PR tree-optimization/115199 * tree-ssa-structalias.cc (process_constraint): Also record = X as *ANYTING = X in the end. * gcc.dg/torture/pr115199.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115199.c | 24 gcc/tree-ssa-structalias.cc | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr115199.c b/gcc/testsuite/gcc.dg/torture/pr115199.c new file mode 100644 index 000..981a7330b32 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115199.c @@ -0,0 +1,24 @@ +/* { dg-do run } */ + +struct b { + char *volatile c; +}; +struct b * __attribute__((noipa)) +d() +{ + char *e; + struct b *b = __builtin_malloc(sizeof(b)); + void *f = __builtin_malloc(1); + + e = __builtin_memcpy(f, "z", 1); + b->c = e; + return b; +} + +int main() +{ + struct b b = *d(); + if (b.c[0] != 'z') +__builtin_abort(); + return 0; +} diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index 0e9423a78ec..a39b36c146e 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -3104,7 +3104,7 @@ process_constraint (constraint_t t) it here by turning it into *ANYTHING. */ if (lhs.type == ADDRESSOF && lhs.var == anything_id) -lhs.type = DEREF; +t->lhs.type = lhs.type = DEREF; /* ADDRESSOF on the lhs is invalid. */ gcc_assert (lhs.type != ADDRESSOF);
[gcc r15-791] tree-optimization/115138 - ptr-vs-ptr and FUNCTION_DECLs
https://gcc.gnu.org/g:61f5b3c59ed20438d7d9918d7a83d29a21097d4e commit r15-791-g61f5b3c59ed20438d7d9918d7a83d29a21097d4e Author: Richard Biener Date: Thu May 23 11:26:14 2024 +0200 tree-optimization/115138 - ptr-vs-ptr and FUNCTION_DECLs I failed to realize we do not represent FUNCTION_DECLs or LABEL_DECLs in vars explicitly and thus have to compare pt.vars_contains_nonlocal. PR tree-optimization/115138 * tree-ssa-alias.cc (ptrs_compare_unequal): Make sure pt.vars_contains_nonlocal differs since we do not represent FUNCTION_DECLs or LABEL_DECLs in vars explicitly. * gcc.dg/torture/pr115138.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115138.c | 28 gcc/tree-ssa-alias.cc | 6 ++ 2 files changed, 34 insertions(+) diff --git a/gcc/testsuite/gcc.dg/torture/pr115138.c b/gcc/testsuite/gcc.dg/torture/pr115138.c new file mode 100644 index 000..6becaecbaff --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115138.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ + +int foo (int) {} +int bar (int) {} + +typedef int (*pred)(int); + +int x, y; +pred A () { if (x) return foo; else return bar; } +pred B () { if (y) return foo; else return bar; } +int __attribute__((noipa)) baz() +{ + pred a = A(); + pred b = B(); + if (a != b) +return 42; + return 0; +} + +int main() +{ + if (baz () != 0) +__builtin_abort (); + y = 1; + if (baz () != 42) +__builtin_abort (); + return 0; +} diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index d64d6d02f4a..1a91d63a31e 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -501,6 +501,12 @@ ptrs_compare_unequal (tree ptr1, tree ptr2) || pi2->pt.vars_contains_interposable) return false; if ((!pi1->pt.null || !pi2->pt.null) + /* ??? We do not represent FUNCTION_DECL and LABEL_DECL +in pt.vars but only set pt.vars_contains_nonlocal. This +makes compares involving those and other nonlocals +imprecise. 
*/ + && (!pi1->pt.vars_contains_nonlocal + || !pi2->pt.vars_contains_nonlocal) && (!pt_solution_includes_const_pool (>pt) || !pt_solution_includes_const_pool (>pt))) return !pt_solutions_intersect (>pt, >pt);
[gcc r15-773] Fix mixed input kind permute optimization
https://gcc.gnu.org/g:3507ab1b018a68500e49fa9f1de7caa0f1b53dda commit r15-773-g3507ab1b018a68500e49fa9f1de7caa0f1b53dda Author: Richard Biener Date: Tue May 21 19:15:33 2024 +0200 Fix mixed input kind permute optimization When change_vec_perm_layout runs into a permute combining two nodes where one is invariant and one internal the partition of one input can be -1 but the other might not be. The following supports this case by simply ignoring inputs with input partiton -1. I'm not sure this is correct but it avoids ICEing when accessing that partitions layout for gcc.target/i386/pr98928.c with the change to avoid splitting store dataref groups during SLP discovery. * tree-vect-slp.cc (change_vec_perm_layout): Ignore an input partition of -1. Diff: --- gcc/tree-vect-slp.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 43f2c153bf0..3f8209b43a7 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -4640,6 +4640,8 @@ change_vec_perm_layout (slp_tree node, lane_permutation_t , { slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first]; unsigned int in_partition_i = m_vertices[in_node->vertex].partition; + if (in_partition_i == -1u) + continue; this_in_layout_i = m_partitions[in_partition_i].layout; } if (this_in_layout_i > 0)
[gcc r15-772] Avoid SLP_REPRESENTATIVE access for VEC_PERM in SLP scheduling
https://gcc.gnu.org/g:31e9bae0ea5e5413abfa3ca9050e66cc6760553e commit r15-772-g31e9bae0ea5e5413abfa3ca9050e66cc6760553e Author: Richard Biener Date: Fri May 17 15:23:38 2024 +0200 Avoid SLP_REPRESENTATIVE access for VEC_PERM in SLP scheduling SLP permute nodes can end up without a SLP_REPRESENTATIVE now, the following avoids touching it in this case in vect_schedule_slp_node. * tree-vect-slp.cc (vect_schedule_slp_node): Avoid looking at SLP_REPRESENTATIVE for VEC_PERM nodes. Diff: --- gcc/tree-vect-slp.cc | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index f34ed54a70b..43f2c153bf0 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -9301,13 +9301,8 @@ vect_schedule_slp_node (vec_info *vinfo, gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); - if (dump_enabled_p ()) -dump_printf_loc (MSG_NOTE, vect_location, -"-->vectorizing SLP node starting from: %G", -stmt_info->stmt); - - if (STMT_VINFO_DATA_REF (stmt_info) - && SLP_TREE_CODE (node) != VEC_PERM_EXPR) + if (SLP_TREE_CODE (node) != VEC_PERM_EXPR + && STMT_VINFO_DATA_REF (stmt_info)) { /* Vectorized loads go before the first scalar load to make it ready early, vectorized stores go before the last scalar @@ -9319,10 +9314,10 @@ vect_schedule_slp_node (vec_info *vinfo, last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); si = gsi_for_stmt (last_stmt_info->stmt); } - else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type - || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type - || STMT_VINFO_TYPE (stmt_info) == phi_info_type) - && SLP_TREE_CODE (node) != VEC_PERM_EXPR) + else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR + && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type + || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type + || STMT_VINFO_TYPE (stmt_info) == phi_info_type)) { /* For PHI node vectorization we do not use the insertion iterator. 
*/ si = gsi_none (); @@ -9456,6 +9451,9 @@ vect_schedule_slp_node (vec_info *vinfo, /* Handle purely internal nodes. */ if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +"-->vectorizing SLP permutation node\n"); /* ??? the transform kind is stored to STMT_VINFO_TYPE which might be shared with different SLP nodes (but usually it's the same operation apart from the case the stmt is only there for denoting @@ -9474,7 +9472,13 @@ vect_schedule_slp_node (vec_info *vinfo, } } else -vect_transform_stmt (vinfo, stmt_info, , node, instance); +{ + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +"-->vectorizing SLP node starting from: %G", +stmt_info->stmt); + vect_transform_stmt (vinfo, stmt_info, , node, instance); +} } /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
[gcc r15-771] Avoid requiring VEC_PERM representatives
https://gcc.gnu.org/g:0c7792f707368d0225a9a457895b847ef660c270 commit r15-771-g0c7792f707368d0225a9a457895b847ef660c270 Author: Richard Biener Date: Fri May 17 14:26:38 2024 +0200 Avoid requiring VEC_PERM represenatives The following plugs one hole where we require a VEC_PERM node representative unnecessarily. This is for vect_check_store_rhs which looks at the RHS and checks whether a constant can be native encoded. The fix is to guard that with vect_constant_def additionally and making vect_is_simple_use forgiving for a missing SLP_TREE_REPRESENTATIVE when the child is a VEC_PERM node, initializing the scalar def to error_mark_node. * tree-vect-stmts.cc (vect_check_store_rhs): Look at *rhs only when it's a vec_constant_def. (vect_is_simple_use): When we have no representative for an internal node, fill in *op with error_mark_node. Diff: --- gcc/tree-vect-stmts.cc | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 672959501bb..4219ad832db 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2553,7 +2553,8 @@ vect_check_store_rhs (vec_info *vinfo, stmt_vec_info stmt_info, /* In the case this is a store from a constant make sure native_encode_expr can handle it. */ - if (CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0) + if (rhs_dt == vect_constant_def + && CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -14002,8 +14003,26 @@ vect_is_simple_use (vec_info *vinfo, stmt_vec_info stmt, slp_tree slp_node, *vectype = SLP_TREE_VECTYPE (child); if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) { - *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt); - return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out); + /* ??? 
VEC_PERM nodes might be intermediate and their lane value +have no representative (nor do we build a VEC_PERM stmt for +the actual operation). Note for two-operator nodes we set +a representative but leave scalar stmts empty as we'd only +have one for a subset of lanes. Ideally no caller would +require *op for internal defs. */ + if (SLP_TREE_REPRESENTATIVE (child)) + { + *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt); + return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out); + } + else + { + gcc_assert (SLP_TREE_CODE (child) == VEC_PERM_EXPR); + *op = error_mark_node; + *dt = vect_internal_def; + if (def_stmt_info_out) + *def_stmt_info_out = NULL; + return true; + } } else {
[gcc r15-768] web/115183 - fix typo in C++ docs
https://gcc.gnu.org/g:424f8a01df9b311250e416759ad61c00bba4af48 commit r15-768-g424f8a01df9b311250e416759ad61c00bba4af48 Author: Richard Biener Date: Wed May 22 10:19:08 2024 +0200 web/115183 - fix typo in C++ docs The following fixes a reported typo. * doc/invoke.texi (C++ Modules): Fix typo. Diff: --- gcc/doc/invoke.texi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 218901c0b20..0625a5ede6f 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -37646,7 +37646,7 @@ not get debugging information for routines in the precompiled header. @cindex speed of compilation Modules are a C++20 language feature. As the name suggests, they -provides a modular compilation system, intending to provide both +provide a modular compilation system, intending to provide both faster builds and better library isolation. The ``Merging Modules'' paper @uref{https://wg21.link/p1103}, provides the easiest to read set of changes to the standard, although it does not capture later
gcc-wwwdocs branch master updated. 9d10f6fccee3a68102173f28cf312ed266b7d95d
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "gcc-wwwdocs". The branch, master has been updated via 9d10f6fccee3a68102173f28cf312ed266b7d95d (commit) from 9e79c5e411eb3236b481c6093fad4dc5ae5141c5 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log - commit 9d10f6fccee3a68102173f28cf312ed266b7d95d Author: Richard Biener Date: Wed May 22 10:04:32 2024 +0200 web/115183 - Remove duplicate links to GCCGO online docs. diff --git a/htdocs/onlinedocs/13.1.0/index.html b/htdocs/onlinedocs/13.1.0/index.html index 2abc06ac..08d312ba 100644 --- a/htdocs/onlinedocs/13.1.0/index.html +++ b/htdocs/onlinedocs/13.1.0/index.html @@ -62,12 +62,6 @@ href="https://gcc.gnu.org/onlinedocs/gcc-13.1.0/libstdc++-api-gfdl.xml.gz;>XML GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.1.0/libstdc++-api-html.tar.gz;>an HTML tarball) - https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo/;>GCCGO 13.1 Manual (https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.pdf;>also in - PDF or https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.ps.gz;>PostScript or https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo-html.tar.gz;>an - HTML tarball) https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo/;>GCCGO 13.1 Manual (https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.pdf;>also in PDF or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/libstdc++-api-gfdl.xml.gz;>XML GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/libstdc++-api-html.tar.gz;>an HTML tarball) - https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo/;>GCCGO 13.2 Manual (https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.pdf;>also in - PDF or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.ps.gz;>PostScript or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo-html.tar.gz;>an - HTML tarball) https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo/;>GCCGO 13.2 Manual 
(https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.pdf;>also in PDF or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/libstdc++-api-gfdl.xml.gz;>XML GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/libstdc++-api-html.tar.gz;>an HTML tarball) - https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo/;>GCCGO 13.3 Manual (https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.pdf;>also in - PDF or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.ps.gz;>PostScript or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo-html.tar.gz;>an - HTML tarball) https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo/;>GCCGO 13.3 Manual (https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.pdf;>also in PDF or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/libstdc++-api-gfdl.xml.gz;>XML GFDL or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/libstdc++-api-html.tar.gz;>an HTML tarball) - https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo/;>GCCGO 14.1 Manual (https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.pdf;>also in - PDF or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.ps.gz;>PostScript or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo-html.tar.gz;>an - HTML tarball) https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo/;>GCCGO 14.1 Manual (https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.pdf;>also in PDF or
[gcc r15-753] tree-optimization/115137 - more ptr-vs-ptr compare fixes
https://gcc.gnu.org/g:85f7828679edc3ae7488594145756cd53787650e commit r15-753-g85f7828679edc3ae7488594145756cd53787650e Author: Richard Biener Date: Tue May 21 10:12:40 2024 +0200 tree-optimization/115137 - more ptr-vs-ptr compare fixes The following fixes the omission of const-pool included in NONLOCAL. PR tree-optimization/115137 * tree-ssa-structalias.cc (pt_solution_includes_const_pool): NONLOCAL also includes constant pool entries. * gcc.dg/torture/pr115137.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115137.c | 34 + gcc/tree-ssa-structalias.cc | 1 + 2 files changed, 35 insertions(+) diff --git a/gcc/testsuite/gcc.dg/torture/pr115137.c b/gcc/testsuite/gcc.dg/torture/pr115137.c new file mode 100644 index 000..9cd8ff93633 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115137.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ + +struct a { + int b; +} c; + +int d; +long e; + +static void f(char *g, char *h, struct a *l) { + char a[1024]; + int j = 0; + + if (d) +h = a; + + for (; g < h; g++) +if (__builtin_iscntrl(*g)) + ++j; + + while (l->b < j) +; +} + +int main() { + static const struct { +char *input; + } k[] = {{"somepage.html"}, {""}}; + + for (unsigned int i = 0; i < 1; ++i) { +e = __builtin_strlen(k[i].input); +f(k[i].input, k[i].input + e, ); + } +} diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index 61fb3610a17..0e9423a78ec 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -7087,6 +7087,7 @@ bool pt_solution_includes_const_pool (struct pt_solution *pt) { return (pt->const_pool + || pt->nonlocal || (pt->escaped && (!cfun || cfun->gimple_df->escaped.const_pool)) || (pt->ipa_escaped && ipa_escaped_pt.const_pool)); }
[gcc r15-750] tree-optimization/115149 - VOP live and missing PHIs
https://gcc.gnu.org/g:ec9b8bafe20755d13ab9a1b834b5da79ae972c0e commit r15-750-gec9b8bafe20755d13ab9a1b834b5da79ae972c0e Author: Richard Biener Date: Tue May 21 09:48:04 2024 +0200 tree-optimization/115149 - VOP live and missing PHIs The following fixes a bug in vop-live get_live_in which was using NULL to indicate the first processed edge but at the same time using it for the case the live-in virtual operand cannot be computed. The following fixes this, avoiding sinking a load to a place where we'd have to insert virtual PHIs to make the virtual operand SSA web OK. PR tree-optimization/115149 * tree-ssa-live.cc (virtual_operand_live::get_live_in): Explicitly track the first processed edge. * gcc.dg/pr115149.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/pr115149.c | 16 gcc/tree-ssa-live.cc| 8 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/pr115149.c b/gcc/testsuite/gcc.dg/pr115149.c new file mode 100644 index 000..9f6bc97dbe6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr115149.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -fno-tree-vrp -fno-ipa-sra -fno-tree-dce -fno-tree-ch" } */ + +int a, c, e, f, g, h[1], i; +static int j(int b) { return 0; } +static void k(int d) {} +int main() +{ + if (h[0]) +while (1) { + k(f && j(i && (h[g] = e))); + while (a) + c ^= 1; +} + return 0; +} diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc index e6ae551a457..60dfc05dcd9 100644 --- a/gcc/tree-ssa-live.cc +++ b/gcc/tree-ssa-live.cc @@ -1675,14 +1675,18 @@ virtual_operand_live::get_live_in (basic_block bb) edge_iterator ei; edge e; tree livein = NULL_TREE; + bool first = true; FOR_EACH_EDGE (e, ei, bb->preds) if (e->flags & EDGE_DFS_BACK) /* We can ignore backedges since if there's a def there it would have forced a PHI in the source because it also acts as use downstream. 
*/ continue; -else if (!livein) - livein = get_live_out (e->src); +else if (first) + { + livein = get_live_out (e->src); + first = false; + } else if (get_live_out (e->src) != livein) /* When there's no virtual use downstream this indicates a point where we'd insert a PHI merging the different live virtual
gcc-wwwdocs branch master updated. 48be8366fcbf9246b2b5b1625febb5e9202842b8
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "gcc-wwwdocs". The branch, master has been updated via 48be8366fcbf9246b2b5b1625febb5e9202842b8 (commit) from ed9ceba9b8b038f0e0f333798da7abe046679d0c (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log - commit 48be8366fcbf9246b2b5b1625febb5e9202842b8 Author: Richard Biener Date: Fri May 17 13:48:56 2024 +0200 Fix trunk status diff --git a/htdocs/index.html b/htdocs/index.html index 63fbcdcd..c52bb30b 100644 --- a/htdocs/index.html +++ b/htdocs/index.html @@ -222,7 +222,7 @@ More news? Let ger...@pfeifer.com know! https://gcc.gnu.org/pipermail/gcc/2024-April/243824.html;>2024-04-26 - (regression fixes docs only). + (general development). https://gcc.gnu.org/bugzilla/buglist.cgi?query_format=advancedshort_desc_type=regexpshort_desc=%5C%5B(%5B%200-9.%2F%5D*%5B%20%2F%5D)*15%5B%20%2F%5D%5B%200-9.%2F%5D*%5BRr%5Degression%20*%5C%5Dtarget_milestone=11.5target_milestone=12.4target_milestone=13.3target_milestone=14.2target_milestone=15.0known_to_fail_type=allwordssubstrknown_to_work_type=allwordssubstrlong_desc_type=allwordssubstrlong_desc=bug_file_loc_type=allwordssubstrbug_file_loc=gcchost_type=allwordssubstrgcchost=gcctarget_type=allwordssubstrgcctarget=gccbuild_type=allwordssubstrgccbuild=keywords_type=allwordskeywords=bug_status=UNCONFIRMEDbug_status=NEWbug_status=ASSIGNEDbug_status=SUSPENDEDbug_status=WAITINGbug_status=REOPENEDpriority=P1priority=P2priority=P3emailtype1=substringemail1=emailtype2=substringemail2=bugidtype=includebug_i d=votes=chfieldfrom=chfieldto=Nowchfieldvalue=cmdtype=doitorder=Reuse+same+sort+as+last+timefield0-0-0=nooptype0-0-0=noopvalue0-0-0=">Serious --- Summary of changes: htdocs/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) hooks/post-receive -- 
gcc-wwwdocs
[gcc r12-10456] tree-optimization/112281 - loop distribution and zero dependence distances
https://gcc.gnu.org/g:5db4b5449df8f59a61438f8db1836dbc9b53f02e commit r12-10456-g5db4b5449df8f59a61438f8db1836dbc9b53f02e Author: Richard Biener Date: Mon Nov 20 13:39:52 2023 +0100 tree-optimization/112281 - loop distribution and zero dependence distances The following fixes an omission in dependence testing for loop distribution. When the overall dependence distance is not zero but the dependence direction in the innermost common loop is = there is a conflict between the partitions and we have to merge them. PR tree-optimization/112281 * tree-loop-distribution.cc (loop_distribution::pg_add_dependence_edges): For = in the innermost common loop record a partition conflict. * gcc.dg/torture/pr112281-1.c: New testcase. * gcc.dg/torture/pr112281-2.c: Likewise. (cherry picked from commit 3b34902417259031823bff7f853f615a60464bbd) Diff: --- gcc/testsuite/gcc.dg/torture/pr112281-1.c | 18 ++ gcc/testsuite/gcc.dg/torture/pr112281-2.c | 18 ++ gcc/tree-loop-distribution.cc | 18 ++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr112281-1.c b/gcc/testsuite/gcc.dg/torture/pr112281-1.c new file mode 100644 index ..711f5663195c --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr112281-1.c @@ -0,0 +1,18 @@ +/* { dg-do run } */ +/* { dg-additional-options "-ftree-loop-distribution" } */ + +struct { + int : 8; + int a; +} b, d[4] = {{0}, {0}, {0}, {5}}; +int c, e; +int main() { + for (c = 2; c; c--) +for (e = 0; e < 2; e++) { + d[c] = b = d[c + 1]; + d[c + 1].a = 0; +} + if (b.a != 0) +__builtin_abort(); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr112281-2.c b/gcc/testsuite/gcc.dg/torture/pr112281-2.c new file mode 100644 index ..d7671e3322b4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr112281-2.c @@ -0,0 +1,18 @@ +/* { dg-do run } */ +/* { dg-additional-options "-ftree-loop-distribution" } */ + +struct { + int : 8; + int a; +} b, d[4] = {{5}, {0}, {0}, {0}}; +int c, e; +int main() { + for (c = 0; c < 2; c++) +for (e 
= 0; e < 2; e++) { + d[c + 1] = b = d[c]; + d[c].a = 0; +} + if (b.a != 0) +__builtin_abort(); + return 0; +} diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 606eb05e64a5..1b7d2a1ea7d2 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -2117,9 +2117,6 @@ loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, } else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE) { - if (DDR_REVERSED_P (ddr)) - this_dir = -this_dir; - /* Known dependences can still be unordered througout the iteration space, see gcc.dg/tree-ssa/ldist-16.c and gcc.dg/tree-ssa/pr94969.c. */ @@ -2132,7 +2129,20 @@ loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, /* Else as the distance vector is lexicographic positive swap the dependence direction. */ else - this_dir = -this_dir; + { + if (DDR_REVERSED_P (ddr)) + this_dir = -this_dir; + this_dir = -this_dir; + + /* When then dependence distance of the innermost common +loop of the DRs is zero we have a conflict. */ + auto l1 = gimple_bb (DR_STMT (dr1))->loop_father; + auto l2 = gimple_bb (DR_STMT (dr2))->loop_father; + int idx = index_in_loop_nest (find_common_loop (l1, l2)->num, + DDR_LOOP_NEST (ddr)); + if (DDR_DIST_VECT (ddr, 0)[idx] == 0) + this_dir = 2; + } } else this_dir = 0;
[gcc r12-10458] middle-end/110176 - wrong zext (bool) <= (int) 4294967295u folding
https://gcc.gnu.org/g:65e5547e5468ce404d0f9ebd646a1d63abf3a772 commit r12-10458-g65e5547e5468ce404d0f9ebd646a1d63abf3a772 Author: Richard Biener Date: Wed Jan 31 14:40:24 2024 +0100 middle-end/110176 - wrong zext (bool) <= (int) 4294967295u folding The following fixes a wrong pattern that didn't match the behavior of the original fold_widened_comparison in that get_unwidened returned a constant always in the wider type. But here we're using (int) 4294967295u without the conversion applied. Fixed by doing as earlier in the pattern - matching constants only if the conversion was actually applied. PR middle-end/110176 * match.pd (zext (bool) <= (int) 4294967295u): Make sure to match INTEGER_CST only without outstanding conversion. * gcc.dg/torture/pr110176.c: New testcase. (cherry picked from commit 22dbfbe8767ff4c1d93e39f68ec7c2d5b1358beb) Diff: --- gcc/match.pd| 12 - gcc/testsuite/gcc.dg/torture/pr110176.c | 46 + 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/gcc/match.pd b/gcc/match.pd index 0938d56fa45f..45ed34205106 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5379,19 +5379,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) >= TYPE_PRECISION (TREE_TYPE (@10))) && (TYPE_UNSIGNED (TREE_TYPE (@00)) == TYPE_UNSIGNED (TREE_TYPE (@10 - || (TREE_CODE (@10) == INTEGER_CST + || (TREE_CODE (@1) == INTEGER_CST && INTEGRAL_TYPE_P (TREE_TYPE (@00)) - && int_fits_type_p (@10, TREE_TYPE (@00) + && int_fits_type_p (@1, TREE_TYPE (@00) (cmp @00 (convert @10)) - (if (TREE_CODE (@10) == INTEGER_CST + (if (TREE_CODE (@1) == INTEGER_CST && INTEGRAL_TYPE_P (TREE_TYPE (@00)) - && !int_fits_type_p (@10, TREE_TYPE (@00))) + && !int_fits_type_p (@1, TREE_TYPE (@00))) (with { tree min = lower_bound_in_type (TREE_TYPE (@10), TREE_TYPE (@00)); tree max = upper_bound_in_type (TREE_TYPE (@10), TREE_TYPE (@00)); - bool above = integer_nonzerop (const_binop (LT_EXPR, type, max, @10)); - bool below = integer_nonzerop (const_binop (LT_EXPR, type, @10, min)); + bool above = 
integer_nonzerop (const_binop (LT_EXPR, type, max, @1)); + bool below = integer_nonzerop (const_binop (LT_EXPR, type, @1, min)); } (if (above || below) (if (cmp == EQ_EXPR || cmp == NE_EXPR) diff --git a/gcc/testsuite/gcc.dg/torture/pr110176.c b/gcc/testsuite/gcc.dg/torture/pr110176.c new file mode 100644 index ..e41e3a0c3a7e --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr110176.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ + +int f(_Bool t) +{ +int tt = t; +unsigned x = -1; +int xx = x; +return xx <= tt; +} + +int a, b; +void c() {} +__attribute__((noipa)) +void h() {__builtin_abort();} +int d() { + unsigned f[1]; + int i; + if (a) +goto h; + f[0] = -1; + while (1) { +c(); +for (; a < 1; a++) { + if (0) { + j: +continue; + } + i = f[0]; + if (a) +break; + b = i >= (b == 0); +} +if (!b) { + if (0) { + h: +goto j; + } + return 0; +} +h(); + } +} +int main() { + d(); + return 0; +}
[gcc r12-10454] tree-optimization/112505 - bit-precision induction vectorization
https://gcc.gnu.org/g:4a71557fbebe3fb4031d1c2adc4f89c89a8c6c62 commit r12-10454-g4a71557fbebe3fb4031d1c2adc4f89c89a8c6c62 Author: Richard Biener Date: Thu Jan 11 14:00:33 2024 +0100 tree-optimization/112505 - bit-precision induction vectorization Vectorization of bit-precision inductions isn't implemented but we don't check this, instead we ICE during transform. PR tree-optimization/112505 * tree-vect-loop.cc (vectorizable_induction): Reject bit-precision induction. * gcc.dg/vect/pr112505.c: New testcase. (cherry picked from commit ec345df53556ec581590347f71c3d9ff3cdbca76) Diff: --- gcc/testsuite/gcc.dg/vect/pr112505.c | 14 ++ gcc/tree-vect-loop.cc| 9 + 2 files changed, 23 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/pr112505.c b/gcc/testsuite/gcc.dg/vect/pr112505.c new file mode 100644 index ..56546c1095aa --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr112505.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3" } */ + +short int w9; +struct T { + short a : 14; + int b; +}; +struct T v; +void zc() +{ + for(int i = 0; i < 4; i ++) +w9 *= v.b ? v.a-- < 0 : 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 86ee9e449e19..fd0e5a70a962 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8211,6 +8211,15 @@ vectorizable_induction (loop_vec_info loop_vinfo, step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); gcc_assert (step_expr != NULL_TREE); + if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) + && !type_has_mode_precision_p (TREE_TYPE (step_expr))) +{ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +"bit-precision induction vectorization not " +"supported.\n"); + return false; +} tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); /* Check for backend support of PLUS/MINUS_EXPR. */
[gcc r12-10457] tree-optimization/111039 - abnormals and bit test merging
https://gcc.gnu.org/g:47e6bff94d980e2fcb6bcb42df04d3b73bd67da7 commit r12-10457-g47e6bff94d980e2fcb6bcb42df04d3b73bd67da7 Author: Richard Biener Date: Thu Aug 17 13:10:14 2023 +0200 tree-optimization/111039 - abnormals and bit test merging The following guards the bit test merging code in if-combine against the appearance of SSA names used in abnormal PHIs. PR tree-optimization/111039 * tree-ssa-ifcombine.cc (ifcombine_ifandif): Check for SSA_NAME_OCCURS_IN_ABNORMAL_PHI. * gcc.dg/pr111039.c: New testcase. (cherry picked from commit 482551a79a3d3f107f6239679ee74655cfe8707e) Diff: --- gcc/testsuite/gcc.dg/pr111039.c | 15 +++ gcc/tree-ssa-ifcombine.cc | 7 +++ 2 files changed, 22 insertions(+) diff --git a/gcc/testsuite/gcc.dg/pr111039.c b/gcc/testsuite/gcc.dg/pr111039.c new file mode 100644 index ..bec9983b35f8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr111039.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +int _setjmp (); +void abcd (); +void abcde (); +void compiler_corruption_function(int flags) +{ + int nowait = flags & 1048576, isexpand = flags & 8388608; + abcd(); + _setjmp(flags); + if (nowait && isexpand) +flags &= 0; + abcde(); +} diff --git a/gcc/tree-ssa-ifcombine.cc b/gcc/tree-ssa-ifcombine.cc index ce9bbebf9480..b139328af224 100644 --- a/gcc/tree-ssa-ifcombine.cc +++ b/gcc/tree-ssa-ifcombine.cc @@ -415,6 +415,9 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv, { tree t, t2; + if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)) + return false; + /* Do it. */ gsi = gsi_for_stmt (inner_cond); t = fold_build2 (LSHIFT_EXPR, TREE_TYPE (name1), @@ -465,6 +468,10 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv, gimple_stmt_iterator gsi; tree t; + if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1) + || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2)) + return false; + /* Find the common name which is bit-tested. */ if (name1 == name2) ;
[gcc r12-10455] tree-optimization/112495 - alias versioning and address spaces
https://gcc.gnu.org/g:dbb5273996259b04350a1e3d35e633c51fc9310f commit r12-10455-gdbb5273996259b04350a1e3d35e633c51fc9310f Author: Richard Biener Date: Mon Nov 13 10:20:37 2023 +0100 tree-optimization/112495 - alias versioning and address spaces We are not correctly handling differing address spaces in dependence analysis runtime alias check generation so refuse to do that. PR tree-optimization/112495 * tree-data-ref.cc (runtime_alias_check_p): Reject checks between different address spaces. * gcc.target/i386/pr112495.c: New testcase. (cherry picked from commit 0f593c0521caab8cfac53514b1a5e7d0d0dd1932) Diff: --- gcc/testsuite/gcc.target/i386/pr112495.c | 12 gcc/tree-data-ref.cc | 7 +++ 2 files changed, 19 insertions(+) diff --git a/gcc/testsuite/gcc.target/i386/pr112495.c b/gcc/testsuite/gcc.target/i386/pr112495.c new file mode 100644 index ..21afbaa6945d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr112495.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +typedef struct { int v; } T1; +typedef struct { T1 v[32]; } T2; + +T1 s; +T1 f1() { return s; } + +void f2(__seg_gs T2 *p, int n) { + for (int i = 0; i < n; ++i) p->v[i] = f1(); +} diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc index 397792c3584c..0df4a3525f4c 100644 --- a/gcc/tree-data-ref.cc +++ b/gcc/tree-data-ref.cc @@ -1632,6 +1632,13 @@ runtime_alias_check_p (ddr_p ddr, class loop *loop, bool speed_p) "runtime alias check not supported for" " outer loop.\n"); + /* FORNOW: We don't support handling different address spaces. */ + if (TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (DR_BASE_ADDRESS (DDR_A (ddr) + != TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (DR_BASE_ADDRESS (DDR_B (ddr)) +return opt_result::failure_at (DR_STMT (DDR_A (ddr)), + "runtime alias check between different " + "address spaces not supported.\n"); + return opt_result::success (); }
[gcc r12-10453] debug/112718 - reset all type units with -ffat-lto-objects
https://gcc.gnu.org/g:1f41e8eef3da1d76c18fe1a93846054c53dc5a47 commit r12-10453-g1f41e8eef3da1d76c18fe1a93846054c53dc5a47 Author: Richard Biener Date: Mon Jan 22 15:42:59 2024 +0100 debug/112718 - reset all type units with -ffat-lto-objects When mixing -flto, -ffat-lto-objects and -fdebug-type-section we fail to reset all type units after early output resulting in an ICE when attempting to add then duplicate sibling attributes. PR debug/112718 * dwarf2out.cc (dwarf2out_finish): Reset all type units for the fat part of an LTO compile. * gcc.dg/debug/pr112718.c: New testcase. (cherry picked from commit 7218f5050cb7163edae331f54ca163248ab48bfa) Diff: --- gcc/dwarf2out.cc | 12 gcc/testsuite/gcc.dg/debug/pr112718.c | 12 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc index d14ec0261b6b..cfe87cba4c4c 100644 --- a/gcc/dwarf2out.cc +++ b/gcc/dwarf2out.cc @@ -32162,24 +32162,12 @@ dwarf2out_finish (const char *filename) reset_dies (comp_unit_die ()); for (limbo_die_node *node = cu_die_list; node; node = node->next) reset_dies (node->die); - - hash_table comdat_type_table (100); for (ctnode = comdat_type_list; ctnode != NULL; ctnode = ctnode->next) { - comdat_type_node **slot - = comdat_type_table.find_slot (ctnode, INSERT); - - /* Don't reset types twice. */ - if (*slot != HTAB_EMPTY_ENTRY) - continue; - /* Remove the pointer to the line table. */ remove_AT (ctnode->root_die, DW_AT_stmt_list); - if (debug_info_level >= DINFO_LEVEL_TERSE) reset_dies (ctnode->root_die); - - *slot = ctnode; } /* Reset die CU symbol so we don't output it twice. 
*/ diff --git a/gcc/testsuite/gcc.dg/debug/pr112718.c b/gcc/testsuite/gcc.dg/debug/pr112718.c new file mode 100644 index ..ff80ca5a2981 --- /dev/null +++ b/gcc/testsuite/gcc.dg/debug/pr112718.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lto } */ +/* { dg-options "-g -fdebug-types-section -flto -ffat-lto-objects" } */ + +struct { + int h; + unsigned char data[20 + 24 * 6]; +} _EC_X9_62_PRIME_192V2; +struct { + int h; + unsigned char data[20 + 24 * 6]; +} _EC_X9_62_PRIME_192V3;
[gcc r12-10452] tree-optimization/112793 - SLP of constant/external code-generated twice
https://gcc.gnu.org/g:9bad5cf9ae446b367f666176537eb76e94cc4448 commit r12-10452-g9bad5cf9ae446b367f666176537eb76e94cc4448 Author: Richard Biener Date: Wed Dec 13 14:23:31 2023 +0100 tree-optimization/112793 - SLP of constant/external code-generated twice The following makes the attempt at code-generating a constant/external SLP node twice well-formed as that can happen when partitioning BB vectorization attempts where we keep constants/externals unpartitioned. PR tree-optimization/112793 * tree-vect-slp.cc (vect_schedule_slp_node): Already code-generated constant/external nodes are OK. * g++.dg/vect/pr112793.cc: New testcase. (cherry picked from commit d782ec8362eadc3169286eb1e39c631effd02323) Diff: --- gcc/testsuite/g++.dg/vect/pr112793.cc | 32 gcc/tree-vect-slp.cc | 16 +--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/g++.dg/vect/pr112793.cc b/gcc/testsuite/g++.dg/vect/pr112793.cc new file mode 100644 index ..258d7c1b1119 --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/pr112793.cc @@ -0,0 +1,32 @@ +// { dg-do compile } +// { dg-require-effective-target c++11 } +// { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } } + +typedef double T; +T c, s; +T a[16]; +struct Matrix4 { + Matrix4(){} + Matrix4(T e, T f, T i, T j) { +r[1] = r[4] = e; +r[5] = f; +r[8] = i; +r[9] = j; + } + Matrix4 operator*(Matrix4 a) { +return Matrix4( + r[0] * a.r[4] + r[4] + r[15] + r[6], + r[1] * a.r[4] + 1 + 2 + 3, r[0] * r[8] + 1 + 2 + 3, + r[1] * r[8] + r[1] + r[14] + r[2] * r[3]); + } + T r[16] = {}; +}; +Matrix4 t1, t2; +Matrix4 tt; +Matrix4 getRotAltAzToEquatorial() +{ + t2.r[4] = 0; + t1.r[1] = -s; + t1.r[8] = 0; + return t1 * t2; +} diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 26c989cbff9a..54e6a9e4224f 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -7240,12 +7240,6 @@ vect_schedule_slp_node (vec_info *vinfo, int i; slp_tree child; - /* For existing vectors there's nothing to do. 
*/ - if (SLP_TREE_VEC_DEFS (node).exists ()) -return; - - gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ()); - /* Vectorize externals and constants. */ if (SLP_TREE_DEF_TYPE (node) == vect_constant_def || SLP_TREE_DEF_TYPE (node) == vect_external_def) @@ -7256,10 +7250,18 @@ vect_schedule_slp_node (vec_info *vinfo, if (!SLP_TREE_VECTYPE (node)) return; - vect_create_constant_vectors (vinfo, node); + /* There are two reasons vector defs might already exist. The first +is that we are vectorizing an existing vector def. The second is +when performing BB vectorization shared constant/external nodes +are not split apart during partitioning so during the code-gen +DFS walk we can end up visiting them twice. */ + if (! SLP_TREE_VEC_DEFS (node).exists ()) + vect_create_constant_vectors (vinfo, node); return; } + gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ()); + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
[gcc r14-10214] tree-optimization/114998 - use-after-free with loop distribution
https://gcc.gnu.org/g:1e9ae50d4d160f6d557fc4cbbe95c4a36897c09f commit r14-10214-g1e9ae50d4d160f6d557fc4cbbe95c4a36897c09f Author: Richard Biener Date: Fri May 10 14:19:49 2024 +0200 tree-optimization/114998 - use-after-free with loop distribution When loop distribution releases a PHI node of the original IL it can end up clobbering memory that's re-used when it upon releasing its RDG resets all stmt UIDs back to -1, even those that got released. The fix is to avoid resetting UIDs based on stmts in the RDG but instead reset only those still present in the loop. PR tree-optimization/114998 * tree-loop-distribution.cc (free_rdg): Take loop argument. Reset UIDs of stmts still in the IL rather than all stmts referenced from the RDG. (loop_distribution::build_rdg): Pass loop to free_rdg. (loop_distribution::distribute_loop): Likewise. (loop_distribution::transform_reduction_loop): Likewise. * gcc.dg/torture/pr114998.c: New testcase. (cherry picked from commit 34d15a4d630a0d54eddb99bdab086c506e10dac5) Diff: --- gcc/testsuite/gcc.dg/torture/pr114998.c | 35 + gcc/tree-loop-distribution.cc | 24 -- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr114998.c b/gcc/testsuite/gcc.dg/torture/pr114998.c new file mode 100644 index ..81fc1e077cb9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr114998.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-fno-tree-dce -ftree-loop-distribution" } */ + +short a, d; +int b, c, f, g, h, i, j[2], o; +__attribute__((const)) int s(char r); +int main() { + int l, m, k, n; + if (b) { +char p; +for (; p >= 0; p--) { + int e[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, + 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, + 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0}; + if (j[p]) { +int q[1]; +i = o; +o = q[h]; +if (g) + n = d; +m = 4; +for (; m; m--) { + if (l) +k |= c; + if (a) +break; +} + } + s(n); + f |= b; +} + } + 
return 0; +} diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 95203fefa188..45932bae5e7f 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -778,7 +778,7 @@ loop_distribution::stmts_from_loop (class loop *loop, vec *stmts) /* Free the reduced dependence graph RDG. */ static void -free_rdg (struct graph *rdg) +free_rdg (struct graph *rdg, loop_p loop) { int i; @@ -792,13 +792,25 @@ free_rdg (struct graph *rdg) if (v->data) { - gimple_set_uid (RDGV_STMT (v), -1); (RDGV_DATAREFS (v)).release (); free (v->data); } } free_graph (rdg); + + /* Reset UIDs of stmts still in the loop. */ + basic_block *bbs = get_loop_body (loop); + for (unsigned i = 0; i < loop->num_nodes; ++i) +{ + basic_block bb = bbs[i]; + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next ()) + gimple_set_uid (gsi_stmt (gsi), -1); + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next ()) + gimple_set_uid (gsi_stmt (gsi), -1); +} + free (bbs); } struct graph * @@ -812,7 +824,7 @@ loop_distribution::build_rdg (class loop *loop, control_dependences *cd) rdg = new_graph (stmts.length ()); if (!create_rdg_vertices (rdg, stmts, loop)) { - free_rdg (rdg); + free_rdg (rdg, loop); return NULL; } stmts.release (); @@ -3062,7 +3074,7 @@ loop_distribution::distribute_loop (class loop *loop, "Loop %d not distributed: too many memory references.\n", loop->num); - free_rdg (rdg); + free_rdg (rdg, loop); loop_nest.release (); free_data_refs (datarefs_vec); delete ddrs_table; @@ -3259,7 +3271,7 @@ loop_distribution::distribute_loop (class loop *loop, FOR_EACH_VEC_ELT (partitions, i, partition) partition_free (partition); - free_rdg (rdg); + free_rdg (rdg, loop); return nbp - *nb_calls; } @@ -3665,7 +3677,7 @@ loop_distribution::transform_reduction_loop (loop_p loop) auto_bitmap partition_stmts; bitmap_set_range (partition_stmts, 0, rdg->n_vertices); find_single_drs (loop, rdg, partition_stmts, _dr, _dr); - free_rdg 
(rdg); + free_rdg (rdg, loop); /* Bail out if there is no single load. */ if (load_dr == NULL)
[gcc r15-626] middle-end/115110 - Fix view_converted_memref_p
https://gcc.gnu.org/g:a5b3721c06646bf5b9b50a22964e8e2bd4d03f5f commit r15-626-ga5b3721c06646bf5b9b50a22964e8e2bd4d03f5f Author: Richard Biener Date: Fri May 17 11:02:29 2024 +0200 middle-end/115110 - Fix view_converted_memref_p view_converted_memref_p was checking the reference type against the pointer type of the offset operand rather than its pointed-to type which leads to all refs being subject to view-convert treatment in get_alias_set causing numerous testsuite fails but with its new uses from r15-512-g9b7cad5884f21c is also a wrong-code issue. PR middle-end/115110 * tree-ssa-alias.cc (view_converted_memref_p): Fix. Diff: --- gcc/tree-ssa-alias.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index 9f5f69bcfad2..d64d6d02f4a8 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -2077,8 +2077,9 @@ view_converted_memref_p (tree base) { if (TREE_CODE (base) != MEM_REF && TREE_CODE (base) != TARGET_MEM_REF) return false; - return same_type_for_tbaa (TREE_TYPE (base), -TREE_TYPE (TREE_OPERAND (base, 1))) != 1; + return (same_type_for_tbaa (TREE_TYPE (base), + TREE_TYPE (TREE_TYPE (TREE_OPERAND (base, 1 + != 1); } /* Return true if an indirect reference based on *PTR1 constrained
[gcc r15-622] Add missing check for const_pool in the escaped solutions
https://gcc.gnu.org/g:b420e0b920613c42f63252aa2478a8315dc37a13 commit r15-622-gb420e0b920613c42f63252aa2478a8315dc37a13 Author: Richard Biener Date: Fri May 17 09:31:52 2024 +0200 Add missing check for const_pool in the escaped solutions The ptr-vs-ptr compare folding using points-to info was missing a check for const_pool being included in the escaped solution. The following fixes that, fixing the observed execute FAIL of experimental/functional/searchers.cc * tree-ssa-alias.h (pt_solution_includes_const_pool): Declare. * tree-ssa-alias.cc (ptrs_compare_unequal): Use pt_solution_includes_const_pool. * tree-ssa-structalias.cc (pt_solution_includes_const_pool): New. * gcc.dg/torture/20240517-1.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/20240517-1.c | 26 ++ gcc/tree-ssa-alias.cc | 3 ++- gcc/tree-ssa-alias.h | 1 + gcc/tree-ssa-structalias.cc | 11 +++ 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/torture/20240517-1.c b/gcc/testsuite/gcc.dg/torture/20240517-1.c new file mode 100644 index ..ab83d3ca6fba --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/20240517-1.c @@ -0,0 +1,26 @@ +/* { dg-do run } */ +/* { dg-additional-options "-fmerge-all-constants" } */ + +char *p; + +char * __attribute__((noipa)) +foo () { return p+1; } + +volatile int z; + +int main() +{ + /* ESCAPED = CONST_POOL */ + p = "Hello"; + /* PT = ESCAPED */ + char *x = foo (); + char *y; + /* y PT = CONST_POOL */ + if (z) +y = "Baz"; + else +y = "Hello" + 1; + if (y != x) +__builtin_abort (); + return 0; +} diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index 6d31fc836917..9f5f69bcfad2 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -501,7 +501,8 @@ ptrs_compare_unequal (tree ptr1, tree ptr2) || pi2->pt.vars_contains_interposable) return false; if ((!pi1->pt.null || !pi2->pt.null) - && (!pi1->pt.const_pool || !pi2->pt.const_pool)) + && (!pt_solution_includes_const_pool (>pt) + || !pt_solution_includes_const_pool (>pt))) 
return !pt_solutions_intersect (>pt, >pt); } } diff --git a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h index e29dff583750..5cd64e722955 100644 --- a/gcc/tree-ssa-alias.h +++ b/gcc/tree-ssa-alias.h @@ -178,6 +178,7 @@ extern bool pt_solution_empty_p (const pt_solution *); extern bool pt_solution_singleton_or_null_p (struct pt_solution *, unsigned *); extern bool pt_solution_includes_global (struct pt_solution *, bool); extern bool pt_solution_includes (struct pt_solution *, const_tree); +extern bool pt_solution_includes_const_pool (struct pt_solution *); extern bool pt_solutions_intersect (struct pt_solution *, struct pt_solution *); extern void pt_solution_reset (struct pt_solution *); extern void pt_solution_set (struct pt_solution *, bitmap, bool); diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index 0c6085b17662..61fb3610a172 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -7080,6 +7080,17 @@ pt_solution_includes (struct pt_solution *pt, const_tree decl) return res; } +/* Return true if the points-to solution *PT contains a reference to a + constant pool entry. */ + +bool +pt_solution_includes_const_pool (struct pt_solution *pt) +{ + return (pt->const_pool + || (pt->escaped && (!cfun || cfun->gimple_df->escaped.const_pool)) + || (pt->ipa_escaped && ipa_escaped_pt.const_pool)); +} + /* Return true if both points-to solutions PT1 and PT2 have a non-empty intersection. */
[gcc r15-580] tree-optimization/13962 - handle ptr-ptr compares in ptrs_compare_unequal
https://gcc.gnu.org/g:f3e5f4c58591f5dacdd14a65ec47bbe310df02a0 commit r15-580-gf3e5f4c58591f5dacdd14a65ec47bbe310df02a0 Author: Richard Biener Date: Mon Mar 11 11:17:32 2024 +0100 tree-optimization/13962 - handle ptr-ptr compares in ptrs_compare_unequal Now that we handle pt.null conservatively we can implement the missing tracking of constant pool entries (aka STRING_CST) and handle ptr-ptr compares using points-to info in ptrs_compare_unequal. PR tree-optimization/13962 PR tree-optimization/96564 * tree-ssa-alias.h (pt_solution::const_pool): New flag. * tree-ssa-alias.cc (ptrs_compare_unequal): Handle pointer-pointer compares. (dump_points_to_solution): Dump the const_pool flag, fix guard of flag dumping. * gimple-pretty-print.cc (pp_points_to_solution): Likewise. * tree-ssa-structalias.cc (find_what_var_points_to): Set the const_pool flag for STRING. (pt_solution_ior_into): Handle the const_pool flag. (ipa_escaped_pt): Initialize it. * gcc.dg/tree-ssa/alias-39.c: New testcase. * g++.dg/vect/pr68145.cc: Use -fno-tree-pta to avoid UB to manifest in transforms no longer vectorizing this testcase for an ICE. 
Diff: --- gcc/gimple-pretty-print.cc | 5 - gcc/testsuite/g++.dg/vect/pr68145.cc | 2 +- gcc/testsuite/gcc.dg/tree-ssa/alias-39.c | 12 gcc/tree-ssa-alias.cc| 30 ++ gcc/tree-ssa-alias.h | 5 + gcc/tree-ssa-structalias.cc | 6 +++--- 6 files changed, 51 insertions(+), 9 deletions(-) diff --git a/gcc/gimple-pretty-print.cc b/gcc/gimple-pretty-print.cc index abda8871f97f..a71e1e0efc77 100644 --- a/gcc/gimple-pretty-print.cc +++ b/gcc/gimple-pretty-print.cc @@ -822,6 +822,8 @@ pp_points_to_solution (pretty_printer *buffer, const pt_solution *pt) pp_string (buffer, "unit-escaped "); if (pt->null) pp_string (buffer, "null "); + if (pt->const_pool) +pp_string (buffer, "const-pool "); if (pt->vars && !bitmap_empty_p (pt->vars)) { @@ -838,7 +840,8 @@ pp_points_to_solution (pretty_printer *buffer, const pt_solution *pt) if (pt->vars_contains_nonlocal || pt->vars_contains_escaped || pt->vars_contains_escaped_heap - || pt->vars_contains_restrict) + || pt->vars_contains_restrict + || pt->vars_contains_interposable) { const char *comma = ""; pp_string (buffer, " ("); diff --git a/gcc/testsuite/g++.dg/vect/pr68145.cc b/gcc/testsuite/g++.dg/vect/pr68145.cc index 8a1e10ee7833..8d3502b0bf4e 100644 --- a/gcc/testsuite/g++.dg/vect/pr68145.cc +++ b/gcc/testsuite/g++.dg/vect/pr68145.cc @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-Wno-return-type" } */ +/* { dg-additional-options "-fno-tree-pta -Wno-return-type" } */ struct A { bool operator()(int p1, int p2) { return p1 && p2; } diff --git a/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c b/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c new file mode 100644 index ..3b452893f6b1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-forwprop3" } */ + +static int a, b; +int foo (int n, int which) +{ + void *p = __builtin_malloc (n); + void *q = which ? 
: + return p == q; +} + +/* { dg-final { scan-tree-dump "return 0;" "forwprop3" } } */ diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index 96301bbde7fa..6d31fc836917 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -484,9 +484,27 @@ ptrs_compare_unequal (tree ptr1, tree ptr2) } return !pt_solution_includes (>pt, obj1); } - - /* ??? We'd like to handle ptr1 != NULL and ptr1 != ptr2 - but those require pt.null to be conservatively correct. */ + else if (TREE_CODE (ptr1) == SSA_NAME) +{ + struct ptr_info_def *pi1 = SSA_NAME_PTR_INFO (ptr1); + if (!pi1 + || pi1->pt.vars_contains_restrict + || pi1->pt.vars_contains_interposable) + return false; + if (integer_zerop (ptr2) && !pi1->pt.null) + return true; + if (TREE_CODE (ptr2) == SSA_NAME) + { + struct ptr_info_def *pi2 = SSA_NAME_PTR_INFO (ptr2); + if (!pi2 + || pi2->pt.vars_contains_restrict + || pi2->pt.vars_contains_interposable) + return false; + if ((!pi1->pt.null || !pi2->pt.null) + && (!pi1->pt.const_pool || !pi2->pt.const_pool)) + return !pt_solutions_intersect (>pt, >pt); + } +} return false; } @@ -636,6 +654,9 @@ dump_points_to_solution (FILE *file, struct pt_solution *pt) if (pt->null) fprintf (file, ", points-to NULL"); + if (pt->const_pool) +fprintf (file, ", points-to
[gcc r15-579] wrong code with points-to and volatile
https://gcc.gnu.org/g:a9251ab3c91c8c559d0306838575a666ae62dff4 commit r15-579-ga9251ab3c91c8c559d0306838575a666ae62dff4 Author: Richard Biener Date: Thu May 16 12:35:28 2024 +0200 wrong code with points-to and volatile The following fixes points-to analysis which ignores the fact that volatile qualified refs can result in any pointer. * tree-ssa-structalias.cc (get_constraint_for_1): For volatile referenced or decls use ANYTHING. * gcc.dg/tree-ssa/alias-38.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/tree-ssa/alias-38.c | 14 ++ gcc/tree-ssa-structalias.cc | 7 +++ 2 files changed, 21 insertions(+) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c b/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c new file mode 100644 index ..a5c414934735 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +int x; +int y; + +int main () +{ + int *volatile p = + return (p != ); +} + +/* { dg-final { scan-tree-dump " != " "optimized" } } */ +/* { dg-final { scan-tree-dump-not "return 1;" "optimized" } } */ diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc index bb59c6a7c023..0bac1a1f045a 100644 --- a/gcc/tree-ssa-structalias.cc +++ b/gcc/tree-ssa-structalias.cc @@ -3575,6 +3575,10 @@ get_constraint_for_1 (tree t, vec *results, bool address_p, } case tcc_reference: { + if (TREE_THIS_VOLATILE (t)) + /* Fall back to anything. */ + break; + switch (TREE_CODE (t)) { case MEM_REF: @@ -3676,6 +3680,9 @@ get_constraint_for_1 (tree t, vec *results, bool address_p, } case tcc_declaration: { + if (VAR_P (t) && TREE_THIS_VOLATILE (t)) + /* Fall back to anything. */ + break; get_constraint_for_ssa_var (t, results, address_p); return; }
[gcc r12-10450] tree-optimization/114027 - conditional reduction chain
https://gcc.gnu.org/g:6661a7c098a46eff4afc98b55c89e3a71d63d674 commit r12-10450-g6661a7c098a46eff4afc98b55c89e3a71d63d674 Author: Richard Biener Date: Thu Feb 22 10:50:12 2024 +0100 tree-optimization/114027 - conditional reduction chain When we classify a conditional reduction chain as CONST_COND_REDUCTION we fail to verify all involved conditionals have the same constant. That's a quite unlikely situation so the following simply disables such classification when there's more than one reduction statement. PR tree-optimization/114027 * tree-vect-loop.cc (vecctorizable_reduction): Use optimized condition reduction classification only for single-element chains. * gcc.dg/vect/pr114027.c: New testcase. (cherry picked from commit 549f251f055e3a0b0084189a3012c4f15d635e75) Diff: --- gcc/testsuite/gcc.dg/vect/pr114027.c | 26 ++ gcc/tree-vect-loop.cc| 11 ++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr114027.c b/gcc/testsuite/gcc.dg/vect/pr114027.c new file mode 100644 index ..ead9cdd982d7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr114027.c @@ -0,0 +1,26 @@ +#include "tree-vect.h" + +int __attribute__((noipa)) +foo (int *f, int n) +{ + int res = 0; + for (int i = 0; i < n; ++i) +{ + if (f[2*i]) +res = 2; + if (f[2*i+1]) +res = -2; +} + return res; +} + +int f[] = { 1, 1, 1, 1, 1, 1, 1, 1, +1, 1, 1, 1, 1, 1, 1, 0 }; + +int +main () +{ + if (foo (f, 16) != 2) +__builtin_abort (); + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index e1681047d9da..86ee9e449e19 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6850,17 +6850,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo, < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])) vectype_in = vectype_op[i]; - if (op.code == COND_EXPR) + /* Record how the non-reduction-def value of COND_EXPR is defined. +??? For a chain of multiple CONDs we'd have to match them up all. 
*/ + if (op.code == COND_EXPR && reduc_chain_length == 1) { - /* Record how the non-reduction-def value of COND_EXPR is defined. */ if (dt == vect_constant_def) { cond_reduc_dt = dt; cond_reduc_val = op.ops[i]; } - if (dt == vect_induction_def - && def_stmt_info - && is_nonwrapping_integer_induction (def_stmt_info, loop)) + else if (dt == vect_induction_def + && def_stmt_info + && is_nonwrapping_integer_induction (def_stmt_info, loop)) { cond_reduc_dt = dt; cond_stmt_vinfo = def_stmt_info;
[gcc r12-10451] tree-optimization/114027 - fix testcase
https://gcc.gnu.org/g:2d650c041d37a3df2bec46a67e42f9976d7fd2bf commit r12-10451-g2d650c041d37a3df2bec46a67e42f9976d7fd2bf Author: Richard Biener Date: Tue Mar 26 09:46:06 2024 +0100 tree-optimization/114027 - fix testcase The following fixes out-of-bounds read in the testcase. PR tree-optimization/114027 * gcc.dg/vect/pr114027.c: Fix iteration count. (cherry picked from commit 4470611e20f3217ee81647b01fda65b6a62229aa) Diff: --- gcc/testsuite/gcc.dg/vect/pr114027.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr114027.c b/gcc/testsuite/gcc.dg/vect/pr114027.c index ead9cdd982d7..b3f3e30e15fc 100644 --- a/gcc/testsuite/gcc.dg/vect/pr114027.c +++ b/gcc/testsuite/gcc.dg/vect/pr114027.c @@ -20,7 +20,7 @@ int f[] = { 1, 1, 1, 1, 1, 1, 1, 1, int main () { - if (foo (f, 16) != 2) + if (foo (f, 8) != 2) __builtin_abort (); return 0; }
[gcc r12-10448] tree-optimization/114231 - use patterns for BB SLP discovery root stmts
https://gcc.gnu.org/g:a7b1d814da0aa2e7331c56180264a8b786012971 commit r12-10448-ga7b1d814da0aa2e7331c56180264a8b786012971 Author: Richard Biener Date: Tue Mar 5 10:55:56 2024 +0100 tree-optimization/114231 - use patterns for BB SLP discovery root stmts The following makes sure to use recognized patterns when vectorizing roots during BB SLP discovery. We need to apply those late since during root discovery we've not yet done pattern recognition. All parts of the vectorizer assume patterns get used, for the testcase we mix this up when doing live lane computation. PR tree-optimization/114231 * tree-vect-slp.cc (vect_analyze_slp): Lookup patterns when processing a BB SLP root. * gcc.dg/vect/pr114231.c: New testcase. (cherry picked from commit 04fffbaa87997ac893a9aa68b674c938ba3ecddb) Diff: --- gcc/testsuite/gcc.dg/vect/pr114231.c | 12 gcc/tree-vect-slp.cc | 4 2 files changed, 16 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/pr114231.c b/gcc/testsuite/gcc.dg/vect/pr114231.c new file mode 100644 index ..5e3a81039188 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr114231.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +void f(long*); +int ff[2]; +void f2(long, long, unsigned long); +void k(unsigned long x, unsigned long y) +{ + long t = x >> ff[0]; + long t1 = ff[1]; + unsigned long t2 = y >> ff[0]; + f2(t1, t+t2, t2); +} diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 7f9fbb9f3d01..f33e85337abd 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3410,6 +3410,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i) { vect_location = bb_vinfo->roots[i].roots[0]->stmt; + /* Apply patterns. */ + for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j) + bb_vinfo->roots[i].stmts[j] + = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]); if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind, bb_vinfo->roots[i].stmts, bb_vinfo->roots[i].roots,
[gcc r12-10449] tree-optimization/114375 - disallow SLP discovery of permuted mask loads
https://gcc.gnu.org/g:c1b21855e131bb818aedc953f403812b494917fc commit r12-10449-gc1b21855e131bb818aedc953f403812b494917fc Author: Richard Biener Date: Mon Mar 18 12:39:03 2024 +0100 tree-optimization/114375 - disallow SLP discovery of permuted mask loads We cannot currently handle permutations of mask loads in code generation or permute optimization. But we simply drop any permutation on the floor, so the following instead rejects the SLP build rather than producing wrong-code. I've also made sure to reject them in vectorizable_load for completeness. PR tree-optimization/114375 * tree-vect-slp.cc (vect_build_slp_tree_2): Compute the load permutation for masked loads but reject it when any such is necessary. * tree-vect-stmts.cc (vectorizable_load): Reject masked VMAT_ELEMENTWISE and VMAT_STRIDED_SLP as those are not supported. * gcc.dg/vect/vect-pr114375.c: New testcase. (cherry picked from commit 4f2a35a76cca503749c696e7772d2e8eadc77ba5) Diff: --- gcc/testsuite/gcc.dg/vect/vect-pr114375.c | 44 +++ gcc/tree-vect-slp.cc | 34 +++- gcc/tree-vect-stmts.cc| 8 ++ 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c new file mode 100644 index ..1e1cb0123d07 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c @@ -0,0 +1,44 @@ +/* { dg-additional-options "-mavx2" { target avx2_runtime } } */ + +#include "tree-vect.h" + +int a[512]; +int b[512]; +int c[512]; + +void __attribute__((noipa)) +foo(int * __restrict p) +{ + for (int i = 0; i < 64; ++i) +{ + int tem = 2, tem2 = 2; + if (a[4*i + 1]) +tem = p[4*i]; + if (a[4*i]) +tem2 = p[4*i + 2]; + b[2*i] = tem2; + b[2*i+1] = tem; + if (a[4*i + 2]) +tem = p[4*i + 1]; + if (a[4*i + 3]) +tem2 = p[4*i + 3]; + c[2*i] = tem2; + c[2*i+1] = tem; +} +} +int main() +{ + check_vect (); + + for (int i = 0; i < 512; ++i) +a[i] = (i >> 1) & 1; + + foo (a); + + if (c[0] != 1 || c[1] != 0 || c[2] != 1 || c[3] != 0 + || b[0] != 2 || b[1] != 2 
|| b[2] != 2 || b[3] != 2) +abort (); + + return 0; +} + diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index f33e85337abd..26c989cbff9a 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1722,10 +1722,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) { - if (gcall *stmt = dyn_cast (stmt_info->stmt)) - gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) - || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); else { *max_nunits = this_max_nunits; @@ -1741,15 +1739,37 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, load_permutation.create (group_size); stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); + bool any_permute = false; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) { int load_place = vect_get_place_in_interleaving_chain (load_info, first_stmt_info); gcc_assert (load_place != -1); - load_permutation.safe_push (load_place); + any_permute |= load_place != j; + load_permutation.quick_push (load_place); + } + + if (gcall *stmt = dyn_cast (stmt_info->stmt)) + { + gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) + || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); + load_permutation.release (); + /* We cannot handle permuted masked loads, see PR114375. 
*/ + if (any_permute + || (STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SIZE (first_stmt_info) != group_size) + || STMT_VINFO_STRIDED_P (stmt_info)) + { + matches[0] = false; + return NULL; + } + } + else + { + SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; + return node; } - SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; - return node; } } else if (gimple_assign_single_p (stmt_info->stmt) diff --git a/gcc/tree-vect-stmts.cc
[gcc r12-10447] middle-end/114734 - wrong code with expand_call_mem_ref
https://gcc.gnu.org/g:46b2e98983ebc62440c43217f2b3bbe72f8b0191 commit r12-10447-g46b2e98983ebc62440c43217f2b3bbe72f8b0191 Author: Richard Biener Date: Fri Apr 26 15:47:13 2024 +0200 middle-end/114734 - wrong code with expand_call_mem_ref When expand_call_mem_ref looks at the definition of the address argument to eventually expand a _MEM_REF argument together with a masked load it fails to honor constraints imposed by SSA coalescing decisions. The following fixes this. PR middle-end/114734 * internal-fn.cc (expand_call_mem_ref): Use get_gimple_for_ssa_name to get at the def stmt of the address argument to honor SSA coalescing constraints. (cherry picked from commit 20ebcaf826c91ddaf2aac35417ec1e5e6d31ad50) Diff: --- gcc/internal-fn.cc | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 8b1733e20c44..db23f66b0219 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -52,6 +52,8 @@ along with GCC; see the file COPYING3. If not see #include "explow.h" #include "rtl-iter.h" #include "gimple-range.h" +#include "tree-ssa-live.h" +#include "tree-outof-ssa.h" /* For lang_hooks.types.type_for_mode. */ #include "langhooks.h" @@ -2663,8 +2665,8 @@ expand_call_mem_ref (tree type, gcall *stmt, int index) tree tmp = addr; if (TREE_CODE (tmp) == SSA_NAME) { - gimple *def = SSA_NAME_DEF_STMT (tmp); - if (gimple_assign_single_p (def)) + gimple *def = get_gimple_for_ssa_name (tmp); + if (def && gimple_assign_single_p (def)) tmp = gimple_assign_rhs1 (def); }
[gcc r12-10446] lto/114655 - -flto=4 at link time doesn't override -flto=auto at compile time
https://gcc.gnu.org/g:42a0393655d8e4662ba001c3759cf00b639eddb9 commit r12-10446-g42a0393655d8e4662ba001c3759cf00b639eddb9 Author: Richard Biener Date: Tue Apr 9 14:25:57 2024 +0200 lto/114655 - -flto=4 at link time doesn't override -flto=auto at compile time The following adjusts -flto option processing in lto-wrapper to have link-time -flto override any compile time setting. PR lto/114655 * lto-wrapper.cc (merge_flto_options): Add force argument. (merge_and_complain): Do not force here. (run_gcc): But here to make the link-time -flto option override any compile-time one. (cherry picked from commit 32fb04adae90a0ea68e64e8fc3cb04b613b2e9f3) Diff: --- gcc/lto-wrapper.cc | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gcc/lto-wrapper.cc b/gcc/lto-wrapper.cc index 155ccce57ae3..456636bd5dfe 100644 --- a/gcc/lto-wrapper.cc +++ b/gcc/lto-wrapper.cc @@ -193,15 +193,18 @@ find_option (vec , cl_decoded_option *option) return find_option (options, option->opt_index); } -/* Merge -flto FOPTION into vector of DECODED_OPTIONS. */ +/* Merge -flto FOPTION into vector of DECODED_OPTIONS. If FORCE is true + then FOPTION overrides previous settings. */ static void merge_flto_options (vec _options, - cl_decoded_option *foption) + cl_decoded_option *foption, bool force) { int existing_opt = find_option (decoded_options, foption); if (existing_opt == -1) decoded_options.safe_push (*foption); + else if (force) +decoded_options[existing_opt].arg = foption->arg; else { if (strcmp (foption->arg, decoded_options[existing_opt].arg) != 0) @@ -466,7 +469,7 @@ merge_and_complain (vec _options, break; case OPT_flto_: - merge_flto_options (decoded_options, foption); + merge_flto_options (decoded_options, foption, false); break; } } @@ -1540,8 +1543,8 @@ run_gcc (unsigned argc, char *argv[]) break; case OPT_flto_: - /* Merge linker -flto= option with what we have in IL files. 
*/ - merge_flto_options (fdecoded_options, option); + /* Override IL file settings with a linker -flto= option. */ + merge_flto_options (fdecoded_options, option, true); if (strcmp (option->arg, "jobserver") == 0) jobserver_requested = true; break;
[gcc r12-10445] gcov-profile/114715 - missing coverage for switch
https://gcc.gnu.org/g:56415e39029012aa3675d3b4b71fb248cf43049e commit r12-10445-g56415e39029012aa3675d3b4b71fb248cf43049e Author: Richard Biener Date: Mon Apr 15 11:09:17 2024 +0200 gcov-profile/114715 - missing coverage for switch The following avoids missing coverage for the line of a switch statement which happens when gimplification emits a BIND_EXPR wrapping the switch as that prevents us from setting locations on the containing statements via annotate_all_with_location. Instead set the location of the GIMPLE switch directly. PR gcov-profile/114715 * gimplify.cc (gimplify_switch_expr): Set the location of the GIMPLE switch. * gcc.misc-tests/gcov-24.c: New testcase. (cherry picked from commit 5a3cc62dbb45185dd1ca32caec80d57a320ec5a0) Diff: --- gcc/gimplify.cc| 1 + gcc/testsuite/gcc.misc-tests/gcov-24.c | 30 ++ 2 files changed, 31 insertions(+) diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc index 947fe570e1e7..9e7869770183 100644 --- a/gcc/gimplify.cc +++ b/gcc/gimplify.cc @@ -2764,6 +2764,7 @@ gimplify_switch_expr (tree *expr_p, gimple_seq *pre_p) switch_stmt = gimple_build_switch (SWITCH_COND (switch_expr), default_case, labels); + gimple_set_location (switch_stmt, EXPR_LOCATION (switch_expr)); /* For the benefit of -Wimplicit-fallthrough, if switch_body_seq ends with a GIMPLE_LABEL holding SWITCH_BREAK_LABEL_P LABEL_DECL, wrap the GIMPLE_SWITCH up to that GIMPLE_LABEL into a GIMPLE_BIND, diff --git a/gcc/testsuite/gcc.misc-tests/gcov-24.c b/gcc/testsuite/gcc.misc-tests/gcov-24.c new file mode 100644 index ..395099bd7ae3 --- /dev/null +++ b/gcc/testsuite/gcc.misc-tests/gcov-24.c @@ -0,0 +1,30 @@ +/* { dg-options "-fprofile-arcs -ftest-coverage" } */ +/* { dg-do run { target native } } */ + +int main() +{ + int a = 1; + int b = 2; + int c = -3; + switch(a) /* count(1) */ +{ +case 1: /* count(1) */ +c = 3; +switch(b) { /* count(1) */ + case 1: /* count(#) */ + c = 4; + break; + case 2: /* count(1) */ + c = 5; + break; +} +break; +case 2: /* count(#) */ +c = 6; 
+break; +default: /* count(#) */ +break; +} +} + +/* { dg-final { run-gcov gcov-24.c } } */
[gcc r15-571] tree-optimization/79958 - make DSE track multiple paths
https://gcc.gnu.org/g:1e0ae1f52741f7e0133661659ed2d210f939a398 commit r15-571-g1e0ae1f52741f7e0133661659ed2d210f939a398 Author: Richard Biener Date: Wed May 15 18:32:37 2024 +0200 tree-optimization/79958 - make DSE track multiple paths DSE currently gives up when the path we analyze forks. This leads to multiple missed dead store elimination PRs. The following fixes this by recursing for each path and maintaining the visited bitmap to avoid visiting CFG re-merges multiple times. The overall cost is still limited by the same bound, it's just more likely we'll hit the limit now. The patch doesn't try to deal with byte tracking once a path forks but drops info on the floor and only handling fully dead stores in that case. PR tree-optimization/79958 PR tree-optimization/109087 PR tree-optimization/100314 PR tree-optimization/114774 * tree-ssa-dse.cc (dse_classify_store): New forwarder. (dse_classify_store): Add arguments cnt and visited, recurse to track multiple paths when we end up with multiple defs. * gcc.dg/tree-ssa/ssa-dse-48.c: New testcase. * gcc.dg/tree-ssa/ssa-dse-49.c: Likewise. * gcc.dg/tree-ssa/ssa-dse-50.c: Likewise. * gcc.dg/tree-ssa/ssa-dse-51.c: Likewise. * gcc.dg/graphite/pr80906.c: Avoid DSE of last data reference in loop. * g++.dg/ipa/devirt-24.C: Adjust for extra DSE. * g++.dg/warn/Wuninitialized-pr107919-1.C: Use more important -O2 optimization level, -O1 regresses. 
Diff: --- gcc/testsuite/g++.dg/ipa/devirt-24.C | 4 ++- .../g++.dg/warn/Wuninitialized-pr107919-1.C| 2 +- gcc/testsuite/gcc.dg/graphite/pr80906.c| 2 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c | 17 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c | 18 + gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-50.c | 25 + gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-51.c | 24 + gcc/tree-ssa-dse.cc| 31 +++--- 8 files changed, 116 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/g++.dg/ipa/devirt-24.C b/gcc/testsuite/g++.dg/ipa/devirt-24.C index 7b5b806dd05f..333c03cd8dd7 100644 --- a/gcc/testsuite/g++.dg/ipa/devirt-24.C +++ b/gcc/testsuite/g++.dg/ipa/devirt-24.C @@ -37,4 +37,6 @@ C *b = new (C); } } /* { dg-final { scan-ipa-dump-times "Discovered a virtual call to a known target" 1 "inline" { xfail *-*-* } } } */ -/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 2 "cp" } } */ +/* We used to have IPA CP see two aggregates passed to sort() but as the + first argument is unused DSE now elides the vptr initialization. 
*/ +/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 1 "cp" } } */ diff --git a/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C b/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C index dd631dc8bfe7..067a44a462e1 100644 --- a/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C +++ b/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C @@ -1,6 +1,6 @@ // { dg-do compile } // { dg-require-effective-target c++17 } -// { dg-options "-O -Wuninitialized" } +// { dg-options "-O2 -Wuninitialized" } #include #include diff --git a/gcc/testsuite/gcc.dg/graphite/pr80906.c b/gcc/testsuite/gcc.dg/graphite/pr80906.c index 59c7f59cadff..ec3840834fc4 100644 --- a/gcc/testsuite/gcc.dg/graphite/pr80906.c +++ b/gcc/testsuite/gcc.dg/graphite/pr80906.c @@ -18,7 +18,7 @@ ec (int lh[][2]) --bm; if (bm != 0) --c5; - lh[0][0] = 0; + lh[hp][0] = 0; m3 *= jv; } diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c new file mode 100644 index ..edfc62c7e4ab --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-dse1-details" } */ + +int a; +int foo (void); +int bar (void); + +void +baz (void) +{ + int *b[6]; + b[0] = &a; + if (foo ()) +a |= bar (); +} + +/* { dg-final { scan-tree-dump "Deleted dead store: b\\\[0\\\] = &a;" "dse1" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c new file mode 100644 index ..1eec284a4159 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fno-tree-dce -fdump-tree-dse1-details" } */ + +struct X { int i; }; +void bar (); +void foo (int b) +{ + struct X x; + x.i = 1; + if (b) +{ + bar (); + __builtin_abort (); +} + bar (); +} + +/* { dg-final { scan-tree-dump "Deleted dead store: x.i = 1;" "dse1" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-50.c
[gcc r15-518] tree-optimization/114589 - remove profile based sink heuristics
https://gcc.gnu.org/g:99b1daae18c095d6c94d32efb77442838e11cbfb commit r15-518-g99b1daae18c095d6c94d32efb77442838e11cbfb Author: Richard Biener Date: Fri May 3 14:04:41 2024 +0200 tree-optimization/114589 - remove profile based sink heuristics The following removes the profile based heuristic limiting sinking and instead uses post-dominators to avoid sinking to places that are executed under the same conditions as the earlier location which the profile based heuristic should have guaranteed as well. To avoid regressing this moves the empty-latch check to cover all sink cases. It also stream-lines the resulting select_best_block a bit but avoids adjusting heuristics more with this change. gfortran.dg/streamio_9.f90 starts execute failing with this on x86_64 with -m32 because the (float)i * 9....e-7 compute is sunk across a STOP causing it to be no longer spilled and thus the compare failing due to excess precision. The patch adds -ffloat-store to avoid this, following other similar testcases. This change fixes the testcase in the PR only when using -fno-ivopts as otherwise VRP is confused. PR tree-optimization/114589 * tree-ssa-sink.cc (select_best_block): Remove profile-based heuristics. Instead reject sink locations that sink to post-dominators. Move empty latch check here from statement_sink_location. Also consider early_bb for the loop depth check. (statement_sink_location): Remove superfluous check. Remove empty latch check. (pass_sink_code::execute): Compute/release post-dominators. * gfortran.dg/streamio_9.f90: Use -ffloat-store to avoid excess precision when not spilling. * g++.dg/tree-ssa/pr114589.C: New testcase. 
Diff: --- gcc/testsuite/g++.dg/tree-ssa/pr114589.C | 22 gcc/testsuite/gfortran.dg/streamio_9.f90 | 1 + gcc/tree-ssa-sink.cc | 62 ++-- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr114589.C b/gcc/testsuite/g++.dg/tree-ssa/pr114589.C new file mode 100644 index ..85bb6d03015b --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/pr114589.C @@ -0,0 +1,22 @@ +// { dg-do compile { target c++11 } } +// { dg-options "-O2 -fno-ivopts -fdump-tree-optimized" } + +template <typename T> +struct simple_optional { +bool has_val; +T val; + +auto begin() const -> T const* { return &val; } +auto end() const -> T const* { return &val + (has_val ? 1 : 0); } +}; + +void f(int); + +void call_f(simple_optional<int> const& o) { +for (int i : o) { +f(i); +} +} + +// Only a conditional execution of 'f' should prevail, no loop +// { dg-final { scan-tree-dump-times ". ! Test case derived from that given in PR by Steve Kargl. diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc index 2f90acb7ef48..2188b7523c7b 100644 --- a/gcc/tree-ssa-sink.cc +++ b/gcc/tree-ssa-sink.cc @@ -178,15 +178,7 @@ nearest_common_dominator_of_uses (def_operand_p def_p, bool *debug_stmts) We want the most control dependent block in the shallowest loop nest. - If the resulting block is in a shallower loop nest, then use it. Else - only use the resulting block if it has significantly lower execution - frequency than EARLY_BB to avoid gratuitous statement movement. We - consider statements with VOPS more desirable to move. - - This pass would obviously benefit from PDO as it utilizes block - frequencies. It would also benefit from recomputing frequencies - if profile data is not available since frequencies often get out - of sync with reality. */ + If the resulting block is in a shallower loop nest, then use it. 
*/ static basic_block select_best_block (basic_block early_bb, @@ -195,18 +187,17 @@ select_best_block (basic_block early_bb, { basic_block best_bb = late_bb; basic_block temp_bb = late_bb; - int threshold; while (temp_bb != early_bb) { + /* Walk up the dominator tree, hopefully we'll find a shallower +loop nest. */ + temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb); + /* If we've moved into a lower loop nest, then that becomes our best block. */ if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb)) best_bb = temp_bb; - - /* Walk up the dominator tree, hopefully we'll find a shallower -loop nest. */ - temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb); } /* Placing a statement before a setjmp-like function would be invalid @@ -221,6 +212,16 @@ select_best_block (basic_block early_bb, if (bb_loop_depth (best_bb) < bb_loop_depth (early_bb)) return best_bb; + /* Do not move stmts to post-dominating places on the same loop depth. */ + if (dominated_by_p (CDI_POST_DOMINATORS, early_bb,
[gcc r15-517] middle-end/111422 - wrong stack var coalescing, handle PHIs
https://gcc.gnu.org/g:ab25eef36400e8c1d28e3ed059c5f95a38b45f17 commit r15-517-gab25eef36400e8c1d28e3ed059c5f95a38b45f17 Author: Richard Biener Date: Wed May 15 13:06:30 2024 +0200 middle-end/111422 - wrong stack var coalescing, handle PHIs The gcc.c-torture/execute/pr111422.c testcase after installing the sink pass improvement reveals that we also need to handle _65 = + _58; _44 = + _43; # _59 = PHI <_65, _44> *_59 = 8; g = {v} {CLOBBER(eos)}; ... n[0] = *_59 = 8; g = {v} {CLOBBER(eos)}; where we fail to see the conflict between n and g after the first clobber of g. Before the sinking improvement there was a conflict recorded on a path where _65/_44 are unused, so the real conflict was missed but the fake one avoided the miscompile. The following handles PHI defs in add_scope_conflicts_2 which fixes the issue. PR middle-end/111422 * cfgexpand.cc (add_scope_conflicts_2): Handle PHIs by recursing to their arguments. Diff: --- gcc/cfgexpand.cc | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc index 557cb28733bd..8de5f2ba58b7 100644 --- a/gcc/cfgexpand.cc +++ b/gcc/cfgexpand.cc @@ -584,10 +584,21 @@ add_scope_conflicts_2 (tree use, bitmap work, || INTEGRAL_TYPE_P (TREE_TYPE (use { gimple *g = SSA_NAME_DEF_STMT (use); - if (is_gimple_assign (g)) - if (tree op = gimple_assign_rhs1 (g)) - if (TREE_CODE (op) == ADDR_EXPR) - visit (g, TREE_OPERAND (op, 0), op, work); + if (gassign *a = dyn_cast (g)) + { + if (tree op = gimple_assign_rhs1 (a)) + if (TREE_CODE (op) == ADDR_EXPR) + visit (a, TREE_OPERAND (op, 0), op, work); + } + else if (gphi *p = dyn_cast (g)) + for (unsigned i = 0; i < gimple_phi_num_args (p); ++i) + if (TREE_CODE (use = gimple_phi_arg_def (p, i)) == SSA_NAME) + if (gassign *a = dyn_cast (SSA_NAME_DEF_STMT (use))) + { + if (tree op = gimple_assign_rhs1 (a)) + if (TREE_CODE (op) == ADDR_EXPR) + visit (a, TREE_OPERAND (op, 0), op, work); + } } }
[gcc r14-10211] middle-end/114931 - type_hash_canon and structual equality types
https://gcc.gnu.org/g:1d89cb43943e77d0bbb48fd5a58a352bdd3d82c7 commit r14-10211-g1d89cb43943e77d0bbb48fd5a58a352bdd3d82c7 Author: Richard Biener Date: Fri May 3 10:44:50 2024 +0200 middle-end/114931 - type_hash_canon and structual equality types TYPE_STRUCTURAL_EQUALITY_P is part of our type system so we have to make sure to include that into the type unification done via type_hash_canon. This requires the flag to be set before querying the hash which is the biggest part of the patch. PR middle-end/114931 gcc/ * tree.cc (type_hash_canon_hash): Hash TYPE_STRUCTURAL_EQUALITY_P. (type_cache_hasher::equal): Compare TYPE_STRUCTURAL_EQUALITY_P. (build_array_type_1): Set TYPE_STRUCTURAL_EQUALITY_P before probing with type_hash_canon. (build_function_type): Likewise. (build_method_type_directly): Likewise. (build_offset_type): Likewise. (build_complex_type): Likewise. * attribs.cc (build_type_attribute_qual_variant): Likewise. gcc/c-family/ * c-common.cc (complete_array_type): Set TYPE_STRUCTURAL_EQUALITY_P before probing with type_hash_canon. gcc/testsuite/ * gcc.dg/pr114931.c: New testcase. (cherry picked from commit b09c2e9560648b0cf993c2ca9ad972c34e6bddfa) Diff: --- gcc/attribs.cc | 20 ++--- gcc/c-family/c-common.cc| 11 +-- gcc/testsuite/gcc.dg/pr114931.c | 10 +++ gcc/tree.cc | 65 + 4 files changed, 74 insertions(+), 32 deletions(-) diff --git a/gcc/attribs.cc b/gcc/attribs.cc index 12ffc5f170a1..3ab0b0fd87a4 100644 --- a/gcc/attribs.cc +++ b/gcc/attribs.cc @@ -1336,6 +1336,16 @@ build_type_attribute_qual_variant (tree otype, tree attribute, int quals) tree dtype = ntype = build_distinct_type_copy (ttype); TYPE_ATTRIBUTES (ntype) = attribute; + /* If the target-dependent attributes make NTYPE different from +its canonical type, we will need to use structural equality +checks for this type. + +We shouldn't get here for stripping attributes from a type; +the no-attribute type might not need structural comparison. But +we can if was discarded from type_hash_table. 
*/ + if (TYPE_STRUCTURAL_EQUALITY_P (ttype) + || !comp_type_attributes (ntype, ttype)) + SET_TYPE_STRUCTURAL_EQUALITY (ntype); hashval_t hash = type_hash_canon_hash (ntype); ntype = type_hash_canon (hash, ntype); @@ -1343,16 +1353,6 @@ build_type_attribute_qual_variant (tree otype, tree attribute, int quals) if (ntype != dtype) /* This variant was already in the hash table, don't mess with TYPE_CANONICAL. */; - else if (TYPE_STRUCTURAL_EQUALITY_P (ttype) - || !comp_type_attributes (ntype, ttype)) - /* If the target-dependent attributes make NTYPE different from - its canonical type, we will need to use structural equality - checks for this type. - - We shouldn't get here for stripping attributes from a type; - the no-attribute type might not need structural comparison. But - we can if was discarded from type_hash_table. */ - SET_TYPE_STRUCTURAL_EQUALITY (ntype); else if (TYPE_CANONICAL (ntype) == ntype) TYPE_CANONICAL (ntype) = TYPE_CANONICAL (ttype); diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc index d14591c7bd3b..aae998d0f738 100644 --- a/gcc/c-family/c-common.cc +++ b/gcc/c-family/c-common.cc @@ -7115,6 +7115,13 @@ complete_array_type (tree *ptype, tree initial_value, bool do_default) TYPE_TYPELESS_STORAGE (main_type) = TYPE_TYPELESS_STORAGE (type); layout_type (main_type); + /* Set TYPE_STRUCTURAL_EQUALITY_P early. */ + if (TYPE_STRUCTURAL_EQUALITY_P (TREE_TYPE (main_type)) + || TYPE_STRUCTURAL_EQUALITY_P (TYPE_DOMAIN (main_type))) +SET_TYPE_STRUCTURAL_EQUALITY (main_type); + else +TYPE_CANONICAL (main_type) = main_type; + /* Make sure we have the canonical MAIN_TYPE. */ hashval_t hashcode = type_hash_canon_hash (main_type); main_type = type_hash_canon (hashcode, main_type); @@ -7122,7 +7129,7 @@ complete_array_type (tree *ptype, tree initial_value, bool do_default) /* Fix the canonical type. 
*/ if (TYPE_STRUCTURAL_EQUALITY_P (TREE_TYPE (main_type)) || TYPE_STRUCTURAL_EQUALITY_P (TYPE_DOMAIN (main_type))) -SET_TYPE_STRUCTURAL_EQUALITY (main_type); +gcc_assert (TYPE_STRUCTURAL_EQUALITY_P (main_type)); else if (TYPE_CANONICAL (TREE_TYPE (main_type)) != TREE_TYPE (main_type) || (TYPE_CANONICAL (TYPE_DOMAIN (main_type)) != TYPE_DOMAIN (main_type))) @@ -7130,8 +7137,6 @@ complete_array_type (tree *ptype, tree initial_value, bool do_default) = build_array_type (TYPE_CANONICAL (TREE_TYPE (main_type)),
[gcc r14-10210] Avoid changing type in the type_hash_canon hash
https://gcc.gnu.org/g:573e1df0ec8428e564c97af7c237a5e0c98c59bd commit r14-10210-g573e1df0ec8428e564c97af7c237a5e0c98c59bd Author: Richard Biener Date: Fri May 3 11:48:07 2024 +0200 Avoid changing type in the type_hash_canon hash When building a type and type_hash_canon returns an existing type avoid changing it, in particular its TYPE_CANONICAL. PR middle-end/114931 * tree.cc (build_array_type_1): Return early when type_hash_canon returned an older existing type. (build_function_type): Likewise. (build_method_type_directly): Likewise. (build_offset_type): Likewise. (cherry picked from commit 7a212ac678e13e0df5da2d090144b246a1262b64) Diff: --- gcc/tree.cc | 12 1 file changed, 12 insertions(+) diff --git a/gcc/tree.cc b/gcc/tree.cc index 83f3bf306afa..780662549fea 100644 --- a/gcc/tree.cc +++ b/gcc/tree.cc @@ -7352,7 +7352,10 @@ build_array_type_1 (tree elt_type, tree index_type, bool typeless_storage, if (shared) { hashval_t hash = type_hash_canon_hash (t); + tree probe_type = t; t = type_hash_canon (hash, t); + if (t != probe_type) + return t; } if (TYPE_CANONICAL (t) == t && set_canonical) @@ -7509,7 +7512,10 @@ build_function_type (tree value_type, tree arg_types, /* If we already have such a type, use the old one. */ hashval_t hash = type_hash_canon_hash (t); + tree probe_type = t; t = type_hash_canon (hash, t); + if (t != probe_type) +return t; /* Set up the canonical type. */ any_structural_p = TYPE_STRUCTURAL_EQUALITY_P (value_type); @@ -7663,7 +7669,10 @@ build_method_type_directly (tree basetype, /* If we already have such a type, use the old one. */ hashval_t hash = type_hash_canon_hash (t); + tree probe_type = t; t = type_hash_canon (hash, t); + if (t != probe_type) +return t; /* Set up the canonical type. */ any_structural_p @@ -7720,7 +7729,10 @@ build_offset_type (tree basetype, tree type) /* If we already have such a type, use the old one. 
*/ hashval_t hash = type_hash_canon_hash (t); + tree probe_type = t; t = type_hash_canon (hash, t); + if (t != probe_type) +return t; if (!COMPLETE_TYPE_P (t)) layout_type (t);
[gcc r15-491] tree-optimization/99954 - redo loop distribution memcpy recognition fix
https://gcc.gnu.org/g:c290e6a0b7a9de5692963affc6627a4af7dc2411 commit r15-491-gc290e6a0b7a9de5692963affc6627a4af7dc2411 Author: Richard Biener Date: Tue May 14 11:13:51 2024 +0200 tree-optimization/99954 - redo loop distribution memcpy recognition fix The following revisits the fix for PR99954 which was observed as causing missed memcpy recognition and instead using memmove for non-aliasing copies. While the original fix mitigated bogus recognition of memcpy the root cause was not properly identified. The root cause is dr_analyze_indices "failing" to handle union references and leaving the DRs indices in a state that's not correctly handled by dr_may_alias. The following mitigates this there appropriately, restoring memcpy recognition for non-aliasing copies. This makes us run into a latent issue in ptr_deref_may_alias_decl_p when the pointer is something like [0].a in which case we fail to handle non-SSA name pointers. Add code similar to what we have in ptr_derefs_may_alias_p. PR tree-optimization/99954 * tree-data-ref.cc (dr_may_alias_p): For bases that are not completely analyzed fall back to TBAA and points-to. * tree-loop-distribution.cc (loop_distribution::classify_builtin_ldst): When there is no dependence again classify as memcpy. * tree-ssa-alias.cc (ptr_deref_may_alias_decl_p): Verify the pointer is an SSA name. * gcc.dg/tree-ssa/ldist-40.c: New testcase. 
Diff: --- gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c | 10 ++ gcc/tree-data-ref.cc | 22 ++ gcc/tree-loop-distribution.cc| 4 ++-- gcc/tree-ssa-alias.cc| 5 + 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c new file mode 100644 index ..238a0098352a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-ldist-details" } */ + +void copy_a_to_b (char * __restrict b, char * a, int n) +{ + for (int i = 0; i < n; ++i) +b[i] = a[i]; +} + +/* { dg-final { scan-tree-dump "generated memcpy" "ldist" } } */ diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc index f37734b53409..db15ddb43ded 100644 --- a/gcc/tree-data-ref.cc +++ b/gcc/tree-data-ref.cc @@ -3066,6 +3066,28 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b, return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a), TREE_OPERAND (addr_b, 0)); } + /* If dr_analyze_innermost failed to handle a component we are + possibly left with a non-base in which case we didn't analyze + a possible evolution of the base when analyzing a loop. */ + else if (loop_nest + && (handled_component_p (addr_a) || handled_component_p (addr_b))) +{ + /* For true dependences we can apply TBAA. 
*/ + if (flag_strict_aliasing + && DR_IS_WRITE (a) && DR_IS_READ (b) + && !alias_sets_conflict_p (get_alias_set (DR_REF (a)), +get_alias_set (DR_REF (b + return false; + if (TREE_CODE (addr_a) == MEM_REF) + return ptr_derefs_may_alias_p (TREE_OPERAND (addr_a, 0), + build_fold_addr_expr (addr_b)); + else if (TREE_CODE (addr_b) == MEM_REF) + return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a), + TREE_OPERAND (addr_b, 0)); + else + return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a), + build_fold_addr_expr (addr_b)); +} /* Otherwise DR_BASE_OBJECT is an access that covers the whole object that is being subsetted in the loop nest. */ diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 45932bae5e7f..668dc4204490 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -1840,11 +1840,11 @@ loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg, /* Now check that if there is a dependence. */ ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr); - /* Classify as memmove if no dependence between load and store. */ + /* Classify as memcpy if no dependence between load and store. */ if (DDR_ARE_DEPENDENT (ddr) == chrec_known) { partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); - partition->kind = PKIND_MEMMOVE; + partition->kind = PKIND_MEMCPY; return; } diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index e7c1c1aa6243..374ba04e6fd0 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -294,6 +294,11 @@ ptr_deref_may_alias_decl_p (tree ptr,
[gcc(refs/users/rguenth/heads/vect-force-slp)] Improve combined store node splitting
https://gcc.gnu.org/g:4336060fe2db8ec41c0f108034a4ae8de89e5fa1 commit 4336060fe2db8ec41c0f108034a4ae8de89e5fa1 Author: Richard Biener Date: Wed Mar 20 14:55:08 2024 +0100 Improve combined store node splitting The following improves on the initial "Avoid splitting store dataref groups during SLP discovery" change, in particular on how we deal with the multi-input VEC_PERM node combining back the SLP instances into the single node for the whole group store. Instead of combining the last two inputs recursively this more carefully selects nodes to combine (but still recursively), combining the first two nodes with the least number of inputs. That should avoid the need for three-input permutes consistently. * tree-vect-slp.cc (vect_build_slp_instance): Split merge permute node in a better manner. Diff: --- gcc/tree-vect-slp.cc | 66 +--- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index f3743997e9cd..7e6ff07db0ff 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3654,18 +3654,45 @@ vect_build_slp_instance (vec_info *vinfo, } /* ??? Now we have a single permute node but when that's -fed more than two inputs it's prone to hit the limitation +fed more than two inputs it's prone to hit the limitation on at most two sources for a VEC_PERM_EXPR. Ideally we'd defer the following to the optimize-slp pass but for now split it here. -??? Optimally we'd produce permute nodes feeding in -the same number of lanes from each input and also have -the same vector type (only the width will eventually -differ here), for now just do "something". */ +For now perform pairwise reduction, reducing the two inputs +with the least number of lanes to one and then repeat until +we end up with two inputs. 
*/ while (SLP_TREE_CHILDREN (perm).length () > 2) { - slp_tree b = SLP_TREE_CHILDREN (perm).pop (); - slp_tree a = SLP_TREE_CHILDREN (perm).pop (); + /* Pick the two nodes with the least number of lanes, +prefer the earliest candidate and maintain ai < bi. */ + int ai = -1; + int bi = -1; + for (unsigned ci = 0; + ci < SLP_TREE_CHILDREN (perm).length (); ++ci) + { + if (ai == -1) + ai = ci; + else if (bi == -1) + bi = ci; + else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci]) + < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])) + || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci]) + < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))) + { + if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]) + <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])) + bi = ci; + else + { + ai = bi; + bi = ci; + } + } + } + + /* Produce a merge of nodes ai and bi. */ + slp_tree a = SLP_TREE_CHILDREN (perm)[ai]; + slp_tree b = SLP_TREE_CHILDREN (perm)[bi]; unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b); slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR); SLP_TREE_LANES (permab) = n; @@ -3682,12 +3709,25 @@ vect_build_slp_instance (vec_info *vinfo, for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k) SLP_TREE_LANE_PERMUTATION (permab) .quick_push (std::make_pair (1, k)); - /* ??? Popluate SLP_TREE_SCALAR_STMTS/OPS of permab. */ - SLP_TREE_CHILDREN (perm).quick_push (permab); - for (unsigned k = group_size - n; k < group_size; ++k) - SLP_TREE_LANE_PERMUTATION (perm)[k] - = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1, - k - (group_size - n)); + + /* Put the merged node into 'perm', in place of a */ + SLP_TREE_CHILDREN (perm)[ai] = permab; + /* Adjust the references to b in the permutation +of perm and to the later children which we'll +remove. */ + for (unsigned k = 0; k < SLP_TREE_LANES
[gcc(refs/users/rguenth/heads/vect-force-slp)] Add single-lane SLP support to .GOMP_SIMD_LANE vectorization
https://gcc.gnu.org/g:ccbe801c08914ba136bba9cdbbcf0146dd261204 commit ccbe801c08914ba136bba9cdbbcf0146dd261204 Author: Richard Biener Date: Wed Mar 13 14:13:00 2024 +0100 Add single-lane SLP support to .GOMP_SIMD_LANE vectorization The following adds support for single-lane SLP .GOMP_SIMD_LANE vectorization. * tree-vect-slp.cc (no_arg_map): New. (vect_get_operand_map): Handle IFN_GOMP_SIMD_LANE. (vect_build_slp_tree_1): Likewise. * tree-vect-stmts.cc (vectorizable_call): Handle single-lane SLP for .GOMP_SIMD_LANE calls. Diff: --- gcc/tree-vect-slp.cc | 11 +++ gcc/tree-vect-stmts.cc | 27 +++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 3138a815da7a..f3743997e9cd 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -505,6 +505,7 @@ static const int cond_expr_maps[3][5] = { { 4, -2, -1, 1, 2 }, { 4, -1, -2, 2, 1 } }; +static const int no_arg_map[] = { 0 }; static const int arg0_map[] = { 1, 0 }; static const int arg1_map[] = { 1, 1 }; static const int arg2_map[] = { 1, 2 }; @@ -585,6 +586,9 @@ vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false, case IFN_CTZ: return arg0_map; + case IFN_GOMP_SIMD_LANE: + return no_arg_map; + default: break; } @@ -1168,6 +1172,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, ldst_p = true; rhs_code = CFN_MASK_STORE; } + else if (cfn == CFN_GOMP_SIMD_LANE) + ; else if ((cfn != CFN_LAST && cfn != CFN_MASK_CALL && internal_fn_p (cfn) @@ -1271,6 +1277,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, need_same_oprnds = true; first_op1 = gimple_call_arg (call_stmt, 1); } + else if (rhs_code == CFN_GOMP_SIMD_LANE) + { + need_same_oprnds = true; + first_op1 = gimple_call_arg (call_stmt, 1); + } } else { diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 840ff8a3406a..270c5a5dd347 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -3341,7 +3341,7 @@ vectorizable_call (vec_info 
*vinfo, if (ifn == IFN_LAST && !fndecl) { if (cfn == CFN_GOMP_SIMD_LANE - && !slp_node + && (!slp_node || SLP_TREE_LANES (slp_node) == 1) && loop_vinfo && LOOP_VINFO_LOOP (loop_vinfo)->simduid && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME @@ -3487,18 +3487,15 @@ vectorizable_call (vec_info *vinfo, /* Build argument list for the vectorized call. */ if (slp_node) { - vec vec_oprnds0; - + unsigned int vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); vect_get_slp_defs (vinfo, slp_node, _defs); - vec_oprnds0 = vec_defs[0]; /* Arguments are ready. Create the new vector stmt. */ - FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0) + for (i = 0; i < vec_num; ++i) { int varg = 0; if (masked_loop_p && reduc_idx >= 0) { - unsigned int vec_num = vec_oprnds0.length (); /* Always true for SLP. */ gcc_assert (ncopies == 1); vargs[varg++] = vect_get_loop_mask (loop_vinfo, @@ -3539,11 +3536,26 @@ vectorizable_call (vec_info *vinfo, vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); } + else if (cfn == CFN_GOMP_SIMD_LANE) + { + /* ??? For multi-lane SLP we'd need to build +{ 0, 0, .., 1, 1, ... }. */ + tree cst = build_index_vector (vectype_out, +i * nunits_out, 1); + tree new_var + = vect_get_new_ssa_name (vectype_out, vect_simple_var, +"cst_"); + gimple *init_stmt = gimple_build_assign (new_var, cst); + vect_init_vector_1 (vinfo, stmt_info, init_stmt, NULL); + new_temp = make_ssa_name (vec_dest); + new_stmt = gimple_build_assign (new_temp, new_var); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, + gsi); + } else { if (len_opno >= 0 && len_loop_p) { - unsigned int vec_num =
[gcc(refs/users/rguenth/heads/vect-force-slp)] Handle unused-only-live stmts in SLP discovery
https://gcc.gnu.org/g:d0b1eaaf0ba4b9e6fd2b18ef597bde3eb7bd018b commit d0b1eaaf0ba4b9e6fd2b18ef597bde3eb7bd018b Author: Richard Biener Date: Thu Mar 7 15:13:33 2024 +0100 Handle unused-only-live stmts in SLP discovery The following adds SLP discovery for roots that are only live but otherwise unused. * tree-vect-slp.cc (vect_analyze_slp): Analyze SLP for live but otherwise unused defs. Diff: --- gcc/tree-vect-slp.cc | 34 ++ 1 file changed, 34 insertions(+) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 6cc544057115..3138a815da7a 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3985,6 +3985,40 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) saved_stmts.release (); } } + + if (param_vect_single_lane_slp != 0) + { + /* Make sure to vectorize only-live stmts, usually inductions. */ + for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo))) + for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); +gsi_next ()) + { + gphi *lc_phi = *gsi; + tree def = gimple_phi_arg_def_from_edge (lc_phi, e); + stmt_vec_info stmt_info; + if (TREE_CODE (def) == SSA_NAME + && !virtual_operand_p (def) + && (stmt_info = loop_vinfo->lookup_def (def)) + && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live + && STMT_VINFO_LIVE_P (stmt_info) + && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def + || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def + && STMT_VINFO_REDUC_IDX (stmt_info) == -1))) + { + vec stmts; + vec roots = vNULL; + vec remain = vNULL; + stmts.create (1); + stmts.quick_push (vect_stmt_to_vectorize (stmt_info)); + bool res = vect_build_slp_instance (vinfo, + slp_inst_kind_reduc_group, + stmts, roots, remain, + max_tree_size, , + bst_map, NULL); + gcc_assert (res); + } + } + } } hash_set visited_patterns;
[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid bogus SLP outer loop vectorization
https://gcc.gnu.org/g:4652b8bdf79f6ba3a86e085b7ce13d23057c57f6 commit 4652b8bdf79f6ba3a86e085b7ce13d23057c57f6 Author: Richard Biener Date: Wed Mar 6 15:13:05 2024 +0100 Avoid bogus SLP outer loop vectorization This fixes the check for multiple types which go wrong I think because of bogus pointer IV increments when there are multiple copies of vector stmts in the inner loop. * tree-vect-stmts.cc (vectorizable_load): Avoid outer loop SLP vectorization with multi-copy vector stmts in the inner loop. (vectorizable_store): Likewise. Diff: --- gcc/tree-vect-stmts.cc | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 414c1fce38db..840ff8a3406a 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -8195,7 +8195,9 @@ vectorizable_store (vec_info *vinfo, gcc_assert (ncopies >= 1); /* FORNOW. This restriction should be relaxed. */ - if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1) + if (loop + && nested_in_vect_loop_p (loop, stmt_info) + && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -9941,7 +9943,8 @@ vectorizable_load (vec_info *vinfo, gcc_assert (ncopies >= 1); /* FORNOW. This restriction should be relaxed. */ - if (nested_in_vect_loop && ncopies > 1) + if (nested_in_vect_loop + && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow single-lane SLP in-order reductions
https://gcc.gnu.org/g:0faad9e4dfa5015c9535e4f2a40400914c5b4674 commit 0faad9e4dfa5015c9535e4f2a40400914c5b4674 Author: Richard Biener Date: Tue Mar 5 15:46:24 2024 +0100 Allow single-lane SLP in-order reductions The single-lane case isn't different from non-SLP, no re-association implied. * tree-vect-loop.cc (vectorizable_reduction): Allow single-lane SLP in-order reductions. Diff: --- gcc/tree-vect-loop.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 8fb8800e6a7e..a5597ec1287b 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8134,7 +8134,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } if (reduction_type == FOLD_LEFT_REDUCTION - && slp_node + && (slp_node && SLP_TREE_LANES (slp_node) > 1) && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) { /* We cannot use in-order reductions in this case because there is
[gcc(refs/users/rguenth/heads/vect-force-slp)] Add double reduction support for SLP vectorization
https://gcc.gnu.org/g:5bdcb5f182a6472a66cc5d7842a64ec7ad0fd7e5 commit 5bdcb5f182a6472a66cc5d7842a64ec7ad0fd7e5 Author: Richard Biener Date: Tue Mar 5 15:28:58 2024 +0100 Add double reduction support for SLP vectorization The following makes double reduction vectorization work when using (single-lane) SLP vectorization. * tree-vect-loop.cc (vect_analyze_scalar_cycles_1): Queue double reductions in LOOP_VINFO_REDUCTIONS. (vect_create_epilog_for_reduction): Remove asserts disabling SLP for double reductions. (vectorizable_reduction): Analyze SLP double reductions only once and start off the correct places. * tree-vect-slp.cc (vect_get_and_check_slp_defs): Allow vect_double_reduction_def. (vect_build_slp_tree_2): Fix condition for the ignored reduction initial values. * tree-vect-stmts.cc (vect_analyze_stmt): Allow vect_double_reduction_def. Diff: --- gcc/tree-vect-loop.cc | 35 +-- gcc/tree-vect-slp.cc | 3 ++- gcc/tree-vect-stmts.cc | 4 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 195db5b1089e..8fb8800e6a7e 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -685,6 +685,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; + /* Make it accessible for SLP vectorization. 
*/ + LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info); } else { @@ -5973,7 +5975,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, stmt_vec_info rdef_info = stmt_info; if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) { - gcc_assert (!slp_node); double_reduc = true; stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def (stmt_info->stmt, 0)); @@ -6018,7 +6019,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { outer_loop = loop; loop = loop->inner; - gcc_assert (!slp_node && double_reduc); + gcc_assert (double_reduc); } vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); @@ -6033,7 +6034,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for induc_val, use initial_def. */ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); - /* ??? Coverage for double_reduc and 'else' isn't clear. */ + /* ??? Coverage for 'else' isn't clear. */ } else { @@ -7532,15 +7533,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo, STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; return true; } - if (slp_node) -{ - slp_node_instance->reduc_phis = slp_node; - /* ??? We're leaving slp_node to point to the PHIs, we only -need it to get at the number of vector stmts which wasn't -yet initialized for the instance root. */ -} if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) { + if (gimple_bb (stmt_info->stmt) != loop->header) + { + /* For SLP we arrive here for both the inner loop LC PHI and +the outer loop PHI. The latter is what we want to analyze +the reduction with. */ + gcc_assert (slp_node); + return true; + } use_operand_p use_p; gimple *use_stmt; bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), @@ -7549,6 +7551,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, phi_info = loop_vinfo->lookup_stmt (use_stmt); } + if (slp_node) +{ + slp_node_instance->reduc_phis = slp_node; + /* ??? 
We're leaving slp_node to point to the PHIs, we only +need it to get at the number of vector stmts which wasn't +yet initialized for the instance root. */ +} + /* PHIs should not participate in patterns. */ gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); gphi *reduc_def_phi = as_a (phi_info->stmt); @@ -7564,6 +7574,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, bool only_slp_reduc_chain = true; stmt_info = NULL; slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL; + /* For double-reductions we start SLP analysis at the inner loop LC PHI + which is the def of the outer loop live stmt. */ + if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def + && slp_node) +slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; while (reduc_def != PHI_RESULT (reduc_def_phi)) { stmt_vec_info def =
[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow single-lane COND_REDUCTION vectorization
https://gcc.gnu.org/g:27affc29b9de2cc35ec53c7263b23027d9932191 commit 27affc29b9de2cc35ec53c7263b23027d9932191 Author: Richard Biener Date: Fri Mar 1 14:39:08 2024 +0100 Allow single-lane COND_REDUCTION vectorization The following enables single-lane COND_REDUCTION vectorization. * tree-vect-loop.cc (vect_create_epilog_for_reduction): Adjust for single-lane COND_REDUCTION SLP vectorization. (vectorizable_reduction): Likewise. (vect_transform_cycle_phi): Likewise. Diff: --- gcc/tree-vect-loop.cc | 100 +- 1 file changed, 83 insertions(+), 17 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 7eeae908d367..195db5b1089e 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6028,7 +6028,13 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) -; +{ + /* Optimize: for induction condition reduction, if we can't use zero +for induc_val, use initial_def. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); + /* ??? Coverage for double_reduc and 'else' isn't clear. 
*/ +} else { /* Optimize: for induction condition reduction, if we can't use zero @@ -6073,23 +6079,46 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) { auto_vec, 2> ccompares; - stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); - cond_info = vect_stmt_to_vectorize (cond_info); - while (cond_info != reduc_info) + if (slp_node) { - if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + slp_tree cond_node = slp_node_instance->root; + while (cond_node != slp_node_instance->reduc_phis) { - gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; - gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); - ccompares.safe_push - (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), -STMT_VINFO_REDUC_IDX (cond_info) == 2)); + stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node); + if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + { + gimple *vec_stmt + = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]); + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + ccompares.safe_push + (std::make_pair (gimple_assign_rhs1 (vec_stmt), +STMT_VINFO_REDUC_IDX (cond_info) == 2)); + } + /* ??? We probably want to have REDUC_IDX on the SLP node? 
*/ + cond_node = SLP_TREE_CHILDREN + (cond_node)[STMT_VINFO_REDUC_IDX (cond_info)]; } - cond_info - = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, -1 + STMT_VINFO_REDUC_IDX - (cond_info))); + } + else + { + stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); cond_info = vect_stmt_to_vectorize (cond_info); + while (cond_info != reduc_info) + { + if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) + { + gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0]; + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + ccompares.safe_push + (std::make_pair (gimple_assign_rhs1 (vec_stmt), +STMT_VINFO_REDUC_IDX (cond_info) == 2)); + } + cond_info + = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, +1 + STMT_VINFO_REDUC_IDX +(cond_info))); + cond_info = vect_stmt_to_vectorize (cond_info); + } } gcc_assert (ccompares.length () != 0); @@ -6502,7 +6531,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* 2.3 Create the reduction code, using one of the three schemes described above. In SLP we simply need to extract all the elements from the vector (without reducing them), so we use scalar shifts. */ - else if (reduc_fn != IFN_LAST && !slp_reduc) + else if (reduc_fn != IFN_LAST + && (!slp_reduc || SLP_TREE_LANES (slp_node) == 1)) { tree tmp; tree vec_elem_type; @@ -7767,7 +7797,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, /* If
[gcc(refs/users/rguenth/heads/vect-force-slp)] Place easily identifiable assert instead of SIGSEGV
https://gcc.gnu.org/g:a1126dd1b7b0ba051d7d62de2c12b7affa2ecc34 commit a1126dd1b7b0ba051d7d62de2c12b7affa2ecc34 Author: Richard Biener Date: Fri Mar 1 14:56:01 2024 +0100 Place easily identifiable assert instead of SIGSEGV Better identification of known ICEs. * tree-vect-stmts.cc (vect_is_simple_use): Assert instead of SIGSEGV. Diff: --- gcc/tree-vect-stmts.cc | 4 1 file changed, 4 insertions(+) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 8fef72cb9072..ca81957def06 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -14009,6 +14009,10 @@ vect_is_simple_use (vec_info *vinfo, stmt_vec_info stmt, slp_tree slp_node, *vectype = SLP_TREE_VECTYPE (child); if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) { + /* ??? Instead of crashing, easier to identify. But we +need to think what to do with internal defs of VEC_PERM +kind here. */ + gcc_assert (SLP_TREE_REPRESENTATIVE (child)); *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt); return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out); }
[gcc(refs/users/rguenth/heads/vect-force-slp)] Relax COND_EXPR reduction vectorization SLP restriction
https://gcc.gnu.org/g:1ba0215280e51f0cbc3c3867d6e8c07fc76694f8 commit 1ba0215280e51f0cbc3c3867d6e8c07fc76694f8 Author: Richard Biener Date: Fri Feb 23 16:16:38 2024 +0100 Relax COND_EXPR reduction vectorization SLP restriction Allow one-lane SLP but for the case where we need to swap the arms. * tree-vect-stmts.cc (vectorizable_condition): Allow single-lane SLP, but not when we need to swap then and else clause. Diff: --- gcc/tree-vect-stmts.cc | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 72a9c144823c..8fef72cb9072 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12105,7 +12105,7 @@ vectorizable_condition (vec_info *vinfo, = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; if (for_reduction) { - if (slp_node) + if (slp_node && SLP_TREE_LANES (slp_node) > 1) return false; reduc_info = info_for_reduction (vinfo, stmt_info); reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); @@ -12194,6 +12194,10 @@ vectorizable_condition (vec_info *vinfo, cond_expr = NULL_TREE; } } + /* ??? The vectorized operand query below doesn't allow swapping +this way for SLP. */ + if (slp_node) + return false; std::swap (then_clause, else_clause); }
[gcc(refs/users/rguenth/heads/vect-force-slp)] Amend --param vect-force-slp checking
https://gcc.gnu.org/g:362a1f1bc0c72c618737a634a88898e3f6142995 commit 362a1f1bc0c72c618737a634a88898e3f6142995 Author: Richard Biener Date: Fri Feb 23 12:39:57 2024 +0100 Amend --param vect-force-slp checking This makes sure no non-SLP code-gen happens. * tree-vect-stmts.cc (vect_transform_stmt): Assert no non-SLP code-gen happens with --param vect-force-slp=1. Diff: --- gcc/tree-vect-stmts.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index b694cc4a8373..72a9c144823c 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -13373,6 +13373,8 @@ vect_transform_stmt (vec_info *vinfo, gcc_assert (slp_node || !PURE_SLP_STMT (stmt_info)); + gcc_assert (!param_vect_force_slp || slp_node); + tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info); if (slp_node) STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (slp_node);
[gcc(refs/users/rguenth/heads/vect-force-slp)] Do single-lane SLP discovery for reductions
https://gcc.gnu.org/g:b423891ad43d003a565e7b5c6ed648e446bd3c7c commit b423891ad43d003a565e7b5c6ed648e446bd3c7c Author: Richard Biener Date: Fri Feb 23 11:45:50 2024 +0100 Do single-lane SLP discovery for reductions The following performs single-lane SLP discovery for reductions. This exposes a latent issue with reduction SLP in outer loop vectorization and makes gcc.dg/vect/vect-outer-4[fgkl].c FAIL execution. * tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane discoveries are reduction chains and need special backedge treatment. (vect_analyze_slp): Fall back to single-lane SLP discovery for reductions. Make sure to try single-lane SLP reduction for all reductions as fallback. Diff: --- gcc/tree-vect-slp.cc | 58 +++- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index ecc185aae885..f39cde3a8d50 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1918,7 +1918,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, /* Reduction chain backedge defs are filled manually. ??? Need a better way to identify a SLP reduction chain PHI. Or a better overall way to SLP match those. */ - if (all_same && def_type == vect_reduction_def) + if (stmts.length () > 1 + && all_same && def_type == vect_reduction_def) skip_args[loop_latch_edge (loop)->dest_idx] = true; } else if (def_type != vect_internal_def) @@ -3911,7 +3912,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) } /* Find SLP sequences starting from groups of reductions. */ - if (loop_vinfo->reductions.length () > 1) + if (loop_vinfo->reductions.length () > 0) { /* Collect reduction statements. 
*/ vec scalar_stmts; @@ -3934,17 +3935,54 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR && gimple_assign_rhs_code (g) != SAD_EXPR))) scalar_stmts.quick_push (next_info); + else if (param_vect_single_lane_slp != 0) + { + vec stmts; + vec roots = vNULL; + vec remain = vNULL; + stmts.create (1); + stmts.quick_push (next_info); + bool res = vect_build_slp_instance (vinfo, + slp_inst_kind_reduc_group, + stmts, roots, remain, + max_tree_size, , + bst_map, NULL); + gcc_assert (res); + } } - if (scalar_stmts.length () > 1) + vec roots = vNULL; + vec remain = vNULL; + vec saved_stmts = vNULL; + if (param_vect_single_lane_slp != 0) + /* ??? scalar_stmts ownership and arg passing sucks. */ + saved_stmts = scalar_stmts.copy (); + if ((scalar_stmts.length () <= 1 + || !vect_build_slp_instance (loop_vinfo, + slp_inst_kind_reduc_group, + scalar_stmts, roots, remain, + max_tree_size, , bst_map, + NULL)) + && param_vect_single_lane_slp != 0) { - vec roots = vNULL; - vec remain = vNULL; - vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group, - scalar_stmts, roots, remain, - max_tree_size, , bst_map, NULL); + if (scalar_stmts.length () <= 1) + scalar_stmts.release (); + /* Do SLP discovery for single-lane reductions. */ + for (auto stmt_info : saved_stmts) + { + vec stmts; + vec roots = vNULL; + vec remain = vNULL; + stmts.create (1); + stmts.quick_push (vect_stmt_to_vectorize (stmt_info)); + bool res = vect_build_slp_instance (vinfo, + slp_inst_kind_reduc_group, + stmts, roots, remain, + max_tree_size, , + bst_map, NULL); + gcc_assert (res); + } + saved_stmts.release (); } - else - scalar_stmts.release (); } }
[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid SLP build failure for unsupported shifts
https://gcc.gnu.org/g:95f3686d4788d7420422d514996c5b6e7a8facfd commit 95f3686d4788d7420422d514996c5b6e7a8facfd Author: Richard Biener Date: Thu Oct 5 14:07:02 2023 +0200 Avoid SLP build failure for unsupported shifts When asserting that SLP _build_ can succeed we run into the SLP shift discovery code trying to be clever doing vectorizable_shifts work and failing discovery already. That gives a false impression for now, so disable that when we do single-lane builds. * tree-vect-slp.cc (vect_build_slp_tree_1): Do not fail fatally for shifts not supported by the target when discovering a single lane. Diff: --- gcc/tree-vect-slp.cc | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 6bfc59dc2131..ecc185aae885 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1215,7 +1215,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, if (!directly_supported_p (rhs_code, vectype, optab_vector)) { /* No vector/vector shift, try for a vector/scalar shift. */ - if (!directly_supported_p (rhs_code, vectype, optab_scalar)) + if (!directly_supported_p (rhs_code, vectype, optab_scalar) + /* ??? We are using this to guide operand swapping to +eventually make all shift operands the same but we +shouldn't fail in the end - that's be business of +vectorizable_shift. +Avoid spurious ICEs for single-lane discovery. */ + && group_size != 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
[gcc(refs/users/rguenth/heads/vect-force-slp)] Reduce single-lane SLP testresult noise
https://gcc.gnu.org/g:e8ba59ef5c6284604f3c0920e246ed4cf889e541 commit e8ba59ef5c6284604f3c0920e246ed4cf889e541 Author: Richard Biener Date: Thu Oct 5 13:31:16 2023 +0200 Reduce single-lane SLP testresult noise The following avoids dumping 'vectorizing stmts using SLP' for single-lane instances since that causes extra testsuite fallout. * tree-vect-slp.cc (vect_schedule_slp): Gate dumping 'vectorizing stmts using SLP' on > 1 lanes. Diff: --- gcc/tree-vect-slp.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index ef0199cf3fb2..6bfc59dc2131 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -10055,7 +10055,8 @@ vect_schedule_slp (vec_info *vinfo, const vec _instances) if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) vectorize_slp_instance_root_stmt (node, instance); - if (dump_enabled_p ()) + /* ??? Reduce some testsuite noise because of "more SLP". */ + if (SLP_TREE_LANES (node) > 1 && dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vectorizing stmts using SLP.\n"); }
[gcc(refs/users/rguenth/heads/vect-force-slp)] Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with single-lane SLP
https://gcc.gnu.org/g:307f09a496e4476c006e8b1fe56b396a465c9413 commit 307f09a496e4476c006e8b1fe56b396a465c9413 Author: Richard Biener Date: Wed Oct 4 14:34:18 2023 +0200 Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with single-lane SLP * tree-vect-stmts.cc (vectorizable_load): Add FIXME to PR60276 fix. Diff: --- gcc/tree-vect-stmts.cc | 5 + 1 file changed, 5 insertions(+) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 46e4edb5e36f..b694cc4a8373 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -9951,6 +9951,11 @@ vectorizable_load (vec_info *vinfo, /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. */ + /* ??? This fails to trigger with single-lane SLP, gcc.dg/vect/pr60276.c, + but simply removing the ncopies > 1 conditional here (and below) will + cause FAILs of gcc.dg/vect/no-vfa-vect-depend-3.c and + gcc.dg/vect/tsvc/vect-tsvc-s3251.c. The original fix (for PR60276) + needs to be re-thought. */ if (ncopies > 1 && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery
https://gcc.gnu.org/g:9315bfc661432c3ad82a7ade21359d5c078dc41b commit 9315bfc661432c3ad82a7ade21359d5c078dc41b Author: Richard Biener Date: Fri Sep 29 13:13:16 2023 +0200 Avoid splitting store dataref groups during SLP discovery The following avoids splitting store dataref groups during SLP discovery but instead forces (eventually single-lane) consecutive lane SLP discovery for all lanes of the group, creating a VEC_PERM SLP node merging them so the store will always cover the whole group. I figured the patched function needs some refactoring so this is in draft state indenting-wise. With this for example int x[1024], y[1024], z[1024], w[1024]; void foo (void) { for (int i = 0; i < 256; i++) { x[4*i+0] = y[2*i+0]; x[4*i+1] = y[2*i+1]; x[4*i+2] = z[i]; x[4*i+3] = w[i]; } } which was previously using hybrid SLP can now be fully SLPed and SSE code generated looks better (but of course you never know, I didn't actually benchmark). We of course need a VF of four here. .L2: movdqa z(%rax), %xmm0 movdqa w(%rax), %xmm4 movdqa y(%rax,%rax), %xmm2 movdqa y+16(%rax,%rax), %xmm1 movdqa %xmm0, %xmm3 punpckhdq %xmm4, %xmm0 punpckldq %xmm4, %xmm3 movdqa %xmm2, %xmm4 shufps $238, %xmm3, %xmm2 movaps %xmm2, x+16(,%rax,4) movdqa %xmm1, %xmm2 shufps $68, %xmm3, %xmm4 shufps $68, %xmm0, %xmm2 movaps %xmm4, x(,%rax,4) shufps $238, %xmm0, %xmm1 movaps %xmm2, x+32(,%rax,4) movaps %xmm1, x+48(,%rax,4) addq$16, %rax cmpq$1024, %rax jne .L2 The extra permute nodes unfortunately sometimes do not behave nicely wrt vect_is_simple_use since when the source is an invariant or external there's no def stmt we can fake as representative but vect_is_simple_use eventually gets the caller the scalar operand and its definition. One might argue using SLP_TREE_OPS and getting an external def would maybe be more to the point, also since permute optimization could change whether or not that appears. * tree-vect-slp.cc (vect_build_slp_instance): Do not split dataref groups on discovery failure. 
Diff: --- gcc/tree-vect-slp.cc | 171 ++- 1 file changed, 168 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 3a078b253df5..ef0199cf3fb2 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3476,8 +3476,6 @@ vect_build_slp_instance (vec_info *vinfo, else { /* Failed to SLP. */ - /* Free the allocated memory. */ - scalar_stmts.release (); } stmt_vec_info stmt_info = stmt_info_; @@ -3496,6 +3494,8 @@ vect_build_slp_instance (vec_info *vinfo, if (is_a (vinfo) && (i > 1 && i < group_size)) { +/* Free the allocated memory. */ +scalar_stmts.release (); tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, @@ -3542,7 +3542,10 @@ vect_build_slp_instance (vec_info *vinfo, /* For loop vectorization split into arbitrary pieces of size > 1. */ if (is_a (vinfo) - && (i > 1 && i < group_size) + && ((i > 1 && i < group_size) + /* For single-lane SLP when only the first lane didn't fail +also split to single-lanes. */ + || (i > 0 && i < group_size && param_vect_single_lane_slp != 0)) && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i)) { unsigned group1_size = i; @@ -3551,6 +3554,164 @@ vect_build_slp_instance (vec_info *vinfo, dump_printf_loc (MSG_NOTE, vect_location, "Splitting SLP group at stmt %u\n", i); + if (param_vect_single_lane_slp != 0) + { + /* Analyze the stored values and pinch them together with +a permute node so we can preserve the whole store group. */ + auto_vec rhs_nodes; + + /* Calculate the unrolling factor based on the smallest type. */ + poly_uint64 unrolling_factor = 1; + + unsigned int start = 0, end = i; + while (start < group_size) + { + gcc_assert (end - start >= 1); + vec substmts; + substmts.create (end - start); + for (unsigned j = start; j < end; ++j) + substmts.quick_push (scalar_stmts[j]); + max_nunits = 1; + node = vect_build_slp_tree (vinfo, substmts, end - start, +
[gcc(refs/users/rguenth/heads/vect-force-slp)] Do not account single-lane SLP graphs against discovery limit
https://gcc.gnu.org/g:fa0f551f3931529d5be72140f5a37ed02d0e0366 commit fa0f551f3931529d5be72140f5a37ed02d0e0366 Author: Richard Biener Date: Fri Sep 29 15:12:54 2023 +0200 Do not account single-lane SLP graphs against discovery limit The following avoids accounting single-lane SLP to the discovery limit. Even when raising it the attempt of forming multi-lane SLP can exhaust the limit before we fall back to single-lane. * tree-vect-slp.cc (vect_build_slp_tree): Only account multi-lane SLP to limit. Diff: --- gcc/tree-vect-slp.cc | 31 ++- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 66c8fa38979f..3a078b253df5 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1725,21 +1725,26 @@ vect_build_slp_tree (vec_info *vinfo, SLP_TREE_SCALAR_STMTS (res) = stmts; bst_map->put (stmts.copy (), res); - if (*limit == 0) + /* Single-lane SLP doesn't have the chance of run-away, do not account + it to the limit. */ + if (stmts.length () > 1) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, -"SLP discovery limit exceeded\n"); - /* Mark the node invalid so we can detect those when still in use -as backedge destinations. */ - SLP_TREE_SCALAR_STMTS (res) = vNULL; - SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; - res->failed = XNEWVEC (bool, group_size); - memset (res->failed, 0, sizeof (bool) * group_size); - memset (matches, 0, sizeof (bool) * group_size); - return NULL; + if (*limit == 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +"SLP discovery limit exceeded\n"); + /* Mark the node invalid so we can detect those when still in use +as backedge destinations. 
*/ + SLP_TREE_SCALAR_STMTS (res) = vNULL; + SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def; + res->failed = XNEWVEC (bool, group_size); + memset (res->failed, 0, sizeof (bool) * group_size); + memset (matches, 0, sizeof (bool) * group_size); + return NULL; + } + --*limit; } - --*limit; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location,
[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow bigger SLP graphs
https://gcc.gnu.org/g:6b597bdb72177699178e238e3da713dc222a0eeb commit 6b597bdb72177699178e238e3da713dc222a0eeb Author: Richard Biener Date: Fri Sep 29 13:05:01 2023 +0200 Allow bigger SLP graphs When doing single-lane SLP discovery only we're easily running into the SLP graph size limit when patterns are involved. The following ups the limit from the number of scalar stmts to the number of scalar or pattern stmts by using the number of stmt_vec_infos created. * tree-vect-loop.cc (vect_analyze_loop_2): Use the number of stmt_vec_infos created to limit the SLP graph size. Diff: --- gcc/tree-vect-loop.cc | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 361aec064884..7eeae908d367 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2855,8 +2855,10 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool , if (slp) { /* Check the SLP opportunities in the loop, analyze and build -SLP trees. */ - ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo)); +SLP trees. Use the number of stmt_vec_infos as graph limit +since that also includes pattern stmts which LOOP_VINFO_N_STMTS +does not. */ + ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length ()); if (!ok) return ok;
[gcc(refs/users/rguenth/heads/vect-force-slp)] Handle non-grouped SLP stores
https://gcc.gnu.org/g:540ffaa0d720ed04bb083857230ecd583662f8cc commit 540ffaa0d720ed04bb083857230ecd583662f8cc Author: Richard Biener Date: Wed Oct 4 14:32:39 2023 +0200 Handle non-grouped SLP stores The following adjusts vectorizable_store to properly handle non-grouped SLP stores to update vec_num. * tree-vect-stmts.cc (vectorizable_store): Always set vec_num for SLP. Diff: --- gcc/tree-vect-stmts.cc | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index f99dce38bf7b..46e4edb5e36f 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -8361,10 +8361,12 @@ vectorizable_store (vec_info *vinfo, return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, ncopies); } - if (grouped_store) + if (grouped_store || slp) { /* FORNOW */ - gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt_info)); + gcc_assert (!grouped_store + || !loop + || !nested_in_vect_loop_p (loop, stmt_info)); if (slp) { @@ -8373,8 +8375,9 @@ vectorizable_store (vec_info *vinfo, group. */ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; - gcc_assert (DR_GROUP_FIRST_ELEMENT (first_stmt_info) - == first_stmt_info); + gcc_assert (!STMT_VINFO_GROUPED_ACCESS (first_stmt_info) + || (DR_GROUP_FIRST_ELEMENT (first_stmt_info) + == first_stmt_info)); first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); op = vect_get_store_rhs (first_stmt_info); }
[gcc(refs/users/rguenth/heads/vect-force-slp)] Add --param vect-single-lane-slp
https://gcc.gnu.org/g:e2d9709cdc50d68ed0e662634d8608c6f8491888 commit e2d9709cdc50d68ed0e662634d8608c6f8491888 Author: Richard Biener Date: Fri Sep 29 12:54:17 2023 +0200 Add --param vect-single-lane-slp The following adds --param vect-single-lane-slp to guard single-lane loop SLP discovery. As first client we look at non-grouped stores with an assert that SLP discovery works to discover gaps in it. * params.opt (-param=vect-single-lane-slp=): New. * tree-vect-slp.cc (vect_analyze_slp): Perform single-lane loop SLP discovery for non-grouped stores if requested. Diff: --- gcc/params.opt | 4 gcc/tree-vect-slp.cc | 26 ++ 2 files changed, 30 insertions(+) diff --git a/gcc/params.opt b/gcc/params.opt index 74ea9c6f8d93..4cde5c3015ae 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1198,6 +1198,10 @@ The maximum factor which the loop vectorizer applies to the cost of statements i Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization Enable loop vectorization of floating point inductions. +-param=vect-single-lane-slp= +Common Joined UInteger Var(param_vect_single_lane_slp) Init(0) IntegerRange(0, 1) Param Optimization +Enable single lane SLP discovery. + -param=vect-force-slp= Common Joined UInteger Var(param_vect_force_slp) Init(0) IntegerRange(0, 1) Param Optimization Fail vectorization when falling back to non-SLP. 
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index f34ed54a70b0..66c8fa38979f 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3643,6 +3643,7 @@ vect_analyze_slp_instance (vec_info *vinfo, opt_result vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) { + loop_vec_info loop_vinfo = dyn_cast (vinfo); unsigned int i; stmt_vec_info first_element; slp_instance instance; @@ -3658,6 +3659,31 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) vect_analyze_slp_instance (vinfo, bst_map, first_element, slp_inst_kind_store, max_tree_size, ); + if (loop_vinfo && param_vect_single_lane_slp != 0) +{ + data_reference_p dr; + FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr) + if (DR_IS_WRITE (dr)) + { + stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt; + /* It works a bit to dissolve the group but that's + not really what we want to do. Instead group analysis + above starts discovery for each lane and pieces them together + to a single store to the whole group. */ + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + continue; + vec stmts; + vec roots = vNULL; + vec remain = vNULL; + stmts.create (1); + stmts.quick_push (stmt_info); + bool res = vect_build_slp_instance (vinfo, slp_inst_kind_store, + stmts, roots, remain, + max_tree_size, , + bst_map, NULL); + gcc_assert (res); + } +} if (bb_vec_info bb_vinfo = dyn_cast (vinfo)) {
[gcc(refs/users/rguenth/heads/vect-force-slp)] Fail vectorization when not SLP with --param vect-force-slp=1
https://gcc.gnu.org/g:51d831bd7cd122511d03efcc3da2de343a16553a commit 51d831bd7cd122511d03efcc3da2de343a16553a Author: Richard Biener Date: Wed Aug 23 10:48:32 2023 +0200 Fail vectorization when not SLP with --param vect-force-slp=1 The following adds --param vect-force-slp allowing to indicate failure when not all stmts participating in loop vectorization are using SLP vectorization. This is intended for transitioning and debugging. Enabling this without further changes results in the following within vect.exp on x86_64 === g++ Summary === -# of expected passes 619 +# of expected passes 546 +# of unexpected failures 73 === gcc Summary === -# of expected passes 8835 -# of expected failures 256 +# of expected passes 7271 +# of unexpected failures 1564 +# of unexpected successes 12 +# of expected failures 244 === gfortran Summary === -# of expected passes 171 +# of expected passes 144 +# of unexpected failures 27 * params.opt (-param=vect-force-slp=): New, default to 0. * doc/invoke.texi (--param vect-force-slp): Document. * tree-vect-stmts.cc (vect_analyze_stmt): With --param vect-force-slp=1 fail vectorization when not using SLP. Diff: --- gcc/doc/invoke.texi| 4 gcc/params.opt | 4 gcc/tree-vect-stmts.cc | 6 ++ 3 files changed, 14 insertions(+) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ddcd5213f06a..3bd02fb13e5e 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -16747,6 +16747,10 @@ this parameter. The default value of this parameter is 50. @item vect-induction-float Enable loop vectorization of floating point inductions. +@item vect-force-slp +Fail vectorization when falling back to non-SLP. This is intended for +debugging only. + @item vrp-sparse-threshold Maximum number of basic blocks before VRP uses a sparse bitmap cache. 
diff --git a/gcc/params.opt b/gcc/params.opt index d34ef545bf03..74ea9c6f8d93 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1198,6 +1198,10 @@ The maximum factor which the loop vectorizer applies to the cost of statements i Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization Enable loop vectorization of floating point inductions. +-param=vect-force-slp= +Common Joined UInteger Var(param_vect_force_slp) Init(0) IntegerRange(0, 1) Param Optimization +Fail vectorization when falling back to non-SLP. + -param=vrp-sparse-threshold= Common Joined UInteger Var(param_vrp_sparse_threshold) Init(3000) Optimization Param Maximum number of basic blocks before VRP uses a sparse bitmap cache. diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index b8a71605f1bc..f99dce38bf7b 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -13257,6 +13257,12 @@ vect_analyze_stmt (vec_info *vinfo, return opt_result::success (); } + if (param_vect_force_slp && !node) +return opt_result::failure_at (stmt_info->stmt, + "not vectorized:" + " not part of SLP but SLP forced: %G", + stmt_info->stmt); + ok = true; if (!bb_vinfo && (STMT_VINFO_RELEVANT_P (stmt_info)
[gcc/rguenth/heads/vect-force-slp] (1426 commits) PR60276 fix for single-lane SLP
The branch 'rguenth/heads/vect-force-slp' was updated to point to: 8a9b159a8608... PR60276 fix for single-lane SLP It previously pointed to: 3a1fe1d6d941... Improve combined store node splitting Diff: !!! WARNING: THE FOLLOWING COMMITS ARE NO LONGER ACCESSIBLE (LOST): --- 3a1fe1d... Improve combined store node splitting 86287e0... Add single-lane SLP support to .GOMP_SIMD_LANE vectorizatio e2bef5c... Fix last commit WRT patterns fa45f8f... Handle unused-only-live stmts in SLP discovery d1b8915... Avoid bogus SLP outer loop vectorization 89b9eee... Fix non-grouped SLP load/store accounting in alignment peel 27d303e... Allow single-lane SLP in-order reductions bc49f0d... Add double reduction support for SLP vectorization c316aa7... Allow single-lane COND_REDUCTION vectorization 8cdfb70... Place easily identifyable assert insead of SIGSEV e672d56... Refactor SLP reduction group discovery d17ef2e... Allow patterns in SLP reductions 74e7541... Relax COND_EXPR reduction vectorization SLP restriction c917156... Amend --param vect-force-slp checking b992691... Do single-lane SLP discovery for reductions 598e22d... Fix SLP reduction initial value for pointer reductions 98f3724... PR60276 fix for single-lane SLP ef49bc6... Avoid SLP build failure for unsupported shifts a373df9... Reduce single-lane SLP testresult noise 69d5454... Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with 04db3df... Avoid splitting store dataref groups during SLP discovery 9db0573... Do not account single-lane SLP graphs against discovery lim 305009b... Allow bigger SLP graphs 248bd7d... Guard SLP optimize latch edge discovery 0ba91db... Handle non-grouped SLP stores f9c2a5d... Add --param vect-single-lane-slp e5d482c... Fail vectorization when not SLP with --param vect-force-slp Summary of changes (added commits): --- 8a9b159... PR60276 fix for single-lane SLP (*) c4af8eb... testsuite: c++: Allow for std::printf in g++.dg/modules/std (*) fb1649f... 
libstdc++: Use __builtin_shufflevector for simd split and c (*) 898d714... Refactor SLP reduction group discovery (*) b621482... tree-ssa-math-opts: Pattern recognize yet another .ADD_OVER (*) f3f02a7... Manually add ChangeLog entry for r15-353-gd7bb8eaade3cd3aa7 (*) f2d1189... Daily bump. (*) 5de0753... ada: Move Init_Proc_Level_Formal from Exp_Ch3 to Exp_Util (*) 51b84f2... ada: Remove code that expected pre/post being split into co (*) a004159... ada: Revert recent change for Put_Image and Object_Size att (*) 6d13384... ada: Rename finalization scope masters into finalization ma (*) a9c07b8... ada: Remove dynamic frame in System.Image_D and document it (*) 7e348a4... ada: Attributes Put_Image and Object_Size are defined by Ad (*) c1ece0c... ada: Remove guards against traversal of empty list of aspec (*) b3eef3b... ada: Fix crash on Compile_Time_Warning in dead code (*) f7e1dde... ada: Deconstruct flag Split_PPC since splitting now is done (*) 3aa99be... ada: Move splitting of pre/post aspect expressions to expan (*) 1de93ed... ada: Fix style in comments (*) da88475... ada: Refine type of a local variable (*) 32fe73e... ada: Recognize pragma Lock_Free as specific to GNAT (*) 4768f3d... ada: Deconstruct unused flag Is_Expanded_Contract (*) 7f12896... ada: Refactor repeated code for querying Boolean-valued asp (*) 65c0029... ada: Complete implementation of Ada 2022 aspect Exclusive_F (*) 0533acf... ada: Rewrite Append_Entity_Name; skip irrelevant names (*) c52bfe6... ada: Couple of comment tweaks to latest change (*) c8e01e7... ada: Replace finalization masters with finalization collect (*) eff0e26... ada: Remove deprecated VxWorks interrupt connection API (*) 56e781f... ada: Decouple finalization masters from storage pools (*) c1b33f8... ada: Small cleanup in the BIP machinery (*) 8d6c7fc... ada: Restore fix for controlled dynamic allocation with BIP (*) 8e76c18... ada: Avoid crash on illegal constrained type declarations (*) 2fc8ea4... 
ada: Fix pragma Compile_Time_Error for alignment of array t (*) c573c56... ada: Enable casing on composite via -X0 instead of -X (*) 5270bfc... ada: Fix internal error with Put_Image aspect on access-to- (*) a14dc3e... ada: Simplify uses of readdir_gnat with object overlay (*) 105bba8... ada: Refactor GNAT.Directory_Operations.Read to minimise ru (*) 0a82463... ada: Compiler crash on nonstatic container aggregates for D (*) 2d0eeb5... Fortran: Fix wrong code in unlimited polymorphic assignment (*) 0c6dd4b... Revert "MIPS: Support constraint 'w' for MSA instruction" (*) 4aeff84... MAINTAINERS: Add myself to write after approval (*) 13b6ac4... Fortran: fix frontend memleak (*) 4607799... arm: Use utxb rN, rM, ror #8 to implement zero_extract on a (*) 83fb5e6... [to-be-committed,RISC-V] Improve usage of slli.uw in
[gcc r15-431] PR60276 fix for single-lane SLP
https://gcc.gnu.org/g:8a9b159a86081053289be0c44339623ff59717a2 commit r15-431-g8a9b159a86081053289be0c44339623ff59717a2 Author: Richard Biener Date: Thu Nov 9 11:30:22 2023 +0100 PR60276 fix for single-lane SLP When enabling single-lane SLP and not splitting groups the fix for PR60276 is no longer effective since it for unknown reason exempted pure SLP. The following removes this exemption, making gcc.dg/vect/pr60276.c PASS even with --param vect-single-lane-slp=1 PR tree-optimization/60276 * tree-vect-stmts.cc (vectorizable_load): Do not exempt pure_slp grouped loads from the STMT_VINFO_MIN_NEG_DIST restriction. Diff: --- gcc/tree-vect-stmts.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 21e8fe98e44a..b8a71605f1bc 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -9995,8 +9995,7 @@ vectorizable_load (vec_info *vinfo, /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. */ - if (!PURE_SLP_STMT (stmt_info) - && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0 + if (STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo), STMT_VINFO_MIN_NEG_DIST (stmt_info))) {
[gcc r13-8763] rtl-optimization/54052 - RTL SSA PHI insertion compile-time hog
https://gcc.gnu.org/g:d629308c699bb8fe90c2afeb7fa1acb12cb5526b commit r13-8763-gd629308c699bb8fe90c2afeb7fa1acb12cb5526b Author: Richard Biener Date: Mon Feb 19 11:10:50 2024 +0100 rtl-optimization/54052 - RTL SSA PHI insertion compile-time hog The following tries to address the PHI insertion compile-time hog in RTL fwprop observed with the PR54052 testcase where the loop computing the "unfiltered" set of variables possibly needing PHI nodes for each block exhibits quadratic compile-time and memory-use. It does so by pruning the local DEFs with LR_OUT of the block, removing regs that can never be LR_IN (defined by this block) in the dominance frontier. PR rtl-optimization/54052 * rtl-ssa/blocks.cc (function_info::place_phis): Filter local defs by LR_OUT. (cherry picked from commit c7151283dc747769d4ac4f216d8f519bda2569b5) Diff: --- gcc/rtl-ssa/blocks.cc | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/rtl-ssa/blocks.cc b/gcc/rtl-ssa/blocks.cc index 1f9969d78d88..0ee9fa0d6a2a 100644 --- a/gcc/rtl-ssa/blocks.cc +++ b/gcc/rtl-ssa/blocks.cc @@ -639,7 +639,12 @@ function_info::place_phis (build_info &bi) if (bitmap_empty_p (&frontiers[b1])) continue; - bitmap b1_def = &DF_LR_BB_INFO (BASIC_BLOCK_FOR_FN (m_fn, b1))->def; + // Defs in B1 that are possibly in LR_IN in the dominance frontier + // blocks. + auto_bitmap b1_def; + bitmap_and (b1_def, &DF_LR_BB_INFO (BASIC_BLOCK_FOR_FN (m_fn, b1))->def, + DF_LR_OUT (BASIC_BLOCK_FOR_FN (m_fn, b1))); + bitmap_iterator bmi; unsigned int b2; EXECUTE_IF_SET_IN_BITMAP (&frontiers[b1], 0, b2, bmi)
[gcc r15-428] Refactor SLP reduction group discovery
https://gcc.gnu.org/g:898d7145fb90734c9981555ec099710d87fc05af commit r15-428-g898d7145fb90734c9981555ec099710d87fc05af Author: Richard Biener Date: Fri Mar 1 12:08:36 2024 +0100 Refactor SLP reduction group discovery The following refactors a bit how we perform SLP reduction group discovery possibly making it easier to have multiple reduction groups later, esp. with single-lane SLP. * tree-vect-slp.cc (vect_analyze_slp_instance): Remove slp_inst_kind_reduc_group handling. (vect_analyze_slp): Add the meat here. Diff: --- gcc/tree-vect-slp.cc | 67 ++-- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 8c18f5308e2e..f34ed54a70b0 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -3586,7 +3586,6 @@ vect_analyze_slp_instance (vec_info *vinfo, slp_instance_kind kind, unsigned max_tree_size, unsigned *limit) { - unsigned int i; vec<stmt_vec_info> scalar_stmts; if (is_a <bb_vec_info> (vinfo)) @@ -3620,35 +3619,6 @@ vect_analyze_slp_instance (vec_info *vinfo, STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); } - else if (kind == slp_inst_kind_reduc_group) -{ - /* Collect reduction statements. */ - const vec<stmt_vec_info> &reductions - = as_a <loop_vec_info> (vinfo)->reductions; - scalar_stmts.create (reductions.length ()); - for (i = 0; reductions.iterate (i, &next_info); i++) - { - gassign *g; - next_info = vect_stmt_to_vectorize (next_info); - if ((STMT_VINFO_RELEVANT_P (next_info) - || STMT_VINFO_LIVE_P (next_info)) - /* ??? Make sure we didn't skip a conversion around a reduction -path. In that case we'd have to reverse engineer that -conversion stmt following the chain using reduc_idx and from -the PHI using reduc_def. */ - && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def - /* Do not discover SLP reductions for lane-reducing ops, that -will fail later. 
*/ - && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) - || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR - && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR - && gimple_assign_rhs_code (g) != SAD_EXPR))) - scalar_stmts.quick_push (next_info); - } - /* If less than two were relevant/live there's nothing to SLP. */ - if (scalar_stmts.length () < 2) - return false; -} else gcc_unreachable (); @@ -3740,9 +3710,40 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) /* Find SLP sequences starting from groups of reductions. */ if (loop_vinfo->reductions.length () > 1) - vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0], - slp_inst_kind_reduc_group, max_tree_size, - &limit); + { + /* Collect reduction statements. */ + vec<stmt_vec_info> scalar_stmts; + scalar_stmts.create (loop_vinfo->reductions.length ()); + for (auto next_info : loop_vinfo->reductions) + { + gassign *g; + next_info = vect_stmt_to_vectorize (next_info); + if ((STMT_VINFO_RELEVANT_P (next_info) + || STMT_VINFO_LIVE_P (next_info)) + /* ??? Make sure we didn't skip a conversion around a +reduction path. In that case we'd have to reverse +engineer that conversion stmt following the chain using +reduc_idx and from the PHI using reduc_def. */ + && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def + /* Do not discover SLP reductions for lane-reducing ops, that +will fail later. */ + && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info))) + || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR + && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR + && gimple_assign_rhs_code (g) != SAD_EXPR))) + scalar_stmts.quick_push (next_info); + } + if (scalar_stmts.length () > 1) + { + vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; + vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group, + scalar_stmts, roots, remain, + max_tree_size, &limit, bst_map, NULL); + } + else + scalar_stmts.release (); + } } hash_set<slp_tree> visited_patterns;
[gcc r15-362] tree-optimization/114998 - use-after-free with loop distribution
https://gcc.gnu.org/g:34d15a4d630a0d54eddb99bdab086c506e10dac5 commit r15-362-g34d15a4d630a0d54eddb99bdab086c506e10dac5 Author: Richard Biener Date: Fri May 10 14:19:49 2024 +0200 tree-optimization/114998 - use-after-free with loop distribution When loop distribution releases a PHI node of the original IL it can end up clobbering memory that's re-used when it upon releasing its RDG resets all stmt UIDs back to -1, even those that got released. The fix is to avoid resetting UIDs based on stmts in the RDG but instead reset only those still present in the loop. PR tree-optimization/114998 * tree-loop-distribution.cc (free_rdg): Take loop argument. Reset UIDs of stmts still in the IL rather than all stmts referenced from the RDG. (loop_distribution::build_rdg): Pass loop to free_rdg. (loop_distribution::distribute_loop): Likewise. (loop_distribution::transform_reduction_loop): Likewise. * gcc.dg/torture/pr114998.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr114998.c | 35 + gcc/tree-loop-distribution.cc | 24 -- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr114998.c b/gcc/testsuite/gcc.dg/torture/pr114998.c new file mode 100644 index ..81fc1e077cb9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr114998.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-fno-tree-dce -ftree-loop-distribution" } */ + +short a, d; +int b, c, f, g, h, i, j[2], o; +__attribute__((const)) int s(char r); +int main() { + int l, m, k, n; + if (b) { +char p; +for (; p >= 0; p--) { + int e[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, + 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, + 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0}; + if (j[p]) { +int q[1]; +i = o; +o = q[h]; +if (g) + n = d; +m = 4; +for (; m; m--) { + if (l) +k |= c; + if (a) +break; +} + } + s(n); + f |= b; +} + } + return 0; +} diff --git a/gcc/tree-loop-distribution.cc 
b/gcc/tree-loop-distribution.cc index 95203fefa188..45932bae5e7f 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -778,7 +778,7 @@ loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts) /* Free the reduced dependence graph RDG. */ static void -free_rdg (struct graph *rdg) +free_rdg (struct graph *rdg, loop_p loop) { int i; @@ -792,13 +792,25 @@ free_rdg (struct graph *rdg) if (v->data) { - gimple_set_uid (RDGV_STMT (v), -1); (RDGV_DATAREFS (v)).release (); free (v->data); } } free_graph (rdg); + + /* Reset UIDs of stmts still in the loop. */ + basic_block *bbs = get_loop_body (loop); + for (unsigned i = 0; i < loop->num_nodes; ++i) +{ + basic_block bb = bbs[i]; + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_uid (gsi_stmt (gsi), -1); +} + free (bbs); } struct graph * @@ -812,7 +824,7 @@ loop_distribution::build_rdg (class loop *loop, control_dependences *cd) rdg = new_graph (stmts.length ()); if (!create_rdg_vertices (rdg, stmts, loop)) { - free_rdg (rdg); + free_rdg (rdg, loop); return NULL; } stmts.release (); @@ -3062,7 +3074,7 @@ loop_distribution::distribute_loop (class loop *loop, "Loop %d not distributed: too many memory references.\n", loop->num); - free_rdg (rdg); + free_rdg (rdg, loop); loop_nest.release (); free_data_refs (datarefs_vec); delete ddrs_table; @@ -3259,7 +3271,7 @@ loop_distribution::distribute_loop (class loop *loop, FOR_EACH_VEC_ELT (partitions, i, partition) partition_free (partition); - free_rdg (rdg); + free_rdg (rdg, loop); return nbp - *nb_calls; } @@ -3665,7 +3677,7 @@ loop_distribution::transform_reduction_loop (loop_p loop) auto_bitmap partition_stmts; bitmap_set_range (partition_stmts, 0, rdg->n_vertices); find_single_drs (loop, rdg, partition_stmts, &load_dr, &store_dr); - free_rdg (rdg); + free_rdg (rdg, loop); /* Bail out if there is no 
single load. */ if (load_dr == NULL)
[gcc r15-361] Allow patterns in SLP reductions
https://gcc.gnu.org/g:52d4691294c84793b301ad3cc24e277b8c7efe0b commit r15-361-g52d4691294c84793b301ad3cc24e277b8c7efe0b Author: Richard Biener Date: Fri Mar 1 09:29:32 2024 +0100 Allow patterns in SLP reductions The following removes the over-broad rejection of patterns for SLP reductions which is done by removing them from LOOP_VINFO_REDUCTIONS during pattern detection. That's also insufficient in case the pattern only appears on the reduction path. Instead this implements the proper correctness check in vectorizable_reduction and guides SLP discovery to heuristically avoid forming later invalid groups. I also couldn't find any testcase that FAILs when allowing the SLP reductions to form so I've added one. I came across this for single-lane SLP reductions with the all-SLP work where we rely on patterns to properly vectorize COND_EXPR reductions. * tree-vect-patterns.cc (vect_pattern_recog_1): Do not remove reductions involving patterns. * tree-vect-loop.cc (vectorizable_reduction): Reject SLP reduction groups with multiple lane-reducing reductions. * tree-vect-slp.cc (vect_analyze_slp_instance): When discovering SLP reduction groups avoid including lane-reducing ones. * gcc.dg/vect/vect-reduc-sad-9.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c | 68 gcc/tree-vect-loop.cc| 15 ++ gcc/tree-vect-patterns.cc| 13 -- gcc/tree-vect-slp.cc | 26 +++ 4 files changed, 101 insertions(+), 21 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c new file mode 100644 index ..3c6af4510f45 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c @@ -0,0 +1,68 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-additional-options "-msse4.2" { target { x86_64-*-* i?86-*-* } } } */ +/* { dg-require-effective-target vect_usad_char } */ + +#include +#include "tree-vect.h" + +#define N 64 + +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +int abs (int); + +/* Sum of absolute differences between arrays of unsigned char types. + Detected as a sad pattern. + Vectorized on targets that support sad for unsigned chars. */ + +__attribute__ ((noinline)) int +foo (int len, int *res2) +{ + int i; + int result = 0; + int result2 = 0; + + for (i = 0; i < len; i++) +{ + /* Make sure we are not using an SLP reduction for this. */ + result += abs (X[2*i] - Y[2*i]); + result2 += abs (X[2*i + 1] - Y[2*i + 1]); +} + + *res2 = result2; + return result; +} + + +int +main (void) +{ + int i; + int sad; + + check_vect (); + + for (i = 0; i < N/2; i++) +{ + X[2*i] = i; + Y[2*i] = N/2 - i; + X[2*i+1] = i; + Y[2*i+1] = 0; + __asm__ volatile (""); +} + + + int sad2; + sad = foo (N/2, ); + if (sad != (N/2)*(N/4)) +abort (); + if (sad2 != (N/2-1)*(N/2)/2) +abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 704df7bdcc73..361aec064884 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7667,6 +7667,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo, return false; } + /* Lane-reducing ops also never can be used in a SLP reduction group + since we'll mix lanes belonging to different reductions. But it's + OK to use them in a reduction chain or when the reduction group + has just one element. 
*/ + if (lane_reduc_code_p + && slp_node + && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && SLP_TREE_LANES (slp_node) > 1) +{ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +"lane-reducing reduction in reduction group.\n"); + return false; +} + /* All uses but the last are expected to be defined in the loop. The last use is the reduction variable. In case of nested cycle this assumption is not true: we use reduc_index to record the index of the diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index 8e8de5ea3a55..dfb7d8005262 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -7160,7 +7160,6 @@ vect_pattern_recog_1 (vec_info *vinfo, vect_recog_func *recog_func, stmt_vec_info stmt_info) { gimple *pattern_stmt; - loop_vec_info loop_vinfo; tree pattern_vectype; /* If this
[gcc r13-8727] tree-optimization/114375 - disallow SLP discovery of permuted mask loads
https://gcc.gnu.org/g:4f2a35a76cca503749c696e7772d2e8eadc77ba5 commit r13-8727-g4f2a35a76cca503749c696e7772d2e8eadc77ba5 Author: Richard Biener Date: Mon Mar 18 12:39:03 2024 +0100 tree-optimization/114375 - disallow SLP discovery of permuted mask loads We cannot currently handle permutations of mask loads in code generation or permute optimization. But we simply drop any permutation on the floor, so the following instead rejects the SLP build rather than producing wrong-code. I've also made sure to reject them in vectorizable_load for completeness. PR tree-optimization/114375 * tree-vect-slp.cc (vect_build_slp_tree_2): Compute the load permutation for masked loads but reject it when any such is necessary. * tree-vect-stmts.cc (vectorizable_load): Reject masked VMAT_ELEMENTWISE and VMAT_STRIDED_SLP as those are not supported. * gcc.dg/vect/vect-pr114375.c: New testcase. (cherry picked from commit 94c3508c5a14d1948fe3bffa9e16c6f3d9c2836a) Diff: --- gcc/testsuite/gcc.dg/vect/vect-pr114375.c | 44 +++ gcc/tree-vect-slp.cc | 34 +++- gcc/tree-vect-stmts.cc| 8 ++ 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c new file mode 100644 index ..1e1cb0123d07 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c @@ -0,0 +1,44 @@ +/* { dg-additional-options "-mavx2" { target avx2_runtime } } */ + +#include "tree-vect.h" + +int a[512]; +int b[512]; +int c[512]; + +void __attribute__((noipa)) +foo(int * __restrict p) +{ + for (int i = 0; i < 64; ++i) +{ + int tem = 2, tem2 = 2; + if (a[4*i + 1]) +tem = p[4*i]; + if (a[4*i]) +tem2 = p[4*i + 2]; + b[2*i] = tem2; + b[2*i+1] = tem; + if (a[4*i + 2]) +tem = p[4*i + 1]; + if (a[4*i + 3]) +tem2 = p[4*i + 3]; + c[2*i] = tem2; + c[2*i+1] = tem; +} +} +int main() +{ + check_vect (); + + for (int i = 0; i < 512; ++i) +a[i] = (i >> 1) & 1; + + foo (a); + + if (c[0] != 1 || c[1] != 0 || c[2] != 1 || c[3] != 0 + || b[0] != 2 || b[1] != 2 
|| b[2] != 2 || b[3] != 2) +abort (); + + return 0; +} + diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index bbc05fac65ec..c01dc02afff6 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1780,10 +1780,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) { - if (gcall *stmt = dyn_cast (stmt_info->stmt)) - gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) - || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); else { *max_nunits = this_max_nunits; @@ -1799,15 +1797,37 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, load_permutation.create (group_size); stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); + bool any_permute = false; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) { int load_place = vect_get_place_in_interleaving_chain (load_info, first_stmt_info); gcc_assert (load_place != -1); - load_permutation.safe_push (load_place); + any_permute |= load_place != j; + load_permutation.quick_push (load_place); + } + + if (gcall *stmt = dyn_cast (stmt_info->stmt)) + { + gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) + || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); + load_permutation.release (); + /* We cannot handle permuted masked loads, see PR114375. 
*/ + if (any_permute + || (STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SIZE (first_stmt_info) != group_size) + || STMT_VINFO_STRIDED_P (stmt_info)) + { + matches[0] = false; + return NULL; + } + } + else + { + SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; + return node; } - SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; - return node; } } else if (gimple_assign_single_p (stmt_info->stmt) diff --git a/gcc/tree-vect-stmts.cc
[gcc r13-8726] cfgrtl: Fix MEM_EXPR update in duplicate_insn_chain [PR114924]
https://gcc.gnu.org/g:c63704a2d840436797f54e175a2af0cb029889d2 commit r13-8726-gc63704a2d840436797f54e175a2af0cb029889d2 Author: Alex Coplan Date: Fri May 3 09:23:59 2024 +0100 cfgrtl: Fix MEM_EXPR update in duplicate_insn_chain [PR114924] The PR shows that when cfgrtl.cc:duplicate_insn_chain attempts to update the MR_DEPENDENCE_CLIQUE information for a MEM_EXPR we can end up accidentally dropping (e.g.) an ARRAY_REF from the MEM_EXPR and end up replacing it with the underlying MEM_REF. This leads to an inconsistency in the MEM_EXPR information, and could lead to wrong code. While the walk down to the MEM_REF is necessary to update MR_DEPENDENCE_CLIQUE, we should use the outer tree expression for the MEM_EXPR. This patch does that. gcc/ChangeLog: PR rtl-optimization/114924 * cfgrtl.cc (duplicate_insn_chain): When updating MEM_EXPRs, don't strip (e.g.) ARRAY_REFs from the final MEM_EXPR. (cherry picked from commit fe40d525619eee9c2821126390df75068df4773a) Diff: --- gcc/cfgrtl.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/cfgrtl.cc b/gcc/cfgrtl.cc index 149131c2693f..4cb32e4d9bf3 100644 --- a/gcc/cfgrtl.cc +++ b/gcc/cfgrtl.cc @@ -4407,12 +4407,13 @@ duplicate_insn_chain (rtx_insn *from, rtx_insn *to, since MEM_EXPR is shared so make a copy and walk to the subtree again. */ tree new_expr = unshare_expr (MEM_EXPR (*iter)); + tree orig_new_expr = new_expr; if (TREE_CODE (new_expr) == WITH_SIZE_EXPR) new_expr = TREE_OPERAND (new_expr, 0); while (handled_component_p (new_expr)) new_expr = TREE_OPERAND (new_expr, 0); MR_DEPENDENCE_CLIQUE (new_expr) = newc; - set_mem_expr (const_cast <rtx> (*iter), new_expr); + set_mem_expr (const_cast <rtx> (*iter), orig_new_expr); } } }