On Wed, Aug 6, 2025 at 7:29 PM Andrew Pinski <quic_apin...@quicinc.com> wrote: > > One thing I noticed while working on copy prop for aggregates is that we start with a memcpy-like statement and then walk backwards. This means we could have a few walks backwards to see there was no statement for zeroing. Instead this changes the walk backwards into a true forwprop. In the future we can expand to forwprop the zeroing into, say, a function argument or something more than a memcpy-like statement. > > This should slightly improve compile-time performance since there will be fewer memset-like statements than memcpy-like ones, and there is only one walk forwards for memset-like statements instead of multiple walks backwards to find the memset. > > Note this does add one extra improvement: the memcpy now does not need to have an address as its dest argument; this could have been done before too, but it was even more noticeable now because the variable became set-but-unused, so it was removed and the check was removed as well. > > There is also a fix to how the ao_ref for the memset/memcpy is set up; before it was just using ao_ref_init, which is wrong since it should have used ao_ref_init_from_ptr_and_size instead. > This part fixes PR 121422.
OK. Thanks, Richard. > Changes since v1: > * v2: Add back limit on the walk which was missed in v1. > Move the call to get_addr_base_and_unit_offset outside > of the vuse loop. > * v3: Remove extra check before the call to optimize_aggr_zeroprop_1. > Fix setting up of ao_ref for memset (PR121422). > > PR tree-optimization/118946 > PR tree-optimization/121422 > > gcc/ChangeLog: > > * tree-ssa-forwprop.cc (optimize_memcpy_to_memset): Remove. > (optimize_aggr_zeroprop_1): New function. > (optimize_aggr_zeroprop): New function. > (simplify_builtin_call): Don't call optimize_memcpy_to_memset > for memcpy but call optimize_aggr_zeroprop for memset. > (pass_forwprop::execute): Don't call optimize_memcpy_to_memset > for aggregate copies but rather call optimize_aggr_zeroprop > for aggregate stores. > > gcc/testsuite/ChangeLog: > > * gcc.dg/pr118946-1.c: New test. > * gcc.dg/torture/pr121422-1.c: New test. > * gcc.dg/torture/pr121422-2.c: New test. > > Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> > --- > gcc/testsuite/gcc.dg/pr118946-1.c | 15 ++ > gcc/testsuite/gcc.dg/torture/pr121422-1.c | 35 +++ > gcc/testsuite/gcc.dg/torture/pr121422-2.c | 36 +++ > gcc/tree-ssa-forwprop.cc | 300 +++++++++++++--------- > 4 files changed, 258 insertions(+), 128 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/pr118946-1.c > create mode 100644 gcc/testsuite/gcc.dg/torture/pr121422-1.c > create mode 100644 gcc/testsuite/gcc.dg/torture/pr121422-2.c > > diff --git a/gcc/testsuite/gcc.dg/pr118946-1.c > b/gcc/testsuite/gcc.dg/pr118946-1.c > new file mode 100644 > index 00000000000..6cf2661f286 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/pr118946-1.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized -fdump-tree-forwprop1-details" } > */ > + > +/* PR tree-optimization/118946 */ > + > +void f(char *a) > +{ > + char t[1024] = {}; > + __builtin_memcpy(a, t, 10); > +} > + > +/* We should be able to optimize the memcpy into a memset here. 
*/ > +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1"} } */ > +/* { dg-final { scan-tree-dump-times "memset " 1 "optimized" } } */ > +/* { dg-final { scan-tree-dump-not "memcpy " "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/torture/pr121422-1.c > b/gcc/testsuite/gcc.dg/torture/pr121422-1.c > new file mode 100644 > index 00000000000..136f80d3ead > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/torture/pr121422-1.c > @@ -0,0 +1,35 @@ > +/* { dg-do run } */ > +/* PR tree-optimization/121422 */ > + > +struct s1 > +{ > + char a[4]; > +}; > +struct s1 b; > +char t[4]; > + > +/* if both t and b startout zero initialized before this function, > + t should end up being: > + {0, 0, 1, 0} > + while b.a should end up being: > + {0, 0, 0, 1} > +*/ > +__attribute__((noipa,noinline)) > +void f(void) > +{ > + b = (struct s1){}; > + b.a[3] = 1; > + /* This memcpy should stay a memcpy and not become memset. */ > + __builtin_memcpy(&t[0], &b.a[1], 3*sizeof(t[0])); > +} > + > + > +int main() > +{ > + f(); > + for(int i = 0; i < 4; i++) > + { > + if (t[i] != (i == 2 ? 1 : 0)) > + __builtin_abort(); > + } > +} > diff --git a/gcc/testsuite/gcc.dg/torture/pr121422-2.c > b/gcc/testsuite/gcc.dg/torture/pr121422-2.c > new file mode 100644 > index 00000000000..570559c6c73 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/torture/pr121422-2.c > @@ -0,0 +1,36 @@ > +/* { dg-do run } */ > +/* PR tree-optimization/121422 */ > + > +struct s1 > +{ > + char a[4]; > +}; > +struct s1 b; > +char t[4]; > + > +/* if both t and b startout zero initialized before this function, > + t should end up being: > + {0, 0, 1, 0} > + while b.a should end up being: > + {0, 0, 0, 1} > +*/ > +__attribute__((noipa,noinline)) > +void f(void) > +{ > + __builtin_memset(&b.a[1], 0, 2); > + b.a[3] = 1; > + /* This memcpy should stay a memcpy and not become memset. 
*/ > + __builtin_memcpy(&t[0], &b.a[1], 3); > +} > + > + > +int main() > +{ > + f(); > + for(int i = 0; i < 4; i++) > + { > + if (t[i] != (i == 2 ? 1 : 0)) > + __builtin_abort(); > + } > +} > + > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc > index 2dc77ccba1d..156ea322867 100644 > --- a/gcc/tree-ssa-forwprop.cc > +++ b/gcc/tree-ssa-forwprop.cc > @@ -1190,117 +1190,55 @@ constant_pointer_difference (tree p1, tree p2) > return NULL_TREE; > } > > - > -/* Optimize > - a = {}; > - b = a; > - into > - a = {}; > - b = {}; > - Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; > - and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ > - > +/* Helper function for optimize_aggr_zeroprop. > + Props the zeroing (memset, VAL) that was done in DEST+OFFSET:LEN > + (DEFSTMT) into the STMT. Returns true if the STMT was updated. */ > static bool > -optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, tree dest, tree src, > tree len) > +optimize_aggr_zeroprop_1 (gimple *defstmt, gimple *stmt, > + tree dest, poly_int64 offset, tree val, > + poly_offset_int len) > { > - ao_ref read; > - gimple *stmt = gsi_stmt (*gsip); > - if (gimple_has_volatile_ops (stmt)) > - return false; > - > - tree src2 = NULL_TREE, len2 = NULL_TREE; > - poly_int64 offset, offset2; > - tree val = integer_zero_node; > - bool len_was_null = len == NULL_TREE; > - if (len == NULL_TREE) > - len = (TREE_CODE (src) == COMPONENT_REF > - ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1)) > - : TYPE_SIZE_UNIT (TREE_TYPE (src))); > - if (len == NULL_TREE > - || !poly_int_tree_p (len)) > - return false; > + tree src2; > + tree len2 = NULL_TREE; > + poly_int64 offset2; > > - ao_ref_init (&read, src); > - tree vuse = gimple_vuse (stmt); > - gimple *defstmt; > - unsigned limit = param_sccvn_max_alias_queries_per_access; > - do { > - /* If the vuse is the default definition, then there is no stores > beforhand. 
*/ > - if (SSA_NAME_IS_DEFAULT_DEF (vuse)) > - return false; > - defstmt = SSA_NAME_DEF_STMT (vuse); > - if (is_a <gphi*>(defstmt)) > - return false; > - if (limit-- == 0) > - return false; > - /* If the len was null, then we can use TBBA. */ > - if (stmt_may_clobber_ref_p_1 (defstmt, &read, > - /* tbaa_p = */ len_was_null)) > - break; > - vuse = gimple_vuse (defstmt); > - } while (true); > - > - if (gimple_store_p (defstmt) > - && gimple_assign_single_p (defstmt) > - && TREE_CODE (gimple_assign_rhs1 (defstmt)) == STRING_CST > - && !gimple_clobber_p (defstmt)) > + if (gimple_call_builtin_p (stmt, BUILT_IN_MEMCPY) > + && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR > + && poly_int_tree_p (gimple_call_arg (stmt, 2))) > { > - tree str = gimple_assign_rhs1 (defstmt); > - src2 = gimple_assign_lhs (defstmt); > - /* The string must contain all null char's for now. */ > - for (int i = 0; i < TREE_STRING_LENGTH (str); i++) > - { > - if (TREE_STRING_POINTER (str)[i] != 0) > - { > - src2 = NULL_TREE; > - break; > - } > - } > - } > - else if (gimple_store_p (defstmt) > - && gimple_assign_single_p (defstmt) > - && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR > - && !gimple_clobber_p (defstmt)) > - src2 = gimple_assign_lhs (defstmt); > - else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET) > - && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR > - && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST) > - { > - src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0); > - len2 = gimple_call_arg (defstmt, 2); > - val = gimple_call_arg (defstmt, 1); > - /* For non-0 val, we'd have to transform stmt from assignment > - into memset (only if dest is addressable). 
*/ > - if (!integer_zerop (val) && is_gimple_assign (stmt)) > - src2 = NULL_TREE; > + src2 = TREE_OPERAND (gimple_call_arg (stmt, 1), 0); > + len2 = gimple_call_arg (stmt, 2); > } > + else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) > + { > + src2 = gimple_assign_rhs1 (stmt); > + len2 = (TREE_CODE (src2) == COMPONENT_REF > + ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > + /* Can only handle zero memsets. */ > + if (!integer_zerop (val)) > + return false; > + } > + else > + return false; > > - if (src2 == NULL_TREE) > - return false; > - > - if (len2 == NULL_TREE) > - len2 = (TREE_CODE (src2) == COMPONENT_REF > - ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1)) > - : TYPE_SIZE_UNIT (TREE_TYPE (src2))); > if (len2 == NULL_TREE > || !poly_int_tree_p (len2)) > return false; > > - src = get_addr_base_and_unit_offset (src, &offset); > src2 = get_addr_base_and_unit_offset (src2, &offset2); > - if (src == NULL_TREE > - || src2 == NULL_TREE > - || maybe_lt (offset, offset2)) > + if (src2 == NULL_TREE > + || maybe_lt (offset2, offset)) > return false; > > - if (!operand_equal_p (src, src2, 0)) > + if (!operand_equal_p (dest, src2, 0)) > return false; > > - /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val. > + /* [ dest + offset, dest + offset + len - 1 ] is set to val. > Make sure that > - [ src + offset, src + offset + len - 1 ] is a subset of that. */ > - if (maybe_gt (wi::to_poly_offset (len) + (offset - offset2), > - wi::to_poly_offset (len2))) > + [ dest + offset2, dest + offset2 + len2 - 1 ] is a subset of that. 
*/ > + if (maybe_gt (wi::to_poly_offset (len2) + (offset2 - offset), > + len)) > return false; > > if (dump_file && (dump_flags & TDF_DETAILS)) > @@ -1310,7 +1248,7 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, > tree dest, tree src, tree > fprintf (dump_file, "after previous\n "); > print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > } > - > + gimple *orig_stmt = stmt; > /* For simplicity, don't change the kind of the stmt, > turn dest = src; into dest = {}; and memcpy (&dest, &src, len); > into memset (&dest, val, len); > @@ -1320,8 +1258,10 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, > tree dest, tree src, tree > of dest, dest isn't volatile. */ > if (is_gimple_assign (stmt)) > { > - tree ctor = build_constructor (TREE_TYPE (dest), NULL); > - gimple_assign_set_rhs_from_tree (gsip, ctor); > + tree ctor_type = TREE_TYPE (gimple_assign_lhs (stmt)); > + tree ctor = build_constructor (ctor_type, NULL); > + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); > + gimple_assign_set_rhs_from_tree (&gsi, ctor); > update_stmt (stmt); > statistics_counter_event (cfun, "copy zeroing propagation of > aggregate", 1); > } > @@ -1341,8 +1281,126 @@ optimize_memcpy_to_memset (gimple_stmt_iterator > *gsip, tree dest, tree src, tree > fprintf (dump_file, "into\n "); > print_gimple_stmt (dump_file, stmt, 0, dump_flags); > } > + > + /* Mark the bb for eh cleanup if needed. 
*/ > + if (maybe_clean_or_replace_eh_stmt (orig_stmt, stmt)) > + bitmap_set_bit (to_purge, gimple_bb (stmt)->index); > + > return true; > } > + > +/* Optimize > + a = {}; // DEST = value ;; LEN(nullptr) > + b = a; > + into > + a = {}; > + b = {}; > + Similarly for memset (&a, ..., sizeof (a)); instead of a = {}; > + and/or memcpy (&b, &a, sizeof (a)); instead of b = a; */ > + > +static bool > +optimize_aggr_zeroprop (gimple_stmt_iterator *gsip) > +{ > + ao_ref read; > + gimple *stmt = gsi_stmt (*gsip); > + if (gimple_has_volatile_ops (stmt)) > + return false; > + > + tree dest = NULL_TREE; > + tree val = integer_zero_node; > + tree len = NULL_TREE; > + bool can_use_tbba = true; > + bool changed = false; > + > + if (gimple_call_builtin_p (stmt, BUILT_IN_MEMSET) > + && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR > + && TREE_CODE (gimple_call_arg (stmt, 1)) == INTEGER_CST > + && poly_int_tree_p (gimple_call_arg (stmt, 2))) > + { > + dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0); > + len = gimple_call_arg (stmt, 2); > + val = gimple_call_arg (stmt, 1); > + ao_ref_init_from_ptr_and_size (&read, gimple_call_arg (stmt, 0), len); > + can_use_tbba = false; > + } > + else if (gimple_store_p (stmt) > + && gimple_assign_single_p (stmt) > + && TREE_CODE (gimple_assign_rhs1 (stmt)) == STRING_CST) > + { > + tree str = gimple_assign_rhs1 (stmt); > + dest = gimple_assign_lhs (stmt); > + ao_ref_init (&read, dest); > + /* The string must contain all null char's for now. 
*/ > + for (int i = 0; i < TREE_STRING_LENGTH (str); i++) > + { > + if (TREE_STRING_POINTER (str)[i] != 0) > + { > + dest = NULL_TREE; > + break; > + } > + } > + } > + else if (gimple_store_p (stmt) > + && gimple_assign_single_p (stmt) > + && TREE_CODE (gimple_assign_rhs1 (stmt)) == CONSTRUCTOR > + && !gimple_clobber_p (stmt)) > + { > + dest = gimple_assign_lhs (stmt); > + ao_ref_init (&read, dest); > + } > + > + if (dest == NULL_TREE) > + return false; > + > + if (len == NULL_TREE) > + len = (TREE_CODE (dest) == COMPONENT_REF > + ? DECL_SIZE_UNIT (TREE_OPERAND (dest, 1)) > + : TYPE_SIZE_UNIT (TREE_TYPE (dest))); > + if (len == NULL_TREE > + || !poly_int_tree_p (len)) > + return false; > + > + /* This store needs to be on the byte boundary and pointing to an object. > */ > + poly_int64 offset; > + tree dest_base = get_addr_base_and_unit_offset (dest, &offset); > + if (dest_base == NULL_TREE) > + return false; > + > + /* Setup the worklist. */ > + auto_vec<std::pair<tree, unsigned>> worklist; > + unsigned limit = param_sccvn_max_alias_queries_per_access; > + worklist.safe_push (std::make_pair (gimple_vdef (stmt), limit)); > + > + while (!worklist.is_empty ()) > + { > + std::pair<tree, unsigned> top = worklist.pop (); > + tree vdef = top.first; > + limit = top.second; > + gimple *use_stmt; > + imm_use_iterator iter; > + FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef) > + { > + /* Handling PHI nodes might not be worth it so don't. */ > + if (is_a <gphi*> (use_stmt)) > + continue; > + > + /* If this statement does not clobber add the vdef stmt to the > + worklist. 
*/ > + if (gimple_vdef (use_stmt) > + && !stmt_may_clobber_ref_p_1 (use_stmt, &read, > + /* tbaa_p = */ can_use_tbba) > + && limit != 0) > + worklist.safe_push (std::make_pair (gimple_vdef (use_stmt), > + limit - 1)); > + > + if (optimize_aggr_zeroprop_1 (stmt, use_stmt, dest_base, offset, > + val, wi::to_poly_offset (len))) > + changed = true; > + } > + } > + > + return changed; > +} > /* Optimizes > DEST = SRC; > DEST2 = DEST; # DEST2 = SRC2; > @@ -1462,22 +1520,6 @@ simplify_builtin_call (gimple_stmt_iterator *gsi_p, > tree callee2) > > switch (DECL_FUNCTION_CODE (callee2)) > { > - case BUILT_IN_MEMCPY: > - if (gimple_call_num_args (stmt2) == 3) > - { > - tree dest = gimple_call_arg (stmt2, 0); > - tree src = gimple_call_arg (stmt2, 1); > - tree len = gimple_call_arg (stmt2, 2); > - /* Try to optimize the memcpy to memset if src > - and dest are addresses. */ > - if (TREE_CODE (dest) == ADDR_EXPR > - && TREE_CODE (src) == ADDR_EXPR > - && TREE_CODE (len) == INTEGER_CST > - && optimize_memcpy_to_memset (gsi_p, TREE_OPERAND (dest, 0), > - TREE_OPERAND (src, 0), len)) > - return true; > - } > - break; > case BUILT_IN_MEMCHR: > if (gimple_call_num_args (stmt2) == 3 > && (res = gimple_call_lhs (stmt2)) != nullptr > @@ -1539,6 +1581,13 @@ simplify_builtin_call (gimple_stmt_iterator *gsi_p, > tree callee2) > break; > > case BUILT_IN_MEMSET: > + if (gimple_call_num_args (stmt2) == 3) > + { > + /* Try to prop the zeroing/value of the memset to memcpy > + if the dest is an address and the value is a constant. 
*/ > + if (optimize_aggr_zeroprop (gsi_p)) > + return true; > + } > if (gimple_call_num_args (stmt2) != 3 > || gimple_call_lhs (stmt2) > || CHAR_BIT != 8 > @@ -4857,21 +4906,16 @@ pass_forwprop::execute (function *fun) > { > tree rhs1 = gimple_assign_rhs1 (stmt); > enum tree_code code = gimple_assign_rhs_code (stmt); > - if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)) > + if (gimple_store_p (stmt) && optimize_aggr_zeroprop > (&gsi)) > { > - if (optimize_memcpy_to_memset (&gsi, > - gimple_assign_lhs > (stmt), > - gimple_assign_rhs1 > (stmt), > - /* len = */NULL_TREE)) > - { > - changed = true; > - break; > - } > - if (optimize_agr_copyprop (&gsi)) > - { > - changed = true; > - break; > - } > + changed = true; > + break; > + } > + if (gimple_assign_load_p (stmt) && gimple_store_p (stmt) > + && optimize_agr_copyprop (&gsi)) > + { > + changed = true; > + break; > } > > if (TREE_CODE_CLASS (code) == tcc_comparison) > -- > 2.43.0 >