On Wed, Aug 6, 2025 at 7:29 PM Andrew Pinski <quic_apin...@quicinc.com> wrote:
>
> One thing I noticed while working on copy prop for aggregates is that we 
> start with
> a memcpy like statement and then walk backwards. This means we could have a 
> few walks
> backwards to see there was no statement for zeroing. Instead this changes the 
> walk
> backwards into a true forwprop. In the future we can expand to forwprop the 
> zeroing
> into, say, a function argument or something more than a memcpy-like statement.
>
> This should slightly improve compile-time performance, since there will 
> be fewer
> memset-like statements than memcpy-like ones, and there is only one walk forwards for 
> memset-like
> statements instead of multiple walks backwards to find the memset.
>
> Note this does add one extra improvement: the memcpy now does not need to 
> have an address
> as its dest argument; this could have been done before too, but it was even 
> more noticeable
> now because the variable became set-only, so it was removed and the check 
> was removed
> as well.
>
> There is also a fix to how the ao_ref for the memset/memcpy is set up; before, it 
> was just using
> ao_ref_init, which is wrong since it should instead have used 
> ao_ref_init_from_ptr_and_size.
> This part fixes PR 121422.

OK.

Thanks,
Richard.

> Changes since v1:
> * v2: Add back limit on the walk which was missed in v1.
>       Move the call to get_addr_base_and_unit_offset outside
>         of the vuse loop.
> * v3: Remove extra check before the call to optimize_aggr_zeroprop_1.
>       Fix setting up of ao_ref for memset (PR121422).
>
>         PR tree-optimization/118946
>         PR tree-optimization/121422
>
> gcc/ChangeLog:
>
>         * tree-ssa-forwprop.cc (optimize_memcpy_to_memset): Remove.
>         (optimize_aggr_zeroprop_1): New function.
>         (optimize_aggr_zeroprop): New function.
>         (simplify_builtin_call): Don't call optimize_memcpy_to_memset
>         for memcpy but call optimize_aggr_zeroprop for memset.
>         (pass_forwprop::execute): Don't call optimize_memcpy_to_memset
>         for aggregate copies but rather call optimize_aggr_zeroprop
>         for aggregate stores.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/pr118946-1.c: New test.
>         * gcc.dg/torture/pr121422-1.c: New test.
>         * gcc.dg/torture/pr121422-2.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
> ---
>  gcc/testsuite/gcc.dg/pr118946-1.c         |  15 ++
>  gcc/testsuite/gcc.dg/torture/pr121422-1.c |  35 +++
>  gcc/testsuite/gcc.dg/torture/pr121422-2.c |  36 +++
>  gcc/tree-ssa-forwprop.cc                  | 300 +++++++++++++---------
>  4 files changed, 258 insertions(+), 128 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr118946-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/torture/pr121422-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/torture/pr121422-2.c
>
> diff --git a/gcc/testsuite/gcc.dg/pr118946-1.c 
> b/gcc/testsuite/gcc.dg/pr118946-1.c
> new file mode 100644
> index 00000000000..6cf2661f286
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr118946-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized -fdump-tree-forwprop1-details" } 
> */
> +
> +/* PR tree-optimization/118946 */
> +
> +void f(char *a)
> +{
> +  char t[1024] = {};
> +  __builtin_memcpy(a, t, 10);
> +}
> +
> +/* We should be able to optimize the memcpy into a memset here. */
> +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1"} } */
> +/* { dg-final { scan-tree-dump-times "memset " 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "memcpy " "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/torture/pr121422-1.c 
> b/gcc/testsuite/gcc.dg/torture/pr121422-1.c
> new file mode 100644
> index 00000000000..136f80d3ead
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/torture/pr121422-1.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* PR tree-optimization/121422 */
> +
> +struct s1
> +{
> +  char a[4];
> +};
> +struct s1 b;
> +char t[4];
> +
> +/* if both t and b startout zero initialized before this function,
> +   t should end up being:
> +   {0, 0, 1, 0}
> +   while b.a should end up being:
> +   {0, 0, 0, 1}
> +*/
> +__attribute__((noipa,noinline))
> +void f(void)
> +{
> +  b = (struct s1){};
> +  b.a[3] = 1;
> +  /* This memcpy should stay a memcpy and not become memset. */
> +  __builtin_memcpy(&t[0], &b.a[1], 3*sizeof(t[0]));
> +}
> +
> +
> +int main()
> +{
> +  f();
> +  for(int i = 0; i < 4; i++)
> +  {
> +        if (t[i] != (i == 2 ? 1 : 0))
> +          __builtin_abort();
> +  }
> +}
> diff --git a/gcc/testsuite/gcc.dg/torture/pr121422-2.c 
> b/gcc/testsuite/gcc.dg/torture/pr121422-2.c
> new file mode 100644
> index 00000000000..570559c6c73
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/torture/pr121422-2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* PR tree-optimization/121422 */
> +
> +struct s1
> +{
> +  char a[4];
> +};
> +struct s1 b;
> +char t[4];
> +
> +/* if both t and b startout zero initialized before this function,
> +   t should end up being:
> +   {0, 0, 1, 0}
> +   while b.a should end up being:
> +   {0, 0, 0, 1}
> +*/
> +__attribute__((noipa,noinline))
> +void f(void)
> +{
> +  __builtin_memset(&b.a[1], 0, 2);
> +  b.a[3] = 1;
> +  /* This memcpy should stay a memcpy and not become memset. */
> +  __builtin_memcpy(&t[0], &b.a[1], 3);
> +}
> +
> +
> +int main()
> +{
> +  f();
> +  for(int i = 0; i < 4; i++)
> +  {
> +        if (t[i] != (i == 2 ? 1 : 0))
> +          __builtin_abort();
> +  }
> +}
> +
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 2dc77ccba1d..156ea322867 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -1190,117 +1190,55 @@ constant_pointer_difference (tree p1, tree p2)
>    return NULL_TREE;
>  }
>
> -
> -/* Optimize
> -   a = {};
> -   b = a;
> -   into
> -   a = {};
> -   b = {};
> -   Similarly for memset (&a, ..., sizeof (a)); instead of a = {};
> -   and/or memcpy (&b, &a, sizeof (a)); instead of b = a;  */
> -
> +/* Helper function for optimize_aggr_zeroprop.
> +   Props the zeroing (memset, VAL) that was done in DEST+OFFSET:LEN
> +   (DEFSTMT) into the STMT.  Returns true if the STMT was updated.  */
>  static bool
> -optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, tree dest, tree src, 
> tree len)
> +optimize_aggr_zeroprop_1 (gimple *defstmt, gimple *stmt,
> +                         tree dest, poly_int64 offset, tree val,
> +                         poly_offset_int len)
>  {
> -  ao_ref read;
> -  gimple *stmt = gsi_stmt (*gsip);
> -  if (gimple_has_volatile_ops (stmt))
> -    return false;
> -
> -  tree src2 = NULL_TREE, len2 = NULL_TREE;
> -  poly_int64 offset, offset2;
> -  tree val = integer_zero_node;
> -  bool len_was_null = len == NULL_TREE;
> -  if (len == NULL_TREE)
> -    len = (TREE_CODE (src) == COMPONENT_REF
> -          ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> -          : TYPE_SIZE_UNIT (TREE_TYPE (src)));
> -  if (len == NULL_TREE
> -      || !poly_int_tree_p (len))
> -    return false;
> +  tree src2;
> +  tree len2 = NULL_TREE;
> +  poly_int64 offset2;
>
> -  ao_ref_init (&read, src);
> -  tree vuse = gimple_vuse (stmt);
> -  gimple *defstmt;
> -  unsigned limit = param_sccvn_max_alias_queries_per_access;
> -  do {
> -    /* If the vuse is the default definition, then there is no stores 
> beforhand. */
> -    if (SSA_NAME_IS_DEFAULT_DEF (vuse))
> -      return false;
> -    defstmt = SSA_NAME_DEF_STMT (vuse);
> -    if (is_a <gphi*>(defstmt))
> -      return false;
> -    if (limit-- == 0)
> -      return false;
> -    /* If the len was null, then we can use TBBA. */
> -    if (stmt_may_clobber_ref_p_1 (defstmt, &read,
> -                                 /* tbaa_p = */ len_was_null))
> -      break;
> -    vuse = gimple_vuse (defstmt);
> -  } while (true);
> -
> -  if (gimple_store_p (defstmt)
> -      && gimple_assign_single_p (defstmt)
> -      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == STRING_CST
> -      && !gimple_clobber_p (defstmt))
> +  if (gimple_call_builtin_p (stmt, BUILT_IN_MEMCPY)
> +      && TREE_CODE (gimple_call_arg (stmt, 1)) == ADDR_EXPR
> +      && poly_int_tree_p (gimple_call_arg (stmt, 2)))
>      {
> -      tree str = gimple_assign_rhs1 (defstmt);
> -      src2 = gimple_assign_lhs (defstmt);
> -      /* The string must contain all null char's for now.  */
> -      for (int i = 0; i < TREE_STRING_LENGTH (str); i++)
> -       {
> -         if (TREE_STRING_POINTER (str)[i] != 0)
> -           {
> -             src2 = NULL_TREE;
> -             break;
> -           }
> -       }
> -    }
> -  else if (gimple_store_p (defstmt)
> -      && gimple_assign_single_p (defstmt)
> -      && TREE_CODE (gimple_assign_rhs1 (defstmt)) == CONSTRUCTOR
> -      && !gimple_clobber_p (defstmt))
> -    src2 = gimple_assign_lhs (defstmt);
> -  else if (gimple_call_builtin_p (defstmt, BUILT_IN_MEMSET)
> -          && TREE_CODE (gimple_call_arg (defstmt, 0)) == ADDR_EXPR
> -          && TREE_CODE (gimple_call_arg (defstmt, 1)) == INTEGER_CST)
> -    {
> -      src2 = TREE_OPERAND (gimple_call_arg (defstmt, 0), 0);
> -      len2 = gimple_call_arg (defstmt, 2);
> -      val = gimple_call_arg (defstmt, 1);
> -      /* For non-0 val, we'd have to transform stmt from assignment
> -        into memset (only if dest is addressable).  */
> -      if (!integer_zerop (val) && is_gimple_assign (stmt))
> -       src2 = NULL_TREE;
> +      src2 = TREE_OPERAND (gimple_call_arg (stmt, 1), 0);
> +      len2 = gimple_call_arg (stmt, 2);
>      }
> +   else if (gimple_assign_load_p (stmt) && gimple_store_p (stmt))
> +     {
> +       src2 = gimple_assign_rhs1 (stmt);
> +       len2 = (TREE_CODE (src2) == COMPONENT_REF
> +               ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> +               : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
> +       /* Can only handle zero memsets. */
> +       if (!integer_zerop (val))
> +         return false;
> +     }
> +   else
> +     return false;
>
> -  if (src2 == NULL_TREE)
> -    return false;
> -
> -  if (len2 == NULL_TREE)
> -    len2 = (TREE_CODE (src2) == COMPONENT_REF
> -           ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> -           : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
>    if (len2 == NULL_TREE
>        || !poly_int_tree_p (len2))
>      return false;
>
> -  src = get_addr_base_and_unit_offset (src, &offset);
>    src2 = get_addr_base_and_unit_offset (src2, &offset2);
> -  if (src == NULL_TREE
> -      || src2 == NULL_TREE
> -      || maybe_lt (offset, offset2))
> +  if (src2 == NULL_TREE
> +      || maybe_lt (offset2, offset))
>      return false;
>
> -  if (!operand_equal_p (src, src2, 0))
> +  if (!operand_equal_p (dest, src2, 0))
>      return false;
>
> -  /* [ src + offset2, src + offset2 + len2 - 1 ] is set to val.
> +  /* [ dest + offset, dest + offset + len - 1 ] is set to val.
>       Make sure that
> -     [ src + offset, src + offset + len - 1 ] is a subset of that.  */
> -  if (maybe_gt (wi::to_poly_offset (len) + (offset - offset2),
> -               wi::to_poly_offset (len2)))
> +     [ dest + offset2, dest + offset2 + len2 - 1 ] is a subset of that.  */
> +  if (maybe_gt (wi::to_poly_offset (len2) + (offset2 - offset),
> +               len))
>      return false;
>
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1310,7 +1248,7 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, 
> tree dest, tree src, tree
>        fprintf (dump_file, "after previous\n  ");
>        print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
>      }
> -
> +  gimple *orig_stmt = stmt;
>    /* For simplicity, don't change the kind of the stmt,
>       turn dest = src; into dest = {}; and memcpy (&dest, &src, len);
>       into memset (&dest, val, len);
> @@ -1320,8 +1258,10 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, 
> tree dest, tree src, tree
>       of dest, dest isn't volatile.  */
>    if (is_gimple_assign (stmt))
>      {
> -      tree ctor = build_constructor (TREE_TYPE (dest), NULL);
> -      gimple_assign_set_rhs_from_tree (gsip, ctor);
> +      tree ctor_type = TREE_TYPE (gimple_assign_lhs (stmt));
> +      tree ctor = build_constructor (ctor_type, NULL);
> +      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
> +      gimple_assign_set_rhs_from_tree (&gsi, ctor);
>        update_stmt (stmt);
>        statistics_counter_event (cfun, "copy zeroing propagation of 
> aggregate", 1);
>      }
> @@ -1341,8 +1281,126 @@ optimize_memcpy_to_memset (gimple_stmt_iterator 
> *gsip, tree dest, tree src, tree
>        fprintf (dump_file, "into\n  ");
>        print_gimple_stmt (dump_file, stmt, 0, dump_flags);
>      }
> +
> +  /* Mark the bb for eh cleanup if needed.  */
> +  if (maybe_clean_or_replace_eh_stmt (orig_stmt, stmt))
> +    bitmap_set_bit (to_purge, gimple_bb (stmt)->index);
> +
>    return true;
>  }
> +
> +/* Optimize
> +   a = {}; // DEST = value ;; LEN(nullptr)
> +   b = a;
> +   into
> +   a = {};
> +   b = {};
> +   Similarly for memset (&a, ..., sizeof (a)); instead of a = {};
> +   and/or memcpy (&b, &a, sizeof (a)); instead of b = a;  */
> +
> +static bool
> +optimize_aggr_zeroprop (gimple_stmt_iterator *gsip)
> +{
> +  ao_ref read;
> +  gimple *stmt = gsi_stmt (*gsip);
> +  if (gimple_has_volatile_ops (stmt))
> +    return false;
> +
> +  tree dest = NULL_TREE;
> +  tree val = integer_zero_node;
> +  tree len = NULL_TREE;
> +  bool can_use_tbba = true;
> +  bool changed = false;
> +
> +  if (gimple_call_builtin_p (stmt, BUILT_IN_MEMSET)
> +      && TREE_CODE (gimple_call_arg (stmt, 0)) == ADDR_EXPR
> +      && TREE_CODE (gimple_call_arg (stmt, 1)) == INTEGER_CST
> +      && poly_int_tree_p (gimple_call_arg (stmt, 2)))
> +    {
> +      dest = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
> +      len = gimple_call_arg (stmt, 2);
> +      val = gimple_call_arg (stmt, 1);
> +      ao_ref_init_from_ptr_and_size (&read, gimple_call_arg (stmt, 0), len);
> +      can_use_tbba = false;
> +    }
> +  else if (gimple_store_p (stmt)
> +          && gimple_assign_single_p (stmt)
> +          && TREE_CODE (gimple_assign_rhs1 (stmt)) == STRING_CST)
> +    {
> +      tree str = gimple_assign_rhs1 (stmt);
> +      dest = gimple_assign_lhs (stmt);
> +      ao_ref_init (&read, dest);
> +      /* The string must contain all null char's for now.  */
> +      for (int i = 0; i < TREE_STRING_LENGTH (str); i++)
> +       {
> +         if (TREE_STRING_POINTER (str)[i] != 0)
> +           {
> +             dest = NULL_TREE;
> +             break;
> +           }
> +       }
> +    }
> +  else if (gimple_store_p (stmt)
> +          && gimple_assign_single_p (stmt)
> +          && TREE_CODE (gimple_assign_rhs1 (stmt)) == CONSTRUCTOR
> +          && !gimple_clobber_p (stmt))
> +    {
> +      dest = gimple_assign_lhs (stmt);
> +      ao_ref_init (&read, dest);
> +    }
> +
> +  if (dest == NULL_TREE)
> +    return false;
> +
> +  if (len == NULL_TREE)
> +    len = (TREE_CODE (dest) == COMPONENT_REF
> +          ? DECL_SIZE_UNIT (TREE_OPERAND (dest, 1))
> +          : TYPE_SIZE_UNIT (TREE_TYPE (dest)));
> +  if (len == NULL_TREE
> +      || !poly_int_tree_p (len))
> +    return false;
> +
> +  /* This store needs to be on the byte boundary and pointing to an object.  
> */
> +  poly_int64 offset;
> +  tree dest_base = get_addr_base_and_unit_offset (dest, &offset);
> +  if (dest_base == NULL_TREE)
> +    return false;
> +
> +  /* Setup the worklist.  */
> +  auto_vec<std::pair<tree, unsigned>> worklist;
> +  unsigned limit = param_sccvn_max_alias_queries_per_access;
> +  worklist.safe_push (std::make_pair (gimple_vdef (stmt), limit));
> +
> +  while (!worklist.is_empty ())
> +    {
> +      std::pair<tree, unsigned> top = worklist.pop ();
> +      tree vdef = top.first;
> +      limit = top.second;
> +      gimple *use_stmt;
> +      imm_use_iterator iter;
> +      FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
> +       {
> +         /* Handling PHI nodes might not be worth it so don't.  */
> +         if (is_a <gphi*> (use_stmt))
> +           continue;
> +
> +         /* If this statement does not clobber add the vdef stmt to the
> +            worklist.  */
> +         if (gimple_vdef (use_stmt)
> +             && !stmt_may_clobber_ref_p_1 (use_stmt, &read,
> +                                          /* tbaa_p = */ can_use_tbba)
> +             && limit != 0)
> +           worklist.safe_push (std::make_pair (gimple_vdef (use_stmt),
> +                                               limit - 1));
> +
> +         if (optimize_aggr_zeroprop_1 (stmt, use_stmt, dest_base, offset,
> +                                        val, wi::to_poly_offset (len)))
> +          changed = true;
> +       }
> +    }
> +
> +  return changed;
> +}
>  /* Optimizes
>     DEST = SRC;
>     DEST2 = DEST; # DEST2 = SRC2;
> @@ -1462,22 +1520,6 @@ simplify_builtin_call (gimple_stmt_iterator *gsi_p, 
> tree callee2)
>
>    switch (DECL_FUNCTION_CODE (callee2))
>      {
> -    case BUILT_IN_MEMCPY:
> -      if (gimple_call_num_args (stmt2) == 3)
> -       {
> -         tree dest = gimple_call_arg (stmt2, 0);
> -         tree src = gimple_call_arg (stmt2, 1);
> -         tree len = gimple_call_arg (stmt2, 2);
> -         /* Try to optimize the memcpy to memset if src
> -            and dest are addresses. */
> -         if (TREE_CODE (dest) == ADDR_EXPR
> -             && TREE_CODE (src) == ADDR_EXPR
> -             && TREE_CODE (len) == INTEGER_CST
> -             && optimize_memcpy_to_memset (gsi_p, TREE_OPERAND (dest, 0),
> -                                           TREE_OPERAND (src, 0), len))
> -           return true;
> -       }
> -    break;
>      case BUILT_IN_MEMCHR:
>        if (gimple_call_num_args (stmt2) == 3
>           && (res = gimple_call_lhs (stmt2)) != nullptr
> @@ -1539,6 +1581,13 @@ simplify_builtin_call (gimple_stmt_iterator *gsi_p, 
> tree callee2)
>        break;
>
>      case BUILT_IN_MEMSET:
> +      if (gimple_call_num_args (stmt2) == 3)
> +       {
> +         /* Try to prop the zeroing/value of the memset to memcpy
> +            if the dest is an address and the value is a constant. */
> +         if (optimize_aggr_zeroprop (gsi_p))
> +           return true;
> +       }
>        if (gimple_call_num_args (stmt2) != 3
>           || gimple_call_lhs (stmt2)
>           || CHAR_BIT != 8
> @@ -4857,21 +4906,16 @@ pass_forwprop::execute (function *fun)
>                   {
>                     tree rhs1 = gimple_assign_rhs1 (stmt);
>                     enum tree_code code = gimple_assign_rhs_code (stmt);
> -                   if (gimple_assign_load_p (stmt) && gimple_store_p (stmt))
> +                   if (gimple_store_p (stmt) && optimize_aggr_zeroprop 
> (&gsi))
>                       {
> -                       if (optimize_memcpy_to_memset (&gsi,
> -                                                      gimple_assign_lhs 
> (stmt),
> -                                                      gimple_assign_rhs1 
> (stmt),
> -                                                      /* len = */NULL_TREE))
> -                         {
> -                           changed = true;
> -                           break;
> -                         }
> -                       if (optimize_agr_copyprop (&gsi))
> -                         {
> -                           changed = true;
> -                           break;
> -                         }
> +                       changed = true;
> +                       break;
> +                     }
> +                   if (gimple_assign_load_p (stmt) && gimple_store_p (stmt)
> +                       && optimize_agr_copyprop (&gsi))
> +                     {
> +                       changed = true;
> +                       break;
>                       }
>
>                     if (TREE_CODE_CLASS (code) == tcc_comparison)
> --
> 2.43.0
>

Reply via email to