On Sun, May 18, 2025 at 10:58 PM Andrew Pinski <quic_apin...@quicinc.com> wrote:
>
> This implements a simple copy propagation for aggregates in the similar
> fashion as we already do for copy prop of zeroing.
>
> Right now this only looks at the previous vdef statement but this allows us
> to catch a lot of cases that show up in C++ code.
>
> Also deletes aggregate copies that are to the same location (PR57361), this 
> was
> already done in DSE but we should do it here also since it is simple to add 
> and
> when doing a copy to a temporary and back to itself should be deleted too.
> So we need a variant that tests DSE and one for forwprop.
>
> Also adds a variant of pr22237.c which was found while working on this patch.
>
>         PR tree-optimization/14295
>         PR tree-optimization/108358
>         PR tree-optimization/114169
>
> gcc/ChangeLog:
>
>         * tree-ssa-forwprop.cc (optimize_agr_copyprop): New function.
>         (pass_forwprop::execute): Call optimize_agr_copyprop for load/store 
> statements.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/20031106-6.c: Un-xfail. Add scan for forwprop1.
>         * g++.dg/opt/pr66119.C: Disable forwprop since that does
>         the copy prop now.
>         * gcc.dg/tree-ssa/pr108358-a.c: New test.
>         * gcc.dg/tree-ssa/pr114169-1.c: New test.
>         * gcc.c-torture/execute/builtins/pr22237-1-lib.c: New test.
>         * gcc.c-torture/execute/builtins/pr22237-1.c: New test.
>         * gcc.dg/tree-ssa/pr57361.c: Disable forwprop1.
>         * gcc.dg/tree-ssa/pr57361-1.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
> ---
>  gcc/testsuite/g++.dg/opt/pr66119.C            |   2 +-
>  .../execute/builtins/pr22237-1-lib.c          |  27 +++++
>  .../execute/builtins/pr22237-1.c              |  57 ++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c    |   8 +-
>  gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c    |  33 ++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c    |  39 +++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c     |   9 ++
>  gcc/testsuite/gcc.dg/tree-ssa/pr57361.c       |   2 +-
>  gcc/tree-ssa-forwprop.cc                      | 103 ++++++++++++++++++
>  9 files changed, 276 insertions(+), 4 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
>  create mode 100644 gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
>
> diff --git a/gcc/testsuite/g++.dg/opt/pr66119.C 
> b/gcc/testsuite/g++.dg/opt/pr66119.C
> index d1b1845a258..52362e44434 100644
> --- a/gcc/testsuite/g++.dg/opt/pr66119.C
> +++ b/gcc/testsuite/g++.dg/opt/pr66119.C
> @@ -3,7 +3,7 @@
>     the value of MOVE_RATIO now is.  */
>
>  /* { dg-do compile  { target { { i?86-*-* x86_64-*-* } && c++11 } }  }  */
> -/* { dg-options "-O3 -mavx -fdump-tree-sra -march=slm -mtune=slm 
> -fno-early-inlining" } */
> +/* { dg-options "-O3 -mavx -fdump-tree-sra -fno-tree-forwprop -march=slm 
> -mtune=slm -fno-early-inlining" } */
>  // { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib 
> } }
>
>  #include <immintrin.h>
> diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c 
> b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
> new file mode 100644
> index 00000000000..44032357405
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
> @@ -0,0 +1,27 @@
> +extern void abort (void);
> +
> +void *
> +memcpy (void *dst, const void *src, __SIZE_TYPE__ n)
> +{
> +  const char *srcp;
> +  char *dstp;
> +
> +  srcp = src;
> +  dstp = dst;
> +
> +  if (dst < src)
> +    {
> +      if (dst + n > src)
> +       abort ();
> +    }
> +  else
> +    {
> +      if (src + n > dst)
> +       abort ();
> +    }
> +
> +  while (n-- != 0)
> +    *dstp++ = *srcp++;
> +
> +  return dst;
> +}
> diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c 
> b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
> new file mode 100644
> index 00000000000..0a12b0fc9a1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
> @@ -0,0 +1,57 @@
> +extern void abort (void);
> +extern void exit (int);
> +struct s { unsigned char a[256]; };
> +union u { struct { struct s b; int c; } d; struct { int c; struct s b; } e; 
> };
> +static union u v;
> +static union u v0;
> +static struct s *p = &v.d.b;
> +static struct s *q = &v.e.b;
> +
> +struct outers
> +{
> +  struct s inner;
> +};
> +
> +static inline struct s rp (void) { return *p; }
> +static inline struct s rq (void) { return *q; }
> +static void pq (void)
> +{
> +  struct outers o = {rq () };
> +  *p = o.inner;
> +}
> +static void qp (void)
> +{
> +  struct outers o = {rp () };
> +  *q  = o.inner;
> +}
> +
> +static void
> +init (struct s *sp)
> +{
> +  int i;
> +  for (i = 0; i < 256; i++)
> +    sp->a[i] = i;
> +}
> +
> +static void
> +check (struct s *sp)
> +{
> +  int i;
> +  for (i = 0; i < 256; i++)
> +    if (sp->a[i] != i)
> +      abort ();
> +}
> +
> +void
> +main_test (void)
> +{
> +  v = v0;
> +  init (p);
> +  qp ();
> +  check (q);
> +  v = v0;
> +  init (q);
> +  pq ();
> +  check (p);
> +  exit (0);
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> index 56d1887bd78..c7e00887c16 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> @@ -1,5 +1,7 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized" } */
> +/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized 
> -fdump-tree-forwprop1-details" } */
> +
> +/* PR tree-optimization/14295 */
>
>  extern void link_error (void);
>
> @@ -25,4 +27,6 @@ struct s foo (struct s r)
>
>  /* There should be no references to any of "temp_struct*"
>     temporaries.  */
> -/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" { xfail 
> *-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" } } */
> +/* Also check that forwprop pass did the copy prop. */
> +/* { dg-final { scan-tree-dump-times "after previous" 3 "forwprop1" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
> new file mode 100644
> index 00000000000..342e1c1a5c2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -fdump-tree-optimized" } */
> +
> +/* PR tree-optimization/108358 */
> +
> +struct a {
> +  int b;
> +  int c;
> +  short d;
> +  int e;
> +  int f;
> +};
> +struct g {
> +  struct a f;
> +  struct a h;
> +};
> +int i;
> +void foo();
> +void bar31_(void);
> +int main() {
> +  struct g j, l = {2, 1, 6, 1, 1, 7, 5, 1, 0, 1};
> +  for (; i; ++i)
> +    bar31_();
> +  j = l;
> +  struct g m = j;
> +  struct g k = m;
> +  if (k.h.b)
> +    ;
> +  else
> +    foo();
> +}
> +/* The call to foo should be optimized away. */
> +/* { dg-final { scan-tree-dump-not "foo " "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
> new file mode 100644
> index 00000000000..37766fbe296
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop-details -fdump-tree-optimized" } */
> +
> +
> +/* PR tree-optimization/114169 */
> +
> +#include <stdint.h>
> +
> +struct S1 {
> +   uint32_t  f0;
> +   uint8_t  f1;
> +   uint64_t  f2;
> +   uint64_t  f3;
> +   int32_t  f4;
> +};
> +
> +union U8 {
> +   struct S1  f0;
> +   int32_t  f1;
> +   int64_t  f2;
> +   uint8_t  f3;
> +   const int64_t  f4;
> +};
> +
> +/* --- GLOBAL VARIABLES --- */
> +struct S1 g_16 = {4294967293UL,1UL,1UL,0xA9C1C73B017290B1LL,0x5ADF851FL};
> +union U8 g_37 = {{1UL,1UL,0x2361AE7D51263067LL,0xEEFD7F9B64A47447LL,0L}};
> +struct S1 g_50 = 
> {0x0CFC2012L,1UL,0x43E1243B3BE7B8BBLL,0x03C5CEC10C1A6FE1LL,1L};
> +
> +
> +/* --- FORWARD DECLARATIONS --- */
> +
> +void func_32(union U8 e) {
> +  e.f3 = e.f0.f4;
> +  g_16 = e.f0 = g_50;
> +}
> +/* The union e should not make a difference here.  */
> +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1" } } */
> +/* { dg-final { scan-tree-dump "g_16 = g_50;" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
> new file mode 100644
> index 00000000000..0accffd7fe4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-forwprop1-details" } */
> +
> +struct A { int x; double y; };
> +void f (struct A *a) {
> +  *a = *a;
> +}
> +
> +/* { dg-final { scan-tree-dump "into a NOP" "forwprop1"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> index 81f27b3cd1f..7e273dbe2e1 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O -fdump-tree-dse1-details" } */
> +/* { dg-options "-O -fdump-tree-dse1-details -fno-tree-forwprop" } */
>
>  struct A { int x; double y; };
>  void f (struct A *a) {
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 4c048a9a298..412e5faa106 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -1341,6 +1341,103 @@ optimize_memcpy_to_memset (gimple_stmt_iterator 
> *gsip, tree dest, tree src, tree
>      }
>    return true;
>  }
> +/* Optimizes
> +   a = c;
> +   b = a;
> +   into
> +   a = c;
> +   b = c;
> +   GSIP is the second statement and SRC is the common
> +   between the statements.
> +*/
> +static bool
> +optimize_agr_copyprop (gimple_stmt_iterator *gsip, tree dest, tree src)
> +{
> +  gimple *stmt = gsi_stmt (*gsip);

Since you get at the stmt itself here, there seems to be no need to pass
dest and src separately?

> +  if (gimple_has_volatile_ops (stmt))
> +    return false;
> +
> +  tree vuse = gimple_vuse (stmt);
> +  if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)

hopefully the latter doesn't happen too often and the former never?

> +    return false;
> +  /* If the statement is `src = src;` change it to be a NOP. */
> +  if (operand_equal_p (dest, src, 0))
> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fprintf (dump_file, "Simplified\n  ");
> +         print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +         fprintf (dump_file, "into a NOP\n  ");
> +       }
> +      unlink_stmt_vdef (stmt);
> +      release_defs (stmt);
> +      gsi_replace (gsip, gimple_build_nop (), false);
> +      return true;
> +    }
> +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> +  if (!gimple_assign_load_p (defstmt)
> +      || !gimple_store_p (defstmt))
> +    return false;
> +  if (gimple_has_volatile_ops (defstmt))
> +    return false;
> +
> +  tree dest2 = gimple_assign_lhs (defstmt);
> +  tree src2 = gimple_assign_rhs1 (defstmt);
> +
> +  /* If the original store is `src2 = src2;` skip over it. */
> +  if (operand_equal_p (src2, dest2, 0))
> +    return false;
> +  if (!operand_equal_p (src, dest2, 0))
> +    return false;
> +
> +  /* If replacing with the same thing, then change the statement
> +     into a NOP. */
> +  if (operand_equal_p (src2, dest, 0))

Note this is redundant store removal - I'm not sure operand_equal_p
is good enough to catch all cases of effective type changes done?
Esp. as infering the old effective type from the read side (src2)
is impossible in principle.  Consider

  (T{ref-all} *)dest = {};
  dest2 = dest;
  dest = dest2;

now, the last stmt is good to elide (we'll keep a ref-all store), but can
we construct a case where it goes wrong?  Possibly when 'dest'
is a may-alias type but the first store (not visible in the IL as far
as you analyze) was done with the ref-all stripped?

I realize all the other redundant store removal instances have
similar problems (and there's at least one open PR still).

We have refs_same_for_tbaa_p that is used by DOM and RTL CSE.
FRE open-codes something, likewise postreload.

> +    {
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         fprintf (dump_file, "Simplified\n  ");
> +         print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +         fprintf (dump_file, "after previous\n  ");
> +         print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> +         fprintf (dump_file, "into a NOP\n  ");

Can you say "Deleted redundant store" or so for cases we remove
a later store?  That makes it easy to grep for such cases.

> +       }
> +      unlink_stmt_vdef (stmt);
> +      release_defs (stmt);
> +      gsi_replace (gsip, gimple_build_nop (), false);
> +      return true;
> +    }
> +  /* For 2 memory refences and using a temporary to do the copy,
> +     don't remove the temporary as the 2 memory references might overlap.
> +     Note t does not need to be decl as it could be field.
> +     See PR 22237 for full details.
> +     E.g.
> +     t = *a;
> +     *b = t;
> +     Cannot be convert into
> +     t = *a;
> +     *b = *a;
> +  */
> +  if (!DECL_P (dest) && !DECL_P (src2))
> +    return false;

FRE also uses size vs. alignment for further cases but I guess
for aggregates (aka large size) alignment is usually too small to help.

What I see is that you now increase the lifetime of '*a' when
there are other uses of 't' between t = *a; and *b = t;  This
might for example kill stack-reuse.  It might also confuse SRA
heuristics with respect to total scalarization?

IMO it would be worth gating this on single_use (vuse) and thus
elide the t = *a; copy as well here?  Can you add simple statistics
via statistics_counter_event and check whether this makes a difference
on say GCC itself?

Otherwise thanks for working on this.
Richard.

> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "Simplified\n  ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +      fprintf (dump_file, "after previous\n  ");
> +      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> +    }
> +  gimple_assign_set_rhs_from_tree (gsip, unshare_expr (src2));
> +  update_stmt (stmt);
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "into\n  ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +    }
> +  return true;
> +}
>
>  /* *GSI_P is a GIMPLE_CALL to a builtin function.
>     Optimize
> @@ -4717,6 +4814,12 @@ pass_forwprop::execute (function *fun)
>                             changed = true;
>                             break;
>                           }
> +                       if (optimize_agr_copyprop (&gsi, gimple_assign_lhs 
> (stmt),
> +                                      gimple_assign_rhs1 (stmt)))
> +                         {
> +                           changed = true;
> +                           break;
> +                         }
>                       }
>
>                     if (TREE_CODE_CLASS (code) == tcc_comparison)
> --
> 2.43.0
>

Reply via email to