On Mon, May 26, 2025 at 5:36 AM Richard Biener
<richard.guent...@gmail.com> wrote:
>
> On Sun, May 18, 2025 at 10:58 PM Andrew Pinski <quic_apin...@quicinc.com> 
> wrote:
> >
> > This implements a simple copy propagation for aggregates in the similar
> > fashion as we already do for copy prop of zeroing.
> >
> > Right now this only looks at the previous vdef statement but this allows us
> > to catch a lot of cases that show up in C++ code.
> >
> > Also deletes aggregate copies that are to the same location (PR57361), this 
> > was
> > already done in DSE but we should do it here also since it is simple to add 
> > and
> > when doing a copy to a temporary and back to itself should be deleted too.
> > So we need a variant that tests DSE and one for forwprop.
> >
> > Also adds a variant of pr22237.c which was found while working on this 
> > patch.
> >
> >         PR tree-optimization/14295
> >         PR tree-optimization/108358
> >         PR tree-optimization/114169
> >
> > gcc/ChangeLog:
> >
> >         * tree-ssa-forwprop.cc (optimize_agr_copyprop): New function.
> >         (pass_forwprop::execute): Call optimize_agr_copyprop for load/store 
> > statements.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.dg/tree-ssa/20031106-6.c: Un-xfail. Add scan for forwprop1.
> >         * g++.dg/opt/pr66119.C: Disable forwprop since that does
> >         the copy prop now.
> >         * gcc.dg/tree-ssa/pr108358-a.c: New test.
> >         * gcc.dg/tree-ssa/pr114169-1.c: New test.
> >         * gcc.c-torture/execute/builtins/pr22237-1-lib.c: New test.
> >         * gcc.c-torture/execute/builtins/pr22237-1.c: New test.
> >         * gcc.dg/tree-ssa/pr57361.c: Disable forwprop1.
> >         * gcc.dg/tree-ssa/pr57361-1.c: New test.
> >
> > Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
> > ---
> >  gcc/testsuite/g++.dg/opt/pr66119.C            |   2 +-
> >  .../execute/builtins/pr22237-1-lib.c          |  27 +++++
> >  .../execute/builtins/pr22237-1.c              |  57 ++++++++++
> >  gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c    |   8 +-
> >  gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c    |  33 ++++++
> >  gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c    |  39 +++++++
> >  gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c     |   9 ++
> >  gcc/testsuite/gcc.dg/tree-ssa/pr57361.c       |   2 +-
> >  gcc/tree-ssa-forwprop.cc                      | 103 ++++++++++++++++++
> >  9 files changed, 276 insertions(+), 4 deletions(-)
> >  create mode 100644 
> > gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
> >  create mode 100644 gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
> >
> > diff --git a/gcc/testsuite/g++.dg/opt/pr66119.C 
> > b/gcc/testsuite/g++.dg/opt/pr66119.C
> > index d1b1845a258..52362e44434 100644
> > --- a/gcc/testsuite/g++.dg/opt/pr66119.C
> > +++ b/gcc/testsuite/g++.dg/opt/pr66119.C
> > @@ -3,7 +3,7 @@
> >     the value of MOVE_RATIO now is.  */
> >
> >  /* { dg-do compile  { target { { i?86-*-* x86_64-*-* } && c++11 } }  }  */
> > -/* { dg-options "-O3 -mavx -fdump-tree-sra -march=slm -mtune=slm 
> > -fno-early-inlining" } */
> > +/* { dg-options "-O3 -mavx -fdump-tree-sra -fno-tree-forwprop -march=slm 
> > -mtune=slm -fno-early-inlining" } */
> >  // { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! 
> > hostedlib } }
> >
> >  #include <immintrin.h>
> > diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c 
> > b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
> > new file mode 100644
> > index 00000000000..44032357405
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
> > @@ -0,0 +1,27 @@
> > +extern void abort (void);
> > +
> > +void *
> > +memcpy (void *dst, const void *src, __SIZE_TYPE__ n)
> > +{
> > +  const char *srcp;
> > +  char *dstp;
> > +
> > +  srcp = src;
> > +  dstp = dst;
> > +
> > +  if (dst < src)
> > +    {
> > +      if (dst + n > src)
> > +       abort ();
> > +    }
> > +  else
> > +    {
> > +      if (src + n > dst)
> > +       abort ();
> > +    }
> > +
> > +  while (n-- != 0)
> > +    *dstp++ = *srcp++;
> > +
> > +  return dst;
> > +}
> > diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c 
> > b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
> > new file mode 100644
> > index 00000000000..0a12b0fc9a1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
> > @@ -0,0 +1,57 @@
> > +extern void abort (void);
> > +extern void exit (int);
> > +struct s { unsigned char a[256]; };
> > +union u { struct { struct s b; int c; } d; struct { int c; struct s b; } 
> > e; };
> > +static union u v;
> > +static union u v0;
> > +static struct s *p = &v.d.b;
> > +static struct s *q = &v.e.b;
> > +
> > +struct outers
> > +{
> > +  struct s inner;
> > +};
> > +
> > +static inline struct s rp (void) { return *p; }
> > +static inline struct s rq (void) { return *q; }
> > +static void pq (void)
> > +{
> > +  struct outers o = {rq () };
> > +  *p = o.inner;
> > +}
> > +static void qp (void)
> > +{
> > +  struct outers o = {rp () };
> > +  *q  = o.inner;
> > +}
> > +
> > +static void
> > +init (struct s *sp)
> > +{
> > +  int i;
> > +  for (i = 0; i < 256; i++)
> > +    sp->a[i] = i;
> > +}
> > +
> > +static void
> > +check (struct s *sp)
> > +{
> > +  int i;
> > +  for (i = 0; i < 256; i++)
> > +    if (sp->a[i] != i)
> > +      abort ();
> > +}
> > +
> > +void
> > +main_test (void)
> > +{
> > +  v = v0;
> > +  init (p);
> > +  qp ();
> > +  check (q);
> > +  v = v0;
> > +  init (q);
> > +  pq ();
> > +  check (p);
> > +  exit (0);
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> > index 56d1887bd78..c7e00887c16 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c
> > @@ -1,5 +1,7 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized" } */
> > +/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized 
> > -fdump-tree-forwprop1-details" } */
> > +
> > +/* PR tree-optimization/14295 */
> >
> >  extern void link_error (void);
> >
> > @@ -25,4 +27,6 @@ struct s foo (struct s r)
> >
> >  /* There should be no references to any of "temp_struct*"
> >     temporaries.  */
> > -/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" { xfail 
> > *-*-* } } } */
> > +/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" } } */
> > +/* Also check that forwprop pass did the copy prop. */
> > +/* { dg-final { scan-tree-dump-times "after previous" 3 "forwprop1" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
> > new file mode 100644
> > index 00000000000..342e1c1a5c2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
> > @@ -0,0 +1,33 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -fdump-tree-optimized" } */
> > +
> > +/* PR tree-optimization/108358 */
> > +
> > +struct a {
> > +  int b;
> > +  int c;
> > +  short d;
> > +  int e;
> > +  int f;
> > +};
> > +struct g {
> > +  struct a f;
> > +  struct a h;
> > +};
> > +int i;
> > +void foo();
> > +void bar31_(void);
> > +int main() {
> > +  struct g j, l = {2, 1, 6, 1, 1, 7, 5, 1, 0, 1};
> > +  for (; i; ++i)
> > +    bar31_();
> > +  j = l;
> > +  struct g m = j;
> > +  struct g k = m;
> > +  if (k.h.b)
> > +    ;
> > +  else
> > +    foo();
> > +}
> > +/* The call to foo should be optimized away. */
> > +/* { dg-final { scan-tree-dump-not "foo " "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
> > new file mode 100644
> > index 00000000000..37766fbe296
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-forwprop-details -fdump-tree-optimized" } 
> > */
> > +
> > +
> > +/* PR tree-optimization/114169 */
> > +
> > +#include <stdint.h>
> > +
> > +struct S1 {
> > +   uint32_t  f0;
> > +   uint8_t  f1;
> > +   uint64_t  f2;
> > +   uint64_t  f3;
> > +   int32_t  f4;
> > +};
> > +
> > +union U8 {
> > +   struct S1  f0;
> > +   int32_t  f1;
> > +   int64_t  f2;
> > +   uint8_t  f3;
> > +   const int64_t  f4;
> > +};
> > +
> > +/* --- GLOBAL VARIABLES --- */
> > +struct S1 g_16 = {4294967293UL,1UL,1UL,0xA9C1C73B017290B1LL,0x5ADF851FL};
> > +union U8 g_37 = {{1UL,1UL,0x2361AE7D51263067LL,0xEEFD7F9B64A47447LL,0L}};
> > +struct S1 g_50 = 
> > {0x0CFC2012L,1UL,0x43E1243B3BE7B8BBLL,0x03C5CEC10C1A6FE1LL,1L};
> > +
> > +
> > +/* --- FORWARD DECLARATIONS --- */
> > +
> > +void func_32(union U8 e) {
> > +  e.f3 = e.f0.f4;
> > +  g_16 = e.f0 = g_50;
> > +}
> > +/* The union e should not make a difference here.  */
> > +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1" } } */
> > +/* { dg-final { scan-tree-dump "g_16 = g_50;" "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
> > new file mode 100644
> > index 00000000000..0accffd7fe4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O -fdump-tree-forwprop1-details" } */
> > +
> > +struct A { int x; double y; };
> > +void f (struct A *a) {
> > +  *a = *a;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "into a NOP" "forwprop1"} } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> > index 81f27b3cd1f..7e273dbe2e1 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O -fdump-tree-dse1-details" } */
> > +/* { dg-options "-O -fdump-tree-dse1-details -fno-tree-forwprop" } */
> >
> >  struct A { int x; double y; };
> >  void f (struct A *a) {
> > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > index 4c048a9a298..412e5faa106 100644
> > --- a/gcc/tree-ssa-forwprop.cc
> > +++ b/gcc/tree-ssa-forwprop.cc
> > @@ -1341,6 +1341,103 @@ optimize_memcpy_to_memset (gimple_stmt_iterator 
> > *gsip, tree dest, tree src, tree
> >      }
> >    return true;
> >  }
> > +/* Optimizes
> > +   a = c;
> > +   b = a;
> > +   into
> > +   a = c;
> > +   b = c;
> > +   GSIP is the second statement and SRC is the common
> > +   between the statements.
> > +*/
> > +static bool
> > +optimize_agr_copyprop (gimple_stmt_iterator *gsip, tree dest, tree src)
> > +{
> > +  gimple *stmt = gsi_stmt (*gsip);
>
> Since you get at the stmt itself here, there seems to be no need to pass
> dest and src separately?

Yes, that makes sense. I was copying what was originally done for
optimize_memcpy_to_memset here.

>
> > +  if (gimple_has_volatile_ops (stmt))
> > +    return false;
> > +
> > +  tree vuse = gimple_vuse (stmt);
> > +  if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)
>
> hopefully the latter doesn't happen too often and the former never?

This was copied from optimize_memcpy_to_memset; let me look into why
it was done. I think the latter now shows up and the former was what
it was when it was part of fold-all-builtins.


>
> > +    return false;
> > +  /* If the statement is `src = src;` change it to be a NOP. */
> > +  if (operand_equal_p (dest, src, 0))
> > +    {
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file, "Simplified\n  ");
> > +         print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > +         fprintf (dump_file, "into a NOP\n  ");
> > +       }
> > +      unlink_stmt_vdef (stmt);
> > +      release_defs (stmt);
> > +      gsi_replace (gsip, gimple_build_nop (), false);
> > +      return true;
> > +    }
> > +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> > +  if (!gimple_assign_load_p (defstmt)
> > +      || !gimple_store_p (defstmt))
> > +    return false;
> > +  if (gimple_has_volatile_ops (defstmt))
> > +    return false;
> > +
> > +  tree dest2 = gimple_assign_lhs (defstmt);
> > +  tree src2 = gimple_assign_rhs1 (defstmt);
> > +
> > +  /* If the original store is `src2 = src2;` skip over it. */
> > +  if (operand_equal_p (src2, dest2, 0))
> > +    return false;
> > +  if (!operand_equal_p (src, dest2, 0))
> > +    return false;
> > +
> > +  /* If replacing with the same thing, then change the statement
> > +     into a NOP. */
> > +  if (operand_equal_p (src2, dest, 0))
>
> Note this is redundant store removal - I'm not sure operand_equal_p
> is good enough to catch all cases of effective type changes done?
> Esp. as infering the old effective type from the read side (src2)
> is impossible in principle.  Consider
>
>   (T{ref-all} *)dest = {};
>   dest2 = dest;
>   dest = dest2;
>
> now, the last stmt is good to elide (we'll keep a ref-all store), but can
> we construct a case where it goes wrong?  Possibly when 'dest'
> is a may-alias type but the first store (not visible in the IL as far
> as you analyze) was done with the ref-all stripped?
>
> I realize all the other redundant store removal instances have
> similar problems (and there's at least one open PR still).

operand_equal_p has some code for MEM_REF to check alias compatible. I
will add a few testcases for this. I did try to find some testcases
where we might delete the store but I could not find it.
Note tree-ssa-dse uses exactly operand_equal_p check on the MEM_REF to
see if it is storing the same thing to itself.
I do wonder if that might break in some cases too.


>
> We have refs_same_for_tbaa_p that is used by DOM and RTL CSE.
> FRE open-codes something, likewise postreload.

I will look into that in a little bit (as I mentioned tree-ssa-dse
might be broken too)


>
> > +    {
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file, "Simplified\n  ");
> > +         print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > +         fprintf (dump_file, "after previous\n  ");
> > +         print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> > +         fprintf (dump_file, "into a NOP\n  ");
>
> Can you say "Deleted redundant store" or so for cases we remove
> a later store?  That makes it easy to grep for such cases.

Yes that makes sense.

>
> > +       }
> > +      unlink_stmt_vdef (stmt);
> > +      release_defs (stmt);
> > +      gsi_replace (gsip, gimple_build_nop (), false);
> > +      return true;
> > +    }
> > +  /* For 2 memory refences and using a temporary to do the copy,
> > +     don't remove the temporary as the 2 memory references might overlap.
> > +     Note t does not need to be decl as it could be field.
> > +     See PR 22237 for full details.
> > +     E.g.
> > +     t = *a;
> > +     *b = t;
> > +     Cannot be convert into
> > +     t = *a;
> > +     *b = *a;
> > +  */
> > +  if (!DECL_P (dest) && !DECL_P (src2))
> > +    return false;
>
> FRE also uses size vs. alignment for further cases but I guess
> for aggregates (aka large size) alignment is usually too small to help.

So operand_equal_p already checks the alignment and size if we have a
MEM_REF.  Unless I am misunderstanding the point here.

>
> What I see is that you now increase the lifetime of '*a' when
> there are other uses of 't' between t = *a; and *b = t;  This
> might for example kill stack-reuse.  It might also confuse SRA
> heuristics with respect to total scalarization?

Yes we do increase the lifetime of `*a`. And the SRA heuristics for
total scalarization is part of the problem and what this change is
trying to avoid in the first place.

>
> IMO it would be worth gating this on single_use (vuse) and thus
> elide the t = *a; copy as well here?  Can you add simple statistics
> via statistics_counter_event and check whether this makes a difference
> on say GCC itself?

I will add the statistics for both this and the memset one which is missing one.

Yes it does makes a difference for GCC itself (at least with earlier
versions of the patch); wi::wide_int and gimple_stmt_iterator are the
structs where the prop happened the most. Though those do get fully
SRA scalarized anyways so overall I think it was just a slight
decrease in compile time.

Thanks,
Andrew Pinski

>
> Otherwise thanks for working on this.
> Richard.
>
> > +
> > +  if (dump_file && (dump_flags & TDF_DETAILS))
> > +    {
> > +      fprintf (dump_file, "Simplified\n  ");
> > +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > +      fprintf (dump_file, "after previous\n  ");
> > +      print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> > +    }
> > +  gimple_assign_set_rhs_from_tree (gsip, unshare_expr (src2));
> > +  update_stmt (stmt);
> > +
> > +  if (dump_file && (dump_flags & TDF_DETAILS))
> > +    {
> > +      fprintf (dump_file, "into\n  ");
> > +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> > +    }
> > +  return true;
> > +}
> >
> >  /* *GSI_P is a GIMPLE_CALL to a builtin function.
> >     Optimize
> > @@ -4717,6 +4814,12 @@ pass_forwprop::execute (function *fun)
> >                             changed = true;
> >                             break;
> >                           }
> > +                       if (optimize_agr_copyprop (&gsi, gimple_assign_lhs 
> > (stmt),
> > +                                      gimple_assign_rhs1 (stmt)))
> > +                         {
> > +                           changed = true;
> > +                           break;
> > +                         }
> >                       }
> >
> >                     if (TREE_CODE_CLASS (code) == tcc_comparison)
> > --
> > 2.43.0
> >

Reply via email to