On Wed, 24 May 2023 at 12:41, Richard Biener <rguent...@suse.de> wrote:

> On Wed, 24 May 2023, Christophe Lyon wrote:
>
> > Hi Richard,
> >
> > On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches <
> > gcc-patches@gcc.gnu.org> wrote:
> >
> > > The following fixes code hoisting to properly consider ANTIC_OUT
> instead
> > > of ANTIC_IN.  That's a bit expensive to re-compute but since we no
> > > longer iterate we're doing this only once per BB which should be
> > > acceptable.  This avoids missing hoistings to the end of blocks where
> > > something in the block clobbers the hoisted value.
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> > >
> > >         PR tree-optimization/109849
> > >         * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
> > >         and use that to determine what to hoist.
> > >
> > >         * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
> > >
> >
> > This patch causes a regression on aarch64:
> > gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times
> >  FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2
> >
> >
> > We used to generate:
> >         mov     x6, 0
> >         mov     w7, 55
> >         whilelo p7.d, wzr, w7
> >         .p2align 3,,7
> > .L2:
> >         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
> >         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
> >         cmpne   p6.d, p7/z, z30.d, #0
> >         ld1d    z30.d, p7/z, [x3, x6, lsl 3]
> >         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
> >         movprfx z28, z30
> >         fmla    z28.d, p6/m, z31.d, z29.d
> >         fmla    z31.d, p6/m, z30.d, z29.d
> >         st1d    z28.d, p7, [x1, x6, lsl 3]
> >         st1d    z31.d, p7, [x0, x6, lsl 3]
> >         incd    x6
> >         whilelo p7.d, w6, w7
> >         b.any   .L2
> >
> >
> > But now:
> >         mov     x6, 0
> >         mov     w7, 55
> >         ptrue   p4.b, all
> >         whilelo p7.d, wzr, w7
> >         .p2align 3,,7
> > .L2:
> >         ld1d    z30.d, p7/z, [x5, x6, lsl 3]
> >         ld1d    z31.d, p7/z, [x4, x6, lsl 3]
> >         cmpne   p6.d, p7/z, z30.d, #0
> >         cmpeq   p5.d, p7/z, z30.d, #0
> >         ld1d    z29.d, p6/z, [x2, x6, lsl 3]
> >         ld1d    z28.d, p6/z, [x3, x6, lsl 3]
> >         ld1d    z30.d, p5/z, [x3, x6, lsl 3]
> >         movprfx z27, z31
> >         fmla    z27.d, p4/m, z29.d, z28.d
> >         movprfx z30.d, p6/m, z28.d
> >         fmla    z30.d, p6/m, z31.d, z29.d
> >         st1d    z27.d, p6, [x0, x6, lsl 3]
> >         st1d    z30.d, p7, [x1, x6, lsl 3]
> >         st1d    z31.d, p5, [x0, x6, lsl 3]
> >         incd    x6
> >         whilelo p7.d, w6, w7
> >         b.any   .L2
>
> Thanks for reporting.  I'm testing the following together with
> a testcase that's not architecture specific.
>
> Richard.
>
> From 2340df60dd9192b30b02de5b34f9cb7c16806430 Mon Sep 17 00:00:00 2001
> From: Richard Biener <rguent...@suse.de>
> Date: Wed, 24 May 2023 12:36:28 +0200
> Subject: [PATCH] tree-optimization/109849 - fix fallout of PRE hoisting
> change
> To: gcc-patches@gcc.gnu.org
>
> The PR109849 fix made us no longer hoist some memory loads because
> of the expression set intersection.  We can still avoid to compute
> the union by simply taking the first sets expressions and leave
> the pruning of expressions with values not suitable for hoisting
> to sorted_array_from_bitmap_set.
>
>         PR tree-optimization/109849
>         * tree-ssa-pre.cc (do_hoist_insertion): Do not intersect
>         expressions but take the first sets.
>
>         * gcc.dg/tree-ssa/ssa-hoist-9.c: New testcase.
>

Thanks, I confirm that this fixes the problem I reported on aarch64.

Christophe


> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c | 20 ++++++++++++++++++++
>  gcc/tree-ssa-pre.cc                         | 12 ++++--------
>  2 files changed, 24 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> new file mode 100644
> index 00000000000..388f79fd80f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-9.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-pre-stats" } */
> +
> +int foo (int flag, int * __restrict a, int * __restrict b)
> +{
> +  int res;
> +  if (flag)
> +    res = *a + *b;
> +  else
> +    {
> +      res = *a;
> +      *a = 1;
> +      res += *b;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump "HOIST inserted: 3" "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = \\\*" 2 "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = \[^\r\n\]* \\\+ \[^\r\n\]*;" 1
> "pre" } } */
> diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
> index b1ceea90a8e..7bbfa5ac43d 100644
> --- a/gcc/tree-ssa-pre.cc
> +++ b/gcc/tree-ssa-pre.cc
> @@ -3625,8 +3625,9 @@ do_hoist_insertion (basic_block block)
>
>    /* We have multiple successors, compute ANTIC_OUT by taking the
> intersection
>       of all of ANTIC_IN translating through PHI nodes.  Note we do not
> have to
> -     worry about iteration stability here so just intersect the
> expression sets
> -     as well.  This is a simplification of what we do in
> compute_antic_aux.  */
> +     worry about iteration stability here so just use the expression set
> +     from the first set and prune that by sorted_array_from_bitmap_set.
> +     This is a simplification of what we do in compute_antic_aux.  */
>    bitmap_set_t ANTIC_OUT = bitmap_set_new ();
>    bool first = true;
>    FOR_EACH_EDGE (e, ei, block->succs)
> @@ -3641,15 +3642,10 @@ do_hoist_insertion (basic_block block)
>           bitmap_set_t tmp = bitmap_set_new ();
>           phi_translate_set (tmp, ANTIC_IN (e->dest), e);
>           bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
> -         bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
>           bitmap_set_free (tmp);
>         }
>        else
> -       {
> -         bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN
> (e->dest)->values);
> -         bitmap_and_into (&ANTIC_OUT->expressions,
> -                          &ANTIC_IN (e->dest)->expressions);
> -       }
> +       bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN (e->dest)->values);
>      }
>
>    /* Compute the set of hoistable expressions from ANTIC_OUT.  First
> compute
> --
> 2.35.3
>
>

Reply via email to