Hi Richard,

On Tue, 23 May 2023 at 11:55, Richard Biener via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

> The following fixes code hoisting to properly consider ANTIC_OUT instead
> of ANTIC_IN.  That's a bit expensive to re-compute but since we no
> longer iterate we're doing this only once per BB which should be
> acceptable.  This avoids missing hoistings to the end of blocks where
> something in the block clobbers the hoisted value.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
>         PR tree-optimization/109849
>         * tree-ssa-pre.cc (do_hoist_insertion): Compute ANTIC_OUT
>         and use that to determine what to hoist.
>
>         * gcc.dg/tree-ssa/ssa-hoist-8.c: New testcase.
>

This patch causes a regression on aarch64:
gcc.target/aarch64/sve/fmla_2.c: \\tst1d found 3 times
 FAIL: gcc.target/aarch64/sve/fmla_2.c scan-assembler-times \\tst1d 2


We used to generate:
        mov     x6, 0
        mov     w7, 55
        whilelo p7.d, wzr, w7
        .p2align 3,,7
.L2:
        ld1d    z30.d, p7/z, [x5, x6, lsl 3]
        ld1d    z31.d, p7/z, [x4, x6, lsl 3]
        cmpne   p6.d, p7/z, z30.d, #0
        ld1d    z30.d, p7/z, [x3, x6, lsl 3]
        ld1d    z29.d, p6/z, [x2, x6, lsl 3]
        movprfx z28, z30
        fmla    z28.d, p6/m, z31.d, z29.d
        fmla    z31.d, p6/m, z30.d, z29.d
        st1d    z28.d, p7, [x1, x6, lsl 3]
        st1d    z31.d, p7, [x0, x6, lsl 3]
        incd    x6
        whilelo p7.d, w6, w7
        b.any   .L2


But now:
        mov     x6, 0
        mov     w7, 55
        ptrue   p4.b, all
        whilelo p7.d, wzr, w7
        .p2align 3,,7
.L2:
        ld1d    z30.d, p7/z, [x5, x6, lsl 3]
        ld1d    z31.d, p7/z, [x4, x6, lsl 3]
        cmpne   p6.d, p7/z, z30.d, #0
        cmpeq   p5.d, p7/z, z30.d, #0
        ld1d    z29.d, p6/z, [x2, x6, lsl 3]
        ld1d    z28.d, p6/z, [x3, x6, lsl 3]
        ld1d    z30.d, p5/z, [x3, x6, lsl 3]
        movprfx z27, z31
        fmla    z27.d, p4/m, z29.d, z28.d
        movprfx z30.d, p6/m, z28.d
        fmla    z30.d, p6/m, z31.d, z29.d
        st1d    z27.d, p6, [x0, x6, lsl 3]
        st1d    z30.d, p7, [x1, x6, lsl 3]
        st1d    z31.d, p5, [x0, x6, lsl 3]
        incd    x6
        whilelo p7.d, w6, w7
        b.any   .L2


Christophe

---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c | 22 ++++++++++
>  gcc/tree-ssa-pre.cc                         | 48 ++++++++++++++++++---
>  2 files changed, 64 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> new file mode 100644
> index 00000000000..66bb48e0dc1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-8.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-pre-stats" } */
> +
> +int mem;
> +void foo ();
> +int bar (int flag)
> +{
> +  int res;
> +  foo ();
> +  /* Hoist the load of mem here even though foo () clobbers it.  */
> +  if (flag)
> +    res = mem;
> +  else
> +    {
> +      res = mem;
> +      mem = 2;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump "HOIST inserted: 1" "pre" } } */
> +/* { dg-final { scan-tree-dump-times " = mem;" 1 "pre" } } */
> diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc
> index 1f7eea93c16..d56431b4145 100644
> --- a/gcc/tree-ssa-pre.cc
> +++ b/gcc/tree-ssa-pre.cc
> @@ -3622,19 +3622,51 @@ do_hoist_insertion (basic_block block)
>        && stmt_ends_bb_p (gsi_stmt (last)))
>      return false;
>
> -  /* Compute the set of hoistable expressions from ANTIC_IN.  First
> compute
> +  /* We have multiple successors, compute ANTIC_OUT by taking the
> intersection
> +     of all of ANTIC_IN translating through PHI nodes.  Note we do not
> have to
> +     worry about iteration stability here so just intersect the
> expression sets
> +     as well.  This is a simplification of what we do in
> compute_antic_aux.  */
> +  bitmap_set_t ANTIC_OUT = bitmap_set_new ();
> +  bool first = true;
> +  FOR_EACH_EDGE (e, ei, block->succs)
> +    {
> +      if (first)
> +       {
> +         phi_translate_set (ANTIC_OUT, ANTIC_IN (e->dest), e);
> +         first = false;
> +       }
> +      else if (!gimple_seq_empty_p (phi_nodes (e->dest)))
> +       {
> +         bitmap_set_t tmp = bitmap_set_new ();
> +         phi_translate_set (tmp, ANTIC_IN (e->dest), e);
> +         bitmap_and_into (&ANTIC_OUT->values, &tmp->values);
> +         bitmap_and_into (&ANTIC_OUT->expressions, &tmp->expressions);
> +         bitmap_set_free (tmp);
> +       }
> +      else
> +       {
> +         bitmap_and_into (&ANTIC_OUT->values, &ANTIC_IN
> (e->dest)->values);
> +         bitmap_and_into (&ANTIC_OUT->expressions,
> +                          &ANTIC_IN (e->dest)->expressions);
> +       }
> +    }
> +
> +  /* Compute the set of hoistable expressions from ANTIC_OUT.  First
> compute
>       hoistable values.  */
>    bitmap_set hoistable_set;
>
> -  /* A hoistable value must be in ANTIC_IN(block)
> +  /* A hoistable value must be in ANTIC_OUT(block)
>       but not in AVAIL_OUT(BLOCK).  */
>    bitmap_initialize (&hoistable_set.values, &grand_bitmap_obstack);
>    bitmap_and_compl (&hoistable_set.values,
> -                   &ANTIC_IN (block)->values, &AVAIL_OUT (block)->values);
> +                   &ANTIC_OUT->values, &AVAIL_OUT (block)->values);
>
>    /* Short-cut for a common case: hoistable_set is empty.  */
>    if (bitmap_empty_p (&hoistable_set.values))
> -    return false;
> +    {
> +      bitmap_set_free (ANTIC_OUT);
> +      return false;
> +    }
>
>    /* Compute which of the hoistable values is in AVAIL_OUT of
>       at least one of the successors of BLOCK.  */
> @@ -3652,11 +3684,14 @@ do_hoist_insertion (basic_block block)
>
>    /* Short-cut for a common case: availout_in_some is empty.  */
>    if (bitmap_empty_p (&availout_in_some))
> -    return false;
> +    {
> +      bitmap_set_free (ANTIC_OUT);
> +      return false;
> +    }
>
>    /* Hack hoitable_set in-place so we can use
> sorted_array_from_bitmap_set.  */
>    bitmap_move (&hoistable_set.values, &availout_in_some);
> -  hoistable_set.expressions = ANTIC_IN (block)->expressions;
> +  hoistable_set.expressions = ANTIC_OUT->expressions;
>
>    /* Now finally construct the topological-ordered expression set.  */
>    vec<pre_expr> exprs = sorted_array_from_bitmap_set (&hoistable_set);
> @@ -3731,6 +3766,7 @@ do_hoist_insertion (basic_block block)
>      }
>
>    exprs.release ();
> +  bitmap_set_free (ANTIC_OUT);
>
>    return new_stuff;
>  }
> --
> 2.35.3
>

Reply via email to