On Fri, Aug 1, 2025 at 8:10 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Don't hoist non all 0s/1s vector set outside of the loop to avoid extra
> spills.

The patch LGTM.
>
> gcc/
>
>         PR target/120941
>         * config/i386/i386-features.cc (x86_cse_kind): Moved before
>         ix86_place_single_vector_set.
>         (redundant_load): Likewise.
>         (ix86_place_single_vector_set): Replace the last argument to the
>         pointer to redundant_load.  For X86_CSE_VEC_DUP, don't place the
>         vector set outside of the loop to avoid extra spills.
>         (remove_redundant_vector_load): Pass load to
>         ix86_place_single_vector_set.
>
> gcc/testsuite/
>
>         PR target/120941
>         * gcc.target/i386/pr120941-1.c: New test.
>
> Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> ---
>  gcc/config/i386/i386-features.cc           | 107 +++++++++++----------
>  gcc/testsuite/gcc.target/i386/pr120941-1.c |  49 ++++++++++
>  2 files changed, 106 insertions(+), 50 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1.c
>
> diff --git a/gcc/config/i386/i386-features.cc 
> b/gcc/config/i386/i386-features.cc
> index 53e86c8a493..9941e61361c 100644
> --- a/gcc/config/i386/i386-features.cc
> +++ b/gcc/config/i386/i386-features.cc
> @@ -3085,21 +3085,63 @@ ix86_rpad_gate ()
>           && optimize_function_for_speed_p (cfun));
>  }
>
> +enum x86_cse_kind
> +{
> +  X86_CSE_CONST0_VECTOR,
> +  X86_CSE_CONSTM1_VECTOR,
> +  X86_CSE_VEC_DUP
> +};
> +
> +struct redundant_load
> +{
> +  /* Bitmap of basic blocks with broadcast instructions.  */
> +  auto_bitmap bbs;
> +  /* Bitmap of broadcast instructions.  */
> +  auto_bitmap insns;
> +  /* The broadcast inner scalar.  */
> +  rtx val;
> +  /* The inner scalar mode.  */
> +  machine_mode mode;
> +  /* The instruction which sets the inner scalar.  Nullptr if the inner
> +     scalar is applied to the whole function, instead of within the same
> +     block.  */
> +  rtx_insn *def_insn;
> +  /* The widest broadcast source.  */
> +  rtx broadcast_source;
> +  /* The widest broadcast register.  */
> +  rtx broadcast_reg;
> +  /* The basic block of the broadcast instruction.  */
> +  basic_block bb;
> +  /* The number of broadcast instructions with the same inner scalar.  */
> +  unsigned HOST_WIDE_INT count;
> +  /* The threshold of broadcast instructions with the same inner
> +     scalar.  */
> +  unsigned int threshold;
> +  /* The widest broadcast size in bytes.  */
> +  unsigned int size;
> +  /* Load kind.  */
> +  x86_cse_kind kind;
> +};
> +
>  /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
>     for basic block map BBS, which is in the fake loop that contains the
>     whole function, so that there is only a single vector set in the
> -   whole function.  If not nullptr, INNER_SCALAR is the inner scalar of
> -   SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)).  */
> +   whole function.  If not nullptr, LOAD is a pointer to the load.  */
>
>  static void
>  ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
> -                             rtx inner_scalar = nullptr)
> +                             redundant_load *load = nullptr)
>  {
>    basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
> -  while (bb->loop_father->latch
> -        != EXIT_BLOCK_PTR_FOR_FN (cfun))
> -    bb = get_immediate_dominator (CDI_DOMINATORS,
> -                                 bb->loop_father->header);
> +  /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
> +     to avoid extra spills.  */
> +  if (!load || load->kind != X86_CSE_VEC_DUP)
> +    {
> +      while (bb->loop_father->latch
> +            != EXIT_BLOCK_PTR_FOR_FN (cfun))
> +       bb = get_immediate_dominator (CDI_DOMINATORS,
> +                                     bb->loop_father->header);
> +    }
>
>    rtx set = gen_rtx_SET (dest, src);
>
> @@ -3141,8 +3183,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, 
> bitmap bbs,
>         }
>      }
>
> -  if (inner_scalar)
> +  if (load && load->kind == X86_CSE_VEC_DUP)
>      {
> +      /* Get the source from LOAD as (reg:SI 99) in
> +
> +        (vec_duplicate:V4SI (reg:SI 99))
> +
> +       */
> +      rtx inner_scalar = load->val;
>        /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
>        rtx reg = XEXP (src, 0);
>        if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
> @@ -3489,44 +3537,6 @@ replace_vector_const (machine_mode vector_mode, rtx 
> vector_const,
>      }
>  }
>
> -enum x86_cse_kind
> -{
> -  X86_CSE_CONST0_VECTOR,
> -  X86_CSE_CONSTM1_VECTOR,
> -  X86_CSE_VEC_DUP
> -};
> -
> -struct redundant_load
> -{
> -  /* Bitmap of basic blocks with broadcast instructions.  */
> -  auto_bitmap bbs;
> -  /* Bitmap of broadcast instructions.  */
> -  auto_bitmap insns;
> -  /* The broadcast inner scalar.  */
> -  rtx val;
> -  /* The inner scalar mode.  */
> -  machine_mode mode;
> -  /* The instruction which sets the inner scalar.  Nullptr if the inner
> -     scalar is applied to the whole function, instead of within the same
> -     block.  */
> -  rtx_insn *def_insn;
> -  /* The widest broadcast source.  */
> -  rtx broadcast_source;
> -  /* The widest broadcast register.  */
> -  rtx broadcast_reg;
> -  /* The basic block of the broadcast instruction.  */
> -  basic_block bb;
> -  /* The number of broadcast instructions with the same inner scalar.  */
> -  unsigned HOST_WIDE_INT count;
> -  /* The threshold of broadcast instructions with the same inner
> -     scalar.  */
> -  unsigned int threshold;
> -  /* The widest broadcast size in bytes.  */
> -  unsigned int size;
> -  /* Load kind.  */
> -  x86_cse_kind kind;
> -};
> -
>  /* Return the inner scalar if OP is a broadcast, else return nullptr.  */
>
>  static rtx
> @@ -3872,10 +3882,7 @@ remove_redundant_vector_load (void)
>             else
>               ix86_place_single_vector_set (load->broadcast_reg,
>                                             load->broadcast_source,
> -                                           load->bbs,
> -                                           (load->kind == X86_CSE_VEC_DUP
> -                                            ? load->val
> -                                            : nullptr));
> +                                           load->bbs, load);
>           }
>
>        loop_optimizer_finalize ();
> diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1.c 
> b/gcc/testsuite/gcc.target/i386/pr120941-1.c
> new file mode 100644
> index 00000000000..b4fc6ac97fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr120941-1.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=x86-64-v3" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } 
> } */
> +
> +/*
> +**bar:
> +**.LFB[0-9]+:
> +**...
> +**     vbroadcastsd    .LC4\(%rip\), %ymm2
> +**     leal    2\(%rbx\), %eax
> +**     vbroadcastsd    .LC2\(%rip\), %ymm4
> +**     negl    %eax
> +**...
> +*/
> +
> +extern void foo (int);
> +
> +enum { N_CELL_ENTRIES1 = 2 }
> +typedef LBM_Grid1[64];
> +enum { N_CELL_ENTRIES2 = 2 }
> +typedef LBM_Grid2[64];
> +LBM_Grid1 grid1;
> +LBM_Grid2 grid2;
> +extern int n;
> +
> +void
> +LBM_handleInOutFlow()
> +{
> +  int i, j;
> +  for (; i; i += 2)
> +    {
> +      for (j = 0; j < n; j++)
> +       {
> +         grid1[i] = 1.0 / 36.0 * i;
> +         grid2[i] = 1.0 / 36.0 * i;
> +       }
> +    }
> +}
> +
> +int main_t;
> +void
> +bar (void)
> +{
> +  for (; main_t; main_t++) {
> +    LBM_handleInOutFlow();
> +    foo (main_t);
> +  }
> +}
> --
> 2.50.1
>


-- 
BR,
Hongtao

Reply via email to