On Fri, Aug 1, 2025 at 8:10 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > Don't hoist non all 0s/1s vector set outside of the loop to avoid extra > spills.
The patch LGTM. > > gcc/ > > PR target/120941 > * config/i386/i386-features.cc (x86_cse_kind): Moved before > ix86_place_single_vector_set. > (redundant_load): Likewise. > (ix86_place_single_vector_set): Replace the last argument to the > pointer to redundant_load. For X86_CSE_VEC_DUP, don't place the > vector set outside of the loop to avoid extra spills. > (remove_redundant_vector_load): Pass load to > ix86_place_single_vector_set. > > gcc/testsuite/ > > PR target/120941 > * gcc.target/i386/pr120941-1.c: New test. > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com> > --- > gcc/config/i386/i386-features.cc | 107 +++++++++++---------- > gcc/testsuite/gcc.target/i386/pr120941-1.c | 49 ++++++++++ > 2 files changed, 106 insertions(+), 50 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1.c > > diff --git a/gcc/config/i386/i386-features.cc > b/gcc/config/i386/i386-features.cc > index 53e86c8a493..9941e61361c 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -3085,21 +3085,63 @@ ix86_rpad_gate () > && optimize_function_for_speed_p (cfun)); > } > > +enum x86_cse_kind > +{ > + X86_CSE_CONST0_VECTOR, > + X86_CSE_CONSTM1_VECTOR, > + X86_CSE_VEC_DUP > +}; > + > +struct redundant_load > +{ > + /* Bitmap of basic blocks with broadcast instructions. */ > + auto_bitmap bbs; > + /* Bitmap of broadcast instructions. */ > + auto_bitmap insns; > + /* The broadcast inner scalar. */ > + rtx val; > + /* The inner scalar mode. */ > + machine_mode mode; > + /* The instruction which sets the inner scalar. Nullptr if the inner > + scalar is applied to the whole function, instead of within the same > + block. */ > + rtx_insn *def_insn; > + /* The widest broadcast source. */ > + rtx broadcast_source; > + /* The widest broadcast register. */ > + rtx broadcast_reg; > + /* The basic block of the broadcast instruction. */ > + basic_block bb; > + /* The number of broadcast instructions with the same inner scalar. */ > + unsigned HOST_WIDE_INT count; > + /* The threshold of broadcast instructions with the same inner > + scalar. */ > + unsigned int threshold; > + /* The widest broadcast size in bytes. */ > + unsigned int size; > + /* Load kind. */ > + x86_cse_kind kind; > +}; > + > /* Generate a vector set, DEST = SRC, at entry of the nearest dominator > for basic block map BBS, which is in the fake loop that contains the > whole function, so that there is only a single vector set in the > - whole function. If not nullptr, INNER_SCALAR is the inner scalar of > - SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */ > + whole function. If not nullptr, LOAD is a pointer to the load. */ > > static void > ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, > - rtx inner_scalar = nullptr) > + redundant_load *load = nullptr) > { > basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); > - while (bb->loop_father->latch > - != EXIT_BLOCK_PTR_FOR_FN (cfun)) > - bb = get_immediate_dominator (CDI_DOMINATORS, > - bb->loop_father->header); > + /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop > + to avoid extra spills. */ > + if (!load || load->kind != X86_CSE_VEC_DUP) > + { > + while (bb->loop_father->latch > + != EXIT_BLOCK_PTR_FOR_FN (cfun)) > + bb = get_immediate_dominator (CDI_DOMINATORS, > + bb->loop_father->header); > + } > > rtx set = gen_rtx_SET (dest, src); > > @@ -3141,8 +3183,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, > bitmap bbs, > } > } > > - if (inner_scalar) > + if (load && load->kind == X86_CSE_VEC_DUP) > { > + /* Get the source from LOAD as (reg:SI 99) in > + > + (vec_duplicate:V4SI (reg:SI 99)) > + > + */ > + rtx inner_scalar = load->val; > /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */ > rtx reg = XEXP (src, 0); > if ((REG_P (inner_scalar) || MEM_P (inner_scalar)) > @@ -3489,44 +3537,6 @@ replace_vector_const (machine_mode vector_mode, rtx > vector_const, > } > } > > -enum x86_cse_kind > -{ > - X86_CSE_CONST0_VECTOR, > - X86_CSE_CONSTM1_VECTOR, > - X86_CSE_VEC_DUP > -}; > - > -struct redundant_load > -{ > - /* Bitmap of basic blocks with broadcast instructions. */ > - auto_bitmap bbs; > - /* Bitmap of broadcast instructions. */ > - auto_bitmap insns; > - /* The broadcast inner scalar. */ > - rtx val; > - /* The inner scalar mode. */ > - machine_mode mode; > - /* The instruction which sets the inner scalar. Nullptr if the inner > - scalar is applied to the whole function, instead of within the same > - block. */ > - rtx_insn *def_insn; > - /* The widest broadcast source. */ > - rtx broadcast_source; > - /* The widest broadcast register. */ > - rtx broadcast_reg; > - /* The basic block of the broadcast instruction. */ > - basic_block bb; > - /* The number of broadcast instructions with the same inner scalar. */ > - unsigned HOST_WIDE_INT count; > - /* The threshold of broadcast instructions with the same inner > - scalar. */ > - unsigned int threshold; > - /* The widest broadcast size in bytes. */ > - unsigned int size; > - /* Load kind. */ > - x86_cse_kind kind; > -}; > - > /* Return the inner scalar if OP is a broadcast, else return nullptr. */ > > static rtx > @@ -3872,10 +3882,7 @@ remove_redundant_vector_load (void) > else > ix86_place_single_vector_set (load->broadcast_reg, > load->broadcast_source, > - load->bbs, > - (load->kind == X86_CSE_VEC_DUP > - ? load->val > - : nullptr)); > + load->bbs, load); > } > > loop_optimizer_finalize (); > diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1.c > b/gcc/testsuite/gcc.target/i386/pr120941-1.c > new file mode 100644 > index 00000000000..b4fc6ac97fc > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr120941-1.c > @@ -0,0 +1,49 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=x86-64-v3" } */ > +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } > } */ > + > +/* > +**bar: > +**.LFB[0-9]+: > +**... > +** vbroadcastsd .LC4\(%rip\), %ymm2 > +** leal 2\(%rbx\), %eax > +** vbroadcastsd .LC2\(%rip\), %ymm4 > +** negl %eax > +**... > +*/ > + > +extern void foo (int); > + > +enum { N_CELL_ENTRIES1 = 2 } > +typedef LBM_Grid1[64]; > +enum { N_CELL_ENTRIES2 = 2 } > +typedef LBM_Grid2[64]; > +LBM_Grid1 grid1; > +LBM_Grid2 grid2; > +extern int n; > + > +void > +LBM_handleInOutFlow() > +{ > + int i, j; > + for (; i; i += 2) > + { > + for (j = 0; j < n; j++) > + { > + grid1[i] = 1.0 / 36.0 * i; > + grid2[i] = 1.0 / 36.0 * i; > + } > + } > +} > + > +int main_t; > +void > +bar (void) > +{ > + for (; main_t; main_t++) { > + LBM_handleInOutFlow(); > + foo (main_t); > + } > +} > -- > 2.50.1 > -- BR, Hongtao