https://gcc.gnu.org/g:09f0768b55b96c861811a8989d7c1cc59b4c29b6
commit r16-2727-g09f0768b55b96c861811a8989d7c1cc59b4c29b6
Author: H.J. Lu <hjl.to...@gmail.com>
Date: Fri Aug 1 05:02:18 2025 -0700

    x86: Don't hoist non all 0s/1s vector set outside of loop

    Don't hoist non all 0s/1s vector set outside of the loop to avoid extra
    spills.

    gcc/

            PR target/120941
            * config/i386/i386-features.cc (x86_cse_kind): Moved before
            ix86_place_single_vector_set.
            (redundant_load): Likewise.
            (ix86_place_single_vector_set): Replace the last argument to the
            pointer to redundant_load. For X86_CSE_VEC_DUP, don't place the
            vector set outside of the loop to avoid extra spills.
            (remove_redundant_vector_load): Pass load to
            ix86_place_single_vector_set.

    gcc/testsuite/

            PR target/120941
            * gcc.target/i386/pr120941-1.c: New test.

    Signed-off-by: H.J. Lu <hjl.to...@gmail.com>

Diff:
---
 gcc/config/i386/i386-features.cc | 107 +++++++++++++++--------------
 gcc/testsuite/gcc.target/i386/pr120941-1.c | 49 +++++++++++++
 2 files changed, 106 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 53e86c8a4931..9941e61361c7 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,63 @@ ix86_rpad_gate ()
           && optimize_function_for_speed_p (cfun));
 }

+enum x86_cse_kind
+{
+  X86_CSE_CONST0_VECTOR,
+  X86_CSE_CONSTM1_VECTOR,
+  X86_CSE_VEC_DUP
+};
+
+struct redundant_load
+{
+  /* Bitmap of basic blocks with broadcast instructions. */
+  auto_bitmap bbs;
+  /* Bitmap of broadcast instructions. */
+  auto_bitmap insns;
+  /* The broadcast inner scalar. */
+  rtx val;
+  /* The inner scalar mode. */
+  machine_mode mode;
+  /* The instruction which sets the inner scalar. Nullptr if the inner
+     scalar is applied to the whole function, instead of within the same
+     block. */
+  rtx_insn *def_insn;
+  /* The widest broadcast source. */
+  rtx broadcast_source;
+  /* The widest broadcast register. */
+  rtx broadcast_reg;
+  /* The basic block of the broadcast instruction. */
+  basic_block bb;
+  /* The number of broadcast instructions with the same inner scalar. */
+  unsigned HOST_WIDE_INT count;
+  /* The threshold of broadcast instructions with the same inner
+     scalar. */
+  unsigned int threshold;
+  /* The widest broadcast size in bytes. */
+  unsigned int size;
+  /* Load kind. */
+  x86_cse_kind kind;
+};
+
 /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
    for basic block map BBS, which is in the fake loop that contains the
    whole function, so that there is only a single vector set in the
-   whole function. If not nullptr, INNER_SCALAR is the inner scalar of
-   SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
+   whole function. If not nullptr, LOAD is a pointer to the load. */

 static void
 ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
-                              rtx inner_scalar = nullptr)
+                              redundant_load *load = nullptr)
 {
   basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
-  while (bb->loop_father->latch
-         != EXIT_BLOCK_PTR_FOR_FN (cfun))
-    bb = get_immediate_dominator (CDI_DOMINATORS,
-                                  bb->loop_father->header);
+  /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+     to avoid extra spills. */
+  if (!load || load->kind != X86_CSE_VEC_DUP)
+    {
+      while (bb->loop_father->latch
+             != EXIT_BLOCK_PTR_FOR_FN (cfun))
+        bb = get_immediate_dominator (CDI_DOMINATORS,
+                                      bb->loop_father->header);
+    }

   rtx set = gen_rtx_SET (dest, src);

@@ -3141,8 +3183,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
         }
     }

-  if (inner_scalar)
+  if (load && load->kind == X86_CSE_VEC_DUP)
     {
+      /* Get the source from LOAD as (reg:SI 99) in
+
+         (vec_duplicate:V4SI (reg:SI 99))
+
+       */
+      rtx inner_scalar = load->val;
       /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
       rtx reg = XEXP (src, 0);
       if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3489,44 +3537,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
     }
 }

-enum x86_cse_kind
-{
-  X86_CSE_CONST0_VECTOR,
-  X86_CSE_CONSTM1_VECTOR,
-  X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
-  /* Bitmap of basic blocks with broadcast instructions. */
-  auto_bitmap bbs;
-  /* Bitmap of broadcast instructions. */
-  auto_bitmap insns;
-  /* The broadcast inner scalar. */
-  rtx val;
-  /* The inner scalar mode. */
-  machine_mode mode;
-  /* The instruction which sets the inner scalar. Nullptr if the inner
-     scalar is applied to the whole function, instead of within the same
-     block. */
-  rtx_insn *def_insn;
-  /* The widest broadcast source. */
-  rtx broadcast_source;
-  /* The widest broadcast register. */
-  rtx broadcast_reg;
-  /* The basic block of the broadcast instruction. */
-  basic_block bb;
-  /* The number of broadcast instructions with the same inner scalar. */
-  unsigned HOST_WIDE_INT count;
-  /* The threshold of broadcast instructions with the same inner
-     scalar. */
-  unsigned int threshold;
-  /* The widest broadcast size in bytes. */
-  unsigned int size;
-  /* Load kind. */
-  x86_cse_kind kind;
-};
-
 /* Return the inner scalar if OP is a broadcast, else return nullptr. */

 static rtx
@@ -3872,10 +3882,7 @@ remove_redundant_vector_load (void)
         else
           ix86_place_single_vector_set (load->broadcast_reg,
                                         load->broadcast_source,
-                                        load->bbs,
-                                        (load->kind == X86_CSE_VEC_DUP
-                                         ? load->val
-                                         : nullptr));
+                                        load->bbs, load);
       }

   loop_optimizer_finalize ();
diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1.c b/gcc/testsuite/gcc.target/i386/pr120941-1.c
new file mode 100644
index 000000000000..b4fc6ac97fc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr120941-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v3" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**bar:
+**.LFB[0-9]+:
+**...
+** vbroadcastsd .LC4\(%rip\), %ymm2
+** leal 2\(%rbx\), %eax
+** vbroadcastsd .LC2\(%rip\), %ymm4
+** negl %eax
+**...
+*/
+
+extern void foo (int);
+
+enum { N_CELL_ENTRIES1 = 2 }
+typedef LBM_Grid1[64];
+enum { N_CELL_ENTRIES2 = 2 }
+typedef LBM_Grid2[64];
+LBM_Grid1 grid1;
+LBM_Grid2 grid2;
+extern int n;
+
+void
+LBM_handleInOutFlow()
+{
+  int i, j;
+  for (; i; i += 2)
+    {
+      for (j = 0; j < n; j++)
+        {
+          grid1[i] = 1.0 / 36.0 * i;
+          grid2[i] = 1.0 / 36.0 * i;
+        }
+    }
+}
+
+int main_t;
+void
+bar (void)
+{
+  for (; main_t; main_t++) {
+    LBM_handleInOutFlow();
+    foo (main_t);
+  }
+}