https://gcc.gnu.org/g:09f0768b55b96c861811a8989d7c1cc59b4c29b6

commit r16-2727-g09f0768b55b96c861811a8989d7c1cc59b4c29b6
Author: H.J. Lu <hjl.to...@gmail.com>
Date:   Fri Aug 1 05:02:18 2025 -0700

    x86: Don't hoist non all 0s/1s vector set outside of loop
    
    Don't hoist a vector set whose source is not all 0s or all 1s outside
    of the loop, to avoid extra spills.
    
    gcc/
    
            PR target/120941
            * config/i386/i386-features.cc (x86_cse_kind): Moved before
            ix86_place_single_vector_set.
            (redundant_load): Likewise.
            (ix86_place_single_vector_set): Replace the last argument with a
            pointer to redundant_load.  For X86_CSE_VEC_DUP, don't place the
            vector set outside of the loop to avoid extra spills.
            (remove_redundant_vector_load): Pass load to
            ix86_place_single_vector_set.
    
    gcc/testsuite/
    
            PR target/120941
            * gcc.target/i386/pr120941-1.c: New test.
    
    Signed-off-by: H.J. Lu <hjl.to...@gmail.com>

Diff:
---
 gcc/config/i386/i386-features.cc           | 107 +++++++++++++++--------------
 gcc/testsuite/gcc.target/i386/pr120941-1.c |  49 +++++++++++++
 2 files changed, 106 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 53e86c8a4931..9941e61361c7 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,63 @@ ix86_rpad_gate ()
          && optimize_function_for_speed_p (cfun));
 }
 
+enum x86_cse_kind
+{
+  X86_CSE_CONST0_VECTOR,
+  X86_CSE_CONSTM1_VECTOR,
+  X86_CSE_VEC_DUP
+};
+
+struct redundant_load
+{
+  /* Bitmap of basic blocks with broadcast instructions.  */
+  auto_bitmap bbs;
+  /* Bitmap of broadcast instructions.  */
+  auto_bitmap insns;
+  /* The broadcast inner scalar.  */
+  rtx val;
+  /* The inner scalar mode.  */
+  machine_mode mode;
+  /* The instruction which sets the inner scalar.  Nullptr if the inner
+     scalar is applied to the whole function, instead of within the same
+     block.  */
+  rtx_insn *def_insn;
+  /* The widest broadcast source.  */
+  rtx broadcast_source;
+  /* The widest broadcast register.  */
+  rtx broadcast_reg;
+  /* The basic block of the broadcast instruction.  */
+  basic_block bb;
+  /* The number of broadcast instructions with the same inner scalar.  */
+  unsigned HOST_WIDE_INT count;
+  /* The threshold of broadcast instructions with the same inner
+     scalar.  */
+  unsigned int threshold;
+  /* The widest broadcast size in bytes.  */
+  unsigned int size;
+  /* Load kind.  */
+  x86_cse_kind kind;
+};
+
 /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
    for basic block map BBS, which is in the fake loop that contains the
    whole function, so that there is only a single vector set in the
-   whole function.  If not nullptr, INNER_SCALAR is the inner scalar of
-   SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)).  */
+   whole function.  If not nullptr, LOAD is a pointer to the load.  */
 
 static void
 ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
-                             rtx inner_scalar = nullptr)
+                             redundant_load *load = nullptr)
 {
   basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
-  while (bb->loop_father->latch
-        != EXIT_BLOCK_PTR_FOR_FN (cfun))
-    bb = get_immediate_dominator (CDI_DOMINATORS,
-                                 bb->loop_father->header);
+  /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+     to avoid extra spills.  */
+  if (!load || load->kind != X86_CSE_VEC_DUP)
+    {
+      while (bb->loop_father->latch
+            != EXIT_BLOCK_PTR_FOR_FN (cfun))
+       bb = get_immediate_dominator (CDI_DOMINATORS,
+                                     bb->loop_father->header);
+    }
 
   rtx set = gen_rtx_SET (dest, src);
 
@@ -3141,8 +3183,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
        }
     }
 
-  if (inner_scalar)
+  if (load && load->kind == X86_CSE_VEC_DUP)
     {
+      /* Get the source from LOAD as (reg:SI 99) in
+
+        (vec_duplicate:V4SI (reg:SI 99))
+
+       */
+      rtx inner_scalar = load->val;
       /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
       rtx reg = XEXP (src, 0);
       if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3489,44 +3537,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
     }
 }
 
-enum x86_cse_kind
-{
-  X86_CSE_CONST0_VECTOR,
-  X86_CSE_CONSTM1_VECTOR,
-  X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
-  /* Bitmap of basic blocks with broadcast instructions.  */
-  auto_bitmap bbs;
-  /* Bitmap of broadcast instructions.  */
-  auto_bitmap insns;
-  /* The broadcast inner scalar.  */
-  rtx val;
-  /* The inner scalar mode.  */
-  machine_mode mode;
-  /* The instruction which sets the inner scalar.  Nullptr if the inner
-     scalar is applied to the whole function, instead of within the same
-     block.  */
-  rtx_insn *def_insn;
-  /* The widest broadcast source.  */
-  rtx broadcast_source;
-  /* The widest broadcast register.  */
-  rtx broadcast_reg;
-  /* The basic block of the broadcast instruction.  */
-  basic_block bb;
-  /* The number of broadcast instructions with the same inner scalar.  */
-  unsigned HOST_WIDE_INT count;
-  /* The threshold of broadcast instructions with the same inner
-     scalar.  */
-  unsigned int threshold;
-  /* The widest broadcast size in bytes.  */
-  unsigned int size;
-  /* Load kind.  */
-  x86_cse_kind kind;
-};
-
 /* Return the inner scalar if OP is a broadcast, else return nullptr.  */
 
 static rtx
@@ -3872,10 +3882,7 @@ remove_redundant_vector_load (void)
            else
              ix86_place_single_vector_set (load->broadcast_reg,
                                            load->broadcast_source,
-                                           load->bbs,
-                                           (load->kind == X86_CSE_VEC_DUP
-                                            ? load->val
-                                            : nullptr));
+                                           load->bbs, load);
          }
 
       loop_optimizer_finalize ();
diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1.c b/gcc/testsuite/gcc.target/i386/pr120941-1.c
new file mode 100644
index 000000000000..b4fc6ac97fc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr120941-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=x86-64-v3" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**bar:
+**.LFB[0-9]+:
+**...
+**     vbroadcastsd    .LC4\(%rip\), %ymm2
+**     leal    2\(%rbx\), %eax
+**     vbroadcastsd    .LC2\(%rip\), %ymm4
+**     negl    %eax
+**...
+*/
+
+extern void foo (int);
+
+enum { N_CELL_ENTRIES1 = 2 }
+typedef LBM_Grid1[64];
+enum { N_CELL_ENTRIES2 = 2 }
+typedef LBM_Grid2[64];
+LBM_Grid1 grid1;
+LBM_Grid2 grid2;
+extern int n;
+
+void
+LBM_handleInOutFlow()
+{
+  int i, j;
+  for (; i; i += 2)
+    {
+      for (j = 0; j < n; j++)
+       {
+         grid1[i] = 1.0 / 36.0 * i;
+         grid2[i] = 1.0 / 36.0 * i;
+       }
+    }
+}
+
+int main_t;
+void
+bar (void)
+{
+  for (; main_t; main_t++) {
+    LBM_handleInOutFlow();
+    foo (main_t);
+  }
+}

Reply via email to