On Mon, Jun 23, 2025 at 4:10 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Mon, Jun 23, 2025 at 3:11 PM Hongtao Liu <crazy...@gmail.com> wrote:
> >
> > On Thu, Jun 19, 2025 at 10:25 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> > >
> > > Extend the remove_redundant_vector pass to handle vector broadcasts from
> > > constant and variable scalars.  When broadcasting from constants and
> > > function arguments, we can place a single widest vector broadcast at
> > > entry of the nearest common dominator for basic blocks with all uses
> > > since constants and function arguments aren't changed.  For broadcast
> > > from variables with a single definition, the single definition is
> > > replaced with the widest broadcast.
> > >
> > > gcc/
> > >
> > >         PR target/92080
> > >         * config/i386/i386-expand.cc (ix86_expand_call): Set
> > >         recursive_function to true for recursive call.
> > >         * config/i386/i386-features.cc (ix86_place_single_vector_set):
> > >         Add an argument for inner scalar, default to nullptr.  Set the
> > >         source from inner scalar if not nullptr.
> > >         (ix86_get_vector_load_mode): Renamed to ...
> > >         (ix86_get_vector_cse_mode): This.  Add an argument for scalar mode
> > >         and handle integer and float scalar modes.
> > >         (replace_vector_const): Add an argument for scalar mode and pass
> > >         it to ix86_get_vector_load_mode.
> > >         (x86_cse_kind): New.
> > >         (redundant_load): Likewise.
> > >         (ix86_broadcast_inner): Likewise.
> > >         (remove_redundant_vector_load): Also support const0_rtx and
> > >         constm1_rtx broadcasts.  Handle vector broadcasts from constant
> > >         and variable scalars.
> > >         * config/i386/i386.h (machine_function): Add recursive_function.
> > >
> > > gcc/testsuite/
> > >
> > >         * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
> > >         movdqa instead pxor.
> > >         * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> > >         * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> > >         * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> > >         * gcc.target/i386/pr92080-4.c: New test.
> > >         * gcc.target/i386/pr92080-5.c: Likewise.
> > >         * gcc.target/i386/pr92080-6.c: Likewise.
> > >         * gcc.target/i386/pr92080-7.c: Likewise.
> > >         * gcc.target/i386/pr92080-8.c: Likewise.
> > >         * gcc.target/i386/pr92080-9.c: Likewise.
> > >         * gcc.target/i386/pr92080-10.c: Likewise.
> > >         * gcc.target/i386/pr92080-11.c: Likewise.
> > >         * gcc.target/i386/pr92080-12.c: Likewise.
> > >         * gcc.target/i386/pr92080-13.c: Likewise.
> > >         * gcc.target/i386/pr92080-14.c: Likewise.
> > >         * gcc.target/i386/pr92080-15.c: Likewise.
> > >         * gcc.target/i386/pr92080-16.c: Likewise.
> > >
> > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> > > ---
> > >  gcc/config/i386/i386-expand.cc                |   3 +
> > >  gcc/config/i386/i386-features.cc              | 410 ++++++++++++++----
> > >  gcc/config/i386/i386.h                        |   3 +
> > >  .../i386/keylocker-aesdecwide128kl.c          |  14 +-
> > >  .../i386/keylocker-aesdecwide256kl.c          |  14 +-
> > >  .../i386/keylocker-aesencwide128kl.c          |  14 +-
> > >  .../i386/keylocker-aesencwide256kl.c          |  14 +-
> > >  gcc/testsuite/gcc.target/i386/pr92080-10.c    |  13 +
> > >  gcc/testsuite/gcc.target/i386/pr92080-11.c    |  33 ++
> > >  gcc/testsuite/gcc.target/i386/pr92080-12.c    |  16 +
> > >  gcc/testsuite/gcc.target/i386/pr92080-13.c    |  32 ++
> > >  gcc/testsuite/gcc.target/i386/pr92080-14.c    |  31 ++
> > >  gcc/testsuite/gcc.target/i386/pr92080-15.c    |  25 ++
> > >  gcc/testsuite/gcc.target/i386/pr92080-16.c    |  26 ++
> > >  gcc/testsuite/gcc.target/i386/pr92080-4.c     |  50 +++
> > >  gcc/testsuite/gcc.target/i386/pr92080-5.c     | 109 +++++
> > >  gcc/testsuite/gcc.target/i386/pr92080-6.c     |  19 +
> > >  gcc/testsuite/gcc.target/i386/pr92080-7.c     |  20 +
> > >  gcc/testsuite/gcc.target/i386/pr92080-8.c     |  16 +
> > >  gcc/testsuite/gcc.target/i386/pr92080-9.c     |  81 ++++
> > >  20 files changed, 823 insertions(+), 120 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
> > >

> > > +  else
> > > +    {
> > > +      while (SUBREG_P (dest))
> > > +       dest = SUBREG_REG (dest);
> > > +
> > > +      /* Skip if the SET destination mode doesn't match.  */
> > > +      if (GET_MODE (dest) != mode)
> > > +       return nullptr;
> >
> > Can we just require (dest == reg || dest == op), otherwise we need to
> > make sure GET_MODE of the original dest can cover mode of op(which is
> > more complicated, need to make sure SUBREG_BYTE is also zero???)
>
> I will change it to
>
>       /* Skip if the SET destination isn't the broadcast source.  */
>       if (dest != reg)
>         return nullptr;

Here is the v4 patch with:

      /* The SET destination must be the broadcast source.  */
      gcc_assert (dest == op);

OK for master?

Thanks.


-- 
H.J.
From d1e56ce24a5a7a1a9be1bf3790ffcbea532ce63c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Sat, 10 May 2025 16:57:58 +0800
Subject: [PATCH v4] x86: Add debug dump for the remove_redundant_vector pass

Add debug dump for the remove_redundant_vector pass with the following
output:

Replace:

(insn 7 4 8 2 (set (reg:V2DI 103)
        (const_vector:V2DI [
                (const_int 0 [0]) repeated x2
            ])) "x.c":8:13 2406 {movv2di_internal}
     (nil))

with:

(insn 7 4 8 2 (set (reg:V2DI 103)
        (subreg:V2DI (reg:V32QI 109) 0)) "x.c":8:13 2406 {movv2di_internal}
     (nil))

...

Replace:

(insn 16 15 17 3 (set (reg:V4DI 105)
        (const_vector:V4DI [
                (const_int 0 [0]) repeated x4
            ])) "x.c":13:28 2405 {movv4di_internal}
     (nil))

with:

(insn 16 15 17 3 (set (reg:V4DI 105)
        (subreg:V4DI (reg:V32QI 109) 0)) "x.c":13:28 2405 {movv4di_internal}
     (nil))

...

Place:

(insn 25 5 23 2 (set (reg:V32QI 109)
        (const_vector:V32QI [
                (const_int 0 [0]) repeated x32
            ])) -1
     (nil))

after:

(insn 23 25 24 2 (set (reg/f:DI 107 [ mem1 ])
        (reg:DI 5 di [ mem1 ])) "x.c":5:1 95 {*movdi_internal}
     (expr_list:REG_DEAD (reg:DI 5 di [ mem1 ])
        (nil)))

in the *.309r.rrvl debug dump.

	* config/i386/i386-features.cc (ix86_place_single_vector_set):
	Add debug dump.
	(replace_vector_const): Likewise.
	(remove_redundant_vector_load): Likewise.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/config/i386/i386-features.cc | 65 +++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 62504b28376..92bc65edbc5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3116,10 +3116,30 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
 
   rtx_insn *set_insn;
   if (insn == BB_HEAD (bb))
-    set_insn = emit_insn_before (set, insn);
+    {
+      set_insn = emit_insn_before (set, insn);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nPlace:\n\n");
+	  print_rtl_single (dump_file, set_insn);
+	  fprintf (dump_file, "\nbefore:\n\n");
+	  print_rtl_single (dump_file, insn);
+	  fprintf (dump_file, "\n");
+	}
+    }
   else
-    set_insn = emit_insn_after (set,
-				insn ? PREV_INSN (insn) : BB_END (bb));
+    {
+      rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
+      set_insn = emit_insn_after (set, after);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nPlace:\n\n");
+	  print_rtl_single (dump_file, set_insn);
+	  fprintf (dump_file, "\nafter:\n\n");
+	  print_rtl_single (dump_file, after);
+	  fprintf (dump_file, "\n");
+	}
+    }
 
   if (inner_scalar)
     {
@@ -3129,7 +3149,15 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
 	  && GET_MODE (reg) != GET_MODE (inner_scalar))
 	inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
       rtx set = gen_rtx_SET (reg, inner_scalar);
-      emit_insn_before (set, set_insn);
+      insn = emit_insn_before (set, set_insn);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nAdd:\n\n");
+	  print_rtl_single (dump_file, insn);
+	  fprintf (dump_file, "\nbefore:\n\n");
+	  print_rtl_single (dump_file, set_insn);
+	  fprintf (dump_file, "\n");
+	}
     }
 }
 
@@ -3416,7 +3444,15 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
 		  vreg = gen_reg_rtx (vmode);
 		  rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
 		  rtx pat = gen_rtx_SET (vreg, vsubreg);
-		  emit_insn_before (pat, insn);
+		  rtx_insn *vinsn = emit_insn_before (pat, insn);
+		  if (dump_file)
+		    {
+		      fprintf (dump_file, "\nInsert an extra move:\n\n");
+		      print_rtl_single (dump_file, vinsn);
+		      fprintf (dump_file, "\nbefore:\n\n");
+		      print_rtl_single (dump_file, insn);
+		      fprintf (dump_file, "\n");
+		    }
 		}
 	      replace = gen_rtx_SUBREG (mode, vreg, 0);
 	    }
@@ -3424,11 +3460,22 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
 	    replace = gen_rtx_SUBREG (mode, vector_const, 0);
 	}
 
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nReplace:\n\n");
+	  print_rtl_single (dump_file, insn);
+	}
       SET_SRC (set) = replace;
       /* Drop possible dead definitions.  */
       PATTERN (insn) = set;
       INSN_CODE (insn) = -1;
       recog_memoized (insn);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "\nwith:\n\n");
+	  print_rtl_single (dump_file, insn);
+	  fprintf (dump_file, "\n");
+	}
       df_insn_rescan (insn);
     }
 }
@@ -3768,6 +3815,14 @@ remove_redundant_vector_load (void)
 		rtx set = gen_rtx_SET (load->broadcast_reg,
 				       load->broadcast_source);
 		insn = emit_insn_after (set, load->def_insn);
+		if (dump_file)
+		  {
+		    fprintf (dump_file, "\nAdd:\n\n");
+		    print_rtl_single (dump_file, insn);
+		    fprintf (dump_file, "\nafter:\n\n");
+		    print_rtl_single (dump_file, load->def_insn);
+		    fprintf (dump_file, "\n");
+		  }
 	      }
 	    else
 	      ix86_place_single_vector_set (load->broadcast_reg,
-- 
2.49.0

Reply via email to