On Mon, Jun 23, 2025 at 4:10 PM H.J. Lu <[email protected]> wrote:
>
> On Mon, Jun 23, 2025 at 3:11 PM Hongtao Liu <[email protected]> wrote:
> >
> > On Thu, Jun 19, 2025 at 10:25 AM H.J. Lu <[email protected]> wrote:
> > >
> > > Extend the remove_redundant_vector pass to handle vector broadcasts from
> > > constant and variable scalars. When broadcasting from constants and
> > > function arguments, we can place a single widest vector broadcast at
> > > entry of the nearest common dominator for basic blocks with all uses
> > > since constants and function arguments aren't changed. For broadcast
> > > from variables with a single definition, the single definition is
> > > replaced with the widest broadcast.
> > >
> > > gcc/
> > >
> > > PR target/92080
> > > * config/i386/i386-expand.cc (ix86_expand_call): Set
> > > recursive_function to true for recursive call.
> > > * config/i386/i386-features.cc (ix86_place_single_vector_set):
> > > Add an argument for inner scalar, default to nullptr. Set the
> > > source from inner scalar if not nullptr.
> > > (ix86_get_vector_load_mode): Renamed to ...
> > > (ix86_get_vector_cse_mode): This. Add an argument for scalar mode
> > > and handle integer and float scalar modes.
> > > (replace_vector_const): Add an argument for scalar mode and pass
> > > it to ix86_get_vector_load_mode.
> > > (x86_cse_kind): New.
> > > (redundant_load): Likewise.
> > > (ix86_broadcast_inner): Likewise.
> > > (remove_redundant_vector_load): Also support const0_rtx and
> > > constm1_rtx broadcasts. Handle vector broadcasts from constant
> > > and variable scalars.
> > > * config/i386/i386.h (machine_function): Add recursive_function.
> > >
> > > gcc/testsuite/
> > >
> > > * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
> > > movdqa instead pxor.
> > > * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> > > * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> > > * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> > > * gcc.target/i386/pr92080-4.c: New test.
> > > * gcc.target/i386/pr92080-5.c: Likewise.
> > > * gcc.target/i386/pr92080-6.c: Likewise.
> > > * gcc.target/i386/pr92080-7.c: Likewise.
> > > * gcc.target/i386/pr92080-8.c: Likewise.
> > > * gcc.target/i386/pr92080-9.c: Likewise.
> > > * gcc.target/i386/pr92080-10.c: Likewise.
> > > * gcc.target/i386/pr92080-11.c: Likewise.
> > > * gcc.target/i386/pr92080-12.c: Likewise.
> > > * gcc.target/i386/pr92080-13.c: Likewise.
> > > * gcc.target/i386/pr92080-14.c: Likewise.
> > > * gcc.target/i386/pr92080-15.c: Likewise.
> > > * gcc.target/i386/pr92080-16.c: Likewise.
> > >
> > > Signed-off-by: H.J. Lu <[email protected]>
> > > ---
> > > gcc/config/i386/i386-expand.cc | 3 +
> > > gcc/config/i386/i386-features.cc | 410 ++++++++++++++----
> > > gcc/config/i386/i386.h | 3 +
> > > .../i386/keylocker-aesdecwide128kl.c | 14 +-
> > > .../i386/keylocker-aesdecwide256kl.c | 14 +-
> > > .../i386/keylocker-aesencwide128kl.c | 14 +-
> > > .../i386/keylocker-aesencwide256kl.c | 14 +-
> > > gcc/testsuite/gcc.target/i386/pr92080-10.c | 13 +
> > > gcc/testsuite/gcc.target/i386/pr92080-11.c | 33 ++
> > > gcc/testsuite/gcc.target/i386/pr92080-12.c | 16 +
> > > gcc/testsuite/gcc.target/i386/pr92080-13.c | 32 ++
> > > gcc/testsuite/gcc.target/i386/pr92080-14.c | 31 ++
> > > gcc/testsuite/gcc.target/i386/pr92080-15.c | 25 ++
> > > gcc/testsuite/gcc.target/i386/pr92080-16.c | 26 ++
> > > gcc/testsuite/gcc.target/i386/pr92080-4.c | 50 +++
> > > gcc/testsuite/gcc.target/i386/pr92080-5.c | 109 +++++
> > > gcc/testsuite/gcc.target/i386/pr92080-6.c | 19 +
> > > gcc/testsuite/gcc.target/i386/pr92080-7.c | 20 +
> > > gcc/testsuite/gcc.target/i386/pr92080-8.c | 16 +
> > > gcc/testsuite/gcc.target/i386/pr92080-9.c | 81 ++++
> > > 20 files changed, 823 insertions(+), 120 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
> > >
> > > + else
> > > + {
> > > + while (SUBREG_P (dest))
> > > + dest = SUBREG_REG (dest);
> > > +
> > > + /* Skip if the SET destination mode doesn't match. */
> > > + if (GET_MODE (dest) != mode)
> > > + return nullptr;
> >
> > Can we just require (dest == reg || dest == op), otherwise we need to
> > make sure GET_MODE of the original dest can cover mode of op(which is
> > more complicated, need to make sure SUBREG_BYTE is also zero???)
>
> I will change it to
>
> /* Skip if the SET destination isn't the broadcast source. */
> if (dest != reg)
> return nullptr;
Here is the v4 patch with:
/* The SET destination must be the broadcast source. */
gcc_assert (dest == op);
OK for master?
Thanks.
--
H.J.
From d1e56ce24a5a7a1a9be1bf3790ffcbea532ce63c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Sat, 10 May 2025 16:57:58 +0800
Subject: [PATCH v4] x86: Add debug dump for the remove_redundant_vector pass
Add debug dump for the remove_redundant_vector pass with the following
output:
Replace:
(insn 7 4 8 2 (set (reg:V2DI 103)
(const_vector:V2DI [
(const_int 0 [0]) repeated x2
])) "x.c":8:13 2406 {movv2di_internal}
(nil))
with:
(insn 7 4 8 2 (set (reg:V2DI 103)
(subreg:V2DI (reg:V32QI 109) 0)) "x.c":8:13 2406 {movv2di_internal}
(nil))
...
Replace:
(insn 16 15 17 3 (set (reg:V4DI 105)
(const_vector:V4DI [
(const_int 0 [0]) repeated x4
])) "x.c":13:28 2405 {movv4di_internal}
(nil))
with:
(insn 16 15 17 3 (set (reg:V4DI 105)
(subreg:V4DI (reg:V32QI 109) 0)) "x.c":13:28 2405 {movv4di_internal}
(nil))
...
Place:
(insn 25 5 23 2 (set (reg:V32QI 109)
(const_vector:V32QI [
(const_int 0 [0]) repeated x32
])) -1
(nil))
after:
(insn 23 25 24 2 (set (reg/f:DI 107 [ mem1 ])
(reg:DI 5 di [ mem1 ])) "x.c":5:1 95 {*movdi_internal}
(expr_list:REG_DEAD (reg:DI 5 di [ mem1 ])
(nil)))
in the *.309r.rrvl debug dump.
* config/i386/i386-features.cc (ix86_place_single_vector_set):
Add debug dump.
(replace_vector_const): Likewise.
(remove_redundant_vector_load): Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-features.cc | 65 +++++++++++++++++++++++++++++---
1 file changed, 60 insertions(+), 5 deletions(-)
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 62504b28376..92bc65edbc5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3116,10 +3116,30 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
rtx_insn *set_insn;
if (insn == BB_HEAD (bb))
- set_insn = emit_insn_before (set, insn);
+ {
+ set_insn = emit_insn_before (set, insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\n");
+ }
+ }
else
- set_insn = emit_insn_after (set,
- insn ? PREV_INSN (insn) : BB_END (bb));
+ {
+ rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
+ set_insn = emit_insn_after (set, after);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, after);
+ fprintf (dump_file, "\n");
+ }
+ }
if (inner_scalar)
{
@@ -3129,7 +3149,15 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
&& GET_MODE (reg) != GET_MODE (inner_scalar))
inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
rtx set = gen_rtx_SET (reg, inner_scalar);
- emit_insn_before (set, set_insn);
+ insn = emit_insn_before (set, set_insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\n");
+ }
}
}
@@ -3416,7 +3444,15 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
vreg = gen_reg_rtx (vmode);
rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
rtx pat = gen_rtx_SET (vreg, vsubreg);
- emit_insn_before (pat, insn);
+ rtx_insn *vinsn = emit_insn_before (pat, insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nInsert an extra move:\n\n");
+ print_rtl_single (dump_file, vinsn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\n");
+ }
}
replace = gen_rtx_SUBREG (mode, vreg, 0);
}
@@ -3424,11 +3460,22 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
replace = gen_rtx_SUBREG (mode, vector_const, 0);
}
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nReplace:\n\n");
+ print_rtl_single (dump_file, insn);
+ }
SET_SRC (set) = replace;
/* Drop possible dead definitions. */
PATTERN (insn) = set;
INSN_CODE (insn) = -1;
recog_memoized (insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nwith:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\n");
+ }
df_insn_rescan (insn);
}
}
@@ -3768,6 +3815,14 @@ remove_redundant_vector_load (void)
rtx set = gen_rtx_SET (load->broadcast_reg,
load->broadcast_source);
insn = emit_insn_after (set, load->def_insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, load->def_insn);
+ fprintf (dump_file, "\n");
+ }
}
else
ix86_place_single_vector_set (load->broadcast_reg,
--
2.49.0