This is an incremental patch based on [1]; it enables the optimization shown below:

-       vbroadcastss    .LC1(%rip), %xmm0
+       movl    $-45, %edx
+       vmovd   %edx, %xmm0
+       vpshufd $0, %xmm0, %xmm0

According to a microbenchmark, this is faster than broadcasting from memory.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

        PR target/104704
        * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
        corresponding post_reload splitter.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pr100865-8a.c: Adjust testcase.
        * gcc.target/i386/pr100865-8c.c: Ditto.
        * gcc.target/i386/pr100865-9c.c: Ditto.
---
 gcc/config/i386/sse.md                      | 41 ++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr100865-8a.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-8c.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100865-9c.c |  2 +-
 4 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3066ea3734a..d124545aa5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
    (set_attr "mode" "V4SF")])
 
 (define_insn "*vec_dupv4si"
-  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x")
+  [(set (match_operand:V4SI 0 "register_operand"     "=v,v,x,v")
        (vec_duplicate:V4SI
-         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
+         (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
   "TARGET_SSE"
   "@
    %vpshufd\t{$0, %1, %0|%0, %1, 0}
    vbroadcastss\t{%1, %0|%0, %1}
-   shufps\t{$0, %0, %0|%0, %0, 0}"
-  [(set_attr "isa" "sse2,avx,noavx")
-   (set_attr "type" "sselog1,ssemov,sselog1")
-   (set_attr "length_immediate" "1,0,1")
-   (set_attr "prefix_extra" "0,1,*")
-   (set_attr "prefix" "maybe_vex,maybe_evex,orig")
-   (set_attr "mode" "TI,V4SF,V4SF")])
+   shufps\t{$0, %0, %0|%0, %0, 0}
+   #"
+  [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
+   (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
+   (set_attr "length_immediate" "1,0,1,1")
+   (set_attr "prefix_extra" "0,1,*,0")
+   (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
+   (set_attr "mode" "TI,V4SF,V4SF,TI")
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "3")
+             (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
+          ]
+          (symbol_ref "true")))])
+
+(define_split
+  [(set (match_operand:V4SI 0 "sse_reg_operand")
+       (vec_duplicate:V4SI
+         (match_operand:SI 1 "general_reg_operand")))]
+  "TARGET_SSE && reload_completed
+   /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
+      available, because then we can broadcast from GPRs directly.  */
+   && !TARGET_AVX512VL"
+  [(const_int 0)]
+{
+  emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
+                               CONST0_RTX (V4SImode),
+                               gen_lowpart (SImode, operands[1])));
+  emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
+  DONE;
+})
 
 (define_insn "*vec_dupv2di"
   [(set (match_operand:V2DI 0 "register_operand"     "=x,v,v,x")
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
index 911b14d4a25..544a14db6f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
@@ -20,5 +20,5 @@ foo (void)
     array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
 }
 
-/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
index 00682edb8c9..efee0488614 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
@@ -3,5 +3,5 @@
 
 #include "pr100865-8a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
index 8ffcdc1629d..e6f25902c1d 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
@@ -3,5 +3,5 @@
 
 #include "pr100865-9a.c"
 
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
-- 
2.18.1

Reply via email to