https://gcc.gnu.org/g:daf225605e85d17f5c6c1a205918e44b3b1eccba

commit r17-593-gdaf225605e85d17f5c6c1a205918e44b3b1eccba
Author: Jakub Jelinek <[email protected]>
Date:   Tue May 19 10:11:08 2026 +0200

    i386: Use vpermilps for some non-const permutations [PR125357]
    
    We don't use vpermilps insn for V4S[IF]mode variable permutations on
    TARGET_AVX without TARGET_AVX512*.  For TARGET_AVX512* there are plenty
    of permutation instructions already.  For TARGET_AVX2, the function has
    special cases for one_operand_shuffle for V8SImode/V8SFmode and emits
    reasonable code, but for V4SImode/V4SFmode with TARGET_AVX2 it handles
    those using V8SImode/V8SFmode as a two-operand shuffle, which requires
    2 preparation instructions, a vpermd, and one finalization instruction.
    And for !TARGET_AVX2 && TARGET_AVX we just emit terrible code for these.
    
    So, the following patch uses vpermilps for V4S[IF]mode one_operand_shuffle.
    
    Trying to handle V8S[IF]mode is not worth it: for TARGET_AVX2 we already
    emit good code (see above), and for !TARGET_AVX2 && TARGET_AVX the
    V8SImode mask is not a valid vector mode, so we emit terrible code no
    matter what.
    
    2026-05-19  Jakub Jelinek  <[email protected]>
    
            PR target/125357
            * config/i386/i386-expand.cc (ix86_expand_vec_perm): For
            one_operand_shuffle if TARGET_AVX and not TARGET_AVX512F use
            vpermilps for V4SImode/V4SFmode.  Formatting fix.
    
            * gcc.target/i386/avx-pr125357.c: New test.
            * gcc.target/i386/avx2-pr125357.c: New test.
    
    Reviewed-by: Hongtao Liu <[email protected]>

Diff:
---
 gcc/config/i386/i386-expand.cc                | 22 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/avx-pr125357.c  | 19 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/avx2-pr125357.c | 19 +++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 6e07194e7167..01cff86d20aa 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -5578,7 +5578,7 @@ ix86_expand_vec_perm (rtx operands[])
       switch (mode)
        {
        case E_V16SImode:
-         gen =gen_avx512f_permvarv16si;
+         gen = gen_avx512f_permvarv16si;
          break;
        case E_V16SFmode:
          gen = gen_avx512f_permvarv16sf;
@@ -5702,6 +5702,8 @@ ix86_expand_vec_perm (rtx operands[])
          return;
 
         case E_V4SImode:
+         if (one_operand_shuffle)
+           break; /* Handled below for TARGET_AVX.  */
          /* By combining the two 128-bit input vectors into one 256-bit
             input vector, we can use VPERMD and VPERMPS for the full
             two-operand shuffle.  */
@@ -5714,6 +5716,8 @@ ix86_expand_vec_perm (rtx operands[])
          return;
 
         case E_V4SFmode:
+         if (one_operand_shuffle)
+           break; /* Handled below for TARGET_AVX.  */
          t1 = gen_reg_rtx (V8SFmode);
          t2 = gen_reg_rtx (V8SImode);
          mask = gen_lowpart (V4SImode, mask);
@@ -5820,6 +5824,22 @@ ix86_expand_vec_perm (rtx operands[])
        }
     }
 
+  if (TARGET_AVX
+      && one_operand_shuffle
+      && (mode == V4SImode || mode == V4SFmode))
+    {
+      if (mode == V4SImode)
+       {
+         op0 = gen_lowpart (V4SFmode, op0);
+         t1 = gen_reg_rtx (V4SFmode);
+         emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
+         emit_move_insn (target, gen_lowpart (mode, t1));
+       }
+      else
+       emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask));
+      return;
+    }
+
   if (TARGET_XOP)
     {
       /* The XOP VPPERM insn supports three inputs.  By ignoring the
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr125357.c b/gcc/testsuite/gcc.target/i386/avx-pr125357.c
new file mode 100644
index 000000000000..1d598315ea27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-pr125357.c
@@ -0,0 +1,19 @@
+/* PR target/125357 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */
+/* { dg-final { scan-assembler-times "\tvpermilps\t" 2 } } */
+
+typedef int v4si __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+
+v4si
+foo (v4si x, v4si y)
+{
+  return __builtin_shuffle (x, y);
+}
+
+v4sf
+bar (v4sf x, v4si y)
+{
+  return __builtin_shuffle (x, y);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr125357.c b/gcc/testsuite/gcc.target/i386/avx2-pr125357.c
new file mode 100644
index 000000000000..4af93473c393
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr125357.c
@@ -0,0 +1,19 @@
+/* PR target/125357 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */
+/* { dg-final { scan-assembler-times "\tvpermilps\t" 2 } } */
+
+typedef int v4si __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+
+v4si
+foo (v4si x, v4si y)
+{
+  return __builtin_shuffle (x, y);
+}
+
+v4sf
+bar (v4sf x, v4si y)
+{
+  return __builtin_shuffle (x, y);
+}

Reply via email to