On Tue, May 19, 2026 at 10:30:16AM +0200, Jakub Jelinek wrote:
> On Tue, May 19, 2026 at 10:51:37AM +0300, Alexander Monakov wrote:
> > Thanks for looking at the issue, I really appreciate it. The same problem
> > exists with 64-bit lanes (V2DF/V2DI modes, we fail to utilize vpermilpd).
> 
> The control in that case is in bits 1 and 65 rather than 0 and 64.
> So, in order to use vpermilpd for
> __builtin_shuffle (v2di_or_v2df, v2di);
> one would need to first shift the mask (or vpaddq with itself).
> Though, that is still shorter than what we emit right now.

The following seems to work for me.

-       movl    $1, %eax
-       vmovq   %rax, %xmm2
-       vpunpcklqdq     %xmm2, %xmm2, %xmm2
-       vpand   %xmm2, %xmm1, %xmm1
-       vpsllq  $3, %xmm1, %xmm1
-       vpshufb .LC1(%rip), %xmm1, %xmm1
-       vpaddb  .LC2(%rip), %xmm1, %xmm1
-       vpshufb %xmm1, %xmm0, %xmm0
+       vpaddq  %xmm1, %xmm1, %xmm1
+       vpermilpd       %xmm1, %xmm0, %xmm0

for both V2DI and V2DF.

Ok for trunk if it passes full bootstrap/regtest on x86_64-linux and
i686-linux?

2026-05-19  Jakub Jelinek  <[email protected]>

        PR target/125357
        * config/i386/i386-expand.cc (ix86_expand_vec_perm): For TARGET_AVX
        one_operand_shuffle handle also V2DImode and V2DFmode using
        vpaddq and vpermilpd.

        * gcc.target/i386/avx-pr125357-2.c: New test.
        * gcc.target/i386/avx2-pr125357-2.c: New test.

--- gcc/config/i386/i386-expand.cc.jj   2026-05-19 10:11:01.102445891 +0200
+++ gcc/config/i386/i386-expand.cc      2026-05-19 10:37:46.988388686 +0200
@@ -5824,21 +5824,34 @@ ix86_expand_vec_perm (rtx operands[])
        }
     }
 
-  if (TARGET_AVX
-      && one_operand_shuffle
-      && (mode == V4SImode || mode == V4SFmode))
-    {
-      if (mode == V4SImode)
-       {
-         op0 = gen_lowpart (V4SFmode, op0);
-         t1 = gen_reg_rtx (V4SFmode);
-         emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
-         emit_move_insn (target, gen_lowpart (mode, t1));
-       }
-      else
+  if (TARGET_AVX && one_operand_shuffle)
+    switch (mode)
+      {
+      case V4SImode:
+       op0 = gen_lowpart (V4SFmode, op0);
+       t1 = gen_reg_rtx (V4SFmode);
+       emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
+       emit_move_insn (target, gen_lowpart (mode, t1));
+       return;
+      case V4SFmode:
        emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask));
-      return;
-    }
+       return;
+      case V2DImode:
+       op0 = gen_lowpart (V2DFmode, op0);
+       t1 = gen_reg_rtx (V2DImode);
+       t2 = gen_reg_rtx (V2DFmode);
+       emit_insn (gen_addv2di3 (t1, mask, mask));
+       emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1));
+       emit_move_insn (target, gen_lowpart (mode, t2));
+       return;
+      case V2DFmode:
+       t1 = gen_reg_rtx (V2DImode);
+       emit_insn (gen_addv2di3 (t1, mask, mask));
+       emit_insn (gen_avx_vpermilvarv2df3 (target, op0, t1));
+       return;
+      default:
+       break;
+      }
 
   if (TARGET_XOP)
     {
--- gcc/testsuite/gcc.target/i386/avx-pr125357-2.c.jj   2026-05-19 10:42:40.697456727 +0200
+++ gcc/testsuite/gcc.target/i386/avx-pr125357-2.c      2026-05-19 10:44:05.909025855 +0200
@@ -0,0 +1,20 @@
+/* PR target/125357 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */
+/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef double v2df __attribute__((vector_size (16)));
+
+v2di
+foo (v2di x, v2di y)
+{
+  return __builtin_shuffle (x, y);
+}
+
+v2df
+bar (v2df x, v2di y)
+{
+  return __builtin_shuffle (x, y);
+}
--- gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c.jj  2026-05-19 10:43:22.313757908 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c     2026-05-19 10:43:50.024292593 +0200
@@ -0,0 +1,20 @@
+/* PR target/125357 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */
+/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef double v2df __attribute__((vector_size (16)));
+
+v2di
+foo (v2di x, v2di y)
+{
+  return __builtin_shuffle (x, y);
+}
+
+v2df
+bar (v2df x, v2di y)
+{
+  return __builtin_shuffle (x, y);
+}


        Jakub

Reply via email to