On Tue, May 19, 2026 at 10:30:16AM +0200, Jakub Jelinek wrote: > On Tue, May 19, 2026 at 10:51:37AM +0300, Alexander Monakov wrote: > > Thanks for looking at the issue, I really appreciate it. The same problem > > exists with 64-bit lanes (V2DF/V2DI modes, we fail to utilize vpermilpd). > > The control in that case is in bits 1 and 65 rather than 0 and 64. > So, in order to use vpermilpd for > __builtin_shuffle (v2di_or_v2df, v2di); > one would need to first shift the mask (or vpaddq with itself). > Though, that is still shorter than what we emit right now.
The following seems to work for me. - movl $1, %eax - vmovq %rax, %xmm2 - vpunpcklqdq %xmm2, %xmm2, %xmm2 - vpand %xmm2, %xmm1, %xmm1 - vpsllq $3, %xmm1, %xmm1 - vpshufb .LC1(%rip), %xmm1, %xmm1 - vpaddb .LC2(%rip), %xmm1, %xmm1 - vpshufb %xmm1, %xmm0, %xmm0 + vpaddq %xmm1, %xmm1, %xmm1 + vpermilpd %xmm1, %xmm0, %xmm0 for both V2DI and V2DF. Ok for trunk if it passes full bootstrap/regtest on x86_64-linux and i686-linux? 2026-05-19 Jakub Jelinek <[email protected]> PR target/125357 * config/i386/i386-expand.cc (ix86_expand_vec_perm): For TARGET_AVX one_operand_shuffle handle also V2DImode and V2DFmode using vpaddq and vpermilpd. * gcc.target/i386/avx-pr125357-2.c: New test. * gcc.target/i386/avx2-pr125357-2.c: New test. --- gcc/config/i386/i386-expand.cc.jj 2026-05-19 10:11:01.102445891 +0200 +++ gcc/config/i386/i386-expand.cc 2026-05-19 10:37:46.988388686 +0200 @@ -5824,21 +5824,34 @@ ix86_expand_vec_perm (rtx operands[]) } } - if (TARGET_AVX - && one_operand_shuffle - && (mode == V4SImode || mode == V4SFmode)) - { - if (mode == V4SImode) - { - op0 = gen_lowpart (V4SFmode, op0); - t1 = gen_reg_rtx (V4SFmode); - emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); - emit_move_insn (target, gen_lowpart (mode, t1)); - } - else + if (TARGET_AVX && one_operand_shuffle) + switch (mode) + { + case V4SImode: + op0 = gen_lowpart (V4SFmode, op0); + t1 = gen_reg_rtx (V4SFmode); + emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); + emit_move_insn (target, gen_lowpart (mode, t1)); + return; + case V4SFmode: emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask)); - return; - } + return; + case V2DImode: + op0 = gen_lowpart (V2DFmode, op0); + t1 = gen_reg_rtx (V2DImode); + t2 = gen_reg_rtx (V2DFmode); + emit_insn (gen_addv2di3 (t1, mask, mask)); + emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1)); + emit_move_insn (target, gen_lowpart (mode, t2)); + return; + case V2DFmode: + t1 = gen_reg_rtx (V2DImode); + emit_insn (gen_addv2di3 (t1, mask, mask)); + emit_insn 
(gen_avx_vpermilvarv2df3 (target, op0, t1)); + return; + default: + break; + } if (TARGET_XOP) { --- gcc/testsuite/gcc.target/i386/avx-pr125357-2.c.jj 2026-05-19 10:42:40.697456727 +0200 +++ gcc/testsuite/gcc.target/i386/avx-pr125357-2.c 2026-05-19 10:44:05.909025855 +0200 @@ -0,0 +1,20 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ + +typedef long long v2di __attribute__((vector_size (16))); +typedef double v2df __attribute__((vector_size (16))); + +v2di +foo (v2di x, v2di y) +{ + return __builtin_shuffle (x, y); +} + +v2df +bar (v2df x, v2di y) +{ + return __builtin_shuffle (x, y); +} --- gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c.jj 2026-05-19 10:43:22.313757908 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c 2026-05-19 10:43:50.024292593 +0200 @@ -0,0 +1,20 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ + +typedef long long v2di __attribute__((vector_size (16))); +typedef double v2df __attribute__((vector_size (16))); + +v2di +foo (v2di x, v2di y) +{ + return __builtin_shuffle (x, y); +} + +v2df +bar (v2df x, v2di y) +{ + return __builtin_shuffle (x, y); +} Jakub
