https://gcc.gnu.org/g:4446d3e1045bd3728f8e57ee4af85f7d1b190e4f
commit r17-617-g4446d3e1045bd3728f8e57ee4af85f7d1b190e4f Author: Jakub Jelinek <[email protected]> Date: Wed May 20 08:49:06 2026 +0200 i386: Use vpaddq + vpermilpd for some non-const permutations [PR125357] On Tue, May 19, 2026 at 10:30:16AM +0200, Jakub Jelinek wrote: > On Tue, May 19, 2026 at 10:51:37AM +0300, Alexander Monakov wrote: > > Thanks for looking at the issue, I really appreciate it. The same problem > > exists with 64-bit lanes (V2DF/V2DI modes, we fail to utilize vpermilpd). > > The control in that case is in bits 1 and 65 rather than 0 and 64. > So, in order to use vpermilpd for > __builtin_shuffle (v2di_or_v2df, v2di); > one would need to first shift the mask (or vpaddq with itself). > Though, that is still shorter than what we emit right now. The following seems to work for me. - movl $1, %eax - vmovq %rax, %xmm2 - vpunpcklqdq %xmm2, %xmm2, %xmm2 - vpand %xmm2, %xmm1, %xmm1 - vpsllq $3, %xmm1, %xmm1 - vpshufb .LC1(%rip), %xmm1, %xmm1 - vpaddb .LC2(%rip), %xmm1, %xmm1 - vpshufb %xmm1, %xmm0, %xmm0 + vpaddq %xmm1, %xmm1, %xmm1 + vpermilpd %xmm1, %xmm0, %xmm0 for both V2DI and V2DF. 2026-05-20 Jakub Jelinek <[email protected]> PR target/125357 * config/i386/i386-expand.cc (ix86_expand_vec_perm): For TARGET_AVX one_operand_shuffle handle also V2DImode and V2DFmode using vpaddq and vpermilpd. * gcc.target/i386/avx-pr125357-2.c: New test. * gcc.target/i386/avx2-pr125357-2.c: New test. 
Reviewed-by: Hongtao Liu <[email protected]> Diff: --- gcc/config/i386/i386-expand.cc | 41 ++++++++++++++++--------- gcc/testsuite/gcc.target/i386/avx-pr125357-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c | 20 ++++++++++++ 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2c366329107d..6af5b7751b2c 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -5824,21 +5824,34 @@ ix86_expand_vec_perm (rtx operands[]) } } - if (TARGET_AVX - && one_operand_shuffle - && (mode == V4SImode || mode == V4SFmode)) - { - if (mode == V4SImode) - { - op0 = gen_lowpart (V4SFmode, op0); - t1 = gen_reg_rtx (V4SFmode); - emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); - emit_move_insn (target, gen_lowpart (mode, t1)); - } - else + if (TARGET_AVX && one_operand_shuffle) + switch (mode) + { + case V4SImode: + op0 = gen_lowpart (V4SFmode, op0); + t1 = gen_reg_rtx (V4SFmode); + emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); + emit_move_insn (target, gen_lowpart (mode, t1)); + return; + case V4SFmode: emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask)); - return; - } + return; + case V2DImode: + op0 = gen_lowpart (V2DFmode, op0); + t1 = gen_reg_rtx (V2DImode); + t2 = gen_reg_rtx (V2DFmode); + emit_insn (gen_addv2di3 (t1, mask, mask)); + emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1)); + emit_move_insn (target, gen_lowpart (mode, t2)); + return; + case V2DFmode: + t1 = gen_reg_rtx (V2DImode); + emit_insn (gen_addv2di3 (t1, mask, mask)); + emit_insn (gen_avx_vpermilvarv2df3 (target, op0, t1)); + return; + default: + break; + } if (TARGET_XOP) { diff --git a/gcc/testsuite/gcc.target/i386/avx-pr125357-2.c b/gcc/testsuite/gcc.target/i386/avx-pr125357-2.c new file mode 100644 index 000000000000..4eebe91c0aad --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-pr125357-2.c @@ -0,0 +1,20 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { 
dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ + +typedef long long v2di __attribute__((vector_size (16))); +typedef double v2df __attribute__((vector_size (16))); + +v2di +foo (v2di x, v2di y) +{ + return __builtin_shuffle (x, y); +} + +v2df +bar (v2df x, v2di y) +{ + return __builtin_shuffle (x, y); +} diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c b/gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c new file mode 100644 index 000000000000..34ddc59b99fb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c @@ -0,0 +1,20 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ + +typedef long long v2di __attribute__((vector_size (16))); +typedef double v2df __attribute__((vector_size (16))); + +v2di +foo (v2di x, v2di y) +{ + return __builtin_shuffle (x, y); +} + +v2df +bar (v2df x, v2di y) +{ + return __builtin_shuffle (x, y); +}
