On Tue, May 19, 2026 at 4:52 PM Jakub Jelinek <[email protected]> wrote: > > On Tue, May 19, 2026 at 10:30:16AM +0200, Jakub Jelinek wrote: > > On Tue, May 19, 2026 at 10:51:37AM +0300, Alexander Monakov wrote: > > > Thanks for looking at the issue, I really appreciate it. The same problem > > > exists with 64-bit lanes (V2DF/V2DI modes, we fail to utilize vpermilpd). > > > > The control in that case is in bits 1 and 65 rather than 0 and 64. > > So, in order to use vpermilpd for > > __builtin_shuffle (v2di_or_v2df, v2di); > > one would need to first shift the mask (or vpaddq with itself). > > Though, that is still shorter than what we emit right now. > > The following seems to work for me. > > - movl $1, %eax > - vmovq %rax, %xmm2 > - vpunpcklqdq %xmm2, %xmm2, %xmm2 > - vpand %xmm2, %xmm1, %xmm1 > - vpsllq $3, %xmm1, %xmm1 > - vpshufb .LC1(%rip), %xmm1, %xmm1 > - vpaddb .LC2(%rip), %xmm1, %xmm1 > - vpshufb %xmm1, %xmm0, %xmm0 > + vpaddq %xmm1, %xmm1, %xmm1 > + vpermilpd %xmm1, %xmm0, %xmm0 > > for both V2DI and V2DF. > > Ok for trunk if it passes full bootstrap/regtest on x86_64-linux and > i686-linux?
Ok. > > 2026-05-19 Jakub Jelinek <[email protected]> > > PR target/125357 > * config/i386/i386-expand.cc (ix86_expand_vec_perm): For TARGET_AVX > one_operand_shuffle handle also V2DImode and V2DFmode using > vpaddq and vpermilpd. > > * gcc.target/i386/avx-pr125357-2.c: New test. > * gcc.target/i386/avx2-pr125357-2.c: New test. > > --- gcc/config/i386/i386-expand.cc.jj 2026-05-19 10:11:01.102445891 +0200 > +++ gcc/config/i386/i386-expand.cc 2026-05-19 10:37:46.988388686 +0200 > @@ -5824,21 +5824,34 @@ ix86_expand_vec_perm (rtx operands[]) > } > } > > - if (TARGET_AVX > - && one_operand_shuffle > - && (mode == V4SImode || mode == V4SFmode)) > - { > - if (mode == V4SImode) > - { > - op0 = gen_lowpart (V4SFmode, op0); > - t1 = gen_reg_rtx (V4SFmode); > - emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); > - emit_move_insn (target, gen_lowpart (mode, t1)); > - } > - else > + if (TARGET_AVX && one_operand_shuffle) > + switch (mode) > + { > + case V4SImode: > + op0 = gen_lowpart (V4SFmode, op0); > + t1 = gen_reg_rtx (V4SFmode); > + emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); > + emit_move_insn (target, gen_lowpart (mode, t1)); > + return; > + case V4SFmode: > emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask)); > - return; > - } > + return; > + case V2DImode: > + op0 = gen_lowpart (V2DFmode, op0); > + t1 = gen_reg_rtx (V2DImode); > + t2 = gen_reg_rtx (V2DFmode); > + emit_insn (gen_addv2di3 (t1, mask, mask)); > + emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1)); > + emit_move_insn (target, gen_lowpart (mode, t2)); > + return; > + case V2DFmode: > + t1 = gen_reg_rtx (V2DImode); > + emit_insn (gen_addv2di3 (t1, mask, mask)); > + emit_insn (gen_avx_vpermilvarv2df3 (target, op0, t1)); > + return; > + default: > + break; > + } > > if (TARGET_XOP) > { > --- gcc/testsuite/gcc.target/i386/avx-pr125357-2.c.jj 2026-05-19 > 10:42:40.697456727 +0200 > +++ gcc/testsuite/gcc.target/i386/avx-pr125357-2.c 2026-05-19 > 10:44:05.909025855 +0200 > @@ -0,0 +1,20 @@ > +/* PR 
target/125357 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */ > +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ > + > +typedef long long v2di __attribute__((vector_size (16))); > +typedef double v2df __attribute__((vector_size (16))); > + > +v2di > +foo (v2di x, v2di y) > +{ > + return __builtin_shuffle (x, y); > +} > + > +v2df > +bar (v2df x, v2di y) > +{ > + return __builtin_shuffle (x, y); > +} > --- gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c.jj 2026-05-19 > 10:43:22.313757908 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-pr125357-2.c 2026-05-19 > 10:43:50.024292593 +0200 > @@ -0,0 +1,20 @@ > +/* PR target/125357 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */ > +/* { dg-final { scan-assembler-times "\tvpaddq\t" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpermilpd\t" 2 } } */ > + > +typedef long long v2di __attribute__((vector_size (16))); > +typedef double v2df __attribute__((vector_size (16))); > + > +v2di > +foo (v2di x, v2di y) > +{ > + return __builtin_shuffle (x, y); > +} > + > +v2df > +bar (v2df x, v2di y) > +{ > + return __builtin_shuffle (x, y); > +} > > > Jakub > -- BR, Hongtao
