https://gcc.gnu.org/g:daf225605e85d17f5c6c1a205918e44b3b1eccba
commit r17-593-gdaf225605e85d17f5c6c1a205918e44b3b1eccba Author: Jakub Jelinek <[email protected]> Date: Tue May 19 10:11:08 2026 +0200 i386: Use vpermilps for some non-const permutations [PR125357] We don't use the vpermilps insn for V4S[IF]mode variable permutations on TARGET_AVX without TARGET_AVX512*. For TARGET_AVX512* there are plenty of permutation instructions already. For TARGET_AVX2, the function has special cases for one_operand_shuffle for V8SImode/V8SFmode and emits reasonable code, but for V4SImode/V4SFmode with TARGET_AVX2 it handles those using V8SImode/V8SFmode as a two-operand shuffle, which requires 2 preparation instructions, vpermd and one finalization instruction. And for !TARGET_AVX2 && TARGET_AVX we just emit terrible code for these. So, the following patch uses vpermilps for V4S[IF]mode one_operand_shuffle. Trying to handle V8S[IF]mode is not worth it; for TARGET_AVX2 we already emit good code (see above) and for !TARGET_AVX2 && TARGET_AVX a V8SImode mask is not a valid vector mode, so we emit terrible code no matter what. 2026-05-19 Jakub Jelinek <[email protected]> PR target/125357 * config/i386/i386-expand.cc (ix86_expand_vec_perm): For one_operand_shuffle if TARGET_AVX and not TARGET_AVX512F use vpermilps for V4SImode/V4SFmode. Formatting fix. * gcc.target/i386/avx-pr125357.c: New test. * gcc.target/i386/avx2-pr125357.c: New test. 
Reviewed-by: Hongtao Liu <[email protected]> Diff: --- gcc/config/i386/i386-expand.cc | 22 +++++++++++++++++++++- gcc/testsuite/gcc.target/i386/avx-pr125357.c | 19 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/avx2-pr125357.c | 19 +++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 6e07194e7167..01cff86d20aa 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -5578,7 +5578,7 @@ ix86_expand_vec_perm (rtx operands[]) switch (mode) { case E_V16SImode: - gen =gen_avx512f_permvarv16si; + gen = gen_avx512f_permvarv16si; break; case E_V16SFmode: gen = gen_avx512f_permvarv16sf; @@ -5702,6 +5702,8 @@ ix86_expand_vec_perm (rtx operands[]) return; case E_V4SImode: + if (one_operand_shuffle) + break; /* Handled below for TARGET_AVX. */ /* By combining the two 128-bit input vectors into one 256-bit input vector, we can use VPERMD and VPERMPS for the full two-operand shuffle. */ @@ -5714,6 +5716,8 @@ ix86_expand_vec_perm (rtx operands[]) return; case E_V4SFmode: + if (one_operand_shuffle) + break; /* Handled below for TARGET_AVX. */ t1 = gen_reg_rtx (V8SFmode); t2 = gen_reg_rtx (V8SImode); mask = gen_lowpart (V4SImode, mask); @@ -5820,6 +5824,22 @@ ix86_expand_vec_perm (rtx operands[]) } } + if (TARGET_AVX + && one_operand_shuffle + && (mode == V4SImode || mode == V4SFmode)) + { + if (mode == V4SImode) + { + op0 = gen_lowpart (V4SFmode, op0); + t1 = gen_reg_rtx (V4SFmode); + emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask)); + emit_move_insn (target, gen_lowpart (mode, t1)); + } + else + emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask)); + return; + } + if (TARGET_XOP) { /* The XOP VPPERM insn supports three inputs. 
By ignoring the diff --git a/gcc/testsuite/gcc.target/i386/avx-pr125357.c b/gcc/testsuite/gcc.target/i386/avx-pr125357.c new file mode 100644 index 000000000000..1d598315ea27 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-pr125357.c @@ -0,0 +1,19 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx -mno-avx2 -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpermilps\t" 2 } } */ + +typedef int v4si __attribute__((vector_size (16))); +typedef float v4sf __attribute__((vector_size (16))); + +v4si +foo (v4si x, v4si y) +{ + return __builtin_shuffle (x, y); +} + +v4sf +bar (v4sf x, v4si y) +{ + return __builtin_shuffle (x, y); +} diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr125357.c b/gcc/testsuite/gcc.target/i386/avx2-pr125357.c new file mode 100644 index 000000000000..4af93473c393 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-pr125357.c @@ -0,0 +1,19 @@ +/* PR target/125357 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f -mno-xop" } */ +/* { dg-final { scan-assembler-times "\tvpermilps\t" 2 } } */ + +typedef int v4si __attribute__((vector_size (16))); +typedef float v4sf __attribute__((vector_size (16))); + +v4si +foo (v4si x, v4si y) +{ + return __builtin_shuffle (x, y); +} + +v4sf +bar (v4sf x, v4si y) +{ + return __builtin_shuffle (x, y); +}
