A constant 8x8 bit matrix can describe the movement of the 8 bits within each byte, so each of these shifts can be performed with a single instruction.
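For reference, GF2P8AFFINEQB computes each output bit i of a byte as the
parity of (matrix byte [7 - i] AND the source byte), xor'd with bit i of
imm8; the code below always passes imm8 = 0.  A minimal stand-alone model
(not part of this patch; the helper name gf2p8affine_byte and the use of
__builtin_parity are illustrative only) that cross-checks the shift-by-1
matrices against ordinary byte shifts:

    #include <assert.h>
    #include <stdint.h>

    /* One byte lane of GF2P8AFFINEQB with imm8 == 0. */
    static uint8_t gf2p8affine_byte(uint64_t matrix, uint8_t x)
    {
        uint8_t r = 0;
        for (int i = 0; i < 8; i++) {
            uint8_t row = matrix >> (8 * (7 - i));
            r |= (uint8_t)(__builtin_parity(row & x) << i);
        }
        return r;
    }

    int main(void)
    {
        for (int x = 0; x < 256; x++) {
            /* gf2_shi[0][1]: left shift by 1. */
            assert(gf2p8affine_byte(0x0001020408102040ull, x)
                   == (uint8_t)(x << 1));
            /* gf2_shi[1][1]: logical right shift by 1. */
            assert(gf2p8affine_byte(0x0204081020408000ull, x)
                   == (uint8_t)(x >> 1));
            /* gf2_sar[1]: arithmetic right shift by 1. */
            assert(gf2p8affine_byte(0x0204081020408080ull, x)
                   == (uint8_t)((int8_t)x >> 1));
            /* gf2_rol[1]: rotate left by 1. */
            assert(gf2p8affine_byte(0x8001020408102040ull, x)
                   == (uint8_t)((x << 1) | (x >> 7)));
        }
        return 0;
    }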
Logic courtesy of Andi Kleen <a...@linux.intel.com>:
https://gcc.gnu.org/pipermail/gcc-patches/2025-August/691624.html

Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 75 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 9dd588fc41..fb76724941 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -4342,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     }
 }
 
+static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0,
+                                TCGv_vec v1, uint64_t matrix)
+{
+    vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8,
+              tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+              tcgv_vec_arg(tcg_constant_vec(type, MO_64, matrix)), 0);
+}
+
 static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_shi[2][8] = {
+        /* left shift */
+        { 0,
+          0x0001020408102040ull,
+          0x0000010204081020ull,
+          0x0000000102040810ull,
+          0x0000000001020408ull,
+          0x0000000000010204ull,
+          0x0000000000000102ull,
+          0x0000000000000001ull },
+        /* right shift */
+        { 0,
+          0x0204081020408000ull,
+          0x0408102040800000ull,
+          0x0810204080000000ull,
+          0x1020408000000000ull,
+          0x2040800000000000ull,
+          0x4080000000000000ull,
+          0x8000000000000000ull }
+    };
     uint8_t mask;
 
     tcg_debug_assert(vece == MO_8);
+
+    if (cpuinfo & CPUINFO_GFNI) {
+        gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]);
+        return;
+    }
+
     if (right) {
         mask = 0xff >> imm;
         tcg_gen_shri_vec(MO_16, v0, v1, imm);
@@ -4361,10 +4395,25 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right,
 static void expand_vec_sari(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_sar[8] = {
+        0,
+        0x0204081020408080ull,
+        0x0408102040808080ull,
+        0x0810204080808080ull,
+        0x1020408080808080ull,
+        0x2040808080808080ull,
+        0x4080808080808080ull,
+        0x8080808080808080ull,
+    };
     TCGv_vec t1, t2;
 
     switch (vece) {
     case MO_8:
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]);
+            break;
+        }
+
         /* Unpack to 16-bit, shift, and repack. */
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
@@ -4416,12 +4465,30 @@ static void expand_vec_sari(TCGType type, unsigned vece,
 static void expand_vec_rotli(TCGType type, unsigned vece,
                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
+    static const uint64_t gf2_rol[8] = {
+        0,
+        0x8001020408102040ull,
+        0x4080010204081020ull,
+        0x2040800102040810ull,
+        0x1020408001020408ull,
+        0x0810204080010204ull,
+        0x0408102040800102ull,
+        0x0204081020408001ull,
+    };
     TCGv_vec t;
 
-    if (vece != MO_8 && have_avx512vbmi2) {
-        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
-                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
-        return;
+    if (vece == MO_8) {
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]);
+            return;
+        }
+    } else {
+        if (have_avx512vbmi2) {
+            vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
+                      tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+                      tcgv_vec_arg(v1), imm);
+            return;
+        }
     }
 
     t = tcg_temp_new_vec(type);
-- 
2.43.0