A constant 8x8 bit matrix can describe the movement of the 8 bits
within each byte, so these shifts, arithmetic shifts and rotates can
each be performed with a single VGF2P8AFFINEQB instruction.
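
For reference, a scalar model of the per-byte transform (an
illustrative helper, not part of the patch, following the
VGF2P8AFFINEQB pseudocode with the xor immediate fixed at 0):

    #include <stdint.h>

    /*
     * Row i of the matrix is stored in byte 7-i of the qword;
     * bit i of the result is the parity of (row AND x).
     */
    static uint8_t gf2p8affine_byte(uint64_t matrix, uint8_t x)
    {
        uint8_t r = 0;

        for (int i = 0; i < 8; i++) {
            uint8_t row = matrix >> (8 * (7 - i));
            r |= __builtin_parity(row & x) << i;
        }
        return r;
    }

Under this model, gf2p8affine_byte(0x0001020408102040, x) equals
(uint8_t)(x << 1) for all x, which is the shl-by-1 matrix used below;
0x8040201008040201 likewise reverses the bits of each byte.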

Logic courtesy of Andi Kleen <a...@linux.intel.com>:
https://gcc.gnu.org/pipermail/gcc-patches/2025-August/691624.html

Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 75 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 9dd588fc41..fb76724941 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -4342,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     }
 }
 
+/*
+ * Multiply each byte of v1 by a constant 8x8 bit matrix over GF(2):
+ * bit i of each output byte is the parity of (matrix byte [7-i] AND
+ * the input byte); the xor immediate is 0.
+ */
+static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0,
+                                TCGv_vec v1, uint64_t matrix)
+{
+    vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8,
+              tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+              tcgv_vec_arg(tcg_constant_vec(type, MO_64, matrix)), 0);
+}
+
 static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
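+    /*
+     * Shift matrices: row i of the matrix, stored in byte 7-i, selects
+     * source bit i - imm (left) or i + imm (right); rows that would
+     * reference a bit outside the byte are zero.
+     */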
+    static const uint64_t gf2_shi[2][8] = {
+        /* left shift */
+        { 0,
+          0x0001020408102040ull,
+          0x0000010204081020ull,
+          0x0000000102040810ull,
+          0x0000000001020408ull,
+          0x0000000000010204ull,
+          0x0000000000000102ull,
+          0x0000000000000001ull },
+        /* right shift */
+        { 0,
+          0x0204081020408000ull,
+          0x0408102040800000ull,
+          0x0810204080000000ull,
+          0x1020408000000000ull,
+          0x2040800000000000ull,
+          0x4080000000000000ull,
+          0x8000000000000000ull }
+    };
     uint8_t mask;
 
     tcg_debug_assert(vece == MO_8);
+
+    if (cpuinfo & CPUINFO_GFNI) {
+        gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]);
+        return;
+    }
+
     if (right) {
         mask = 0xff >> imm;
         tcg_gen_shri_vec(MO_16, v0, v1, imm);
@@ -4361,10 +4395,25 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right,
 static void expand_vec_sari(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
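+    /*
+     * As for right shifts, except that rows which would reference a
+     * bit beyond bit 7 replicate the sign bit (0x80) instead of being
+     * zero.
+     */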
+    static const uint64_t gf2_sar[8] = {
+        0,
+        0x0204081020408080ull,
+        0x0408102040808080ull,
+        0x0810204080808080ull,
+        0x1020408080808080ull,
+        0x2040808080808080ull,
+        0x4080808080808080ull,
+        0x8080808080808080ull,
+    };
     TCGv_vec t1, t2;
 
     switch (vece) {
     case MO_8:
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]);
+            break;
+        }
+
         /* Unpack to 16-bit, shift, and repack.  */
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
@@ -4416,12 +4465,30 @@ static void expand_vec_sari(TCGType type, unsigned vece,
 static void expand_vec_rotli(TCGType type, unsigned vece,
                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
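+    /*
+     * As for left shifts, except that the diagonal wraps around:
+     * row i selects source bit (i - imm) mod 8.
+     */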
+    static const uint64_t gf2_rol[8] = {
+        0,
+        0x8001020408102040ull,
+        0x4080010204081020ull,
+        0x2040800102040810ull,
+        0x1020408001020408ull,
+        0x0810204080010204ull,
+        0x0408102040800102ull,
+        0x0204081020408001ull,
+    };
     TCGv_vec t;
 
-    if (vece != MO_8 && have_avx512vbmi2) {
-        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
-                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
-        return;
+    if (vece == MO_8) {
+        if (cpuinfo & CPUINFO_GFNI) {
+            gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]);
+            return;
+        }
+    } else {
+        if (have_avx512vbmi2) {
+            vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
+                      tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+                      tcgv_vec_arg(v1), imm);
+            return;
+        }
     }
 
     t = tcg_temp_new_vec(type);
-- 
2.43.0

