Re: [RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.

2021-11-02 Thread liweiwei

Thanks for your suggestions.

On 2021/11/2 11:44 PM, Richard Henderson wrote:

On 11/1/21 11:11 PM, liweiwei wrote:

Signed-off-by: liweiwei 
Signed-off-by: wangjunqiang 


You managed to get the whole patch description into the subject line.
Please break it up.


OK.

+target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2)
+{
+    return do_grev(rs1, rs2, TARGET_LONG_BITS);
+}


Are we expecting to see the full grev instruction at any point? If 
not, we can certainly implement Zbk with a simpler implementation.
The main reason I added this helper is that grev may be added to the
B extension later and could then be reused. However, it has no effect
currently. I'll replace this with a simpler implementation.
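For example, a Zbkb-only brev8 (grev with shamt == 7) could be open-coded
roughly like this (untested sketch; the helper name and its declaration are
placeholders, not part of this patch):

target_ulong HELPER(brev8)(target_ulong rs1)
{
    /* Reverse the bits inside each byte with three fixed swap stages
     * (shift 1, 2, 4); equivalent to grev with shamt == 7. */
    target_ulong x = rs1;

    x = ((x & dup_const(MO_8, 0x55)) << 1) | ((x & dup_const(MO_8, 0xaa)) >> 1);
    x = ((x & dup_const(MO_8, 0x33)) << 2) | ((x & dup_const(MO_8, 0xcc)) >> 2);
    x = ((x & dup_const(MO_8, 0x0f)) << 4) | ((x & dup_const(MO_8, 0xf0)) >> 4);

    return x;
}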


+target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2)
+{
+    target_ulong r = 0;
+    target_ulong sz = 1LL << sz_log2;
+    target_ulong mask = (1LL << sz) - 1;
+    for (int i = 0; i < TARGET_LONG_BITS; i += sz) {
+        target_ulong pos = ((rs2 >> i) & mask) << sz_log2;
+        if (pos < sizeof(target_ulong) * 8) {
+            r |= ((rs1 >> pos) & mask) << i;
+        }
+    }
+    return r;
+}


This could become a static inline do_xperm, and provide two specific 
xperm4 and xperm8 helpers; the compiler would fold all of the sz_log2 
stuff into a more efficient implementation.

OK.
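Something along these lines, then (rough sketch; the matching DEF_HELPER
declarations for xperm4/xperm8 are assumed):

static inline target_ulong do_xperm(target_ulong rs1, target_ulong rs2,
                                    uint32_t sz_log2)
{
    target_ulong r = 0;
    target_ulong sz = 1LL << sz_log2;
    target_ulong mask = (1LL << sz) - 1;

    for (int i = 0; i < TARGET_LONG_BITS; i += sz) {
        target_ulong pos = ((rs2 >> i) & mask) << sz_log2;
        if (pos < TARGET_LONG_BITS) {
            r |= ((rs1 >> pos) & mask) << i;
        }
    }
    return r;
}

/* With sz_log2 constant at each call site, the compiler can fold the
 * shift/mask computations for the 4-bit and 8-bit variants. */
target_ulong HELPER(xperm4)(target_ulong rs1, target_ulong rs2)
{
    return do_xperm(rs1, rs2, 2);
}

target_ulong HELPER(xperm8)(target_ulong rs1, target_ulong rs2)
{
    return do_xperm(rs1, rs2, 3);
}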



+target_ulong HELPER(unshfl)(target_ulong rs1,
+                            target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    int bits = TARGET_LONG_BITS >> 1;
+    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}
+
+target_ulong HELPER(shfl)(target_ulong rs1,
+                          target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    shift = TARGET_LONG_BITS >> 2;
+    i = (shift == 8) ? 3 : 4;
+    for (; i >= 0; i--, shift >>= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}


Similar comment as for grev.
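Understood. For Zbkb only the RV32-only zip/unzip forms are needed, so the
staged shuffle could likewise be replaced with a direct bit interleave, e.g.
(rough sketch, assuming zip places low-half bits at even positions and unzip
is its inverse):

static uint32_t do_zip32(uint32_t x)
{
    /* Interleave the two 16-bit halves: bit i of the low half goes to
     * bit 2*i, bit i of the high half goes to bit 2*i+1. */
    uint32_t r = 0;
    for (int i = 0; i < 16; i++) {
        r |= ((x >> i) & 1) << (2 * i);
        r |= ((x >> (i + 16)) & 1) << (2 * i + 1);
    }
    return r;
}

static uint32_t do_unzip32(uint32_t x)
{
    /* Inverse of do_zip32: even bits back to the low half, odd bits to
     * the high half. */
    uint32_t r = 0;
    for (int i = 0; i < 16; i++) {
        r |= ((x >> (2 * i)) & 1) << i;
        r |= ((x >> (2 * i + 1)) & 1) << (i + 16);
    }
    return r;
}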


+# The encoding for zext.h differs between RV32 and RV64.
+# zext_h_32 denotes the RV32 variant.
+{
+  zext_h_32  0000100 00000 ..... 100 ..... 0110011 @r2
+  pack       0000100 ..... ..... 100 ..... 0110011 @r
+}


Note to self: improve tcg_gen_deposit to notice zeros, so that the 
more general pack compiles to zero-extension.
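To illustrate the point: with rs2 == x0 the deposit writes zeros into the
upper half, so pack rd, rs1, x0 is just a zero-extension of the low half of
rs1. A zero-aware deposit could emit the equivalent of (sketch only, the
function name is a placeholder):

static void gen_pack_zero_high(TCGv ret, TCGv src1)
{
    /* Zero-extend the low half of src1: what pack rd, rs1, x0 computes. */
    tcg_gen_extract_tl(ret, src1, 0, TARGET_LONG_BITS / 2);
}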


@@ -556,6 +563,81 @@ static bool gen_unary_per_ol(DisasContext *ctx, arg_r2 *a, DisasExtend ext,
     return gen_unary(ctx, a, ext, f_tl);
 }
 
+static bool gen_xperm(DisasContext *ctx, arg_r *a, int32_t size)
+{
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src2 = get_gpr(ctx, a->rs2, EXT_NONE);
+
+    TCGv_i32 sz = tcg_const_i32(size);
+    gen_helper_xperm(dest, src1, src2, sz);
+
+    gen_set_gpr(ctx, a->rd, dest);
+    tcg_temp_free_i32(sz);
+    return true;
+}
+
+static bool gen_grevi(DisasContext *ctx, arg_r2 *a, int shamt)
+{
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+
+    if (shamt == (TARGET_LONG_BITS - 8)) {
+        /* rev8, byte swaps */
+        tcg_gen_bswap_tl(dest, src1);
+    } else {
+        TCGv src2 = tcg_temp_new();
+        tcg_gen_movi_tl(src2, shamt);
+        gen_helper_grev(dest, src1, src2);
+        tcg_temp_free(src2);
+    }
+
+    gen_set_gpr(ctx, a->rd, dest);
+    return true;
+}
+
+static void gen_pack(TCGv ret, TCGv src1, TCGv src2)
+{
+    tcg_gen_deposit_tl(ret, src1, src2,
+                       TARGET_LONG_BITS / 2,
+                       TARGET_LONG_BITS / 2);
+}
+
+static void gen_packh(TCGv ret, TCGv src1, TCGv src2)
+{
+    TCGv t = tcg_temp_new();
+    tcg_gen_ext8u_tl(t, src2);
+    tcg_gen_deposit_tl(ret, src1, t, 8, TARGET_LONG_BITS - 8);
+    tcg_temp_free(t);
+}
+
+static void gen_packw(TCGv ret, TCGv src1, TCGv src2)
+{
+    TCGv t = tcg_temp_new();
+    tcg_gen_ext16s_tl(t, src2);
+    tcg_gen_deposit_tl(ret, src1, t, 16, 48);
+    tcg_temp_free(t);
+}
+
+static bool gen_shufi(DisasContext *ctx, arg_r2 *a, int shamt,
+                      void(*func)(TCGv, TCGv, TCGv))
+{
+    if (shamt >= TARGET_LONG_BITS / 2) {
+        return false;
+    }
+
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src2 = tcg_temp_new();
+
+    tcg_gen_movi_tl(src2, shamt);
+    (*func)(dest, src1, src2);
+
+    gen_set_gpr(ctx, a->rd, dest);
+    tcg_temp_free(src2);
+    return true;
+}


All of the gen functions belong in insn_trans/trans_rvb.c.inc.

OK. I'll move them to insn_trans/trans_rvb.c.inc.



r~





Re: [RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.

2021-11-02 Thread Richard Henderson

On 11/1/21 11:11 PM, liweiwei wrote:

Signed-off-by: liweiwei 
Signed-off-by: wangjunqiang 


You managed to get the whole patch description into the subject line.
Please break it up.


+target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2)
+{
+    return do_grev(rs1, rs2, TARGET_LONG_BITS);
+}


Are we expecting to see the full grev instruction at any point?  If not, we can certainly 
implement Zbk with a simpler implementation.



+target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2)
+{
+    target_ulong r = 0;
+    target_ulong sz = 1LL << sz_log2;
+    target_ulong mask = (1LL << sz) - 1;
+    for (int i = 0; i < TARGET_LONG_BITS; i += sz) {
+        target_ulong pos = ((rs2 >> i) & mask) << sz_log2;
+        if (pos < sizeof(target_ulong) * 8) {
+            r |= ((rs1 >> pos) & mask) << i;
+        }
+    }
+    return r;
+}


This could become a static inline do_xperm, and provide two specific xperm4 and xperm8 
helpers; the compiler would fold all of the sz_log2 stuff into a more efficient 
implementation.



+target_ulong HELPER(unshfl)(target_ulong rs1,
+                            target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    int bits = TARGET_LONG_BITS >> 1;
+    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}
+
+target_ulong HELPER(shfl)(target_ulong rs1,
+                          target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    shift = TARGET_LONG_BITS >> 2;
+    i = (shift == 8) ? 3 : 4;
+    for (; i >= 0; i--, shift >>= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}


Similar comment as for grev.


+# The encoding for zext.h differs between RV32 and RV64.
+# zext_h_32 denotes the RV32 variant.
+{
+  zext_h_32  0000100 00000 ..... 100 ..... 0110011 @r2
+  pack       0000100 ..... ..... 100 ..... 0110011 @r
+}


Note to self: improve tcg_gen_deposit to notice zeros, so that the more general pack 
compiles to zero-extension.



@@ -556,6 +563,81 @@ static bool gen_unary_per_ol(DisasContext *ctx, arg_r2 *a, DisasExtend ext,
     return gen_unary(ctx, a, ext, f_tl);
 }
 
+static bool gen_xperm(DisasContext *ctx, arg_r *a, int32_t size)
+{
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src2 = get_gpr(ctx, a->rs2, EXT_NONE);
+
+    TCGv_i32 sz = tcg_const_i32(size);
+    gen_helper_xperm(dest, src1, src2, sz);
+
+    gen_set_gpr(ctx, a->rd, dest);
+    tcg_temp_free_i32(sz);
+    return true;
+}
+
+static bool gen_grevi(DisasContext *ctx, arg_r2 *a, int shamt)
+{
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+
+    if (shamt == (TARGET_LONG_BITS - 8)) {
+        /* rev8, byte swaps */
+        tcg_gen_bswap_tl(dest, src1);
+    } else {
+        TCGv src2 = tcg_temp_new();
+        tcg_gen_movi_tl(src2, shamt);
+        gen_helper_grev(dest, src1, src2);
+        tcg_temp_free(src2);
+    }
+
+    gen_set_gpr(ctx, a->rd, dest);
+    return true;
+}
+
+static void gen_pack(TCGv ret, TCGv src1, TCGv src2)
+{
+    tcg_gen_deposit_tl(ret, src1, src2,
+                       TARGET_LONG_BITS / 2,
+                       TARGET_LONG_BITS / 2);
+}
+
+static void gen_packh(TCGv ret, TCGv src1, TCGv src2)
+{
+    TCGv t = tcg_temp_new();
+    tcg_gen_ext8u_tl(t, src2);
+    tcg_gen_deposit_tl(ret, src1, t, 8, TARGET_LONG_BITS - 8);
+    tcg_temp_free(t);
+}
+
+static void gen_packw(TCGv ret, TCGv src1, TCGv src2)
+{
+    TCGv t = tcg_temp_new();
+    tcg_gen_ext16s_tl(t, src2);
+    tcg_gen_deposit_tl(ret, src1, t, 16, 48);
+    tcg_temp_free(t);
+}
+
+static bool gen_shufi(DisasContext *ctx, arg_r2 *a, int shamt,
+                      void(*func)(TCGv, TCGv, TCGv))
+{
+    if (shamt >= TARGET_LONG_BITS / 2) {
+        return false;
+    }
+
+    TCGv dest = dest_gpr(ctx, a->rd);
+    TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE);
+    TCGv src2 = tcg_temp_new();
+
+    tcg_gen_movi_tl(src2, shamt);
+    (*func)(dest, src1, src2);
+
+    gen_set_gpr(ctx, a->rd, dest);
+    tcg_temp_free(src2);
+    return true;
+}


All of the gen functions belong in insn_trans/trans_rvb.c.inc.


r~



[RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.

2021-11-02 Thread liweiwei
Signed-off-by: liweiwei 
Signed-off-by: wangjunqiang 
---
 target/riscv/bitmanip_helper.c  | 94 +
 target/riscv/helper.h   |  4 ++
 target/riscv/insn32.decode  | 52 +-
 target/riscv/insn_trans/trans_rvb.c.inc | 91 
 target/riscv/translate.c| 82 +
 5 files changed, 292 insertions(+), 31 deletions(-)

diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c
index f1b5e5549f..1c6beb8216 100644
--- a/target/riscv/bitmanip_helper.c
+++ b/target/riscv/bitmanip_helper.c
@@ -49,3 +49,97 @@ target_ulong HELPER(clmulr)(target_ulong rs1, target_ulong rs2)
 
     return result;
 }
+
+static const uint64_t adjacent_masks[] = {
+    dup_const(MO_8, 0x55),
+    dup_const(MO_8, 0x33),
+    dup_const(MO_8, 0x0f),
+    dup_const(MO_16, 0xff),
+    dup_const(MO_32, 0xffff),
+    UINT32_MAX
+};
+
+static inline target_ulong do_swap(target_ulong x, uint64_t mask, int shift)
+{
+    return ((x & mask) << shift) | ((x & ~mask) >> shift);
+}
+
+static target_ulong do_grev(target_ulong rs1,
+                            target_ulong rs2,
+                            int bits)
+{
+    target_ulong x = rs1;
+    int i, shift;
+
+    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
+        if (rs2 & shift) {
+            x = do_swap(x, adjacent_masks[i], shift);
+        }
+    }
+
+    return x;
+}
+
+target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2)
+{
+    return do_grev(rs1, rs2, TARGET_LONG_BITS);
+}
+
+target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2)
+{
+    target_ulong r = 0;
+    target_ulong sz = 1LL << sz_log2;
+    target_ulong mask = (1LL << sz) - 1;
+    for (int i = 0; i < TARGET_LONG_BITS; i += sz) {
+        target_ulong pos = ((rs2 >> i) & mask) << sz_log2;
+        if (pos < sizeof(target_ulong) * 8) {
+            r |= ((rs1 >> pos) & mask) << i;
+        }
+    }
+    return r;
+}
+
+static const uint64_t shuf_masks[] = {
+    dup_const(MO_8, 0x44),
+    dup_const(MO_8, 0x30),
+    dup_const(MO_16, 0x0f00),
+    dup_const(MO_32, 0xff0000),
+    dup_const(MO_64, 0xffff00000000)
+};
+
+static inline target_ulong do_shuf_stage(target_ulong src, uint64_t maskL,
+                                         uint64_t maskR, int shift)
+{
+    target_ulong x = src & ~(maskL | maskR);
+    x |= ((src << shift) & maskL) | ((src >> shift) & maskR);
+    return x;
+}
+
+target_ulong HELPER(unshfl)(target_ulong rs1,
+                            target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    int bits = TARGET_LONG_BITS >> 1;
+    for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}
+
+target_ulong HELPER(shfl)(target_ulong rs1,
+                          target_ulong rs2)
+{
+    target_ulong x = rs1;
+    int i, shift;
+    shift = TARGET_LONG_BITS >> 2;
+    i = (shift == 8) ? 3 : 4;
+    for (; i >= 0; i--, shift >>= 1) {
+        if (rs2 & shift) {
+            x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift);
+        }
+    }
+    return x;
+}
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index c7a5376227..216aa4193b 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -61,6 +61,10 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64)
 /* Bitmanip */
 DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl)
+DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl)
+DEF_HELPER_FLAGS_3(xperm, TCG_CALL_NO_RWG_SE, tl, tl, tl, i32)
+DEF_HELPER_FLAGS_2(shfl, TCG_CALL_NO_RWG_SE, tl, tl, tl)
+DEF_HELPER_FLAGS_2(unshfl, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 
 /* Special functions */
 DEF_HELPER_2(csrr, tl, env, int)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 2f251dac1b..a5333c4533 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -672,8 +672,22 @@ sh2add_uw  0010000 ..... ..... 100 ..... 0111011 @r
sh3add_uw  0010000 ..... ..... 110 ..... 0111011 @r
slli_uw    00001. ...... ..... 001 ..... 0011011 @sh
 
-# *** RV32 Zbb Standard Extension ***
+# *** RV32 Zbb/Zbkb Standard Extension ***
andn       0100000 ..... ..... 111 ..... 0110011 @r
+rol        0110000 ..... ..... 001 ..... 0110011 @r
+ror        0110000 ..... ..... 101 ..... 0110011 @r
+rori       01100. ...... ..... 101 ..... 0010011 @sh
+# The encoding for rev8 differs between RV32 and RV64.
+# rev8_32 denotes the RV32 variant.
+rev8_32    011010 011000 ..... 101 ..... 0010011 @r2
+# The encoding for zext.h differs between RV32 and RV64.
+# zext_h_32 denotes the RV32 variant.
+{
+  zext_h_32  0000100 00000 ..... 100 ..... 0110011 @r2
+  pack       0000100 ..... ..... 100 ..... 0110011 @r
+}
+xnor       0100000 ..... ..... 100 ..... 0110011 @r
+#