Re: [RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.
Thanks for your suggestions. 在 2021/11/2 下午11:44, Richard Henderson 写道: On 11/1/21 11:11 PM, liweiwei wrote: Signed-off-by: liweiwei Signed-off-by: wangjunqiang You managed to get the whole patch description into the subject line. Please break it up. OK. +target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2) +{ + return do_grev(rs1, rs2, TARGET_LONG_BITS); +} Are we expecting to see the full grev instruction at any point? If not, we can certainly implement Zbk with a simpler implementation. The main reason I added this helper is that grev may be added to B-extension later and it can be reused. However, it has no effect currently. I'll replace this with a simpler implementation. +target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2) +{ + target_ulong r = 0; + target_ulong sz = 1LL << sz_log2; + target_ulong mask = (1LL << sz) - 1; + for (int i = 0; i < TARGET_LONG_BITS; i += sz) { + target_ulong pos = ((rs2 >> i) & mask) << sz_log2; + if (pos < sizeof(target_ulong) * 8) { + r |= ((rs1 >> pos) & mask) << i; + } + } + return r; +} This could become a static inline do_xperm, and provide two specific xperm4 and xperm8 helpers; the compiler would fold all of the sz_log2 stuff into a more efficient implementation. OK. +target_ulong HELPER(unshfl)(target_ulong rs1, + target_ulong rs2) +{ + target_ulong x = rs1; + int i, shift; + int bits = TARGET_LONG_BITS >> 1; + for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) { + if (rs2 & shift) { + x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); + } + } + return x; +} + +target_ulong HELPER(shfl)(target_ulong rs1, + target_ulong rs2) +{ + target_ulong x = rs1; + int i, shift; + shift = TARGET_LONG_BITS >> 2; + i = (shift == 8) ? 3 : 4; + for (; i >= 0; i--, shift >>= 1) { + if (rs2 & shift) { + x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); + } + } + return x; +} Similar comment as for grev. +# The encoding for zext.h differs between RV32 and RV64. 
+# zext_h_32 denotes the RV32 variant. +{ + zext_h_32 100 0 . 100 . 0110011 @r2 + pack 100 . . 100 . 0110011 @r +} Note to self: improve tcg_gen_deposit to notice zeros, so that the more general pack compiles to zero-extension. @@ -556,6 +563,81 @@ static bool gen_unary_per_ol(DisasContext *ctx, arg_r2 *a, DisasExtend ext, return gen_unary(ctx, a, ext, f_tl); } +static bool gen_xperm(DisasContext *ctx, arg_r *a, int32_t size) +{ + TCGv dest = dest_gpr(ctx, a->rd); + TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); + TCGv src2 = get_gpr(ctx, a->rs2, EXT_NONE); + + TCGv_i32 sz = tcg_const_i32(size); + gen_helper_xperm(dest, src1, src2, sz); + + gen_set_gpr(ctx, a->rd, dest); + tcg_temp_free_i32(sz); + return true; +} + +static bool gen_grevi(DisasContext *ctx, arg_r2 *a, int shamt) +{ + TCGv dest = dest_gpr(ctx, a->rd); + TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); + + if (shamt == (TARGET_LONG_BITS - 8)) { + /* rev8, byte swaps */ + tcg_gen_bswap_tl(dest, src1); + } else { + TCGv src2 = tcg_temp_new(); + tcg_gen_movi_tl(src2, shamt); + gen_helper_grev(dest, src1, src2); + tcg_temp_free(src2); + } + + gen_set_gpr(ctx, a->rd, dest); + return true; +} + +static void gen_pack(TCGv ret, TCGv src1, TCGv src2) +{ + tcg_gen_deposit_tl(ret, src1, src2, + TARGET_LONG_BITS / 2, + TARGET_LONG_BITS / 2); +} + +static void gen_packh(TCGv ret, TCGv src1, TCGv src2) +{ + TCGv t = tcg_temp_new(); + tcg_gen_ext8u_tl(t, src2); + tcg_gen_deposit_tl(ret, src1, t, 8, TARGET_LONG_BITS - 8); + tcg_temp_free(t); +} + +static void gen_packw(TCGv ret, TCGv src1, TCGv src2) +{ + TCGv t = tcg_temp_new(); + tcg_gen_ext16s_tl(t, src2); + tcg_gen_deposit_tl(ret, src1, t, 16, 48); + tcg_temp_free(t); +} + +static bool gen_shufi(DisasContext *ctx, arg_r2 *a, int shamt, + void(*func)(TCGv, TCGv, TCGv)) +{ + if (shamt >= TARGET_LONG_BITS / 2) { + return false; + } + + TCGv dest = dest_gpr(ctx, a->rd); + TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); + TCGv src2 = tcg_temp_new(); + + tcg_gen_movi_tl(src2, 
shamt); + (*func)(dest, src1, src2); + + gen_set_gpr(ctx, a->rd, dest); + tcg_temp_free(src2); + return true; +} All of the gen functions belong in insn_trans/trans_rvb.c.inc. OK. I'll move them to insn_trans/trans_rvb.c.inc. r~
Re: [RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.
On 11/1/21 11:11 PM, liweiwei wrote: Signed-off-by: liweiwei Signed-off-by: wangjunqiang You managed to get the whole patch description into the subject line. Please break it up. +target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2) +{ +return do_grev(rs1, rs2, TARGET_LONG_BITS); +} Are we expecting to see the full grev instruction at any point? If not, we can certainly implement Zbk with a simpler implementation. +target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2) +{ +target_ulong r = 0; +target_ulong sz = 1LL << sz_log2; +target_ulong mask = (1LL << sz) - 1; +for (int i = 0; i < TARGET_LONG_BITS; i += sz) { +target_ulong pos = ((rs2 >> i) & mask) << sz_log2; +if (pos < sizeof(target_ulong) * 8) { +r |= ((rs1 >> pos) & mask) << i; +} +} +return r; +} This could become a static inline do_xperm, and provide two specific xperm4 and xperm8 helpers; the compiler would fold all of the sz_log2 stuff into a more efficient implementation. +target_ulong HELPER(unshfl)(target_ulong rs1, +target_ulong rs2) +{ +target_ulong x = rs1; +int i, shift; +int bits = TARGET_LONG_BITS >> 1; +for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) { +if (rs2 & shift) { +x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); +} +} +return x; +} + +target_ulong HELPER(shfl)(target_ulong rs1, + target_ulong rs2) +{ +target_ulong x = rs1; +int i, shift; +shift = TARGET_LONG_BITS >> 2; +i = (shift == 8) ? 3 : 4; +for (; i >= 0; i--, shift >>= 1) { +if (rs2 & shift) { +x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); +} +} +return x; +} Similar comment as for grev. +# The encoding for zext.h differs between RV32 and RV64. +# zext_h_32 denotes the RV32 variant. +{ + zext_h_32 100 0 . 100 . 0110011 @r2 + pack 100 . . 100 . 0110011 @r +} Note to self: improve tcg_gen_deposit to notice zeros, so that the more general pack compiles to zero-extension. 
@@ -556,6 +563,81 @@ static bool gen_unary_per_ol(DisasContext *ctx, arg_r2 *a, DisasExtend ext, return gen_unary(ctx, a, ext, f_tl); } +static bool gen_xperm(DisasContext *ctx, arg_r *a, int32_t size) +{ +TCGv dest = dest_gpr(ctx, a->rd); +TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); +TCGv src2 = get_gpr(ctx, a->rs2, EXT_NONE); + +TCGv_i32 sz = tcg_const_i32(size); +gen_helper_xperm(dest, src1, src2, sz); + +gen_set_gpr(ctx, a->rd, dest); +tcg_temp_free_i32(sz); +return true; +} + +static bool gen_grevi(DisasContext *ctx, arg_r2 *a, int shamt) +{ +TCGv dest = dest_gpr(ctx, a->rd); +TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); + +if (shamt == (TARGET_LONG_BITS - 8)) { +/* rev8, byte swaps */ +tcg_gen_bswap_tl(dest, src1); +} else { +TCGv src2 = tcg_temp_new(); +tcg_gen_movi_tl(src2, shamt); +gen_helper_grev(dest, src1, src2); +tcg_temp_free(src2); +} + +gen_set_gpr(ctx, a->rd, dest); +return true; +} + +static void gen_pack(TCGv ret, TCGv src1, TCGv src2) +{ +tcg_gen_deposit_tl(ret, src1, src2, + TARGET_LONG_BITS / 2, + TARGET_LONG_BITS / 2); +} + +static void gen_packh(TCGv ret, TCGv src1, TCGv src2) +{ +TCGv t = tcg_temp_new(); +tcg_gen_ext8u_tl(t, src2); +tcg_gen_deposit_tl(ret, src1, t, 8, TARGET_LONG_BITS - 8); +tcg_temp_free(t); +} + +static void gen_packw(TCGv ret, TCGv src1, TCGv src2) +{ +TCGv t = tcg_temp_new(); +tcg_gen_ext16s_tl(t, src2); +tcg_gen_deposit_tl(ret, src1, t, 16, 48); +tcg_temp_free(t); +} + +static bool gen_shufi(DisasContext *ctx, arg_r2 *a, int shamt, + void(*func)(TCGv, TCGv, TCGv)) +{ +if (shamt >= TARGET_LONG_BITS / 2) { +return false; +} + +TCGv dest = dest_gpr(ctx, a->rd); +TCGv src1 = get_gpr(ctx, a->rs1, EXT_NONE); +TCGv src2 = tcg_temp_new(); + +tcg_gen_movi_tl(src2, shamt); +(*func)(dest, src1, src2); + +gen_set_gpr(ctx, a->rd, dest); +tcg_temp_free(src2); +return true; +} All of the gen functions belong in insn_trans/trans_rvb.c.inc. r~
[RFC 2/6] target/riscv: rvk: add implementation of instructions for Zbk* - reuse partial instructions of Zbb/Zbc extensions - add brev8 packh, unzip, zip, etc.
Signed-off-by: liweiwei Signed-off-by: wangjunqiang --- target/riscv/bitmanip_helper.c | 94 + target/riscv/helper.h | 4 ++ target/riscv/insn32.decode | 52 +- target/riscv/insn_trans/trans_rvb.c.inc | 91 target/riscv/translate.c| 82 + 5 files changed, 292 insertions(+), 31 deletions(-) diff --git a/target/riscv/bitmanip_helper.c b/target/riscv/bitmanip_helper.c index f1b5e5549f..1c6beb8216 100644 --- a/target/riscv/bitmanip_helper.c +++ b/target/riscv/bitmanip_helper.c @@ -49,3 +49,97 @@ target_ulong HELPER(clmulr)(target_ulong rs1, target_ulong rs2) return result; } + +static const uint64_t adjacent_masks[] = { +dup_const(MO_8, 0x55), +dup_const(MO_8, 0x33), +dup_const(MO_8, 0x0f), +dup_const(MO_16, 0xff), +dup_const(MO_32, 0xffff), +UINT32_MAX +}; + +static inline target_ulong do_swap(target_ulong x, uint64_t mask, int shift) +{ +return ((x & mask) << shift) | ((x & ~mask) >> shift); +} + +static target_ulong do_grev(target_ulong rs1, +target_ulong rs2, +int bits) +{ +target_ulong x = rs1; +int i, shift; + +for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) { +if (rs2 & shift) { +x = do_swap(x, adjacent_masks[i], shift); +} +} + +return x; +} + +target_ulong HELPER(grev)(target_ulong rs1, target_ulong rs2) +{ +return do_grev(rs1, rs2, TARGET_LONG_BITS); +} + +target_ulong HELPER(xperm)(target_ulong rs1, target_ulong rs2, uint32_t sz_log2) +{ +target_ulong r = 0; +target_ulong sz = 1LL << sz_log2; +target_ulong mask = (1LL << sz) - 1; +for (int i = 0; i < TARGET_LONG_BITS; i += sz) { +target_ulong pos = ((rs2 >> i) & mask) << sz_log2; +if (pos < sizeof(target_ulong) * 8) { +r |= ((rs1 >> pos) & mask) << i; +} +} +return r; +} + +static const uint64_t shuf_masks[] = { +dup_const(MO_8, 0x44), +dup_const(MO_8, 0x30), +dup_const(MO_16, 0x0f00), +dup_const(MO_32, 0xff0000), +dup_const(MO_64, 0xffff00000000) +}; + +static inline target_ulong do_shuf_stage(target_ulong src, uint64_t maskL, + uint64_t maskR, int shift) +{ +target_ulong x = src & ~(maskL | maskR); +x |= ((src << shift) & 
maskL) | ((src >> shift) & maskR); +return x; +} + +target_ulong HELPER(unshfl)(target_ulong rs1, +target_ulong rs2) +{ +target_ulong x = rs1; +int i, shift; +int bits = TARGET_LONG_BITS >> 1; +for (i = 0, shift = 1; shift < bits; i++, shift <<= 1) { +if (rs2 & shift) { +x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); +} +} +return x; +} + +target_ulong HELPER(shfl)(target_ulong rs1, + target_ulong rs2) +{ +target_ulong x = rs1; +int i, shift; +shift = TARGET_LONG_BITS >> 2; +i = (shift == 8) ? 3 : 4; +for (; i >= 0; i--, shift >>= 1) { +if (rs2 & shift) { +x = do_shuf_stage(x, shuf_masks[i], shuf_masks[i] >> shift, shift); +} +} +return x; +} diff --git a/target/riscv/helper.h b/target/riscv/helper.h index c7a5376227..216aa4193b 100644 --- a/target/riscv/helper.h +++ b/target/riscv/helper.h @@ -61,6 +61,10 @@ DEF_HELPER_FLAGS_1(fclass_d, TCG_CALL_NO_RWG_SE, tl, i64) /* Bitmanip */ DEF_HELPER_FLAGS_2(clmul, TCG_CALL_NO_RWG_SE, tl, tl, tl) DEF_HELPER_FLAGS_2(clmulr, TCG_CALL_NO_RWG_SE, tl, tl, tl) +DEF_HELPER_FLAGS_2(grev, TCG_CALL_NO_RWG_SE, tl, tl, tl) +DEF_HELPER_FLAGS_3(xperm, TCG_CALL_NO_RWG_SE, tl, tl, tl, i32) +DEF_HELPER_FLAGS_2(shfl, TCG_CALL_NO_RWG_SE, tl, tl, tl) +DEF_HELPER_FLAGS_2(unshfl, TCG_CALL_NO_RWG_SE, tl, tl, tl) /* Special functions */ DEF_HELPER_2(csrr, tl, env, int) diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode index 2f251dac1b..a5333c4533 100644 --- a/target/riscv/insn32.decode +++ b/target/riscv/insn32.decode @@ -672,8 +672,22 @@ sh2add_uw 001 .. 100 . 0111011 @r sh3add_uw 001 .. 110 . 0111011 @r slli_uw1 001 . 0011011 @sh -# *** RV32 Zbb Standard Extension *** +# *** RV32 Zbb/Zbkb Standard Extension *** andn 010 .. 111 . 0110011 @r +rol011 .. 001 . 0110011 @r +ror011 .. 101 . 0110011 @r +rori 01100 101 . 0010011 @sh +# The encoding for rev8 differs between RV32 and RV64. +# rev8_32 denotes the RV32 variant. +rev8_32011010 011000 . 101 . 
0010011 @r2 +# The encoding for zext.h differs between RV32 and RV64. +# zext_h_32 denotes the RV32 variant. +{ + zext_h_32 100 0 . 100 . 0110011 @r2 + pack 100 . . 100 . 0110011 @r +} +xnor 010 .. 100 . 0110011 @r +#