Re: [PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.
Hi Richard, On 5/12/24 17:08, Richard Henderson wrote: On 5/12/24 11:38, Chinmay Rath wrote: @@ -2934,6 +2870,184 @@ static bool do_vx_vaddsubcuw(DisasContext *ctx, arg_VX *a, int add) return true; } +static inline void do_vadd_vsub_sat +( + unsigned vece, TCGv_vec t, TCGv_vec sat, TCGv_vec a, TCGv_vec b, + void (*norm_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec), + void (*sat_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + norm_op(vece, x, a, b); + sat_op(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); +} As a separate change, before or after, the cmp_vec may be simplified to xor_vec. Which means that INDEX_op_cmp_vec need not be probed in the vecop_lists. See https://lore.kernel.org/qemu-devel/20240506010403.6204-31-richard.henderson@linaro.org/ which is performing the same operation on AArch64. Noted ! Will do. +static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a, + int sign, int vece, int add) +{ + static const TCGOpcode vecop_list_sub_u[] = { + INDEX_op_sub_vec, INDEX_op_ussub_vec, INDEX_op_cmp_vec, 0 + }; + static const TCGOpcode vecop_list_sub_s[] = { + INDEX_op_sub_vec, INDEX_op_sssub_vec, INDEX_op_cmp_vec, 0 + }; + static const TCGOpcode vecop_list_add_u[] = { + INDEX_op_add_vec, INDEX_op_usadd_vec, INDEX_op_cmp_vec, 0 + }; + static const TCGOpcode vecop_list_add_s[] = { + INDEX_op_add_vec, INDEX_op_ssadd_vec, INDEX_op_cmp_vec, 0 + }; + + static const GVecGen4 op[2][3][2] = { + { + { + { + .fniv = gen_vsub_sat_u, + .fno = gen_helper_VSUBUBS, + .opt_opc = vecop_list_sub_u, + .write_aofs = true, + .vece = MO_8 + }, . . . + { + .fniv = gen_vadd_sat_s, + .fno = gen_helper_VADDSWS, + .opt_opc = vecop_list_add_s, + .write_aofs = true, + .vece = MO_32 + }, + }, + }, + }; While this table is not wrong, I think it is clearer to have separate tables, one per operation, which are then passed in to a common expander. 
+ + REQUIRE_INSNS_FLAGS(ctx, ALTIVEC); + REQUIRE_VECTOR(ctx); + + tcg_gen_gvec_4(avr_full_offset(a->vrt), offsetof(CPUPPCState, vscr_sat), + avr_full_offset(a->vra), avr_full_offset(a->vrb), 16, 16, + &op[sign][vece][add]); + + return true; +} + +TRANS(VSUBUBS, do_vx_vadd_vsub_sat, 0, MO_8, 0) I think it is clearer to use TRANS_FLAGS than to sink the ISA check into the helper. In general I seem to find the helper later gets reused for something else with a different ISA check. Thus static const TCGOpcode vecop_list_vsub_sat_u[] = { INDEX_op_sub_vec, INDEX_op_ussub_vec, 0 }; static const GVecGen4 op_vsububs = { .fno = gen_helper_VSUBUBS, .fniv = gen_vsub_sat_u, .opt_opc = vecop_list_vsub_sat_u, .write_aofs = true, .vece = MO_8 }; TRANS_FLAGS(VSUBUBS, do_vx_vadd_vsub_sat, &op_vsububs) static const GVecGen4 op_vsubuhs = { .fno = gen_helper_VSUBUHS, .fniv = gen_vsub_sat_u, .opt_opc = vecop_list_vsub_sat_u, .write_aofs = true, .vece = MO_16 }; TRANS_FLAGS(VSUBUHS, do_vx_vadd_vsub_sat, &op_vsubuhs) etc. Will add those changes in v2. -GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE), You are correct in your cover letter that this is not right. We should have been testing ISA300 for vmul10uq here. Thank you very much for the clarification ! +GEN_VXFORM(vmul10euq, 0, 9), And thus need GEN_VXFORM_300 here. +GEN_VXFORM(vmul10euq, 0, 9), +GEN_VXFORM(bcdcpsgn, 0, 13), +GEN_VXFORM(bcdadd, 0, 24), +GEN_VXFORM(bcdsub, 0, 25), ... +GEN_VXFORM(xpnd04_2, 0, 30), None of these are in the base ISA, so all need a flag check. r~ Thanks & Regards, Chinmay
Re: [PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.
On 5/12/24 11:38, Chinmay Rath wrote: @@ -2934,6 +2870,184 @@ static bool do_vx_vaddsubcuw(DisasContext *ctx, arg_VX *a, int add) return true; } +static inline void do_vadd_vsub_sat +( +unsigned vece, TCGv_vec t, TCGv_vec sat, TCGv_vec a, TCGv_vec b, +void (*norm_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec), +void (*sat_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ +TCGv_vec x = tcg_temp_new_vec_matching(t); +norm_op(vece, x, a, b); +sat_op(vece, t, a, b); +tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); +tcg_gen_or_vec(vece, sat, sat, x); +} As a separate change, before or after, the cmp_vec may be simplified to xor_vec. Which means that INDEX_op_cmp_vec need not be probed in the vecop_lists. See https://lore.kernel.org/qemu-devel/20240506010403.6204-31-richard.henderson@linaro.org/ which is performing the same operation on AArch64. +static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a, +int sign, int vece, int add) +{ +static const TCGOpcode vecop_list_sub_u[] = { +INDEX_op_sub_vec, INDEX_op_ussub_vec, INDEX_op_cmp_vec, 0 +}; +static const TCGOpcode vecop_list_sub_s[] = { +INDEX_op_sub_vec, INDEX_op_sssub_vec, INDEX_op_cmp_vec, 0 +}; +static const TCGOpcode vecop_list_add_u[] = { +INDEX_op_add_vec, INDEX_op_usadd_vec, INDEX_op_cmp_vec, 0 +}; +static const TCGOpcode vecop_list_add_s[] = { +INDEX_op_add_vec, INDEX_op_ssadd_vec, INDEX_op_cmp_vec, 0 +}; + +static const GVecGen4 op[2][3][2] = { +{ +{ +{ +.fniv = gen_vsub_sat_u, +.fno = gen_helper_VSUBUBS, +.opt_opc = vecop_list_sub_u, +.write_aofs = true, +.vece = MO_8 +}, +{ +.fniv = gen_vadd_sat_u, +.fno = gen_helper_VADDUBS, +.opt_opc = vecop_list_add_u, +.write_aofs = true, +.vece = MO_8 +}, +}, +{ +{ +.fniv = gen_vsub_sat_u, +.fno = gen_helper_VSUBUHS, +.opt_opc = vecop_list_sub_u, +.write_aofs = true, +.vece = MO_16 +}, +{ +.fniv = gen_vadd_sat_u, +.fno = gen_helper_VADDUHS, +.opt_opc = vecop_list_add_u, +.write_aofs = true, +.vece = MO_16 +}, +}, +{ +{ +.fniv = gen_vsub_sat_u, +.fno = gen_helper_VSUBUWS, 
+.opt_opc = vecop_list_sub_u, +.write_aofs = true, +.vece = MO_32 +}, +{ +.fniv = gen_vadd_sat_u, +.fno = gen_helper_VADDUWS, +.opt_opc = vecop_list_add_u, +.write_aofs = true, +.vece = MO_32 +}, +}, +}, +{ +{ +{ +.fniv = gen_vsub_sat_s, +.fno = gen_helper_VSUBSBS, +.opt_opc = vecop_list_sub_s, +.write_aofs = true, +.vece = MO_8 +}, +{ +.fniv = gen_vadd_sat_s, +.fno = gen_helper_VADDSBS, +.opt_opc = vecop_list_add_s, +.write_aofs = true, +.vece = MO_8 +}, +}, +{ +{ +.fniv = gen_vsub_sat_s, +.fno = gen_helper_VSUBSHS, +.opt_opc = vecop_list_sub_s, +.write_aofs = true, +.vece = MO_16 +}, +{ +.fniv = gen_vadd_sat_s, +.fno = gen_helper_VADDSHS, +.opt_opc = vecop_list_add_s, +.write_aofs = true, +.vece = MO_16 +}, +}, +{ +{ +.fniv = gen_vsub_sat_s, +.fno = gen_helper_VSUBSWS, +.opt_opc = vecop_list_sub_s, +.write_aofs = true, +.vece = MO_32 +}, +{ +.fniv = gen_vadd_sat_s, +.fno = gen_helper_VADDSWS, +.opt_opc = vecop_list_add_s, +.write_aofs = true, +.vece = MO_32 +}, +}, +}, +}; While this table is not wrong, I think it is clearer to have separate tables, one per
[PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.
Moving the following instructions to decodetree specification : v{add,sub}{u,s}{b,h,w}s : VX-form The changes were verified by validating that the tcg ops generated by those instructions remain the same, which were captured with the '-d in_asm,op' flag. Signed-off-by: Chinmay Rath --- target/ppc/helper.h | 24 +-- target/ppc/insn32.decode| 16 ++ target/ppc/int_helper.c | 22 +-- target/ppc/translate/vmx-impl.c.inc | 242 target/ppc/translate/vmx-ops.c.inc | 19 +-- 5 files changed, 224 insertions(+), 99 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f397ef459a..2963e48fdc 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -200,18 +200,18 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) -DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsubsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsubshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsubsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vaddubs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vadduhs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vadduws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsububs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsubuhs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) -DEF_HELPER_FLAGS_5(vsubuws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDSBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDSHS, TCG_CALL_NO_RWG, void, avr, 
avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDSWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBSBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBSHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBSWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDUBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDUHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VADDUWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBUBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBUHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) +DEF_HELPER_FLAGS_5(VSUBUWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_3(VADDUQM, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_4(VADDECUQ, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VADDEUQM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 847a2f4356..d7d77eaa99 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -697,6 +697,14 @@ VADDCUW 000100 . . . 0011000@VX VADDCUQ 000100 . . . 0010100@VX VADDUQM 000100 . . . 001@VX +VADDSBS 000100 . . . 011@VX +VADDSHS 000100 . . . 0110100@VX +VADDSWS 000100 . . . 0111000@VX + +VADDUBS 000100 . . . 010@VX +VADDUHS 000100 . . . 0100100@VX +VADDUWS 000100 . . . 0101000@VX + VADDEUQM000100 . . . . 00 @VA VADDECUQ000100 . . . . 01 @VA @@ -704,6 +712,14 @@ VSUBCUW 000100 . . . 1011000@VX VSUBCUQ 000100 . . . 1010100@VX VSUBUQM 000100 . . . 101@VX +VSUBSBS 000100 . . . 111@VX +VSUBSHS 000100 . . . 1110100@VX +VSUBSWS 000100 . . . 000@VX + +VSUBUBS 000100 . . . 110@VX +VSUBUHS 000100 . . . 1100100@VX +VSUBUWS 000100 . . . 1101000@VX + VSUBECUQ000100 . . . . 11 @VA VSUBEUQM000100 . . . . 
10 @VA diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 0a5c3e78a4..aec2d3d4ec 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -541,7 +541,7 @@ VARITHFPFMA(nmsubfp, float_muladd_negate_result | float_muladd_negate_c); } #define VARITHSAT_DO(name, op, optype, cvt, element)