Re: [PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.

2024-05-16 Thread Chinmay Rath

Hi Richard,

On 5/12/24 17:08, Richard Henderson wrote:

On 5/12/24 11:38, Chinmay Rath wrote:
@@ -2934,6 +2870,184 @@ static bool do_vx_vaddsubcuw(DisasContext *ctx, arg_VX *a, int add)

  return true;
  }
  +static inline void do_vadd_vsub_sat
+(
+    unsigned vece, TCGv_vec t, TCGv_vec sat, TCGv_vec a, TCGv_vec b,
+    void (*norm_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec),
+    void (*sat_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec x = tcg_temp_new_vec_matching(t);
+    norm_op(vece, x, a, b);
+    sat_op(vece, t, a, b);
+    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+    tcg_gen_or_vec(vece, sat, sat, x);
+}


As a separate change, before or after, the cmp_vec may be simplified to
xor_vec.  Which means that INDEX_op_cmp_vec need not be probed in the
vecop_lists.  See

https://lore.kernel.org/qemu-devel/20240506010403.6204-31-richard.hender...@linaro.org/

which is performing the same operation on AArch64.


Noted! Will do.
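
For reference, a minimal sketch of that simplification, keeping the helper
shape from the patch: the xor of the wrapping and saturating results is
non-zero exactly in the lanes that saturated, so it can be OR-ed straight
into the SAT accumulator, and the vecop lists then no longer need
INDEX_op_cmp_vec.

static inline void do_vadd_vsub_sat(unsigned vece, TCGv_vec t, TCGv_vec sat,
                                    TCGv_vec a, TCGv_vec b,
                                    void (*norm_op)(unsigned, TCGv_vec,
                                                    TCGv_vec, TCGv_vec),
                                    void (*sat_op)(unsigned, TCGv_vec,
                                                   TCGv_vec, TCGv_vec))
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);

    norm_op(vece, x, a, b);            /* wrapping result */
    sat_op(vece, t, a, b);             /* saturating result */
    /* x ^ t is non-zero only in the lanes that saturated ... */
    tcg_gen_xor_vec(vece, x, x, t);
    /* ... so it can be accumulated directly into vscr_sat. */
    tcg_gen_or_vec(vece, sat, sat, x);
}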



+static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a,
+    int sign, int vece, int add)
+{
+    static const TCGOpcode vecop_list_sub_u[] = {
+    INDEX_op_sub_vec, INDEX_op_ussub_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_sub_s[] = {
+    INDEX_op_sub_vec, INDEX_op_sssub_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_add_u[] = {
+    INDEX_op_add_vec, INDEX_op_usadd_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_add_s[] = {
+    INDEX_op_add_vec, INDEX_op_ssadd_vec, INDEX_op_cmp_vec, 0
+    };
+
+    static const GVecGen4 op[2][3][2] = {
+    {
+    {
+    {
+    .fniv = gen_vsub_sat_u,
+    .fno = gen_helper_VSUBUBS,
+    .opt_opc = vecop_list_sub_u,
+    .write_aofs = true,
+    .vece = MO_8
+    },

.
.
.

+    {
+    .fniv = gen_vadd_sat_s,
+    .fno = gen_helper_VADDSWS,
+    .opt_opc = vecop_list_add_s,
+    .write_aofs = true,
+    .vece = MO_32
+    },
+    },
+    },
+    };


While this table is not wrong, I think it is clearer to have separate 
tables, one per operation, which are then passed in to a common expander.



+
+    REQUIRE_INSNS_FLAGS(ctx, ALTIVEC);
+    REQUIRE_VECTOR(ctx);
+
+    tcg_gen_gvec_4(avr_full_offset(a->vrt), offsetof(CPUPPCState, vscr_sat),
+                   avr_full_offset(a->vra), avr_full_offset(a->vrb), 16, 16,
+                   &op[sign][vece][add]);
+
+    return true;
+}
+
+TRANS(VSUBUBS, do_vx_vadd_vsub_sat, 0, MO_8, 0)


I think it is clearer to use TRANS_FLAGS than to sink the ISA check 
into the helper.  In general I seem to find the helper later gets 
reused for something else with a different ISA check.


Thus

static const TCGOpcode vecop_list_vsub_sat_u[] = {
    INDEX_op_sub_vec, INDEX_op_ussub_vec, 0
};
static const GVecGen4 op_vsububs = {
    .fno = gen_helper_VSUBUBS,
    .fniv = gen_vsub_sat_u,
    .opt_opc = vecop_list_vsub_sat_u,
    .write_aofs = true,
    .vece = MO_8
};
TRANS_FLAGS(ALTIVEC, VSUBUBS, do_vx_vadd_vsub_sat, &op_vsububs)

static const GVecGen4 op_vsubuhs = {
    .fno = gen_helper_VSUBUHS,
    .fniv = gen_vsub_sat_u,
    .opt_opc = vecop_list_vsub_sat_u,
    .write_aofs = true,
    .vece = MO_16
};
TRANS_FLAGS(ALTIVEC, VSUBUHS, do_vx_vadd_vsub_sat, &op_vsubuhs)

etc.
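
The common expander that those per-instruction tables feed into would then
reduce to something like the sketch below, reusing the gvec call from the
patch; the ISA check is gone because TRANS_FLAGS already performs it.

static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a,
                                const GVecGen4 *op)
{
    REQUIRE_VECTOR(ctx);

    tcg_gen_gvec_4(avr_full_offset(a->vrt), offsetof(CPUPPCState, vscr_sat),
                   avr_full_offset(a->vra), avr_full_offset(a->vrb),
                   16, 16, op);
    return true;
}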


Will add those changes in v2.

-GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),


You are correct in your cover letter that this is not right.
We should have been testing ISA300 for vmul10uq here.


Thank you very much for the clarification!

+GEN_VXFORM(vmul10euq, 0, 9),


And thus need GEN_VXFORM_300 here.


+GEN_VXFORM(vmul10euq, 0, 9),
+GEN_VXFORM(bcdcpsgn, 0, 13),
+GEN_VXFORM(bcdadd, 0, 24),
+GEN_VXFORM(bcdsub, 0, 25),

...

+GEN_VXFORM(xpnd04_2, 0, 30),


None of these are in the base ISA, so all need a flag check.
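
For instance, a sketch of the kind of flag-gated entries meant here, using
the GEN_VXFORM_300 variant named above for the ISA 3.00 multiply-by-10
insns; the bcd* and xpnd04_2 entries would take whichever flag variant
matches their own ISA level:

GEN_VXFORM_300(vmul10uq, 0, 8),
GEN_VXFORM_300(vmul10euq, 0, 9),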



r~


Thanks & Regards,
Chinmay



Re: [PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.

2024-05-12 Thread Richard Henderson

On 5/12/24 11:38, Chinmay Rath wrote:

@@ -2934,6 +2870,184 @@ static bool do_vx_vaddsubcuw(DisasContext *ctx, arg_VX *a, int add)
  return true;
  }
  
+static inline void do_vadd_vsub_sat
+(
+unsigned vece, TCGv_vec t, TCGv_vec sat, TCGv_vec a, TCGv_vec b,
+void (*norm_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec),
+void (*sat_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
+{
+TCGv_vec x = tcg_temp_new_vec_matching(t);
+norm_op(vece, x, a, b);
+sat_op(vece, t, a, b);
+tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+tcg_gen_or_vec(vece, sat, sat, x);
+}


As a separate change, before or after, the cmp_vec may be simplified to xor_vec.  Which 
means that INDEX_op_cmp_vec need not be probed in the vecop_lists.  See


https://lore.kernel.org/qemu-devel/20240506010403.6204-31-richard.hender...@linaro.org/

which is performing the same operation on AArch64.



+static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a,
+int sign, int vece, int add)
+{
+static const TCGOpcode vecop_list_sub_u[] = {
+INDEX_op_sub_vec, INDEX_op_ussub_vec, INDEX_op_cmp_vec, 0
+};
+static const TCGOpcode vecop_list_sub_s[] = {
+INDEX_op_sub_vec, INDEX_op_sssub_vec, INDEX_op_cmp_vec, 0
+};
+static const TCGOpcode vecop_list_add_u[] = {
+INDEX_op_add_vec, INDEX_op_usadd_vec, INDEX_op_cmp_vec, 0
+};
+static const TCGOpcode vecop_list_add_s[] = {
+INDEX_op_add_vec, INDEX_op_ssadd_vec, INDEX_op_cmp_vec, 0
+};
+
+static const GVecGen4 op[2][3][2] = {
+{
+{
+{
+.fniv = gen_vsub_sat_u,
+.fno = gen_helper_VSUBUBS,
+.opt_opc = vecop_list_sub_u,
+.write_aofs = true,
+.vece = MO_8
+},
+{
+.fniv = gen_vadd_sat_u,
+.fno = gen_helper_VADDUBS,
+.opt_opc = vecop_list_add_u,
+.write_aofs = true,
+.vece = MO_8
+},
+},
+{
+{
+.fniv = gen_vsub_sat_u,
+.fno = gen_helper_VSUBUHS,
+.opt_opc = vecop_list_sub_u,
+.write_aofs = true,
+.vece = MO_16
+},
+{
+.fniv = gen_vadd_sat_u,
+.fno = gen_helper_VADDUHS,
+.opt_opc = vecop_list_add_u,
+.write_aofs = true,
+.vece = MO_16
+},
+},
+{
+{
+.fniv = gen_vsub_sat_u,
+.fno = gen_helper_VSUBUWS,
+.opt_opc = vecop_list_sub_u,
+.write_aofs = true,
+.vece = MO_32
+},
+{
+.fniv = gen_vadd_sat_u,
+.fno = gen_helper_VADDUWS,
+.opt_opc = vecop_list_add_u,
+.write_aofs = true,
+.vece = MO_32
+},
+},
+},
+{
+{
+{
+.fniv = gen_vsub_sat_s,
+.fno = gen_helper_VSUBSBS,
+.opt_opc = vecop_list_sub_s,
+.write_aofs = true,
+.vece = MO_8
+},
+{
+.fniv = gen_vadd_sat_s,
+.fno = gen_helper_VADDSBS,
+.opt_opc = vecop_list_add_s,
+.write_aofs = true,
+.vece = MO_8
+},
+},
+{
+{
+.fniv = gen_vsub_sat_s,
+.fno = gen_helper_VSUBSHS,
+.opt_opc = vecop_list_sub_s,
+.write_aofs = true,
+.vece = MO_16
+},
+{
+.fniv = gen_vadd_sat_s,
+.fno = gen_helper_VADDSHS,
+.opt_opc = vecop_list_add_s,
+.write_aofs = true,
+.vece = MO_16
+},
+},
+{
+{
+.fniv = gen_vsub_sat_s,
+.fno = gen_helper_VSUBSWS,
+.opt_opc = vecop_list_sub_s,
+.write_aofs = true,
+.vece = MO_32
+},
+{
+.fniv = gen_vadd_sat_s,
+.fno = gen_helper_VADDSWS,
+.opt_opc = vecop_list_add_s,
+.write_aofs = true,
+.vece = MO_32
+},
+},
+},
+};


While this table is not wrong, I think it is clearer to have separate tables, one per
operation, which are then passed in to a common expander.

[PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.

2024-05-12 Thread Chinmay Rath
Moving the following instructions to decodetree specification :

v{add,sub}{u,s}{b,h,w}s : VX-form

The changes were verified by checking that the TCG ops generated by these
instructions remain the same; the ops were captured with the '-d in_asm,op' flag.

Signed-off-by: Chinmay Rath 
---
 target/ppc/helper.h |  24 +--
 target/ppc/insn32.decode|  16 ++
 target/ppc/int_helper.c |  22 +--
 target/ppc/translate/vmx-impl.c.inc | 242 
 target/ppc/translate/vmx-ops.c.inc  |  19 +--
 5 files changed, 224 insertions(+), 99 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f397ef459a..2963e48fdc 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -200,18 +200,18 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsubsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsubshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsubsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vaddubs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vadduhs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vadduws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsububs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsubuhs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_5(vsubuws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDSBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDSHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDSWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBSBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBSHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBSWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDUBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDUHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VADDUWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBUBS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBUHS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_5(VSUBUWS, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_3(VADDUQM, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VADDECUQ, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VADDEUQM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 847a2f4356..d7d77eaa99 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -697,6 +697,14 @@ VADDCUW 000100 . . . 0011000@VX
 VADDCUQ 000100 . . . 0010100@VX
 VADDUQM 000100 . . . 001@VX
 
+VADDSBS 000100 . . . 011@VX
+VADDSHS 000100 . . . 0110100@VX
+VADDSWS 000100 . . . 0111000@VX
+
+VADDUBS 000100 . . . 010@VX
+VADDUHS 000100 . . . 0100100@VX
+VADDUWS 000100 . . . 0101000@VX
+
 VADDEUQM000100 . . . . 00   @VA
 VADDECUQ000100 . . . . 01   @VA
 
@@ -704,6 +712,14 @@ VSUBCUW 000100 . . . 1011000@VX
 VSUBCUQ 000100 . . . 1010100@VX
 VSUBUQM 000100 . . . 101@VX
 
+VSUBSBS 000100 . . . 111@VX
+VSUBSHS 000100 . . . 1110100@VX
+VSUBSWS 000100 . . . 000@VX
+
+VSUBUBS 000100 . . . 110@VX
+VSUBUHS 000100 . . . 1100100@VX
+VSUBUWS 000100 . . . 1101000@VX
+
 VSUBECUQ000100 . . . . 11   @VA
 VSUBEUQM000100 . . . . 10   @VA
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 0a5c3e78a4..aec2d3d4ec 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -541,7 +541,7 @@ VARITHFPFMA(nmsubfp, float_muladd_negate_result | float_muladd_negate_c);
 }
 
 #define VARITHSAT_DO(name, op, optype, cvt, element)