Re: [PATCH 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-09-24 Thread Richard Henderson

On 9/23/22 21:47, Lucas Mateus Castro(alqotel) wrote:

+static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm)
+{
+    TCGv_vec match = tcg_const_ones_vec_matching(t);
+    TCGv_vec temp;
+    TCGv_vec mask;
+    uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+    uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+    uint64_t frc_msk = ~(exp_msk | sgn_msk);
+    mask = tcg_constant_vec_matching(t, vece, 0);
+    tcg_gen_mov_vec(t, mask);
+    if (imm & (0x3 << 0)) {
+        /* test if Denormal */
+        temp = tcg_temp_new_vec_matching(t);
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, t, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, frc_msk);
+        tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask);
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask);
+
+        tcg_gen_mov_vec(t, mask);
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        if (imm & (0x1)) {
+            /* test if negative */
+            tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t);
+        }
+        if (imm & (0x2)) {
+            /* test if positive */
+            tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t);
+        }
+        tcg_temp_free_vec(temp);
+    }
+    if (imm & (1 << 2)) {
+        /* test if -0 */
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 3)) {
+        /* test if +0 */
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 4)) {
+        /* test if -Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 5)) {
+        /* test if +Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 6)) {
+        /* test if NaN */
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, b, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t);
+    }
+    tcg_temp_free_vec(match);
+}


While each case is fairly clever, I don't think that stringing them together like this is 
a good idea.  I think you should only handle the easy cases inline, and defer random (and 
probably rarely used) bit combinations to the helper function.


For instance,

static void gen_is_pos_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
                    tcg_constant_vec_matching(t, vece, exp_mask));
}

static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
{
    tcg_gen_and_vec(vece, t, b,
                    tcg_constant_vec_matching(t, vece, ~sgn_mask));
    tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
                    tcg_constant_vec_matching(t, vece, exp_mask));
}

static bool do_xvtstdc(...)
{
    switch (a->imm) {
    case (1 << 4): /* -Inf */
        tcg_gen_gvec_2(..., gen_is_neg_inf);
        break;
    case (1 << 5): /* +Inf */
        tcg_gen_gvec_2(..., gen_is_pos_inf);
        break;
    case (1 << 4) | (1 << 5): /* -Inf | +Inf */
        tcg_gen_gvec_2(..., gen_is_any_inf);
        break;
    ...
    default:
        tcg_gen_gvec_2_ool(..., 16, 16, a->imm, gen_helper_XVTSTDCXX);
    }
}

Or something of that nature.

I'll also note that you don't need CMPSEL -- all cases are mutually exclusive, so OR works 
just as well.
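For instance, the +Inf test could simply OR its compare result into the
accumulated result (rough sketch only; "temp" here is a scratch vector from
tcg_temp_new_vec_matching, and exp_msk is the constant already computed in
the patch):

    if (imm & (1 << 5)) {
        /* test if +Inf: OR the all-ones compare result into t */
        tcg_gen_cmp_vec(TCG_COND_EQ, vece, temp, b,
                        tcg_constant_vec_matching(t, vece, exp_msk));
        tcg_gen_or_vec(vece, t, t, temp);
    }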


r~



[PATCH 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Used gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
rept    loop     patch10        patch12
8       12500    2,70288900     1,24050300  (-54.1%)
25       4000    2,65665700     1,14078900  (-57.1%)
100      1000    2,82795400     1,53337200  (-45.8%)
500       200    3,62225400     3,91718000   (+8.1%)
2500       40    6,45658000    12,60683700  (+95.3%)
8000       12   17,48091900    44,15384000 (+152.6%)

xvtstdcdp:
rept    loop     patch10        patch12
8       12500    1,56435900     1,24554800  (-20.4%)
25       4000    1,53789500     1,14177800  (-25.8%)
100      1000    1,67964600     1,54280000   (-8.1%)
500       200    2,46777100     3,96816000  (+60.8%)
2500       40    5,21938900    12,79937800 (+145.2%)
8000       12   15,97600500    45,44233000 (+184.4%)

Overall these instructions are the hardest ones to measure, since the
helper implementation's run time depends on the immediate. For example,
in a worst-case scenario (high REPT, LOOP = 1, immediate 127) the gvec
implementation took 13x longer, while in a best-case scenario (low REPT,
high LOOP, only 1 bit set in the immediate) execution with gvec took
21.8% of the time (-78.2%).
The results above are the sum over every possible immediate.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/translate/vsx-impl.c.inc | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index c3c179723b..dc95e8fdf4 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -1121,16 +1121,85 @@ GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, PPC2_ISA300)
 GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300)
 GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX)
 
+static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm)
+{
+    TCGv_vec match = tcg_const_ones_vec_matching(t);
+    TCGv_vec temp;
+    TCGv_vec mask;
+    uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+    uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+    uint64_t frc_msk = ~(exp_msk | sgn_msk);
+    mask = tcg_constant_vec_matching(t, vece, 0);
+    tcg_gen_mov_vec(t, mask);
+    if (imm & (0x3 << 0)) {
+        /* test if Denormal */
+        temp = tcg_temp_new_vec_matching(t);
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, t, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, frc_msk);
+        tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask);
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask);
+
+        tcg_gen_mov_vec(t, mask);
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        if (imm & (0x1)) {
+            /* test if negative */
+            tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t);
+        }
+        if (imm & (0x2)) {
+            /* test if positive */
+            tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t);
+        }
+        tcg_temp_free_vec(temp);
+    }
+    if (imm & (1 << 2)) {
+        /* test if -0 */
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 3)) {
+        /* test if +0 */
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 4)) {
+        /* test if -Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 5)) {
+        /* test if +Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 6)) {
+        /* test if NaN */
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, b, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t);
+    }
+    tcg_temp_free_vec(match);
+}
+
 static bool do_xvtstdc(DisasContext *ctx, arg_XX2_uim *a, unsigned vece)
 {
+    static const TCGOpcode vecop_list[] = {
+        INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+    };
     static const GVecGen2i op[] = {
         {
             .fnoi = gen_helper_XVTSTDCSP,
-            .vece = MO_32
+            .fniv = do_xvtstdc_vec,
+            .vece = MO_32,
+            .opt_opc = vecop_list
         },
         {
             .fnoi = gen_helper_XVTSTDCDP,
-            .vece = MO_64
+            .fniv = do_xvtstdc_vec,
+            .vece = MO_64,
+            .opt_opc = vecop_list
         },
     };
 
--