[PATCH v2 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Used gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
reptloopimm prev versioncurrent version
25  40000   0,0475500,040820 (-14.2%)
25  40001   0,0695200,053520 (-23.0%)
25  40003   0,0786600,058470 (-25.7%)
25  400051  0,0992800,190100 (+91.5%)
25  4000127 0,1296900,201750 (+55.6%)
800012  0   0,5546250,391385 (-29.4%)
800012  1   2,6756351,423656 (-46.8%)
800012  3   3,1868231,756885 (-44.9%)
800012  51  4,2844171,363698 (-68.2%)
800012  127 5,6380001,305333 (-76.8%)

xvtstdcdp:
reptloopimm prev versioncurrent version
25  40000   0,0474500,040590 (-14.5%)
25  40001   0,0741300,053570 (-27.7%)
25  40003   0,0841800,063020 (-25.1%)
25  400051  0,1033400,127980 (+23.8%)
25  4000127 0,1346700,128660 (-4.5%)
800012  0   0,5224270,391510 (-25.1%)
800012  1   2,8847081,426802 (-50.5%)
800012  3   3,4276251,972115 (-42.5%)
800012  51  4,4502601,251865 (-71.9%)
800012  127 5,8544791,250719 (-78.6%)

Overall, these instructions are the hardest ones to measure performance
as the gvec implementation is affected by the immediate. Above there are
5 different scenarios when it comes to immediate and 2 when it comes to
rept/loop combination. The immediates scenarios are: all bits are 0
therefore the target register should just be changed to 0, with 1 bit
set, with 2 bits set in a combination the new implementation can deal
with using gvec, 4 bits set and the new implementation can't deal with
it using gvec and all bits set. The rept/loop scenarios are high loop
and low rept (so it should spend more time executing it than translating
it) and high rept low loop (so it should spend more time translating it
than executing this code).
There was a gain when it came to translating the instructions
and in the execution time in the immediates the new implementation is
configured to accept, but a loss in performance in execution time for
more exoteric immediates.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/fpu_helper.c |   7 +-
 target/ppc/helper.h |   4 +-
 target/ppc/translate/vsx-impl.c.inc | 188 ++--
 3 files changed, 184 insertions(+), 15 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index a66e16c212..6c94576575 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -22,6 +22,7 @@
 #include "exec/exec-all.h"
 #include "internal.h"
 #include "fpu/softfloat.h"
+#include "tcg/tcg-gvec-desc.h"
 
 static inline float128 float128_snan_to_qnan(float128 x)
 {
@@ -3263,17 +3264,19 @@ VSX_TSTDC(float64)
 VSX_TSTDC(float128)
 #undef VSX_TSTDC
 
-void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx)
 {
 int i;
+dcmx = simd_data(dcmx);
 for (i = 0; i < 2; i++) {
 t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx);
 }
 }
 
-void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx)
 {
 int i;
+dcmx = simd_data(dcmx);
 for (i = 0; i < 4; i++) {
 t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx);
 }
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 8344fe39c6..2851418acc 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr)
-DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
-DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
+DEF_HELPER_FLAGS_3(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i32)
+DEF_HELPER_FLAGS_3(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i32)
 DEF_HELPER_3(xvrspi, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspic, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspim, void, env, vsr, vsr)
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 4fdbc45ff4..26fc8c0b01 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -632,6 +632,8 @@ static void gen_mtvsrws(DisasContext *ctx)
 #define SGN_MASK_SP 0x80008000ull
 #define EXP_MASK_DP  0x7FF0ull
 #define EXP_MASK_SP 0x7F807F80ull
+#define FRC_MASK_DP (~(SGN_MASK_DP | EXP_MASK_DP))
+#define FRC_MASK_SP (~(SGN_MASK_SP | EXP_MASK_SP))
 
 

Re: [PATCH v2 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-10-10 Thread Lucas Mateus Martins Araujo e Castro



On 10/10/2022 16:42, Richard Henderson wrote:
 
On 10/10/22 12:13, Lucas Mateus Castro(alqotel) wrote:

+/* test if +Inf or -Inf */
+static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
+{
+    uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : 
EXP_MASK_DP;
+    uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : 
SGN_MASK_DP;
+    tcg_gen_andc_vec(vece, b, b, tcg_constant_vec_matching(t, vece, 
exp_msk));

+    tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
+    tcg_constant_vec_matching(t, vece, sgn_msk));
+}


Should be clearing sign and comparing exp, not the other way.

Yeah this was a mistake, I'll fix it in the next version.
Kind of weird that my tests didn't caught this, probably should test 
that the '.out' risu file I'm using actually test every immediate value.



+    GVecGen2 op = {
+    .fno = (vece == MO_32) ? gen_helper_XVTSTDCSP : 
gen_helper_XVTSTDCDP,

+    .vece = vece,
+    .opt_opc = vecop_list
  };

  REQUIRE_VSX(ctx);

-    tcg_gen_gvec_2i(vsr_full_offset(a->xt), vsr_full_offset(a->xb),
-    16, 16, (int32_t)(a->uim), [vece - MO_32]);
+    switch (a->uim) {
+    case 0:
+    set_cpu_vsr(a->xt, tcg_constant_i64(0), true);
+    set_cpu_vsr(a->xt, tcg_constant_i64(0), false);
+    break;
+    case ((1 << 0) | (1 << 1)):
+    /* test if +Denormal or -Denormal */
+    op.fniv = gen_is_any_denormal,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );


This default setting of .fno doesn't work, because the helper requires 
simd_data set,

which this invocation via tcg_gen_gvec_2 will not provide.

You could fix this by using GVecGen2i and tcg_gen_gvec_2i, and ignoring 
the immediate

And I can remove the new #include from int_helper which was bothering me.

argument added to the functions above.  Which also means...


+    case (1 << 0):
+    /* test if -Denormal */
+    op.fniv = gen_is_neg_denormal,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 1):
+    /* test if +Denormal */
+    op.fniv = gen_is_pos_denormal,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case ((1 << 2) | (1 << 3)):
+    /* test if +0 or -0 */
+    op.fniv = gen_is_any_zero,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 2):
+    /* test if -0 */
+    op.fniv = gen_is_neg_zero,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 3):
+    /* test if +0 */
+    op.fniv = gen_is_pos_zero,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case ((1 << 4) | (1 << 5)):
+    /* test if +Inf or -Inf */
+    op.fniv = gen_is_any_inf,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 4):
+    /* test if -Inf */
+    op.fniv = gen_is_neg_inf,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 5):
+    /* test if +Inf */
+    op.fniv = gen_is_pos_inf,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    case (1 << 6):
+    /* test if NaN */
+    op.fniv = gen_is_nan,
+    tcg_gen_gvec_2(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16, 16,

+   );
+    break;
+    default:
+    tcg_gen_gvec_2_ool(vsr_full_offset(a->xt), 
vsr_full_offset(a->xb), 16,

+   16, (int32_t)(a->uim), op.fno);
+    break;


You can have only the store to op.fniv in the switch, remove the 
default, and have a

common call to tcg_gen_gvec_2i after the switch.


r~

I'll send a v3 with these changes.
--
Lucas Mateus M. Araujo e Castro
Instituto de Pesquisas ELDORADO

Departamento Computação Embarcada
Analista de Software Junior
Aviso Legal - Disclaimer 


Re: [PATCH v2 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-10-10 Thread Richard Henderson

On 10/10/22 12:13, Lucas Mateus Castro(alqotel) wrote:

+/* test if +Inf or -Inf */
+static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
+{
+uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+tcg_gen_andc_vec(vece, b, b, tcg_constant_vec_matching(t, vece, exp_msk));
+tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
+tcg_constant_vec_matching(t, vece, sgn_msk));
+}


Should be clearing sign and comparing exp, not the other way.


+GVecGen2 op = {
+.fno = (vece == MO_32) ? gen_helper_XVTSTDCSP : gen_helper_XVTSTDCDP,
+.vece = vece,
+.opt_opc = vecop_list
  };
  
  REQUIRE_VSX(ctx);
  
-tcg_gen_gvec_2i(vsr_full_offset(a->xt), vsr_full_offset(a->xb),

-16, 16, (int32_t)(a->uim), [vece - MO_32]);
+switch (a->uim) {
+case 0:
+set_cpu_vsr(a->xt, tcg_constant_i64(0), true);
+set_cpu_vsr(a->xt, tcg_constant_i64(0), false);
+break;
+case ((1 << 0) | (1 << 1)):
+/* test if +Denormal or -Denormal */
+op.fniv = gen_is_any_denormal,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );


This default setting of .fno doesn't work, because the helper requires simd_data set, 
which this invocation via tcg_gen_gvec_2 will not provide.


You could fix this by using GVecGen2i and tcg_gen_gvec_2i, and ignoring the immediate 
argument added to the functions above.  Which also means...



+case (1 << 0):
+/* test if -Denormal */
+op.fniv = gen_is_neg_denormal,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 1):
+/* test if +Denormal */
+op.fniv = gen_is_pos_denormal,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case ((1 << 2) | (1 << 3)):
+/* test if +0 or -0 */
+op.fniv = gen_is_any_zero,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 2):
+/* test if -0 */
+op.fniv = gen_is_neg_zero,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 3):
+/* test if +0 */
+op.fniv = gen_is_pos_zero,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case ((1 << 4) | (1 << 5)):
+/* test if +Inf or -Inf */
+op.fniv = gen_is_any_inf,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 4):
+/* test if -Inf */
+op.fniv = gen_is_neg_inf,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 5):
+/* test if +Inf */
+op.fniv = gen_is_pos_inf,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+case (1 << 6):
+/* test if NaN */
+op.fniv = gen_is_nan,
+tcg_gen_gvec_2(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16, 16,
+   );
+break;
+default:
+tcg_gen_gvec_2_ool(vsr_full_offset(a->xt), vsr_full_offset(a->xb), 16,
+   16, (int32_t)(a->uim), op.fno);
+break;


You can have only the store to op.fniv in the switch, remove the default, and have a 
common call to tcg_gen_gvec_2i after the switch.



r~