On 4/15/24 23:39, Chinmay Rath wrote:
Moving the following instructions to the decodetree specification:
        mulli                           : D-form
        mul{lw, lwo, hw, hwu}[.]        : XO-form

The changes were verified by validating that the TCG ops generated by those
instructions remain the same; these were captured with the '-d in_asm,op' flag.

Signed-off-by: Chinmay Rath <ra...@linux.ibm.com>
---
  target/ppc/insn32.decode                   |  9 +++
  target/ppc/translate.c                     | 89 ----------------------
  target/ppc/translate/fixedpoint-impl.c.inc | 71 +++++++++++++++++
  3 files changed, 80 insertions(+), 89 deletions(-)

This is an accurate reorg of the current code, so
Reviewed-by: Richard Henderson <richard.hender...@linaro.org>

However, as a follow-up, the code generation could be cleaned up:


+static bool trans_MULLW(DisasContext *ctx, arg_MULLW *a)
+{
+#if defined(TARGET_PPC64)
+    TCGv_i64 t0, t1;
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    tcg_gen_ext32s_tl(t0, cpu_gpr[a->ra]);
+    tcg_gen_ext32s_tl(t1, cpu_gpr[a->rb]);
+    tcg_gen_mul_i64(cpu_gpr[a->rt], t0, t1);
+#else
+    tcg_gen_mul_i32(cpu_gpr[a->rt], cpu_gpr[a->ra], cpu_gpr[a->rb]);
+#endif
+    if (unlikely(a->rc)) {
+        gen_set_Rc0(ctx, cpu_gpr[a->rt]);
+    }
+    return true;
+}

Without ifdefs:

    TCGv t0 = tcg_temp_new();
    TCGv t1 = tcg_temp_new();

    tcg_gen_ext32s_tl(t0, ra);
    tcg_gen_ext32s_tl(t1, rb);
    tcg_gen_mul_tl(rt, t0, t1);

For ppc32, ext32s_tl will turn into a mov, which will be optimized away, so this gives ideal code generation for both modes.
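
For completeness, the whole handler might then read as follows (an untested sketch, using the cpu_gpr indexing and helpers from the patch):

static bool trans_MULLW(DisasContext *ctx, arg_MULLW *a)
{
    TCGv t0 = tcg_temp_new();
    TCGv t1 = tcg_temp_new();

    /* Sign-extend the 32-bit operands; on ppc32 this folds to a mov. */
    tcg_gen_ext32s_tl(t0, cpu_gpr[a->ra]);
    tcg_gen_ext32s_tl(t1, cpu_gpr[a->rb]);
    tcg_gen_mul_tl(cpu_gpr[a->rt], t0, t1);

    if (unlikely(a->rc)) {
        gen_set_Rc0(ctx, cpu_gpr[a->rt]);
    }
    return true;
}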


+static bool trans_MULLWO(DisasContext *ctx, arg_MULLWO *a)
+{
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
+
+    tcg_gen_trunc_tl_i32(t0, cpu_gpr[a->ra]);
+    tcg_gen_trunc_tl_i32(t1, cpu_gpr[a->rb]);
+    tcg_gen_muls2_i32(t0, t1, t0, t1);
+#if defined(TARGET_PPC64)
+    tcg_gen_concat_i32_i64(cpu_gpr[a->rt], t0, t1);
+#else
+    tcg_gen_mov_i32(cpu_gpr[a->rt], t0);
+#endif
+
+    tcg_gen_sari_i32(t0, t0, 31);
+    tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t1);
+    tcg_gen_extu_i32_tl(cpu_ov, t0);

Hosts usually need to create the full 64-bit product and then break it apart to implement tcg_gen_muls2_i32, so a split followed immediately by a concatenate isn't great.


    TCGv t0 = tcg_temp_new();
    TCGv t1 = tcg_temp_new();

#ifdef TARGET_PPC64
    tcg_gen_ext32s_i64(t0, ra);
    tcg_gen_ext32s_i64(t1, rb);
    tcg_gen_mul_i64(rt, t0, t1);
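    /* ov iff bits 63..31 of the product are not all equal */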
    tcg_gen_sextract_i64(t0, rt, 31, 1);
    tcg_gen_sari_i64(t1, rt, 32);
#else
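    /* ov iff the high word is not the sign extension of the low word */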
    tcg_gen_muls2_i32(rt, t1, ra, rb);
    tcg_gen_sari_i32(t0, rt, 31);
#endif
    tcg_gen_setcond_tl(TCG_COND_NE, cpu_ov, t0, t1);


+    if (is_isa300(ctx)) {
+        tcg_gen_mov_tl(cpu_ov32, cpu_ov);
+    }
+    tcg_gen_or_tl(cpu_so, cpu_so, cpu_ov);
+
+    if (unlikely(a->rc)) {
+        gen_set_Rc0(ctx, cpu_gpr[a->rt]);
+    }
+    return true;
+}
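
Putting that core together with the unchanged flag updates, the whole handler might read (again an untested sketch, same cpu_gpr indexing as above):

static bool trans_MULLWO(DisasContext *ctx, arg_MULLWO *a)
{
    TCGv t0 = tcg_temp_new();
    TCGv t1 = tcg_temp_new();

#ifdef TARGET_PPC64
    tcg_gen_ext32s_i64(t0, cpu_gpr[a->ra]);
    tcg_gen_ext32s_i64(t1, cpu_gpr[a->rb]);
    tcg_gen_mul_i64(cpu_gpr[a->rt], t0, t1);
    /* ov iff bits 63..31 of the product are not all equal */
    tcg_gen_sextract_i64(t0, cpu_gpr[a->rt], 31, 1);
    tcg_gen_sari_i64(t1, cpu_gpr[a->rt], 32);
#else
    /* ov iff the high word is not the sign extension of the low word */
    tcg_gen_muls2_i32(cpu_gpr[a->rt], t1, cpu_gpr[a->ra], cpu_gpr[a->rb]);
    tcg_gen_sari_i32(t0, cpu_gpr[a->rt], 31);
#endif
    tcg_gen_setcond_tl(TCG_COND_NE, cpu_ov, t0, t1);

    if (is_isa300(ctx)) {
        tcg_gen_mov_tl(cpu_ov32, cpu_ov);
    }
    tcg_gen_or_tl(cpu_so, cpu_so, cpu_ov);

    if (unlikely(a->rc)) {
        gen_set_Rc0(ctx, cpu_gpr[a->rt]);
    }
    return true;
}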


r~
