On 3/15/19 5:02 AM, Mateja Marjanovic wrote: > From: Mateja Marjanovic <mateja.marjano...@rt-rk.com> > > Optimize set of MSA instructions ILVOD, using directly > tcg registers and performing logic on them instead of > using helpers. > Performance measurement is done by executing the > instructions a large number of times on a computer > with Intel Core i7-3770 CPU @ 3.40GHz×8. > > instruction || before || after || > ============================================== > ilvod.b: || 66.97 ms || 26.34 ms || > ilvod.h: || 44.75 ms || 25.17 ms || > ilvod.w: || 41.27 ms || 24.37 ms || > ilvod.d: || 41.75 ms || 20.50 ms || > > Signed-off-by: Mateja Marjanovic <mateja.marjano...@rt-rk.com> > --- > target/mips/helper.h | 1 - > target/mips/msa_helper.c | 51 -------------------- > target/mips/translate.c | 119 > ++++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 118 insertions(+), 53 deletions(-) > > diff --git a/target/mips/helper.h b/target/mips/helper.h > index a6d687e..d162836 100644 > --- a/target/mips/helper.h > +++ b/target/mips/helper.h > @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32) > -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32) > DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32) > diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c > index 9d9dafe..cbcfd57 100644 > --- a/target/mips/msa_helper.c > +++ b/target/mips/msa_helper.c > @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t > df, uint32_t wd, > } > } > > -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd, > - uint32_t ws, uint32_t wt) > -{ > - wr_t *pwd = &(env->active_fpu.fpr[wd].wr); > - wr_t *pws = 
&(env->active_fpu.fpr[ws].wr); > - wr_t *pwt = &(env->active_fpu.fpr[wt].wr); > - > - switch (df) { > - case DF_BYTE: > - pwd->b[0] = pwt->b[1]; > - pwd->b[1] = pws->b[1]; > - pwd->b[2] = pwt->b[3]; > - pwd->b[3] = pws->b[3]; > - pwd->b[4] = pwt->b[5]; > - pwd->b[5] = pws->b[5]; > - pwd->b[6] = pwt->b[7]; > - pwd->b[7] = pws->b[7]; > - pwd->b[8] = pwt->b[9]; > - pwd->b[9] = pws->b[9]; > - pwd->b[10] = pwt->b[11]; > - pwd->b[11] = pws->b[11]; > - pwd->b[12] = pwt->b[13]; > - pwd->b[13] = pws->b[13]; > - pwd->b[14] = pwt->b[15]; > - pwd->b[15] = pws->b[15]; > - break; > - case DF_HALF: > - pwd->h[0] = pwt->h[1]; > - pwd->h[1] = pws->h[1]; > - pwd->h[2] = pwt->h[3]; > - pwd->h[3] = pws->h[3]; > - pwd->h[4] = pwt->h[5]; > - pwd->h[5] = pws->h[5]; > - pwd->h[6] = pwt->h[7]; > - pwd->h[7] = pws->h[7]; > - break; > - case DF_WORD: > - pwd->w[0] = pwt->w[1]; > - pwd->w[1] = pws->w[1]; > - pwd->w[2] = pwt->w[3]; > - pwd->w[3] = pws->w[3]; > - break; > - case DF_DOUBLE: > - pwd->d[0] = pwt->d[1]; > - pwd->d[1] = pws->d[1]; > - break; > - default: > - assert(0); > - } > -} > - > void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd, > uint32_t ws, uint32_t wt) > { > diff --git a/target/mips/translate.c b/target/mips/translate.c > index b4a1103..101d2de 100644 > --- a/target/mips/translate.c > +++ b/target/mips/translate.c > @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, > DisasContext *ctx) > tcg_temp_free_i32(tws); > } > > +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd, > + uint32_t ws, uint32_t wt) {
The opening { goes on the next line. > + TCGv_i64 t0 = tcg_temp_new_i64(); > + TCGv_i64 t1 = tcg_temp_new_i64(); > + > + uint64_t mask = (1ULL << 8) - 1; > + mask |= mask << 16; > + mask |= mask << 32; > + mask <<= 8; This is a constant; it is clearer to just write 0xff00ff00ff00ff00ull. > +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd, > + uint32_t ws, uint32_t wt) { Likewise, the { goes on the next line. > + uint64_t mask = (1ULL << 16) - 1; > + mask |= mask << 32; > + mask <<= 16; Likewise, just write 0xffff0000ffff0000ull. > +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd, > + uint32_t ws, uint32_t wt) { Likewise, the { goes on the next line. > + tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask); > + tcg_gen_shri_i64(t0, t0, 32); > + tcg_gen_or_i64(t1, t1, t0); > + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask); > + tcg_gen_or_i64(t1, t1, t0); This can fold down to a deposit: tcg_gen_shri_i64(t0, msa_wr_d[wt * 2], 32); tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[ws * 2], t0, 0, 32); > +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd, > + uint32_t ws, uint32_t wt) { Likewise, the { goes on the next line. r~