From: Juha Riihimäki <juha.riihim...@nokia.com> Move the implementation of the Neon VUZP unzip instruction from inline code to a helper function. (At 50+ TCG ops it was well over the recommended limit for coding inline.) The helper implementation also gives the correct answers where the inline implementation did not.
Signed-off-by: Juha Riihimäki <juha.riihim...@nokia.com> Reviewed-by: Peter Maydell <peter.mayd...@linaro.org> --- target-arm/helpers.h | 1 + target-arm/neon_helper.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++ target-arm/translate.c | 81 ++------------------------------------------- 3 files changed, 86 insertions(+), 77 deletions(-) diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 893503f..9dc55c1 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -461,5 +461,6 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32) DEF_HELPER_2(set_teecr, void, env, i32) DEF_HELPER_2(neon_unzip, void, env, i32) +DEF_HELPER_2(neon_zip, void, env, i32) #include "def-helper.h" diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index f8d4b90..b348896 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1748,3 +1748,84 @@ void HELPER(neon_unzip)(CPUState *env, uint32_t insn) env->vfp.regs[rd] = make_float64(d); } } + +void HELPER(neon_zip)(CPUState *env, uint32_t insn) +{ + int rd = ((insn >> 18) & 0x10) | ((insn >> 12) & 0x0f); + int rm = ((insn >> 1) & 0x10) | (insn & 0x0f); + int size = (insn >> 18) & 3; + if (insn & 0x40) { /* Q */ + uint64_t zm0 = float64_val(env->vfp.regs[rm]); + uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); + uint64_t zd0 = float64_val(env->vfp.regs[rd]); + uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); + uint64_t m0 = 0, m1 = 0, d0 = 0, d1 = 0; + switch (size) { + case 0: + d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) + | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) + | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) + | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); + d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) + | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) + | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); + m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) + | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) + | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); + m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) + | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) + | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) + | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + break; + case 1: + d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) + | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); + d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); + m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); + m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) + | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + break; + case 2: + d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); + d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); + m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); + m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); + break; + } + env->vfp.regs[rm] = make_float64(m0); + env->vfp.regs[rm + 1] = make_float64(m1); + env->vfp.regs[rd] = make_float64(d0); + env->vfp.regs[rd + 1] = make_float64(d1); + } else { + uint64_t zm = float64_val(env->vfp.regs[rm]); + uint64_t zd = float64_val(env->vfp.regs[rd]); + uint64_t m = 0, d = 0; + switch (size) { + case 0: + d = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) + | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) + | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); + m = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) + | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) + | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); + break; + case 1: + d = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) + | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); + m = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) + | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); + break; + default: + /* size == 2 is a no-op for doubleword vectors */ + break; + } + env->vfp.regs[rm] = make_float64(m); + env->vfp.regs[rd] = make_float64(d); + } +} diff --git a/target-arm/translate.c b/target-arm/translate.c index 3200742..d72a37a 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3614,61 +3614,6 @@ static inline TCGv neon_get_scalar(int size, int reg) return tmp; } -static void gen_neon_zip_u8(TCGv t0, TCGv t1) -{ - TCGv rd, rm, tmp; - - rd = new_tmp(); - rm = new_tmp(); - tmp = new_tmp(); - - tcg_gen_andi_i32(rd, t0, 0xff); - tcg_gen_shli_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t0, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shli_i32(tmp, t1, 24); - tcg_gen_andi_i32(tmp, tmp, 0xff000000); - tcg_gen_or_i32(rd, rd, tmp); - - tcg_gen_andi_i32(rm, t1, 0xff000000); - tcg_gen_shri_i32(tmp, t0, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff0000); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_shri_i32(tmp, t1, 8); - tcg_gen_andi_i32(tmp, tmp, 0xff00); - tcg_gen_or_i32(rm, rm, tmp); - tcg_gen_shri_i32(tmp, t0, 16); - tcg_gen_andi_i32(tmp, tmp, 0xff); - tcg_gen_or_i32(t1, rm, tmp); - tcg_gen_mov_i32(t0, rd); - - dead_tmp(tmp); - dead_tmp(rm); - dead_tmp(rd); -} - -static void gen_neon_zip_u16(TCGv t0, TCGv t1) -{ - TCGv tmp, tmp2; - - tmp = new_tmp(); - tmp2 = new_tmp(); - - tcg_gen_andi_i32(tmp, t0, 0xffff); - tcg_gen_shli_i32(tmp2, t1, 16); - tcg_gen_or_i32(tmp, tmp, tmp2); - tcg_gen_andi_i32(t1, t1, 0xffff0000); - tcg_gen_shri_i32(tmp2, t0, 16); - tcg_gen_or_i32(t1, t1, tmp2); - tcg_gen_mov_i32(t0, tmp); - - dead_tmp(tmp2); - dead_tmp(tmp); -} - static void gen_neon_trn_u8(TCGv t0, TCGv t1) { TCGv rd, tmp; @@ -5389,30 +5334,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) tcg_temp_free_i32(tmp); break; case 35: /* VZIP */ - /* Reg Before After - Rd A3 A2 A1 A0 B1 A1 B0 A0 - Rm B3 B2 B1 B0 B3 A3 B2 A2 - */ - if (size == 3) + if (size == 3 || (!q && size == 2)) { return 1; - count = (q ? 4 : 2); - for (n = 0; n < count; n++) { - tmp = neon_load_reg(rd, n); - tmp2 = neon_load_reg(rd, n); - switch (size) { - case 0: gen_neon_zip_u8(tmp, tmp2); break; - case 1: gen_neon_zip_u16(tmp, tmp2); break; - case 2: /* no-op */; break; - default: abort(); - } - neon_store_scratch(n * 2, tmp); - neon_store_scratch(n * 2 + 1, tmp2); - } - for (n = 0; n < count * 2; n++) { - int reg = (n < count) ? rd : rm; - tmp = neon_load_scratch(n); - neon_store_reg(reg, n % count, tmp); } + tmp = tcg_const_i32(insn); + gen_helper_neon_zip(cpu_env, tmp); + tcg_temp_free_i32(tmp); break; case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */ if (size == 3) -- 1.7.1