[PATCH v3 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VADDCUW and VSUBCUW to decodtree with gvec using an implementation based on the helper, with the main difference being changing the -1 (aka all bits set to 1) result returned by cmp when true to +1. It also implemented a .fni4 version of those instructions and dropped the helper. vaddcuw: reptloopmaster patch 8 12500 0,01008200 0,00612400 (-39.3%) 25 40000,01091500 0,00471600 (-56.8%) 100 10000,01332500 0,00593700 (-55.4%) 500 200 0,01998500 0,01275700 (-36.2%) 250040 0,04704300 0,04364300 (-7.2%) 800012 0,10748200 0,11241000 (+4.6%) vsubcuw: reptloopmaster patch 8 12500 0,01226200 0,00571600 (-53.4%) 25 40000,01493500 0,00462100 (-69.1%) 100 10000,01522700 0,00455100 (-70.1%) 500 200 0,02384600 0,01133500 (-52.5%) 250040 0,04935200 0,03178100 (-35.6%) 800012 0,09039900 0,09440600 (+4.4%) Overall there was a gain in performance, but the TCGop code was still slightly bigger in the new version (it went from 4 to 5). Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 - target/ppc/insn32.decode| 2 + target/ppc/int_helper.c | 18 - target/ppc/translate/vmx-impl.c.inc | 61 +++-- target/ppc/translate/vmx-ops.c.inc | 3 +- 5 files changed, 60 insertions(+), 26 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f02a9497b7..f7047ed2aa 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,11 +193,9 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_3(vsubcuw, 
TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 9a509e84df..aebc7b73c8 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -608,12 +608,14 @@ VRLQNM 000100 . . . 00101000101 @VX ## Vector Integer Arithmetic Instructions +VADDCUW 000100 . . . 0011000@VX VADDCUQ 000100 . . . 0010100@VX VADDUQM 000100 . . . 001@VX VADDEUQM000100 . . . . 00 @VA VADDECUQ000100 . . . . 01 @VA +VSUBCUW 000100 . . . 1011000@VX VSUBCUQ 000100 . . . 1010100@VX VSUBUQM 000100 . . . 101@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index ae1ba8084d..f8dd12e8ae 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,15 +492,6 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -{ -int i; - -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -r->u32[i] = ~a->u32[i] < b->u32[i]; -} -} - /* vprtybw */ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) { @@ -1962,15 +1953,6 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #endif } -void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -{ -int i; - -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -r->u32[i] = a->u32[i] >= b->u32[i]; -} -} - void helper_vsumsws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int64_t t; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 3acd585a2f..f52485a5f1 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28); GEN_VXFORM(vslv, 2, 29); GEN_VXFORM(vslo, 6, 16); GEN_VXFORM(vsro, 6, 17); -GEN_VXFORM(vaddcuw, 0, 6); -GEN_VXFORM(vsubcuw, 0, 22); 
static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece, void (*gen_gvec)(unsigned, uint32_t, uint32_t, @@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx) } -GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \
[PATCH v3 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P
From: "Lucas Mateus Castro (alqotel)" Used gvec to translate XVTSTDCSP and XVTSTDCDP. xvtstdcsp: reptloopimm master version prev versioncurrent version 25 40000 0,2062000,040730 (-80.2%)0,040740 (-80.2%) 25 40001 0,2051200,053650 (-73.8%)0,053510 (-73.9%) 25 40003 0,2061600,058630 (-71.6%)0,058570 (-71.6%) 25 400051 0,2171100,191490 (-11.8%)0,192320 (-11.4%) 25 4000127 0,2061600,191490 (-7.1%) 0,192640 (-6.6%) 800012 0 1,2347190,418833 (-66.1%)0,386365 (-68.7%) 800012 1 1,2324171,435979 (+16.5%)1,462792 (+18.7%) 800012 3 1,2327601,766073 (+43.3%)1,743990 (+41.5%) 800012 51 1,2392811,319562 (+6.5%) 1,423479 (+14.9%) 800012 127 1,2317081,315760 (+6.8%) 1,426667 (+15.8%) xvtstdcdp: reptloopimm master version prev versioncurrent version 25 40000 0,1599300,040830 (-74.5%)0,040610 (-74.6%) 25 40001 0,1606400,053670 (-66.6%)0,053480 (-66.7%) 25 40003 0,1600200,063030 (-60.6%)0,062960 (-60.7%) 25 400051 0,1604100,128620 (-19.8%)0,127470 (-20.5%) 25 4000127 0,1603300,127670 (-20.4%)0,128690 (-19.7%) 800012 0 1,1903650,422146 (-64.5%)0,388417 (-67.4%) 800012 1 1,1912921,445312 (+21.3%)1,428698 (+19.9%) 800012 3 1,1886871,980656 (+66.6%)1,975354 (+66.2%) 800012 51 1,1912501,264500 (+6.1%) 1,355083 (+13.8%) 800012 127 1,1973131,266729 (+5.8%) 1,349156 (+12.7%) Overall, these instructions are the hardest ones to measure performance as the gvec implementation is affected by the immediate. Above there are 5 different scenarios when it comes to immediate and 2 when it comes to rept/loop combination. The immediates scenarios are: all bits are 0 therefore the target register should just be changed to 0, with 1 bit set, with 2 bits set in a combination the new implementation can deal with using gvec, 4 bits set and the new implementation can't deal with it using gvec and all bits set. 
The rept/loop scenarios are high loop and low rept (so it should spend more time executing it than translating it) and high rept low loop (so it should spend more time translating it than executing this code). These comparisons are between the upstream version, a previous similar implementation and a one with a cleaner code(this one). For a comparison with o previous different implementation: <20221010191356.83659-13-lucas.ara...@eldorado.org.br> Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/translate/vsx-impl.c.inc | 164 ++-- 1 file changed, 154 insertions(+), 10 deletions(-) diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index af410cbf1b..7099e7823d 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -632,6 +632,8 @@ static void gen_mtvsrws(DisasContext *ctx) #define SGN_MASK_SP 0x80008000ull #define EXP_MASK_DP 0x7FF0ull #define EXP_MASK_SP 0x7F807F80ull +#define FRC_MASK_DP (~(SGN_MASK_DP | EXP_MASK_DP)) +#define FRC_MASK_SP (~(SGN_MASK_SP | EXP_MASK_SP)) #define VSX_SCALAR_MOVE(name, op, sgn_mask) \ static void glue(gen_, name)(DisasContext *ctx) \ @@ -1112,23 +1114,165 @@ GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, PPC2_ISA300) GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300) GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX) +/* test if +Inf */ +static void gen_is_pos_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v) +{ +uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP; +tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b, +tcg_constant_vec_matching(t, vece, exp_msk)); +} + +/* test if -Inf */ +static void gen_is_neg_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v) +{ +uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP; +uint64_t sgn_msk = (vece == MO_32) ? 
(uint32_t)SGN_MASK_SP : SGN_MASK_DP; +tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b, +tcg_constant_vec_matching(t, vece, sgn_msk | exp_msk)); +} + +/* test if +Inf or -Inf */ +static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v) +{ +uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP; +uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP; +tcg_gen_andc_vec(vece, b, b, tcg_constant_vec_matching(t, vece, sgn_msk)); +tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b, +tcg_constant_vec_matching(
[PATCH v3 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved some of its decoding away from the helper as previously the DCMX, XB and BF were calculated in the helper with the help of cpu_env, now that part was moved to the decodetree with the rest. xvtstdcsp: reptloopmaster patch 8 12500 1,85393600 1,94683600 (+5.0%) 25 40001,78779800 1,92479000 (+7.7%) 100 10002,12775000 2,28895500 (+7.6%) 500 200 2,99655300 3,23102900 (+7.8%) 250040 6,89082200 7,44827500 (+8.1%) 800012 17,5058550018,95152100 (+8.3%) xvtstdcdp: reptloopmaster patch 8 12500 1,39043100 1,33539800 (-4.0%) 25 40001,35731800 1,37347800 (+1.2%) 100 10001,51514800 1,56053000 (+3.0%) 500 200 2,21014400 2,47906000 (+12.2%) 250040 5,39488200 6,68766700 (+24.0%) 800012 13,9862390018,17661900 (+30.0%) xvtstdcdp: reptloopmaster patch 8 12500 1,35123800 1,34455800 (-0.5%) 25 40001,36441200 1,36759600 (+0.2%) 100 10001,49763500 1,54138400 (+2.9%) 500 200 2,19020200 2,46196400 (+12.4%) 250040 5,39265700 6,68147900 (+23.9%) 800012 14,0416360018,19669600 (+29.6%) As some values are now decoded outside the helper and passed to it as an argument the number of arguments of the helper increased, the number of TCGop needed to load the arguments increased. I suspect that's why the slow-down in the tests with a high REPT but low LOOP. 
Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 114 +--- target/ppc/helper.h | 6 +- target/ppc/insn32.decode| 6 ++ target/ppc/translate/vsx-impl.c.inc | 20 - target/ppc/translate/vsx-ops.c.inc | 4 - 5 files changed, 60 insertions(+), 90 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 960a76a8a5..a66e16c212 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb) *xt = t; } -/* - * VSX_TEST_DC - VSX floating point test data class - * op- instruction mnemonic - * nels - number of elements (1, 2 or 4) - * xbn - VSR register number - * tp- type (float32 or float64) - * fld - vsr_t field (VsrD(*) or VsrW(*)) - * tfld - target vsr_t field (VsrD(*) or VsrW(*)) - * fld_max - target field max - * scrf - set result in CR and FPCC - */ -#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf) \ -void helper_##op(CPUPPCState *env, uint32_t opcode) \ -{ \ -ppc_vsr_t *xt = >vsr[xT(opcode)]; \ -ppc_vsr_t *xb = >vsr[xbn]; \ -ppc_vsr_t t = { }; \ -uint32_t i, sign, dcmx; \ -uint32_t cc, match = 0; \ -\ -if (!scrf) {\ -dcmx = DCMX_XV(opcode); \ -} else {\ -t = *xt;\ -dcmx = DCMX(opcode);\ -} \ -\ -for (i = 0; i < nels; i++) {\ -sign = tp##_is_neg(xb->fld);\ -if (tp##_is_any_nan(xb->fld)) { \ -match = extract32(dcmx, 6, 1); \ -} else if (tp##_is_infinity(xb->fld)) { \ -match = extract32(dcmx, 4 + !sign, 1); \ -} else if (tp##_is_zero(xb->fld)) { \ -match = extract32(dcmx, 2 + !sign, 1); \ -} else if (tp##_is_zero_or_denormal(xb->fld)) { \ -match = extract32(dcmx, 0 + !sign, 1); \ -} \ -\ -if (scrf) { \ -cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT; \ -env->fpscr &= ~FP_FPCC; \ -env->fpscr |= cc << FPSCR_FPCC; \ -env->crf[BF(opcode)] = cc; \ -} else {
[PATCH v3 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
From: "Lucas Mateus Castro (alqotel)" Moved XVABSSP, XVABSDP, XVNABSSP,XVNABSDP, XVNEGSP and XVNEGDP to decodetree and used gvec to translate them. xvabssp: reptloopmaster patch 8 12500 0,00477900 0,00476000 (-0.4%) 25 40000,00442800 0,00353300 (-20.2%) 100 10000,00478700 0,00366100 (-23.5%) 500 200 0,00973200 0,00649400 (-33.3%) 250040 0,03165200 0,02226700 (-29.7%) 800012 0,09315900 0,06674900 (-28.3%) xvabsdp: reptloopmaster patch 8 12500 0,00475000 0,00474400 (-0.1%) 25 40000,00355600 0,00367500 (+3.3%) 100 10000,00444200 0,00366000 (-17.6%) 500 200 0,00942700 0,00732400 (-22.3%) 250040 0,0299 0,02308500 (-22.8%) 800012 0,08770300 0,06683800 (-23.8%) xvnabssp: reptloopmaster patch 8 12500 0,00494500 0,00492900 (-0.3%) 25 40000,00397700 0,00338600 (-14.9%) 100 10000,00421400 0,00353500 (-16.1%) 500 200 0,01048000 0,00707100 (-32.5%) 250040 0,03251500 0,02238300 (-31.2%) 800012 0,08889100 0,06469800 (-27.2%) xvnabsdp: reptloopmaster patch 8 12500 0,00511000 0,00492700 (-3.6%) 25 40000,00398800 0,00381500 (-4.3%) 100 10000,00390500 0,00365900 (-6.3%) 500 200 0,00924800 0,00784600 (-15.2%) 250040 0,03138900 0,02391600 (-23.8%) 800012 0,09654200 0,05684600 (-41.1%) xvnegsp: reptloopmaster patch 8 12500 0,00493900 0,00452800 (-8.3%) 25 40000,00369100 0,00366800 (-0.6%) 100 10000,00371100 0,0038 (+2.4%) 500 200 0,00991100 0,00652300 (-34.2%) 250040 0,03025800 0,02422300 (-19.9%) 800012 0,09251100 0,06457600 (-30.2%) xvnegdp: reptloopmaster patch 8 12500 0,00474900 0,00454400 (-4.3%) 25 40000,00353100 0,00325600 (-7.8%) 100 10000,00398600 0,00366800 (-8.0%) 500 200 0,01032300 0,00702400 (-32.0%) 250040 0,03125000 0,02422400 (-22.5%) 800012 0,09475100 0,06173000 (-34.9%) This one to me seemed the opposite of the previous instructions, as it looks like there was an improvement in the translation time (itself not a surprise as operations were done twice before so there was the need to translate twice as many TCGop) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: 
Richard Henderson --- target/ppc/insn32.decode| 9 target/ppc/translate/vsx-impl.c.inc | 73 ++--- target/ppc/translate/vsx-ops.c.inc | 6 --- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index ae151c4b62..5b687078be 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 . @X_TSX STXVRWX 01 . . . 0011001101 . @X_TSX STXVRDX 01 . . . 0011101101 . @X_TSX +## VSX Vector Binary Floating-Point Sign Manipulation Instructions + +XVABSDP 00 . 0 . 111011001 .. @XX2 +XVABSSP 00 . 0 . 110011001 .. @XX2 +XVNABSDP00 . 0 . 01001 .. @XX2 +XVNABSSP00 . 0 . 110101001 .. @XX2 +XVNEGDP 00 . 0 . 11001 .. @XX2 +XVNEGSP 00 . 0 . 110111001 .. @XX2 + ## VSX Scalar Multiply-Add Instructions XSMADDADP 00 . . . 0011 . . . @XX3 diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index e6e5c45ffd..8717e20d08 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -782,15 +782,76 @@ static void glue(gen_, name)(DisasContext *ctx) \ tcg_temp_free_i64(sgm); \ } -VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP) VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP) VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP) +#define TCG_OP_IMM_i64(FUNC, OP, IMM) \ +static void FUNC(TCGv_i64 t, TCGv_i64 b)\ +{ \ +OP(t, b, IMM); \ +} + +TCG_OP_IMM_i64(do_xvabssp_i64, tcg_ge
[PATCH v3 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VABSDUB, VABSDUH and VABSDUW to decodetree and use gvec to translate them. vabsdub: reptloopmaster patch 8 12500 0,03601600 0,00688500 (-80.9%) 25 40000,03651000 0,00532100 (-85.4%) 100 10000,03666900 0,00595300 (-83.8%) 500 200 0,04305800 0,01244600 (-71.1%) 250040 0,06893300 0,04273700 (-38.0%) 800012 0,14633200 0,12660300 (-13.5%) vabsduh: reptloopmaster patch 8 12500 0,02172400 0,00687500 (-68.4%) 25 40000,02154100 0,00531500 (-75.3%) 100 10000,02235400 0,00596300 (-73.3%) 500 200 0,02827500 0,01245100 (-56.0%) 250040 0,05638400 0,04285500 (-24.0%) 800012 0,13166000 0,12641400 (-4.0%) vabsduw: reptloopmaster patch 8 12500 0,01646400 0,00688300 (-58.2%) 25 40000,01454500 0,00475500 (-67.3%) 100 10000,01545800 0,00511800 (-66.9%) 500 200 0,02168200 0,01114300 (-48.6%) 250040 0,04571300 0,04138800 (-9.5%) 800012 0,12209500 0,12178500 (-0.3%) Same as VADDCUW and VSUBCUW, overall performance gain but it uses more TCGop (4 before the patch, 6 after). 
Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 6 ++-- target/ppc/insn32.decode| 6 target/ppc/int_helper.c | 13 +++- target/ppc/translate/vmx-impl.c.inc | 49 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 60 insertions(+), 17 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 71c22efc2e..fd8280dfa7 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) -DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 38458c01de..ae151c4b62 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -528,6 +528,12 @@ VAVGUB 000100 . . . 110@VX VAVGUH 000100 . . . 1000110@VX VAVGUW 000100 . . . 1001010@VX +## Vector Integer Absolute Difference Instructions + +VABSDUB 000100 . . . 111@VX +VABSDUH 000100 . . . 1000111@VX +VABSDUW 000100 . . . 1001011@VX + ## Vector Bit Manipulation Instruction VGNB000100 . -- ... . 
10011001100 @VX_n diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index bda76e54d4..d97a7f1f28 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -589,8 +589,8 @@ VAVG(VAVGSW, s32, int64_t) VAVG(VAVGUW, u32, uint64_t) #undef VAVG -#define VABSDU_DO(name, element)\ -void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ +#define VABSDU(name, element) \ +void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\ { \ int i; \ \ @@ -606,12 +606,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ * name- instruction mnemonic suffix (b: byte, h: halfword, w: word) * element - element type to access from vector */ -#define VABSDU(type, element) \ -VABSDU_DO(absdu##type, element) -VABSDU(b, u8) -VABSDU(h, u16) -VABSDU(w, u32) -#undef VABSDU_DO +VABSDU(VABSDUB, u8) +VABSDU(VABSDUH, u16) +VABSDU(VABSDUW, u32) #undef VABSDU #define VCF(suffix, cvt, element) \ diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 195c601f7a..7741f2eb49 100644 --- a/
[PATCH v3 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH, VAVGSW, to decodetree and use gvec with them. For these one the right shift had to be made before the sum as to avoid an overflow, so add 1 at the end if any of the entries had 1 in its LSB as to replicate the "+ 1" before the shift described by the ISA. vavgub: reptloopmaster patch 8 12500 0,02616600 0,00754200 (-71.2%) 25 40000,0253 0,00637700 (-74.8%) 100 10000,02604600 0,00790100 (-69.7%) 500 200 0,03189300 0,01838400 (-42.4%) 250040 0,06006900 0,06851000 (+14.1%) 800012 0,13941000 0,20548500 (+47.4%) vavguh: reptloopmaster patch 8 12500 0,01818200 0,00780600 (-57.1%) 25 40000,01789300 0,00641600 (-64.1%) 100 10000,01899100 0,00787200 (-58.5%) 500 200 0,02527200 0,01828400 (-27.7%) 250040 0,05361800 0,06773000 (+26.3%) 800012 0,12886600 0,20291400 (+57.5%) vavguw: reptloopmaster patch 8 12500 0,01423100 0,00776600 (-45.4%) 25 40000,01780800 0,00638600 (-64.1%) 100 10000,02085500 0,00787000 (-62.3%) 500 200 0,02737100 0,01828800 (-33.2%) 250040 0,05572600 0,06774200 (+21.6%) 800012 0,13101700 0,20311600 (+55.0%) vavgsb: reptloopmaster patch 8 12500 0,03006000 0,00788600 (-73.8%) 25 40000,02882200 0,00637800 (-77.9%) 100 10000,02958000 0,00791400 (-73.2%) 500 200 0,03548800 0,01860400 (-47.6%) 250040 0,0636 0,06850800 (+7.7%) 800012 0,13816500 0,20550300 (+48.7%) vavgsh: reptloopmaster patch 8 12500 0,01965900 0,00776600 (-60.5%) 25 40000,01875400 0,00638700 (-65.9%) 100 10000,01952200 0,00786900 (-59.7%) 500 200 0,02562000 0,01760300 (-31.3%) 250040 0,05384300 0,06742800 (+25.2%) 800012 0,13240800 0,2033 (+53.5%) vavgsw: reptloopmaster patch 8 12500 0,01407700 0,00775600 (-44.9%) 25 40000,01762300 0,0064 (-63.7%) 100 10000,02046500 0,00788500 (-61.5%) 500 200 0,02745600 0,01843000 (-32.9%) 250040 0,05375500 0,06820500 (+26.9%) 800012 0,13068300 0,20304900 (+55.4%) These results to me seems to indicate that with gvec the results have a slower translation but 
faster execution. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 12 ++-- target/ppc/insn32.decode| 9 +++ target/ppc/int_helper.c | 32 - target/ppc/translate/vmx-impl.c.inc | 106 target/ppc/translate/vmx-ops.c.inc | 9 +-- 5 files changed, 127 insertions(+), 41 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index a06193bc67..71c22efc2e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) #define dh_ctype_acc ppc_acc_t * #define dh_typecode_acc dh_typecode_ptr -DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aa4968e6b9..38458c01de 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 
011111 @VC VCMPSQ 000100 ... -- . . 0010101 @VX_bf VCMPUQ 000100 ... -- . .
[PATCH v3 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VPRTYBW and VPRTYBD to use gvec and both of them and VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and .fni8, respectively. vprtybw: reptloopmaster patch 8 12500 0,01198900 0,00703100 (-41.4%) 25 40000,01070100 0,00571400 (-46.6%) 100 10000,01123300 0,00678200 (-39.6%) 500 200 0,01601500 0,01535600 (-4.1%) 250040 0,03872900 0,05562100 (43.6%) 800012 0,10047000 0,16643000 (65.7%) vprtybd: reptloopmaster patch 8 12500 0,00757700 0,00788100 (4.0%) 25 40000,00652500 0,00669600 (2.6%) 100 10000,00714400 0,00825400 (15.5%) 500 200 0,01211000 0,01903700 (57.2%) 250040 0,03483800 0,07021200 (101.5%) 800012 0,09591800 0,21036200 (119.3%) vprtybq: reptloopmaster patch 8 12500 0,00675600 0,00667200 (-1.2%) 25 40000,00619400 0,00643200 (3.8%) 100 10000,00707100 0,00751100 (6.2%) 500 200 0,01199300 0,01342000 (11.9%) 250040 0,03490900 0,04092900 (17.2%) 800012 0,09588200 0,11465100 (19.6%) I wasn't expecting such a performance lost in both VPRTYBD and VPRTYBQ, I'm not sure if it's worth to move those instructions. Comparing the assembly of the helper with the TCGop they are pretty similar, so I'm not sure why vprtybd took so much more time. 
Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 4 +- target/ppc/insn32.decode| 4 ++ target/ppc/int_helper.c | 25 +-- target/ppc/translate/vmx-impl.c.inc | 68 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 71 insertions(+), 33 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index b2e910b089..a06193bc67 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) +DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 2658dd3395..aa4968e6b9 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -529,6 +529,10 @@ VCTZDM 000100 . . . 1000100@VX VPDEPD 000100 . . . 10111001101@VX VPEXTD 000100 . . . 10110001101@VX +VPRTYBD 000100 . 01001 . 1100010@VX_tb +VPRTYBQ 000100 . 01010 . 1100010@VX_tb +VPRTYBW 000100 . 01000 . 1100010@VX_tb + ## Vector Permute and Formatting Instruction VEXTDUBVLX 000100 . . . . 
011000 @VA diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index c7fd0d1faa..c6ce4665fa 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -/* vprtybw */ -void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) -{ -int i; -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -uint64_t res = b->u32[i] ^ (b->u32[i] >> 16); -res ^= res >> 8; -r->u32[i] = res & 1; -} -} - -/* vprtybd */ -void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) -{ -int i; -for (i = 0; i < ARRAY_SIZE(r->u64); i++) { -uint64_t res = b->u64[i] ^ (b->u64[i] >> 32); -res ^= res >> 16; -res ^= res >> 8; -r->u64[i] = res & 1; -} -} - /* vprtybq */ -void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { uint64_t res = b->u64[0] ^ b->u64[1]; res ^= res >> 32; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index b9a9e83ab3..cbb2a3ebe7 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -1659,9 +1659,71 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); GEN_VXFORM_NOA_ENV(vrfin, 5, 8); GEN_VXFORM_NOA_ENV(vrfip, 5, 10); GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); -GE
[PATCH v3 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VNEGW and VNEGD to decodetree and used gvec to decode it. vnegw: reptloopmaster patch 8 12500 0,01053200 0,00548400 (-47.9%) 25 40000,01030500 0,0039 (-62.2%) 100 10000,01096300 0,00395400 (-63.9%) 500 200 0,01472000 0,00712300 (-51.6%) 250040 0,03809000 0,02147700 (-43.6%) 800012 0,09957100 0,06202100 (-37.7%) vnegd: reptloopmaster patch 8 12500 0,00594600 0,00543800 (-8.5%) 25 40000,00575200 0,00396400 (-31.1%) 100 10000,00676100 0,00394800 (-41.6%) 500 200 0,01149300 0,00709400 (-38.3%) 250040 0,03441500 0,02169600 (-37.0%) 800012 0,09516900 0,06337000 (-33.4%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 -- target/ppc/insn32.decode| 3 +++ target/ppc/int_helper.c | 12 target/ppc/translate/vmx-impl.c.inc | 15 +-- target/ppc/translate/vmx-ops.c.inc | 2 -- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f7047ed2aa..b2e910b089 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -229,8 +229,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr) -DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aebc7b73c8..2658dd3395 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010 @VX_tb VEXTSW2D000100 . 11010 . 1100010@VX_tb VEXTSD2Q000100 . 11011 . 1100010@VX_tb +VNEGD 000100 . 00111 . 1100010@VX_tb +VNEGW 000100 . 
00110 . 1100010@VX_tb + ## Vector Mask Manipulation Instructions MTVSRBM 000100 . 1 . 1100110@VX_tb diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index f8dd12e8ae..c7fd0d1faa 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1928,18 +1928,6 @@ XXBLEND(W, 32) XXBLEND(D, 64) #undef XXBLEND -#define VNEG(name, element) \ -void helper_##name(ppc_avr_t *r, ppc_avr_t *b) \ -{ \ -int i; \ -for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ -r->element[i] = -b->element[i]; \ -} \ -} -VNEG(vnegw, s32) -VNEG(vnegd, s64) -#undef VNEG - void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int sh = (b->VsrB(0xf) >> 3) & 0xf; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index f52485a5f1..b9a9e83ab3 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) GEN_VXFORM_TRANS(vclzw, 1, 30) GEN_VXFORM_TRANS(vclzd, 1, 31) -GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) -GEN_VXFORM_NOA_2(vnegd, 1, 24, 7) + +static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA300); +REQUIRE_VECTOR(ctx); + +tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb), + 16, 16); +return true; +} + +TRANS(VNEGW, do_vneg, MO_32) +TRANS(VNEGD, do_vneg, MO_64) static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s) { diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index ded0234123..27908533dd 100644 --- a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10), GEN_VXFORM(vspltisb, 6, 12), GEN_VXFORM(vspltish, 6, 13), GEN_VXFORM(vspltisw, 6, 14), -GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06), -GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07), GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C), GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D), 
GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E), -- 2.37.3
[PATCH v3 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree
From: "Lucas Mateus Castro (alqotel)" This patch moves VMHADDSHS and VMHRADDSHS to decodetree I couldn't find a satisfactory implementation with TCG inline. vmhaddshs: reptloopmaster patch 8 12500 0,02983400 0,02648500 (-11.2%) 25 40000,02946000 0,02518000 (-14.5%) 100 10000,03104300 0,02638000 (-15.0%) 500 200 0,04002000 0,03502500 (-12.5%) 250040 0,08090100 0,07562200 (-6.5%) 800012 0,19242600 0,18626800 (-3.2%) vmhraddshs: reptloopmaster patch 8 12500 0,03078600 0,02851000 (-7.4%) 25 40000,02793200 0,02746900 (-1.7%) 100 10000,02886000 0,02839900 (-1.6%) 500 200 0,03714700 0,03799200 (+2.3%) 250040 0,07948000 0,07852200 (-1.2%) 800012 0,19049800 0,18813900 (-1.2%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 4 ++-- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 4 ++-- target/ppc/translate/vmx-impl.c.inc | 5 +++-- target/ppc/translate/vmx-ops.c.inc | 1 - 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 9c562ab00e..f02a9497b7 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr) DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr) DEF_HELPER_4(vpkudum, void, env, avr, avr, avr) DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr) -DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 7445455a12..9a509e84df 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 
010111 @VA VMSUMUDM000100 . . . . 100011 @VA VMLADDUHM 000100 . . . . 100010 @VA +VMHADDSHS 000100 . . . . 10 @VA +VMHRADDSHS 000100 . . . . 11 @VA ## Vector String Instructions diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 0d25000b2a..ae1ba8084d 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r) return count; } -void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; @@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 9f18c6d4f2..3acd585a2f 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ tcg_temp_free_ptr(rd); \ } -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) +GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) static bool do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) @@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a, TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS) TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS) -GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) +TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS) +TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS) GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index a3a0fd0650..7cd9d40e06 100644 --- 
a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15), #define GEN_VAFORM_PAIRED(name0, name1, opc2) \ GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC) -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16), GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23), GEN_VXFORM_DUAL(vclzb, vpopc
[PATCH v3 00/12] VMX/VSX instructions with gvec
From: "Lucas Mateus Castro (alqotel)" Patches missing review: 12 v2 -> v3: - Used ctpop in i32 and i64 vprtyb - Changed gvec set up in xvtstdc[ds]p v1 -> v2: - Implemented instructions with fni4/fni8 and dropped the helper: * VSUBCUW * VADDCUW * VPRTYBW * VPRTYBD - Reworked patch12 to only use gvec implementation with a few immediates. - Used bitsel_ver on patch9 - Changed vec variables to tcg_constant_vec when possible This patch series moves some instructions from decode legacy to decodetree and translate said instructions with gvec. Some cases using gvec ended up with a bigger, more complex and slower so those instructions were only moved to decodetree. In each patch there's a comparison of the execution time before the patch being applied and after. Said result is the sum of 10 executions. The program used to time the execution worked like this: clock_t start = clock(); for (int i = 0; i < LOOP; i++) { asm ( load values in registers, between 2 and 3 instructions ".rept REPT\n\t" "INSTRUCTION registers\n\t" ".endr\n\t" save result from register, 1 instruction ); } clock_t end = clock(); printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n", ((double)(end - start))/ CLOCKS_PER_SEC); Where the column rept in the value used in .rept in the inline assembly and loop column is the value used for the for loop. All of those tests were executed on a Power9. When comparing the TCGop the data used was gathered using '-d op' and '-d op_opt'. 
Lucas Mateus Castro (alqotel) (12): target/ppc: Moved VMLADDUHM to decodetree and use gvec target/ppc: Move VMH[R]ADDSHS instruction to decodetree target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec target/ppc: Move VNEG[WD] to decodtree and use gvec target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec target/ppc: Move VABSDU[BHW] to decodetree and use gvec target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P target/ppc: Use gvec to decode XVCPSGN[SD]P target/ppc: Moved XVTSTDC[DS]P to decodetree target/ppc: Moved XSTSTDC[QDS]P to decodetree target/ppc: Use gvec to decode XVTSTDC[DS]P target/ppc/fpu_helper.c | 137 +- target/ppc/helper.h | 42 ++-- target/ppc/insn32.decode| 50 target/ppc/int_helper.c | 107 ++-- target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 352 ++ target/ppc/translate/vmx-ops.c.inc | 15 +- target/ppc/translate/vsx-impl.c.inc | 372 +++- target/ppc/translate/vsx-ops.c.inc | 21 -- 9 files changed, 771 insertions(+), 326 deletions(-) -- 2.37.3
[PATCH v3 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VMLADDUHM to decodetree a creates a gvec implementation using mul_vec and add_vec. reptloopmaster patch 8 12500 0,01810500 0,00903100 (-50.1%) 25 40000,01739400 0,00747700 (-57.0%) 100 10000,01843600 0,00901400 (-51.1%) 500 200 0,02574600 0,01971000 (-23.4%) 250040 0,05921600 0,07121800 (+20.3%) 800012 0,15326700 0,21725200 (+41.7%) The significant difference in performance when REPT is low and LOOP is high I think is due to the fact that the new implementation has a higher translation time, as when using a helper only 5 TCGop are used but with the patch a total of 10 TCGop are needed (Power lacks a direct mul_vec equivalent so this instruction is implemented with the help of 5 others, vmuleu, vmulou, vmrgh, vmrgl and vpkum). Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 +- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 3 +- target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 48 ++--- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 57eee07256..9c562ab00e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr) -DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) +DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env) DEF_HELPER_3(lvebx, void, env, avr, tl) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index a5249ee32c..7445455a12 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -693,6 +693,8 @@ 
VMSUMUHS000100 . . . . 100111 @VA VMSUMCUD000100 . . . . 010111 @VA VMSUMUDM000100 . . . . 100011 @VA +VMLADDUHM 000100 . . . . 100010 @VA + ## Vector String Instructions VSTRIBL 000100 . 0 . . 001101 @VX_tb_rc diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 696096100b..0d25000b2a 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) +void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c, + uint32_t v) { int i; diff --git a/target/ppc/translate.c b/target/ppc/translate.c index e810842925..11f729c60c 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x0001, PPC_ALTIVEC), GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x0001, PPC_ALTIVEC), GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC), GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC), -GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC), #if defined(TARGET_PPC64) GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE, PPC2_ISA300), diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index e644ad3236..9f18c6d4f2 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) -static void gen_vmladduhm(DisasContext *ctx) -{ -TCGv_ptr ra, rb, rc, rd; -if (unlikely(!ctx->altivec_enabled)) { -gen_exception(ctx, POWERPC_EXCP_VPU); -return; -} -ra = gen_avr_ptr(rA(ctx->opcode)); -rb = gen_avr_ptr(rB(ctx->opcode)); -rc = gen_avr_ptr(rC(ctx->opcode)); -rd = gen_avr_ptr(rD(ctx->opcode)); -gen_helper_vmladduhm(rd, ra, rb, rc); -tcg_temp_free_ptr(ra); -tcg_temp_free_ptr(rb); -tcg_temp_free_ptr(rc); 
-tcg_temp_free_ptr(rd); -} - static bool do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) { @@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, gen_helper_VSUBECUQ) TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM) TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR) +static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b, +
[PATCH v3 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P
From: "Lucas Mateus Castro (alqotel)" Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate them. xvcpsgnsp: reptloopmaster patch 8 12500 0,00561400 0,00537900 (-4.2%) 25 40000,00562100 0,0040 (-28.8%) 100 10000,00696900 0,00416300 (-40.3%) 500 200 0,02211900 0,00840700 (-62.0%) 250040 0,09328600 0,02728300 (-70.8%) 800012 0,27295300 0,06867800 (-74.8%) xvcpsgndp: reptloopmaster patch 8 12500 0,00556300 0,00584200 (+5.0%) 25 40000,00482700 0,00431700 (-10.6%) 100 10000,00585800 0,00464400 (-20.7%) 500 200 0,01565300 0,00839700 (-46.4%) 250040 0,05766500 0,02430600 (-57.8%) 800012 0,19875300 0,07947100 (-60.0%) Like the previous instructions there seemed to be a improvement on translation time. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn32.decode| 2 + target/ppc/translate/vsx-impl.c.inc | 109 ++-- target/ppc/translate/vsx-ops.c.inc | 3 - 3 files changed, 55 insertions(+), 59 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 5b687078be..6549c4040e 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 .. @XX2 XVNABSSP00 . 0 . 110101001 .. @XX2 XVNEGDP 00 . 0 . 11001 .. @XX2 XVNEGSP 00 . 0 . 110111001 .. @XX2 +XVCPSGNDP 00 . . . ... @XX3 +XVCPSGNSP 00 . . . 1101 ... 
@XX3 ## VSX Scalar Multiply-Add Instructions diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 8717e20d08..1c289238ec 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP) -#define VSX_VECTOR_MOVE(name, op, sgn_mask) \ -static void glue(gen_, name)(DisasContext *ctx) \ -{\ -TCGv_i64 xbh, xbl, sgm; \ -if (unlikely(!ctx->vsx_enabled)) { \ -gen_exception(ctx, POWERPC_EXCP_VSXU); \ -return; \ -}\ -xbh = tcg_temp_new_i64();\ -xbl = tcg_temp_new_i64();\ -sgm = tcg_temp_new_i64();\ -get_cpu_vsr(xbh, xB(ctx->opcode), true); \ -get_cpu_vsr(xbl, xB(ctx->opcode), false);\ -tcg_gen_movi_i64(sgm, sgn_mask); \ -switch (op) {\ -case OP_ABS: { \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NABS: { \ -tcg_gen_or_i64(xbh, xbh, sgm); \ -tcg_gen_or_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NEG: { \ -tcg_gen_xor_i64(xbh, xbh, sgm); \ -tcg_gen_xor_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_CPSGN: { \ -TCGv_i64 xah = tcg_temp_new_i64(); \ -TCGv_i64 xal = tcg_temp_new_i64(); \ -get_cpu_vsr(xah, xA(ctx->opcode), true); \ -get_cpu_vsr(xal, xA(ctx->opcode), false);\ -tcg_gen_and_i64(xah, xah, sgm); \ -tcg_gen_and_i64(xal, xal, sgm); \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -tcg
[PATCH v3 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XVTSTDCSP and XVTSTDCDP to decodetree an restructured the helper to be simpler and do all decoding in the decodetree (so XB, XT and DCMX are all calculated outside the helper). Obs: The tests in this one are slightly different, these are the sum of these instructions with all possible immediate and those instructions are repeated 10 times. xvtstdcsp: reptloopmaster patch 8 12500 2,76402100 2,70699100 (-2.1%) 25 40002,64867100 2,67884100 (+1.1%) 100 10002,73806300 2,78701000 (+1.8%) 500 200 3,44666500 3,61027600 (+4.7%) 250040 5,85790200 6,47475500 (+10.5%) 800012 15,2210210017,46062900 (+14.7%) xvtstdcdp: reptloopmaster patch 8 12500 2,11818000 1,61065300 (-24.0%) 25 40002,04573400 1,60132200 (-21.7%) 100 10002,13834100 1,69988100 (-20.5%) 500 200 2,73977000 2,48631700 (-9.3%) 250040 5,05067000 5,25914100 (+4.1%) 800012 14,6050780015,93704900 (+9.1%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 39 +++-- target/ppc/helper.h | 4 +-- target/ppc/insn32.decode| 5 target/ppc/translate/vsx-impl.c.inc | 28 +++-- target/ppc/translate/vsx-ops.c.inc | 8 -- 5 files changed, 70 insertions(+), 14 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index ae25f32d6e..960a76a8a5 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ } \ } -VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0) -VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0) VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1) VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1) +#define VSX_TSTDC(tp) \ +static int32_t tp##_tstdc(tp b, uint32_t dcmx) \ +{ \ +uint32_t match = 0; \ +uint32_t sign = tp##_is_neg(b); \ +if (tp##_is_any_nan(b)) { \ +match = extract32(dcmx, 6, 1); \ +} else if 
(tp##_is_infinity(b)) { \ +match = extract32(dcmx, 4 + !sign, 1); \ +} else if (tp##_is_zero(b)) { \ +match = extract32(dcmx, 2 + !sign, 1); \ +} else if (tp##_is_zero_or_denormal(b)) { \ +match = extract32(dcmx, 0 + !sign, 1); \ +} \ +return (match != 0);\ +} + +VSX_TSTDC(float32) +VSX_TSTDC(float64) +#undef VSX_TSTDC + +void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 2; i++) { +t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx); +} +} + +void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 4; i++) { +t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx); +} +} + void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb) { uint32_t dcmx, sign, exp; diff --git a/target/ppc/helper.h b/target/ppc/helper.h index fd8280dfa7..9e5d11939b 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr) -DEF_HELPER_2(xvtstdcsp, void, env, i32) -DEF_HELPER_2(xvtstdcdp, void, env, i32) +DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) +DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) DEF_HELPER_3(xvrspi, void, env, vsr, vsr) DEF_HELPER_3(xvrspic, void, env, vsr, vsr) DEF_HELPER_3(xvrspim, void, env, vsr, vsr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 6549c4040e..c0a531be5c 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -199,6 +199,9 @@ @XX2_uim4 .. . . uim:4 . . .. _uim xt=%xx_xt xb=%xx_xb +%xx_uim76:1 2:1 16:5 +@XX2_uim7 .. . . . . ... . .._uim xt=%xx_xt xb=%xx_xb uim=%xx_uim7 + _bf_xb bf xb @XX2_bf_xb .. bf:3 .. . . . . ._bf_xb xb=%xx_xb @@ -848,6 +851
[PATCH v2 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P
From: "Lucas Mateus Castro (alqotel)" Used gvec to translate XVTSTDCSP and XVTSTDCDP. xvtstdcsp: reptloopimm prev versioncurrent version 25 40000 0,0475500,040820 (-14.2%) 25 40001 0,0695200,053520 (-23.0%) 25 40003 0,0786600,058470 (-25.7%) 25 400051 0,0992800,190100 (+91.5%) 25 4000127 0,1296900,201750 (+55.6%) 800012 0 0,5546250,391385 (-29.4%) 800012 1 2,6756351,423656 (-46.8%) 800012 3 3,1868231,756885 (-44.9%) 800012 51 4,2844171,363698 (-68.2%) 800012 127 5,6380001,305333 (-76.8%) xvtstdcdp: reptloopimm prev versioncurrent version 25 40000 0,0474500,040590 (-14.5%) 25 40001 0,0741300,053570 (-27.7%) 25 40003 0,0841800,063020 (-25.1%) 25 400051 0,1033400,127980 (+23.8%) 25 4000127 0,1346700,128660 (-4.5%) 800012 0 0,5224270,391510 (-25.1%) 800012 1 2,8847081,426802 (-50.5%) 800012 3 3,4276251,972115 (-42.5%) 800012 51 4,4502601,251865 (-71.9%) 800012 127 5,8544791,250719 (-78.6%) Overall, these instructions are the hardest ones to measure performance as the gvec implementation is affected by the immediate. Above there are 5 different scenarios when it comes to immediate and 2 when it comes to rept/loop combination. The immediates scenarios are: all bits are 0 therefore the target register should just be changed to 0, with 1 bit set, with 2 bits set in a combination the new implementation can deal with using gvec, 4 bits set and the new implementation can't deal with it using gvec and all bits set. The rept/loop scenarios are high loop and low rept (so it should spend more time executing it than translating it) and high rept low loop (so it should spend more time translating it than executing this code). There was a gain when it came to translating the instructions and in the execution time in the immediates the new implementation is configured to accept, but a loss in performance in execution time for more exoteric immediates. 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/fpu_helper.c | 7 +- target/ppc/helper.h | 4 +- target/ppc/translate/vsx-impl.c.inc | 188 ++-- 3 files changed, 184 insertions(+), 15 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index a66e16c212..6c94576575 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -22,6 +22,7 @@ #include "exec/exec-all.h" #include "internal.h" #include "fpu/softfloat.h" +#include "tcg/tcg-gvec-desc.h" static inline float128 float128_snan_to_qnan(float128 x) { @@ -3263,17 +3264,19 @@ VSX_TSTDC(float64) VSX_TSTDC(float128) #undef VSX_TSTDC -void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx) { int i; +dcmx = simd_data(dcmx); for (i = 0; i < 2; i++) { t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx); } } -void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx) { int i; +dcmx = simd_data(dcmx); for (i = 0; i < 4; i++) { t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx); } diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 8344fe39c6..2851418acc 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr) -DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) -DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) +DEF_HELPER_FLAGS_3(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i32) +DEF_HELPER_FLAGS_3(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i32) DEF_HELPER_3(xvrspi, void, env, vsr, vsr) DEF_HELPER_3(xvrspic, void, env, vsr, vsr) DEF_HELPER_3(xvrspim, void, env, vsr, vsr) diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc 
index 4fdbc45ff4..26fc8c0b01 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -632,6 +632,8 @@ static void gen_mtvsrws(DisasContext *ctx) #define SGN_MASK_SP 0x80008000ull #define EXP_MASK_DP 0x7FF0ull #define EXP_MASK_SP 0x7F807F80ull +#define FRC_MASK_DP (~(SGN_MASK_
[PATCH v2 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved some of its decoding away from the helper as previously the DCMX, XB and BF were calculated in the helper with the help of cpu_env, now that part was moved to the decodetree with the rest. xvtstdcsp: reptloopmaster patch 8 12500 1,85393600 1,94683600 (+5.0%) 25 40001,78779800 1,92479000 (+7.7%) 100 10002,12775000 2,28895500 (+7.6%) 500 200 2,99655300 3,23102900 (+7.8%) 250040 6,89082200 7,44827500 (+8.1%) 800012 17,5058550018,95152100 (+8.3%) xvtstdcdp: reptloopmaster patch 8 12500 1,39043100 1,33539800 (-4.0%) 25 40001,35731800 1,37347800 (+1.2%) 100 10001,51514800 1,56053000 (+3.0%) 500 200 2,21014400 2,47906000 (+12.2%) 250040 5,39488200 6,68766700 (+24.0%) 800012 13,9862390018,17661900 (+30.0%) xvtstdcdp: reptloopmaster patch 8 12500 1,35123800 1,34455800 (-0.5%) 25 40001,36441200 1,36759600 (+0.2%) 100 10001,49763500 1,54138400 (+2.9%) 500 200 2,19020200 2,46196400 (+12.4%) 250040 5,39265700 6,68147900 (+23.9%) 800012 14,0416360018,19669600 (+29.6%) As some values are now decoded outside the helper and passed to it as an argument the number of arguments of the helper increased, the number of TCGop needed to load the arguments increased. I suspect that's why the slow-down in the tests with a high REPT but low LOOP. 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/fpu_helper.c | 114 +--- target/ppc/helper.h | 6 +- target/ppc/insn32.decode| 6 ++ target/ppc/translate/vsx-impl.c.inc | 20 - target/ppc/translate/vsx-ops.c.inc | 4 - 5 files changed, 60 insertions(+), 90 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 960a76a8a5..a66e16c212 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb) *xt = t; } -/* - * VSX_TEST_DC - VSX floating point test data class - * op- instruction mnemonic - * nels - number of elements (1, 2 or 4) - * xbn - VSR register number - * tp- type (float32 or float64) - * fld - vsr_t field (VsrD(*) or VsrW(*)) - * tfld - target vsr_t field (VsrD(*) or VsrW(*)) - * fld_max - target field max - * scrf - set result in CR and FPCC - */ -#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf) \ -void helper_##op(CPUPPCState *env, uint32_t opcode) \ -{ \ -ppc_vsr_t *xt = >vsr[xT(opcode)]; \ -ppc_vsr_t *xb = >vsr[xbn]; \ -ppc_vsr_t t = { }; \ -uint32_t i, sign, dcmx; \ -uint32_t cc, match = 0; \ -\ -if (!scrf) {\ -dcmx = DCMX_XV(opcode); \ -} else {\ -t = *xt;\ -dcmx = DCMX(opcode);\ -} \ -\ -for (i = 0; i < nels; i++) {\ -sign = tp##_is_neg(xb->fld);\ -if (tp##_is_any_nan(xb->fld)) { \ -match = extract32(dcmx, 6, 1); \ -} else if (tp##_is_infinity(xb->fld)) { \ -match = extract32(dcmx, 4 + !sign, 1); \ -} else if (tp##_is_zero(xb->fld)) { \ -match = extract32(dcmx, 2 + !sign, 1); \ -} else if (tp##_is_zero_or_denormal(xb->fld)) { \ -match = extract32(dcmx, 0 + !sign, 1); \ -} \ -\ -if (scrf) { \ -cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT; \ -env->fpscr &= ~FP_FPCC; \ -env->fpscr |= cc << FPSCR_FPCC; \ -env->crf[BF(opcode)] = cc; \ -} else {\ -t.tfld = match ? fld_max : 0; \ -}
[PATCH v2 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XVTSTDCSP and XVTSTDCDP to decodetree an restructured the helper to be simpler and do all decoding in the decodetree (so XB, XT and DCMX are all calculated outside the helper). Obs: The tests in this one are slightly different, these are the sum of these instructions with all possible immediate and those instructions are repeated 10 times. xvtstdcsp: reptloopmaster patch 8 12500 2,76402100 2,70699100 (-2.1%) 25 40002,64867100 2,67884100 (+1.1%) 100 10002,73806300 2,78701000 (+1.8%) 500 200 3,44666500 3,61027600 (+4.7%) 250040 5,85790200 6,47475500 (+10.5%) 800012 15,2210210017,46062900 (+14.7%) xvtstdcdp: reptloopmaster patch 8 12500 2,11818000 1,61065300 (-24.0%) 25 40002,04573400 1,60132200 (-21.7%) 100 10002,13834100 1,69988100 (-20.5%) 500 200 2,73977000 2,48631700 (-9.3%) 250040 5,05067000 5,25914100 (+4.1%) 800012 14,6050780015,93704900 (+9.1%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 39 +++-- target/ppc/helper.h | 4 +-- target/ppc/insn32.decode| 5 target/ppc/translate/vsx-impl.c.inc | 28 +++-- target/ppc/translate/vsx-ops.c.inc | 8 -- 5 files changed, 70 insertions(+), 14 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index ae25f32d6e..960a76a8a5 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ } \ } -VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0) -VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0) VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1) VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1) +#define VSX_TSTDC(tp) \ +static int32_t tp##_tstdc(tp b, uint32_t dcmx) \ +{ \ +uint32_t match = 0; \ +uint32_t sign = tp##_is_neg(b); \ +if (tp##_is_any_nan(b)) { \ +match = extract32(dcmx, 6, 1); \ +} else if 
(tp##_is_infinity(b)) { \ +match = extract32(dcmx, 4 + !sign, 1); \ +} else if (tp##_is_zero(b)) { \ +match = extract32(dcmx, 2 + !sign, 1); \ +} else if (tp##_is_zero_or_denormal(b)) { \ +match = extract32(dcmx, 0 + !sign, 1); \ +} \ +return (match != 0);\ +} + +VSX_TSTDC(float32) +VSX_TSTDC(float64) +#undef VSX_TSTDC + +void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 2; i++) { +t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx); +} +} + +void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 4; i++) { +t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx); +} +} + void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb) { uint32_t dcmx, sign, exp; diff --git a/target/ppc/helper.h b/target/ppc/helper.h index fd8280dfa7..9e5d11939b 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr) -DEF_HELPER_2(xvtstdcsp, void, env, i32) -DEF_HELPER_2(xvtstdcdp, void, env, i32) +DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) +DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) DEF_HELPER_3(xvrspi, void, env, vsr, vsr) DEF_HELPER_3(xvrspic, void, env, vsr, vsr) DEF_HELPER_3(xvrspim, void, env, vsr, vsr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 6549c4040e..c0a531be5c 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -199,6 +199,9 @@ @XX2_uim4 .. . . uim:4 . . .. _uim xt=%xx_xt xb=%xx_xb +%xx_uim76:1 2:1 16:5 +@XX2_uim7 .. . . . . ... . .._uim xt=%xx_xt xb=%xx_xb uim=%xx_uim7 + _bf_xb bf xb @XX2_bf_xb .. bf:3 .. . . . . ._bf_xb xb=%xx_xb @@ -848,6 +851
[PATCH v2 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VABSDUB, VABSDUH and VABSDUW to decodetree and use gvec to translate them. vabsdub: reptloopmaster patch 8 12500 0,03601600 0,00688500 (-80.9%) 25 40000,03651000 0,00532100 (-85.4%) 100 10000,03666900 0,00595300 (-83.8%) 500 200 0,04305800 0,01244600 (-71.1%) 250040 0,06893300 0,04273700 (-38.0%) 800012 0,14633200 0,12660300 (-13.5%) vabsduh: reptloopmaster patch 8 12500 0,02172400 0,00687500 (-68.4%) 25 40000,02154100 0,00531500 (-75.3%) 100 10000,02235400 0,00596300 (-73.3%) 500 200 0,02827500 0,01245100 (-56.0%) 250040 0,05638400 0,04285500 (-24.0%) 800012 0,13166000 0,12641400 (-4.0%) vabsduw: reptloopmaster patch 8 12500 0,01646400 0,00688300 (-58.2%) 25 40000,01454500 0,00475500 (-67.3%) 100 10000,01545800 0,00511800 (-66.9%) 500 200 0,02168200 0,01114300 (-48.6%) 250040 0,04571300 0,04138800 (-9.5%) 800012 0,12209500 0,12178500 (-0.3%) Same as VADDCUW and VSUBCUW, overall performance gain but it uses more TCGop (4 before the patch, 6 after). 
Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 6 ++-- target/ppc/insn32.decode| 6 target/ppc/int_helper.c | 13 +++- target/ppc/translate/vmx-impl.c.inc | 49 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 60 insertions(+), 17 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 71c22efc2e..fd8280dfa7 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) -DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 38458c01de..ae151c4b62 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -528,6 +528,12 @@ VAVGUB 000100 . . . 110@VX VAVGUH 000100 . . . 1000110@VX VAVGUW 000100 . . . 1001010@VX +## Vector Integer Absolute Difference Instructions + +VABSDUB 000100 . . . 111@VX +VABSDUH 000100 . . . 1000111@VX +VABSDUW 000100 . . . 1001011@VX + ## Vector Bit Manipulation Instruction VGNB000100 . -- ... . 
10011001100 @VX_n diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index bda76e54d4..d97a7f1f28 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -589,8 +589,8 @@ VAVG(VAVGSW, s32, int64_t) VAVG(VAVGUW, u32, uint64_t) #undef VAVG -#define VABSDU_DO(name, element)\ -void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ +#define VABSDU(name, element) \ +void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\ { \ int i; \ \ @@ -606,12 +606,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ * name- instruction mnemonic suffix (b: byte, h: halfword, w: word) * element - element type to access from vector */ -#define VABSDU(type, element) \ -VABSDU_DO(absdu##type, element) -VABSDU(b, u8) -VABSDU(h, u16) -VABSDU(w, u32) -#undef VABSDU_DO +VABSDU(VABSDUB, u8) +VABSDU(VABSDUH, u16) +VABSDU(VABSDUW, u32) #undef VABSDU #define VCF(suffix, cvt, element) \ diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 1e3e099739..f46a354d31 100644 --- a/
[PATCH v2 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH, VAVGSW, to decodetree and use gvec with them. For these one the right shift had to be made before the sum as to avoid an overflow, so add 1 at the end if any of the entries had 1 in its LSB as to replicate the "+ 1" before the shift described by the ISA. vavgub: reptloopmaster patch 8 12500 0,02616600 0,00754200 (-71.2%) 25 40000,0253 0,00637700 (-74.8%) 100 10000,02604600 0,00790100 (-69.7%) 500 200 0,03189300 0,01838400 (-42.4%) 250040 0,06006900 0,06851000 (+14.1%) 800012 0,13941000 0,20548500 (+47.4%) vavguh: reptloopmaster patch 8 12500 0,01818200 0,00780600 (-57.1%) 25 40000,01789300 0,00641600 (-64.1%) 100 10000,01899100 0,00787200 (-58.5%) 500 200 0,02527200 0,01828400 (-27.7%) 250040 0,05361800 0,06773000 (+26.3%) 800012 0,12886600 0,20291400 (+57.5%) vavguw: reptloopmaster patch 8 12500 0,01423100 0,00776600 (-45.4%) 25 40000,01780800 0,00638600 (-64.1%) 100 10000,02085500 0,00787000 (-62.3%) 500 200 0,02737100 0,01828800 (-33.2%) 250040 0,05572600 0,06774200 (+21.6%) 800012 0,13101700 0,20311600 (+55.0%) vavgsb: reptloopmaster patch 8 12500 0,03006000 0,00788600 (-73.8%) 25 40000,02882200 0,00637800 (-77.9%) 100 10000,02958000 0,00791400 (-73.2%) 500 200 0,03548800 0,01860400 (-47.6%) 250040 0,0636 0,06850800 (+7.7%) 800012 0,13816500 0,20550300 (+48.7%) vavgsh: reptloopmaster patch 8 12500 0,01965900 0,00776600 (-60.5%) 25 40000,01875400 0,00638700 (-65.9%) 100 10000,01952200 0,00786900 (-59.7%) 500 200 0,02562000 0,01760300 (-31.3%) 250040 0,05384300 0,06742800 (+25.2%) 800012 0,13240800 0,2033 (+53.5%) vavgsw: reptloopmaster patch 8 12500 0,01407700 0,00775600 (-44.9%) 25 40000,01762300 0,0064 (-63.7%) 100 10000,02046500 0,00788500 (-61.5%) 500 200 0,02745600 0,01843000 (-32.9%) 250040 0,05375500 0,06820500 (+26.9%) 800012 0,13068300 0,20304900 (+55.4%) These results to me seems to indicate that with gvec the results have a slower translation but 
faster execution. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 12 ++-- target/ppc/insn32.decode| 9 +++ target/ppc/int_helper.c | 32 - target/ppc/translate/vmx-impl.c.inc | 106 target/ppc/translate/vmx-ops.c.inc | 9 +-- 5 files changed, 127 insertions(+), 41 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index a06193bc67..71c22efc2e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) #define dh_ctype_acc ppc_acc_t * #define dh_typecode_acc dh_typecode_ptr -DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aa4968e6b9..38458c01de 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 
011111 @VC VCMPSQ 000100 ... -- . . 0010101 @VX_bf VCMPUQ 000100 ... -- . .
[PATCH v2 00/12] VMX/VSX instructions with gvec
From: "Lucas Mateus Castro (alqotel)" Patches missing review: 3,5,9,11,12 v1 -> v2: - Implemented instructions with fni4/fni8 and dropped the helper: * VSUBCUW * VADDCUW * VPRTYBW * VPRTYBD - Reworked patch12 to only use gvec implementation with a few immediates. - Used bitsel_ver on patch9 - Changed vec variables to tcg_constant_vec when possible This patch series moves some instructions from decode legacy to decodetree and translate said instructions with gvec. Some cases using gvec ended up with a bigger, more complex and slower implementation, so those instructions were only moved to decodetree. In each patch there's a comparison of the execution time before the patch being applied and after. Said result is the sum of 10 executions. The program used to time the execution worked like this: clock_t start = clock(); for (int i = 0; i < LOOP; i++) { asm ( load values in registers, between 2 and 3 instructions ".rept REPT\n\t" "INSTRUCTION registers\n\t" ".endr\n\t" save result from register, 1 instruction ); } clock_t end = clock(); printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n", ((double)(end - start))/ CLOCKS_PER_SEC); Where the column rept is the value used in .rept in the inline assembly and loop column is the value used for the for loop. All of those tests were executed on a Power9. When comparing the TCGop the data used was gathered using '-d op' and '-d op_opt'. 
Lucas Mateus Castro (alqotel) (12): target/ppc: Moved VMLADDUHM to decodetree and use gvec target/ppc: Move VMH[R]ADDSHS instruction to decodetree target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec target/ppc: Move VNEG[WD] to decodtree and use gvec target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec target/ppc: Move VABSDU[BHW] to decodetree and use gvec target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P target/ppc: Use gvec to decode XVCPSGN[SD]P target/ppc: Moved XVTSTDC[DS]P to decodetree target/ppc: Moved XSTSTDC[QDS]P to decodetree target/ppc: Use gvec to decode XVTSTDC[DS]P target/ppc/fpu_helper.c | 140 +- target/ppc/helper.h | 42 ++- target/ppc/insn32.decode| 50 target/ppc/int_helper.c | 107 ++-- target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 364 + target/ppc/translate/vmx-ops.c.inc | 15 +- target/ppc/translate/vsx-impl.c.inc | 394 +++- target/ppc/translate/vsx-ops.c.inc | 21 -- 9 files changed, 808 insertions(+), 326 deletions(-) -- 2.37.3
[PATCH v2 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VNEGW and VNEGD to decodetree and used gvec to decode it. vnegw: reptloopmaster patch 8 12500 0,01053200 0,00548400 (-47.9%) 25 40000,01030500 0,0039 (-62.2%) 100 10000,01096300 0,00395400 (-63.9%) 500 200 0,01472000 0,00712300 (-51.6%) 250040 0,03809000 0,02147700 (-43.6%) 800012 0,09957100 0,06202100 (-37.7%) vnegd: reptloopmaster patch 8 12500 0,00594600 0,00543800 (-8.5%) 25 40000,00575200 0,00396400 (-31.1%) 100 10000,00676100 0,00394800 (-41.6%) 500 200 0,01149300 0,00709400 (-38.3%) 250040 0,03441500 0,02169600 (-37.0%) 800012 0,09516900 0,06337000 (-33.4%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 -- target/ppc/insn32.decode| 3 +++ target/ppc/int_helper.c | 12 target/ppc/translate/vmx-impl.c.inc | 15 +-- target/ppc/translate/vmx-ops.c.inc | 2 -- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f7047ed2aa..b2e910b089 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -229,8 +229,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr) -DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aebc7b73c8..2658dd3395 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010 @VX_tb VEXTSW2D000100 . 11010 . 1100010@VX_tb VEXTSD2Q000100 . 11011 . 1100010@VX_tb +VNEGD 000100 . 00111 . 1100010@VX_tb +VNEGW 000100 . 
00110 . 1100010@VX_tb + ## Vector Mask Manipulation Instructions MTVSRBM 000100 . 1 . 1100110@VX_tb diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index f8dd12e8ae..c7fd0d1faa 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1928,18 +1928,6 @@ XXBLEND(W, 32) XXBLEND(D, 64) #undef XXBLEND -#define VNEG(name, element) \ -void helper_##name(ppc_avr_t *r, ppc_avr_t *b) \ -{ \ -int i; \ -for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ -r->element[i] = -b->element[i]; \ -} \ -} -VNEG(vnegw, s32) -VNEG(vnegd, s64) -#undef VNEG - void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int sh = (b->VsrB(0xf) >> 3) & 0xf; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index f52485a5f1..b9a9e83ab3 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) GEN_VXFORM_TRANS(vclzw, 1, 30) GEN_VXFORM_TRANS(vclzd, 1, 31) -GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) -GEN_VXFORM_NOA_2(vnegd, 1, 24, 7) + +static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA300); +REQUIRE_VECTOR(ctx); + +tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb), + 16, 16); +return true; +} + +TRANS(VNEGW, do_vneg, MO_32) +TRANS(VNEGD, do_vneg, MO_64) static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s) { diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index ded0234123..27908533dd 100644 --- a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10), GEN_VXFORM(vspltisb, 6, 12), GEN_VXFORM(vspltish, 6, 13), GEN_VXFORM(vspltisw, 6, 14), -GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06), -GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07), GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C), GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D), 
GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E), -- 2.37.3
[PATCH v2 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P
From: "Lucas Mateus Castro (alqotel)" Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate them. xvcpsgnsp: reptloopmaster patch 8 12500 0,00561400 0,00537900 (-4.2%) 25 40000,00562100 0,0040 (-28.8%) 100 10000,00696900 0,00416300 (-40.3%) 500 200 0,02211900 0,00840700 (-62.0%) 250040 0,09328600 0,02728300 (-70.8%) 800012 0,27295300 0,06867800 (-74.8%) xvcpsgndp: reptloopmaster patch 8 12500 0,00556300 0,00584200 (+5.0%) 25 40000,00482700 0,00431700 (-10.6%) 100 10000,00585800 0,00464400 (-20.7%) 500 200 0,01565300 0,00839700 (-46.4%) 250040 0,05766500 0,02430600 (-57.8%) 800012 0,19875300 0,07947100 (-60.0%) Like the previous instructions there seemed to be a improvement on translation time. Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/insn32.decode| 2 + target/ppc/translate/vsx-impl.c.inc | 109 ++-- target/ppc/translate/vsx-ops.c.inc | 3 - 3 files changed, 55 insertions(+), 59 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 5b687078be..6549c4040e 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 .. @XX2 XVNABSSP00 . 0 . 110101001 .. @XX2 XVNEGDP 00 . 0 . 11001 .. @XX2 XVNEGSP 00 . 0 . 110111001 .. @XX2 +XVCPSGNDP 00 . . . ... @XX3 +XVCPSGNSP 00 . . . 1101 ... 
@XX3 ## VSX Scalar Multiply-Add Instructions diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 3f9af811dc..4f17da514c 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP) -#define VSX_VECTOR_MOVE(name, op, sgn_mask) \ -static void glue(gen_, name)(DisasContext *ctx) \ -{\ -TCGv_i64 xbh, xbl, sgm; \ -if (unlikely(!ctx->vsx_enabled)) { \ -gen_exception(ctx, POWERPC_EXCP_VSXU); \ -return; \ -}\ -xbh = tcg_temp_new_i64();\ -xbl = tcg_temp_new_i64();\ -sgm = tcg_temp_new_i64();\ -get_cpu_vsr(xbh, xB(ctx->opcode), true); \ -get_cpu_vsr(xbl, xB(ctx->opcode), false);\ -tcg_gen_movi_i64(sgm, sgn_mask); \ -switch (op) {\ -case OP_ABS: { \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NABS: { \ -tcg_gen_or_i64(xbh, xbh, sgm); \ -tcg_gen_or_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NEG: { \ -tcg_gen_xor_i64(xbh, xbh, sgm); \ -tcg_gen_xor_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_CPSGN: { \ -TCGv_i64 xah = tcg_temp_new_i64(); \ -TCGv_i64 xal = tcg_temp_new_i64(); \ -get_cpu_vsr(xah, xA(ctx->opcode), true); \ -get_cpu_vsr(xal, xA(ctx->opcode), false);\ -tcg_gen_and_i64(xah, xah, sgm); \ -tcg_gen_and_i64(xal, xal, sgm); \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -tcg_gen_or_i64(xbh, xbh, xah); \ -tcg
[PATCH v2 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
From: "Lucas Mateus Castro (alqotel)" Moved XVABSSP, XVABSDP, XVNABSSP,XVNABSDP, XVNEGSP and XVNEGDP to decodetree and used gvec to translate them. xvabssp: reptloopmaster patch 8 12500 0,00477900 0,00476000 (-0.4%) 25 40000,00442800 0,00353300 (-20.2%) 100 10000,00478700 0,00366100 (-23.5%) 500 200 0,00973200 0,00649400 (-33.3%) 250040 0,03165200 0,02226700 (-29.7%) 800012 0,09315900 0,06674900 (-28.3%) xvabsdp: reptloopmaster patch 8 12500 0,00475000 0,00474400 (-0.1%) 25 40000,00355600 0,00367500 (+3.3%) 100 10000,00444200 0,00366000 (-17.6%) 500 200 0,00942700 0,00732400 (-22.3%) 250040 0,0299 0,02308500 (-22.8%) 800012 0,08770300 0,06683800 (-23.8%) xvnabssp: reptloopmaster patch 8 12500 0,00494500 0,00492900 (-0.3%) 25 40000,00397700 0,00338600 (-14.9%) 100 10000,00421400 0,00353500 (-16.1%) 500 200 0,01048000 0,00707100 (-32.5%) 250040 0,03251500 0,02238300 (-31.2%) 800012 0,08889100 0,06469800 (-27.2%) xvnabsdp: reptloopmaster patch 8 12500 0,00511000 0,00492700 (-3.6%) 25 40000,00398800 0,00381500 (-4.3%) 100 10000,00390500 0,00365900 (-6.3%) 500 200 0,00924800 0,00784600 (-15.2%) 250040 0,03138900 0,02391600 (-23.8%) 800012 0,09654200 0,05684600 (-41.1%) xvnegsp: reptloopmaster patch 8 12500 0,00493900 0,00452800 (-8.3%) 25 40000,00369100 0,00366800 (-0.6%) 100 10000,00371100 0,0038 (+2.4%) 500 200 0,00991100 0,00652300 (-34.2%) 250040 0,03025800 0,02422300 (-19.9%) 800012 0,09251100 0,06457600 (-30.2%) xvnegdp: reptloopmaster patch 8 12500 0,00474900 0,00454400 (-4.3%) 25 40000,00353100 0,00325600 (-7.8%) 100 10000,00398600 0,00366800 (-8.0%) 500 200 0,01032300 0,00702400 (-32.0%) 250040 0,03125000 0,02422400 (-22.5%) 800012 0,09475100 0,06173000 (-34.9%) This one to me seemed the opposite of the previous instructions, as it looks like there was an improvement in the translation time (itself not a surprise as operations were done twice before so there was the need to translate twice as many TCGop) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: 
Richard Henderson --- target/ppc/insn32.decode| 9 target/ppc/translate/vsx-impl.c.inc | 73 ++--- target/ppc/translate/vsx-ops.c.inc | 6 --- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index ae151c4b62..5b687078be 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 . @X_TSX STXVRWX 01 . . . 0011001101 . @X_TSX STXVRDX 01 . . . 0011101101 . @X_TSX +## VSX Vector Binary Floating-Point Sign Manipulation Instructions + +XVABSDP 00 . 0 . 111011001 .. @XX2 +XVABSSP 00 . 0 . 110011001 .. @XX2 +XVNABSDP00 . 0 . 01001 .. @XX2 +XVNABSSP00 . 0 . 110101001 .. @XX2 +XVNEGDP 00 . 0 . 11001 .. @XX2 +XVNEGSP 00 . 0 . 110111001 .. @XX2 + ## VSX Scalar Multiply-Add Instructions XSMADDADP 00 . . . 0011 . . . @XX3 diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 7acdbceec4..3f9af811dc 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -782,15 +782,76 @@ static void glue(gen_, name)(DisasContext *ctx) \ tcg_temp_free_i64(sgm); \ } -VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP) VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP) VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP) +#define TCG_OP_IMM_i64(FUNC, OP, IMM) \ +static void FUNC(TCGv_i64 t, TCGv_i64 b)\ +{ \ +OP(t, b, IMM); \ +} + +TCG_OP_IMM_i64(do_xvabssp_i64, tcg_ge
[PATCH v2 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VPRTYBW and VPRTYBD to use gvec and both of them and VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and .fni8, respectively. vprtybw: reptloopmaster patch 8 12500 0,00991200 0,00626300 (-36.8%) 25 40000,01040600 0,00550600 (-47.1%) 100 10000,01084500 0,00601100 (-44.6%) 500 200 0,01490600 0,01394100 (-6.5%) 250040 0,03285100 0,05143000 (+56.6%) 800012 0,08971500 0,14662500 (+63.4%) vprtybd: reptloopmaster patch 8 12500 0,00665800 0,00652800 (-2.0%) 25 40000,00589300 0,00670400 (+13.8%) 100 10000,00646800 0,00743900 (+15.0%) 500 200 0,01065800 0,01586400 (+48.8%) 250040 0,03497000 0,07180100 (+105.3%) 800012 0,09242200 0,21566600 (+133.3%) vprtybq: reptloopmaster patch 8 12500 0,00656200 0,00665800 (+1.5%) 25 40000,00620500 0,00644900 (+3.9%) 100 10000,00707500 0,00764900 (+8.1%) 500 200 0,01203500 0,01349500 (+12.1%) 250040 0,03505700 0,04123100 (+17.6%) 800012 0,09590600 0,11586700 (+20.8%) I wasn't expecting such a performance lost in both VPRTYBD and VPRTYBQ, I'm not sure if it's worth to move those instructions. Comparing the assembly of the helper with the TCGop they are pretty similar, so I'm not sure why vprtybd took so much more time. 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 4 +- target/ppc/insn32.decode| 4 ++ target/ppc/int_helper.c | 25 + target/ppc/translate/vmx-impl.c.inc | 80 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 83 insertions(+), 33 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index b2e910b089..a06193bc67 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) +DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 2658dd3395..aa4968e6b9 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -529,6 +529,10 @@ VCTZDM 000100 . . . 1000100@VX VPDEPD 000100 . . . 10111001101@VX VPEXTD 000100 . . . 10110001101@VX +VPRTYBD 000100 . 01001 . 1100010@VX_tb +VPRTYBQ 000100 . 01010 . 1100010@VX_tb +VPRTYBW 000100 . 01000 . 1100010@VX_tb + ## Vector Permute and Formatting Instruction VEXTDUBVLX 000100 . . . . 
011000 @VA diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index c7fd0d1faa..c6ce4665fa 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -/* vprtybw */ -void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) -{ -int i; -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -uint64_t res = b->u32[i] ^ (b->u32[i] >> 16); -res ^= res >> 8; -r->u32[i] = res & 1; -} -} - -/* vprtybd */ -void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) -{ -int i; -for (i = 0; i < ARRAY_SIZE(r->u64); i++) { -uint64_t res = b->u64[i] ^ (b->u64[i] >> 32); -res ^= res >> 16; -res ^= res >> 8; -r->u64[i] = res & 1; -} -} - /* vprtybq */ -void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { uint64_t res = b->u64[0] ^ b->u64[1]; res ^= res >> 32; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index b9a9e83ab3..23601942bc 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); GEN_VXFORM_NOA_ENV(vrfin, 5, 8); GEN_VXFORM_NOA_ENV(vrfip, 5, 10); GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); -GEN_VXFORM_NOA(vprt
[PATCH v2 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VADDCUW and VSUBCUW to decodtree with gvec using an implementation based on the helper, with the main difference being changing the -1 (aka all bits set to 1) result returned by cmp when true to +1. It also implemented a .fni4 version of those instructions and dropped the helper. vaddcuw: reptloopmaster patch 8 12500 0,01008200 0,00612400 (-39.3%) 25 40000,01091500 0,00471600 (-56.8%) 100 10000,01332500 0,00593700 (-55.4%) 500 200 0,01998500 0,01275700 (-36.2%) 250040 0,04704300 0,04364300 (-7.2%) 800012 0,10748200 0,11241000 (+4.6%) vsubcuw: reptloopmaster patch 8 12500 0,01226200 0,00571600 (-53.4%) 25 40000,01493500 0,00462100 (-69.1%) 100 10000,01522700 0,00455100 (-70.1%) 500 200 0,02384600 0,01133500 (-52.5%) 250040 0,04935200 0,03178100 (-35.6%) 800012 0,09039900 0,09440600 (+4.4%) Overall there was a gain in performance, but the TCGop code was still slightly bigger in the new version (it went from 4 to 5). Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 2 - target/ppc/insn32.decode| 2 + target/ppc/int_helper.c | 18 - target/ppc/translate/vmx-impl.c.inc | 61 +++-- target/ppc/translate/vmx-ops.c.inc | 3 +- 5 files changed, 60 insertions(+), 26 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f02a9497b7..f7047ed2aa 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,11 +193,9 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr) 
DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 9a509e84df..aebc7b73c8 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -608,12 +608,14 @@ VRLQNM 000100 . . . 00101000101 @VX ## Vector Integer Arithmetic Instructions +VADDCUW 000100 . . . 0011000@VX VADDCUQ 000100 . . . 0010100@VX VADDUQM 000100 . . . 001@VX VADDEUQM000100 . . . . 00 @VA VADDECUQ000100 . . . . 01 @VA +VSUBCUW 000100 . . . 1011000@VX VSUBCUQ 000100 . . . 1010100@VX VSUBUQM 000100 . . . 101@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index ae1ba8084d..f8dd12e8ae 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,15 +492,6 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -{ -int i; - -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -r->u32[i] = ~a->u32[i] < b->u32[i]; -} -} - /* vprtybw */ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) { @@ -1962,15 +1953,6 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #endif } -void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -{ -int i; - -for (i = 0; i < ARRAY_SIZE(r->u32); i++) { -r->u32[i] = a->u32[i] >= b->u32[i]; -} -} - void helper_vsumsws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int64_t t; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 3acd585a2f..f52485a5f1 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28); GEN_VXFORM(vslv, 2, 29); GEN_VXFORM(vslo, 6, 16); GEN_VXFORM(vsro, 6, 17); -GEN_VXFORM(vaddcuw, 0, 6); -GEN_VXFORM(vsubcuw, 0, 22); static bool 
do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece, void (*gen_gvec)(unsigned, uint32_t, uint32_t, @@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx) } -GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \ -xpnd04_1, PPC_NONE, P
[PATCH v2 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree
From: "Lucas Mateus Castro (alqotel)" This patch moves VMHADDSHS and VMHRADDSHS to decodetree I couldn't find a satisfactory implementation with TCG inline. vmhaddshs: reptloopmaster patch 8 12500 0,02983400 0,02648500 (-11.2%) 25 40000,02946000 0,02518000 (-14.5%) 100 10000,03104300 0,02638000 (-15.0%) 500 200 0,04002000 0,03502500 (-12.5%) 250040 0,08090100 0,07562200 (-6.5%) 800012 0,19242600 0,18626800 (-3.2%) vmhraddshs: reptloopmaster patch 8 12500 0,03078600 0,02851000 (-7.4%) 25 40000,02793200 0,02746900 (-1.7%) 100 10000,02886000 0,02839900 (-1.6%) 500 200 0,03714700 0,03799200 (+2.3%) 250040 0,07948000 0,07852200 (-1.2%) 800012 0,19049800 0,18813900 (-1.2%) Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 4 ++-- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 4 ++-- target/ppc/translate/vmx-impl.c.inc | 5 +++-- target/ppc/translate/vmx-ops.c.inc | 1 - 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 9c562ab00e..f02a9497b7 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr) DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr) DEF_HELPER_4(vpkudum, void, env, avr, avr, avr) DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr) -DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 7445455a12..9a509e84df 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 
010111 @VA VMSUMUDM000100 . . . . 100011 @VA VMLADDUHM 000100 . . . . 100010 @VA +VMHADDSHS 000100 . . . . 10 @VA +VMHRADDSHS 000100 . . . . 11 @VA ## Vector String Instructions diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 0d25000b2a..ae1ba8084d 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r) return count; } -void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; @@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 9f18c6d4f2..3acd585a2f 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ tcg_temp_free_ptr(rd); \ } -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) +GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) static bool do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) @@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a, TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS) TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS) -GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) +TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS) +TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS) GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index a3a0fd0650..7cd9d40e06 100644 --- 
a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15), #define GEN_VAFORM_PAIRED(name0, name1, opc2) \ GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC) -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16), GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23), GEN_VXFORM_DUAL(vclzb, vpopc
[PATCH v2 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VMLADDUHM to decodetree a creates a gvec implementation using mul_vec and add_vec. reptloopmaster patch 8 12500 0,01810500 0,00903100 (-50.1%) 25 40000,01739400 0,00747700 (-57.0%) 100 10000,01843600 0,00901400 (-51.1%) 500 200 0,02574600 0,01971000 (-23.4%) 250040 0,05921600 0,07121800 (+20.3%) 800012 0,15326700 0,21725200 (+41.7%) The significant difference in performance when REPT is low and LOOP is high I think is due to the fact that the new implementation has a higher translation time, as when using a helper only 5 TCGop are used but with the patch a total of 10 TCGop are needed (Power lacks a direct mul_vec equivalent so this instruction is implemented with the help of 5 others, vmuleu, vmulou, vmrgh, vmrgl and vpkum). Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 +- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 3 +- target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 48 ++--- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 57eee07256..9c562ab00e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr) -DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) +DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env) DEF_HELPER_3(lvebx, void, env, avr, tl) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index a5249ee32c..7445455a12 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -693,6 +693,8 @@ 
VMSUMUHS000100 . . . . 100111 @VA VMSUMCUD000100 . . . . 010111 @VA VMSUMUDM000100 . . . . 100011 @VA +VMLADDUHM 000100 . . . . 100010 @VA + ## Vector String Instructions VSTRIBL 000100 . 0 . . 001101 @VX_tb_rc diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 696096100b..0d25000b2a 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) +void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c, + uint32_t v) { int i; diff --git a/target/ppc/translate.c b/target/ppc/translate.c index e810842925..11f729c60c 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x0001, PPC_ALTIVEC), GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x0001, PPC_ALTIVEC), GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC), GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC), -GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC), #if defined(TARGET_PPC64) GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE, PPC2_ISA300), diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index e644ad3236..9f18c6d4f2 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) -static void gen_vmladduhm(DisasContext *ctx) -{ -TCGv_ptr ra, rb, rc, rd; -if (unlikely(!ctx->altivec_enabled)) { -gen_exception(ctx, POWERPC_EXCP_VPU); -return; -} -ra = gen_avr_ptr(rA(ctx->opcode)); -rb = gen_avr_ptr(rB(ctx->opcode)); -rc = gen_avr_ptr(rC(ctx->opcode)); -rd = gen_avr_ptr(rD(ctx->opcode)); -gen_helper_vmladduhm(rd, ra, rb, rc); -tcg_temp_free_ptr(ra); -tcg_temp_free_ptr(rb); -tcg_temp_free_ptr(rc); 
-tcg_temp_free_ptr(rd); -} - static bool do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) { @@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, gen_helper_VSUBECUQ) TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM) TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR) +static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b, +
[PATCH 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P
From: "Lucas Mateus Castro (alqotel)" Used gvec to translate XVTSTDCSP and XVTSTDCDP. xvtstdcsp: reptlooppatch10 patch12 8 12500 2,70288900 1,24050300 (-54.1%) 25 40002,65665700 1,14078900 (-57.1%) 100 10002,82795400 1,53337200 (-45.8%) 500 200 3,62225400 3,91718000 (+8.1%) 250040 6,45658000 12,60683700 (+95.3%) 800012 17,48091900 44,15384000 (+152.6%) xvtstdcdp: reptlooppatch10 patch12 8 125001,56435900 1,24554800 (-20.4%) 25 4000 1,53789500 1,14177800 (-25.8%) 100 1000 1,67964600 1,5428 (-8.1%) 500 200 2,46777100 3,96816000 (+60.8%) 250040 5,2193890012,79937800 (+145.2%) 800012 15,9760050045,44233000 (+184.4%) Overall these instructions are the hardest ones to measure performance as the helper implementation is affected by the immediate. So for example in a worst case scenario (high REPT, LOOP = 1, immediate 127) it took 13x longer with the gvec implementation, and in a best case scenario (low REPT, high LOOP, only 1 bit set in the immediate) the execution took 21.8% of the time with gvec (-78.2%). The tests here are the sum of every possible immediate. Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/translate/vsx-impl.c.inc | 73 - 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index c3c179723b..dc95e8fdf4 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -1121,16 +1121,85 @@ GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, PPC2_ISA300) GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300) GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX) +static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm) +{ +TCGv_vec match = tcg_const_ones_vec_matching(t); +TCGv_vec temp; +TCGv_vec mask; +uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP; +uint64_t sgn_msk = (vece == MO_32) ? 
(uint32_t)SGN_MASK_SP : SGN_MASK_DP; +uint64_t frc_msk = ~(exp_msk | sgn_msk); +mask = tcg_constant_vec_matching(t, vece, 0); +tcg_gen_mov_vec(t, mask); +if (imm & (0x3 << 0)) { +/* test if Denormal */ +temp = tcg_temp_new_vec_matching(t); +mask = tcg_constant_vec_matching(t, vece, ~sgn_msk); +tcg_gen_and_vec(vece, t, b, mask); +mask = tcg_constant_vec_matching(t, vece, frc_msk); +tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask); +mask = tcg_constant_vec_matching(t, vece, 0); +tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask); + +tcg_gen_mov_vec(t, mask); +mask = tcg_constant_vec_matching(t, vece, sgn_msk); +if (imm & (0x1)) { +/* test if negative */ +tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t); +} +if (imm & (0x2)) { +/* test if positive */ +tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t); +} +tcg_temp_free_vec(temp); +} +if (imm & (1 << 2)) { +/* test if -0 */ +mask = tcg_constant_vec_matching(t, vece, sgn_msk); +tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t); +} +if (imm & (1 << 3)) { +/* test if +0 */ +mask = tcg_constant_vec_matching(t, vece, 0); +tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t); +} +if (imm & (1 << 4)) { +/* test if -Inf */ +mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk); +tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t); +} +if (imm & (1 << 5)) { +/* test if +Inf */ +mask = tcg_constant_vec_matching(t, vece, exp_msk); +tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t); +} +if (imm & (1 << 6)) { +/* test if NaN */ +mask = tcg_constant_vec_matching(t, vece, ~sgn_msk); +tcg_gen_and_vec(vece, b, b, mask); +mask = tcg_constant_vec_matching(t, vece, exp_msk); +tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t); +} +tcg_temp_free_vec(match); +} + static bool do_xvtstdc(DisasContext *ctx, arg_XX2_uim *a, unsigned vece) { +static const TCGOpcode vecop_list[] = { +INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0 +}; static const GVecGen2i op[] = { { 
.fnoi = gen_helper_XVTSTDCSP, -.vece = MO_32 +.fniv = do_xvtstdc_vec, +.vece = MO_32, +.opt_opc = vecop_list }, { .fnoi = gen_helper_XVTSTDCDP, -.vece = MO_64 +.fniv =
[PATCH 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved some of its decoding away from the helper as previously the DCMX, XB and BF were calculated in the helper with the help of cpu_env, now that part was moved to the decodetree with the rest. xvtstdcsp: reptloopmaster patch 8 12500 1,85393600 1,94683600 (+5.0%) 25 40001,78779800 1,92479000 (+7.7%) 100 10002,12775000 2,28895500 (+7.6%) 500 200 2,99655300 3,23102900 (+7.8%) 250040 6,89082200 7,44827500 (+8.1%) 800012 17,5058550018,95152100 (+8.3%) xvtstdcdp: reptloopmaster patch 8 12500 1,39043100 1,33539800 (-4.0%) 25 40001,35731800 1,37347800 (+1.2%) 100 10001,51514800 1,56053000 (+3.0%) 500 200 2,21014400 2,47906000 (+12.2%) 250040 5,39488200 6,68766700 (+24.0%) 800012 13,9862390018,17661900 (+30.0%) xvtstdcdp: reptloopmaster patch 8 12500 1,35123800 1,34455800 (-0.5%) 25 40001,36441200 1,36759600 (+0.2%) 100 10001,49763500 1,54138400 (+2.9%) 500 200 2,19020200 2,46196400 (+12.4%) 250040 5,39265700 6,68147900 (+23.9%) 800012 14,0416360018,19669600 (+29.6%) As some values are now decoded outside the helper and passed to it as an argument the number of arguments of the helper increased, the number of TCGop needed to load the arguments increased. I suspect that's why the slow-down in the tests with a high REPT but low LOOP. 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/fpu_helper.c | 114 +--- target/ppc/helper.h | 6 +- target/ppc/insn32.decode| 6 ++ target/ppc/translate/vsx-impl.c.inc | 20 - target/ppc/translate/vsx-ops.c.inc | 4 - 5 files changed, 60 insertions(+), 90 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 35ca03b10b..b385f24908 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb) *xt = t; } -/* - * VSX_TEST_DC - VSX floating point test data class - * op- instruction mnemonic - * nels - number of elements (1, 2 or 4) - * xbn - VSR register number - * tp- type (float32 or float64) - * fld - vsr_t field (VsrD(*) or VsrW(*)) - * tfld - target vsr_t field (VsrD(*) or VsrW(*)) - * fld_max - target field max - * scrf - set result in CR and FPCC - */ -#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf) \ -void helper_##op(CPUPPCState *env, uint32_t opcode) \ -{ \ -ppc_vsr_t *xt = >vsr[xT(opcode)]; \ -ppc_vsr_t *xb = >vsr[xbn]; \ -ppc_vsr_t t = { }; \ -uint32_t i, sign, dcmx; \ -uint32_t cc, match = 0; \ -\ -if (!scrf) {\ -dcmx = DCMX_XV(opcode); \ -} else {\ -t = *xt;\ -dcmx = DCMX(opcode);\ -} \ -\ -for (i = 0; i < nels; i++) {\ -sign = tp##_is_neg(xb->fld);\ -if (tp##_is_any_nan(xb->fld)) { \ -match = extract32(dcmx, 6, 1); \ -} else if (tp##_is_infinity(xb->fld)) { \ -match = extract32(dcmx, 4 + !sign, 1); \ -} else if (tp##_is_zero(xb->fld)) { \ -match = extract32(dcmx, 2 + !sign, 1); \ -} else if (tp##_is_zero_or_denormal(xb->fld)) { \ -match = extract32(dcmx, 0 + !sign, 1); \ -} \ -\ -if (scrf) { \ -cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT; \ -env->fpscr &= ~FP_FPCC; \ -env->fpscr |= cc << FPSCR_FPCC; \ -env->crf[BF(opcode)] = cc; \ -} else {\ -t.tfld = match ? fld_max : 0; \ -}
[PATCH 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree
From: "Lucas Mateus Castro (alqotel)" Moved XVTSTDCSP and XVTSTDCDP to decodetree an restructured the helper to be simpler and do all decoding in the decodetree (so XB, XT and DCMX are all calculated outside the helper). Obs: The tests in this one are slightly different, these are the sum of these instructions with all possible immediate and those instructions are repeated 10 times. xvtstdcsp: reptloopmaster patch 8 12500 2,76402100 2,70699100 (-2.1%) 25 40002,64867100 2,67884100 (+1.1%) 100 10002,73806300 2,78701000 (+1.8%) 500 200 3,44666500 3,61027600 (+4.7%) 250040 5,85790200 6,47475500 (+10.5%) 800012 15,2210210017,46062900 (+14.7%) xvtstdcdp: reptloopmaster patch 8 12500 2,11818000 1,61065300 (-24.0%) 25 40002,04573400 1,60132200 (-21.7%) 100 10002,13834100 1,69988100 (-20.5%) 500 200 2,73977000 2,48631700 (-9.3%) 250040 5,05067000 5,25914100 (+4.1%) 800012 14,6050780015,93704900 (+9.1%) Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/fpu_helper.c | 39 +++-- target/ppc/helper.h | 4 +-- target/ppc/insn32.decode| 5 target/ppc/translate/vsx-impl.c.inc | 28 +++-- target/ppc/translate/vsx-ops.c.inc | 8 -- 5 files changed, 70 insertions(+), 14 deletions(-) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index ae25f32d6e..35ca03b10b 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ } \ } -VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0) -VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0) VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1) VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1) +#define VSX_TSTDC(tp) \ +static int32_t tp##_tstdc(tp b, uint32_t dcmx) \ +{ \ +uint32_t match = 0; \ +uint32_t sign = tp##_is_neg(b); \ +if (tp##_is_any_nan(b)) { \ +match = extract32(dcmx, 6, 1); \ +} else if (tp##_is_infinity(b)) { \ +match = 
extract32(dcmx, 4 + !sign, 1); \ +} else if (tp##_is_zero(b)) { \ +match = extract32(dcmx, 2 + !sign, 1); \ +} else if (tp##_is_zero_or_denormal(b)) { \ +match = extract32(dcmx, 0 + !sign, 1); \ +} \ +return (match != 0) ? 1 : 0;\ +} + +VSX_TSTDC(float32) +VSX_TSTDC(float64) +#undef VSX_TSTDC + +void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 2; i++) { +t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx); +} +} + +void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v) +{ +int i; +for (i = 0; i < 4; i++) { +t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx); +} +} + void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb) { uint32_t dcmx, sign, exp; diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 1f470a0e5e..d3e3324c73 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -521,8 +521,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr) DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr) -DEF_HELPER_2(xvtstdcsp, void, env, i32) -DEF_HELPER_2(xvtstdcdp, void, env, i32) +DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) +DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32) DEF_HELPER_3(xvrspi, void, env, vsr, vsr) DEF_HELPER_3(xvrspic, void, env, vsr, vsr) DEF_HELPER_3(xvrspim, void, env, vsr, vsr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 6549c4040e..c0a531be5c 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -199,6 +199,9 @@ @XX2_uim4 .. . . uim:4 . . .. _uim xt=%xx_xt xb=%xx_xb +%xx_uim76:1 2:1 16:5 +@XX2_uim7 .. . . . . ... . .._uim xt=%xx_xt xb=%xx_xb uim=%xx_uim7 + _bf_xb bf xb @XX2_bf_xb .. bf:3 .. . . . . ._bf_xb xb=%xx_xb @@ -848,6 +851,8 @@ XSCVSPDPN 00 . - . 101001
[PATCH 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH, VAVGSW, to decodetree and use gvec with them. For these one the right shift had to be made before the sum as to avoid an overflow, so add 1 at the end if any of the entries had 1 in its LSB as to replicate the "+ 1" before the shift described by the ISA. vavgub: reptloopmaster patch 8 12500 0,02616600 0,00754200 (-71.2%) 25 40000,0253 0,00637700 (-74.8%) 100 10000,02604600 0,00790100 (-69.7%) 500 200 0,03189300 0,01838400 (-42.4%) 250040 0,06006900 0,06851000 (+14.1%) 800012 0,13941000 0,20548500 (+47.4%) vavguh: reptloopmaster patch 8 12500 0,01818200 0,00780600 (-57.1%) 25 40000,01789300 0,00641600 (-64.1%) 100 10000,01899100 0,00787200 (-58.5%) 500 200 0,02527200 0,01828400 (-27.7%) 250040 0,05361800 0,06773000 (+26.3%) 800012 0,12886600 0,20291400 (+57.5%) vavguw: reptloopmaster patch 8 12500 0,01423100 0,00776600 (-45.4%) 25 40000,01780800 0,00638600 (-64.1%) 100 10000,02085500 0,00787000 (-62.3%) 500 200 0,02737100 0,01828800 (-33.2%) 250040 0,05572600 0,06774200 (+21.6%) 800012 0,13101700 0,20311600 (+55.0%) vavgsb: reptloopmaster patch 8 12500 0,03006000 0,00788600 (-73.8%) 25 40000,02882200 0,00637800 (-77.9%) 100 10000,02958000 0,00791400 (-73.2%) 500 200 0,03548800 0,01860400 (-47.6%) 250040 0,0636 0,06850800 (+7.7%) 800012 0,13816500 0,20550300 (+48.7%) vavgsh: reptloopmaster patch 8 12500 0,01965900 0,00776600 (-60.5%) 25 40000,01875400 0,00638700 (-65.9%) 100 10000,01952200 0,00786900 (-59.7%) 500 200 0,02562000 0,01760300 (-31.3%) 250040 0,05384300 0,06742800 (+25.2%) 800012 0,13240800 0,2033 (+53.5%) vavgsw: reptloopmaster patch 8 12500 0,01407700 0,00775600 (-44.9%) 25 40000,01762300 0,0064 (-63.7%) 100 10000,02046500 0,00788500 (-61.5%) 500 200 0,02745600 0,01843000 (-32.9%) 250040 0,05375500 0,06820500 (+26.9%) 800012 0,13068300 0,20304900 (+55.4%) These results to me seems to indicate that with gvec the results have a slower translation but 
faster execution. Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 12 +-- target/ppc/insn32.decode| 9 +++ target/ppc/int_helper.c | 32 target/ppc/translate/vmx-impl.c.inc | 109 +--- target/ppc/translate/vmx-ops.c.inc | 9 +-- 5 files changed, 130 insertions(+), 41 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 6a43e32ad3..f88d9d3996 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) #define dh_ctype_acc ppc_acc_t * #define dh_typecode_acc dh_typecode_ptr -DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr) DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aa4968e6b9..38458c01de 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 011111 @VC VCMPSQ 000100 ... 
-- . . 0010101 @VX_bf VCMPUQ 000100 ... -- . . 0010001 @VX_bf +## Ve
[PATCH 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
From: "Lucas Mateus Castro (alqotel)" Moved XVABSSP, XVABSDP, XVNABSSP,XVNABSDP, XVNEGSP and XVNEGDP to decodetree and used gvec to translate them. xvabssp: reptloopmaster patch 8 12500 0,00477900 0,00476000 (-0.4%) 25 40000,00442800 0,00353300 (-20.2%) 100 10000,00478700 0,00366100 (-23.5%) 500 200 0,00973200 0,00649400 (-33.3%) 250040 0,03165200 0,02226700 (-29.7%) 800012 0,09315900 0,06674900 (-28.3%) xvabsdp: reptloopmaster patch 8 12500 0,00475000 0,00474400 (-0.1%) 25 40000,00355600 0,00367500 (+3.3%) 100 10000,00444200 0,00366000 (-17.6%) 500 200 0,00942700 0,00732400 (-22.3%) 250040 0,0299 0,02308500 (-22.8%) 800012 0,08770300 0,06683800 (-23.8%) xvnabssp: reptloopmaster patch 8 12500 0,00494500 0,00492900 (-0.3%) 25 40000,00397700 0,00338600 (-14.9%) 100 10000,00421400 0,00353500 (-16.1%) 500 200 0,01048000 0,00707100 (-32.5%) 250040 0,03251500 0,02238300 (-31.2%) 800012 0,08889100 0,06469800 (-27.2%) xvnabsdp: reptloopmaster patch 8 12500 0,00511000 0,00492700 (-3.6%) 25 40000,00398800 0,00381500 (-4.3%) 100 10000,00390500 0,00365900 (-6.3%) 500 200 0,00924800 0,00784600 (-15.2%) 250040 0,03138900 0,02391600 (-23.8%) 800012 0,09654200 0,05684600 (-41.1%) xvnegsp: reptloopmaster patch 8 12500 0,00493900 0,00452800 (-8.3%) 25 40000,00369100 0,00366800 (-0.6%) 100 10000,00371100 0,0038 (+2.4%) 500 200 0,00991100 0,00652300 (-34.2%) 250040 0,03025800 0,02422300 (-19.9%) 800012 0,09251100 0,06457600 (-30.2%) xvnegdp: reptloopmaster patch 8 12500 0,00474900 0,00454400 (-4.3%) 25 40000,00353100 0,00325600 (-7.8%) 100 10000,00398600 0,00366800 (-8.0%) 500 200 0,01032300 0,00702400 (-32.0%) 250040 0,03125000 0,02422400 (-22.5%) 800012 0,09475100 0,06173000 (-34.9%) This one to me seemed the opposite of the previous instructions, as it looks like there was an improvement in the translation time (itself not a surprise as operations were done twice before so there was the need to translate twice as many TCGop) Signed-off-by: Lucas Mateus Castro (alqotel) --- 
target/ppc/insn32.decode| 9 target/ppc/translate/vsx-impl.c.inc | 76 ++--- target/ppc/translate/vsx-ops.c.inc | 6 --- 3 files changed, 79 insertions(+), 12 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index ae151c4b62..5b687078be 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 . @X_TSX STXVRWX 01 . . . 0011001101 . @X_TSX STXVRDX 01 . . . 0011101101 . @X_TSX +## VSX Vector Binary Floating-Point Sign Manipulation Instructions + +XVABSDP 00 . 0 . 111011001 .. @XX2 +XVABSSP 00 . 0 . 110011001 .. @XX2 +XVNABSDP00 . 0 . 01001 .. @XX2 +XVNABSSP00 . 0 . 110101001 .. @XX2 +XVNEGDP 00 . 0 . 11001 .. @XX2 +XVNEGSP 00 . 0 . 110111001 .. @XX2 + ## VSX Scalar Multiply-Add Instructions XSMADDADP 00 . . . 0011 . . . @XX3 diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 7acdbceec4..426a9a3926 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -782,15 +782,79 @@ static void glue(gen_, name)(DisasContext *ctx) \ tcg_temp_free_i64(sgm); \ } -VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP) VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP) -VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP) -VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP) VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP) +#define TCG_OP_IMM_i64(FUNC, OP, IMM) \ +static void FUNC(TCGv_i64 t, TCGv_i64 b)\ +{ \ +OP(t, b, IMM); \ +} + +TCG_OP_IMM_i64(do_xvabssp_i64, tcg_gen_andi_i64, ~SGN_MASK_SP) +TCG_
[PATCH 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VPRTYBW and VPRTYBD to use gvec and both of them and VPRTYBQ to decodetree. vprtybw: reptloopmaster patch 8 12500 0,01215900 0,00705600 (-42.0%) 25 40000,01198700 0,00574400 (-52.1%) 100 10000,01307800 0,00692200 (-47.1%) 500 200 0,01794800 0,01558800 (-13.1%) 250040 0,04028200 0,05400800 (+34.1%) 800012 0,10127300 0,16744700 (+65.3%) vprtybd: reptloopmaster patch 8 12500 0,00757400 0,00791600 (+4.5%) 25 40000,00651300 0,00673700 (+3.4%) 100 10000,00713400 0,00837700 (+17.4%) 500 200 0,01195400 0,01937400 (+62.1%) 250040 0,03478600 0,07005500 (+101.4%) 800012 0,09539600 0,21013500 (+120.3%) vprtybq: reptloopmaster patch 8 12500 0,00065540 0,00066440 (+1.4%) 25 40000,00057720 0,00059850 (+3.7%) 100 10000,00066400 0,00069360 (+4.5%) 500 200 0,00115170 0,00127360 (+10.6%) 250040 0,00341890 0,00391550 (+14.5%) 800012 0,00951220 0,0480 (+16.8%) I wasn't expecting such a performance lost in both VPRTYBD and VPRTYBQ, I'm not sure if it's worth to move those instructions. Comparing the assembly of the helper with the TCGop they are pretty similar, so I'm not sure why vprtybd took so much more time. 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 6 ++-- target/ppc/insn32.decode| 4 +++ target/ppc/int_helper.c | 6 ++-- target/ppc/translate/vmx-impl.c.inc | 55 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index feccf30bcb..6a43e32ad3 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -194,9 +194,9 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) -DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) +DEF_HELPER_FLAGS_3(VPRTYBW, TCG_CALL_NO_RWG, void, avr, avr, i32) +DEF_HELPER_FLAGS_3(VPRTYBD, TCG_CALL_NO_RWG, void, avr, avr, i32) +DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 2658dd3395..aa4968e6b9 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -529,6 +529,10 @@ VCTZDM 000100 . . . 1000100@VX VPDEPD 000100 . . . 10111001101@VX VPEXTD 000100 . . . 10110001101@VX +VPRTYBD 000100 . 01001 . 1100010@VX_tb +VPRTYBQ 000100 . 01010 . 1100010@VX_tb +VPRTYBW 000100 . 01000 . 1100010@VX_tb + ## Vector Permute and Formatting Instruction VEXTDUBVLX 000100 . . . . 
011000 @VA diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 338ebced22..64b2d44a66 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -502,7 +502,7 @@ void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v) } /* vprtybw */ -void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBW(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { int i; for (i = 0; i < ARRAY_SIZE(r->u32); i++) { @@ -513,7 +513,7 @@ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) } /* vprtybd */ -void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBD(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { int i; for (i = 0; i < ARRAY_SIZE(r->u64); i++) { @@ -525,7 +525,7 @@ void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) } /* vprtybq */ -void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { uint64_t res = b->u64[0] ^ b->u64[1]; res ^= res >> 32; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 3f614097ac..06d91d1304 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -1659,9 +1659,58 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); GEN_VXFORM_NOA_ENV(vrfin, 5, 8); GEN_VXFORM_NOA_ENV(vrfip, 5, 10); GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); -GEN_VXFORM_NOA(vprtybw, 1, 2
[PATCH 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved VABSDUB, VABSDUH and VABSDUW to decodetree and use gvec to translate them. vabsdub: reptloopmaster patch 8 12500 0,03601600 0,00688500 (-80.9%) 25 40000,03651000 0,00532100 (-85.4%) 100 10000,03666900 0,00595300 (-83.8%) 500 200 0,04305800 0,01244600 (-71.1%) 250040 0,06893300 0,04273700 (-38.0%) 800012 0,14633200 0,12660300 (-13.5%) vabsduh: reptloopmaster patch 8 12500 0,02172400 0,00687500 (-68.4%) 25 40000,02154100 0,00531500 (-75.3%) 100 10000,02235400 0,00596300 (-73.3%) 500 200 0,02827500 0,01245100 (-56.0%) 250040 0,05638400 0,04285500 (-24.0%) 800012 0,13166000 0,12641400 (-4.0%) vabsduw: reptloopmaster patch 8 12500 0,01646400 0,00688300 (-58.2%) 25 40000,01454500 0,00475500 (-67.3%) 100 10000,01545800 0,00511800 (-66.9%) 500 200 0,02168200 0,01114300 (-48.6%) 250040 0,04571300 0,04138800 (-9.5%) 800012 0,12209500 0,12178500 (-0.3%) Same as VADDCUW and VSUBCUW, overall performance gain but it uses more TCGop (4 before the patch, 6 after). 
Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 6 ++-- target/ppc/insn32.decode| 6 target/ppc/int_helper.c | 13 +++- target/ppc/translate/vmx-impl.c.inc | 49 +++-- target/ppc/translate/vmx-ops.c.inc | 3 -- 5 files changed, 60 insertions(+), 17 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f88d9d3996..1f470a0e5e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) -DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) +DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 38458c01de..ae151c4b62 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -528,6 +528,12 @@ VAVGUB 000100 . . . 110@VX VAVGUH 000100 . . . 1000110@VX VAVGUW 000100 . . . 1001010@VX +## Vector Integer Absolute Difference Instructions + +VABSDUB 000100 . . . 111@VX +VABSDUH 000100 . . . 1000111@VX +VABSDUW 000100 . . . 1001011@VX + ## Vector Bit Manipulation Instruction VGNB000100 . -- ... . 
10011001100 @VX_n diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 0a142441e5..a797b4ddaf 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -621,8 +621,8 @@ VAVG(VAVGSW, s32, int64_t) VAVG(VAVGUW, u32, uint64_t) #undef VAVG -#define VABSDU_DO(name, element)\ -void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ +#define VABSDU(name, element) \ +void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\ { \ int i; \ \ @@ -638,12 +638,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ * name- instruction mnemonic suffix (b: byte, h: halfword, w: word) * element - element type to access from vector */ -#define VABSDU(type, element) \ -VABSDU_DO(absdu##type, element) -VABSDU(b, u8) -VABSDU(h, u16) -VABSDU(w, u32) -#undef VABSDU_DO +VABSDU(VABSDUB, u8) +VABSDU(VABSDUH, u16) +VABSDU(VABSDUW, u32) #undef VABSDU #define VCF(suffix, cvt, element) \ diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 8ff7c6ff3a..1dd799620d 100644 --- a/target/ppc/translate/vmx-impl.c
[PATCH 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VADDCUW and VSUBCUW to decodetree with gvec using an implementation based on the helper, with the main difference being changing the -1 (aka all bits set to 1) result returned by cmp when true to +1. vaddcuw: reptloopmaster patch 8 12500 0,01420600 0,00679200 (-52.2%) 25 40000,01781700 0,00524500 (-70.6%) 100 10000,02053300 0,00591800 (-71.2%) 500 200 0,02709800 0,01254600 (-53.7%) 250040 0,05537100 0,04347800 (-21.5%) 800012 0,13103100 0,12973600 (-1.0%) vsubcuw: reptloopmaster patch 8 12500 0,01426100 0,00685500 (-51.9%) 25 40000,01744600 0,00536000 (-69.3%) 100 10000,02029500 0,00597400 (-70.6%) 500 200 0,02654000 0,01263200 (-52.4%) 250040 0,05507200 0,04347100 (-21.1%) 800012 0,13072400 0,12872300 (-1.5%) Overall there was a gain in performance, but the TCGop code was still slightly bigger in the new version (it went from 4 to 5). Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 4 +-- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 4 +-- target/ppc/translate/vmx-impl.c.inc | 50 ++--- target/ppc/translate/vmx-ops.c.inc | 3 +- 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index f02a9497b7..edce059f2c 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,11 +193,11 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr) 
+DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 9a509e84df..aebc7b73c8 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -608,12 +608,14 @@ VRLQNM 000100 . . . 00101000101 @VX ## Vector Integer Arithmetic Instructions +VADDCUW 000100 . . . 0011000@VX VADDCUQ 000100 . . . 0010100@VX VADDUQM 000100 . . . 001@VX VADDEUQM000100 . . . . 00 @VA VADDECUQ000100 . . . . 01 @VA +VSUBCUW 000100 . . . 1011000@VX VSUBCUQ 000100 . . . 1010100@VX VSUBUQM 000100 . . . 101@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index ae1ba8084d..c48841819d 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,7 +492,7 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v) { int i; @@ -1962,7 +1962,7 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #endif } -void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +void helper_VSUBCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v) { int i; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 3acd585a2f..c5bfbfb3ce 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28); GEN_VXFORM(vslv, 2, 29); GEN_VXFORM(vslo, 6, 16); GEN_VXFORM(vsro, 6, 17); -GEN_VXFORM(vaddcuw, 0, 6); -GEN_VXFORM(vsubcuw, 0, 22); static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece, void (*gen_gvec)(unsigned, uint32_t, uint32_t, @@ -2847,8 +2845,6 @@ static void 
gen_xpnd04_2(DisasContext *ctx) } -GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \ -xpnd04_1, PPC_NONE, PPC2_ISA300) GEN_VXFORM_DUAL(vsubsws, PPC_ALTIVEC, PPC_NONE, \ xpnd04_2, PPC_NONE, PPC2_ISA300) @@ -3110,6 +3106,52 @@ TRANS_FLAGS2(ALTIVEC_207, VPMSUMD, do_vx_helper, gen_helper_VPMSUMD)
[PATCH 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" This patch moves VMLADDUHM to decodetree a creates a gvec implementation using mul_vec and add_vec. reptloopmaster patch 8 12500 0,01810500 0,00903100 (-50.1%) 25 40000,01739400 0,00747700 (-57.0%) 100 10000,01843600 0,00901400 (-51.1%) 500 200 0,02574600 0,01971000 (-23.4%) 250040 0,05921600 0,07121800 (+20.3%) 800012 0,15326700 0,21725200 (+41.7%) The significant difference in performance when REPT is low and LOOP is high I think is due to the fact that the new implementation has a higher translation time, as when using a helper only 5 TCGop are used but with the patch a total of 10 TCGop are needed (Power lacks a direct mul_vec equivalent so this instruction is implemented with the help of 5 others, vmuleu, vmulou, vmrgh, vmrgl and vpkum). Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 2 +- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 3 +- target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 48 ++--- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 57eee07256..9c562ab00e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr) -DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) +DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env) DEF_HELPER_3(lvebx, void, env, avr, tl) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index a5249ee32c..7445455a12 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -693,6 +693,8 @@ VMSUMUHS000100 . . . . 
100111 @VA VMSUMCUD000100 . . . . 010111 @VA VMSUMUDM000100 . . . . 100011 @VA +VMLADDUHM 000100 . . . . 100010 @VA + ## Vector String Instructions VSTRIBL 000100 . 0 . . 001101 @VX_tb_rc diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 696096100b..0d25000b2a 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) +void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c, + uint32_t v) { int i; diff --git a/target/ppc/translate.c b/target/ppc/translate.c index e810842925..11f729c60c 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x0001, PPC_ALTIVEC), GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x0001, PPC_ALTIVEC), GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC), GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC), -GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC), #if defined(TARGET_PPC64) GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE, PPC2_ISA300), diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index e644ad3236..9f18c6d4f2 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) -static void gen_vmladduhm(DisasContext *ctx) -{ -TCGv_ptr ra, rb, rc, rd; -if (unlikely(!ctx->altivec_enabled)) { -gen_exception(ctx, POWERPC_EXCP_VPU); -return; -} -ra = gen_avr_ptr(rA(ctx->opcode)); -rb = gen_avr_ptr(rB(ctx->opcode)); -rc = gen_avr_ptr(rC(ctx->opcode)); -rd = gen_avr_ptr(rD(ctx->opcode)); -gen_helper_vmladduhm(rd, ra, rb, rc); -tcg_temp_free_ptr(ra); -tcg_temp_free_ptr(rb); -tcg_temp_free_ptr(rc); -tcg_temp_free_ptr(rd); -} - static bool 
do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) { @@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, gen_helper_VSUBECUQ) TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM) TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR) +static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b, + TCGv_vec c) +{ +
[PATCH 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P
From: "Lucas Mateus Castro (alqotel)" Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate them. xvcpsgnsp: reptloopmaster patch 8 12500 0,00722000 0,00587700 (-18.6%) 25 40000,00604300 0,00521500 (-13.7%) 100 10000,00815600 0,00508500 (-37.7%) 500 200 0,02376600 0,01222600 (-48.6%) 250040 0,07709200 0,04158300 (-46.1%) 800012 0,27922100 0,12394400 (-55.6%) xvcpsgndp: reptloopmaster patch 8 12500 0,00557900 0,00584900 (+4.8%) 25 40000,00518700 0,00502900 (-3.0%) 100 10000,00655900 0,00569600 (-13.2%) 500 200 0,01560900 0,01260500 (-19.2%) 250040 0,05899200 0,03989400 (-32.4%) 800012 0,20046000 0,12417700 (-38.1%) Like the previous instructions there seemed to be a improvement on translation time. Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/insn32.decode| 2 + target/ppc/translate/vsx-impl.c.inc | 114 ++-- target/ppc/translate/vsx-ops.c.inc | 3 - 3 files changed, 60 insertions(+), 59 deletions(-) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 5b687078be..6549c4040e 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 .. @XX2 XVNABSSP00 . 0 . 110101001 .. @XX2 XVNEGDP 00 . 0 . 11001 .. @XX2 XVNEGSP 00 . 0 . 110111001 .. @XX2 +XVCPSGNDP 00 . . . ... @XX3 +XVCPSGNSP 00 . . . 1101 ... 
@XX3 ## VSX Scalar Multiply-Add Instructions diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 426a9a3926..3e4509cb41 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP) VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP) -#define VSX_VECTOR_MOVE(name, op, sgn_mask) \ -static void glue(gen_, name)(DisasContext *ctx) \ -{\ -TCGv_i64 xbh, xbl, sgm; \ -if (unlikely(!ctx->vsx_enabled)) { \ -gen_exception(ctx, POWERPC_EXCP_VSXU); \ -return; \ -}\ -xbh = tcg_temp_new_i64();\ -xbl = tcg_temp_new_i64();\ -sgm = tcg_temp_new_i64();\ -get_cpu_vsr(xbh, xB(ctx->opcode), true); \ -get_cpu_vsr(xbl, xB(ctx->opcode), false);\ -tcg_gen_movi_i64(sgm, sgn_mask); \ -switch (op) {\ -case OP_ABS: { \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NABS: { \ -tcg_gen_or_i64(xbh, xbh, sgm); \ -tcg_gen_or_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_NEG: { \ -tcg_gen_xor_i64(xbh, xbh, sgm); \ -tcg_gen_xor_i64(xbl, xbl, sgm); \ -break; \ -}\ -case OP_CPSGN: { \ -TCGv_i64 xah = tcg_temp_new_i64(); \ -TCGv_i64 xal = tcg_temp_new_i64(); \ -get_cpu_vsr(xah, xA(ctx->opcode), true); \ -get_cpu_vsr(xal, xA(ctx->opcode), false);\ -tcg_gen_and_i64(xah, xah, sgm); \ -tcg_gen_and_i64(xal, xal, sgm); \ -tcg_gen_andc_i64(xbh, xbh, sgm); \ -tcg_gen_andc_i64(xbl, xbl, sgm); \ -tcg_gen_or_i64(xbh, xbh, xah); \ -tcg
[PATCH 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec
From: "Lucas Mateus Castro (alqotel)" Moved the instructions VNEGW and VNEGD to decodetree and used gvec to decode it. vnegw: reptloopmaster patch 8 12500 0,01053200 0,00548400 (-47.9%) 25 40000,01030500 0,0039 (-62.2%) 100 10000,01096300 0,00395400 (-63.9%) 500 200 0,01472000 0,00712300 (-51.6%) 250040 0,03809000 0,02147700 (-43.6%) 800012 0,09957100 0,06202100 (-37.7%) vnegd: reptloopmaster patch 8 12500 0,00594600 0,00543800 (-8.5%) 25 40000,00575200 0,00396400 (-31.1%) 100 10000,00676100 0,00394800 (-41.6%) 500 200 0,01149300 0,00709400 (-38.3%) 250040 0,03441500 0,02169600 (-37.0%) 800012 0,09516900 0,06337000 (-33.4%) Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 2 -- target/ppc/insn32.decode| 3 +++ target/ppc/int_helper.c | 12 target/ppc/translate/vmx-impl.c.inc | 15 +-- target/ppc/translate/vmx-ops.c.inc | 2 -- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index edce059f2c..feccf30bcb 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -231,8 +231,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr) DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr) -DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr) DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index aebc7b73c8..2658dd3395 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010 @VX_tb VEXTSW2D000100 . 11010 . 1100010@VX_tb VEXTSD2Q000100 . 11011 . 1100010@VX_tb +VNEGD 000100 . 00111 . 1100010@VX_tb +VNEGW 000100 . 00110 . 
1100010@VX_tb + ## Vector Mask Manipulation Instructions MTVSRBM 000100 . 1 . 1100110@VX_tb diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index c48841819d..338ebced22 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1937,18 +1937,6 @@ XXBLEND(W, 32) XXBLEND(D, 64) #undef XXBLEND -#define VNEG(name, element) \ -void helper_##name(ppc_avr_t *r, ppc_avr_t *b) \ -{ \ -int i; \ -for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ -r->element[i] = -b->element[i]; \ -} \ -} -VNEG(vnegw, s32) -VNEG(vnegd, s64) -#undef VNEG - void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int sh = (b->VsrB(0xf) >> 3) & 0xf; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index c5bfbfb3ce..3f614097ac 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) GEN_VXFORM_TRANS(vclzw, 1, 30) GEN_VXFORM_TRANS(vclzd, 1, 31) -GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) -GEN_VXFORM_NOA_2(vnegd, 1, 24, 7) + +static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA300); +REQUIRE_VECTOR(ctx); + +tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb), + 16, 16); +return true; +} + +TRANS(VNEGW, do_vneg, MO_32) +TRANS(VNEGD, do_vneg, MO_64) static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s) { diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index ded0234123..27908533dd 100644 --- a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10), GEN_VXFORM(vspltisb, 6, 12), GEN_VXFORM(vspltish, 6, 13), GEN_VXFORM(vspltisw, 6, 14), -GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06), -GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07), GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C), GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D), GEN_VXFORM_300_EO(vctzw, 0x01, 
0x18, 0x1E), -- 2.31.1
[PATCH 00/12] VMX/VSX instructions with gvec
From: "Lucas Mateus Castro (alqotel)" This patch series moves some instructions from decode legacy to decodetree and translate said instructions with gvec. Some cases using gvec ended up with a bigger, more complex and slower so those instructions were only moved to decodetree. In each patch there's a comparison of the execution time before the patch being applied and after. Said result is the sum of 10 executions. The program used to time the execution worked like this: clock_t start = clock(); for (int i = 0; i < LOOP; i++) { asm ( load values in registers, between 2 and 3 instructions ".rept REPT\n\t" "INSTRUCTION registers\n\t" ".endr\n\t" save result from register, 1 instruction ); } clock_t end = clock(); printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n", ((double)(end - start))/ CLOCKS_PER_SEC); Where the column rept in the value used in .rept in the inline assembly and loop column is the value used for the for loop. All of those tests were executed on a Power9. When comparing the TCGop the data used was gathered using '-d op' and '-d op_opt'. 
Lucas Mateus Castro (alqotel) (12): target/ppc: Moved VMLADDUHM to decodetree and use gvec target/ppc: Move VMH[R]ADDSHS instruction to decodetree target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec target/ppc: Move VNEG[WD] to decodtree and use gvec target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec target/ppc: Move VAVG to decodetree and use gvec target/ppc: Move VABSDU to decodetree and use gvec target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P target/ppc: Use gvec to decode XVCPSNG[SD]P target/ppc: Moved XVTSTDC[DS]P to decodetree target/ppc: Moved XSTSTDC[QDS]P to decodetree target/ppc: Use gvec to decode XVTSTDC[DS]P target/ppc/fpu_helper.c | 137 ++-- target/ppc/helper.h | 46 ++-- target/ppc/insn32.decode| 50 + target/ppc/int_helper.c | 74 +++ target/ppc/translate.c | 1 - target/ppc/translate/vmx-impl.c.inc | 333 target/ppc/translate/vmx-ops.c.inc | 15 +- target/ppc/translate/vsx-impl.c.inc | 305 +++-- target/ppc/translate/vsx-ops.c.inc | 21 -- 9 files changed, 693 insertions(+), 289 deletions(-) -- 2.31.1
[PATCH 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree
From: "Lucas Mateus Castro (alqotel)" This patch moves VMHADDSHS and VMHRADDSHS to decodetree I couldn't find a satisfactory implementation with TCG inline. vmhaddshs: reptloopmaster patch 8 12500 0,02983400 0,02648500 (-11.2%) 25 40000,02946000 0,02518000 (-14.5%) 100 10000,03104300 0,02638000 (-15.0%) 500 200 0,04002000 0,03502500 (-12.5%) 250040 0,08090100 0,07562200 (-6.5%) 800012 0,19242600 0,18626800 (-3.2%) vmhraddshs: reptloopmaster patch 8 12500 0,03078600 0,02851000 (-7.4%) 25 40000,02793200 0,02746900 (-1.7%) 100 10000,02886000 0,02839900 (-1.6%) 500 200 0,03714700 0,03799200 (+2.3%) 250040 0,07948000 0,07852200 (-1.2%) 800012 0,19049800 0,18813900 (-1.2%) Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/helper.h | 4 ++-- target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 4 ++-- target/ppc/translate/vmx-impl.c.inc | 5 +++-- target/ppc/translate/vmx-ops.c.inc | 1 - 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 9c562ab00e..f02a9497b7 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr) DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr) DEF_HELPER_4(vpkudum, void, env, avr, avr, avr) DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr) -DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr) +DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr) DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 7445455a12..9a509e84df 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 010111 @VA VMSUMUDM000100 . . . . 
100011 @VA VMLADDUHM 000100 . . . . 100010 @VA +VMHADDSHS 000100 . . . . 10 @VA +VMHRADDSHS 000100 . . . . 11 @VA ## Vector String Instructions diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 0d25000b2a..ae1ba8084d 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r) return count; } -void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; @@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, } } -void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, +void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { int sat = 0; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 9f18c6d4f2..3acd585a2f 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \ tcg_temp_free_ptr(rd); \ } -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16) +GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) static bool do_va_helper(DisasContext *ctx, arg_VA *a, void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr)) @@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a, TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS) TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS) -GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) +TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS) +TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS) GEN_VXFORM_NOA(vclzb, 1, 28) GEN_VXFORM_NOA(vclzh, 1, 29) diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index a3a0fd0650..7cd9d40e06 100644 --- a/target/ppc/translate/vmx-ops.c.inc +++ 
b/target/ppc/translate/vmx-ops.c.inc @@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15), #define GEN_VAFORM_PAIRED(name0, name1, opc2) \ GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC) -GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16), GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23), GEN_VXFORM_DUAL(vclzb, vpopcntb, 1, 28, PPC_NONE, PPC2_A
[PATCH v3 0/4] Patch series to set up a ppc64le CI
This patch series aims to make it easier to set up a compilation and CI environment on PPC64 and PPC64LE machines. v3: Changed patch 1 to respect alphabetical order v2: This patch series consists only of patches 2-4 of v1 and an alternative to patch 1 suggested by Daniel. Lucas Mateus Castro (alqotel) (4): scripts/ci/setup: ninja missing from build-environment scripts/ci/setup: Fix libxen requirements scripts/ci/setup: spice-server only on x86 aarch64 tests/docker: run script use realpath instead of readlink scripts/ci/setup/build-environment.yml | 15 +-- tests/docker/run | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) -- 2.25.1
[PATCH v3 4/4] tests/docker: run script use realpath instead of readlink
From: "Lucas Mateus Castro (alqotel)" The alpine docker image only comes with busybox, which doesn't have the '-e' option on its readlink, so change it to 'realpath' to avoid that problem. Suggested-by: Daniel P. Berrangé Signed-off-by: Lucas Mateus Castro (alqotel) --- tests/docker/run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/docker/run b/tests/docker/run index 421393046b..9eb96129da 100755 --- a/tests/docker/run +++ b/tests/docker/run @@ -15,7 +15,7 @@ if test -n "$V"; then set -x fi -BASE="$(dirname $(readlink -e $0))" +BASE="$(dirname $(realpath $0))" # Prepare the environment export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH -- 2.25.1
[PATCH v3 3/4] scripts/ci/setup: spice-server only on x86 aarch64
From: "Lucas Mateus Castro (alqotel)" Changed build-environment.yml to only install spice-server on x86_64 and aarch64 as this package is only available on those architectures. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Philippe Mathieu-Daudé --- scripts/ci/setup/build-environment.yml | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 49292715d3..b04c2b7cee 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -160,7 +160,6 @@ - python36 - rdma-core-devel - spice-glib-devel - - spice-server - systemtap-sdt-devel - tar - zlib-devel @@ -168,3 +167,14 @@ when: - ansible_facts['distribution_file_variety'] == 'RedHat' - ansible_facts['distribution_version'] == '8' + +- name: Install packages only available on x86 and aarch64 + dnf: +# Spice server not available in ppc64le +name: + - spice-server +state: present + when: +- ansible_facts['distribution_file_variety'] == 'RedHat' +- ansible_facts['distribution_version'] == '8' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' -- 2.25.1
[PATCH v3 2/4] scripts/ci/setup: Fix libxen requirements
From: "Lucas Mateus Castro (alqotel)" XEN hypervisor is only available in ARM and x86, but the yaml only checked if the architecture is different from s390x, changed it to a more accurate test. Tested this change on a Ubuntu 20.04 ppc64le. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Alex Bennée Reviewed-by: Philippe Mathieu-Daudé --- scripts/ci/setup/build-environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index b5acaf9118..49292715d3 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -97,7 +97,7 @@ state: present when: - ansible_facts['distribution'] == 'Ubuntu' -- ansible_facts['architecture'] != 's390x' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' - name: Install basic packages to build QEMU on Ubuntu 20.04 package: -- 2.25.1
[PATCH v3 1/4] scripts/ci/setup: ninja missing from build-environment
From: "Lucas Mateus Castro (alqotel)" ninja-build is missing from the RHEL environment, so a system prepared with that script would still fail to compile QEMU. Tested on a Fedora 36 Signed-off-by: Lucas Mateus Castro (alqotel) --- scripts/ci/setup/build-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 232525b91d..b5acaf9118 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -153,6 +153,7 @@ - make - mesa-libEGL-devel - nettle-devel + - ninja-build - nmap-ncat - perl-Test-Harness - pixman-devel -- 2.25.1
[PATCH v2 4/4] tests/docker: run script use realpath instead of readlink
From: "Lucas Mateus Castro (alqotel)" The alpine docker image only comes with busybox, which doesn't have the '-e' option on its readlink, so change it to 'realpath' to avoid that problem. Suggested-by: Daniel P. Berrangé Signed-off-by: Lucas Mateus Castro (alqotel) --- tests/docker/run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/docker/run b/tests/docker/run index 421393046b..9eb96129da 100755 --- a/tests/docker/run +++ b/tests/docker/run @@ -15,7 +15,7 @@ if test -n "$V"; then set -x fi -BASE="$(dirname $(readlink -e $0))" +BASE="$(dirname $(realpath $0))" # Prepare the environment export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH -- 2.31.1
[PATCH v2 2/4] scripts/ci/setup: Fix libxen requirements
From: "Lucas Mateus Castro (alqotel)" XEN hypervisor is only available in ARM and x86, but the yaml only checked if the architecture is different from s390x, changed it to a more accurate test. Tested this change on a Ubuntu 20.04 ppc64le. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Alex Bennée --- scripts/ci/setup/build-environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 6df3e61d94..7535228685 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -97,7 +97,7 @@ state: present when: - ansible_facts['distribution'] == 'Ubuntu' -- ansible_facts['architecture'] != 's390x' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' - name: Install basic packages to build QEMU on Ubuntu 20.04 package: -- 2.31.1
[PATCH v2 1/4] scripts/ci/setup: ninja missing from build-environment
From: "Lucas Mateus Castro (alqotel)" ninja-build is missing from the RHEL environment, so a system prepared with that script would still fail to compile QEMU. Tested on a Fedora 36 Signed-off-by: Lucas Mateus Castro (alqotel) --- scripts/ci/setup/build-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 232525b91d..6df3e61d94 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -150,6 +150,7 @@ - libepoxy-devel - libgcrypt-devel - lzo-devel + - ninja-build - make - mesa-libEGL-devel - nettle-devel -- 2.31.1
[PATCH v2 0/4] Patch series to set up a ppc64le CI
From: "Lucas Mateus Castro (alqotel)" This patch series aims to make it easier to set up a compilation and CI environment on PPC64 and PPC64LE machines. v2: This patch series consists only of patches 2-4 of v1 and an alternative to patch 1 suggested by Daniel. Lucas Mateus Castro (alqotel) (4): scripts/ci/setup: ninja missing from build-environment scripts/ci/setup: Fix libxen requirements scripts/ci/setup: spice-server only on x86 aarch64 tests/docker: run script use realpath instead of readlink scripts/ci/setup/build-environment.yml | 15 +-- tests/docker/run | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) -- 2.31.1
[PATCH v2 3/4] scripts/ci/setup: spice-server only on x86 aarch64
From: "Lucas Mateus Castro (alqotel)" Changed build-environment.yml to only install spice-server on x86_64 and aarch64 as this package is only available on those architectures. Signed-off-by: Lucas Mateus Castro (alqotel) --- scripts/ci/setup/build-environment.yml | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 7535228685..43cf8c759f 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -160,7 +160,6 @@ - python36 - rdma-core-devel - spice-glib-devel - - spice-server - systemtap-sdt-devel - tar - zlib-devel @@ -168,3 +167,14 @@ when: - ansible_facts['distribution_file_variety'] == 'RedHat' - ansible_facts['distribution_version'] == '8' + +- name: Install packages only available on x86 and aarch64 + dnf: +# Spice server not available in ppc64le +name: + - spice-server +state: present + when: +- ansible_facts['distribution_file_variety'] == 'RedHat' +- ansible_facts['distribution_version'] == '8' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' -- 2.31.1
[PATCH v2 2/2] tests/tcg/ppc64le: Added an underflow with UE=1 test
Added a test to see if the adjustment is being made correctly when an underflow occurs and UE is set. Signed-off-by: Lucas Mateus Castro (alqotel) --- This patch will also fail without the underflow with UE set bugfix Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br> --- tests/tcg/ppc64/Makefile.target | 1 + tests/tcg/ppc64le/Makefile.target | 1 + tests/tcg/ppc64le/ue_excp.c | 53 +++ 3 files changed, 55 insertions(+) create mode 100644 tests/tcg/ppc64le/ue_excp.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index 43958ad87b..583677031b 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -30,5 +30,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10 PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw PPC64_TESTS += oe_excp +PPC64_TESTS += ue_excp TESTS += $(PPC64_TESTS) diff --git a/tests/tcg/ppc64le/Makefile.target b/tests/tcg/ppc64le/Makefile.target index 8d11ac731d..b9e689c582 100644 --- a/tests/tcg/ppc64le/Makefile.target +++ b/tests/tcg/ppc64le/Makefile.target @@ -28,5 +28,6 @@ PPC64LE_TESTS += mffsce PPC64LE_TESTS += signal_save_restore_xer PPC64LE_TESTS += xxspltw PPC64LE_TESTS += oe_excp +PPC64LE_TESTS += ue_excp TESTS += $(PPC64LE_TESTS) diff --git a/tests/tcg/ppc64le/ue_excp.c b/tests/tcg/ppc64le/ue_excp.c new file mode 100644 index 00..028ef3bbc7 --- /dev/null +++ b/tests/tcg/ppc64le/ue_excp.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +#define FP_UE (1ull << 5) +#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB)) + +void sigfpe_handler(int sig, siginfo_t *si, void *ucontext) +{ +union { +uint64_t ll; +double dp; +} r; +uint64_t ch = 0x1b64f1c1b000ull; +r.dp = ((ucontext_t *)ucontext)->uc_mcontext.fp_regs[2]; +if (r.ll == ch) { +exit(0); +} +fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, r.ll); +exit(1); +} + +int main() +{ +uint64_t fpscr; +uint64_t a = 0x5ca8ull; +uint64_t b = 
0x1cefull; + +struct sigaction sa = { +.sa_sigaction = sigfpe_handler, +.sa_flags = SA_SIGINFO +}; + +prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE); +sigaction(SIGFPE, , NULL); + +fpscr = FP_UE; +MTFSF(0b, fpscr); + +asm ( +"lfd 0, %0\n\t" +"lfd 1, %1\n\t" +"fmul 2, 0, 1\n\t" +: +: "m"(a), "m"(b) +: "memory", "fr0", "fr1", "fr2" +); + +abort(); +} -- 2.25.1
[PATCH v2 1/2] tests/tcg/ppc64le: Added an overflow with OE=1 test
Added a test to see if the adjustment is being made correctly when an overflow occurs and OE is set. Signed-off-by: Lucas Mateus Castro (alqotel) --- The prctl patch is not ready yet, so this patch does as Richard Henderson suggested and check the fp register in the signal handler This patch will fail without the overflow with OE set bugfix Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br> --- tests/tcg/ppc64/Makefile.target | 1 + tests/tcg/ppc64le/Makefile.target | 1 + tests/tcg/ppc64le/oe_excp.c | 53 +++ 3 files changed, 55 insertions(+) create mode 100644 tests/tcg/ppc64le/oe_excp.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index 331fae628e..43958ad87b 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -29,5 +29,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10 PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw +PPC64_TESTS += oe_excp TESTS += $(PPC64_TESTS) diff --git a/tests/tcg/ppc64le/Makefile.target b/tests/tcg/ppc64le/Makefile.target index 6ca3003f02..8d11ac731d 100644 --- a/tests/tcg/ppc64le/Makefile.target +++ b/tests/tcg/ppc64le/Makefile.target @@ -27,5 +27,6 @@ PPC64LE_TESTS += mtfsf PPC64LE_TESTS += mffsce PPC64LE_TESTS += signal_save_restore_xer PPC64LE_TESTS += xxspltw +PPC64LE_TESTS += oe_excp TESTS += $(PPC64LE_TESTS) diff --git a/tests/tcg/ppc64le/oe_excp.c b/tests/tcg/ppc64le/oe_excp.c new file mode 100644 index 00..c8f07d80b6 --- /dev/null +++ b/tests/tcg/ppc64le/oe_excp.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +#define FP_OE (1ull << 6) +#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB)) + +void sigfpe_handler(int sig, siginfo_t *si, void *ucontext) +{ +union { +uint64_t ll; +double dp; +} r; +uint64_t ch = 0x5fcfffe4965a17e0ull; +r.dp = ((ucontext_t *)ucontext)->uc_mcontext.fp_regs[2]; +if (r.ll == ch) { +exit(0); +} +fprintf(stderr, "expected result: %lx\n result: 
%lx\n", ch, r.ll); +exit(1); +} + +int main() +{ +uint64_t fpscr; +uint64_t a = 0x7fdfffe816d77b00ull; +uint64_t b = 0x7fdfffFC7F7FFF00ull; + +struct sigaction sa = { +.sa_sigaction = sigfpe_handler, +.sa_flags = SA_SIGINFO +}; + +prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE); +sigaction(SIGFPE, , NULL); + +fpscr = FP_OE; +MTFSF(0b, fpscr); + +asm ( +"lfd 0, %0\n\t" +"lfd 1, %1\n\t" +"fmul 2, 0, 1\n\t" +: +: "m"(a), "m"(b) +: "memory", "fr0", "fr1", "fr2" +); + +abort(); +} -- 2.25.1
[PATCH 2/2] tests/tcg/ppc64le: Added an underflow with UE=1 test
From: "Lucas Mateus Castro (alqotel)" Added a test to see if the adjustment is being made correctly when an underflow occurs and UE is set. Signed-off-by: Lucas Mateus Castro (alqotel) --- This patch will also fail without the underflow with UE set bugfix Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br> --- tests/tcg/ppc64/Makefile.target | 1 + tests/tcg/ppc64le/Makefile.target | 1 + tests/tcg/ppc64le/ue_excp.c | 54 +++ 3 files changed, 56 insertions(+) create mode 100644 tests/tcg/ppc64le/ue_excp.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index 43958ad87b..583677031b 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -30,5 +30,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10 PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw PPC64_TESTS += oe_excp +PPC64_TESTS += ue_excp TESTS += $(PPC64_TESTS) diff --git a/tests/tcg/ppc64le/Makefile.target b/tests/tcg/ppc64le/Makefile.target index 8d11ac731d..b9e689c582 100644 --- a/tests/tcg/ppc64le/Makefile.target +++ b/tests/tcg/ppc64le/Makefile.target @@ -28,5 +28,6 @@ PPC64LE_TESTS += mffsce PPC64LE_TESTS += signal_save_restore_xer PPC64LE_TESTS += xxspltw PPC64LE_TESTS += oe_excp +PPC64LE_TESTS += ue_excp TESTS += $(PPC64LE_TESTS) diff --git a/tests/tcg/ppc64le/ue_excp.c b/tests/tcg/ppc64le/ue_excp.c new file mode 100644 index 00..b25ba1f803 --- /dev/null +++ b/tests/tcg/ppc64le/ue_excp.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + +#define FP_UE (1ull << 5) +#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB)) + +void sigfpe_handler(int sig, siginfo_t *si, void *ucontext) +{ +uint64_t t; +uint64_t ch = 0x1b64f1c1b000ull; +asm ( +"stfd 2, %0\n\t" +: "=m"(t) +: +: "memory", "fr2" +); +if (t == ch) { +exit(0); +} +fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, t); +exit(1); +} + +int main() +{ +uint64_t fpscr; +uint64_t a = 0x5ca8ull; +uint64_t b = 
0x1cefull; + +struct sigaction sa = { +.sa_sigaction = sigfpe_handler, +.sa_flags = SA_SIGINFO +}; + +prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE); +sigaction(SIGFPE, , NULL); + +fpscr = FP_UE; +MTFSF(0b, fpscr); + +asm ( +"lfd 0, %0\n\t" +"lfd 1, %1\n\t" +"fmul 2, 0, 1\n\t" +: +: "m"(a), "m"(b) +: "memory", "fr0", "fr1", "fr2" +); + +return -1; +} -- 2.31.1
[PATCH 1/2] tests/tcg/ppc64le: Added an overflow with OE=1 test
From: "Lucas Mateus Castro (alqotel)" Added a test to see if the adjustment is being made correctly when an overflow occurs and OE is set. Signed-off-by: Lucas Mateus Castro (alqotel) --- The prctl patch is not ready yet, so this patch does as Richard Henderson suggested and check the fp register in the signal handler This patch will fail without the overflow with OE set bugfix Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br> --- tests/tcg/ppc64/Makefile.target | 1 + tests/tcg/ppc64le/Makefile.target | 1 + tests/tcg/ppc64le/oe_excp.c | 54 +++ 3 files changed, 56 insertions(+) create mode 100644 tests/tcg/ppc64le/oe_excp.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index 331fae628e..43958ad87b 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -29,5 +29,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10 PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw +PPC64_TESTS += oe_excp TESTS += $(PPC64_TESTS) diff --git a/tests/tcg/ppc64le/Makefile.target b/tests/tcg/ppc64le/Makefile.target index 6ca3003f02..8d11ac731d 100644 --- a/tests/tcg/ppc64le/Makefile.target +++ b/tests/tcg/ppc64le/Makefile.target @@ -27,5 +27,6 @@ PPC64LE_TESTS += mtfsf PPC64LE_TESTS += mffsce PPC64LE_TESTS += signal_save_restore_xer PPC64LE_TESTS += xxspltw +PPC64LE_TESTS += oe_excp TESTS += $(PPC64LE_TESTS) diff --git a/tests/tcg/ppc64le/oe_excp.c b/tests/tcg/ppc64le/oe_excp.c new file mode 100644 index 00..cfc364f5ed --- /dev/null +++ b/tests/tcg/ppc64le/oe_excp.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + +#define FP_OE (1ull << 6) +#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB)) + +void sigfpe_handler(int sig, siginfo_t *si, void *ucontext) +{ +uint64_t t; +uint64_t ch = 0x5fcfffe4965a17e0ull; +asm ( +"stfd 2, %0\n\t" +: "=m"(t) +: +: "memory", "fr2" +); +if (t == ch) { +exit(0); +} +fprintf(stderr, "expected result: %lx\n result: 
%lx\n", ch, t); +exit(1); +} + +int main() +{ +uint64_t fpscr; +uint64_t a = 0x7fdfffe816d77b00ull; +uint64_t b = 0x7fdfffFC7F7FFF00ull; + +struct sigaction sa = { +.sa_sigaction = sigfpe_handler, +.sa_flags = SA_SIGINFO +}; + +prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE); +sigaction(SIGFPE, , NULL); + +fpscr = FP_OE; +MTFSF(0b, fpscr); + +asm ( +"lfd 0, %0\n\t" +"lfd 1, %1\n\t" +"fmul 2, 0, 1\n\t" +: +: "m"(a), "m"(b) +: "memory", "fr0", "fr1", "fr2" +); + +return -1; +} -- 2.31.1
[PATCH 1/2] fpu: Add rebias bool, value and operation
From: "Lucas Mateus Castro (alqotel)" Added the possibility of recalculating a result if it overflows or underflows, if the result overflow and the rebias bool is true then the intermediate result should have 3/4 of the total range subtracted from the exponent. The same for underflow but it should be added to the exponent of the intermediate number instead. Signed-off-by: Lucas Mateus Castro (alqotel) --- fpu/softfloat-parts.c.inc | 21 +++-- fpu/softfloat.c | 2 ++ include/fpu/softfloat-types.h | 4 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc index bbeadaa189..a9f268fcab 100644 --- a/fpu/softfloat-parts.c.inc +++ b/fpu/softfloat-parts.c.inc @@ -214,18 +214,35 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s, p->frac_lo &= ~round_mask; } } else if (unlikely(exp >= exp_max)) { -flags |= float_flag_overflow | float_flag_inexact; -if (overflow_norm) { +flags |= float_flag_overflow; +if (s->rebias_overflow) { +exp -= fmt->exp_re_bias; +} else if (overflow_norm) { +flags |= float_flag_inexact; exp = exp_max - 1; frac_allones(p); p->frac_lo &= ~round_mask; } else { +flags |= float_flag_inexact; p->cls = float_class_inf; exp = exp_max; frac_clear(p); } } frac_shr(p, frac_shift); +} else if (unlikely(s->rebias_underflow)) { +flags |= float_flag_underflow; +exp += fmt->exp_re_bias; +if (p->frac_lo & round_mask) { +flags |= float_flag_inexact; +if (frac_addi(p, p, inc)) { +frac_shr(p, 1); +p->frac_hi |= DECOMPOSED_IMPLICIT_BIT; +exp++; +} +p->frac_lo &= ~round_mask; +} +frac_shr(p, frac_shift); } else if (s->flush_to_zero) { flags |= float_flag_output_denormal; p->cls = float_class_zero; diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 4a871ef2a1..c7454c3eb1 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -521,6 +521,7 @@ typedef struct { typedef struct { int exp_size; int exp_bias; +int exp_re_bias; int exp_max; int frac_size; int frac_shift; @@ -532,6 +533,7 @@ typedef 
struct { #define FLOAT_PARAMS_(E)\ .exp_size = E,\ .exp_bias = ((1 << E) - 1) >> 1, \ +.exp_re_bias= (1 << (E - 1)) + (1 << (E - 2)), \ .exp_max= (1 << E) - 1 #define FLOAT_PARAMS(E, F) \ diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h index 7a6ea881d8..9735543ac4 100644 --- a/include/fpu/softfloat-types.h +++ b/include/fpu/softfloat-types.h @@ -195,6 +195,10 @@ typedef struct float_status { bool snan_bit_is_one; bool use_first_nan; bool no_signaling_nans; +/* should overflowed results subtract re_bias to its exponent? */ +bool rebias_overflow; +/* should underflowed results add re_bias to its exponent? */ +bool rebias_underflow; } float_status; #endif /* SOFTFLOAT_TYPES_H */ -- 2.31.1
[PATCH 2/2] target/ppc: Bugfix FP when OE/UE are set
From: "Lucas Mateus Castro (alqotel)" When an overflow exception occurs and OE is set the intermediate result should be adjusted (by subtracting from the exponent) to avoid rounding to inf. The same applies to an underflow exceptionion and UE (but adding to the exponent). To do this set the fp_status.rebias_overflow when OE is set and fp_status.rebias_underflow when UE is set as the FPU will recalculate in case of a overflow/underflow if the according rebias* is set. Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/cpu.c| 2 ++ target/ppc/fpu_helper.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c index 401b6f9e63..0ebac04bc4 100644 --- a/target/ppc/cpu.c +++ b/target/ppc/cpu.c @@ -120,6 +120,8 @@ void ppc_store_fpscr(CPUPPCState *env, target_ulong val) val |= FP_FEX; } env->fpscr = val; +env->fp_status.rebias_overflow = (FP_OE & env->fpscr) ? true : false; +env->fp_status.rebias_underflow = (FP_UE & env->fpscr) ? true : false; if (tcg_enabled()) { fpscr_set_rounding_mode(env); } diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 134804628b..c17575de5d 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -344,7 +344,6 @@ static inline int float_overflow_excp(CPUPPCState *env) bool overflow_enabled = !!(env->fpscr & FP_OE); if (overflow_enabled) { -/* XXX: should adjust the result */ /* Update the floating-point enabled exception summary */ env->fpscr |= FP_FEX; /* We must update the target FPR before raising the exception */ @@ -363,7 +362,6 @@ static inline void float_underflow_excp(CPUPPCState *env) /* Update the floating-point exception summary */ env->fpscr |= FP_FX; if (env->fpscr & FP_UE) { -/* XXX: should adjust the result */ /* Update the floating-point enabled exception summary */ env->fpscr |= FP_FEX; /* We must update the target FPR before raising the exception */ -- 2.31.1
[PATCH 0/2] Floating-point OE/UE exception bug
From: "Lucas Mateus Castro (alqotel)" Changes in v2: - Completely reworked the solution: * Created re_bias in FloatFmt, it is 3/4 of the total exponent range of a FP type * Added rebias bools that dictates if the result should have its exponent add/subtract the re_bias value if an overflow/underflow occurs. * ppc_store_fpscr sets/unsets rebias if OE/UE is set/unset The PowerISA defines that if an overflow exception happen with FPSCR.OE set, the exponent of the intermediate result is subtracted 1536 in double precision operations and is added 1536 in an underflow exception, currently this behavior is not QEMU's behavior, this patch series fixes that. Currently there's no test in this patch series as there's no way to disable MSR.FE0 and MSR.FE1 in linux user, so any overflow/underflow exception with OE/UE set causes a trapping exception. Lucas Mateus Castro (alqotel) (2): fpu: Add rebias bool, value and operation target/ppc: Bugfix FP when OE/UE are set fpu/softfloat-parts.c.inc | 21 +++-- fpu/softfloat.c | 2 ++ include/fpu/softfloat-types.h | 4 target/ppc/cpu.c | 2 ++ target/ppc/fpu_helper.c | 2 -- 5 files changed, 27 insertions(+), 4 deletions(-) -- 2.31.1
[PATCH] tests/tcg/ppc64le: Added OE/UE enabled exception test
From: "Lucas Mateus Castro (alqotel)" DO NOT MERGE This patch adds a test to check if the add/sub of the intermediate result when an overflow or underflow exception with the corresponding enabling bit being set (i.e. OE/UE), but linux-user currently can't disable MSR.FE0 and MSR.FE1 so it will always result in a trapping exception, to avoid that the test should be run in a VM or use Matheus' WIP patch in https://github.com/PPC64/qemu/tree/alqotel-ferst-prctl-patch The test results were based on a Power9 machine. Signed-off-by: Lucas Mateus Castro (alqotel) --- tests/tcg/ppc64le/oe_ue_excp.c | 105 + 1 file changed, 105 insertions(+) create mode 100644 tests/tcg/ppc64le/oe_ue_excp.c diff --git a/tests/tcg/ppc64le/oe_ue_excp.c b/tests/tcg/ppc64le/oe_ue_excp.c new file mode 100644 index 00..384219a366 --- /dev/null +++ b/tests/tcg/ppc64le/oe_ue_excp.c @@ -0,0 +1,105 @@ +#include +#include +#include + +#define FP_OE (1ull << 6) +#define FP_UE (1ull << 5) + +typedef union { +double d; +long long ll; +} ll_fp; + +double asm_fmul (double a, double b) +{ +double t; +asm ( +"lfd 0, %1\n\t" +"lfd 1, %2\n\t" +"fmul 2, 0, 1\n\t" +"stfd 2, %0\n\t" +:"=m"(t) +:"m"(a),"m"(b) +); +return t; +} + +double asm_fdiv (double a, double b) +{ +double t; +asm ( +"lfd 0, %1\n\t" +"lfd 1, %2\n\t" +"fdiv 2, 0, 1\n\t" +"stfd 2, %0\n\t" +:"=m"(t) +:"m"(a),"m"(b) +); +return t; +} + +int main () +{ +int i, ok = 1; +ll_fp fpscr, t; + +prctl(PR_SET_FPEXC, PR_FP_EXC_DISABLED); + +fpscr.ll = FP_UE | FP_OE; +__builtin_mtfsf (0b, fpscr.d); +fpscr.d = __builtin_mffs (); +printf("fpscr = %016llx\n", fpscr.ll); + +ll_fp ch[] = +{ +{ .ll = 0x1b64f1c1b000ull }, +{ .ll = 0x1b64f1c1b001ull }, +{ .ll = 0x1b90de341000ull }, +{ .ll = 0x1b90de341000ull }, +{ .ll = 0x5fcfffe4965a17e0ull }, +{ .ll = 0x5fcfffe4965a17e0ull }, +{ .ll = 0x2003ull }, +{ .ll = 0x2003ull } +}; + +ll_fp a[] = +{ +{ .ll = 0x5ca8ull }, +{ .ll = 0xbadcull }, +{ .ll = 0x7fdfffe816d77b00ull }, +{ .d = DBL_MAX } +}; + +ll_fp b[] = +{ +{ .ll = 
0x1cefull }, +{ .ll = 0x5c70ull }, +{ .ll = 0x7fdfffFC7F7FFF00ull }, +{ .d = 2.5 } +}; + +for (i = 0; i < 4; i++) { +t.d = asm_fmul(a[i].d, b[i].d); +if (t.ll != ch[2 * i].ll) { +ok = 0; +printf ("Mismatch on fmul n %d:\n\tresult: %016llx\n\t" +"expected: %016llx\n", i, t.ll, ch[2 * i].ll); +} else { +printf ("Ok on fmul n %d\n", i); +} +t.d = asm_fdiv(a[i].d, 1.0/b[i].d); +if (t.ll != ch[2 * i + 1].ll) { +ok = 0; +printf ("Mismatch on fdiv n %d:\n\tresult: %016llx\n\t" +"expected: %016llx\n", i, t.ll, ch[2 * i + 1].ll); +} else { +printf ("Ok on fdiv n %d\n", i); +} +} +fpscr.d = __builtin_mffs (); +printf("fpscr = %016llx\n", fpscr.ll); +if(!ok) { +return -1; +} +return 0; +} -- 2.31.1
[RFC PATCH 3/3] target/ppc: Bugfix fdiv result with OE/UE set
From: "Lucas Mateus Castro (alqotel)" Change fdiv in the same way of fadd/fsub to handle overflow/underflow if OE/UE is set (i.e. function that receives a value to add/subtract from the exponent if an overflow/underflow occurs). Signed-off-by: Lucas Mateus Castro (alqotel) --- fpu/softfloat.c | 30 ++ include/fpu/softfloat.h | 1 + target/ppc/fpu_helper.c | 5 - 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index e2b4ad4b63..0e9d2d2678 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2558,6 +2558,27 @@ soft_f64_div(float64 a, float64 b, float_status *status) return float64_round_pack_canonical(pr, status); } +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_div_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, + float_status *status) +{ +FloatParts64 pa, pb, *pr; + +float64_unpack_canonical(, a, status); +float64_unpack_canonical(, b, status); +pr = parts_div(, , status); + +if (unlikely(oe_sub && (pr->exp > 1023))) { +pr->exp -= oe_sub; +float_raise(float_flag_overflow, status); +} else if (unlikely(ue_sum && (pr->exp < -1022))) { +pr->exp += ue_sum; +float_raise(float_flag_underflow, status); +} + +return float64_round_pack_canonical(pr, status); +} + static float hard_f32_div(float a, float b) { return a / b; @@ -2616,6 +2637,15 @@ float64_div(float64 a, float64 b, float_status *s) f64_div_pre, f64_div_post); } +float64 QEMU_FLATTEN +float64_div_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, +float_status *s) +{ +return float64_gen2_excp(a, b, oe_sub, ue_sum, s, hard_f64_div, + soft_f64_div, soft_f64_div_excp_en, f64_div_pre, + f64_div_post); +} + float64 float64r32_div(float64 a, float64 b, float_status *status) { FloatParts64 pa, pb, *pr; diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index 4ff56b0e10..a6c7885fcd 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -778,6 +778,7 @@ float64 float64_sub_excp_en(float64, float64, int, int, float_status *status); float64 
float64_mul(float64, float64, float_status *status); float64 float64_mul_excp_en(float64, float64, int, int, float_status *status); float64 float64_div(float64, float64, float_status *status); +float64 float64_div_excp_en(float64, float64, int, int, float_status *status); float64 float64_rem(float64, float64, float_status *status); float64 float64_muladd(float64, float64, float64, int, float_status *status); float64 float64_sqrt(float64, float_status *status); diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 18cf720743..1a6869a920 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -635,7 +635,10 @@ static void float_invalid_op_div(CPUPPCState *env, int flags, /* fdiv - fdiv. */ float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2) { -float64 ret = float64_div(arg1, arg2, >fp_status); +int oe_sub = (FP_OE & env->fpscr) ? 1536 : 0; +int ue_sum = (FP_UE & env->fpscr) ? 1536 : 0; +float64 ret = float64_div_excp_en(arg1, arg2, oe_sub, ue_sum, + >fp_status); int flags = get_float_exception_flags(>fp_status); if (unlikely(flags & float_flag_invalid)) { -- 2.31.1
[RFC PATCH 2/3] target/ppc: Bugfix fmul result with OE/UE set
From: "Lucas Mateus Castro (alqotel)" Change fmul in the same way of fadd/fsub to handle overflow/underflow if OE/UE is set (i.e. function that receives a value to add/subtract from the exponent if an overflow/underflow occurs). Signed-off-by: Lucas Mateus Castro (alqotel) --- fpu/softfloat.c | 30 ++ include/fpu/softfloat.h | 1 + target/ppc/fpu_helper.c | 5 - 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index a407129dcb..e2b4ad4b63 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2212,6 +2212,36 @@ float64_mul(float64 a, float64 b, float_status *s) f64_is_zon2, f64_addsubmul_post); } +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_mul_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, + float_status *s) +{ +FloatParts64 pa, pb, *pr; + +float64_unpack_canonical(, a, s); +float64_unpack_canonical(, b, s); +pr = parts_mul(, , s); + +if (unlikely(oe_sub && (pr->exp > 1023))) { +pr->exp -= oe_sub; +float_raise(float_flag_overflow, s); +} else if (unlikely(ue_sum && (pr->exp < -1022))) { +pr->exp += ue_sum; +float_raise(float_flag_underflow, s); +} + +return float64_round_pack_canonical(pr, s); +} + +float64 QEMU_FLATTEN +float64_mul_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, +float_status *status) +{ +return float64_gen2_excp(a, b, oe_sub, ue_sum, status, + hard_f64_mul, soft_f64_mul, soft_f64_mul_excp_en, + f64_is_zon2, f64_addsubmul_post); +} + float64 float64r32_mul(float64 a, float64 b, float_status *status) { FloatParts64 pa, pb, *pr; diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index 76bf628a29..4ff56b0e10 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -776,6 +776,7 @@ float64 float64_add_excp_en(float64, float64, int, int, float_status *status); float64 float64_sub(float64, float64, float_status *status); float64 float64_sub_excp_en(float64, float64, int, int, float_status *status); float64 float64_mul(float64, float64, float_status *status); +float64 
float64_mul_excp_en(float64, float64, int, int, float_status *status); float64 float64_div(float64, float64, float_status *status); float64 float64_rem(float64, float64, float_status *status); float64 float64_muladd(float64, float64, float64, int, float_status *status); diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index cb82c91340..18cf720743 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -595,7 +595,10 @@ static void float_invalid_op_mul(CPUPPCState *env, int flags, /* fmul - fmul. */ float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2) { -float64 ret = float64_mul(arg1, arg2, >fp_status); +int oe_sub = (FP_OE & env->fpscr) ? 1536 : 0; +int ue_sum = (FP_UE & env->fpscr) ? 1536 : 0; +float64 ret = float64_mul_excp_en(arg1, arg2, oe_sub, ue_sum, + >fp_status); int flags = get_float_exception_flags(>fp_status); if (unlikely(flags & float_flag_invalid)) { -- 2.31.1
[RFC PATCH 1/3] target/ppc: Bugfix fadd/fsub result with OE/UE set
From: "Lucas Mateus Castro (alqotel)" As mentioned in the functions float_overflow_excp and float_underflow_excp, the result should be adjusted as mentioned in the ISA (subtracted 192/1536 from the exponent of the intermediate result if an overflow occurs with OE set and added 192/1536 to the exponent of the intermediate result if an underflow occurs with UE set), but at those functions the result has already been rounded so it is not possible to add/subtract from the intermediate result anymore. This patch creates a new function that receives the value that should be subtracted/added from the exponent if an overflow/underflow happens, to not leave some arbitrary numbers from the PowerISA in the middle of the FPU code. If these numbers are 0 the new functions just call the old ones. I used 2 values here for overflow and underflow, maybe it'd be better to just use the same ones, any thoughts? Signed-off-by: Lucas Mateus Castro (alqotel) --- An alternative I've thought was to always return the value adjusted if a overflow or underflow occurs and in float_underflow_excp and float_overflow_excp adjust it to inf/den/0 if OE/UE is 0, but I didn't saw many advantages to that approach. 
--- fpu/softfloat.c | 75 + include/fpu/softfloat.h | 2 ++ target/ppc/fpu_helper.c | 10 -- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 4a871ef2a1..a407129dcb 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -268,6 +268,8 @@ typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); +typedef float64 (*soft_f64_op2_int2_fn)(float64 a, float64 b, int c, int d, +float_status *s); typedef float (*hard_f32_op2_fn)(float a, float b); typedef double (*hard_f64_op2_fn)(double a, double b); @@ -401,6 +403,19 @@ float64_gen2(float64 xa, float64 xb, float_status *s, return soft(ua.s, ub.s, s); } +static inline float64 +float64_gen2_excp(float64 xa, float64 xb, int xc, int xd, float_status *s, + hard_f64_op2_fn hard, soft_f64_op2_fn soft, + soft_f64_op2_int2_fn soft_excp, f64_check_fn pre, + f64_check_fn post) +{ +if (xc || xd) { +return soft_excp(xa, xb, xc, xd, s); +} else { +return float64_gen2(xa, xb, s, hard, soft, pre, post); +} +} + /* * Classify a floating point number. Everything above float_class_qnan * is a NaN so cls >= float_class_qnan is any NaN. 
@@ -1929,6 +1944,39 @@ static double hard_f64_sub(double a, double b) return a - b; } +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_addsub_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, +float_status *status, bool subtract) +{ +FloatParts64 pa, pb, *pr; + +float64_unpack_canonical(, a, status); +float64_unpack_canonical(, b, status); +pr = parts_addsub(, , status, subtract); + +if (unlikely(oe_sub && (pr->exp > 1023))) { +pr->exp -= oe_sub; +float_raise(float_flag_overflow, status); +} else if (unlikely(ue_sum && (pr->exp < -1022))) { +pr->exp += ue_sum; +float_raise(float_flag_underflow, status); +} + +return float64_round_pack_canonical(pr, status); +} + +static float64 soft_f64_add_excp_en(float64 a, float64 b, int oe_sub, +int ue_sum, float_status *status) +{ +return soft_f64_addsub_excp_en(a, b, oe_sub, ue_sum, status, false); +} + +static float64 soft_f64_sub_excp_en(float64 a, float64 b, int oe_sub, +int ue_sum, float_status *status) +{ +return soft_f64_addsub_excp_en(a, b, oe_sub, ue_sum, status, true); +} + static bool f32_addsubmul_post(union_float32 a, union_float32 b) { if (QEMU_HARDFLOAT_2F32_USE_FP) { @@ -1960,6 +2008,15 @@ static float64 float64_addsub(float64 a, float64 b, float_status *s, f64_is_zon2, f64_addsubmul_post); } +static float64 float64_addsub_excp_en(float64 a, float64 b, int oe_sum, + int ue_sub, float_status *s, + hard_f64_op2_fn hard, soft_f64_op2_fn soft, + soft_f64_op2_int2_fn soft_excp) +{ +return float64_gen2_excp(a, b, oe_sum, ue_sub, s, hard, soft, soft_excp, + f64_is_zon2, f64_addsubmul_post); +} + float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *s) { @@ -1984,6 +2041,24 @@ float64_sub(float64 a, float64 b, float_status *s) return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); } +float64 QEMU_FLATTEN +float64_add_excp_en(float64 a, float64 b, int oe_sub, int ue_sum, +
[RFC PATCH 8/8] tests/docker: Selective line reading by python script
Building some images failed on ppc64le because the dockerfile tried to install some packages that are only available in x86 and arm64, to solve this while still having those packages be available in those architectures a comment was put before the installation command to instruct the python script into ignoring those lines for some architectures (in this case ppc64le) Overall I'm not a big fan of the way I solved this problem, so I'd like to know if anyone has a better way to make these dockerfilse work in PPC64LE. For context the base images used here are available in PPC64LE but some of the packages installed are not (in alpine's case it's XEN, which is only available to x86 and ARM), so this patch create a ignore_list which is set on a per-architecture basis, and any packages in a dockerfile in this ignore_list will not be copied to the temporary dockerfile used in the docker command. Signed-off-by: Lucas Mateus Castro(alqotel) --- tests/docker/docker.py | 15 --- tests/docker/dockerfiles/alpine.docker | 2 ++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/docker/docker.py b/tests/docker/docker.py index d0af2861b8..9b962d1c78 100755 --- a/tests/docker/docker.py +++ b/tests/docker/docker.py @@ -14,6 +14,7 @@ import os import sys import subprocess +import platform import json import hashlib import atexit @@ -207,8 +208,15 @@ def _read_qemu_dockerfile(img_name): def _dockerfile_preprocess(df): out = "" +ignore_list = [] for l in df.splitlines(): -if len(l.strip()) == 0 or l.startswith("#"): +if len(l.strip()) == 0: +continue +if l.startswith("#"): +if len(l.split()) >= 3: +if l.split()[1] == "ignore": +if platform.processor() in l.split()[2].split(','): +ignore_list += l.split()[3].split(',') continue from_pref = "FROM qemu/" if l.startswith(from_pref): @@ -219,7 +227,8 @@ def _dockerfile_preprocess(df): inlining = _read_qemu_dockerfile(l[len(from_pref):]) out += _dockerfile_preprocess(inlining) continue -out += l + "\n" +if not any(x in 
l.split() for x in ignore_list): +out += l + "\n" return out @@ -330,7 +339,7 @@ def build_image(self, tag, docker_dir, dockerfile, tmp_df = tempfile.NamedTemporaryFile(mode="w+t", encoding='utf-8', dir=docker_dir, suffix=".docker") -tmp_df.write(dockerfile) +tmp_df.write(_dockerfile_preprocess(dockerfile)) if user: uid = os.getuid() diff --git a/tests/docker/dockerfiles/alpine.docker b/tests/docker/dockerfiles/alpine.docker index 2943a99730..5cec46d8f2 100644 --- a/tests/docker/dockerfiles/alpine.docker +++ b/tests/docker/dockerfiles/alpine.docker @@ -6,6 +6,8 @@ FROM docker.io/library/alpine:edge +# Lines to by ignored when this file is read by the python script +# ignore ppc64le,ppc64 xen-dev RUN apk update && \ apk upgrade && \ apk add \ -- 2.25.1
[PATCH 6/8] scripts/ci/setup: Add Fedora to build-environment.yml
Minicloud doesn't have a RHEL image, but it does have Fedora 34 and 35 images and both use DNF as package manager, so just change the ansible facts to check if it's RHEL or Fedora Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/build-environment.yml | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 43cf8c759f..a7d53d0f70 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -165,8 +165,10 @@ - zlib-devel state: present when: -- ansible_facts['distribution_file_variety'] == 'RedHat' -- ansible_facts['distribution_version'] == '8' +- | + (ansible_facts['distribution'] == 'RedHat' and +ansible_facts['distribution_version'] == '8') or +ansible_facts['distribution'] == 'Fedora' - name: Install packages only available on x86 and aarch64 dnf: @@ -175,6 +177,8 @@ - spice-server state: present when: -- ansible_facts['distribution_file_variety'] == 'RedHat' -- ansible_facts['distribution_version'] == '8' +- | + (ansible_facts['distribution'] == 'RedHat' and +ansible_facts['distribution_version'] == '8') or +ansible_facts['distribution'] == 'Fedora' - ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' -- 2.25.1
[PATCH 2/8] scripts/ci/setup: ninja missing from build-environment
ninja-build is missing from the RHEL environment, so a system prepared with that script would still fail to compile QEMU. Tested on a Fedora 36 Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/build-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 232525b91d..6df3e61d94 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -150,6 +150,7 @@ - libepoxy-devel - libgcrypt-devel - lzo-devel + - ninja-build - make - mesa-libEGL-devel - nettle-devel -- 2.25.1
[PATCH 3/8] scripts/ci/setup: Fix libxen requirements
XEN hypervisor is only available in ARM and x86, but the yaml only checked if the architecture is different from s390x, changed it to a more accurate test. Tested this change on a Ubuntu 20.04 ppc64le. Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/build-environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 6df3e61d94..7535228685 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -97,7 +97,7 @@ state: present when: - ansible_facts['distribution'] == 'Ubuntu' -- ansible_facts['architecture'] != 's390x' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' - name: Install basic packages to build QEMU on Ubuntu 20.04 package: -- 2.25.1
[PATCH 7/8] scripts/ci/setup: Added debian to build-environment.yml
Minicloud has a PPC64 BE Debian11 image which can be used for the CI, so add Debian to the build-environment.yml so it can be configured with ansible-playbook. Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/build-environment.yml | 31 +- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index a7d53d0f70..b5d415496f 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -31,9 +31,11 @@ update_cache: yes upgrade: yes when: -- ansible_facts['distribution'] == 'Ubuntu' +- | +ansible_facts['distribution'] == 'Ubuntu' or +ansible_facts['distribution'] == 'Debian' -- name: Install basic packages to build QEMU on Ubuntu 20.04 +- name: Install basic packages to build QEMU on Ubuntu 20.04 or Debian11 package: name: - ccache @@ -56,7 +58,6 @@ - libibverbs-dev - libiscsi-dev - libjemalloc-dev - - libjpeg-turbo8-dev - liblzo2-dev - libncurses5-dev - libncursesw5-dev @@ -86,17 +87,37 @@ - sparse - xfslibs-dev state: present + when: +- | +ansible_facts['distribution'] == 'Ubuntu' or +ansible_facts['distribution'] == 'Debian' + +- name: Install Ubuntu exclusive packages to build QEMU + package: +name: + - libjpeg-turbo8-dev +state: present when: - ansible_facts['distribution'] == 'Ubuntu' -- name: Install packages to build QEMU on Ubuntu 20.04 on non-s390x +- name: Install Debian exclusive packages to build QEMU + package: +name: + - libjpeg62-turbo-dev +state: present + when: +- ansible_facts['distribution'] == 'Debian' + +- name: Install packages to build QEMU on Ubuntu 20.04 or Debian11 on non-s390x package: name: - libspice-server-dev - libxen-dev state: present when: -- ansible_facts['distribution'] == 'Ubuntu' +- | +ansible_facts['distribution'] == 'Ubuntu' or +ansible_facts['distribution'] == 'Debian' - ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' - name: Install basic packages to 
build QEMU on Ubuntu 20.04 -- 2.25.1
[PATCH 5/8] scripts/ci/setup: Add ppc64le to vars.yml template
Added ppc64le so that the gitlab-runner.yml could be used to set up ppc64le runners. Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/vars.yml.template | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/setup/vars.yml.template b/scripts/ci/setup/vars.yml.template index e48089761f..2c84698b87 100644 --- a/scripts/ci/setup/vars.yml.template +++ b/scripts/ci/setup/vars.yml.template @@ -8,5 +8,6 @@ ansible_to_gitlab_arch: x86_64: amd64 aarch64: arm64 s390x: s390x + ppc64le: ppc64le # A unique token made available by GitLab to your project for registering runners gitlab_runner_registration_token: PLEASE_PROVIDE_A_VALID_TOKEN -- 2.25.1
[PATCH 1/8] tests/docker: Fix alpine dockerfile
Currently the run script uses 'readlink -e' but the image only has the busybox readlink, so this commit adds the coreutils package which contains the readlink with the '-e' option. Signed-off-by: Lucas Mateus Castro(alqotel) --- tests/docker/dockerfiles/alpine.docker | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/docker/dockerfiles/alpine.docker b/tests/docker/dockerfiles/alpine.docker index 3f4c0f95cb..2943a99730 100644 --- a/tests/docker/dockerfiles/alpine.docker +++ b/tests/docker/dockerfiles/alpine.docker @@ -21,6 +21,7 @@ RUN apk update && \ cdrkit \ ceph-dev \ clang \ +coreutils \ ctags \ curl-dev \ cyrus-sasl-dev \ -- 2.25.1
[PATCH 4/8] scripts/ci/setup: spice-server only on x86 aarch64
Changed build-environment.yml to only install spice-server on x86_64 and aarch64 as this package is only available on those architectures. Signed-off-by: Lucas Mateus Castro(alqotel) --- scripts/ci/setup/build-environment.yml | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/ci/setup/build-environment.yml b/scripts/ci/setup/build-environment.yml index 7535228685..43cf8c759f 100644 --- a/scripts/ci/setup/build-environment.yml +++ b/scripts/ci/setup/build-environment.yml @@ -160,7 +160,6 @@ - python36 - rdma-core-devel - spice-glib-devel - - spice-server - systemtap-sdt-devel - tar - zlib-devel @@ -168,3 +167,14 @@ when: - ansible_facts['distribution_file_variety'] == 'RedHat' - ansible_facts['distribution_version'] == '8' + +- name: Install packages only available on x86 and aarch64 + dnf: +# Spice server not available in ppc64le +name: + - spice-server +state: present + when: +- ansible_facts['distribution_file_variety'] == 'RedHat' +- ansible_facts['distribution_version'] == '8' +- ansible_facts['architecture'] == 'aarch64' or ansible_facts['architecture'] == 'x86_64' -- 2.25.1
[PATCH 0/8] Patch series to set up a ppc64le CI
This patch series aims to make it easier to set up a compilation and CI environment on PPC64 and PPC64LE machines. The first 2 patches are fixes not related to ppc64. Patches 3 and 4 also affect some other architectures. Patches 5 to 7 are adding Power specific additions. Patch 8 is an RFC for a current way to run the docker tests in PPC64LE. Lucas Mateus Castro(alqotel) (8): tests/docker: Fix alpine dockerfile scripts/ci/setup: ninja missing from build-environment scripts/ci/setup: Fix libxen requirements scripts/ci/setup: spice-server only on x86 aarch64 scripts/ci/setup: Add ppc64le to vars.yml template scripts/ci/setup: Add Fedora to build-environment.yml scripts/ci/setup: Added debian to build-environment.yml tests/docker: Selective line reading by python script scripts/ci/setup/build-environment.yml | 54 +- scripts/ci/setup/vars.yml.template | 1 + tests/docker/docker.py | 15 +-- tests/docker/dockerfiles/alpine.docker | 3 ++ 4 files changed, 61 insertions(+), 12 deletions(-) -- 2.25.1
[RFC PATCH RESEND] scripts/checkpatch.pl: Change line limit warning
The QEMU documentation mentions that lines should be up to 80 characters and that the script checkpatch will warn at 100 characters, but the script warns at 80 characters and throws an error at 90, so this commit changes it to warn at 100. As to why extend, the argument that resulted in the change of the docs was that trying to always wrap to 80 columns could result in less readable code, so sometimes not wrapping was the better choice and in those circumstances checkpatch could nudge people into creating less readable code. An error limit of 132 is put in place to catch overly big lines. Based-on: 20201105154208.12442-1-ganqi...@huawei.com Signed-off-by: Lucas Mateus Castro(alqotel) --- Currently there's a disagreement between the checkpatch code and the documentation, this RFC just changes the checkpatch to match the documentation. But there was a discussion in 2020 as to the best way to deal with this, some alternatives mentioned are: change the warning to remind people to not blindly wrap just because of the warning, change to warn at 90 columns (which would mean changing the column limit for the error as well). If any of those are preferred I'll send a next version updating the documentation as well as changing checkpatch.pl to the preferred behavior. --- scripts/checkpatch.pl | 15 +-- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d900d18048..2c2d7b31ab 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1639,12 +1639,12 @@ sub process { if ($line =~ /^\+/ && !($line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) && !($rawline =~ /^[^[:alnum:]]*https?:\S*$/) && - $length > 80) + $length > 100) { - if ($length > 90) { - ERROR("line over 90 characters\n" . $herecurr); + if ($length > 132) { + ERROR("line over 132 characters\n" . $herecurr); } else { - WARN("line over 80 characters\n" . $herecurr); + WARN("line over 100 characters\n" . 
$herecurr); } } @@ -1838,13 +1838,8 @@ sub process { #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; - # The length of the "previous line" is checked against 80 because it - # includes the + at the beginning of the line (if the actual line has - # 79 or 80 characters, it is no longer possible to add a space and an - # opening brace there) if ($#ctx == 0 && $ctx !~ /{\s*/ && - defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] =~ /^\+\s*\{/ && - defined($lines[$ctx_ln - 2]) && length($lines[$ctx_ln - 2]) < 80) { + defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] =~ /^\+\s*\{/) { ERROR("that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } -- 2.25.1
[PATCH RESEND v3 8/8] target/ppc: Implemented vector module quadword
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vmodsq: Vector Modulo Signed Quadword vmoduq: Vector Modulo Unsigned Quadword Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson Resolves: https://gitlab.com/qemu-project/qemu/-/issues/744 --- target/ppc/helper.h | 2 ++ target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 21 + target/ppc/translate/vmx-impl.c.inc | 2 ++ 4 files changed, 27 insertions(+) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index e7624300df..d627cfe6ed 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -181,6 +181,8 @@ DEF_HELPER_FLAGS_3(VDIVESD, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VDIVEUD, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VDIVESQ, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VDIVEUQ, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VMODSQ, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VMODUQ, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 75fa206b39..6ea48d5163 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -807,3 +807,5 @@ VMODSW 000100 . . . 0001011@VX VMODUW 000100 . . . 11010001011@VX VMODSD 000100 . . . 1001011@VX VMODUD 000100 . . . 11011001011@VX +VMODSQ 000100 . . . 1111011@VX +VMODUQ 000100 . . . 
1101011@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 42f0dcfc52..16357c0900 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1247,6 +1247,27 @@ void helper_VDIVEUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) } } +void helper_VMODSQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +Int128 neg1 = int128_makes64(-1); +Int128 int128_min = int128_make128(0, INT64_MIN); +if (likely(int128_nz(b->s128) && + (int128_ne(a->s128, int128_min) || int128_ne(b->s128, neg1 { +t->s128 = int128_rems(a->s128, b->s128); +} else { +t->s128 = int128_zero(); /* Undefined behavior */ +} +} + +void helper_VMODUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +if (likely(int128_nz(b->s128))) { +t->s128 = int128_remu(a->s128, b->s128); +} else { +t->s128 = int128_zero(); /* Undefined behavior */ +} +} + void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { ppc_avr_t result; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 78277fb018..0b563bed37 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3381,6 +3381,8 @@ TRANS_FLAGS2(ISA310, VMODSW, do_vdiv_vmod, MO_32, do_modsw , NULL) TRANS_FLAGS2(ISA310, VMODUW, do_vdiv_vmod, MO_32, do_moduw, NULL) TRANS_FLAGS2(ISA310, VMODSD, do_vdiv_vmod, MO_64, NULL, do_modsd) TRANS_FLAGS2(ISA310, VMODUD, do_vdiv_vmod, MO_64, NULL, do_modud) +TRANS_FLAGS2(ISA310, VMODSQ, do_vx_helper, gen_helper_VMODSQ) +TRANS_FLAGS2(ISA310, VMODUQ, do_vx_helper, gen_helper_VMODUQ) #undef DIVS32 #undef DIVU32 -- 2.31.1
[PATCH RESEND v3 4/8] host-utils: Implemented unsigned 256-by-128 division
From: "Lucas Mateus Castro (alqotel)" Based on already existing QEMU implementation, created an unsigned 256 bit by 128 bit division needed to implement the vector divide extended unsigned instruction from PowerISA3.1 Signed-off-by: Lucas Mateus Castro (alqotel) --- This patch had received Reviewed-by by Richard Henderson pending on the placemente of clz128 being moved to int128.h, but clz128 ended up being changed to accommodate to int128.h (i.e. the lack of clz64), so out of precaution I'd like to request a review of the clz128 implementation --- include/qemu/host-utils.h | 2 + include/qemu/int128.h | 38 +++ util/host-utils.c | 129 ++ 3 files changed, 169 insertions(+) diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index f19bd29105..9767af7573 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -32,6 +32,7 @@ #include "qemu/compiler.h" #include "qemu/bswap.h" +#include "qemu/int128.h" #ifdef CONFIG_INT128 static inline void mulu64(uint64_t *plow, uint64_t *phigh, @@ -849,4 +850,5 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, #endif } +Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor); #endif diff --git a/include/qemu/int128.h b/include/qemu/int128.h index ef71f56e3f..d2b76ca6ac 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -128,11 +128,21 @@ static inline bool int128_ge(Int128 a, Int128 b) return a >= b; } +static inline bool int128_uge(Int128 a, Int128 b) +{ +return ((__uint128_t)a) >= ((__uint128_t)b); +} + static inline bool int128_lt(Int128 a, Int128 b) { return a < b; } +static inline bool int128_ult(Int128 a, Int128 b) +{ +return (__uint128_t)a < (__uint128_t)b; +} + static inline bool int128_le(Int128 a, Int128 b) { return a <= b; @@ -177,6 +187,15 @@ static inline Int128 bswap128(Int128 a) #endif } +static inline int clz128(Int128 a) +{ +if (a >> 64) { +return __builtin_clzll(a >> 64); +} else { +return (a) ? 
__builtin_clzll((uint64_t)a) + 64 : 128; +} +} + static inline Int128 int128_divu(Int128 a, Int128 b) { return (__uint128_t)a / (__uint128_t)b; @@ -373,11 +392,21 @@ static inline bool int128_ge(Int128 a, Int128 b) return a.hi > b.hi || (a.hi == b.hi && a.lo >= b.lo); } +static inline bool int128_uge(Int128 a, Int128 b) +{ +return (uint64_t)a.hi > (uint64_t)b.hi || (a.hi == b.hi && a.lo >= b.lo); +} + static inline bool int128_lt(Int128 a, Int128 b) { return !int128_ge(a, b); } +static inline bool int128_ult(Int128 a, Int128 b) +{ +return !int128_uge(a, b); +} + static inline bool int128_le(Int128 a, Int128 b) { return int128_ge(b, a); @@ -418,6 +447,15 @@ static inline Int128 bswap128(Int128 a) return int128_make128(bswap64(a.hi), bswap64(a.lo)); } +static inline int clz128(Int128 a) +{ +if (a.hi) { +return __builtin_clzll(a.hi); +} else { +return (a.lo) ? __builtin_clzll(a.lo) + 64 : 128; +} +} + Int128 int128_divu(Int128, Int128); Int128 int128_remu(Int128, Int128); Int128 int128_divs(Int128, Int128); diff --git a/util/host-utils.c b/util/host-utils.c index 96d5dc0bed..93dfb1b6ab 100644 --- a/util/host-utils.c +++ b/util/host-utils.c @@ -266,3 +266,132 @@ void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow) *plow = *plow << shift; } } + +/* + * Unsigned 256-by-128 division. + * Returns the remainder via r. + * Returns lower 128 bit of quotient. + * Needs a normalized divisor (most significant bit set to 1). 
+ * + * Adapted from include/qemu/host-utils.h udiv_qrnnd, + * from the GNU Multi Precision Library - longlong.h __udiv_qrnnd + * (https://gmplib.org/repo/gmp/file/tip/longlong.h) + * + * Licensed under the GPLv2/LGPLv3 + */ +static Int128 udiv256_qrnnd(Int128 *r, Int128 n1, Int128 n0, Int128 d) +{ +Int128 d0, d1, q0, q1, r1, r0, m; +uint64_t mp0, mp1; + +d0 = int128_make64(int128_getlo(d)); +d1 = int128_make64(int128_gethi(d)); + +r1 = int128_remu(n1, d1); +q1 = int128_divu(n1, d1); +mp0 = int128_getlo(q1); +mp1 = int128_gethi(q1); +mulu128(, , int128_getlo(d0)); +m = int128_make128(mp0, mp1); +r1 = int128_make128(int128_gethi(n0), int128_getlo(r1)); +if (int128_ult(r1, m)) { +q1 = int128_sub(q1, int128_one()); +r1 = int128_add(r1, d); +if (int128_uge(r1, d)) { +if (int128_ult(r1, m)) { +q1 = int128_sub(q1, int128_one()); +r1 = int128_add(r1, d); +} +} +} +r1 = int128_sub(r1, m); + +r0 = int128_remu(r1, d1); +q0 = int128_divu(r1, d1); +mp0 = int128_getlo(q0); +mp1 = int128_gethi(q0); +mulu128(, , int128_getlo(d0)); +m = int128_make128(mp0, mp1); +r0 = int128_m
[PATCH RESEND v3 3/8] target/ppc: Implemented vector divide extended word
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vdivesw: Vector Divide Extended Signed Word vdiveuw: Vector Divide Extended Unsigned Word Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/insn32.decode| 3 ++ target/ppc/translate/vmx-impl.c.inc | 48 + 2 files changed, 51 insertions(+) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 01bfde8c5e..f6d2d4b257 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -795,3 +795,6 @@ VDIVSD 000100 . . . 00111001011@VX VDIVUD 000100 . . . 00011001011@VX VDIVSQ 000100 . . . 0011011@VX VDIVUQ 000100 . . . 0001011@VX + +VDIVESW 000100 . . . 01110001011@VX +VDIVEUW 000100 . . . 01010001011@VX diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 22572e6a79..8c542bcb29 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3320,6 +3320,54 @@ TRANS_FLAGS2(ISA310, VDIVUD, do_vdiv_vmod, MO_64, NULL, do_divud) TRANS_FLAGS2(ISA310, VDIVSQ, do_vx_helper, gen_helper_VDIVSQ) TRANS_FLAGS2(ISA310, VDIVUQ, do_vx_helper, gen_helper_VDIVUQ) +static void do_dives_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b) +{ +TCGv_i64 val1, val2; + +val1 = tcg_temp_new_i64(); +val2 = tcg_temp_new_i64(); + +tcg_gen_ext_i32_i64(val1, a); +tcg_gen_ext_i32_i64(val2, b); + +/* (a << 32)/b */ +tcg_gen_shli_i64(val1, val1, 32); +tcg_gen_div_i64(val1, val1, val2); + +/* if quotient doesn't fit in 32 bits the result is undefined */ +tcg_gen_extrl_i64_i32(t, val1); + +tcg_temp_free_i64(val1); +tcg_temp_free_i64(val2); +} + +static void do_diveu_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b) +{ +TCGv_i64 val1, val2; + +val1 = tcg_temp_new_i64(); +val2 = tcg_temp_new_i64(); + +tcg_gen_extu_i32_i64(val1, a); +tcg_gen_extu_i32_i64(val2, b); + +/* (a << 32)/b */ +tcg_gen_shli_i64(val1, val1, 32); +tcg_gen_divu_i64(val1, val1, val2); + +/* if quotient doesn't fit in 32 bits the result is undefined */ 
+tcg_gen_extrl_i64_i32(t, val1); + +tcg_temp_free_i64(val1); +tcg_temp_free_i64(val2); +} + +DIVS32(do_divesw, do_dives_i32) +DIVU32(do_diveuw, do_diveu_i32) + +TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL) +TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL) + #undef DIVS32 #undef DIVU32 #undef DIVS64 -- 2.31.1
[PATCH RESEND v3 7/8] target/ppc: Implemented vector module word/doubleword
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vmodsw: Vector Modulo Signed Word vmoduw: Vector Modulo Unsigned Word vmodsd: Vector Modulo Signed Doubleword vmodud: Vector Modulo Unsigned Doubleword Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/insn32.decode| 5 + target/ppc/translate/vmx-impl.c.inc | 10 ++ 2 files changed, 15 insertions(+) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 5b2d7824a0..75fa206b39 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -802,3 +802,8 @@ VDIVESD 000100 . . . 0001011@VX VDIVEUD 000100 . . . 01011001011@VX VDIVESQ 000100 . . . 0111011@VX VDIVEUQ 000100 . . . 0101011@VX + +VMODSW 000100 . . . 0001011@VX +VMODUW 000100 . . . 11010001011@VX +VMODSD 000100 . . . 1001011@VX +VMODUD 000100 . . . 11011001011@VX diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index f00aa64bf9..78277fb018 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3365,6 +3365,11 @@ static void do_diveu_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b) DIVS32(do_divesw, do_dives_i32) DIVU32(do_diveuw, do_diveu_i32) +DIVS32(do_modsw, tcg_gen_rem_i32) +DIVU32(do_moduw, tcg_gen_remu_i32) +DIVS64(do_modsd, tcg_gen_rem_i64) +DIVU64(do_modud, tcg_gen_remu_i64) + TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL) TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL) TRANS_FLAGS2(ISA310, VDIVESD, do_vx_helper, gen_helper_VDIVESD) @@ -3372,6 +3377,11 @@ TRANS_FLAGS2(ISA310, VDIVEUD, do_vx_helper, gen_helper_VDIVEUD) TRANS_FLAGS2(ISA310, VDIVESQ, do_vx_helper, gen_helper_VDIVESQ) TRANS_FLAGS2(ISA310, VDIVEUQ, do_vx_helper, gen_helper_VDIVEUQ) +TRANS_FLAGS2(ISA310, VMODSW, do_vdiv_vmod, MO_32, do_modsw , NULL) +TRANS_FLAGS2(ISA310, VMODUW, do_vdiv_vmod, MO_32, do_moduw, NULL) +TRANS_FLAGS2(ISA310, VMODSD, do_vdiv_vmod, MO_64, NULL, do_modsd) +TRANS_FLAGS2(ISA310, VMODUD, 
do_vdiv_vmod, MO_64, NULL, do_modud) + #undef DIVS32 #undef DIVU32 #undef DIVS64 -- 2.31.1
[PATCH RESEND v3 5/8] host-utils: Implemented signed 256-by-128 division
From: "Lucas Mateus Castro (alqotel)" Based on already existing QEMU implementation created a signed 256 bit by 128 bit division needed to implement the vector divide extended signed quadword instruction from PowerISA 3.1 Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- include/qemu/host-utils.h | 1 + util/host-utils.c | 51 +++ 2 files changed, 52 insertions(+) diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index 9767af7573..bc743f5e32 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -851,4 +851,5 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, } Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor); +Int128 divs256(Int128 *plow, Int128 *phigh, Int128 divisor); #endif diff --git a/util/host-utils.c b/util/host-utils.c index 93dfb1b6ab..fb91bcba82 100644 --- a/util/host-utils.c +++ b/util/host-utils.c @@ -395,3 +395,54 @@ Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor) return rem; } } + +/* + * Signed 256-by-128 division. + * Returns quotient via plow and phigh. + * Also returns the remainder via the function return value. 
+ */ +Int128 divs256(Int128 *plow, Int128 *phigh, Int128 divisor) +{ +bool neg_quotient = false, neg_remainder = false; +Int128 unsig_hi = *phigh, unsig_lo = *plow; +Int128 rem; + +if (!int128_nonneg(*phigh)) { +neg_quotient = !neg_quotient; +neg_remainder = !neg_remainder; + +if (!int128_nz(unsig_lo)) { +unsig_hi = int128_neg(unsig_hi); +} else { +unsig_hi = int128_not(unsig_hi); +unsig_lo = int128_neg(unsig_lo); +} +} + +if (!int128_nonneg(divisor)) { +neg_quotient = !neg_quotient; + +divisor = int128_neg(divisor); +} + +rem = divu256(_lo, _hi, divisor); + +if (neg_quotient) { +if (!int128_nz(unsig_lo)) { +*phigh = int128_neg(unsig_hi); +*plow = int128_zero(); +} else { +*phigh = int128_not(unsig_hi); +*plow = int128_neg(unsig_lo); +} +} else { +*phigh = unsig_hi; +*plow = unsig_lo; +} + +if (neg_remainder) { +return int128_neg(rem); +} else { +return rem; +} +} -- 2.31.1
[PATCH RESEND v3 1/8] target/ppc: Implemented vector divide instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vdivsw: Vector Divide Signed Word vdivuw: Vector Divide Unsigned Word vdivsd: Vector Divide Signed Doubleword vdivud: Vector Divide Unsigned Doubleword Signed-off-by: Lucas Mateus Castro (alqotel) --- target/ppc/insn32.decode| 7 +++ target/ppc/translate/vmx-impl.c.inc | 85 + 2 files changed, 92 insertions(+) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 18a94fa3b5..6df405e398 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -786,3 +786,10 @@ XVF64GERPP 111011 ... -- 0 . 00111010 ..- @XX3_at xa=%xx_xa_pair XVF64GERPN 111011 ... -- 0 . 10111010 ..- @XX3_at xa=%xx_xa_pair XVF64GERNP 111011 ... -- 0 . 0010 ..- @XX3_at xa=%xx_xa_pair XVF64GERNN 111011 ... -- 0 . 1010 ..- @XX3_at xa=%xx_xa_pair + +## Vector Division Instructions + +VDIVSW 000100 . . . 00110001011@VX +VDIVUW 000100 . . . 00010001011@VX +VDIVSD 000100 . . . 00111001011@VX +VDIVUD 000100 . . . 
00011001011@VX diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index d7524c3204..4c0b1a32ec 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3238,6 +3238,91 @@ TRANS(VMULHSD, do_vx_mulh, true , do_vx_vmulhd_i64) TRANS(VMULHUW, do_vx_mulh, false, do_vx_vmulhw_i64) TRANS(VMULHUD, do_vx_mulh, false, do_vx_vmulhd_i64) +static bool do_vdiv_vmod(DisasContext *ctx, arg_VX *a, const int vece, + void (*func_32)(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b), + void (*func_64)(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)) +{ +const GVecGen3 op = { +.fni4 = func_32, +.fni8 = func_64, +.vece = vece +}; + +REQUIRE_VECTOR(ctx); + +tcg_gen_gvec_3(avr_full_offset(a->vrt), avr_full_offset(a->vra), + avr_full_offset(a->vrb), 16, 16, ); + +return true; +} + +#define DIVU32(NAME, DIV) \ +static void NAME(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)\ +{ \ +TCGv_i32 zero = tcg_constant_i32(0);\ +TCGv_i32 one = tcg_constant_i32(1); \ +tcg_gen_movcond_i32(TCG_COND_EQ, b, b, zero, one, b); \ +DIV(t, a, b); \ +} + +#define DIVS32(NAME, DIV) \ +static void NAME(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)\ +{ \ +TCGv_i32 t0 = tcg_temp_new_i32(); \ +TCGv_i32 t1 = tcg_temp_new_i32(); \ +tcg_gen_setcondi_i32(TCG_COND_EQ, t0, a, INT32_MIN);\ +tcg_gen_setcondi_i32(TCG_COND_EQ, t1, b, -1); \ +tcg_gen_and_i32(t0, t0, t1);\ +tcg_gen_setcondi_i32(TCG_COND_EQ, t1, b, 0);\ +tcg_gen_or_i32(t0, t0, t1); \ +tcg_gen_movi_i32(t1, 0);\ +tcg_gen_movcond_i32(TCG_COND_NE, b, t0, t1, t0, b); \ +DIV(t, a, b); \ +tcg_temp_free_i32(t0); \ +tcg_temp_free_i32(t1); \ +} + +#define DIVU64(NAME, DIV) \ +static void NAME(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)\ +{ \ +TCGv_i64 zero = tcg_constant_i64(0);\ +TCGv_i64 one = tcg_constant_i64(1); \ +tcg_gen_movcond_i64(TCG_COND_EQ, b, b, zero, one, b); \ +DIV(t, a, b); \ +} + +#define DIVS64(NAME, DIV) \ +static void NAME(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)\ +{ \ +TCGv_i64 t0 = tcg_temp_new_i64(); \ +TCGv_i64 t1 = tcg_temp_new_i64(); 
\ +tcg_gen_setcondi_i64(TCG_COND_EQ, t0, a, INT64_MIN);\ +tcg_gen_setcondi_i64(TCG_COND_EQ, t1, b, -1); \ +tcg_gen_and_i64(t0, t0, t1);\ +tcg_gen_setcondi_i64(TC
[PATCH RESEND v3 2/8] target/ppc: Implemented vector divide quadword
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vdivsq: Vector Divide Signed Quadword vdivuq: Vector Divide Unsigned Quadword Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 2 ++ target/ppc/insn32.decode| 2 ++ target/ppc/int_helper.c | 21 + target/ppc/translate/vmx-impl.c.inc | 2 ++ 4 files changed, 27 insertions(+) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 6233e28d85..9f33e589e0 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -175,6 +175,8 @@ DEF_HELPER_FLAGS_3(VMULOSW, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VMULOUB, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VMULOUH, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VMULOUW, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVSQ, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVUQ, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 6df405e398..01bfde8c5e 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -793,3 +793,5 @@ VDIVSW 000100 . . . 00110001011@VX VDIVUW 000100 . . . 00010001011@VX VDIVSD 000100 . . . 00111001011@VX VDIVUD 000100 . . . 00011001011@VX +VDIVSQ 000100 . . . 0011011@VX +VDIVUQ 000100 . . . 
0001011@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 105b626d1b..033718dc0e 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1162,6 +1162,27 @@ void helper_XXPERMX(ppc_vsr_t *t, ppc_vsr_t *s0, ppc_vsr_t *s1, ppc_vsr_t *pcv, *t = tmp; } +void helper_VDIVSQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +Int128 neg1 = int128_makes64(-1); +Int128 int128_min = int128_make128(0, INT64_MIN); +if (likely(int128_nz(b->s128) && + (int128_ne(a->s128, int128_min) || int128_ne(b->s128, neg1 { +t->s128 = int128_divs(a->s128, b->s128); +} else { +t->s128 = a->s128; /* Undefined behavior */ +} +} + +void helper_VDIVUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +if (int128_nz(b->s128)) { +t->s128 = int128_divu(a->s128, b->s128); +} else { +t->s128 = a->s128; /* Undefined behavior */ +} +} + void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { ppc_avr_t result; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 4c0b1a32ec..22572e6a79 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3317,6 +3317,8 @@ TRANS_FLAGS2(ISA310, VDIVSW, do_vdiv_vmod, MO_32, do_divsw, NULL) TRANS_FLAGS2(ISA310, VDIVUW, do_vdiv_vmod, MO_32, do_divuw, NULL) TRANS_FLAGS2(ISA310, VDIVSD, do_vdiv_vmod, MO_64, NULL, do_divsd) TRANS_FLAGS2(ISA310, VDIVUD, do_vdiv_vmod, MO_64, NULL, do_divud) +TRANS_FLAGS2(ISA310, VDIVSQ, do_vx_helper, gen_helper_VDIVSQ) +TRANS_FLAGS2(ISA310, VDIVUQ, do_vx_helper, gen_helper_VDIVUQ) #undef DIVS32 #undef DIVU32 -- 2.31.1
[PATCH RESEND v3 6/8] target/ppc: Implemented remaining vector divide extended
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: vdivesd: Vector Divide Extended Signed Doubleword vdiveud: Vector Divide Extended Unsigned Doubleword vdivesq: Vector Divide Extended Signed Quadword vdiveuq: Vector Divide Extended Unsigned Quadword Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/helper.h | 4 ++ target/ppc/insn32.decode| 4 ++ target/ppc/int_helper.c | 64 + target/ppc/translate/vmx-impl.c.inc | 4 ++ 4 files changed, 76 insertions(+) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 9f33e589e0..e7624300df 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -177,6 +177,10 @@ DEF_HELPER_FLAGS_3(VMULOUH, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VMULOUW, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VDIVSQ, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(VDIVUQ, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVESD, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVEUD, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVESQ, TCG_CALL_NO_RWG, void, avr, avr, avr) +DEF_HELPER_FLAGS_3(VDIVEUQ, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index f6d2d4b257..5b2d7824a0 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -798,3 +798,7 @@ VDIVUQ 000100 . . . 0001011@VX VDIVESW 000100 . . . 01110001011@VX VDIVEUW 000100 . . . 01010001011@VX +VDIVESD 000100 . . . 0001011@VX +VDIVEUD 000100 . . . 01011001011@VX +VDIVESQ 000100 . . . 0111011@VX +VDIVEUQ 000100 . . . 
0101011@VX diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 033718dc0e..42f0dcfc52 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1183,6 +1183,70 @@ void helper_VDIVUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) } } +void helper_VDIVESD(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +int i; +int64_t high; +uint64_t low; +for (i = 0; i < 2; i++) { +high = a->s64[i]; +low = 0; +if (unlikely((high == INT64_MIN && b->s64[i] == -1) || !b->s64[i])) { +t->s64[i] = a->s64[i]; /* Undefined behavior */ +} else { +divs128(, , b->s64[i]); +t->s64[i] = low; +} +} +} + +void helper_VDIVEUD(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +int i; +uint64_t high, low; +for (i = 0; i < 2; i++) { +high = a->u64[i]; +low = 0; +if (unlikely(!b->u64[i])) { +t->u64[i] = a->u64[i]; /* Undefined behavior */ +} else { +divu128(, , b->u64[i]); +t->u64[i] = low; +} +} +} + +void helper_VDIVESQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +Int128 high, low; +Int128 int128_min = int128_make128(0, INT64_MIN); +Int128 neg1 = int128_makes64(-1); + +high = a->s128; +low = int128_zero(); +if (unlikely(!int128_nz(b->s128) || + (int128_eq(b->s128, neg1) && int128_eq(high, int128_min { +t->s128 = a->s128; /* Undefined behavior */ +} else { +divs256(, , b->s128); +t->s128 = low; +} +} + +void helper_VDIVEUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b) +{ +Int128 high, low; + +high = a->s128; +low = int128_zero(); +if (unlikely(!int128_nz(b->s128))) { +t->s128 = a->s128; /* Undefined behavior */ +} else { +divu256(, , b->s128); +t->s128 = low; +} +} + void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) { ppc_avr_t result; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 8c542bcb29..f00aa64bf9 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -3367,6 +3367,10 @@ DIVU32(do_diveuw, do_diveu_i32) TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL) 
TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL) +TRANS_FLAGS2(ISA310, VDIVESD, do_vx_helper, gen_helper_VDIVESD) +TRANS_FLAGS2(ISA310, VDIVEUD, do_vx_helper, gen_helper_VDIVEUD) +TRANS_FLAGS2(ISA310, VDIVESQ, do_vx_helper, gen_helper_VDIVESQ) +TRANS_FLAGS2(ISA310, VDIVEUQ, do_vx_helper, gen_helper_VDIVEUQ) #undef DIVS32 #undef DIVU32 -- 2.31.1
[PATCH v6 8/8] linux-user: Add PowerPC ISA 3.1 and MMA to hwcap
From: Joel Stanley These are new hwcap bits added for power10. Signed-off-by: Joel Stanley Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- linux-user/elfload.c | 4 1 file changed, 4 insertions(+) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 61063fd974..0908692e62 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -779,6 +779,8 @@ enum { QEMU_PPC_FEATURE2_DARN = 0x0020, /* darn random number insn */ QEMU_PPC_FEATURE2_SCV = 0x0010, /* scv syscall */ QEMU_PPC_FEATURE2_HTM_NO_SUSPEND = 0x0008, /* TM w/o suspended state */ +QEMU_PPC_FEATURE2_ARCH_3_1 = 0x0004, /* ISA 3.1 */ +QEMU_PPC_FEATURE2_MMA = 0x0002, /* Matrix-Multiply Assist */ }; #define ELF_HWCAP get_elf_hwcap() @@ -836,6 +838,8 @@ static uint32_t get_elf_hwcap2(void) QEMU_PPC_FEATURE2_VEC_CRYPTO); GET_FEATURE2(PPC2_ISA300, QEMU_PPC_FEATURE2_ARCH_3_00 | QEMU_PPC_FEATURE2_DARN | QEMU_PPC_FEATURE2_HAS_IEEE128); +GET_FEATURE2(PPC2_ISA310, QEMU_PPC_FEATURE2_ARCH_3_1 | + QEMU_PPC_FEATURE2_MMA); #undef GET_FEATURE #undef GET_FEATURE2 -- 2.31.1
[PATCH v6 7/8] target/ppc: Implemented [pm]xvbf16ger2*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvbf16ger2: VSX Vector bfloat16 GER (rank-2 update) xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate pmxvbf16ger2: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 40 + target/ppc/helper.h | 5 target/ppc/insn32.decode| 6 + target/ppc/insn64.decode| 11 target/ppc/translate/vsx-impl.c.inc | 12 + 5 files changed, 74 insertions(+) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index a9b2ef370f..fed0ce420a 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3517,6 +3517,11 @@ static float64 extract_hf16(float16 in, float_status *fp_status) return float16_to_float64(in, true, fp_status); } +static float64 extract_bf16(bfloat16 in, float_status *fp_status) +{ +return bfloat16_to_float64(in, fp_status); +} + static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask, bool acc, bool neg_mul, bool neg_acc, extract_f16 extract) @@ -3639,6 +3644,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, vsxger_excp(env, GETPC()); } 
+QEMU_FLATTEN +void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, false, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, true, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, true, extract_bf16); +} + QEMU_FLATTEN void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 4070c0891c..6233e28d85 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 
b8e317159c..18a94fa3b5 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -763,6 +763,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..- @XX3_at xa=%xx_xa XVI16GER2S 111011 ... -- . . 00101011 ..- @XX3_at xa=%xx_xa XVI16GER2SPP111011 ... -- . . 00101010 ..- @XX3_at xa=%xx_xa +XVBF16GER2 111011 ... -- . . 00110011 ..- @XX3_at xa=%xx_xa +XVBF16GER2PP111011 ... -- . . 00110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2PN111011 ... -- . . 10110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2NP111011 ... -- . . 01110010 ..- @XX3_at xa=%xx_xa +XV
[PATCH v6 3/8] target/ppc: Implemented pmxvi*ger* instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvi4ger8: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) pmxvi4ger8pp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) Positive multiply, Positive accumulate pmxvi8ger4: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) pmxvi8ger4pp: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) Positive multiply, Positive accumulate pmxvi8ger4spp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with Saturate Positive multiply, Positive accumulate pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) pmxvi16ger2pp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) Positive multiply, Positive accumulate pmxvi16ger2s: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 30 + target/ppc/translate/vsx-impl.c.inc | 10 ++ 2 files changed, 40 insertions(+) diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 691e8fe6c0..0eed35c8cd 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -68,6 +68,15 @@ .. . . . . .. \ &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb xc=%8rr_xx_xc +# Format MMIRR:XX3 +_XX3 !extern xa xb xt pmsk xmsk ymsk +%xx3_xa 2:1 16:5 +%xx3_xb 1:1 11:5 +%xx3_at 23:3 +@MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ +.. ... .. . . ... \ +_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -115,6 +124,27 @@ PSTFS 01 10 0--.-- .. \ PSTFD 01 10 0--.-- .. \ 110110 . . @PLS_D +## VSX GER instruction + +PMXVI4GER8 01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 
00100011 ..- @MMIRR_XX3 +PMXVI4GER8PP01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 00100010 ..- @MMIRR_XX3 +PMXVI8GER4 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0011 ..- @MMIRR_XX3 +PMXVI8GER4PP01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0010 ..- @MMIRR_XX3 +PMXVI16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01001011 ..- @MMIRR_XX3 +PMXVI16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01101011 ..- @MMIRR_XX3 +PMXVI8GER4SPP 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 01100011 ..- @MMIRR_XX3 +PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101011 ..- @MMIRR_XX3 +PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101010 ..- @MMIRR_XX3 + ### Prefixed No-operation Instruction @PNOP 01 11 -- 00 \ diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 6026b203e0..b10eded1da 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2888,6 +2888,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S) TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) +TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8) +TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP) +TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4) +TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP) +TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP) +TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2) +TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) +TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S) +TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1
[PATCH v6 6/8] target/ppc: Implemented pmxvf*ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvf16ger2: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Negative accumulate pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Positive accumulate pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Negative accumulate pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Positive accumulate pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) pmxvf32gernn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf32gernp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf32gerpn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf32gerpp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) pmxvf64gernn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf64gernp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf64gerpn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf64gerpp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 38 + target/ppc/translate/vsx-impl.c.inc | 18 ++ 2 files changed, 56 insertions(+) diff --git 
a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 0eed35c8cd..5ecc5c85bf 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -73,10 +73,15 @@ %xx3_xa 2:1 16:5 %xx3_xb 1:1 11:5 %xx3_at 23:3 +%xx3_xa_pair2:1 17:4 !function=times_2 @MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ .. ... .. . . ... \ _XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at +@MMIRR_XX3_NO_P .. .. .. . . xmsk:4 \ +.. ... .. . . ... \ +_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1 + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ 111011 ... -- . . 00101010 ..- @MMIRR_XX3 +PMXVF16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010011 ..- @MMIRR_XX3 +PMXVF16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010010 ..- @MMIRR_XX3 +PMXVF16GER2PN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 10010010 ..- @MMIRR_XX3 +PMXVF16GER2NP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01010010 ..- @MMIRR_XX3 +PMXVF16GER2NN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 11010010 ..- @MMIRR_XX3 + +PMXVF32GER 01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 10011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 01011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 11011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa + +PMXVF64GER 01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPP01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPN01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 10111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERNP01 11 1001 -- - - ymsk:2 -- \ +
[PATCH v6 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xxmfacc: VSX Move From Accumulator xxmtacc: VSX Move To Accumulator xxsetaccz: VSX Set Accumulator to Zero The PowerISA 3.1 mentions that for the current version of the architecture, "the hardware implementation provides the effect of ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" and "The Accumulators introduce no new logical state at this time" (page 501). For now it seems unnecessary to create new structures, so this patch just uses ACC[i] as VSRs 4*i to 4*i+3 and therefore move to and from accumulators are no-ops. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 5 + target/ppc/insn32.decode| 9 + target/ppc/translate/vsx-impl.c.inc | 31 + 3 files changed, 45 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index bf8f8aad2c..c865206827 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2663,6 +2663,11 @@ static inline int vsr_full_offset(int i) return offsetof(CPUPPCState, vsr[i].u64[0]); } +static inline int acc_full_offset(int i) +{ +return vsr_full_offset(i * 4); +} + static inline int fpr_offset(int i) { return vsr64_offset(i, true); diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index f001c02a8c..c0f545ca38 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -154,6 +154,9 @@ _vrt_frbp vrt frbp @X_vrt_frbp .. vrt:5 . 0 .. . _vrt_frbp frbp=%x_frbp +_ara +@X_a.. ra:3 .. . . .. . _a + %xx_xt 0:1 21:5 %xx_xb 1:1 11:5 %xx_xa 2:1 16:5 @@ -734,3 +737,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - @XX2_bf_xb _s s:uint8_t @XL_s ..-- s:1 .. - _s RFEBB 010011-- . 0010010010 - @XL_s + +## Accumulator Instructions + +XXMFACC 01 ... -- 0 - 0010110001 - @X_a +XXMTACC 01 ... -- 1 - 0010110001 - @X_a +XXSETACCZ 01 ... 
-- 00011 - 0010110001 - @X_a diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 900c1a1ab2..235be360e2 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2816,6 +2816,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 *a) return true; } +/* + * The PowerISA 3.1 mentions that for the current version of the + * architecture, "the hardware implementation provides the effect of + * ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" + * and "The Accumulators introduce no new logical state at this time" + * (page 501). For now it seems unnecessary to create new structures, + * so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore + * move to and from accumulators are no-ops. + */ +static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0); +return true; +} + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1
[PATCH v6 4/8] target/ppc: Implemented xvf*ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvf32ger: VSX Vector 32-bit Floating-Point GER (rank-1 update) xvf32gernn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate xvf32gernp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate xvf32gerpn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate xvf32gerpp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate xvf64ger: VSX Vector 64-bit Floating-Point GER (rank-1 update) xvf64gernn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate xvf64gernp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate xvf64gerpn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate xvf64gerpp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 4 + target/ppc/fpu_helper.c | 194 +++- target/ppc/helper.h | 10 ++ target/ppc/insn32.decode| 13 ++ target/ppc/translate/vsx-impl.c.inc | 12 ++ 5 files changed, 231 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index dff3ca8222..40c779f246 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2643,6 +2643,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[i] #define VsrD(i) u64[i] #define VsrSD(i) s64[i] +#define VsrSF(i) f32[i] +#define VsrDF(i) f64[i] #else #define VsrB(i) u8[15 - (i)] #define VsrSB(i) s8[15 - (i)] @@ -2652,6 +2654,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[3 - (i)] #define VsrD(i) u64[1 - (i)] #define VsrSD(i) s64[1 - (i)] +#define VsrSF(i) f32[3 - (i)] +#define VsrDF(i) f64[1 - (i)] #endif static 
inline int vsr64_offset(int i, bool high) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 9489e06504..712c71162c 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -414,7 +414,7 @@ void helper_store_fpscr(CPUPPCState *env, uint64_t val, uint32_t nibbles) ppc_store_fpscr(env, val); } -void helper_fpscr_check_status(CPUPPCState *env) +static void do_fpscr_check_status(CPUPPCState *env, uintptr_t raddr) { CPUState *cs = env_cpu(env); target_ulong fpscr = env->fpscr; @@ -455,13 +455,19 @@ void helper_fpscr_check_status(CPUPPCState *env) } cs->exception_index = POWERPC_EXCP_PROGRAM; env->error_code = error | POWERPC_EXCP_FP; +env->fpscr |= error ? FP_FEX : 0; /* Deferred floating-point exception after target FPSCR update */ if (fp_exceptions_enabled(env)) { raise_exception_err_ra(env, cs->exception_index, - env->error_code, GETPC()); + env->error_code, raddr); } } +void helper_fpscr_check_status(CPUPPCState *env) +{ +do_fpscr_check_status(env, GETPC()); +} + static void do_float_check_status(CPUPPCState *env, bool change_fi, uintptr_t raddr) { @@ -3468,3 +3474,187 @@ void helper_xssubqp(CPUPPCState *env, uint32_t opcode, *xt = t; do_float_check_status(env, true, GETPC()); } + +static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr) +{ +/* + * XV*GER instructions execute and set the FPSCR as if exceptions + * are disabled and only at the end throw an exception + */ +target_ulong enable; +enable = env->fpscr & (FP_ENABLES | FP_FI | FP_FR); +env->fpscr &= ~(FP_ENABLES | FP_FI | FP_FR); +int status = get_float_exception_flags(>fp_status); +if (unlikely(status & float_flag_invalid)) { +if (status & float_flag_invalid_snan) { +float_invalid_op_vxsnan(env, 0); +} +if (status & float_flag_invalid_imz) { +float_invalid_op_vximz(env, false, 0); +} +if (status & float_flag_invalid_isi) { +float_invalid_op_vxisi(env, false, 0); +} +} +do_float_check_status(env, false, retaddr); +env->fpscr |= enable; +do_fpscr_check_status(env, 
retaddr); +} + +typedef void vsxger_zero(ppc_vsr_t *at, int, int); + +typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int, + int flags, float_status *s); + +static void vsxger_muladd32(ppc_vsr_t *at, ppc_vsr_t *a, ppc_vsr_t *b, int i, +int j, int flags, float_status *s) +{ +at[i].VsrSF(j) = float32_muladd(a->VsrSF(i), b->VsrSF(j), +at[i].VsrSF(j), flags, s); +} + +
[PATCH v6 5/8] target/ppc: Implemented xvf16ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvf16ger2: VSX Vector 16-bit Floating-Point GER (rank-2 update) xvf16ger2nn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Negative accumulate xvf16ger2np: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Positive accumulate xvf16ger2pn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Negative accumulate xvf16ger2pp: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 3 + target/ppc/fpu_helper.c | 95 + target/ppc/helper.h | 5 ++ target/ppc/insn32.decode| 6 ++ target/ppc/translate/vsx-impl.c.inc | 6 ++ 5 files changed, 115 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 40c779f246..6d78078f37 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -227,6 +227,7 @@ typedef union _ppc_vsr_t { int16_t s16[8]; int32_t s32[4]; int64_t s64[2]; +float16 f16[8]; float32 f32[4]; float64 f64[2]; float128 f128; @@ -2643,6 +2644,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[i] #define VsrD(i) u64[i] #define VsrSD(i) s64[i] +#define VsrHF(i) f16[i] #define VsrSF(i) f32[i] #define VsrDF(i) f64[i] #else @@ -2654,6 +2656,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[3 - (i)] #define VsrD(i) u64[1 - (i)] #define VsrSD(i) s64[1 - (i)] +#define VsrHF(i) f16[7 - (i)] #define VsrSF(i) f32[3 - (i)] #define VsrDF(i) f64[1 - (i)] #endif diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 712c71162c..a9b2ef370f 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -36,6 +36,15 @@ static inline float128 float128_snan_to_qnan(float128 x) #define float32_snan_to_qnan(x) ((x) | 0x0040) #define float16_snan_to_qnan(x) ((x) | 0x0200) +static inline float32 
bfp32_neg(float32 a) +{ +if (unlikely(float32_is_any_nan(a))) { +return a; +} else { +return float32_chs(a); +} +} + static inline bool fp_exceptions_enabled(CPUPPCState *env) { #ifdef CONFIG_USER_ONLY @@ -3501,6 +3510,57 @@ static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr) do_fpscr_check_status(env, retaddr); } +typedef float64 extract_f16(float16, float_status *); + +static float64 extract_hf16(float16 in, float_status *fp_status) +{ +return float16_to_float64(in, true, fp_status); +} + +static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask, bool acc, + bool neg_mul, bool neg_acc, extract_f16 extract) +{ +float32 r, aux_acc; +float64 psum, va, vb, vc, vd; +int i, j, xmsk_bit, ymsk_bit; +uint8_t pmsk = FIELD_EX32(mask, GER_MSK, PMSK), +xmsk = FIELD_EX32(mask, GER_MSK, XMSK), +ymsk = FIELD_EX32(mask, GER_MSK, YMSK); +float_status *excp_ptr = >fp_status; +for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) { +for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) { +if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) { +va = !(pmsk & 2) ? float64_zero : + extract(a->VsrHF(2 * i), excp_ptr); +vb = !(pmsk & 2) ? float64_zero : + extract(b->VsrHF(2 * j), excp_ptr); +vc = !(pmsk & 1) ? float64_zero : + extract(a->VsrHF(2 * i + 1), excp_ptr); +vd = !(pmsk & 1) ? 
float64_zero : + extract(b->VsrHF(2 * j + 1), excp_ptr); +psum = float64_mul(va, vb, excp_ptr); +psum = float64r32_muladd(vc, vd, psum, 0, excp_ptr); +r = float64_to_float32(psum, excp_ptr); +if (acc) { +aux_acc = at[i].VsrSF(j); +if (neg_mul) { +r = bfp32_neg(r); +} +if (neg_acc) { +aux_acc = bfp32_neg(aux_acc); +} +r = float32_add(r, aux_acc, excp_ptr); +} +at[i].VsrSF(j) = r; +} else { +at[i].VsrSF(j) = float32_zero; +} +} +} +vsxger_excp(env, GETPC()); +} + typedef void vsxger_zero(ppc_vsr_t *at, int, int); typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int, @@ -3579,6 +3639,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, vsxger_excp(env, GETPC());
[PATCH v6 0/8] VSX MMA Implementation
From: "Lucas Mateus Castro (alqotel)" Based-on: https://gitlab.com/danielhb/qemu/-/tree/ppc-next This patch series is a patch series of the Matrix-Multiply Assist (MMA) instructions implementation from the PowerISA 3.1 This patch series was created based on Victor's target/ppc: Fix FPSCR.FI bit patch series changes as that series changed do_check_float_status, which is called by the GER helper functions. These and the VDIV/VMOD implementation are the last new PowerISA 3.1 instructions left to be implemented. The XVFGER instructions accumulate the exception status and at the end set the FPSCR and take a Program interrupt on a trap-enabled exception, previous versions were based on Victor's rework of FPU exceptions, but as that patch was rejected this version worked around the fact that OX/UX/XX and invalid instructions were handled in different functions by disabling all enable bits then re-enabling them and calling the mtfsf deferred exception helper. v6 changes: - Rebased on ppc-next - Wrapped lines to stay <= 80 characters v5 changes: - Changed VSXGER16 accumulation to negate the multiplication and accumulation in independent if's (if necessary) and sum their values. 
v4 changes: - Changed VSXGER16 accumulation to always use float32_sum and negate the elements according to the type of accumulation v3 changes: - GER helpers now use ppc_acc_t instead of ppc_vsr_t for passing acc - Removed do_ger_XX3 and updated the decodetree to pass the masks in 32 bits instructions - Removed unnecessary rounding mode function - Moved float32_neg to fpu_helper.c and renamed it bfp32_negate to make it clearer that it's a 32 bit version of the PowerISA bfp_NEGATE - Negated accumulation now a subtraction - Changed exception handling by disabling all enable FPSCR enable bits to set all FPSCR bits (except FEX) correctly, then re-enable them and call do_fpscr_check_status to raise the exception accordingly and set FEX if necessary v2 changes: - Changed VSXGER, VSXGER16 and XVIGER macros to functions - Set rounding mode in floating-point instructions based on RN before operations - Separated accumulate and with saturation instructions in different helpers - Used FIELD, FIELD_EX32 and FIELD_DP32 for packing/unpacking masks Joel Stanley (1): linux-user: Add PowerPC ISA 3.1 and MMA to hwcap Lucas Mateus Castro (alqotel) (7): target/ppc: Implement xxm[tf]acc and xxsetaccz target/ppc: Implemented xvi*ger* instructions target/ppc: Implemented pmxvi*ger* instructions target/ppc: Implemented xvf*ger* target/ppc: Implemented xvf16ger* target/ppc: Implemented pmxvf*ger* target/ppc: Implemented [pm]xvbf16ger2* linux-user/elfload.c| 4 + target/ppc/cpu.h| 13 ++ target/ppc/fpu_helper.c | 329 +++- target/ppc/helper.h | 33 +++ target/ppc/insn32.decode| 52 + target/ppc/insn64.decode| 79 +++ target/ppc/int_helper.c | 130 +++ target/ppc/internal.h | 15 ++ target/ppc/translate/vsx-impl.c.inc | 130 +++ 9 files changed, 783 insertions(+), 2 deletions(-) -- 2.31.1
[PATCH v6 2/8] target/ppc: Implemented xvi*ger* instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvi4ger8: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) xvi4ger8pp: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) Positive multiply, Positive accumulate xvi8ger4: VSX Vector 4-bit Signed Integer GER (rank-8 update) xvi8ger4pp: VSX Vector 4-bit Signed Integer GER (rank-8 update) Positive multiply, Positive accumulate xvi8ger4spp: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with Saturate Positive multiply, Positive accumulate xvi16ger2:VSX Vector 16-bit Signed Integer GER (rank-2 update) xvi16ger2pp: VSX Vector 16-bit Signed Integer GER (rank-2 update) Positive multiply, Positive accumulate xvi16ger2s: VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation xvi16ger2spp: VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 1 + target/ppc/helper.h | 13 +++ target/ppc/insn32.decode| 18 target/ppc/int_helper.c | 130 target/ppc/internal.h | 15 target/ppc/translate/vsx-impl.c.inc | 41 + 6 files changed, 218 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index c865206827..dff3ca8222 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -238,6 +238,7 @@ typedef union _ppc_vsr_t { typedef ppc_vsr_t ppc_avr_t; typedef ppc_vsr_t ppc_fprp_t; +typedef ppc_vsr_t ppc_acc_t; #if !defined(CONFIG_USER_ONLY) /* Software TLB cache */ diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 5e43920b9e..1666797edf 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -133,6 +133,10 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) #define dh_ctype_vsr ppc_vsr_t * #define dh_typecode_vsr dh_typecode_ptr +#define dh_alias_acc ptr +#define dh_ctype_acc ppc_acc_t * +#define dh_typecode_acc dh_typecode_ptr + DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, 
void, avr, avr, avr) DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr) @@ -537,6 +541,15 @@ DEF_HELPER_FLAGS_5(XXBLENDVB, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_FLAGS_5(XXBLENDVH, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_FLAGS_5(XXBLENDVW, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_FLAGS_5(XXBLENDVD, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32) +DEF_HELPER_5(XVI4GER8, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI4GER8PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4SPP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2S, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2SPP, void, env, vsr, vsr, acc, i32) DEF_HELPER_2(efscfsi, i32, env, i32) DEF_HELPER_2(efscfui, i32, env, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index c0f545ca38..0e189fe2da 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -175,6 +175,12 @@ xt xa xb @XX3.. . . . ...xt=%xx_xt xa=%xx_xa xb=%xx_xb +# 32 bit GER instructions have all mask bits considered 1 +_XX3 xa xb xt pmsk xmsk ymsk +%xx_at 23:3 +@XX3_at .. ... .. . . ... _XX3 xt=%xx_at xb=%xx_xb \ +pmsk=255 xmsk=15 ymsk=15 + _dm xt xa xb dm @XX3_dm .. . . . . dm:2 . ... _dm xt=%xx_xt xa=%xx_xa xb=%xx_xb @@ -743,3 +749,15 @@ RFEBB 010011-- . 0010010010 - @XL_s XXMFACC 01 ... -- 0 - 0010110001 - @X_a XXMTACC 01 ... -- 1 - 0010110001 - @X_a XXSETACCZ 01 ... -- 00011 - 0010110001 - @X_a + +## VSX GER instruction + +XVI4GER8111011 ... -- . . 00100011 ..- @XX3_at xa=%xx_xa +XVI4GER8PP 111011 ... -- . . 00100010 ..- @XX3_at xa=%xx_xa +XVI8GER4111011 ... -- . . 0011 ..- @XX3_at xa=%xx_xa +XVI8GER4PP 111011 ... -- . . 
0010 ..- @XX3_at xa=%xx_xa +XVI16GER2 111011 ... -- . . 01001011 ..- @XX3_at xa=%xx_xa +XVI16GER2PP 111011 ... -- . . 01101011 ..- @XX3_at xa=%xx_xa +XVI8GER4SPP 111011 ... -- . . 01100011 ..- @XX3_at xa=%xx_xa +XVI16GER2S 111011 ... -- . . 00101011 ..- @XX3_at xa=%xx_xa +XVI16GER2SPP111011 ... -- . . 00101010 ..- @XX3_at xa=%xx_xa diff --git a/target/ppc/int_
[PATCH v5 6/8] target/ppc: Implemented pmxvf*ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvf16ger2: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Negative accumulate pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Positive accumulate pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Negative accumulate pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Positive accumulate pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) pmxvf32gernn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf32gernp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf32gerpn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf32gerpp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) pmxvf64gernn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf64gernp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf64gerpn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf64gerpp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 38 + target/ppc/translate/vsx-impl.c.inc | 18 ++ 2 files changed, 56 insertions(+) diff --git 
a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 0eed35c8cd..5ecc5c85bf 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -73,10 +73,15 @@ %xx3_xa 2:1 16:5 %xx3_xb 1:1 11:5 %xx3_at 23:3 +%xx3_xa_pair2:1 17:4 !function=times_2 @MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ .. ... .. . . ... \ _XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at +@MMIRR_XX3_NO_P .. .. .. . . xmsk:4 \ +.. ... .. . . ... \ +_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1 + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ 111011 ... -- . . 00101010 ..- @MMIRR_XX3 +PMXVF16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010011 ..- @MMIRR_XX3 +PMXVF16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010010 ..- @MMIRR_XX3 +PMXVF16GER2PN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 10010010 ..- @MMIRR_XX3 +PMXVF16GER2NP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01010010 ..- @MMIRR_XX3 +PMXVF16GER2NN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 11010010 ..- @MMIRR_XX3 + +PMXVF32GER 01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 10011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 01011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 11011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa + +PMXVF64GER 01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPP01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPN01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 10111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERNP01 11 1001 -- - - ymsk:2 -- \ +
[PATCH v5 5/8] target/ppc: Implemented xvf16ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvf16ger2: VSX Vector 16-bit Floating-Point GER (rank-2 update) xvf16ger2nn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Negative accumulate xvf16ger2np: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Positive accumulate xvf16ger2pn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Negative accumulate xvf16ger2pp: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 3 + target/ppc/fpu_helper.c | 91 + target/ppc/helper.h | 5 ++ target/ppc/insn32.decode| 6 ++ target/ppc/translate/vsx-impl.c.inc | 6 ++ 5 files changed, 111 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index bdedf4138e..46769a5647 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -227,6 +227,7 @@ typedef union _ppc_vsr_t { int16_t s16[8]; int32_t s32[4]; int64_t s64[2]; +float16 f16[8]; float32 f32[4]; float64 f64[2]; float128 f128; @@ -2641,6 +2642,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[i] #define VsrD(i) u64[i] #define VsrSD(i) s64[i] +#define VsrHF(i) f16[i] #define VsrSF(i) f32[i] #define VsrDF(i) f64[i] #else @@ -2652,6 +2654,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[3 - (i)] #define VsrD(i) u64[1 - (i)] #define VsrSD(i) s64[1 - (i)] +#define VsrHF(i) f16[7 - (i)] #define VsrSF(i) f32[3 - (i)] #define VsrDF(i) f64[1 - (i)] #endif diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 1766da5bcf..7a7aa03ac4 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -36,6 +36,15 @@ static inline float128 float128_snan_to_qnan(float128 x) #define float32_snan_to_qnan(x) ((x) | 0x0040) #define float16_snan_to_qnan(x) ((x) | 0x0200) +static inline float32 
bfp32_neg(float32 a) +{ +if (unlikely(float32_is_any_nan(a))) { +return a; +} else { +return float32_chs(a); +} +} + static inline bool fp_exceptions_enabled(CPUPPCState *env) { #ifdef CONFIG_USER_ONLY @@ -3502,6 +3511,53 @@ static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr) do_fpscr_check_status(env, retaddr); } +typedef float64 extract_f16(float16, float_status *); + +static float64 extract_hf16(float16 in, float_status *fp_status) +{ +return float16_to_float64(in, true, fp_status); +} + +static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask, bool acc, + bool neg_mul, bool neg_acc, extract_f16 extract) +{ +float32 r, aux_acc; +float64 psum, va, vb, vc, vd; +int i, j, xmsk_bit, ymsk_bit; +uint8_t pmsk = FIELD_EX32(mask, GER_MSK, PMSK), +xmsk = FIELD_EX32(mask, GER_MSK, XMSK), +ymsk = FIELD_EX32(mask, GER_MSK, YMSK); +float_status *excp_ptr = &env->fp_status; +for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) { +for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) { +if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) { +va = !(pmsk & 2) ? float64_zero : extract(a->VsrHF(2 * i), excp_ptr); +vb = !(pmsk & 2) ? float64_zero : extract(b->VsrHF(2 * j), excp_ptr); +vc = !(pmsk & 1) ? float64_zero : extract(a->VsrHF(2 * i + 1), excp_ptr); +vd = !(pmsk & 1) ? 
float64_zero : extract(b->VsrHF(2 * j + 1), excp_ptr); +psum = float64_mul(va, vb, excp_ptr); +psum = float64r32_muladd(vc, vd, psum, 0, excp_ptr); +r = float64_to_float32(psum, excp_ptr); +if (acc) { +aux_acc = at[i].VsrSF(j); +if (neg_mul) { +r = bfp32_neg(r); +} +if (neg_acc) { +aux_acc = bfp32_neg(aux_acc); +} +r = float32_add(r, aux_acc, excp_ptr); +} +at[i].VsrSF(j) = r; +} else { +at[i].VsrSF(j) = float32_zero; +} +} +} +vsxger_excp(env, GETPC()); +} + typedef void vsxger_zero(ppc_vsr_t *at, int, int); typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int, @@ -3579,6 +3635,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, vsxger_excp(env, GETPC()); } +QEMU_FLATTEN +void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at,
[PATCH v5 7/8] target/ppc: Implemented [pm]xvbf16ger2*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvbf16ger2: VSX Vector bfloat16 GER (rank-2 update) xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate pmxvbf16ger2: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 40 + target/ppc/helper.h | 5 target/ppc/insn32.decode| 6 + target/ppc/insn64.decode| 11 target/ppc/translate/vsx-impl.c.inc | 12 + 5 files changed, 74 insertions(+) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 7a7aa03ac4..20f134c1d6 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3518,6 +3518,11 @@ static float64 extract_hf16(float16 in, float_status *fp_status) return float16_to_float64(in, true, fp_status); } +static float64 extract_bf16(bfloat16 in, float_status *fp_status) +{ +return bfloat16_to_float64(in, fp_status); +} + static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask, bool acc, bool neg_mul, bool neg_acc, extract_f16 extract) @@ -3635,6 +3640,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, vsxger_excp(env, 
GETPC()); } +QEMU_FLATTEN +void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, false, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, true, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, true, extract_bf16); +} + QEMU_FLATTEN void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 7ab5ac8ee7..06203fd893 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 
c774227d8c..dfd12e9801 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -739,6 +739,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..- @XX3_at xa=%xx_xa XVI16GER2S 111011 ... -- . . 00101011 ..- @XX3_at xa=%xx_xa XVI16GER2SPP111011 ... -- . . 00101010 ..- @XX3_at xa=%xx_xa +XVBF16GER2 111011 ... -- . . 00110011 ..- @XX3_at xa=%xx_xa +XVBF16GER2PP111011 ... -- . . 00110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2PN111011 ... -- . . 10110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2NP111011 ... -- . . 01110010 ..- @XX3_at
[PATCH v5 8/8] linux-user: Add PowerPC ISA 3.1 and MMA to hwcap
From: Joel Stanley These are new hwcap bits added for power10. Signed-off-by: Joel Stanley Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- linux-user/elfload.c | 4 1 file changed, 4 insertions(+) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 61063fd974..0908692e62 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -779,6 +779,8 @@ enum { QEMU_PPC_FEATURE2_DARN = 0x00200000, /* darn random number insn */ QEMU_PPC_FEATURE2_SCV = 0x00100000, /* scv syscall */ QEMU_PPC_FEATURE2_HTM_NO_SUSPEND = 0x00080000, /* TM w/o suspended state */ +QEMU_PPC_FEATURE2_ARCH_3_1 = 0x00040000, /* ISA 3.1 */ +QEMU_PPC_FEATURE2_MMA = 0x00020000, /* Matrix-Multiply Assist */ }; #define ELF_HWCAP get_elf_hwcap() @@ -836,6 +838,8 @@ static uint32_t get_elf_hwcap2(void) QEMU_PPC_FEATURE2_VEC_CRYPTO); GET_FEATURE2(PPC2_ISA300, QEMU_PPC_FEATURE2_ARCH_3_00 | QEMU_PPC_FEATURE2_DARN | QEMU_PPC_FEATURE2_HAS_IEEE128); +GET_FEATURE2(PPC2_ISA310, QEMU_PPC_FEATURE2_ARCH_3_1 | + QEMU_PPC_FEATURE2_MMA); #undef GET_FEATURE #undef GET_FEATURE2 -- 2.31.1
[PATCH v5 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xxmfacc: VSX Move From Accumulator xxmtacc: VSX Move To Accumulator xxsetaccz: VSX Set Accumulator to Zero The PowerISA 3.1 mentions that for the current version of the architecture, "the hardware implementation provides the effect of ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" and "The Accumulators introduce no new logical state at this time" (page 501). For now it seems unnecessary to create new structures, so this patch just uses ACC[i] as VSRs 4*i to 4*i+3 and therefore move to and from accumulators are no-ops. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 5 + target/ppc/insn32.decode| 9 + target/ppc/translate/vsx-impl.c.inc | 31 + 3 files changed, 45 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 901ded79e9..2e80d0978f 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2661,6 +2661,11 @@ static inline int vsr_full_offset(int i) return offsetof(CPUPPCState, vsr[i].u64[0]); } +static inline int acc_full_offset(int i) +{ +return vsr_full_offset(i * 4); +} + static inline int fpr_offset(int i) { return vsr64_offset(i, true); diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 39372fe673..7a76bedfa6 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -151,6 +151,9 @@ _vrt_frbp vrt frbp @X_vrt_frbp .. vrt:5 . 0 .. . _vrt_frbp frbp=%x_frbp +_ara +@X_a.. ra:3 .. . . .. . _a + %xx_xt 0:1 21:5 %xx_xb 1:1 11:5 %xx_xa 2:1 16:5 @@ -710,3 +713,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - @XX2_bf_xb _s s:uint8_t @XL_s ..-- s:1 .. - _s RFEBB 010011-- . 0010010010 - @XL_s + +## Accumulator Instructions + +XXMFACC 01 ... -- 0 - 0010110001 - @X_a +XXMTACC 01 ... -- 1 - 0010110001 - @X_a +XXSETACCZ 01 ... 
-- 00011 - 0010110001 - @X_a diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 3692740736..dc8875d5d3 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2787,6 +2787,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 *a) return true; } +/* + * The PowerISA 3.1 mentions that for the current version of the + * architecture, "the hardware implementation provides the effect of + * ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" + * and "The Accumulators introduce no new logical state at this time" + * (page 501). For now it seems unnecessary to create new structures, + * so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore + * move to and from accumulators are no-ops. + */ +static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0); +return true; +} + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1
[PATCH v5 2/8] target/ppc: Implemented xvi*ger* instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvi4ger8: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) xvi4ger8pp: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) Positive multiply, Positive accumulate xvi8ger4: VSX Vector 4-bit Signed Integer GER (rank-8 update) xvi8ger4pp: VSX Vector 4-bit Signed Integer GER (rank-8 update) Positive multiply, Positive accumulate xvi8ger4spp: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with Saturate Positive multiply, Positive accumulate xvi16ger2:VSX Vector 16-bit Signed Integer GER (rank-2 update) xvi16ger2pp: VSX Vector 16-bit Signed Integer GER (rank-2 update) Positive multiply, Positive accumulate xvi16ger2s: VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation xvi16ger2spp: VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 1 + target/ppc/helper.h | 13 +++ target/ppc/insn32.decode| 18 target/ppc/int_helper.c | 130 target/ppc/internal.h | 15 target/ppc/translate/vsx-impl.c.inc | 41 + 6 files changed, 218 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 2e80d0978f..c8a12a3985 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -238,6 +238,7 @@ typedef union _ppc_vsr_t { typedef ppc_vsr_t ppc_avr_t; typedef ppc_vsr_t ppc_fprp_t; +typedef ppc_vsr_t ppc_acc_t; #if !defined(CONFIG_USER_ONLY) /* Software TLB cache */ diff --git a/target/ppc/helper.h b/target/ppc/helper.h index aa6773c4a5..29354276f0 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -133,6 +133,10 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64) #define dh_ctype_vsr ppc_vsr_t * #define dh_typecode_vsr dh_typecode_ptr +#define dh_alias_acc ptr +#define dh_ctype_acc ppc_acc_t * +#define dh_typecode_acc dh_typecode_ptr + DEF_HELPER_3(vavgub, void, avr, avr, avr) 
DEF_HELPER_3(vavguh, void, avr, avr, avr) DEF_HELPER_3(vavguw, void, avr, avr, avr) @@ -537,6 +541,15 @@ DEF_HELPER_5(XXBLENDVB, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_5(XXBLENDVH, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_5(XXBLENDVW, void, vsr, vsr, vsr, vsr, i32) DEF_HELPER_5(XXBLENDVD, void, vsr, vsr, vsr, vsr, i32) +DEF_HELPER_5(XVI4GER8, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI4GER8PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI8GER4SPP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2S, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVI16GER2SPP, void, env, vsr, vsr, acc, i32) DEF_HELPER_2(efscfsi, i32, env, i32) DEF_HELPER_2(efscfui, i32, env, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 7a76bedfa6..899a04bf77 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -170,6 +170,12 @@ xt xa xb @XX3.. . . . ...xt=%xx_xt xa=%xx_xa xb=%xx_xb +# 32 bit GER instructions have all mask bits considered 1 +_XX3 xa xb xt pmsk xmsk ymsk +%xx_at 23:3 +@XX3_at .. ... .. . . ... _XX3 xt=%xx_at xb=%xx_xb \ +pmsk=255 xmsk=15 ymsk=15 + _dm xt xa xb dm @XX3_dm .. . . . . dm:2 . ... _dm xt=%xx_xt xa=%xx_xa xb=%xx_xb @@ -719,3 +725,15 @@ RFEBB 010011-- . 0010010010 - @XL_s XXMFACC 01 ... -- 0 - 0010110001 - @X_a XXMTACC 01 ... -- 1 - 0010110001 - @X_a XXSETACCZ 01 ... -- 00011 - 0010110001 - @X_a + +## VSX GER instruction + +XVI4GER8111011 ... -- . . 00100011 ..- @XX3_at xa=%xx_xa +XVI4GER8PP 111011 ... -- . . 00100010 ..- @XX3_at xa=%xx_xa +XVI8GER4111011 ... -- . . 0011 ..- @XX3_at xa=%xx_xa +XVI8GER4PP 111011 ... -- . . 0010 ..- @XX3_at xa=%xx_xa +XVI16GER2 111011 ... -- . . 01001011 ..- @XX3_at xa=%xx_xa +XVI16GER2PP 111011 ... -- . . 
01101011 ..- @XX3_at xa=%xx_xa +XVI8GER4SPP 111011 ... -- . . 01100011 ..- @XX3_at xa=%xx_xa +XVI16GER2S 111011 ... -- . . 00101011 ..- @XX3_at xa=%xx_xa +XVI16GER2SPP111011 ... -- . . 00101010 ..- @XX3_at xa=%xx_xa diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 8c1674510b..32a7d99718 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -782,6 +782,136 @@ VCT(uxs, cv
[PATCH v5 4/8] target/ppc: Implemented xvf*ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvf32ger: VSX Vector 32-bit Floating-Point GER (rank-1 update) xvf32gernn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate xvf32gernp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate xvf32gerpn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate xvf32gerpp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate xvf64ger: VSX Vector 64-bit Floating-Point GER (rank-1 update) xvf64gernn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate xvf64gernp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate xvf64gerpn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate xvf64gerpp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 4 + target/ppc/fpu_helper.c | 193 +++- target/ppc/helper.h | 10 ++ target/ppc/insn32.decode| 13 ++ target/ppc/translate/vsx-impl.c.inc | 12 ++ 5 files changed, 230 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index c8a12a3985..bdedf4138e 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2641,6 +2641,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[i] #define VsrD(i) u64[i] #define VsrSD(i) s64[i] +#define VsrSF(i) f32[i] +#define VsrDF(i) f64[i] #else #define VsrB(i) u8[15 - (i)] #define VsrSB(i) s8[15 - (i)] @@ -2650,6 +2652,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, int rx) #define VsrSW(i) s32[3 - (i)] #define VsrD(i) u64[1 - (i)] #define VsrSD(i) s64[1 - (i)] +#define VsrSF(i) f32[3 - (i)] +#define VsrDF(i) f64[1 - (i)] #endif static 
inline int vsr64_offset(int i, bool high) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index 8592727792..1766da5bcf 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -414,7 +414,7 @@ void helper_store_fpscr(CPUPPCState *env, uint64_t val, uint32_t nibbles) ppc_store_fpscr(env, val); } -void helper_fpscr_check_status(CPUPPCState *env) +static void do_fpscr_check_status(CPUPPCState *env, uintptr_t raddr) { CPUState *cs = env_cpu(env); target_ulong fpscr = env->fpscr; @@ -455,13 +455,19 @@ void helper_fpscr_check_status(CPUPPCState *env) } cs->exception_index = POWERPC_EXCP_PROGRAM; env->error_code = error | POWERPC_EXCP_FP; +env->fpscr |= error ? FP_FEX : 0; /* Deferred floating-point exception after target FPSCR update */ if (fp_exceptions_enabled(env)) { raise_exception_err_ra(env, cs->exception_index, - env->error_code, GETPC()); + env->error_code, raddr); } } +void helper_fpscr_check_status(CPUPPCState *env) +{ +do_fpscr_check_status(env, GETPC()); +} + static void do_float_check_status(CPUPPCState *env, bool change_fi, uintptr_t raddr) { @@ -3469,3 +3475,186 @@ void helper_xssubqp(CPUPPCState *env, uint32_t opcode, *xt = t; do_float_check_status(env, true, GETPC()); } + +static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr) +{ +/* + * XV*GER instructions execute and set the FPSCR as if exceptions + * are disabled and only at the end throw an exception + */ +target_ulong enable; +enable = env->fpscr & (FP_ENABLES | FP_FI | FP_FR); +env->fpscr &= ~(FP_ENABLES | FP_FI | FP_FR); +int status = get_float_exception_flags(&env->fp_status); +if (unlikely(status & float_flag_invalid)) { +if (status & float_flag_invalid_snan) { +float_invalid_op_vxsnan(env, 0); +} +if (status & float_flag_invalid_imz) { +float_invalid_op_vximz(env, false, 0); +} +if (status & float_flag_invalid_isi) { +float_invalid_op_vxisi(env, false, 0); +} +} +do_float_check_status(env, false, retaddr); +env->fpscr |= enable; +do_fpscr_check_status(env, 
retaddr); +} + +typedef void vsxger_zero(ppc_vsr_t *at, int, int); + +typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int, + int flags, float_status *s); + +static void vsxger_muladd32(ppc_vsr_t *at, ppc_vsr_t *a, ppc_vsr_t *b, int i, +int j, int flags, float_status *s) +{ +at[i].VsrSF(j) = float32_muladd(a->VsrSF(i), b->VsrSF(j), +at[i].VsrSF(j), flags, s); +} + +
[PATCH v5 3/8] target/ppc: Implemented pmxvi*ger* instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvi4ger8: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) pmxvi4ger8pp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) Positive multiply, Positive accumulate pmxvi8ger4: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) pmxvi8ger4pp: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) Positive multiply, Positive accumulate pmxvi8ger4spp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with Saturate Positive multiply, Positive accumulate pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) pmxvi16ger2pp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) Positive multiply, Positive accumulate pmxvi16ger2s: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 30 + target/ppc/translate/vsx-impl.c.inc | 10 ++ 2 files changed, 40 insertions(+) diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 691e8fe6c0..0eed35c8cd 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -68,6 +68,15 @@ .. . . . . .. \ &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb xc=%8rr_xx_xc +# Format MMIRR:XX3 +_XX3 !extern xa xb xt pmsk xmsk ymsk +%xx3_xa 2:1 16:5 +%xx3_xb 1:1 11:5 +%xx3_at 23:3 +@MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ +.. ... .. . . ... \ +_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -115,6 +124,27 @@ PSTFS 01 10 0--.-- .. \ PSTFD 01 10 0--.-- .. \ 110110 . . @PLS_D +## VSX GER instruction + +PMXVI4GER8 01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 
00100011 ..- @MMIRR_XX3 +PMXVI4GER8PP01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 00100010 ..- @MMIRR_XX3 +PMXVI8GER4 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0011 ..- @MMIRR_XX3 +PMXVI8GER4PP01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0010 ..- @MMIRR_XX3 +PMXVI16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01001011 ..- @MMIRR_XX3 +PMXVI16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01101011 ..- @MMIRR_XX3 +PMXVI8GER4SPP 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 01100011 ..- @MMIRR_XX3 +PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101011 ..- @MMIRR_XX3 +PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101010 ..- @MMIRR_XX3 + ### Prefixed No-operation Instruction @PNOP 01 11 -- 00 \ diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 9d4309e841..c9ed898bb6 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2859,6 +2859,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S) TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) +TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8) +TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP) +TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4) +TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP) +TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP) +TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2) +TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) +TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S) +TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1
[PATCH v5 0/8] VSX MMA Implementation
From: "Lucas Mateus Castro (alqotel)" Based-on: <20220517161522.36132-1-victor.colo...@eldorado.org.br> This patch series is a patch series of the Matrix-Multiply Assist (MMA) instructions implementation from the PowerISA 3.1 These and the VDIV/VMOD implementation are the last new PowerISA 3.1 instructions left to be implemented. The XVFGER instructions accumulate the exception status and at the end set the FPSCR and take a Program interrupt on a trap-enabled exception, previous versions were based on Victor's rework of FPU exceptions, but as that patch was rejected this version worked around the fact that OX/UX/XX and invalid instructions were handled in different functions by disabling all enable bits then re-enabling them and calling the mtfsf deferred exception helper. v5 changes: - Changed VSXGER16 accumulation to negate the multiplication and accumulation in independent if's (if necessary) and sum their values. v4 changes: - Changed VSXGER16 accumulation to always use float32_sum and negate the elements according to the type of accumulation v3 changes: - GER helpers now use ppc_acc_t instead of ppc_vsr_t for passing acc - Removed do_ger_XX3 and updated the decodetree to pass the masks in 32 bits instructions - Removed unnecessary rounding mode function - Moved float32_neg to fpu_helper.c and renamed it bfp32_negate to make it clearer that it's a 32 bit version of the PowerISA bfp_NEGATE - Negated accumulation now a subtraction - Changed exception handling by disabling all enable FPSCR enable bits to set all FPSCR bits (except FEX) correctly, then re-enable them and call do_fpscr_check_status to raise the exception accordingly and set FEX if necessary v2 changes: - Changed VSXGER, VSXGER16 and XVIGER macros to functions - Set rounding mode in floating-point instructions based on RN before operations - Separated accumulate and with saturation instructions in different helpers - Used FIELD, FIELD_EX32 and FIELD_DP32 for packing/unpacking masks Joel Stanley (1): 
linux-user: Add PowerPC ISA 3.1 and MMA to hwcap Lucas Mateus Castro (alqotel) (7): target/ppc: Implement xxm[tf]acc and xxsetaccz target/ppc: Implemented xvi*ger* instructions target/ppc: Implemented pmxvi*ger* instructions target/ppc: Implemented xvf*ger* target/ppc: Implemented xvf16ger* target/ppc: Implemented pmxvf*ger* target/ppc: Implemented [pm]xvbf16ger2* linux-user/elfload.c| 4 + target/ppc/cpu.h| 13 ++ target/ppc/fpu_helper.c | 324 +++- target/ppc/helper.h | 33 +++ target/ppc/insn32.decode| 52 + target/ppc/insn64.decode| 79 +++ target/ppc/int_helper.c | 130 +++ target/ppc/internal.h | 15 ++ target/ppc/translate/vsx-impl.c.inc | 130 +++ 9 files changed, 778 insertions(+), 2 deletions(-) -- 2.31.1
[PATCH v4 6/8] target/ppc: Implemented pmxvf*ger*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvf16ger2: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Negative accumulate pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative multiply, Positive accumulate pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Negative accumulate pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive multiply, Positive accumulate pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) pmxvf32gernn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf32gernp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf32gerpn: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf32gerpp: Prefixed Masked VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) pmxvf64gernn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Negative accumulate pmxvf64gernp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative multiply, Positive accumulate pmxvf64gerpn: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Negative accumulate pmxvf64gerpp: Prefixed Masked VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 38 + target/ppc/translate/vsx-impl.c.inc | 18 ++ 2 files changed, 56 insertions(+) diff --git 
a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 0eed35c8cd..5ecc5c85bf 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -73,10 +73,15 @@ %xx3_xa 2:1 16:5 %xx3_xb 1:1 11:5 %xx3_at 23:3 +%xx3_xa_pair2:1 17:4 !function=times_2 @MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ .. ... .. . . ... \ _XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at +@MMIRR_XX3_NO_P .. .. .. . . xmsk:4 \ +.. ... .. . . ... \ +_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1 + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ 111011 ... -- . . 00101010 ..- @MMIRR_XX3 +PMXVF16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010011 ..- @MMIRR_XX3 +PMXVF16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00010010 ..- @MMIRR_XX3 +PMXVF16GER2PN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 10010010 ..- @MMIRR_XX3 +PMXVF16GER2NP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01010010 ..- @MMIRR_XX3 +PMXVF16GER2NN 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 11010010 ..- @MMIRR_XX3 + +PMXVF32GER 01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 00011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERPN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 10011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNP01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 01011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa +PMXVF32GERNN01 11 1001 -- - - ymsk:4 \ +111011 ... -- . . 11011010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa + +PMXVF64GER 01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111011 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPP01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 00111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERPN01 11 1001 -- - - ymsk:2 -- \ +111011 ... -- 0 . 10111010 ..- @MMIRR_XX3_NO_P xa=%xx3_xa_pair +PMXVF64GERNP01 11 1001 -- - - ymsk:2 -- \ +
[PATCH v4 3/8] target/ppc: Implemented pmxvi*ger* instructions
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: pmxvi4ger8: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) pmxvi4ger8pp: Prefixed Masked VSX Vector 4-bit Signed Integer GER (rank-8 update) Positive multiply, Positive accumulate pmxvi8ger4: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) pmxvi8ger4pp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) Positive multiply, Positive accumulate pmxvi8ger4spp: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with Saturate Positive multiply, Positive accumulate pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) pmxvi16ger2pp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) Positive multiply, Positive accumulate pmxvi16ger2s: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER (rank-2 update) with Saturation Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/insn64.decode| 30 + target/ppc/translate/vsx-impl.c.inc | 10 ++ 2 files changed, 40 insertions(+) diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode index 691e8fe6c0..0eed35c8cd 100644 --- a/target/ppc/insn64.decode +++ b/target/ppc/insn64.decode @@ -68,6 +68,15 @@ .. . . . . .. \ &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb xc=%8rr_xx_xc +# Format MMIRR:XX3 +_XX3 !extern xa xb xt pmsk xmsk ymsk +%xx3_xa 2:1 16:5 +%xx3_xb 1:1 11:5 +%xx3_at 23:3 +@MMIRR_XX3 .. .. .. . . xmsk:4 ymsk:4 \ +.. ... .. . . ... \ +_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at + ### Fixed-Point Load Instructions PLBZ01 10 0--.-- .. \ @@ -115,6 +124,27 @@ PSTFS 01 10 0--.-- .. \ PSTFD 01 10 0--.-- .. \ 110110 . . @PLS_D +## VSX GER instruction + +PMXVI4GER8 01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 
00100011 ..- @MMIRR_XX3 +PMXVI4GER8PP01 11 1001 -- - - pmsk:8 \ +111011 ... -- . . 00100010 ..- @MMIRR_XX3 +PMXVI8GER4 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0011 ..- @MMIRR_XX3 +PMXVI8GER4PP01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 0010 ..- @MMIRR_XX3 +PMXVI16GER2 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01001011 ..- @MMIRR_XX3 +PMXVI16GER2PP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 01101011 ..- @MMIRR_XX3 +PMXVI8GER4SPP 01 11 1001 -- - - pmsk:4 \ +111011 ... -- . . 01100011 ..- @MMIRR_XX3 +PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101011 ..- @MMIRR_XX3 +PMXVI16GER2SPP 01 11 1001 -- - - pmsk:2 -- \ +111011 ... -- . . 00101010 ..- @MMIRR_XX3 + ### Prefixed No-operation Instruction @PNOP 01 11 -- 00 \ diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 9d4309e841..c9ed898bb6 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2859,6 +2859,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S) TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) +TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8) +TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP) +TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4) +TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP) +TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP) +TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2) +TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP) +TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S) +TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP) + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1
[PATCH v4 7/8] target/ppc: Implemented [pm]xvbf16ger2*
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xvbf16ger2: VSX Vector bfloat16 GER (rank-2 update) xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate pmxvbf16ger2: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Negative accumulate pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Negative multiply, Positive accumulate pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Negative accumulate pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update) Positive multiply, Positive accumulate Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/fpu_helper.c | 40 + target/ppc/helper.h | 5 target/ppc/insn32.decode| 6 + target/ppc/insn64.decode| 11 target/ppc/translate/vsx-impl.c.inc | 12 + 5 files changed, 74 insertions(+) diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c index f7da92a51a..46e82b7b26 100644 --- a/target/ppc/fpu_helper.c +++ b/target/ppc/fpu_helper.c @@ -3518,6 +3518,11 @@ static float64 extract_hf16(float16 in, float_status *fp_status) return float16_to_float64(in, true, fp_status); } +static float64 extract_bf16(bfloat16 in, float_status *fp_status) +{ +return bfloat16_to_float64(in, fp_status); +} + static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask, bool acc, bool neg_mul, bool neg_acc, extract_f16 extract) @@ -3637,6 +3642,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, vsxger_excp(env, 
GETPC()); } +QEMU_FLATTEN +void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, false, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, false, true, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, false, extract_bf16); +} + +QEMU_FLATTEN +void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, + ppc_acc_t *at, uint32_t mask) +{ +vsxger16(env, a, b, at, mask, true, true, true, extract_bf16); +} + QEMU_FLATTEN void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t *at, uint32_t mask) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 7ab5ac8ee7..06203fd893 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32) +DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32) DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 
c774227d8c..dfd12e9801 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -739,6 +739,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..- @XX3_at xa=%xx_xa XVI16GER2S 111011 ... -- . . 00101011 ..- @XX3_at xa=%xx_xa XVI16GER2SPP111011 ... -- . . 00101010 ..- @XX3_at xa=%xx_xa +XVBF16GER2 111011 ... -- . . 00110011 ..- @XX3_at xa=%xx_xa +XVBF16GER2PP111011 ... -- . . 00110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2PN111011 ... -- . . 10110010 ..- @XX3_at xa=%xx_xa +XVBF16GER2NP111011 ... -- . . 01110010 ..- @XX3_at
[PATCH v4 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz
From: "Lucas Mateus Castro (alqotel)" Implement the following PowerISA v3.1 instructions: xxmfacc: VSX Move From Accumulator xxmtacc: VSX Move To Accumulator xxsetaccz: VSX Set Accumulator to Zero The PowerISA 3.1 mentions that for the current version of the architecture, "the hardware implementation provides the effect of ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" and "The Accumulators introduce no new logical state at this time" (page 501). For now it seems unnecessary to create new structures, so this patch just uses ACC[i] as VSRs 4*i to 4*i+3 and therefore move to and from accumulators are no-ops. Signed-off-by: Lucas Mateus Castro (alqotel) Reviewed-by: Richard Henderson --- target/ppc/cpu.h| 5 + target/ppc/insn32.decode| 9 + target/ppc/translate/vsx-impl.c.inc | 31 + 3 files changed, 45 insertions(+) diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 901ded79e9..2e80d0978f 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2661,6 +2661,11 @@ static inline int vsr_full_offset(int i) return offsetof(CPUPPCState, vsr[i].u64[0]); } +static inline int acc_full_offset(int i) +{ +return vsr_full_offset(i * 4); +} + static inline int fpr_offset(int i) { return vsr64_offset(i, true); diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 39372fe673..7a76bedfa6 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -151,6 +151,9 @@ _vrt_frbp vrt frbp @X_vrt_frbp .. vrt:5 . 0 .. . _vrt_frbp frbp=%x_frbp +_ara +@X_a.. ra:3 .. . . .. . _a + %xx_xt 0:1 21:5 %xx_xb 1:1 11:5 %xx_xa 2:1 16:5 @@ -710,3 +713,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - @XX2_bf_xb _s s:uint8_t @XL_s ..-- s:1 .. - _s RFEBB 010011-- . 0010010010 - @XL_s + +## Accumulator Instructions + +XXMFACC 01 ... -- 0 - 0010110001 - @X_a +XXMTACC 01 ... -- 1 - 0010110001 - @X_a +XXSETACCZ 01 ... 
-- 00011 - 0010110001 - @X_a diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 3692740736..dc8875d5d3 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2787,6 +2787,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 *a) return true; } +/* + * The PowerISA 3.1 mentions that for the current version of the + * architecture, "the hardware implementation provides the effect of + * ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data" + * and "The Accumulators introduce no new logical state at this time" + * (page 501). For now it seems unnecessary to create new structures, + * so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore + * move to and from accumulators are no-ops. + */ +static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +return true; +} + +static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a) +{ +REQUIRE_INSNS_FLAGS2(ctx, ISA310); +REQUIRE_VSX(ctx); +tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0); +return true; +} + #undef GEN_XX2FORM #undef GEN_XX3FORM #undef GEN_XX2IFORM -- 2.31.1