[PATCH v3 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VADDCUW and VSUBCUW to decodetree with gvec, using an
implementation based on the helper, the main difference being that the
-1 (all bits set) result the vector compare returns when true is
changed to +1. It also implements a .fni4 version of those instructions
and drops the helper.
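
A minimal sketch of what such a .fni4 per-element expansion can look
like, assuming the usual TCG i32 helpers (the function name here is
illustrative, not taken from the patch):

static void gen_vaddcuw_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
{
    /* carry out of a + b is 1 exactly when ~a < b (unsigned) */
    tcg_gen_not_i32(t, a);
    tcg_gen_setcond_i32(TCG_COND_LTU, t, t, b);
}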

vaddcuw:
rept    loop    master       patch
8       12500   0,01008200   0,00612400 (-39.3%)
25      4000    0,01091500   0,00471600 (-56.8%)
100     1000    0,01332500   0,00593700 (-55.4%)
500     200     0,01998500   0,01275700 (-36.2%)
2500    40      0,04704300   0,04364300 (-7.2%)
8000    12      0,10748200   0,11241000 (+4.6%)

vsubcuw:
rept    loop    master       patch
8       12500   0,01226200   0,00571600 (-53.4%)
25      4000    0,01493500   0,00462100 (-69.1%)
100     1000    0,01522700   0,00455100 (-70.1%)
500     200     0,02384600   0,01133500 (-52.5%)
2500    40      0,04935200   0,03178100 (-35.6%)
8000    12      0,09039900   0,09440600 (+4.4%)

Overall there was a gain in performance, but the generated TCG op count
is still slightly bigger in the new version (it went from 4 to 5).

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 -
 target/ppc/insn32.decode|  2 +
 target/ppc/int_helper.c | 18 -
 target/ppc/translate/vmx-impl.c.inc | 61 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 +-
 5 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f02a9497b7..f7047ed2aa 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,11 +193,9 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a509e84df..aebc7b73c8 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -608,12 +608,14 @@ VRLQNM  000100 . . . 00101000101   @VX
 
 ## Vector Integer Arithmetic Instructions
 
+VADDCUW 000100 . . . 0011000@VX
 VADDCUQ 000100 . . . 0010100@VX
 VADDUQM 000100 . . . 001@VX
 
 VADDEUQM000100 . . . . 00   @VA
 VADDECUQ000100 . . . . 01   @VA
 
+VSUBCUW 000100 . . . 1011000@VX
 VSUBCUQ 000100 . . . 1010100@VX
 VSUBUQM 000100 . . . 101@VX
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index ae1ba8084d..f8dd12e8ae 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,15 +492,6 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i;
-
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-r->u32[i] = ~a->u32[i] < b->u32[i];
-}
-}
-
 /* vprtybw */
 void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
 {
@@ -1962,15 +1953,6 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #endif
 }
 
-void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i;
-
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-r->u32[i] = a->u32[i] >= b->u32[i];
-}
-}
-
 void helper_vsumsws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int64_t t;
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 3acd585a2f..f52485a5f1 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28);
 GEN_VXFORM(vslv, 2, 29);
 GEN_VXFORM(vslo, 6, 16);
 GEN_VXFORM(vsro, 6, 17);
-GEN_VXFORM(vaddcuw, 0, 6);
-GEN_VXFORM(vsubcuw, 0, 22);
 
 static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece,
void (*gen_gvec)(unsigned, uint32_t, uint32_t,
@@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx)
 }
 
 
-GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \

[PATCH v3 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Used gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
rept    loop    imm     master version  prev version        current version
25      4000    0       0,206200        0,040730 (-80.2%)   0,040740 (-80.2%)
25      4000    1       0,205120        0,053650 (-73.8%)   0,053510 (-73.9%)
25      4000    3       0,206160        0,058630 (-71.6%)   0,058570 (-71.6%)
25      4000    51      0,217110        0,191490 (-11.8%)   0,192320 (-11.4%)
25      4000    127     0,206160        0,191490 (-7.1%)    0,192640 (-6.6%)
8000    12      0       1,234719        0,418833 (-66.1%)   0,386365 (-68.7%)
8000    12      1       1,232417        1,435979 (+16.5%)   1,462792 (+18.7%)
8000    12      3       1,232760        1,766073 (+43.3%)   1,743990 (+41.5%)
8000    12      51      1,239281        1,319562 (+6.5%)    1,423479 (+14.9%)
8000    12      127     1,231708        1,315760 (+6.8%)    1,426667 (+15.8%)

xvtstdcdp:
rept    loop    imm     master version  prev version        current version
25      4000    0       0,159930        0,040830 (-74.5%)   0,040610 (-74.6%)
25      4000    1       0,160640        0,053670 (-66.6%)   0,053480 (-66.7%)
25      4000    3       0,160020        0,063030 (-60.6%)   0,062960 (-60.7%)
25      4000    51      0,160410        0,128620 (-19.8%)   0,127470 (-20.5%)
25      4000    127     0,160330        0,127670 (-20.4%)   0,128690 (-19.7%)
8000    12      0       1,190365        0,422146 (-64.5%)   0,388417 (-67.4%)
8000    12      1       1,191292        1,445312 (+21.3%)   1,428698 (+19.9%)
8000    12      3       1,188687        1,980656 (+66.6%)   1,975354 (+66.2%)
8000    12      51      1,191250        1,264500 (+6.1%)    1,355083 (+13.8%)
8000    12      127     1,197313        1,266729 (+5.8%)    1,349156 (+12.7%)

Overall, these are the hardest instructions to measure performance for,
since the gvec implementation depends on the immediate. The tables above
cover 5 immediate scenarios and 2 rept/loop combinations. The immediate
scenarios are: all bits 0 (so the target register only has to be set to
0); 1 bit set; 2 bits set in a combination the new implementation can
handle with gvec; 4 bits set, which the new implementation cannot handle
with gvec; and all bits set. The rept/loop scenarios are high loop with
low rept (so more time is spent executing than translating) and high
rept with low loop (so more time is spent translating than executing).
These comparisons are between the upstream version, a previous similar
implementation, and one with cleaner code (this one).
For a comparison with a previous, different implementation see:
<20221010191356.83659-13-lucas.ara...@eldorado.org.br>
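
As a concrete illustration of the simplest immediate scenario above
(DCMX == 0, where no data class can ever match), the translation can
just clear the target register. This is a rough sketch only; the
function and field names follow the usual QEMU conventions but are not
copied from this patch:

static bool do_xvtstdc_imm0(DisasContext *ctx, arg_XX2_uim *a)
{
    REQUIRE_VSX(ctx);
    /* DCMX == 0: no data class selected, result is all zeroes */
    tcg_gen_gvec_dup_imm(MO_64, vsr_full_offset(a->xt), 16, 16, 0);
    return true;
}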

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/translate/vsx-impl.c.inc | 164 ++--
 1 file changed, 154 insertions(+), 10 deletions(-)

diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index af410cbf1b..7099e7823d 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -632,6 +632,8 @@ static void gen_mtvsrws(DisasContext *ctx)
 #define SGN_MASK_SP 0x8000000080000000ull
 #define EXP_MASK_DP  0x7FF0000000000000ull
 #define EXP_MASK_SP 0x7F8000007F800000ull
+#define FRC_MASK_DP (~(SGN_MASK_DP | EXP_MASK_DP))
+#define FRC_MASK_SP (~(SGN_MASK_SP | EXP_MASK_SP))
 
 #define VSX_SCALAR_MOVE(name, op, sgn_mask)   \
 static void glue(gen_, name)(DisasContext *ctx)   \
@@ -1112,23 +1114,165 @@ GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, PPC2_ISA300)
 GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300)
 GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX)
 
+/* test if +Inf */
+static void gen_is_pos_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v)
+{
+uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
+tcg_constant_vec_matching(t, vece, exp_msk));
+}
+
+/* test if -Inf */
+static void gen_is_neg_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v)
+{
+uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
+tcg_constant_vec_matching(t, vece, sgn_msk | exp_msk));
+}
+
+/* test if +Inf or -Inf */
+static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t v)
+{
+uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+tcg_gen_andc_vec(vece, b, b, tcg_constant_vec_matching(t, vece, sgn_msk));
+tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
+tcg_constant_vec_matching(

[PATCH v3 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved part of
their decoding out of the helper: previously DCMX, XB and BF were
extracted in the helper with the help of cpu_env; that part is now done
in decodetree along with the rest of the decoding.

xststdcsp:
rept    loop    master       patch
8       12500   1,85393600   1,94683600 (+5.0%)
25      4000    1,78779800   1,92479000 (+7.7%)
100     1000    2,12775000   2,28895500 (+7.6%)
500     200     2,99655300   3,23102900 (+7.8%)
2500    40      6,89082200   7,44827500 (+8.1%)
8000    12      17,50585500  18,95152100 (+8.3%)

xststdcdp:
rept    loop    master       patch
8       12500   1,39043100   1,33539800 (-4.0%)
25      4000    1,35731800   1,37347800 (+1.2%)
100     1000    1,51514800   1,56053000 (+3.0%)
500     200     2,21014400   2,47906000 (+12.2%)
2500    40      5,39488200   6,68766700 (+24.0%)
8000    12      13,98623900  18,17661900 (+30.0%)

xststdcqp:
rept    loop    master       patch
8       12500   1,35123800   1,34455800 (-0.5%)
25      4000    1,36441200   1,36759600 (+0.2%)
100     1000    1,49763500   1,54138400 (+2.9%)
500     200     2,19020200   2,46196400 (+12.4%)
2500    40      5,39265700   6,68147900 (+23.9%)
8000    12      14,04163600  18,19669600 (+29.6%)

As some values are now decoded outside the helper and passed to it as
arguments, the number of helper arguments increased, and so did the
number of TCG ops needed to load them. I suspect that is why there is a
slow-down in the tests with a high REPT but a low LOOP.
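
For illustration, the translation now has roughly this shape; this is a
sketch only, and the argument-struct fields and the exact helper
signature are assumptions for the example, not copied from this patch:

static bool trans_XSTSTDCDP(DisasContext *ctx, arg_XX2_bf_uim *a)
{
    REQUIRE_VSX(ctx);
    /* DCMX and BF come from the decoded fields, not from cpu_env */
    gen_helper_XSTSTDCDP(cpu_env, gen_vsr_ptr(a->xb),
                         tcg_constant_i32(a->uim), tcg_constant_i32(a->bf));
    return true;
}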

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 114 +---
 target/ppc/helper.h |   6 +-
 target/ppc/insn32.decode|   6 ++
 target/ppc/translate/vsx-impl.c.inc |  20 -
 target/ppc/translate/vsx-ops.c.inc  |   4 -
 5 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 960a76a8a5..a66e16c212 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb)
 *xt = t;
 }
 
-/*
- * VSX_TEST_DC - VSX floating point test data class
- *   op- instruction mnemonic
- *   nels  - number of elements (1, 2 or 4)
- *   xbn   - VSR register number
- *   tp- type (float32 or float64)
- *   fld   - vsr_t field (VsrD(*) or VsrW(*))
- *   tfld   - target vsr_t field (VsrD(*) or VsrW(*))
- *   fld_max - target field max
- *   scrf - set result in CR and FPCC
- */
-#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf)  \
-void helper_##op(CPUPPCState *env, uint32_t opcode) \
-{   \
-ppc_vsr_t *xt = &env->vsr[xT(opcode)];  \
-ppc_vsr_t *xb = &env->vsr[xbn]; \
-ppc_vsr_t t = { };  \
-uint32_t i, sign, dcmx; \
-uint32_t cc, match = 0; \
-\
-if (!scrf) {\
-dcmx = DCMX_XV(opcode); \
-} else {\
-t = *xt;\
-dcmx = DCMX(opcode);\
-}   \
-\
-for (i = 0; i < nels; i++) {\
-sign = tp##_is_neg(xb->fld);\
-if (tp##_is_any_nan(xb->fld)) { \
-match = extract32(dcmx, 6, 1);  \
-} else if (tp##_is_infinity(xb->fld)) { \
-match = extract32(dcmx, 4 + !sign, 1);  \
-} else if (tp##_is_zero(xb->fld)) { \
-match = extract32(dcmx, 2 + !sign, 1);  \
-} else if (tp##_is_zero_or_denormal(xb->fld)) { \
-match = extract32(dcmx, 0 + !sign, 1);  \
-}   \
-\
-if (scrf) { \
-cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT;  \
-env->fpscr &= ~FP_FPCC; \
-env->fpscr |= cc << FPSCR_FPCC; \
-env->crf[BF(opcode)] = cc;  \
-} else {  

[PATCH v3 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVABSSP, XVABSDP, XVNABSSP, XVNABSDP, XVNEGSP and XVNEGDP to
decodetree and used gvec to translate them.

xvabssp:
rept    loop    master       patch
8       12500   0,00477900   0,00476000 (-0.4%)
25      4000    0,00442800   0,00353300 (-20.2%)
100     1000    0,00478700   0,00366100 (-23.5%)
500     200     0,00973200   0,00649400 (-33.3%)
2500    40      0,03165200   0,02226700 (-29.7%)
8000    12      0,09315900   0,06674900 (-28.3%)

xvabsdp:
rept    loop    master       patch
8       12500   0,00475000   0,00474400 (-0.1%)
25      4000    0,00355600   0,00367500 (+3.3%)
100     1000    0,00444200   0,00366000 (-17.6%)
500     200     0,00942700   0,00732400 (-22.3%)
2500    40      0,0299       0,02308500 (-22.8%)
8000    12      0,08770300   0,06683800 (-23.8%)

xvnabssp:
rept    loop    master       patch
8       12500   0,00494500   0,00492900 (-0.3%)
25      4000    0,00397700   0,00338600 (-14.9%)
100     1000    0,00421400   0,00353500 (-16.1%)
500     200     0,01048000   0,00707100 (-32.5%)
2500    40      0,03251500   0,02238300 (-31.2%)
8000    12      0,08889100   0,06469800 (-27.2%)

xvnabsdp:
rept    loop    master       patch
8       12500   0,00511000   0,00492700 (-3.6%)
25      4000    0,00398800   0,00381500 (-4.3%)
100     1000    0,00390500   0,00365900 (-6.3%)
500     200     0,00924800   0,00784600 (-15.2%)
2500    40      0,03138900   0,02391600 (-23.8%)
8000    12      0,09654200   0,05684600 (-41.1%)

xvnegsp:
rept    loop    master       patch
8       12500   0,00493900   0,00452800 (-8.3%)
25      4000    0,00369100   0,00366800 (-0.6%)
100     1000    0,00371100   0,0038 (+2.4%)
500     200     0,00991100   0,00652300 (-34.2%)
2500    40      0,03025800   0,02422300 (-19.9%)
8000    12      0,09251100   0,06457600 (-30.2%)

xvnegdp:
rept    loop    master       patch
8       12500   0,00474900   0,00454400 (-4.3%)
25      4000    0,00353100   0,00325600 (-7.8%)
100     1000    0,00398600   0,00366800 (-8.0%)
500     200     0,01032300   0,00702400 (-32.0%)
2500    40      0,03125000   0,02422400 (-22.5%)
8000    12      0,09475100   0,06173000 (-34.9%)

To me this one looks like the opposite of the previous instructions: the
gain appears to come from translation time, which is not a surprise,
since before the patch the operations were done twice (once per
doubleword), so twice as many TCG ops had to be translated.
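
As an illustration of the gvec form, the absolute-value case can be
expanded as a single AND-with-immediate over the whole register; a rough
sketch, with offsets and names following QEMU conventions but not copied
from this patch:

static bool trans_XVABSDP(DisasContext *ctx, arg_XX2 *a)
{
    REQUIRE_VSX(ctx);
    /* clear the sign bit of every doubleword */
    tcg_gen_gvec_andi(MO_64, vsr_full_offset(a->xt), vsr_full_offset(a->xb),
                      ~SGN_MASK_DP, 16, 16);
    return true;
}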

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn32.decode|  9 
 target/ppc/translate/vsx-impl.c.inc | 73 ++---
 target/ppc/translate/vsx-ops.c.inc  |  6 ---
 3 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index ae151c4b62..5b687078be 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 .   @X_TSX
 STXVRWX 01 . . . 0011001101 .   @X_TSX
 STXVRDX 01 . . . 0011101101 .   @X_TSX
 
+## VSX Vector Binary Floating-Point Sign Manipulation Instructions
+
+XVABSDP 00 . 0 . 111011001 ..   @XX2
+XVABSSP 00 . 0 . 110011001 ..   @XX2
+XVNABSDP00 . 0 . 01001 ..   @XX2
+XVNABSSP00 . 0 . 110101001 ..   @XX2
+XVNEGDP 00 . 0 . 11001 ..   @XX2
+XVNEGSP 00 . 0 . 110111001 ..   @XX2
+
 ## VSX Scalar Multiply-Add Instructions
 
 XSMADDADP   00 . . . 0011 . . . @XX3
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index e6e5c45ffd..8717e20d08 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -782,15 +782,76 @@ static void glue(gen_, name)(DisasContext *ctx)   \
 tcg_temp_free_i64(sgm);  \
 }
 
-VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP)
 VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP)
 VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP)
 
+#define TCG_OP_IMM_i64(FUNC, OP, IMM)   \
+static void FUNC(TCGv_i64 t, TCGv_i64 b)\
+{   \
+OP(t, b, IMM);  \
+}
+
+TCG_OP_IMM_i64(do_xvabssp_i64, tcg_ge

[PATCH v3 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VABSDUB, VABSDUH and VABSDUW to decodetree and use gvec to
translate them.

vabsdub:
rept    loop    master       patch
8       12500   0,03601600   0,00688500 (-80.9%)
25      4000    0,03651000   0,00532100 (-85.4%)
100     1000    0,03666900   0,00595300 (-83.8%)
500     200     0,04305800   0,01244600 (-71.1%)
2500    40      0,06893300   0,04273700 (-38.0%)
8000    12      0,14633200   0,12660300 (-13.5%)

vabsduh:
rept    loop    master       patch
8       12500   0,02172400   0,00687500 (-68.4%)
25      4000    0,02154100   0,00531500 (-75.3%)
100     1000    0,02235400   0,00596300 (-73.3%)
500     200     0,02827500   0,01245100 (-56.0%)
2500    40      0,05638400   0,04285500 (-24.0%)
8000    12      0,13166000   0,12641400 (-4.0%)

vabsduw:
rept    loop    master       patch
8       12500   0,01646400   0,00688300 (-58.2%)
25      4000    0,01454500   0,00475500 (-67.3%)
100     1000    0,01545800   0,00511800 (-66.9%)
500     200     0,02168200   0,01114300 (-48.6%)
2500    40      0,04571300   0,04138800 (-9.5%)
8000    12      0,12209500   0,12178500 (-0.3%)

Same as with VADDCUW and VSUBCUW: an overall performance gain, but more
TCG ops are used (4 before the patch, 6 after).
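
A sketch of the kind of per-element vector expansion usable here,
assuming the host provides umax/umin vector ops (gvec otherwise falls
back to the helper); the function name is illustrative:

static void gen_vabsdu_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec tmp = tcg_temp_new_vec_matching(t);

    /* |a - b| for unsigned elements: umax(a, b) - umin(a, b) */
    tcg_gen_umax_vec(vece, tmp, a, b);
    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_sub_vec(vece, t, tmp, t);

    tcg_temp_free_vec(tmp);
}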

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  6 ++--
 target/ppc/insn32.decode|  6 
 target/ppc/int_helper.c | 13 +++-
 target/ppc/translate/vmx-impl.c.inc | 49 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 71c22efc2e..fd8280dfa7 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 38458c01de..ae151c4b62 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -528,6 +528,12 @@ VAVGUB  000100 . . . 110@VX
 VAVGUH  000100 . . . 1000110@VX
 VAVGUW  000100 . . . 1001010@VX
 
+## Vector Integer Absolute Difference Instructions
+
+VABSDUB 000100 . . . 111@VX
+VABSDUH 000100 . . . 1000111@VX
+VABSDUW 000100 . . . 1001011@VX
+
 ## Vector Bit Manipulation Instruction
 
 VGNB000100 . -- ... . 10011001100   @VX_n
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index bda76e54d4..d97a7f1f28 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -589,8 +589,8 @@ VAVG(VAVGSW, s32, int64_t)
 VAVG(VAVGUW, u32, uint64_t)
 #undef VAVG
 
-#define VABSDU_DO(name, element)\
-void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
+#define VABSDU(name, element)   \
+void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\
 {   \
 int i;  \
 \
@@ -606,12 +606,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
  *   name- instruction mnemonic suffix (b: byte, h: halfword, w: word)
  *   element - element type to access from vector
  */
-#define VABSDU(type, element)   \
-VABSDU_DO(absdu##type, element)
-VABSDU(b, u8)
-VABSDU(h, u16)
-VABSDU(w, u32)
-#undef VABSDU_DO
+VABSDU(VABSDUB, u8)
+VABSDU(VABSDUH, u16)
+VABSDU(VABSDUW, u32)
 #undef VABSDU
 
 #define VCF(suffix, cvt, element)   \
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 195c601f7a..7741f2eb49 100644
--- a/

[PATCH v3 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH and VAVGSW
to decodetree and used gvec for them. For these the right shift has to be
done before the sum to avoid an overflow, so 1 is added at the end if
either operand had its LSB set, replicating the "+ 1" before the shift
described by the ISA.
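
In scalar terms the trick is avg = (a >> 1) + (b >> 1) + ((a | b) & 1),
which equals (a + b + 1) >> 1 without needing a wider intermediate. A
minimal sketch for the unsigned word case (the name is illustrative; the
actual patch expands this per element through gvec):

static void gen_vavguw_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 lsb = tcg_temp_new_i32();
    TCGv_i32 half_b = tcg_temp_new_i32();

    tcg_gen_or_i32(lsb, a, b);
    tcg_gen_andi_i32(lsb, lsb, 1);       /* rounding bit: 1 if either LSB set */
    tcg_gen_shri_i32(t, a, 1);
    tcg_gen_shri_i32(half_b, b, 1);
    tcg_gen_add_i32(t, t, half_b);
    tcg_gen_add_i32(t, t, lsb);          /* (a >> 1) + (b >> 1) + rounding */

    tcg_temp_free_i32(lsb);
    tcg_temp_free_i32(half_b);
}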

vavgub:
rept    loop    master       patch
8       12500   0,02616600   0,00754200 (-71.2%)
25      4000    0,0253       0,00637700 (-74.8%)
100     1000    0,02604600   0,00790100 (-69.7%)
500     200     0,03189300   0,01838400 (-42.4%)
2500    40      0,06006900   0,06851000 (+14.1%)
8000    12      0,13941000   0,20548500 (+47.4%)

vavguh:
rept    loop    master       patch
8       12500   0,01818200   0,00780600 (-57.1%)
25      4000    0,01789300   0,00641600 (-64.1%)
100     1000    0,01899100   0,00787200 (-58.5%)
500     200     0,02527200   0,01828400 (-27.7%)
2500    40      0,05361800   0,06773000 (+26.3%)
8000    12      0,12886600   0,20291400 (+57.5%)

vavguw:
rept    loop    master       patch
8       12500   0,01423100   0,00776600 (-45.4%)
25      4000    0,01780800   0,00638600 (-64.1%)
100     1000    0,02085500   0,00787000 (-62.3%)
500     200     0,02737100   0,01828800 (-33.2%)
2500    40      0,05572600   0,06774200 (+21.6%)
8000    12      0,13101700   0,20311600 (+55.0%)

vavgsb:
rept    loop    master       patch
8       12500   0,03006000   0,00788600 (-73.8%)
25      4000    0,02882200   0,00637800 (-77.9%)
100     1000    0,02958000   0,00791400 (-73.2%)
500     200     0,03548800   0,01860400 (-47.6%)
2500    40      0,0636       0,06850800 (+7.7%)
8000    12      0,13816500   0,20550300 (+48.7%)

vavgsh:
rept    loop    master       patch
8       12500   0,01965900   0,00776600 (-60.5%)
25      4000    0,01875400   0,00638700 (-65.9%)
100     1000    0,01952200   0,00786900 (-59.7%)
500     200     0,02562000   0,01760300 (-31.3%)
2500    40      0,05384300   0,06742800 (+25.2%)
8000    12      0,13240800   0,2033 (+53.5%)

vavgsw:
rept    loop    master       patch
8       12500   0,01407700   0,00775600 (-44.9%)
25      4000    0,01762300   0,0064 (-63.7%)
100     1000    0,02046500   0,00788500 (-61.5%)
500     200     0,02745600   0,01843000 (-32.9%)
2500    40      0,05375500   0,06820500 (+26.9%)
8000    12      0,13068300   0,20304900 (+55.4%)

These results seem to indicate that with gvec translation is slower but
execution is faster.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  12 ++--
 target/ppc/insn32.decode|   9 +++
 target/ppc/int_helper.c |  32 -
 target/ppc/translate/vmx-impl.c.inc | 106 
 target/ppc/translate/vmx-ops.c.inc  |   9 +--
 5 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index a06193bc67..71c22efc2e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 #define dh_ctype_acc ppc_acc_t *
 #define dh_typecode_acc dh_typecode_ptr
 
-DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aa4968e6b9..38458c01de 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 011111   @VC
 VCMPSQ  000100 ... -- . . 0010101   @VX_bf
 VCMPUQ  000100 ... -- . . 

[PATCH v3 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VPRTYBW and VPRTYBD to use gvec, and moved those two along with
VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and .fni8,
respectively.
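
A sketch of the .fni8 form referred to here (the name is illustrative):
the doubleword result is the parity of the bytes' least-significant
bits, computed with a population count reduced to bit 0.

static void gen_vprtybd_i64(TCGv_i64 t, TCGv_i64 b)
{
    tcg_gen_andi_i64(t, b, dup_const(MO_8, 1));   /* keep each byte's LSB */
    tcg_gen_ctpop_i64(t, t);                      /* count the set LSBs   */
    tcg_gen_andi_i64(t, t, 1);                    /* parity = count & 1   */
}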

vprtybw:
rept    loop    master       patch
8       12500   0,01198900   0,00703100 (-41.4%)
25      4000    0,01070100   0,00571400 (-46.6%)
100     1000    0,01123300   0,00678200 (-39.6%)
500     200     0,01601500   0,01535600 (-4.1%)
2500    40      0,03872900   0,05562100 (43.6%)
8000    12      0,10047000   0,16643000 (65.7%)

vprtybd:
rept    loop    master       patch
8       12500   0,00757700   0,00788100 (4.0%)
25      4000    0,00652500   0,00669600 (2.6%)
100     1000    0,00714400   0,00825400 (15.5%)
500     200     0,01211000   0,01903700 (57.2%)
2500    40      0,03483800   0,07021200 (101.5%)
8000    12      0,09591800   0,21036200 (119.3%)

vprtybq:
rept    loop    master       patch
8       12500   0,00675600   0,00667200 (-1.2%)
25      4000    0,00619400   0,00643200 (3.8%)
100     1000    0,00707100   0,00751100 (6.2%)
500     200     0,01199300   0,01342000 (11.9%)
2500    40      0,03490900   0,04092900 (17.2%)
8000    12      0,09588200   0,11465100 (19.6%)

I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
and I'm not sure it is worth moving those instructions. Comparing the
assembly of the helper with the generated TCG ops they are pretty
similar, so I'm not sure why vprtybd took so much more time.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  4 +-
 target/ppc/insn32.decode|  4 ++
 target/ppc/int_helper.c | 25 +--
 target/ppc/translate/vmx-impl.c.inc | 68 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 71 insertions(+), 33 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b2e910b089..a06193bc67 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM  000100 . . . 1000100@VX
 VPDEPD  000100 . . . 10111001101@VX
 VPEXTD  000100 . . . 10110001101@VX
 
+VPRTYBD 000100 . 01001 . 1100010@VX_tb
+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX  000100 . . . . 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c7fd0d1faa..c6ce4665fa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-/* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-uint64_t res = b->u32[i] ^ (b->u32[i] >> 16);
-res ^= res >> 8;
-r->u32[i] = res & 1;
-}
-}
-
-/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-uint64_t res = b->u64[i] ^ (b->u64[i] >> 32);
-res ^= res >> 16;
-res ^= res >> 8;
-r->u64[i] = res & 1;
-}
-}
-
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 uint64_t res = b->u64[0] ^ b->u64[1];
 res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index b9a9e83ab3..cbb2a3ebe7 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,71 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GE

[PATCH v3 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VNEGW and VNEGD to decodetree and used gvec to
translate them.

vnegw:
rept    loop    master       patch
8       12500   0,01053200   0,00548400 (-47.9%)
25      4000    0,01030500   0,0039 (-62.2%)
100     1000    0,01096300   0,00395400 (-63.9%)
500     200     0,01472000   0,00712300 (-51.6%)
2500    40      0,03809000   0,02147700 (-43.6%)
8000    12      0,09957100   0,06202100 (-37.7%)

vnegd:
rept    loop    master       patch
8       12500   0,00594600   0,00543800 (-8.5%)
25      4000    0,00575200   0,00396400 (-31.1%)
100     1000    0,00676100   0,00394800 (-41.6%)
500     200     0,01149300   0,00709400 (-38.3%)
2500    40      0,03441500   0,02169600 (-37.0%)
8000    12      0,09516900   0,06337000 (-33.4%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 --
 target/ppc/insn32.decode|  3 +++
 target/ppc/int_helper.c | 12 
 target/ppc/translate/vmx-impl.c.inc | 15 +--
 target/ppc/translate/vmx-ops.c.inc  |  2 --
 5 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f7047ed2aa..b2e910b089 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -229,8 +229,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr)
-DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aebc7b73c8..2658dd3395 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010   @VX_tb
 VEXTSW2D000100 . 11010 . 1100010@VX_tb
 VEXTSD2Q000100 . 11011 . 1100010@VX_tb
 
+VNEGD   000100 . 00111 . 1100010@VX_tb
+VNEGW   000100 . 00110 . 1100010@VX_tb
+
 ## Vector Mask Manipulation Instructions
 
 MTVSRBM 000100 . 1 . 1100110@VX_tb
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f8dd12e8ae..c7fd0d1faa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1928,18 +1928,6 @@ XXBLEND(W, 32)
 XXBLEND(D, 64)
 #undef XXBLEND
 
-#define VNEG(name, element) \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *b)  \
-{   \
-int i;  \
-for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
-r->element[i] = -b->element[i]; \
-}   \
-}
-VNEG(vnegw, s32)
-VNEG(vnegd, s64)
-#undef VNEG
-
 void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int sh = (b->VsrB(0xf) >> 3) & 0xf;
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index f52485a5f1..b9a9e83ab3 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
-GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
-GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
+
+static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_VECTOR(ctx);
+
+tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb),
+ 16, 16);
+return true;
+}
+
+TRANS(VNEGW, do_vneg, MO_32)
+TRANS(VNEGD, do_vneg, MO_64)
 
 static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s)
 {
diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc
index ded0234123..27908533dd 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10),
 GEN_VXFORM(vspltisb, 6, 12),
 GEN_VXFORM(vspltish, 6, 13),
 GEN_VXFORM(vspltisw, 6, 14),
-GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06),
-GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07),
 GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C),
 GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D),
 GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E),
-- 
2.37.3




[PATCH v3 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMHADDSHS and VMHRADDSHS to decodetree; I couldn't find
a satisfactory inline TCG implementation for them.

vmhaddshs:
rept    loop    master       patch
8       12500   0,02983400   0,02648500 (-11.2%)
25      4000    0,02946000   0,02518000 (-14.5%)
100     1000    0,03104300   0,02638000 (-15.0%)
500     200     0,04002000   0,03502500 (-12.5%)
2500    40      0,08090100   0,07562200 (-6.5%)
8000    12      0,19242600   0,18626800 (-3.2%)

vmhraddshs:
rept    loop    master       patch
8       12500   0,03078600   0,02851000 (-7.4%)
25      4000    0,02793200   0,02746900 (-1.7%)
100     1000    0,02886000   0,02839900 (-1.6%)
500     200     0,03714700   0,03799200 (+2.3%)
2500    40      0,07948000   0,07852200 (-1.2%)
8000    12      0,19049800   0,18813900 (-1.2%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h | 4 ++--
 target/ppc/insn32.decode| 2 ++
 target/ppc/int_helper.c | 4 ++--
 target/ppc/translate/vmx-impl.c.inc | 5 +++--
 target/ppc/translate/vmx-ops.c.inc  | 1 -
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 9c562ab00e..f02a9497b7 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
-DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7445455a12..9a509e84df 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
 VMLADDUHM   000100 . . . . 100010   @VA
+VMHADDSHS   000100 . . . . 10   @VA
+VMHRADDSHS  000100 . . . . 11   @VA
 
 ## Vector String Instructions
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 0d25000b2a..ae1ba8084d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r)
 return count;
 }
 
-void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
   ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
@@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
 }
 }
 
-void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 9f18c6d4f2..3acd585a2f 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx)  \
 tcg_temp_free_ptr(rd);  \
 }
 
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
+GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
@@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a,
 TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS)
 TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS)
 
-GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
+TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS)
+TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index a3a0fd0650..7cd9d40e06 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15),
 
 #define GEN_VAFORM_PAIRED(name0, name1, opc2)   \
 GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC)
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16),
 GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23),
 
 GEN_VXFORM_DUAL(vclzb, vpopc

[PATCH v3 00/12] VMX/VSX instructions with gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Patches missing review: 12

v2 -> v3:
- Used ctpop in i32 and i64 vprtyb
- Changed gvec set up in xvtstdc[ds]p

v1 -> v2:
- Implemented instructions with fni4/fni8 and dropped the helper:
* VSUBCUW
* VADDCUW
* VPRTYBW
* VPRTYBD
- Reworked patch12 to only use gvec implementation with a few
  immediates.
- Used bitsel_ver on patch9
- Changed vec variables to tcg_constant_vec when possible

This patch series moves some instructions from the legacy decoder to
decodetree and translates said instructions with gvec. In some cases the
gvec version ended up bigger, more complex and slower, so those
instructions were only moved to decodetree.

In each patch there's a comparison of the execution time before the
patch being applied and after. Said result is the sum of 10 executions.

The program used to time the execution worked like this:

clock_t start = clock();
for (int i = 0; i < LOOP; i++) {
asm (
 load values in registers, between 2 and 3 instructions
 ".rept REPT\n\t"
 "INSTRUCTION registers\n\t"
 ".endr\n\t"
 save result from register, 1 instruction
);
}
clock_t end = clock();
printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n",
   ((double)(end - start))/ CLOCKS_PER_SEC);

The rept column is the value used for .rept in the inline assembly and
the loop column is the value used for the for loop. All of those tests
were executed on a Power9. When comparing the TCG ops, the data was
gathered using '-d op' and '-d op_opt'.
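
For concreteness, a complete hypothetical instance of that skeleton for
one instruction might look like the following; the register numbers, the
REPT/LOOP values and building with -maltivec on a POWER host are all
assumptions for the example, not part of the original test harness:

#include <stdio.h>
#include <time.h>

#define LOOP 4000

int main(void)
{
    clock_t start = clock();
    for (int i = 0; i < LOOP; i++) {
        asm volatile(
            "vspltisw 0, 1\n\t"        /* load test values into v0/v1 */
            "vspltisw 1, 2\n\t"
            ".rept 25\n\t"             /* REPT copies of the instruction */
            "vaddcuw 2, 0, 1\n\t"
            ".endr\n\t"
            ::: "v0", "v1", "v2");
    }
    clock_t end = clock();
    printf("vaddcuw rept=25 loop=%d, time taken: %.12lf\n",
           LOOP, ((double)(end - start)) / CLOCKS_PER_SEC);
    return 0;
}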

Lucas Mateus Castro (alqotel) (12):
  target/ppc: Moved VMLADDUHM to decodetree and use gvec
  target/ppc: Move VMH[R]ADDSHS instruction to decodetree
  target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
  target/ppc: Move VNEG[WD] to decodetree and use gvec
  target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
  target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec
  target/ppc: Move VABSDU[BHW] to decodetree and use gvec
  target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
  target/ppc: Use gvec to decode XVCPSGN[SD]P
  target/ppc: Moved XVTSTDC[DS]P to decodetree
  target/ppc: Moved XSTSTDC[QDS]P to decodetree
  target/ppc: Use gvec to decode XVTSTDC[DS]P

 target/ppc/fpu_helper.c | 137 +-
 target/ppc/helper.h |  42 ++--
 target/ppc/insn32.decode|  50 
 target/ppc/int_helper.c | 107 ++--
 target/ppc/translate.c  |   1 -
 target/ppc/translate/vmx-impl.c.inc | 352 ++
 target/ppc/translate/vmx-ops.c.inc  |  15 +-
 target/ppc/translate/vsx-impl.c.inc | 372 +++-
 target/ppc/translate/vsx-ops.c.inc  |  21 --
 9 files changed, 771 insertions(+), 326 deletions(-)

-- 
2.37.3




[PATCH v3 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMLADDUHM to decodetree and creates a gvec
implementation using mul_vec and add_vec.

rept    loop    master       patch
8       12500   0,01810500   0,00903100 (-50.1%)
25      4000    0,01739400   0,00747700 (-57.0%)
100     1000    0,01843600   0,00901400 (-51.1%)
500     200     0,02574600   0,01971000 (-23.4%)
2500    40      0,05921600   0,07121800 (+20.3%)
8000    12      0,15326700   0,21725200 (+41.7%)

I think the significant difference in performance when REPT is low and
LOOP is high is due to the new implementation having a higher
translation cost: with a helper only 5 TCG ops are used, but with this
patch a total of 10 TCG ops are needed (Power lacks a direct mul_vec
equivalent, so that operation is implemented with the help of 5
instructions: vmuleu, vmulou, vmrgh, vmrgl and vpkum).
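
The per-element vector expansion is essentially a modular multiply-add;
a minimal sketch of the .fniv body (assuming mul_vec/add_vec are
available on the host, otherwise gvec falls back to the helper):

static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a,
                              TCGv_vec b, TCGv_vec c)
{
    tcg_gen_mul_vec(vece, t, a, b);     /* (a * b) mod 2^16 per element */
    tcg_gen_add_vec(vece, t, t, c);     /* + c, also modulo 2^16        */
}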

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 +-
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c |  3 +-
 target/ppc/translate.c  |  1 -
 target/ppc/translate/vmx-impl.c.inc | 48 ++---
 5 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57eee07256..9c562ab00e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr)
-DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
+DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32)
 DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index a5249ee32c..7445455a12 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -693,6 +693,8 @@ VMSUMUHS000100 . . . . 100111   @VA
 VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
+VMLADDUHM   000100 . . . . 100010   @VA
+
 ## Vector String Instructions
 
 VSTRIBL 000100 . 0 . . 001101   @VX_tb_rc
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 696096100b..0d25000b2a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
 }
 }
 
-void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c,
+  uint32_t v)
 {
 int i;
 
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index e810842925..11f729c60c 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x00000001, PPC_ALTIVEC),
 GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x00000001, PPC_ALTIVEC),
 GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC),
 GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC),
-GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC),
 #if defined(TARGET_PPC64)
 GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE,
   PPC2_ISA300),
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index e644ad3236..9f18c6d4f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx)  \
 
 GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
 
-static void gen_vmladduhm(DisasContext *ctx)
-{
-TCGv_ptr ra, rb, rc, rd;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-ra = gen_avr_ptr(rA(ctx->opcode));
-rb = gen_avr_ptr(rB(ctx->opcode));
-rc = gen_avr_ptr(rC(ctx->opcode));
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_vmladduhm(rd, ra, rb, rc);
-tcg_temp_free_ptr(ra);
-tcg_temp_free_ptr(rb);
-tcg_temp_free_ptr(rc);
-tcg_temp_free_ptr(rd);
-}
-
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
 {
@@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, gen_helper_VSUBECUQ)
 TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM)
 TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR)
 
+static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec 
b,
+   

[PATCH v3 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate
them.

xvcpsgnsp:
rept    loop    master       patch
8       12500   0,00561400   0,00537900 (-4.2%)
25      4000    0,00562100   0,0040 (-28.8%)
100     1000    0,00696900   0,00416300 (-40.3%)
500     200     0,02211900   0,00840700 (-62.0%)
2500    40      0,09328600   0,02728300 (-70.8%)
8000    12      0,27295300   0,06867800 (-74.8%)

xvcpsgndp:
rept    loop    master       patch
8       12500   0,00556300   0,00584200 (+5.0%)
25      4000    0,00482700   0,00431700 (-10.6%)
100     1000    0,00585800   0,00464400 (-20.7%)
500     200     0,01565300   0,00839700 (-46.4%)
2500    40      0,05766500   0,02430600 (-57.8%)
8000    12      0,19875300   0,07947100 (-60.0%)

Like the previous instructions, there seems to be an improvement in
translation time.
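
A sketch of the per-doubleword form of the copy-sign operation described
here (the function name is illustrative, not taken from the patch):

static void gen_xvcpsgndp_i64(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 sgn = tcg_temp_new_i64();

    tcg_gen_andi_i64(sgn, a, SGN_MASK_DP);    /* sign bit from a  */
    tcg_gen_andi_i64(t, b, ~SGN_MASK_DP);     /* magnitude from b */
    tcg_gen_or_i64(t, t, sgn);

    tcg_temp_free_i64(sgn);
}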

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn32.decode|   2 +
 target/ppc/translate/vsx-impl.c.inc | 109 ++--
 target/ppc/translate/vsx-ops.c.inc  |   3 -
 3 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 5b687078be..6549c4040e 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 ..   @XX2
 XVNABSSP00 . 0 . 110101001 ..   @XX2
 XVNEGDP 00 . 0 . 11001 ..   @XX2
 XVNEGSP 00 . 0 . 110111001 ..   @XX2
+XVCPSGNDP   00 . . .  ...   @XX3
+XVCPSGNSP   00 . . . 1101 ...   @XX3
 
 ## VSX Scalar Multiply-Add Instructions
 
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index 8717e20d08..1c289238ec 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP)
 
-#define VSX_VECTOR_MOVE(name, op, sgn_mask)  \
-static void glue(gen_, name)(DisasContext *ctx)  \
-{\
-TCGv_i64 xbh, xbl, sgm;  \
-if (unlikely(!ctx->vsx_enabled)) {   \
-gen_exception(ctx, POWERPC_EXCP_VSXU);   \
-return;  \
-}\
-xbh = tcg_temp_new_i64();\
-xbl = tcg_temp_new_i64();\
-sgm = tcg_temp_new_i64();\
-get_cpu_vsr(xbh, xB(ctx->opcode), true); \
-get_cpu_vsr(xbl, xB(ctx->opcode), false);\
-tcg_gen_movi_i64(sgm, sgn_mask); \
-switch (op) {\
-case OP_ABS: {   \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-break;   \
-}\
-case OP_NABS: {  \
-tcg_gen_or_i64(xbh, xbh, sgm);   \
-tcg_gen_or_i64(xbl, xbl, sgm);   \
-break;   \
-}\
-case OP_NEG: {   \
-tcg_gen_xor_i64(xbh, xbh, sgm);  \
-tcg_gen_xor_i64(xbl, xbl, sgm);  \
-break;   \
-}\
-case OP_CPSGN: { \
-TCGv_i64 xah = tcg_temp_new_i64();   \
-TCGv_i64 xal = tcg_temp_new_i64();   \
-get_cpu_vsr(xah, xA(ctx->opcode), true); \
-get_cpu_vsr(xal, xA(ctx->opcode), false);\
-tcg_gen_and_i64(xah, xah, sgm);  \
-tcg_gen_and_i64(xal, xal, sgm);  \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-tcg

[PATCH v3 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree

2022-10-19 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVTSTDCSP and XVTSTDCDP to decodetree and restructured the helper
to be simpler and do all decoding in decodetree (so XB, XT and DCMX are
all calculated outside the helper).

Note: the tests for this one are slightly different; the times are the
sum over all possible immediates, and each instruction is repeated 10
times.

xvtstdcsp:
rept    loop    master       patch
8       12500   2,76402100   2,70699100 (-2.1%)
25      4000    2,64867100   2,67884100 (+1.1%)
100     1000    2,73806300   2,78701000 (+1.8%)
500     200     3,44666500   3,61027600 (+4.7%)
2500    40      5,85790200   6,47475500 (+10.5%)
8000    12      15,22102100  17,46062900 (+14.7%)

xvtstdcdp:
rept    loop    master       patch
8       12500   2,11818000   1,61065300 (-24.0%)
25      4000    2,04573400   1,60132200 (-21.7%)
100     1000    2,13834100   1,69988100 (-20.5%)
500     200     2,73977000   2,48631700 (-9.3%)
2500    40      5,05067000   5,25914100 (+4.1%)
8000    12      14,60507800  15,93704900 (+9.1%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 39 +++--
 target/ppc/helper.h |  4 +--
 target/ppc/insn32.decode|  5 
 target/ppc/translate/vsx-impl.c.inc | 28 +++--
 target/ppc/translate/vsx-ops.c.inc  |  8 --
 5 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index ae25f32d6e..960a76a8a5 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \
 }   \
 }
 
-VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0)
-VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0)
 VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1)
 VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1)
 
+#define VSX_TSTDC(tp)   \
+static int32_t tp##_tstdc(tp b, uint32_t dcmx)  \
+{   \
+uint32_t match = 0; \
+uint32_t sign = tp##_is_neg(b); \
+if (tp##_is_any_nan(b)) {   \
+match = extract32(dcmx, 6, 1);  \
+} else if (tp##_is_infinity(b)) {   \
+match = extract32(dcmx, 4 + !sign, 1);  \
+} else if (tp##_is_zero(b)) {   \
+match = extract32(dcmx, 2 + !sign, 1);  \
+} else if (tp##_is_zero_or_denormal(b)) {   \
+match = extract32(dcmx, 0 + !sign, 1);  \
+}   \
+return (match != 0);\
+}
+
+VSX_TSTDC(float32)
+VSX_TSTDC(float64)
+#undef VSX_TSTDC
+
+void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 2; i++) {
+t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx);
+}
+}
+
+void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 4; i++) {
+t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx);
+}
+}
+
 void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb)
 {
 uint32_t dcmx, sign, exp;
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index fd8280dfa7..9e5d11939b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr)
-DEF_HELPER_2(xvtstdcsp, void, env, i32)
-DEF_HELPER_2(xvtstdcdp, void, env, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
 DEF_HELPER_3(xvrspi, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspic, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspim, void, env, vsr, vsr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6549c4040e..c0a531be5c 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -199,6 +199,9 @@
 
 @XX2_uim4   .. . . uim:4 . . ..         &XX2_uim xt=%xx_xt xb=%xx_xb
 
+%xx_uim7    6:1 2:1 16:5
+@XX2_uim7   .. . . . . ... . ..         &XX2_uim xt=%xx_xt xb=%xx_xb uim=%xx_uim7
+
 &XX2_bf_xb  bf xb
 @XX2_bf_xb  .. bf:3 .. . . . . .        &XX2_bf_xb xb=%xx_xb
 
@@ -848,6 +851

[PATCH v2 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Used gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
rept    loop    imm     prev version    current version
25      4000    0       0,047550        0,040820 (-14.2%)
25      4000    1       0,069520        0,053520 (-23.0%)
25      4000    3       0,078660        0,058470 (-25.7%)
25      4000    51      0,099280        0,190100 (+91.5%)
25      4000    127     0,129690        0,201750 (+55.6%)
8000    12      0       0,554625        0,391385 (-29.4%)
8000    12      1       2,675635        1,423656 (-46.8%)
8000    12      3       3,186823        1,756885 (-44.9%)
8000    12      51      4,284417        1,363698 (-68.2%)
8000    12      127     5,638000        1,305333 (-76.8%)

xvtstdcdp:
rept    loop    imm     prev version    current version
25      4000    0       0,047450        0,040590 (-14.5%)
25      4000    1       0,074130        0,053570 (-27.7%)
25      4000    3       0,084180        0,063020 (-25.1%)
25      4000    51      0,103340        0,127980 (+23.8%)
25      4000    127     0,134670        0,128660 (-4.5%)
8000    12      0       0,522427        0,391510 (-25.1%)
8000    12      1       2,884708        1,426802 (-50.5%)
8000    12      3       3,427625        1,972115 (-42.5%)
8000    12      51      4,450260        1,251865 (-71.9%)
8000    12      127     5,854479        1,250719 (-78.6%)

Overall, these are the hardest instructions to measure performance for,
since the gvec implementation depends on the immediate. The tables above
cover 5 immediate scenarios and 2 rept/loop combinations. The immediate
scenarios are: all bits 0 (so the target register only has to be set to
0); 1 bit set; 2 bits set in a combination the new implementation can
handle with gvec; 4 bits set, which the new implementation cannot handle
with gvec; and all bits set. The rept/loop scenarios are high loop with
low rept (so more time is spent executing than translating) and high
rept with low loop (so more time is spent translating than executing).
There was a gain in translation time, and in execution time for the
immediates the new implementation is set up to accept with gvec, but a
loss in execution time for the more exotic immediates.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/fpu_helper.c |   7 +-
 target/ppc/helper.h |   4 +-
 target/ppc/translate/vsx-impl.c.inc | 188 ++--
 3 files changed, 184 insertions(+), 15 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index a66e16c212..6c94576575 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -22,6 +22,7 @@
 #include "exec/exec-all.h"
 #include "internal.h"
 #include "fpu/softfloat.h"
+#include "tcg/tcg-gvec-desc.h"
 
 static inline float128 float128_snan_to_qnan(float128 x)
 {
@@ -3263,17 +3264,19 @@ VSX_TSTDC(float64)
 VSX_TSTDC(float128)
 #undef VSX_TSTDC
 
-void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx)
 {
 int i;
+dcmx = simd_data(dcmx);
 for (i = 0; i < 2; i++) {
 t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx);
 }
 }
 
-void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint32_t dcmx)
 {
 int i;
+dcmx = simd_data(dcmx);
 for (i = 0; i < 4; i++) {
 t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx);
 }
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 8344fe39c6..2851418acc 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr)
-DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
-DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
+DEF_HELPER_FLAGS_3(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i32)
+DEF_HELPER_FLAGS_3(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i32)
 DEF_HELPER_3(xvrspi, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspic, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspim, void, env, vsr, vsr)
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index 4fdbc45ff4..26fc8c0b01 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -632,6 +632,8 @@ static void gen_mtvsrws(DisasContext *ctx)
 #define SGN_MASK_SP 0x8000000080000000ull
 #define EXP_MASK_DP  0x7FF0000000000000ull
 #define EXP_MASK_SP 0x7F8000007F800000ull
+#define FRC_MASK_DP (~(SGN_MASK_

[PATCH v2 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved part of
their decoding out of the helper: previously DCMX, XB and BF were
extracted in the helper with the help of cpu_env; that part is now done
in decodetree along with the rest of the decoding.

xststdcsp:
rept  loop   master        patch
8     12500  1,85393600    1,94683600 (+5.0%)
25    4000   1,78779800    1,92479000 (+7.7%)
100   1000   2,12775000    2,28895500 (+7.6%)
500   200    2,99655300    3,23102900 (+7.8%)
2500  40     6,89082200    7,44827500 (+8.1%)
8000  12     17,50585500   18,95152100 (+8.3%)

xststdcdp:
rept  loop   master        patch
8     12500  1,39043100    1,33539800 (-4.0%)
25    4000   1,35731800    1,37347800 (+1.2%)
100   1000   1,51514800    1,56053000 (+3.0%)
500   200    2,21014400    2,47906000 (+12.2%)
2500  40     5,39488200    6,68766700 (+24.0%)
8000  12     13,98623900   18,17661900 (+30.0%)

xststdcqp:
rept  loop   master        patch
8     12500  1,35123800    1,34455800 (-0.5%)
25    4000   1,36441200    1,36759600 (+0.2%)
100   1000   1,49763500    1,54138400 (+2.9%)
500   200    2,19020200    2,46196400 (+12.4%)
2500  40     5,39265700    6,68147900 (+23.9%)
8000  12     14,04163600   18,19669600 (+29.6%)

As some values are now decoded outside the helper and passed to it as
arguments, the number of helper arguments increased, and so did the
number of TCGop needed to load those arguments. I suspect that is why
there is a slow-down in the tests with a high REPT but low LOOP.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/fpu_helper.c | 114 +---
 target/ppc/helper.h |   6 +-
 target/ppc/insn32.decode|   6 ++
 target/ppc/translate/vsx-impl.c.inc |  20 -
 target/ppc/translate/vsx-ops.c.inc  |   4 -
 5 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 960a76a8a5..a66e16c212 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb)
 *xt = t;
 }
 
-/*
- * VSX_TEST_DC - VSX floating point test data class
- *   op- instruction mnemonic
- *   nels  - number of elements (1, 2 or 4)
- *   xbn   - VSR register number
- *   tp- type (float32 or float64)
- *   fld   - vsr_t field (VsrD(*) or VsrW(*))
- *   tfld   - target vsr_t field (VsrD(*) or VsrW(*))
- *   fld_max - target field max
- *   scrf - set result in CR and FPCC
- */
-#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf)  \
-void helper_##op(CPUPPCState *env, uint32_t opcode) \
-{   \
-ppc_vsr_t *xt = >vsr[xT(opcode)];  \
-ppc_vsr_t *xb = >vsr[xbn]; \
-ppc_vsr_t t = { };  \
-uint32_t i, sign, dcmx; \
-uint32_t cc, match = 0; \
-\
-if (!scrf) {\
-dcmx = DCMX_XV(opcode); \
-} else {\
-t = *xt;\
-dcmx = DCMX(opcode);\
-}   \
-\
-for (i = 0; i < nels; i++) {\
-sign = tp##_is_neg(xb->fld);\
-if (tp##_is_any_nan(xb->fld)) { \
-match = extract32(dcmx, 6, 1);  \
-} else if (tp##_is_infinity(xb->fld)) { \
-match = extract32(dcmx, 4 + !sign, 1);  \
-} else if (tp##_is_zero(xb->fld)) { \
-match = extract32(dcmx, 2 + !sign, 1);  \
-} else if (tp##_is_zero_or_denormal(xb->fld)) { \
-match = extract32(dcmx, 0 + !sign, 1);  \
-}   \
-\
-if (scrf) { \
-cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT;  \
-env->fpscr &= ~FP_FPCC; \
-env->fpscr |= cc << FPSCR_FPCC; \
-env->crf[BF(opcode)] = cc;  \
-} else {\
-t.tfld = match ? fld_max : 0;   \
-}  

[PATCH v2 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVTSTDCSP and XVTSTDCDP to decodetree and restructured the helper
to be simpler and do all decoding in the decodetree (so XB, XT and DCMX
are all calculated outside the helper).

Obs: The tests for this one are slightly different: the reported time is
the sum over all possible immediates, with each instruction repeated 10
times.

xvtstdcsp:
rept  loop   master        patch
8     12500  2,76402100    2,70699100 (-2.1%)
25    4000   2,64867100    2,67884100 (+1.1%)
100   1000   2,73806300    2,78701000 (+1.8%)
500   200    3,44666500    3,61027600 (+4.7%)
2500  40     5,85790200    6,47475500 (+10.5%)
8000  12     15,22102100   17,46062900 (+14.7%)

xvtstdcdp:
rept  loop   master        patch
8     12500  2,11818000    1,61065300 (-24.0%)
25    4000   2,04573400    1,60132200 (-21.7%)
100   1000   2,13834100    1,69988100 (-20.5%)
500   200    2,73977000    2,48631700 (-9.3%)
2500  40     5,05067000    5,25914100 (+4.1%)
8000  12     14,60507800   15,93704900 (+9.1%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 39 +++--
 target/ppc/helper.h |  4 +--
 target/ppc/insn32.decode|  5 
 target/ppc/translate/vsx-impl.c.inc | 28 +++--
 target/ppc/translate/vsx-ops.c.inc  |  8 --
 5 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index ae25f32d6e..960a76a8a5 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) 
\
 }   \
 }
 
-VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0)
-VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0)
 VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1)
 VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1)
 
+#define VSX_TSTDC(tp)   \
+static int32_t tp##_tstdc(tp b, uint32_t dcmx)  \
+{   \
+uint32_t match = 0; \
+uint32_t sign = tp##_is_neg(b); \
+if (tp##_is_any_nan(b)) {   \
+match = extract32(dcmx, 6, 1);  \
+} else if (tp##_is_infinity(b)) {   \
+match = extract32(dcmx, 4 + !sign, 1);  \
+} else if (tp##_is_zero(b)) {   \
+match = extract32(dcmx, 2 + !sign, 1);  \
+} else if (tp##_is_zero_or_denormal(b)) {   \
+match = extract32(dcmx, 0 + !sign, 1);  \
+}   \
+return (match != 0);\
+}
+
+VSX_TSTDC(float32)
+VSX_TSTDC(float64)
+#undef VSX_TSTDC
+
+void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 2; i++) {
+t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx);
+}
+}
+
+void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 4; i++) {
+t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx);
+}
+}
+
 void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb)
 {
 uint32_t dcmx, sign, exp;
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index fd8280dfa7..9e5d11939b 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -517,8 +517,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr)
-DEF_HELPER_2(xvtstdcsp, void, env, i32)
-DEF_HELPER_2(xvtstdcdp, void, env, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
 DEF_HELPER_3(xvrspi, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspic, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspim, void, env, vsr, vsr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6549c4040e..c0a531be5c 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -199,6 +199,9 @@
 
 @XX2_uim4   .. . . uim:4 . . .. _uim 
xt=%xx_xt xb=%xx_xb
 
+%xx_uim76:1 2:1 16:5
+@XX2_uim7   .. . . .  . ... . .._uim 
xt=%xx_xt xb=%xx_xb uim=%xx_uim7
+
 _bf_xb  bf xb
 @XX2_bf_xb  .. bf:3 .. . . . . ._bf_xb 
xb=%xx_xb
 
@@ -848,6 +851

[PATCH v2 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VABSDUB, VABSDUH and VABSDUW to decodetree and used gvec to
translate them.

vabsdub:
rept  loop   master       patch
8     12500  0,03601600   0,00688500 (-80.9%)
25    4000   0,03651000   0,00532100 (-85.4%)
100   1000   0,03666900   0,00595300 (-83.8%)
500   200    0,04305800   0,01244600 (-71.1%)
2500  40     0,06893300   0,04273700 (-38.0%)
8000  12     0,14633200   0,12660300 (-13.5%)

vabsduh:
rept  loop   master       patch
8     12500  0,02172400   0,00687500 (-68.4%)
25    4000   0,02154100   0,00531500 (-75.3%)
100   1000   0,02235400   0,00596300 (-73.3%)
500   200    0,02827500   0,01245100 (-56.0%)
2500  40     0,05638400   0,04285500 (-24.0%)
8000  12     0,13166000   0,12641400 (-4.0%)

vabsduw:
rept  loop   master       patch
8     12500  0,01646400   0,00688300 (-58.2%)
25    4000   0,01454500   0,00475500 (-67.3%)
100   1000   0,01545800   0,00511800 (-66.9%)
500   200    0,02168200   0,01114300 (-48.6%)
2500  40     0,04571300   0,04138800 (-9.5%)
8000  12     0,12209500   0,12178500 (-0.3%)

Same as with VADDCUW and VSUBCUW, there is an overall performance gain,
but more TCGop are used (4 before the patch, 6 after).

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  6 ++--
 target/ppc/insn32.decode|  6 
 target/ppc/int_helper.c | 13 +++-
 target/ppc/translate/vmx-impl.c.inc | 49 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 71c22efc2e..fd8280dfa7 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 38458c01de..ae151c4b62 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -528,6 +528,12 @@ VAVGUB  000100 . . . 110@VX
 VAVGUH  000100 . . . 1000110@VX
 VAVGUW  000100 . . . 1001010@VX
 
+## Vector Integer Absolute Difference Instructions
+
+VABSDUB 000100 . . . 111@VX
+VABSDUH 000100 . . . 1000111@VX
+VABSDUW 000100 . . . 1001011@VX
+
 ## Vector Bit Manipulation Instruction
 
 VGNB000100 . -- ... . 10011001100   @VX_n
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index bda76e54d4..d97a7f1f28 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -589,8 +589,8 @@ VAVG(VAVGSW, s32, int64_t)
 VAVG(VAVGUW, u32, uint64_t)
 #undef VAVG
 
-#define VABSDU_DO(name, element)\
-void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
+#define VABSDU(name, element)   \
+void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\
 {   \
 int i;  \
 \
@@ -606,12 +606,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b)   \
  *   name- instruction mnemonic suffix (b: byte, h: halfword, w: word)
  *   element - element type to access from vector
  */
-#define VABSDU(type, element)   \
-VABSDU_DO(absdu##type, element)
-VABSDU(b, u8)
-VABSDU(h, u16)
-VABSDU(w, u32)
-#undef VABSDU_DO
+VABSDU(VABSDUB, u8)
+VABSDU(VABSDUH, u16)
+VABSDU(VABSDUW, u32)
 #undef VABSDU
 
 #define VCF(suffix, cvt, element)   \
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 1e3e099739..f46a354d31 100644
--- a/

[PATCH v2 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH and VAVGSW
to decodetree and used gvec for them. For these, the right shift has to
be done before the sum to avoid an overflow, so 1 is added at the end if
either of the operands had 1 in its LSB, replicating the "+ 1" before
the shift described by the ISA.
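
As a scalar illustration of that trick (my own sketch, not code from the
patch), the rounding average can be computed without a wider type:

/* (a + b + 1) >> 1 without overflow: shift first, then add back the
 * bit that the two LSBs plus the rounding +1 would have carried. */
static inline uint32_t avgu32_round(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + ((a | b) & 1);
}

The signed variants work the same way, just with arithmetic shifts.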

vavgub:
rept  loop   master       patch
8     12500  0,02616600   0,00754200 (-71.2%)
25    4000   0,0253       0,00637700 (-74.8%)
100   1000   0,02604600   0,00790100 (-69.7%)
500   200    0,03189300   0,01838400 (-42.4%)
2500  40     0,06006900   0,06851000 (+14.1%)
8000  12     0,13941000   0,20548500 (+47.4%)

vavguh:
rept  loop   master       patch
8     12500  0,01818200   0,00780600 (-57.1%)
25    4000   0,01789300   0,00641600 (-64.1%)
100   1000   0,01899100   0,00787200 (-58.5%)
500   200    0,02527200   0,01828400 (-27.7%)
2500  40     0,05361800   0,06773000 (+26.3%)
8000  12     0,12886600   0,20291400 (+57.5%)

vavguw:
rept  loop   master       patch
8     12500  0,01423100   0,00776600 (-45.4%)
25    4000   0,01780800   0,00638600 (-64.1%)
100   1000   0,02085500   0,00787000 (-62.3%)
500   200    0,02737100   0,01828800 (-33.2%)
2500  40     0,05572600   0,06774200 (+21.6%)
8000  12     0,13101700   0,20311600 (+55.0%)

vavgsb:
rept  loop   master       patch
8     12500  0,03006000   0,00788600 (-73.8%)
25    4000   0,02882200   0,00637800 (-77.9%)
100   1000   0,02958000   0,00791400 (-73.2%)
500   200    0,03548800   0,01860400 (-47.6%)
2500  40     0,0636       0,06850800 (+7.7%)
8000  12     0,13816500   0,20550300 (+48.7%)

vavgsh:
rept  loop   master       patch
8     12500  0,01965900   0,00776600 (-60.5%)
25    4000   0,01875400   0,00638700 (-65.9%)
100   1000   0,01952200   0,00786900 (-59.7%)
500   200    0,02562000   0,01760300 (-31.3%)
2500  40     0,05384300   0,06742800 (+25.2%)
8000  12     0,13240800   0,2033 (+53.5%)

vavgsw:
rept  loop   master       patch
8     12500  0,01407700   0,00775600 (-44.9%)
25    4000   0,01762300   0,0064 (-63.7%)
100   1000   0,02046500   0,00788500 (-61.5%)
500   200    0,02745600   0,01843000 (-32.9%)
2500  40     0,05375500   0,06820500 (+26.9%)
8000  12     0,13068300   0,20304900 (+55.4%)

These results seem to indicate that with gvec translation is slower but
execution is faster.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  12 ++--
 target/ppc/insn32.decode|   9 +++
 target/ppc/int_helper.c |  32 -
 target/ppc/translate/vmx-impl.c.inc | 106 
 target/ppc/translate/vmx-ops.c.inc  |   9 +--
 5 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index a06193bc67..71c22efc2e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 #define dh_ctype_acc ppc_acc_t *
 #define dh_typecode_acc dh_typecode_ptr
 
-DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aa4968e6b9..38458c01de 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 011111   @VC
 VCMPSQ  000100 ... -- . . 0010101   @VX_bf
 VCMPUQ  000100 ... -- . . 

[PATCH v2 00/12] VMX/VSX instructions with gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Patches missing review: 3,5,9,11,12

v1 -> v2:
- Implemented instructions with fni4/fni8 and dropped the helper:
* VSUBCUW
* VADDCUW
* VPRTYBW
* VPRTYBD
- Reworked patch12 to only use gvec implementation with a few
  immediates.
- Used bitsel_vec on patch 9
- Changed vec variables to tcg_constant_vec when possible

This patch series moves some instructions from the legacy decoder to
decodetree and translates them with gvec. In some cases the gvec version
ended up bigger, more complex and slower, so those instructions were
only moved to decodetree.

In each patch there's a comparison of the execution time before the
patch being applied and after. Said result is the sum of 10 executions.

The program used to time the execution worked like this:

clock_t start = clock();
for (int i = 0; i < LOOP; i++) {
    asm (
        load values in registers, between 2 and 3 instructions
        ".rept REPT\n\t"
        "INSTRUCTION registers\n\t"
        ".endr\n\t"
        save result from register, 1 instruction
    );
}
clock_t end = clock();
printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n",
       ((double)(end - start)) / CLOCKS_PER_SEC);

Where the rept column is the value used in .rept in the inline assembly
and the loop column is the value used in the for loop. All of those tests
were executed on a Power9. When comparing the TCGop the data used was
gathered using '-d op' and '-d op_opt'.
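
To make that concrete, a hypothetical instantiation of the harness for
vaddcuw could look like the following (register choices, constraints and
clobbers are my own guesses and may need adjusting for a given toolchain):

#include <stdio.h>
#include <time.h>

#define STR_(x) #x
#define STR(x)  STR_(x)
#define REPT 100
#define LOOP 1000

int main(void)
{
    unsigned int buf[4] __attribute__((aligned(16)));
    clock_t start = clock();
    for (int i = 0; i < LOOP; i++) {
        asm volatile(
            "vspltisw 0, 1\n\t"        /* load test values into v0/v1 */
            "vspltisw 1, -1\n\t"
            ".rept " STR(REPT) "\n\t"
            "vaddcuw 2, 0, 1\n\t"      /* instruction under test */
            ".endr\n\t"
            "stvx 2, 0, %0\n\t"        /* save result from v2 */
            :: "r"(buf) : "v0", "v1", "v2", "memory");
    }
    clock_t end = clock();
    printf("vaddcuw rept=" STR(REPT) " loop=" STR(LOOP) ", time taken: %.12lf\n",
           ((double)(end - start)) / CLOCKS_PER_SEC);
    return 0;
}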

Lucas Mateus Castro (alqotel) (12):
  target/ppc: Moved VMLADDUHM to decodetree and use gvec
  target/ppc: Move VMH[R]ADDSHS instruction to decodetree
  target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
  target/ppc: Move VNEG[WD] to decodetree and use gvec
  target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
  target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec
  target/ppc: Move VABSDU[BHW] to decodetree and use gvec
  target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
  target/ppc: Use gvec to decode XVCPSGN[SD]P
  target/ppc: Moved XVTSTDC[DS]P to decodetree
  target/ppc: Moved XSTSTDC[QDS]P to decodetree
  target/ppc: Use gvec to decode XVTSTDC[DS]P

 target/ppc/fpu_helper.c | 140 +-
 target/ppc/helper.h |  42 ++-
 target/ppc/insn32.decode|  50 
 target/ppc/int_helper.c | 107 ++--
 target/ppc/translate.c  |   1 -
 target/ppc/translate/vmx-impl.c.inc | 364 +
 target/ppc/translate/vmx-ops.c.inc  |  15 +-
 target/ppc/translate/vsx-impl.c.inc | 394 +++-
 target/ppc/translate/vsx-ops.c.inc  |  21 --
 9 files changed, 808 insertions(+), 326 deletions(-)

-- 
2.37.3




[PATCH v2 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VNEGW and VNEGD to decodetree and used gvec to
translate them.

vnegw:
rept  loop   master       patch
8     12500  0,01053200   0,00548400 (-47.9%)
25    4000   0,01030500   0,0039 (-62.2%)
100   1000   0,01096300   0,00395400 (-63.9%)
500   200    0,01472000   0,00712300 (-51.6%)
2500  40     0,03809000   0,02147700 (-43.6%)
8000  12     0,09957100   0,06202100 (-37.7%)

vnegd:
rept  loop   master       patch
8     12500  0,00594600   0,00543800 (-8.5%)
25    4000   0,00575200   0,00396400 (-31.1%)
100   1000   0,00676100   0,00394800 (-41.6%)
500   200    0,01149300   0,00709400 (-38.3%)
2500  40     0,03441500   0,02169600 (-37.0%)
8000  12     0,09516900   0,06337000 (-33.4%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 --
 target/ppc/insn32.decode|  3 +++
 target/ppc/int_helper.c | 12 
 target/ppc/translate/vmx-impl.c.inc | 15 +--
 target/ppc/translate/vmx-ops.c.inc  |  2 --
 5 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f7047ed2aa..b2e910b089 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -229,8 +229,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr)
-DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aebc7b73c8..2658dd3395 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010
@VX_tb
 VEXTSW2D000100 . 11010 . 1100010@VX_tb
 VEXTSD2Q000100 . 11011 . 1100010@VX_tb
 
+VNEGD   000100 . 00111 . 1100010@VX_tb
+VNEGW   000100 . 00110 . 1100010@VX_tb
+
 ## Vector Mask Manipulation Instructions
 
 MTVSRBM 000100 . 1 . 1100110@VX_tb
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index f8dd12e8ae..c7fd0d1faa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1928,18 +1928,6 @@ XXBLEND(W, 32)
 XXBLEND(D, 64)
 #undef XXBLEND
 
-#define VNEG(name, element) \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *b)  \
-{   \
-int i;  \
-for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
-r->element[i] = -b->element[i]; \
-}   \
-}
-VNEG(vnegw, s32)
-VNEG(vnegd, s64)
-#undef VNEG
-
 void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int sh = (b->VsrB(0xf) >> 3) & 0xf;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index f52485a5f1..b9a9e83ab3 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
-GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
-GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
+
+static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_VECTOR(ctx);
+
+tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb),
+ 16, 16);
+return true;
+}
+
+TRANS(VNEGW, do_vneg, MO_32)
+TRANS(VNEGD, do_vneg, MO_64)
 
 static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s)
 {
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index ded0234123..27908533dd 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10),
 GEN_VXFORM(vspltisb, 6, 12),
 GEN_VXFORM(vspltish, 6, 13),
 GEN_VXFORM(vspltisw, 6, 14),
-GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06),
-GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07),
 GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C),
 GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D),
 GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E),
-- 
2.37.3




[PATCH v2 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate
them.

xvcpsgnsp:
rept  loop   master       patch
8     12500  0,00561400   0,00537900 (-4.2%)
25    4000   0,00562100   0,0040 (-28.8%)
100   1000   0,00696900   0,00416300 (-40.3%)
500   200    0,02211900   0,00840700 (-62.0%)
2500  40     0,09328600   0,02728300 (-70.8%)
8000  12     0,27295300   0,06867800 (-74.8%)

xvcpsgndp:
rept  loop   master       patch
8     12500  0,00556300   0,00584200 (+5.0%)
25    4000   0,00482700   0,00431700 (-10.6%)
100   1000   0,00585800   0,00464400 (-20.7%)
500   200    0,01565300   0,00839700 (-46.4%)
2500  40     0,05766500   0,02430600 (-57.8%)
8000  12     0,19875300   0,07947100 (-60.0%)

Like the previous instructions, there seemed to be an improvement in
translation time.
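
Given the v1 -> v2 note about using bitsel_vec for this patch, one way to
express XVCPSGNDP at the gvec level is sketched below (my sketch, not
necessarily the patch's exact code; it reuses vsr_full_offset() and the
REQUIRE_* checks already present in vsx-impl.c.inc):

static bool trans_XVCPSGNDP(DisasContext *ctx, arg_XX3 *a)
{
    REQUIRE_INSNS_FLAGS2(ctx, VSX);
    REQUIRE_VSX(ctx);
    /* xt = (xa & sign_mask) | (xb & ~sign_mask), using xt as selector */
    tcg_gen_gvec_dup_imm(MO_64, vsr_full_offset(a->xt), 16, 16, SGN_MASK_DP);
    tcg_gen_gvec_bitsel(MO_64, vsr_full_offset(a->xt), vsr_full_offset(a->xt),
                        vsr_full_offset(a->xa), vsr_full_offset(a->xb), 16, 16);
    return true;
}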

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|   2 +
 target/ppc/translate/vsx-impl.c.inc | 109 ++--
 target/ppc/translate/vsx-ops.c.inc  |   3 -
 3 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 5b687078be..6549c4040e 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 ..   @XX2
 XVNABSSP00 . 0 . 110101001 ..   @XX2
 XVNEGDP 00 . 0 . 11001 ..   @XX2
 XVNEGSP 00 . 0 . 110111001 ..   @XX2
+XVCPSGNDP   00 . . .  ...   @XX3
+XVCPSGNSP   00 . . . 1101 ...   @XX3
 
 ## VSX Scalar Multiply-Add Instructions
 
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 3f9af811dc..4f17da514c 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP)
 
-#define VSX_VECTOR_MOVE(name, op, sgn_mask)  \
-static void glue(gen_, name)(DisasContext *ctx)  \
-{\
-TCGv_i64 xbh, xbl, sgm;  \
-if (unlikely(!ctx->vsx_enabled)) {   \
-gen_exception(ctx, POWERPC_EXCP_VSXU);   \
-return;  \
-}\
-xbh = tcg_temp_new_i64();\
-xbl = tcg_temp_new_i64();\
-sgm = tcg_temp_new_i64();\
-get_cpu_vsr(xbh, xB(ctx->opcode), true); \
-get_cpu_vsr(xbl, xB(ctx->opcode), false);\
-tcg_gen_movi_i64(sgm, sgn_mask); \
-switch (op) {\
-case OP_ABS: {   \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-break;   \
-}\
-case OP_NABS: {  \
-tcg_gen_or_i64(xbh, xbh, sgm);   \
-tcg_gen_or_i64(xbl, xbl, sgm);   \
-break;   \
-}\
-case OP_NEG: {   \
-tcg_gen_xor_i64(xbh, xbh, sgm);  \
-tcg_gen_xor_i64(xbl, xbl, sgm);  \
-break;   \
-}\
-case OP_CPSGN: { \
-TCGv_i64 xah = tcg_temp_new_i64();   \
-TCGv_i64 xal = tcg_temp_new_i64();   \
-get_cpu_vsr(xah, xA(ctx->opcode), true); \
-get_cpu_vsr(xal, xA(ctx->opcode), false);\
-tcg_gen_and_i64(xah, xah, sgm);  \
-tcg_gen_and_i64(xal, xal, sgm);  \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-tcg_gen_or_i64(xbh, xbh, xah);   \
-tcg

[PATCH v2 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVABSSP, XVABSDP, XVNABSSP, XVNABSDP, XVNEGSP and XVNEGDP to
decodetree and used gvec to translate them.

xvabssp:
rept  loop   master       patch
8     12500  0,00477900   0,00476000 (-0.4%)
25    4000   0,00442800   0,00353300 (-20.2%)
100   1000   0,00478700   0,00366100 (-23.5%)
500   200    0,00973200   0,00649400 (-33.3%)
2500  40     0,03165200   0,02226700 (-29.7%)
8000  12     0,09315900   0,06674900 (-28.3%)

xvabsdp:
rept  loop   master       patch
8     12500  0,00475000   0,00474400 (-0.1%)
25    4000   0,00355600   0,00367500 (+3.3%)
100   1000   0,00444200   0,00366000 (-17.6%)
500   200    0,00942700   0,00732400 (-22.3%)
2500  40     0,0299       0,02308500 (-22.8%)
8000  12     0,08770300   0,06683800 (-23.8%)

xvnabssp:
rept  loop   master       patch
8     12500  0,00494500   0,00492900 (-0.3%)
25    4000   0,00397700   0,00338600 (-14.9%)
100   1000   0,00421400   0,00353500 (-16.1%)
500   200    0,01048000   0,00707100 (-32.5%)
2500  40     0,03251500   0,02238300 (-31.2%)
8000  12     0,08889100   0,06469800 (-27.2%)

xvnabsdp:
rept  loop   master       patch
8     12500  0,00511000   0,00492700 (-3.6%)
25    4000   0,00398800   0,00381500 (-4.3%)
100   1000   0,00390500   0,00365900 (-6.3%)
500   200    0,00924800   0,00784600 (-15.2%)
2500  40     0,03138900   0,02391600 (-23.8%)
8000  12     0,09654200   0,05684600 (-41.1%)

xvnegsp:
rept  loop   master       patch
8     12500  0,00493900   0,00452800 (-8.3%)
25    4000   0,00369100   0,00366800 (-0.6%)
100   1000   0,00371100   0,0038 (+2.4%)
500   200    0,00991100   0,00652300 (-34.2%)
2500  40     0,03025800   0,02422300 (-19.9%)
8000  12     0,09251100   0,06457600 (-30.2%)

xvnegdp:
rept  loop   master       patch
8     12500  0,00474900   0,00454400 (-4.3%)
25    4000   0,00353100   0,00325600 (-7.8%)
100   1000   0,00398600   0,00366800 (-8.0%)
500   200    0,01032300   0,00702400 (-32.0%)
2500  40     0,03125000   0,02422400 (-22.5%)
8000  12     0,09475100   0,06173000 (-34.9%)

This one seemed to me the opposite of the previous instructions, as it
looks like there was an improvement in the translation time (not a
surprise in itself, as the operations were previously done twice, so
twice as many TCGop had to be translated).
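
For the plain abs/nabs/neg forms the sign manipulation also maps onto a
single gvec immediate logical; a minimal sketch for XVABSDP, assuming the
vsr_full_offset()/REQUIRE_VSX() helpers from this file (this is not the
patch's callback-based code, which the hunk above truncates):

static bool do_xvabsdp(DisasContext *ctx, arg_XX2 *a)
{
    REQUIRE_VSX(ctx);
    /* clear the sign bit of each doubleword: xt = xb & ~SGN_MASK_DP */
    tcg_gen_gvec_andi(MO_64, vsr_full_offset(a->xt), vsr_full_offset(a->xb),
                      ~SGN_MASK_DP, 16, 16);
    return true;
}

XVNABSDP and XVNEGDP would use tcg_gen_gvec_ori() and tcg_gen_gvec_xori()
with SGN_MASK_DP in the same shape.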

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn32.decode|  9 
 target/ppc/translate/vsx-impl.c.inc | 73 ++---
 target/ppc/translate/vsx-ops.c.inc  |  6 ---
 3 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index ae151c4b62..5b687078be 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 .   
@X_TSX
 STXVRWX 01 . . . 0011001101 .   @X_TSX
 STXVRDX 01 . . . 0011101101 .   @X_TSX
 
+## VSX Vector Binary Floating-Point Sign Manipulation Instructions
+
+XVABSDP 00 . 0 . 111011001 ..   @XX2
+XVABSSP 00 . 0 . 110011001 ..   @XX2
+XVNABSDP00 . 0 . 01001 ..   @XX2
+XVNABSSP00 . 0 . 110101001 ..   @XX2
+XVNEGDP 00 . 0 . 11001 ..   @XX2
+XVNEGSP 00 . 0 . 110111001 ..   @XX2
+
 ## VSX Scalar Multiply-Add Instructions
 
 XSMADDADP   00 . . . 0011 . . . @XX3
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 7acdbceec4..3f9af811dc 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -782,15 +782,76 @@ static void glue(gen_, name)(DisasContext *ctx)   
   \
 tcg_temp_free_i64(sgm);  \
 }
 
-VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP)
 VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP)
 VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP)
 
+#define TCG_OP_IMM_i64(FUNC, OP, IMM)   \
+static void FUNC(TCGv_i64 t, TCGv_i64 b)\
+{   \
+OP(t, b, IMM);  \
+}
+
+TCG_OP_IMM_i64(do_xvabssp_i64, tcg_ge

[PATCH v2 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VPRTYBW and VPRTYBD to use gvec, and moved those two along with
VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and .fni8,
respectively.

vprtybw:
reptloopmaster patch
8   12500   0,00991200 0,00626300 (-36.8%)
25  40000,01040600 0,00550600 (-47.1%)
100 10000,01084500 0,00601100 (-44.6%)
500 200 0,01490600 0,01394100 (-6.5%)
250040  0,03285100 0,05143000 (+56.6%)
800012  0,08971500 0,14662500 (+63.4%)

vprtybd:
reptloopmaster patch
8   12500   0,00665800 0,00652800 (-2.0%)
25  40000,00589300 0,00670400 (+13.8%)
100 10000,00646800 0,00743900 (+15.0%)
500 200 0,01065800 0,01586400 (+48.8%)
250040  0,03497000 0,07180100 (+105.3%)
800012  0,09242200 0,21566600 (+133.3%)

vprtybq:
reptloopmaster patch
8   12500   0,00656200 0,00665800 (+1.5%)
25  40000,00620500 0,00644900 (+3.9%)
100 10000,00707500 0,00764900 (+8.1%)
500 200 0,01203500 0,01349500 (+12.1%)
250040  0,03505700 0,04123100 (+17.6%)
800012  0,09590600 0,11586700 (+20.8%)

I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
so I'm not sure it's worth moving those instructions. Comparing the
assembly of the helper with the TCGop they are pretty similar, so
I'm not sure why vprtybd took so much more time.
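
For reference, the xor-fold from the dropped helper maps onto an .fni8
expansion along these lines (a sketch based on the old helper; the hunk
below is truncated, so the name and exact shape are my assumptions):

static void gen_vprtybd_i64(TCGv_i64 t, TCGv_i64 b)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    /* fold the parity of the doubleword down into bit 0 */
    tcg_gen_shri_i64(tmp, b, 32);
    tcg_gen_xor_i64(t, b, tmp);
    tcg_gen_shri_i64(tmp, t, 16);
    tcg_gen_xor_i64(t, t, tmp);
    tcg_gen_shri_i64(tmp, t, 8);
    tcg_gen_xor_i64(t, t, tmp);
    tcg_gen_andi_i64(t, t, 1);
    tcg_temp_free_i64(tmp);
}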

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  4 +-
 target/ppc/insn32.decode|  4 ++
 target/ppc/int_helper.c | 25 +
 target/ppc/translate/vmx-impl.c.inc | 80 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b2e910b089..a06193bc67 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM  000100 . . . 1000100@VX
 VPDEPD  000100 . . . 10111001101@VX
 VPEXTD  000100 . . . 10110001101@VX
 
+VPRTYBD 000100 . 01001 . 1100010@VX_tb
+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX  000100 . . . . 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c7fd0d1faa..c6ce4665fa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-/* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-uint64_t res = b->u32[i] ^ (b->u32[i] >> 16);
-res ^= res >> 8;
-r->u32[i] = res & 1;
-}
-}
-
-/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-uint64_t res = b->u64[i] ^ (b->u64[i] >> 32);
-res ^= res >> 16;
-res ^= res >> 8;
-r->u64[i] = res & 1;
-}
-}
-
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 uint64_t res = b->u64[0] ^ b->u64[1];
 res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index b9a9e83ab3..23601942bc 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprt

[PATCH v2 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VADDCUW and VSUBCUW to decodetree with gvec using an
implementation based on the helper, with the main difference being
changing the -1 (aka all bits set to 1) result returned by cmp when
true to +1. It also implemented a .fni4 version of those instructions
and dropped the helper.
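
A possible .fni4 expansion following the dropped helper's "~a < b" carry
test (my sketch; the vmx-impl hunk below is truncated, so this is an
assumption rather than the patch's exact code):

static void gen_vaddcuw_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 tmp = tcg_temp_new_i32();

    /* the carry-out of a + b is 1 exactly when b > ~a */
    tcg_gen_not_i32(tmp, a);
    tcg_gen_setcond_i32(TCG_COND_LTU, t, tmp, b);
    tcg_temp_free_i32(tmp);
}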

vaddcuw:
rept  loop   master       patch
8     12500  0,01008200   0,00612400 (-39.3%)
25    4000   0,01091500   0,00471600 (-56.8%)
100   1000   0,01332500   0,00593700 (-55.4%)
500   200    0,01998500   0,01275700 (-36.2%)
2500  40     0,04704300   0,04364300 (-7.2%)
8000  12     0,10748200   0,11241000 (+4.6%)

vsubcuw:
rept  loop   master       patch
8     12500  0,01226200   0,00571600 (-53.4%)
25    4000   0,01493500   0,00462100 (-69.1%)
100   1000   0,01522700   0,00455100 (-70.1%)
500   200    0,02384600   0,01133500 (-52.5%)
2500  40     0,04935200   0,03178100 (-35.6%)
8000  12     0,09039900   0,09440600 (+4.4%)

Overall there was a gain in performance, but the TCGop code was still
slightly bigger in the new version (it went from 4 to 5).

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  2 -
 target/ppc/insn32.decode|  2 +
 target/ppc/int_helper.c | 18 -
 target/ppc/translate/vmx-impl.c.inc | 61 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 +-
 5 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f02a9497b7..f7047ed2aa 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,11 +193,9 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a509e84df..aebc7b73c8 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -608,12 +608,14 @@ VRLQNM  000100 . . . 00101000101
@VX
 
 ## Vector Integer Arithmetic Instructions
 
+VADDCUW 000100 . . . 0011000@VX
 VADDCUQ 000100 . . . 0010100@VX
 VADDUQM 000100 . . . 001@VX
 
 VADDEUQM000100 . . . . 00   @VA
 VADDECUQ000100 . . . . 01   @VA
 
+VSUBCUW 000100 . . . 1011000@VX
 VSUBCUQ 000100 . . . 1010100@VX
 VSUBUQM 000100 . . . 101@VX
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index ae1ba8084d..f8dd12e8ae 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,15 +492,6 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i;
-
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-r->u32[i] = ~a->u32[i] < b->u32[i];
-}
-}
-
 /* vprtybw */
 void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
 {
@@ -1962,15 +1953,6 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b)
 #endif
 }
 
-void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-int i;
-
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-r->u32[i] = a->u32[i] >= b->u32[i];
-}
-}
-
 void helper_vsumsws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int64_t t;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 3acd585a2f..f52485a5f1 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28);
 GEN_VXFORM(vslv, 2, 29);
 GEN_VXFORM(vslo, 6, 16);
 GEN_VXFORM(vsro, 6, 17);
-GEN_VXFORM(vaddcuw, 0, 6);
-GEN_VXFORM(vsubcuw, 0, 22);
 
 static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece,
void (*gen_gvec)(unsigned, uint32_t, uint32_t,
@@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx)
 }
 
 
-GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \
-xpnd04_1, PPC_NONE, P

[PATCH v2 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMHADDSHS and VMHRADDSHS to decodetree; I couldn't find
a satisfactory implementation with inline TCG.

vmhaddshs:
rept  loop   master       patch
8     12500  0,02983400   0,02648500 (-11.2%)
25    4000   0,02946000   0,02518000 (-14.5%)
100   1000   0,03104300   0,02638000 (-15.0%)
500   200    0,04002000   0,03502500 (-12.5%)
2500  40     0,08090100   0,07562200 (-6.5%)
8000  12     0,19242600   0,18626800 (-3.2%)

vmhraddshs:
rept  loop   master       patch
8     12500  0,03078600   0,02851000 (-7.4%)
25    4000   0,02793200   0,02746900 (-1.7%)
100   1000   0,02886000   0,02839900 (-1.6%)
500   200    0,03714700   0,03799200 (+2.3%)
2500  40     0,07948000   0,07852200 (-1.2%)
8000  12     0,19049800   0,18813900 (-1.2%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h | 4 ++--
 target/ppc/insn32.decode| 2 ++
 target/ppc/int_helper.c | 4 ++--
 target/ppc/translate/vmx-impl.c.inc | 5 +++--
 target/ppc/translate/vmx-ops.c.inc  | 1 -
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 9c562ab00e..f02a9497b7 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
-DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7445455a12..9a509e84df 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
 VMLADDUHM   000100 . . . . 100010   @VA
+VMHADDSHS   000100 . . . . 10   @VA
+VMHRADDSHS  000100 . . . . 11   @VA
 
 ## Vector String Instructions
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 0d25000b2a..ae1ba8084d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r)
 return count;
 }
 
-void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
   ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
@@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a,
 }
 }
 
-void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 9f18c6d4f2..3acd585a2f 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext 
*ctx)  \
 tcg_temp_free_ptr(rd);  \
 }
 
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
+GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
@@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a,
 TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS)
 TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS)
 
-GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
+TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS)
+TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index a3a0fd0650..7cd9d40e06 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15),
 
 #define GEN_VAFORM_PAIRED(name0, name1, opc2)   \
 GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC)
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16),
 GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23),
 
 GEN_VXFORM_DUAL(vclzb, vpopc

[PATCH v2 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMLADDUHM to decodetree and creates a gvec implementation
using mul_vec and add_vec.

rept  loop   master       patch
8     12500  0,01810500   0,00903100 (-50.1%)
25    4000   0,01739400   0,00747700 (-57.0%)
100   1000   0,01843600   0,00901400 (-51.1%)
500   200    0,02574600   0,01971000 (-23.4%)
2500  40     0,05921600   0,07121800 (+20.3%)
8000  12     0,15326700   0,21725200 (+41.7%)

The significant difference in performance when REPT is low and LOOP is
high is, I think, due to the new implementation having a higher
translation time: with a helper only 5 TCGop are used, but with the patch
a total of 10 TCGop are needed (Power lacks a direct mul_vec equivalent,
so this instruction is implemented with the help of 5 others: vmuleu,
vmulou, vmrgh, vmrgl and vpkum).
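
The gvec expansion presumably boils down to an .fniv callback along these
lines (a sketch; only the mul_vec/add_vec pairing is stated above, the
rest is my assumption):

static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a,
                              TCGv_vec b, TCGv_vec c)
{
    /* t = (a * b + c) mod 2^16, with vece == MO_16 */
    tcg_gen_mul_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, t, t, c);
}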

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 +-
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c |  3 +-
 target/ppc/translate.c  |  1 -
 target/ppc/translate/vmx-impl.c.inc | 48 ++---
 5 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57eee07256..9c562ab00e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, 
avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr)
-DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
+DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32)
 DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index a5249ee32c..7445455a12 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -693,6 +693,8 @@ VMSUMUHS000100 . . . . 100111   @VA
 VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
+VMLADDUHM   000100 . . . . 100010   @VA
+
 ## Vector String Instructions
 
 VSTRIBL 000100 . 0 . . 001101   @VX_tb_rc
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 696096100b..0d25000b2a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a,
 }
 }
 
-void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c,
+  uint32_t v)
 {
 int i;
 
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index e810842925..11f729c60c 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x0001, 
PPC_ALTIVEC),
 GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x0001, PPC_ALTIVEC),
 GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC),
 GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC),
-GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC),
 #if defined(TARGET_PPC64)
 GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE,
   PPC2_ISA300),
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index e644ad3236..9f18c6d4f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext 
*ctx)  \
 
 GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
 
-static void gen_vmladduhm(DisasContext *ctx)
-{
-TCGv_ptr ra, rb, rc, rd;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-ra = gen_avr_ptr(rA(ctx->opcode));
-rb = gen_avr_ptr(rB(ctx->opcode));
-rc = gen_avr_ptr(rC(ctx->opcode));
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_vmladduhm(rd, ra, rb, rc);
-tcg_temp_free_ptr(ra);
-tcg_temp_free_ptr(rb);
-tcg_temp_free_ptr(rc);
-tcg_temp_free_ptr(rd);
-}
-
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
 {
@@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, 
gen_helper_VSUBECUQ)
 TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM)
 TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR)
 
+static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec 
b,
+   

[PATCH 12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Used gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
rept  loop   patch10       patch12
8     12500  2,70288900    1,24050300 (-54.1%)
25    4000   2,65665700    1,14078900 (-57.1%)
100   1000   2,82795400    1,53337200 (-45.8%)
500   200    3,62225400    3,91718000 (+8.1%)
2500  40     6,45658000    12,60683700 (+95.3%)
8000  12     17,48091900   44,15384000 (+152.6%)

xvtstdcdp:
rept  loop   patch10       patch12
8     12500  1,56435900    1,24554800 (-20.4%)
25    4000   1,53789500    1,14177800 (-25.8%)
100   1000   1,67964600    1,5428 (-8.1%)
500   200    2,46777100    3,96816000 (+60.8%)
2500  40     5,21938900    12,79937800 (+145.2%)
8000  12     15,97600500   45,44233000 (+184.4%)

Overall these are the hardest instructions to measure performance for, as
the helper implementation is affected by the immediate. For example, in a
worst-case scenario (high REPT, LOOP = 1, immediate 127) it took 13x
longer with the gvec implementation, and in a best-case scenario (low
REPT, high LOOP, only 1 bit set in the immediate) the execution took
21.8% of the time with gvec (-78.2%).
The tests here are the sum of every possible immediate.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/translate/vsx-impl.c.inc | 73 -
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index c3c179723b..dc95e8fdf4 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -1121,16 +1121,85 @@ GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, 
PPC2_ISA300)
 GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300)
 GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX)
 
+static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm)
+{
+TCGv_vec match = tcg_const_ones_vec_matching(t);
+TCGv_vec temp;
+TCGv_vec mask;
+uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+uint64_t frc_msk = ~(exp_msk | sgn_msk);
+mask = tcg_constant_vec_matching(t, vece, 0);
+tcg_gen_mov_vec(t, mask);
+if (imm & (0x3 << 0)) {
+/* test if Denormal */
+temp = tcg_temp_new_vec_matching(t);
+mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+tcg_gen_and_vec(vece, t, b, mask);
+mask = tcg_constant_vec_matching(t, vece, frc_msk);
+tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask);
+mask = tcg_constant_vec_matching(t, vece, 0);
+tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask);
+
+tcg_gen_mov_vec(t, mask);
+mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+if (imm & (0x1)) {
+/* test if negative */
+tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t);
+}
+if (imm & (0x2)) {
+/* test if positive */
+tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t);
+}
+tcg_temp_free_vec(temp);
+}
+if (imm & (1 << 2)) {
+/* test if -0 */
+mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+}
+if (imm & (1 << 3)) {
+/* test if +0 */
+mask = tcg_constant_vec_matching(t, vece, 0);
+tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+}
+if (imm & (1 << 4)) {
+/* test if -Inf */
+mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk);
+tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+}
+if (imm & (1 << 5)) {
+/* test if +Inf */
+mask = tcg_constant_vec_matching(t, vece, exp_msk);
+tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+}
+if (imm & (1 << 6)) {
+/* test if NaN */
+mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+tcg_gen_and_vec(vece, b, b, mask);
+mask = tcg_constant_vec_matching(t, vece, exp_msk);
+tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t);
+}
+tcg_temp_free_vec(match);
+}
+
 static bool do_xvtstdc(DisasContext *ctx, arg_XX2_uim *a, unsigned vece)
 {
+static const TCGOpcode vecop_list[] = {
+INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+};
 static const GVecGen2i op[] = {
 {
 .fnoi = gen_helper_XVTSTDCSP,
-.vece = MO_32
+.fniv = do_xvtstdc_vec,
+.vece = MO_32,
+.opt_opc = vecop_list
 },
 {
 .fnoi = gen_helper_XVTSTDCDP,
-.vece = MO_64
+.fniv = 

[PATCH 11/12] target/ppc: Moved XSTSTDC[QDS]P to decodetree

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XSTSTDCSP, XSTSTDCDP and XSTSTDCQP to decodetree and moved some of
their decoding out of the helpers: previously DCMX, XB and BF were
calculated in the helper with the help of cpu_env; that part is now done
in decodetree along with the rest of the decoding.

xststdcsp:
rept  loop   master        patch
8     12500  1,85393600    1,94683600 (+5.0%)
25    4000   1,78779800    1,92479000 (+7.7%)
100   1000   2,12775000    2,28895500 (+7.6%)
500   200    2,99655300    3,23102900 (+7.8%)
2500  40     6,89082200    7,44827500 (+8.1%)
8000  12     17,50585500   18,95152100 (+8.3%)

xststdcdp:
rept  loop   master        patch
8     12500  1,39043100    1,33539800 (-4.0%)
25    4000   1,35731800    1,37347800 (+1.2%)
100   1000   1,51514800    1,56053000 (+3.0%)
500   200    2,21014400    2,47906000 (+12.2%)
2500  40     5,39488200    6,68766700 (+24.0%)
8000  12     13,98623900   18,17661900 (+30.0%)

xststdcqp:
rept  loop   master        patch
8     12500  1,35123800    1,34455800 (-0.5%)
25    4000   1,36441200    1,36759600 (+0.2%)
100   1000   1,49763500    1,54138400 (+2.9%)
500   200    2,19020200    2,46196400 (+12.4%)
2500  40     5,39265700    6,68147900 (+23.9%)
8000  12     14,04163600   18,19669600 (+29.6%)

As some values are now decoded outside the helper and passed to it as
arguments, the number of helper arguments increased, and so did the
number of TCGop needed to load those arguments. I suspect that is why
there is a slow-down in the tests with a high REPT but low LOOP.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/fpu_helper.c | 114 +---
 target/ppc/helper.h |   6 +-
 target/ppc/insn32.decode|   6 ++
 target/ppc/translate/vsx-impl.c.inc |  20 -
 target/ppc/translate/vsx-ops.c.inc  |   4 -
 5 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 35ca03b10b..b385f24908 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3241,63 +3241,6 @@ void helper_XVXSIGSP(ppc_vsr_t *xt, ppc_vsr_t *xb)
 *xt = t;
 }
 
-/*
- * VSX_TEST_DC - VSX floating point test data class
- *   op- instruction mnemonic
- *   nels  - number of elements (1, 2 or 4)
- *   xbn   - VSR register number
- *   tp- type (float32 or float64)
- *   fld   - vsr_t field (VsrD(*) or VsrW(*))
- *   tfld   - target vsr_t field (VsrD(*) or VsrW(*))
- *   fld_max - target field max
- *   scrf - set result in CR and FPCC
- */
-#define VSX_TEST_DC(op, nels, xbn, tp, fld, tfld, fld_max, scrf)  \
-void helper_##op(CPUPPCState *env, uint32_t opcode) \
-{   \
-ppc_vsr_t *xt = >vsr[xT(opcode)];  \
-ppc_vsr_t *xb = >vsr[xbn]; \
-ppc_vsr_t t = { };  \
-uint32_t i, sign, dcmx; \
-uint32_t cc, match = 0; \
-\
-if (!scrf) {\
-dcmx = DCMX_XV(opcode); \
-} else {\
-t = *xt;\
-dcmx = DCMX(opcode);\
-}   \
-\
-for (i = 0; i < nels; i++) {\
-sign = tp##_is_neg(xb->fld);\
-if (tp##_is_any_nan(xb->fld)) { \
-match = extract32(dcmx, 6, 1);  \
-} else if (tp##_is_infinity(xb->fld)) { \
-match = extract32(dcmx, 4 + !sign, 1);  \
-} else if (tp##_is_zero(xb->fld)) { \
-match = extract32(dcmx, 2 + !sign, 1);  \
-} else if (tp##_is_zero_or_denormal(xb->fld)) { \
-match = extract32(dcmx, 0 + !sign, 1);  \
-}   \
-\
-if (scrf) { \
-cc = sign << CRF_LT_BIT | match << CRF_EQ_BIT;  \
-env->fpscr &= ~FP_FPCC; \
-env->fpscr |= cc << FPSCR_FPCC; \
-env->crf[BF(opcode)] = cc;  \
-} else {\
-t.tfld = match ? fld_max : 0;   \
-}  

[PATCH 10/12] target/ppc: Moved XVTSTDC[DS]P to decodetree

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVTSTDCSP and XVTSTDCDP to decodetree and restructured the helper
to be simpler and do all decoding in the decodetree (so XB, XT and DCMX
are all calculated outside the helper).

Obs: The tests for this one are slightly different: the reported time is
the sum over all possible immediates, with each instruction repeated 10
times.

xvtstdcsp:
rept  loop   master        patch
8     12500  2,76402100    2,70699100 (-2.1%)
25    4000   2,64867100    2,67884100 (+1.1%)
100   1000   2,73806300    2,78701000 (+1.8%)
500   200    3,44666500    3,61027600 (+4.7%)
2500  40     5,85790200    6,47475500 (+10.5%)
8000  12     15,22102100   17,46062900 (+14.7%)

xvtstdcdp:
rept  loop   master        patch
8     12500  2,11818000    1,61065300 (-24.0%)
25    4000   2,04573400    1,60132200 (-21.7%)
100   1000   2,13834100    1,69988100 (-20.5%)
500   200    2,73977000    2,48631700 (-9.3%)
2500  40     5,05067000    5,25914100 (+4.1%)
8000  12     14,60507800   15,93704900 (+9.1%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/fpu_helper.c | 39 +++--
 target/ppc/helper.h |  4 +--
 target/ppc/insn32.decode|  5 
 target/ppc/translate/vsx-impl.c.inc | 28 +++--
 target/ppc/translate/vsx-ops.c.inc  |  8 --
 5 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index ae25f32d6e..35ca03b10b 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3295,11 +3295,46 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) 
\
 }   \
 }
 
-VSX_TEST_DC(xvtstdcdp, 2, xB(opcode), float64, VsrD(i), VsrD(i), UINT64_MAX, 0)
-VSX_TEST_DC(xvtstdcsp, 4, xB(opcode), float32, VsrW(i), VsrW(i), UINT32_MAX, 0)
 VSX_TEST_DC(xststdcdp, 1, xB(opcode), float64, VsrD(0), VsrD(0), 0, 1)
 VSX_TEST_DC(xststdcqp, 1, (rB(opcode) + 32), float128, f128, VsrD(0), 0, 1)
 
+#define VSX_TSTDC(tp)   \
+static int32_t tp##_tstdc(tp b, uint32_t dcmx)  \
+{   \
+uint32_t match = 0; \
+uint32_t sign = tp##_is_neg(b); \
+if (tp##_is_any_nan(b)) {   \
+match = extract32(dcmx, 6, 1);  \
+} else if (tp##_is_infinity(b)) {   \
+match = extract32(dcmx, 4 + !sign, 1);  \
+} else if (tp##_is_zero(b)) {   \
+match = extract32(dcmx, 2 + !sign, 1);  \
+} else if (tp##_is_zero_or_denormal(b)) {   \
+match = extract32(dcmx, 0 + !sign, 1);  \
+}   \
+return (match != 0) ? 1 : 0;\
+}
+
+VSX_TSTDC(float32)
+VSX_TSTDC(float64)
+#undef VSX_TSTDC
+
+void helper_XVTSTDCDP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 2; i++) {
+t->s64[i] = (int64_t)-float64_tstdc(b->f64[i], dcmx);
+}
+}
+
+void helper_XVTSTDCSP(ppc_vsr_t *t, ppc_vsr_t *b, uint64_t dcmx, uint32_t v)
+{
+int i;
+for (i = 0; i < 4; i++) {
+t->s32[i] = (int32_t)-float32_tstdc(b->f32[i], dcmx);
+}
+}
+
 void helper_xststdcsp(CPUPPCState *env, uint32_t opcode, ppc_vsr_t *xb)
 {
 uint32_t dcmx, sign, exp;
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 1f470a0e5e..d3e3324c73 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -521,8 +521,8 @@ DEF_HELPER_3(xvcvsxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxdsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvsxwsp, void, env, vsr, vsr)
 DEF_HELPER_3(xvcvuxwsp, void, env, vsr, vsr)
-DEF_HELPER_2(xvtstdcsp, void, env, i32)
-DEF_HELPER_2(xvtstdcdp, void, env, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCSP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
+DEF_HELPER_FLAGS_4(XVTSTDCDP, TCG_CALL_NO_RWG, void, vsr, vsr, i64, i32)
 DEF_HELPER_3(xvrspi, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspic, void, env, vsr, vsr)
 DEF_HELPER_3(xvrspim, void, env, vsr, vsr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6549c4040e..c0a531be5c 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -199,6 +199,9 @@
 
 @XX2_uim4   .. . . uim:4 . . .. _uim 
xt=%xx_xt xb=%xx_xb
 
+%xx_uim76:1 2:1 16:5
+@XX2_uim7   .. . . .  . ... . .._uim 
xt=%xx_xt xb=%xx_xb uim=%xx_uim7
+
 _bf_xb  bf xb
 @XX2_bf_xb  .. bf:3 .. . . . . ._bf_xb 
xb=%xx_xb
 
@@ -848,6 +851,8 @@ XSCVSPDPN   00 . - . 101001

[PATCH 06/12] target/ppc: Move VAVG[SU][BHW] to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VAVGUB, VAVGUH, VAVGUW, VAVGSB, VAVGSH and VAVGSW
to decodetree and used gvec with them. For these, the right shift has to
be done before the sum to avoid an overflow, so 1 is added at the end if
either operand had its LSB set, replicating the "+ 1" before the shift
described by the ISA.
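
A minimal standalone sketch of that trick (reference semantics only, not
the actual TCG code; it assumes arithmetic right shift for signed values,
which GCC guarantees):

#include <assert.h>
#include <stdint.h>

/* Unsigned average (a + b + 1) >> 1 without needing a wider type:
 * shift first, then add 1 if either operand had its LSB set. */
static uint32_t avgu32(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + ((a | b) & 1);
}

/* Same idea for the signed variants, using arithmetic shifts. */
static int32_t avgs32(int32_t a, int32_t b)
{
    return (a >> 1) + (b >> 1) + ((a | b) & 1);
}

int main(void)
{
    assert(avgu32(1, 2) == 2);                    /* (1 + 2 + 1) >> 1 */
    assert(avgu32(UINT32_MAX, UINT32_MAX) == UINT32_MAX);
    assert(avgs32(-3, -3) == -3);                 /* rounds like the ISA */
    return 0;
}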

vavgub:
rept    loop    master        patch
8       12500   0,02616600    0,00754200 (-71.2%)
25      4000    0,0253        0,00637700 (-74.8%)
100     1000    0,02604600    0,00790100 (-69.7%)
500     200     0,03189300    0,01838400 (-42.4%)
2500    40      0,06006900    0,06851000 (+14.1%)
8000    12      0,13941000    0,20548500 (+47.4%)

vavguh:
rept    loop    master        patch
8       12500   0,01818200    0,00780600 (-57.1%)
25      4000    0,01789300    0,00641600 (-64.1%)
100     1000    0,01899100    0,00787200 (-58.5%)
500     200     0,02527200    0,01828400 (-27.7%)
2500    40      0,05361800    0,06773000 (+26.3%)
8000    12      0,12886600    0,20291400 (+57.5%)

vavguw:
rept    loop    master        patch
8       12500   0,01423100    0,00776600 (-45.4%)
25      4000    0,01780800    0,00638600 (-64.1%)
100     1000    0,02085500    0,00787000 (-62.3%)
500     200     0,02737100    0,01828800 (-33.2%)
2500    40      0,05572600    0,06774200 (+21.6%)
8000    12      0,13101700    0,20311600 (+55.0%)

vavgsb:
rept    loop    master        patch
8       12500   0,03006000    0,00788600 (-73.8%)
25      4000    0,02882200    0,00637800 (-77.9%)
100     1000    0,02958000    0,00791400 (-73.2%)
500     200     0,03548800    0,01860400 (-47.6%)
2500    40      0,0636        0,06850800 (+7.7%)
8000    12      0,13816500    0,20550300 (+48.7%)

vavgsh:
rept    loop    master        patch
8       12500   0,01965900    0,00776600 (-60.5%)
25      4000    0,01875400    0,00638700 (-65.9%)
100     1000    0,01952200    0,00786900 (-59.7%)
500     200     0,02562000    0,01760300 (-31.3%)
2500    40      0,05384300    0,06742800 (+25.2%)
8000    12      0,13240800    0,2033 (+53.5%)

vavgsw:
rept    loop    master        patch
8       12500   0,01407700    0,00775600 (-44.9%)
25      4000    0,01762300    0,0064 (-63.7%)
100     1000    0,02046500    0,00788500 (-61.5%)
500     200     0,02745600    0,01843000 (-32.9%)
2500    40      0,05375500    0,06820500 (+26.9%)
8000    12      0,13068300    0,20304900 (+55.4%)

These results seem to indicate that the gvec version has a slower
translation but a faster execution.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  12 +--
 target/ppc/insn32.decode|   9 +++
 target/ppc/int_helper.c |  32 
 target/ppc/translate/vmx-impl.c.inc | 109 +---
 target/ppc/translate/vmx-ops.c.inc  |   9 +--
 5 files changed, 130 insertions(+), 41 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 6a43e32ad3..f88d9d3996 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -143,15 +143,15 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 #define dh_ctype_acc ppc_acc_t *
 #define dh_typecode_acc dh_typecode_ptr
 
-DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsb, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vavgsw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_4(vcmpeqfp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgefp, void, env, avr, avr, avr)
 DEF_HELPER_4(vcmpgtfp, void, env, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aa4968e6b9..38458c01de 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -519,6 +519,15 @@ VCMPNEZW000100 . . . . 011111   @VC
 VCMPSQ  000100 ... -- . . 0010101   @VX_bf
 VCMPUQ  000100 ... -- . . 0010001   @VX_bf
 
+## Ve

[PATCH 08/12] target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVABSSP, XVABSDP, XVNABSSP, XVNABSDP, XVNEGSP and XVNEGDP to
decodetree and used gvec to translate them.

xvabssp:
rept    loop    master        patch
8       12500   0,00477900    0,00476000 (-0.4%)
25      4000    0,00442800    0,00353300 (-20.2%)
100     1000    0,00478700    0,00366100 (-23.5%)
500     200     0,00973200    0,00649400 (-33.3%)
2500    40      0,03165200    0,02226700 (-29.7%)
8000    12      0,09315900    0,06674900 (-28.3%)

xvabsdp:
rept    loop    master        patch
8       12500   0,00475000    0,00474400 (-0.1%)
25      4000    0,00355600    0,00367500 (+3.3%)
100     1000    0,00444200    0,00366000 (-17.6%)
500     200     0,00942700    0,00732400 (-22.3%)
2500    40      0,0299        0,02308500 (-22.8%)
8000    12      0,08770300    0,06683800 (-23.8%)

xvnabssp:
rept    loop    master        patch
8       12500   0,00494500    0,00492900 (-0.3%)
25      4000    0,00397700    0,00338600 (-14.9%)
100     1000    0,00421400    0,00353500 (-16.1%)
500     200     0,01048000    0,00707100 (-32.5%)
2500    40      0,03251500    0,02238300 (-31.2%)
8000    12      0,08889100    0,06469800 (-27.2%)

xvnabsdp:
rept    loop    master        patch
8       12500   0,00511000    0,00492700 (-3.6%)
25      4000    0,00398800    0,00381500 (-4.3%)
100     1000    0,00390500    0,00365900 (-6.3%)
500     200     0,00924800    0,00784600 (-15.2%)
2500    40      0,03138900    0,02391600 (-23.8%)
8000    12      0,09654200    0,05684600 (-41.1%)

xvnegsp:
rept    loop    master        patch
8       12500   0,00493900    0,00452800 (-8.3%)
25      4000    0,00369100    0,00366800 (-0.6%)
100     1000    0,00371100    0,0038 (+2.4%)
500     200     0,00991100    0,00652300 (-34.2%)
2500    40      0,03025800    0,02422300 (-19.9%)
8000    12      0,09251100    0,06457600 (-30.2%)

xvnegdp:
rept    loop    master        patch
8       12500   0,00474900    0,00454400 (-4.3%)
25      4000    0,00353100    0,00325600 (-7.8%)
100     1000    0,00398600    0,00366800 (-8.0%)
500     200     0,01032300    0,00702400 (-32.0%)
2500    40      0,03125000    0,02422400 (-22.5%)
8000    12      0,09475100    0,06173000 (-34.9%)

This one seemed to be the opposite of the previous instructions: it
looks like there was an improvement in the translation time as well
(not a surprise, as the operations were done twice before, so twice as
many TCG ops had to be translated).
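
All six instructions boil down to masking the sign bit of each element;
a small reference sketch of the double-precision case (matching the
tcg_gen_andi_i64 use visible in the diff below; the nabs/neg cases
presumably use ori/xori in the same way):

#include <assert.h>
#include <stdint.h>

#define SGN_MASK_DP 0x8000000000000000ull

static uint64_t xvabsdp_elem(uint64_t b)  { return b & ~SGN_MASK_DP; } /* clear sign */
static uint64_t xvnabsdp_elem(uint64_t b) { return b |  SGN_MASK_DP; } /* set sign   */
static uint64_t xvnegdp_elem(uint64_t b)  { return b ^  SGN_MASK_DP; } /* flip sign  */

int main(void)
{
    uint64_t minus_one = 0xBFF0000000000000ull;                 /* -1.0 */
    assert(xvabsdp_elem(minus_one)  == 0x3FF0000000000000ull);  /*  1.0 */
    assert(xvnabsdp_elem(minus_one) == 0xBFF0000000000000ull);  /* -1.0 */
    assert(xvnegdp_elem(minus_one)  == 0x3FF0000000000000ull);  /*  1.0 */
    return 0;
}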

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|  9 
 target/ppc/translate/vsx-impl.c.inc | 76 ++---
 target/ppc/translate/vsx-ops.c.inc  |  6 ---
 3 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index ae151c4b62..5b687078be 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -754,6 +754,15 @@ STXVRHX 01 . . . 0010101101 .   
@X_TSX
 STXVRWX 01 . . . 0011001101 .   @X_TSX
 STXVRDX 01 . . . 0011101101 .   @X_TSX
 
+## VSX Vector Binary Floating-Point Sign Manipulation Instructions
+
+XVABSDP 00 . 0 . 111011001 ..   @XX2
+XVABSSP 00 . 0 . 110011001 ..   @XX2
+XVNABSDP00 . 0 . 01001 ..   @XX2
+XVNABSSP00 . 0 . 110101001 ..   @XX2
+XVNEGDP 00 . 0 . 11001 ..   @XX2
+XVNEGSP 00 . 0 . 110111001 ..   @XX2
+
 ## VSX Scalar Multiply-Add Instructions
 
 XSMADDADP   00 . . . 0011 . . . @XX3
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 7acdbceec4..426a9a3926 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -782,15 +782,79 @@ static void glue(gen_, name)(DisasContext *ctx)   
   \
 tcg_temp_free_i64(sgm);  \
 }
 
-VSX_VECTOR_MOVE(xvabsdp, OP_ABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnabsdp, OP_NABS, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvnegdp, OP_NEG, SGN_MASK_DP)
 VSX_VECTOR_MOVE(xvcpsgndp, OP_CPSGN, SGN_MASK_DP)
-VSX_VECTOR_MOVE(xvabssp, OP_ABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnabssp, OP_NABS, SGN_MASK_SP)
-VSX_VECTOR_MOVE(xvnegsp, OP_NEG, SGN_MASK_SP)
 VSX_VECTOR_MOVE(xvcpsgnsp, OP_CPSGN, SGN_MASK_SP)
 
+#define TCG_OP_IMM_i64(FUNC, OP, IMM)   \
+static void FUNC(TCGv_i64 t, TCGv_i64 b)\
+{   \
+OP(t, b, IMM);  \
+}
+
+TCG_OP_IMM_i64(do_xvabssp_i64, tcg_gen_andi_i64, ~SGN_MASK_SP)
+TCG_

[PATCH 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VPRTYBW and VPRTYBD to use gvec, and moved both of them and
VPRTYBQ to decodetree.

vprtybw:
rept    loop    master        patch
8       12500   0,01215900    0,00705600 (-42.0%)
25      4000    0,01198700    0,00574400 (-52.1%)
100     1000    0,01307800    0,00692200 (-47.1%)
500     200     0,01794800    0,01558800 (-13.1%)
2500    40      0,04028200    0,05400800 (+34.1%)
8000    12      0,10127300    0,16744700 (+65.3%)

vprtybd:
rept    loop    master        patch
8       12500   0,00757400    0,00791600 (+4.5%)
25      4000    0,00651300    0,00673700 (+3.4%)
100     1000    0,00713400    0,00837700 (+17.4%)
500     200     0,01195400    0,01937400 (+62.1%)
2500    40      0,03478600    0,07005500 (+101.4%)
8000    12      0,09539600    0,21013500 (+120.3%)

vprtybq:
rept    loop    master        patch
8       12500   0,00065540    0,00066440 (+1.4%)
25      4000    0,00057720    0,00059850 (+3.7%)
100     1000    0,00066400    0,00069360 (+4.5%)
500     200     0,00115170    0,00127360 (+10.6%)
2500    40      0,00341890    0,00391550 (+14.5%)
8000    12      0,00951220    0,0480 (+16.8%)

I wasn't expecting such a performance loss in VPRTYBD and VPRTYBQ, so
I'm not sure it is worth moving those instructions. Comparing the
helper's assembly with the generated TCG ops they are pretty similar,
so I'm not sure why vprtybd took so much more time.
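
For reference, the per-element operation is the xor-fold of the low bit
of every byte, the same thing the helper computes; a standalone sketch
of the doubleword case:

#include <assert.h>
#include <stdint.h>

/* Parity of the least-significant bits of the eight bytes of a 64-bit
 * element, folded down with shifts and xors. */
static uint64_t prtybd_elem(uint64_t b)
{
    uint64_t res = b ^ (b >> 32);
    res ^= res >> 16;
    res ^= res >> 8;
    return res & 1;
}

int main(void)
{
    assert(prtybd_elem(0x0101010101010101ull) == 0); /* eight set LSBs: even */
    assert(prtybd_elem(0x0000000000000001ull) == 1); /* one set LSB: odd */
    return 0;
}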

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  6 ++--
 target/ppc/insn32.decode|  4 +++
 target/ppc/int_helper.c |  6 ++--
 target/ppc/translate/vmx-impl.c.inc | 55 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index feccf30bcb..6a43e32ad3 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -194,9 +194,9 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBW, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBD, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM  000100 . . . 1000100@VX
 VPDEPD  000100 . . . 10111001101@VX
 VPEXTD  000100 . . . 10110001101@VX
 
+VPRTYBD 000100 . 01001 . 1100010@VX_tb
+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX  000100 . . . . 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 338ebced22..64b2d44a66 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -502,7 +502,7 @@ void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b, uint32_t v)
 }
 
 /* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBW(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 int i;
 for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
@@ -513,7 +513,7 @@ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
 }
 
 /* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBD(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 int i;
 for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
@@ -525,7 +525,7 @@ void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
 }
 
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 uint64_t res = b->u64[0] ^ b->u64[1];
 res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 3f614097ac..06d91d1304 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,58 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 2

[PATCH 07/12] target/ppc: Move VABSDU[BHW] to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VABSDUB, VABSDUH and VABSDUW to decodetree and used gvec to
translate them.

vabsdub:
rept    loop    master        patch
8       12500   0,03601600    0,00688500 (-80.9%)
25      4000    0,03651000    0,00532100 (-85.4%)
100     1000    0,03666900    0,00595300 (-83.8%)
500     200     0,04305800    0,01244600 (-71.1%)
2500    40      0,06893300    0,04273700 (-38.0%)
8000    12      0,14633200    0,12660300 (-13.5%)

vabsduh:
rept    loop    master        patch
8       12500   0,02172400    0,00687500 (-68.4%)
25      4000    0,02154100    0,00531500 (-75.3%)
100     1000    0,02235400    0,00596300 (-73.3%)
500     200     0,02827500    0,01245100 (-56.0%)
2500    40      0,05638400    0,04285500 (-24.0%)
8000    12      0,13166000    0,12641400 (-4.0%)

vabsduw:
rept    loop    master        patch
8       12500   0,01646400    0,00688300 (-58.2%)
25      4000    0,01454500    0,00475500 (-67.3%)
100     1000    0,01545800    0,00511800 (-66.9%)
500     200     0,02168200    0,01114300 (-48.6%)
2500    40      0,04571300    0,04138800 (-9.5%)
8000    12      0,12209500    0,12178500 (-0.3%)

Same as VADDCUW and VSUBCUW: an overall performance gain, but more TCG
ops are used (4 before the patch, 6 after).
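
A minimal reference sketch of the per-element semantics (not the gvec
code itself):

#include <assert.h>
#include <stdint.h>

/* One VABSDUW element: the absolute difference of two unsigned words,
 * i.e. umax(a, b) - umin(a, b). */
static uint32_t absduw_elem(uint32_t a, uint32_t b)
{
    return a > b ? a - b : b - a;
}

int main(void)
{
    assert(absduw_elem(5, 9) == 4);
    assert(absduw_elem(9, 5) == 4);
    assert(absduw_elem(0, UINT32_MAX) == UINT32_MAX);
    return 0;
}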

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  6 ++--
 target/ppc/insn32.decode|  6 
 target/ppc/int_helper.c | 13 +++-
 target/ppc/translate/vmx-impl.c.inc | 49 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f88d9d3996..1f470a0e5e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -146,9 +146,9 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 DEF_HELPER_FLAGS_4(VAVGUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_3(vabsdub, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduh, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vabsduw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VABSDUB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
+DEF_HELPER_FLAGS_4(VABSDUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSB, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSH, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VAVGSW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 38458c01de..ae151c4b62 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -528,6 +528,12 @@ VAVGUB  000100 . . . 110@VX
 VAVGUH  000100 . . . 1000110@VX
 VAVGUW  000100 . . . 1001010@VX
 
+## Vector Integer Absolute Difference Instructions
+
+VABSDUB 000100 . . . 111@VX
+VABSDUH 000100 . . . 1000111@VX
+VABSDUW 000100 . . . 1001011@VX
+
 ## Vector Bit Manipulation Instruction
 
 VGNB000100 . -- ... . 10011001100   @VX_n
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 0a142441e5..a797b4ddaf 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -621,8 +621,8 @@ VAVG(VAVGSW, s32, int64_t)
 VAVG(VAVGUW, u32, uint64_t)
 #undef VAVG
 
-#define VABSDU_DO(name, element)\
-void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
+#define VABSDU(name, element)   \
+void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)\
 {   \
 int i;  \
 \
@@ -638,12 +638,9 @@ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b)   \
  *   name- instruction mnemonic suffix (b: byte, h: halfword, w: word)
  *   element - element type to access from vector
  */
-#define VABSDU(type, element)   \
-VABSDU_DO(absdu##type, element)
-VABSDU(b, u8)
-VABSDU(h, u16)
-VABSDU(w, u32)
-#undef VABSDU_DO
+VABSDU(VABSDUB, u8)
+VABSDU(VABSDUH, u16)
+VABSDU(VABSDUW, u32)
 #undef VABSDU
 
 #define VCF(suffix, cvt, element)   \
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 8ff7c6ff3a..1dd799620d 100644
--- a/target/ppc/translate/vmx-impl.c

[PATCH 03/12] target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VADDCUW and VSUBCUW to decodetree with gvec, using an
implementation based on the helper, with the main difference being
changing the -1 (aka all bits set to 1) result returned by cmp when
true to +1.
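
For reference, the per-element semantics being generated (plain C, not
the TCG code): VADDCUW stores the carry out of the 32-bit addition and
VSUBCUW the complement of the borrow of the subtraction.

#include <assert.h>
#include <stdint.h>

static uint32_t addcuw_elem(uint32_t a, uint32_t b)
{
    return ((uint64_t)a + b) >> 32;   /* carry out; same as (~a < b) */
}

static uint32_t subcuw_elem(uint32_t a, uint32_t b)
{
    return a >= b;                    /* 1 when no borrow is needed */
}

int main(void)
{
    assert(addcuw_elem(UINT32_MAX, 1) == 1);
    assert(addcuw_elem(1, 2) == 0);
    assert(subcuw_elem(3, 2) == 1);
    assert(subcuw_elem(2, 3) == 0);
    return 0;
}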

vaddcuw:
rept    loop    master        patch
8       12500   0,01420600    0,00679200 (-52.2%)
25      4000    0,01781700    0,00524500 (-70.6%)
100     1000    0,02053300    0,00591800 (-71.2%)
500     200     0,02709800    0,01254600 (-53.7%)
2500    40      0,05537100    0,04347800 (-21.5%)
8000    12      0,13103100    0,12973600 (-1.0%)

vsubcuw:
rept    loop    master        patch
8       12500   0,01426100    0,00685500 (-51.9%)
25      4000    0,01744600    0,00536000 (-69.3%)
100     1000    0,02029500    0,00597400 (-70.6%)
500     200     0,02654000    0,01263200 (-52.4%)
2500    40      0,05507200    0,04347100 (-21.1%)
8000    12      0,13072400    0,12872300 (-1.5%)

Overall there was a gain in performance, but the TCGop code was still
slightly bigger in the new version (it went from 4 to 5).

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  4 +--
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c |  4 +--
 target/ppc/translate/vmx-impl.c.inc | 50 ++---
 target/ppc/translate/vmx-ops.c.inc  |  3 +-
 5 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index f02a9497b7..edce059f2c 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,11 +193,11 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 9a509e84df..aebc7b73c8 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -608,12 +608,14 @@ VRLQNM  000100 . . . 00101000101
@VX
 
 ## Vector Integer Arithmetic Instructions
 
+VADDCUW 000100 . . . 0011000@VX
 VADDCUQ 000100 . . . 0010100@VX
 VADDUQM 000100 . . . 001@VX
 
 VADDEUQM000100 . . . . 00   @VA
 VADDECUQ000100 . . . . 01   @VA
 
+VSUBCUW 000100 . . . 1011000@VX
 VSUBCUQ 000100 . . . 1010100@VX
 VSUBUQM 000100 . . . 101@VX
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index ae1ba8084d..c48841819d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,7 +492,7 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)
 {
 int i;
 
@@ -1962,7 +1962,7 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #endif
 }
 
-void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+void helper_VSUBCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t v)
 {
 int i;
 
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 3acd585a2f..c5bfbfb3ce 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28);
 GEN_VXFORM(vslv, 2, 29);
 GEN_VXFORM(vslo, 6, 16);
 GEN_VXFORM(vsro, 6, 17);
-GEN_VXFORM(vaddcuw, 0, 6);
-GEN_VXFORM(vsubcuw, 0, 22);
 
 static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece,
void (*gen_gvec)(unsigned, uint32_t, uint32_t,
@@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx)
 }
 
 
-GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \
-xpnd04_1, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_DUAL(vsubsws, PPC_ALTIVEC, PPC_NONE, \
 xpnd04_2, PPC_NONE, PPC2_ISA300)
 
@@ -3110,6 +3106,52 @@ TRANS_FLAGS2(ALTIVEC_207, VPMSUMD, do_vx_helper, 
gen_helper_VPMSUMD)

[PATCH 01/12] target/ppc: Moved VMLADDUHM to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMLADDUHM to decodetree and creates a gvec
implementation using mul_vec and add_vec.

rept    loop    master        patch
8       12500   0,01810500    0,00903100 (-50.1%)
25      4000    0,01739400    0,00747700 (-57.0%)
100     1000    0,01843600    0,00901400 (-51.1%)
500     200     0,02574600    0,01971000 (-23.4%)
2500    40      0,05921600    0,07121800 (+20.3%)
8000    12      0,15326700    0,21725200 (+41.7%)

I think the significant difference in performance when REPT is low and
LOOP is high is due to the new implementation having a higher
translation time: with a helper only 5 TCG ops are used, but with the
patch a total of 10 TCG ops are needed (Power lacks a direct mul_vec
equivalent, so this instruction is implemented with the help of 5
others: vmuleu, vmulou, vmrgh, vmrgl and vpkum).
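
For reference, the per-element semantics being built out of mul_vec and
add_vec is just a modulo-2^16 multiply-add; a standalone sketch:

#include <assert.h>
#include <stdint.h>

/* One VMLADDUHM element: (a * b + c) modulo 2^16. */
static uint16_t mladduhm_elem(uint16_t a, uint16_t b, uint16_t c)
{
    return (uint16_t)((uint32_t)a * b + c);
}

int main(void)
{
    assert(mladduhm_elem(3, 4, 5) == 17);
    assert(mladduhm_elem(0xFFFF, 2, 1) == 0xFFFF);  /* wraps modulo 2^16 */
    return 0;
}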

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  2 +-
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c |  3 +-
 target/ppc/translate.c  |  1 -
 target/ppc/translate/vmx-impl.c.inc | 48 ++---
 5 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 57eee07256..9c562ab00e 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -264,7 +264,7 @@ DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, 
avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMSHS, void, env, avr, avr, avr, avr)
-DEF_HELPER_FLAGS_4(vmladduhm, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
+DEF_HELPER_FLAGS_5(VMLADDUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_2(mtvscr, TCG_CALL_NO_RWG, void, env, i32)
 DEF_HELPER_FLAGS_1(mfvscr, TCG_CALL_NO_RWG, i32, env)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index a5249ee32c..7445455a12 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -693,6 +693,8 @@ VMSUMUHS000100 . . . . 100111   @VA
 VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
+VMLADDUHM   000100 . . . . 100010   @VA
+
 ## Vector String Instructions
 
 VSTRIBL 000100 . 0 . . 001101   @VX_tb_rc
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 696096100b..0d25000b2a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -974,7 +974,8 @@ void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a,
 }
 }
 
-void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+void helper_VMLADDUHM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c,
+  uint32_t v)
 {
 int i;
 
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index e810842925..11f729c60c 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6921,7 +6921,6 @@ GEN_HANDLER(lvsl, 0x1f, 0x06, 0x00, 0x0001, 
PPC_ALTIVEC),
 GEN_HANDLER(lvsr, 0x1f, 0x06, 0x01, 0x0001, PPC_ALTIVEC),
 GEN_HANDLER(mfvscr, 0x04, 0x2, 0x18, 0x001ff800, PPC_ALTIVEC),
 GEN_HANDLER(mtvscr, 0x04, 0x2, 0x19, 0x03ff, PPC_ALTIVEC),
-GEN_HANDLER(vmladduhm, 0x04, 0x11, 0xFF, 0x, PPC_ALTIVEC),
 #if defined(TARGET_PPC64)
 GEN_HANDLER_E(maddhd_maddhdu, 0x04, 0x18, 0xFF, 0x, PPC_NONE,
   PPC2_ISA300),
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index e644ad3236..9f18c6d4f2 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2523,24 +2523,6 @@ static void glue(gen_, name0##_##name1)(DisasContext 
*ctx)  \
 
 GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
 
-static void gen_vmladduhm(DisasContext *ctx)
-{
-TCGv_ptr ra, rb, rc, rd;
-if (unlikely(!ctx->altivec_enabled)) {
-gen_exception(ctx, POWERPC_EXCP_VPU);
-return;
-}
-ra = gen_avr_ptr(rA(ctx->opcode));
-rb = gen_avr_ptr(rB(ctx->opcode));
-rc = gen_avr_ptr(rC(ctx->opcode));
-rd = gen_avr_ptr(rD(ctx->opcode));
-gen_helper_vmladduhm(rd, ra, rb, rc);
-tcg_temp_free_ptr(ra);
-tcg_temp_free_ptr(rb);
-tcg_temp_free_ptr(rc);
-tcg_temp_free_ptr(rd);
-}
-
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
 {
@@ -2569,6 +2551,36 @@ TRANS_FLAGS2(ALTIVEC_207, VSUBECUQ, do_va_helper, 
gen_helper_VSUBECUQ)
 TRANS_FLAGS(ALTIVEC, VPERM, do_va_helper, gen_helper_VPERM)
 TRANS_FLAGS2(ISA300, VPERMR, do_va_helper, gen_helper_VPERMR)
 
+static void gen_vmladduhm_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec 
b,
+  TCGv_vec c)
+{
+  

[PATCH 09/12] target/ppc: Use gvec to decode XVCPSGN[SD]P

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved XVCPSGNSP and XVCPSGNDP to decodetree and used gvec to translate
them.

xvcpsgnsp:
rept    loop    master        patch
8       12500   0,00722000    0,00587700 (-18.6%)
25      4000    0,00604300    0,00521500 (-13.7%)
100     1000    0,00815600    0,00508500 (-37.7%)
500     200     0,02376600    0,01222600 (-48.6%)
2500    40      0,07709200    0,04158300 (-46.1%)
8000    12      0,27922100    0,12394400 (-55.6%)

xvcpsgndp:
rept    loop    master        patch
8       12500   0,00557900    0,00584900 (+4.8%)
25      4000    0,00518700    0,00502900 (-3.0%)
100     1000    0,00655900    0,00569600 (-13.2%)
500     200     0,01560900    0,01260500 (-19.2%)
2500    40      0,05899200    0,03989400 (-32.4%)
8000    12      0,20046000    0,12417700 (-38.1%)

Like the previous instructions, there seemed to be an improvement in
translation time.
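
The operation per element is a pure bit-level copy-sign; a small
reference sketch of the double-precision case (illustration only, not
the TCG code):

#include <assert.h>
#include <stdint.h>

#define SGN_MASK_DP 0x8000000000000000ull

/* One XVCPSGNDP element: sign bit taken from a, magnitude from b. */
static uint64_t cpsgndp_elem(uint64_t a, uint64_t b)
{
    return (a & SGN_MASK_DP) | (b & ~SGN_MASK_DP);
}

int main(void)
{
    uint64_t plus_two  = 0x4000000000000000ull;  /*  2.0 */
    uint64_t minus_one = 0xBFF0000000000000ull;  /* -1.0 */
    assert(cpsgndp_elem(minus_one, plus_two) == 0xC000000000000000ull); /* -2.0 */
    return 0;
}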

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|   2 +
 target/ppc/translate/vsx-impl.c.inc | 114 ++--
 target/ppc/translate/vsx-ops.c.inc  |   3 -
 3 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 5b687078be..6549c4040e 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -762,6 +762,8 @@ XVNABSDP00 . 0 . 01001 ..   @XX2
 XVNABSSP00 . 0 . 110101001 ..   @XX2
 XVNEGDP 00 . 0 . 11001 ..   @XX2
 XVNEGSP 00 . 0 . 110111001 ..   @XX2
+XVCPSGNDP   00 . . .  ...   @XX3
+XVCPSGNSP   00 . . . 1101 ...   @XX3
 
 ## VSX Scalar Multiply-Add Instructions
 
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 426a9a3926..3e4509cb41 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -729,62 +729,6 @@ VSX_SCALAR_MOVE_QP(xsnabsqp, OP_NABS, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xsnegqp, OP_NEG, SGN_MASK_DP)
 VSX_SCALAR_MOVE_QP(xscpsgnqp, OP_CPSGN, SGN_MASK_DP)
 
-#define VSX_VECTOR_MOVE(name, op, sgn_mask)  \
-static void glue(gen_, name)(DisasContext *ctx)  \
-{\
-TCGv_i64 xbh, xbl, sgm;  \
-if (unlikely(!ctx->vsx_enabled)) {   \
-gen_exception(ctx, POWERPC_EXCP_VSXU);   \
-return;  \
-}\
-xbh = tcg_temp_new_i64();\
-xbl = tcg_temp_new_i64();\
-sgm = tcg_temp_new_i64();\
-get_cpu_vsr(xbh, xB(ctx->opcode), true); \
-get_cpu_vsr(xbl, xB(ctx->opcode), false);\
-tcg_gen_movi_i64(sgm, sgn_mask); \
-switch (op) {\
-case OP_ABS: {   \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-break;   \
-}\
-case OP_NABS: {  \
-tcg_gen_or_i64(xbh, xbh, sgm);   \
-tcg_gen_or_i64(xbl, xbl, sgm);   \
-break;   \
-}\
-case OP_NEG: {   \
-tcg_gen_xor_i64(xbh, xbh, sgm);  \
-tcg_gen_xor_i64(xbl, xbl, sgm);  \
-break;   \
-}\
-case OP_CPSGN: { \
-TCGv_i64 xah = tcg_temp_new_i64();   \
-TCGv_i64 xal = tcg_temp_new_i64();   \
-get_cpu_vsr(xah, xA(ctx->opcode), true); \
-get_cpu_vsr(xal, xA(ctx->opcode), false);\
-tcg_gen_and_i64(xah, xah, sgm);  \
-tcg_gen_and_i64(xal, xal, sgm);  \
-tcg_gen_andc_i64(xbh, xbh, sgm); \
-tcg_gen_andc_i64(xbl, xbl, sgm); \
-tcg_gen_or_i64(xbh, xbh, xah);   \
-tcg

[PATCH 04/12] target/ppc: Move VNEG[WD] to decodetree and use gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved the instructions VNEGW and VNEGD to decodetree and used gvec to
decode them.

vnegw:
rept    loop    master        patch
8       12500   0,01053200    0,00548400 (-47.9%)
25      4000    0,01030500    0,0039 (-62.2%)
100     1000    0,01096300    0,00395400 (-63.9%)
500     200     0,01472000    0,00712300 (-51.6%)
2500    40      0,03809000    0,02147700 (-43.6%)
8000    12      0,09957100    0,06202100 (-37.7%)

vnegd:
rept    loop    master        patch
8       12500   0,00594600    0,00543800 (-8.5%)
25      4000    0,00575200    0,00396400 (-31.1%)
100     1000    0,00676100    0,00394800 (-41.6%)
500     200     0,01149300    0,00709400 (-38.3%)
2500    40      0,03441500    0,02169600 (-37.0%)
8000    12      0,09516900    0,06337000 (-33.4%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  2 --
 target/ppc/insn32.decode|  3 +++
 target/ppc/int_helper.c | 12 
 target/ppc/translate/vmx-impl.c.inc | 15 +--
 target/ppc/translate/vmx-ops.c.inc  |  2 --
 5 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index edce059f2c..feccf30bcb 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -231,8 +231,6 @@ DEF_HELPER_FLAGS_2(VSTRIBL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIBR, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHL, TCG_CALL_NO_RWG, i32, avr, avr)
 DEF_HELPER_FLAGS_2(VSTRIHR, TCG_CALL_NO_RWG, i32, avr, avr)
-DEF_HELPER_FLAGS_2(vnegw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vnegd, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupklpx, TCG_CALL_NO_RWG, void, avr, avr)
 DEF_HELPER_FLAGS_2(vupkhsb, TCG_CALL_NO_RWG, void, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index aebc7b73c8..2658dd3395 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -629,6 +629,9 @@ VEXTSH2D000100 . 11001 . 1100010
@VX_tb
 VEXTSW2D000100 . 11010 . 1100010@VX_tb
 VEXTSD2Q000100 . 11011 . 1100010@VX_tb
 
+VNEGD   000100 . 00111 . 1100010@VX_tb
+VNEGW   000100 . 00110 . 1100010@VX_tb
+
 ## Vector Mask Manipulation Instructions
 
 MTVSRBM 000100 . 1 . 1100110@VX_tb
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c48841819d..338ebced22 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1937,18 +1937,6 @@ XXBLEND(W, 32)
 XXBLEND(D, 64)
 #undef XXBLEND
 
-#define VNEG(name, element) \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *b)  \
-{   \
-int i;  \
-for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
-r->element[i] = -b->element[i]; \
-}   \
-}
-VNEG(vnegw, s32)
-VNEG(vnegd, s64)
-#undef VNEG
-
 void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
 int sh = (b->VsrB(0xf) >> 3) & 0xf;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index c5bfbfb3ce..3f614097ac 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2625,8 +2625,19 @@ GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
 GEN_VXFORM_TRANS(vclzw, 1, 30)
 GEN_VXFORM_TRANS(vclzd, 1, 31)
-GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
-GEN_VXFORM_NOA_2(vnegd, 1, 24, 7)
+
+static bool do_vneg(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+REQUIRE_VECTOR(ctx);
+
+tcg_gen_gvec_neg(vece, avr_full_offset(a->vrt), avr_full_offset(a->vrb),
+ 16, 16);
+return true;
+}
+
+TRANS(VNEGW, do_vneg, MO_32)
+TRANS(VNEGD, do_vneg, MO_64)
 
 static void gen_vexts_i64(TCGv_i64 t, TCGv_i64 b, int64_t s)
 {
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index ded0234123..27908533dd 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -181,8 +181,6 @@ GEN_VXFORM_300_EXT(vextractd, 6, 11, 0x10),
 GEN_VXFORM(vspltisb, 6, 12),
 GEN_VXFORM(vspltish, 6, 13),
 GEN_VXFORM(vspltisw, 6, 14),
-GEN_VXFORM_300_EO(vnegw, 0x01, 0x18, 0x06),
-GEN_VXFORM_300_EO(vnegd, 0x01, 0x18, 0x07),
 GEN_VXFORM_300_EO(vctzb, 0x01, 0x18, 0x1C),
 GEN_VXFORM_300_EO(vctzh, 0x01, 0x18, 0x1D),
 GEN_VXFORM_300_EO(vctzw, 0x01, 0x18, 0x1E),
-- 
2.31.1




[PATCH 00/12] VMX/VSX instructions with gvec

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch series moves some instructions from the legacy decoder to
decodetree and translates said instructions with gvec. In some cases
the gvec version ended up bigger, more complex and slower, so those
instructions were only moved to decodetree.

In each patch there's a comparison of the execution time before and
after the patch is applied. Each result is the sum of 10 executions.

The program used to time the execution worked like this:

clock_t start = clock();
for (int i = 0; i < LOOP; i++) {
asm (
 load values in registers, between 2 and 3 instructions
 ".rept REPT\n\t"
 "INSTRUCTION registers\n\t"
 ".endr\n\t"
 save result from register, 1 instruction
);
}
clock_t end = clock();
printf("INSTRUCTION rept=REPT loop=LOOP, time taken: %.12lf\n",
   ((double)(end - start))/ CLOCKS_PER_SEC);

Where the rept column is the value used in .rept in the inline assembly
and the loop column is the value used for the for loop. All of those
tests were executed on a Power9. When comparing TCG ops, the data was
gathered using '-d op' and '-d op_opt'.
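
As a concrete (hypothetical) instantiation of the template above, a
vaddcuw benchmark could look roughly like this; the setup instructions,
clobber list and REPT/LOOP values are illustrative and assume GCC on a
POWER host with Altivec enabled (e.g. -maltivec), not the exact program
used for the tables:

#include <stdio.h>
#include <time.h>

#define LOOP 1000
#define REPT 100
#define STR_(x) #x
#define STR(x) STR_(x)

int main(void)
{
    clock_t start = clock();
    for (int i = 0; i < LOOP; i++) {
        asm volatile(
            "vspltisw 0, 1\n\t"        /* load values in registers */
            "vspltisw 1, 2\n\t"
            ".rept " STR(REPT) "\n\t"
            "vaddcuw 2, 0, 1\n\t"      /* instruction under test */
            ".endr\n\t"                /* (result store omitted here) */
            ::: "v0", "v1", "v2");
    }
    clock_t end = clock();
    printf("vaddcuw rept=%d loop=%d, time taken: %.12lf\n",
           REPT, LOOP, ((double)(end - start)) / CLOCKS_PER_SEC);
    return 0;
}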

Lucas Mateus Castro (alqotel) (12):
  target/ppc: Moved VMLADDUHM to decodetree and use gvec
  target/ppc: Move VMH[R]ADDSHS instruction to decodetree
  target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
  target/ppc: Move VNEG[WD] to decodetree and use gvec
  target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
  target/ppc: Move VAVG to decodetree and use gvec
  target/ppc: Move VABSDU to decodetree and use gvec
  target/ppc: Use gvec to decode XV[N]ABS[DS]P/XVNEG[DS]P
  target/ppc: Use gvec to decode XVCPSGN[SD]P
  target/ppc: Moved XVTSTDC[DS]P to decodetree
  target/ppc: Moved XSTSTDC[QDS]P to decodetree
  target/ppc: Use gvec to decode XVTSTDC[DS]P

 target/ppc/fpu_helper.c | 137 ++--
 target/ppc/helper.h |  46 ++--
 target/ppc/insn32.decode|  50 +
 target/ppc/int_helper.c |  74 +++
 target/ppc/translate.c  |   1 -
 target/ppc/translate/vmx-impl.c.inc | 333 
 target/ppc/translate/vmx-ops.c.inc  |  15 +-
 target/ppc/translate/vsx-impl.c.inc | 305 +++--
 target/ppc/translate/vsx-ops.c.inc  |  21 --
 9 files changed, 693 insertions(+), 289 deletions(-)

-- 
2.31.1




[PATCH 02/12] target/ppc: Move VMH[R]ADDSHS instruction to decodetree

2022-09-23 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch moves VMHADDSHS and VMHRADDSHS to decodetree; I couldn't
find a satisfactory inline TCG implementation.
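
For reference, a sketch of the per-element semantics the helpers
implement (a Q15-style multiply-high, add and saturate; illustration
only, not the helper code itself):

#include <assert.h>
#include <stdint.h>

static int16_t sat16(int32_t x)
{
    if (x > INT16_MAX) {
        return INT16_MAX;
    }
    if (x < INT16_MIN) {
        return INT16_MIN;
    }
    return (int16_t)x;
}

/* One VMHADDSHS element: ((a * b) >> 15) + c, saturated to 16 bits. */
static int16_t mhaddshs_elem(int16_t a, int16_t b, int16_t c)
{
    int32_t prod = (int32_t)a * b;
    return sat16((prod >> 15) + c);
}

/* VMHRADDSHS additionally rounds the product before the shift. */
static int16_t mhraddshs_elem(int16_t a, int16_t b, int16_t c)
{
    int32_t prod = (int32_t)a * b + 0x4000;
    return sat16((prod >> 15) + c);
}

int main(void)
{
    assert(mhaddshs_elem(0x4000, 0x4000, 1) == 0x2001);        /* 0.5 * 0.5 + eps */
    assert(mhraddshs_elem(0x7FFF, 0x7FFF, 0x7FFF) == 0x7FFF);  /* saturates */
    return 0;
}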

vmhaddshs:
rept    loop    master        patch
8       12500   0,02983400    0,02648500 (-11.2%)
25      4000    0,02946000    0,02518000 (-14.5%)
100     1000    0,03104300    0,02638000 (-15.0%)
500     200     0,04002000    0,03502500 (-12.5%)
2500    40      0,08090100    0,07562200 (-6.5%)
8000    12      0,19242600    0,18626800 (-3.2%)

vmhraddshs:
rept    loop    master        patch
8       12500   0,03078600    0,02851000 (-7.4%)
25      4000    0,02793200    0,02746900 (-1.7%)
100     1000    0,02886000    0,02839900 (-1.6%)
500     200     0,03714700    0,03799200 (+2.3%)
2500    40      0,07948000    0,07852200 (-1.2%)
8000    12      0,19049800    0,18813900 (-1.2%)

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h | 4 ++--
 target/ppc/insn32.decode| 2 ++
 target/ppc/int_helper.c | 4 ++--
 target/ppc/translate/vmx-impl.c.inc | 5 +++--
 target/ppc/translate/vmx-ops.c.inc  | 1 -
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 9c562ab00e..f02a9497b7 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -258,8 +258,8 @@ DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr)
 DEF_HELPER_4(vpkudum, void, env, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vpkpx, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr)
-DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHADDSHS, void, env, avr, avr, avr, avr)
+DEF_HELPER_5(VMHRADDSHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMUHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
 DEF_HELPER_5(VMSUMUHS, void, env, avr, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VMSUMSHM, TCG_CALL_NO_RWG, void, avr, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7445455a12..9a509e84df 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -694,6 +694,8 @@ VMSUMCUD000100 . . . . 010111   @VA
 VMSUMUDM000100 . . . . 100011   @VA
 
 VMLADDUHM   000100 . . . . 100010   @VA
+VMHADDSHS   000100 . . . . 10   @VA
+VMHRADDSHS  000100 . . . . 11   @VA
 
 ## Vector String Instructions
 
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 0d25000b2a..ae1ba8084d 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -939,7 +939,7 @@ target_ulong helper_vctzlsbb(ppc_avr_t *r)
 return count;
 }
 
-void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
   ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
@@ -957,7 +957,7 @@ void helper_vmhaddshs(CPUPPCState *env, ppc_avr_t *r, 
ppc_avr_t *a,
 }
 }
 
-void helper_vmhraddshs(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
+void helper_VMHRADDSHS(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
ppc_avr_t *b, ppc_avr_t *c)
 {
 int sat = 0;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 9f18c6d4f2..3acd585a2f 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2521,7 +2521,7 @@ static void glue(gen_, name0##_##name1)(DisasContext 
*ctx)  \
 tcg_temp_free_ptr(rd);  \
 }
 
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16)
+GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
 
 static bool do_va_helper(DisasContext *ctx, arg_VA *a,
 void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr))
@@ -2620,7 +2620,8 @@ static bool do_va_env_helper(DisasContext *ctx, arg_VA *a,
 TRANS_FLAGS(ALTIVEC, VMSUMUHS, do_va_env_helper, gen_helper_VMSUMUHS)
 TRANS_FLAGS(ALTIVEC, VMSUMSHS, do_va_env_helper, gen_helper_VMSUMSHS)
 
-GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
+TRANS_FLAGS(ALTIVEC, VMHADDSHS, do_va_env_helper, gen_helper_VMHADDSHS)
+TRANS_FLAGS(ALTIVEC, VMHRADDSHS, do_va_env_helper, gen_helper_VMHRADDSHS)
 
 GEN_VXFORM_NOA(vclzb, 1, 28)
 GEN_VXFORM_NOA(vclzh, 1, 29)
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index a3a0fd0650..7cd9d40e06 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -219,7 +219,6 @@ GEN_VXFORM_UIMM(vctsxs, 5, 15),
 
 #define GEN_VAFORM_PAIRED(name0, name1, opc2)   \
 GEN_HANDLER(name0##_##name1, 0x04, opc2, 0xFF, 0x, PPC_ALTIVEC)
-GEN_VAFORM_PAIRED(vmhaddshs, vmhraddshs, 16),
 GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23),
 
 GEN_VXFORM_DUAL(vclzb, vpopcntb, 1, 28, PPC_NONE, PPC2_A

[PATCH v3 0/4] Patch series to set up a ppc64le CI

2022-09-22 Thread Lucas Mateus Castro(alqotel)
This patch series aims to make it easier to set up a compilation and CI
environment on PPC64 and PPC64LE machines.

v3:
Changed patch 1 to respect alphabetical order

v2:
This patch series contains only patches 2-4 of v1 and an alternative to
patch 1 suggested by Daniel.

Lucas Mateus Castro (alqotel) (4):
  scripts/ci/setup: ninja missing from build-environment
  scripts/ci/setup: Fix libxen requirements
  scripts/ci/setup: spice-server only on x86 aarch64
  tests/docker: run script use realpath instead of readlink

 scripts/ci/setup/build-environment.yml | 15 +--
 tests/docker/run   |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

-- 
2.25.1




[PATCH v3 4/4] tests/docker: run script use realpath instead of readlink

2022-09-22 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

The alpine docker image only comes with busybox, which doesn't have the
'-e' option on its readlink, so change it to 'realpath' to avoid that
problem.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 tests/docker/run | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/docker/run b/tests/docker/run
index 421393046b..9eb96129da 100755
--- a/tests/docker/run
+++ b/tests/docker/run
@@ -15,7 +15,7 @@ if test -n "$V"; then
 set -x
 fi
 
-BASE="$(dirname $(readlink -e $0))"
+BASE="$(dirname $(realpath $0))"
 
 # Prepare the environment
 export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
-- 
2.25.1




[PATCH v3 3/4] scripts/ci/setup: spice-server only on x86 aarch64

2022-09-22 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Changed build-environment.yml to only install spice-server on x86_64 and
aarch64 as this package is only available on those architectures.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Philippe Mathieu-Daudé 
---
 scripts/ci/setup/build-environment.yml | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 49292715d3..b04c2b7cee 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -160,7 +160,6 @@
   - python36
   - rdma-core-devel
   - spice-glib-devel
-  - spice-server
   - systemtap-sdt-devel
   - tar
   - zlib-devel
@@ -168,3 +167,14 @@
   when:
 - ansible_facts['distribution_file_variety'] == 'RedHat'
 - ansible_facts['distribution_version'] == '8'
+
+- name: Install packages only available on x86 and aarch64
+  dnf:
+# Spice server not available in ppc64le
+name:
+  - spice-server
+state: present
+  when:
+- ansible_facts['distribution_file_variety'] == 'RedHat'
+- ansible_facts['distribution_version'] == '8'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
-- 
2.25.1




[PATCH v3 2/4] scripts/ci/setup: Fix libxen requirements

2022-09-22 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

The Xen hypervisor is only available on ARM and x86, but the YAML only
checked that the architecture is not s390x; changed it to a more
accurate test.
Tested this change on Ubuntu 20.04 ppc64le.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
---
 scripts/ci/setup/build-environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index b5acaf9118..49292715d3 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -97,7 +97,7 @@
 state: present
   when:
 - ansible_facts['distribution'] == 'Ubuntu'
-- ansible_facts['architecture'] != 's390x'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
 
 - name: Install basic packages to build QEMU on Ubuntu 20.04
   package:
-- 
2.25.1




[PATCH v3 1/4] scripts/ci/setup: ninja missing from build-environment

2022-09-22 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

ninja-build is missing from the RHEL environment, so a system prepared
with that script would still fail to compile QEMU.
Tested on a Fedora 36

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 scripts/ci/setup/build-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 232525b91d..b5acaf9118 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -153,6 +153,7 @@
   - make
   - mesa-libEGL-devel
   - nettle-devel
+  - ninja-build
   - nmap-ncat
   - perl-Test-Harness
   - pixman-devel
-- 
2.25.1




[PATCH v2 4/4] tests/docker: run script use realpath instead of readlink

2022-09-14 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

The alpine docker image only comes with busybox, which doesn't have the
'-e' option on its readlink, so change it to 'realpath' to avoid that
problem.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 tests/docker/run | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/docker/run b/tests/docker/run
index 421393046b..9eb96129da 100755
--- a/tests/docker/run
+++ b/tests/docker/run
@@ -15,7 +15,7 @@ if test -n "$V"; then
 set -x
 fi
 
-BASE="$(dirname $(readlink -e $0))"
+BASE="$(dirname $(realpath $0))"
 
 # Prepare the environment
 export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
-- 
2.31.1




[PATCH v2 2/4] scripts/ci/setup: Fix libxen requirements

2022-09-14 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

The Xen hypervisor is only available on ARM and x86, but the YAML only
checked that the architecture is not s390x; changed it to a more
accurate test.
Tested this change on Ubuntu 20.04 ppc64le.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Alex Bennée 
---
 scripts/ci/setup/build-environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 6df3e61d94..7535228685 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -97,7 +97,7 @@
 state: present
   when:
 - ansible_facts['distribution'] == 'Ubuntu'
-- ansible_facts['architecture'] != 's390x'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
 
 - name: Install basic packages to build QEMU on Ubuntu 20.04
   package:
-- 
2.31.1




[PATCH v2 1/4] scripts/ci/setup: ninja missing from build-environment

2022-09-14 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

ninja-build is missing from the RHEL environment, so a system prepared
with that script would still fail to compile QEMU.
Tested on a Fedora 36

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 scripts/ci/setup/build-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 232525b91d..6df3e61d94 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -150,6 +150,7 @@
   - libepoxy-devel
   - libgcrypt-devel
   - lzo-devel
+  - ninja-build
   - make
   - mesa-libEGL-devel
   - nettle-devel
-- 
2.31.1




[PATCH v2 0/4] Patch series to set up a ppc64le CI

2022-09-14 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

This patch series aims to make it easier to set up a compilation and CI
environment on PPC64 and PPC64LE machines.

v2:
This patch series contains only patches 2-4 of v1 and an alternative to
patch 1 suggested by Daniel.

Lucas Mateus Castro (alqotel) (4):
  scripts/ci/setup: ninja missing from build-environment
  scripts/ci/setup: Fix libxen requirements
  scripts/ci/setup: spice-server only on x86 aarch64
  tests/docker: run script use realpath instead of readlink

 scripts/ci/setup/build-environment.yml | 15 +--
 tests/docker/run   |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

-- 
2.31.1




[PATCH v2 3/4] scripts/ci/setup: spice-server only on x86 aarch64

2022-09-14 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Changed build-environment.yml to only install spice-server on x86_64 and
aarch64 as this package is only available on those architectures.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 scripts/ci/setup/build-environment.yml | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 7535228685..43cf8c759f 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -160,7 +160,6 @@
   - python36
   - rdma-core-devel
   - spice-glib-devel
-  - spice-server
   - systemtap-sdt-devel
   - tar
   - zlib-devel
@@ -168,3 +167,14 @@
   when:
 - ansible_facts['distribution_file_variety'] == 'RedHat'
 - ansible_facts['distribution_version'] == '8'
+
+- name: Install packages only available on x86 and aarch64
+  dnf:
+# Spice server not available in ppc64le
+name:
+  - spice-server
+state: present
+  when:
+- ansible_facts['distribution_file_variety'] == 'RedHat'
+- ansible_facts['distribution_version'] == '8'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
-- 
2.31.1




[PATCH v2 2/2] tests/tcg/ppc64le: Added an underflow with UE=1 test

2022-08-18 Thread Lucas Mateus Castro(alqotel)
Added a test to see if the adjustment is being made correctly when an
underflow occurs and UE is set.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
This patch will also fail without the underflow with UE set bugfix
Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br>
---
 tests/tcg/ppc64/Makefile.target   |  1 +
 tests/tcg/ppc64le/Makefile.target |  1 +
 tests/tcg/ppc64le/ue_excp.c   | 53 +++
 3 files changed, 55 insertions(+)
 create mode 100644 tests/tcg/ppc64le/ue_excp.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index 43958ad87b..583677031b 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -30,5 +30,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
 PPC64_TESTS += oe_excp
+PPC64_TESTS += ue_excp
 
 TESTS += $(PPC64_TESTS)
diff --git a/tests/tcg/ppc64le/Makefile.target 
b/tests/tcg/ppc64le/Makefile.target
index 8d11ac731d..b9e689c582 100644
--- a/tests/tcg/ppc64le/Makefile.target
+++ b/tests/tcg/ppc64le/Makefile.target
@@ -28,5 +28,6 @@ PPC64LE_TESTS += mffsce
 PPC64LE_TESTS += signal_save_restore_xer
 PPC64LE_TESTS += xxspltw
 PPC64LE_TESTS += oe_excp
+PPC64LE_TESTS += ue_excp
 
 TESTS += $(PPC64LE_TESTS)
diff --git a/tests/tcg/ppc64le/ue_excp.c b/tests/tcg/ppc64le/ue_excp.c
new file mode 100644
index 00..028ef3bbc7
--- /dev/null
+++ b/tests/tcg/ppc64le/ue_excp.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#define FP_UE (1ull << 5)
+#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB))
+
+void sigfpe_handler(int sig, siginfo_t *si, void *ucontext)
+{
+union {
+uint64_t ll;
+double dp;
+} r;
+uint64_t ch = 0x1b64f1c1b000ull;
+r.dp = ((ucontext_t *)ucontext)->uc_mcontext.fp_regs[2];
+if (r.ll == ch) {
+exit(0);
+}
+fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, r.ll);
+exit(1);
+}
+
+int main()
+{
+uint64_t fpscr;
+uint64_t a = 0x5ca8ull;
+uint64_t b = 0x1cefull;
+
+struct sigaction sa = {
+.sa_sigaction = sigfpe_handler,
+.sa_flags = SA_SIGINFO
+};
+
+prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE);
+sigaction(SIGFPE, &sa, NULL);
+
+fpscr = FP_UE;
+MTFSF(0b, fpscr);
+
+asm (
+"lfd 0, %0\n\t"
+"lfd 1, %1\n\t"
+"fmul 2, 0, 1\n\t"
+:
+: "m"(a), "m"(b)
+: "memory", "fr0", "fr1", "fr2"
+);
+
+abort();
+}
-- 
2.25.1




[PATCH v2 1/2] tests/tcg/ppc64le: Added an overflow with OE=1 test

2022-08-18 Thread Lucas Mateus Castro(alqotel)
Added a test to see if the adjustment is being made correctly when an
overflow occurs and OE is set.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
The prctl patch is not ready yet, so this patch does as Richard
Henderson suggested and checks the FP register in the signal handler.

This patch will fail without the overflow with OE set bugfix
Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br>
---
 tests/tcg/ppc64/Makefile.target   |  1 +
 tests/tcg/ppc64le/Makefile.target |  1 +
 tests/tcg/ppc64le/oe_excp.c   | 53 +++
 3 files changed, 55 insertions(+)
 create mode 100644 tests/tcg/ppc64le/oe_excp.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index 331fae628e..43958ad87b 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -29,5 +29,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
+PPC64_TESTS += oe_excp
 
 TESTS += $(PPC64_TESTS)
diff --git a/tests/tcg/ppc64le/Makefile.target 
b/tests/tcg/ppc64le/Makefile.target
index 6ca3003f02..8d11ac731d 100644
--- a/tests/tcg/ppc64le/Makefile.target
+++ b/tests/tcg/ppc64le/Makefile.target
@@ -27,5 +27,6 @@ PPC64LE_TESTS += mtfsf
 PPC64LE_TESTS += mffsce
 PPC64LE_TESTS += signal_save_restore_xer
 PPC64LE_TESTS += xxspltw
+PPC64LE_TESTS += oe_excp
 
 TESTS += $(PPC64LE_TESTS)
diff --git a/tests/tcg/ppc64le/oe_excp.c b/tests/tcg/ppc64le/oe_excp.c
new file mode 100644
index 00..c8f07d80b6
--- /dev/null
+++ b/tests/tcg/ppc64le/oe_excp.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#define FP_OE (1ull << 6)
+#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB))
+
+void sigfpe_handler(int sig, siginfo_t *si, void *ucontext)
+{
+union {
+uint64_t ll;
+double dp;
+} r;
+uint64_t ch = 0x5fcfffe4965a17e0ull;
+r.dp = ((ucontext_t *)ucontext)->uc_mcontext.fp_regs[2];
+if (r.ll == ch) {
+exit(0);
+}
+fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, r.ll);
+exit(1);
+}
+
+int main()
+{
+uint64_t fpscr;
+uint64_t a = 0x7fdfffe816d77b00ull;
+uint64_t b = 0x7fdfffFC7F7FFF00ull;
+
+struct sigaction sa = {
+.sa_sigaction = sigfpe_handler,
+.sa_flags = SA_SIGINFO
+};
+
+prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE);
+sigaction(SIGFPE, &sa, NULL);
+
+fpscr = FP_OE;
+MTFSF(0b, fpscr);
+
+asm (
+"lfd 0, %0\n\t"
+"lfd 1, %1\n\t"
+"fmul 2, 0, 1\n\t"
+:
+: "m"(a), "m"(b)
+: "memory", "fr0", "fr1", "fr2"
+);
+
+abort();
+}
-- 
2.25.1




[PATCH 2/2] tests/tcg/ppc64le: Added an underflow with UE=1 test

2022-08-17 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Added a test to see if the adjustment is being made correctly when an
underflow occurs and UE is set.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
This test will also fail without the 'underflow with UE set' bugfix
Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br>
---
 tests/tcg/ppc64/Makefile.target   |  1 +
 tests/tcg/ppc64le/Makefile.target |  1 +
 tests/tcg/ppc64le/ue_excp.c   | 54 +++
 3 files changed, 56 insertions(+)
 create mode 100644 tests/tcg/ppc64le/ue_excp.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index 43958ad87b..583677031b 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -30,5 +30,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
 PPC64_TESTS += oe_excp
+PPC64_TESTS += ue_excp
 
 TESTS += $(PPC64_TESTS)
diff --git a/tests/tcg/ppc64le/Makefile.target 
b/tests/tcg/ppc64le/Makefile.target
index 8d11ac731d..b9e689c582 100644
--- a/tests/tcg/ppc64le/Makefile.target
+++ b/tests/tcg/ppc64le/Makefile.target
@@ -28,5 +28,6 @@ PPC64LE_TESTS += mffsce
 PPC64LE_TESTS += signal_save_restore_xer
 PPC64LE_TESTS += xxspltw
 PPC64LE_TESTS += oe_excp
+PPC64LE_TESTS += ue_excp
 
 TESTS += $(PPC64LE_TESTS)
diff --git a/tests/tcg/ppc64le/ue_excp.c b/tests/tcg/ppc64le/ue_excp.c
new file mode 100644
index 00..b25ba1f803
--- /dev/null
+++ b/tests/tcg/ppc64le/ue_excp.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+#define FP_UE (1ull << 5)
+#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB))
+
+void sigfpe_handler(int sig, siginfo_t *si, void *ucontext)
+{
+uint64_t t;
+uint64_t ch = 0x1b64f1c1b000ull;
+asm (
+"stfd 2, %0\n\t"
+: "=m"(t)
+:
+: "memory", "fr2"
+);
+if (t == ch) {
+exit(0);
+}
+fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, t);
+exit(1);
+}
+
+int main()
+{
+uint64_t fpscr;
+uint64_t a = 0x5ca8ull;
+uint64_t b = 0x1cefull;
+
+struct sigaction sa = {
+.sa_sigaction = sigfpe_handler,
+.sa_flags = SA_SIGINFO
+};
+
+prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE);
+sigaction(SIGFPE, &sa, NULL);
+
+fpscr = FP_UE;
+MTFSF(0b, fpscr);
+
+asm (
+"lfd 0, %0\n\t"
+"lfd 1, %1\n\t"
+"fmul 2, 0, 1\n\t"
+:
+: "m"(a), "m"(b)
+: "memory", "fr0", "fr1", "fr2"
+);
+
+return -1;
+}
-- 
2.31.1




[PATCH 1/2] tests/tcg/ppc64le: Added an overflow with OE=1 test

2022-08-17 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Added a test to see if the adjustment is being made correctly when an
overflow occurs and OE is set.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
The prctl patch is not ready yet, so this patch does as Richard
Henderson suggested and checks the FP register in the signal handler.

This test will fail without the 'overflow with OE set' bugfix
Message-Id:<20220805141522.412864-3-lucas.ara...@eldorado.org.br>
---
 tests/tcg/ppc64/Makefile.target   |  1 +
 tests/tcg/ppc64le/Makefile.target |  1 +
 tests/tcg/ppc64le/oe_excp.c   | 54 +++
 3 files changed, 56 insertions(+)
 create mode 100644 tests/tcg/ppc64le/oe_excp.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index 331fae628e..43958ad87b 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -29,5 +29,6 @@ run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
+PPC64_TESTS += oe_excp
 
 TESTS += $(PPC64_TESTS)
diff --git a/tests/tcg/ppc64le/Makefile.target 
b/tests/tcg/ppc64le/Makefile.target
index 6ca3003f02..8d11ac731d 100644
--- a/tests/tcg/ppc64le/Makefile.target
+++ b/tests/tcg/ppc64le/Makefile.target
@@ -27,5 +27,6 @@ PPC64LE_TESTS += mtfsf
 PPC64LE_TESTS += mffsce
 PPC64LE_TESTS += signal_save_restore_xer
 PPC64LE_TESTS += xxspltw
+PPC64LE_TESTS += oe_excp
 
 TESTS += $(PPC64LE_TESTS)
diff --git a/tests/tcg/ppc64le/oe_excp.c b/tests/tcg/ppc64le/oe_excp.c
new file mode 100644
index 00..cfc364f5ed
--- /dev/null
+++ b/tests/tcg/ppc64le/oe_excp.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+#define FP_OE (1ull << 6)
+#define MTFSF(FLM, FRB) asm volatile ("mtfsf %0, %1" :: "i" (FLM), "f" (FRB))
+
+void sigfpe_handler(int sig, siginfo_t *si, void *ucontext)
+{
+uint64_t t;
+uint64_t ch = 0x5fcfffe4965a17e0ull;
+asm (
+"stfd 2, %0\n\t"
+: "=m"(t)
+:
+: "memory", "fr2"
+);
+if (t == ch) {
+exit(0);
+}
+fprintf(stderr, "expected result: %lx\n result: %lx\n", ch, t);
+exit(1);
+}
+
+int main()
+{
+uint64_t fpscr;
+uint64_t a = 0x7fdfffe816d77b00ull;
+uint64_t b = 0x7fdfffFC7F7FFF00ull;
+
+struct sigaction sa = {
+.sa_sigaction = sigfpe_handler,
+.sa_flags = SA_SIGINFO
+};
+
+prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE);
+sigaction(SIGFPE, &sa, NULL);
+
+fpscr = FP_OE;
+MTFSF(0b, fpscr);
+
+asm (
+"lfd 0, %0\n\t"
+"lfd 1, %1\n\t"
+"fmul 2, 0, 1\n\t"
+:
+: "m"(a), "m"(b)
+: "memory", "fr0", "fr1", "fr2"
+);
+
+return -1;
+}
-- 
2.31.1




[PATCH 1/2] fpu: Add rebias bool, value and operation

2022-08-05 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Added the possibility of recalculating a result if it overflows or
underflows. If the result overflows and the rebias bool is true, then
3/4 of the total exponent range is subtracted from the exponent of the
intermediate result. The same happens for underflow, except that the
value is added to the exponent of the intermediate result instead.
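
For reference, here is what exp_re_bias works out to with the
FLOAT_PARAMS_ change below (just arithmetic for illustration; the
float32/float64 values match the 192/1536 adjustments in the PowerISA):

    exp_re_bias = (1 << (E - 1)) + (1 << (E - 2))   /* 3/4 of 2^E */
    float32: E = 8  -> 128 + 64   = 192
    float64: E = 11 -> 1024 + 512 = 1536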

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 fpu/softfloat-parts.c.inc | 21 +++--
 fpu/softfloat.c   |  2 ++
 include/fpu/softfloat-types.h |  4 
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index bbeadaa189..a9f268fcab 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -214,18 +214,35 @@ static void partsN(uncanon_normal)(FloatPartsN *p, 
float_status *s,
 p->frac_lo &= ~round_mask;
 }
 } else if (unlikely(exp >= exp_max)) {
-flags |= float_flag_overflow | float_flag_inexact;
-if (overflow_norm) {
+flags |= float_flag_overflow;
+if (s->rebias_overflow) {
+exp -= fmt->exp_re_bias;
+} else if (overflow_norm) {
+flags |= float_flag_inexact;
 exp = exp_max - 1;
 frac_allones(p);
 p->frac_lo &= ~round_mask;
 } else {
+flags |= float_flag_inexact;
 p->cls = float_class_inf;
 exp = exp_max;
 frac_clear(p);
 }
 }
 frac_shr(p, frac_shift);
+} else if (unlikely(s->rebias_underflow)) {
+flags |= float_flag_underflow;
+exp += fmt->exp_re_bias;
+if (p->frac_lo & round_mask) {
+flags |= float_flag_inexact;
+if (frac_addi(p, p, inc)) {
+frac_shr(p, 1);
+p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
+exp++;
+}
+p->frac_lo &= ~round_mask;
+}
+frac_shr(p, frac_shift);
 } else if (s->flush_to_zero) {
 flags |= float_flag_output_denormal;
 p->cls = float_class_zero;
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 4a871ef2a1..c7454c3eb1 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -521,6 +521,7 @@ typedef struct {
 typedef struct {
 int exp_size;
 int exp_bias;
+int exp_re_bias;
 int exp_max;
 int frac_size;
 int frac_shift;
@@ -532,6 +533,7 @@ typedef struct {
 #define FLOAT_PARAMS_(E)\
 .exp_size   = E,\
 .exp_bias   = ((1 << E) - 1) >> 1,  \
+.exp_re_bias= (1 << (E - 1)) + (1 << (E - 2)),  \
 .exp_max= (1 << E) - 1
 
 #define FLOAT_PARAMS(E, F)  \
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index 7a6ea881d8..9735543ac4 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -195,6 +195,10 @@ typedef struct float_status {
 bool snan_bit_is_one;
 bool use_first_nan;
 bool no_signaling_nans;
+/* should overflowed results subtract re_bias to its exponent? */
+bool rebias_overflow;
+/* should underflowed results add re_bias to its exponent? */
+bool rebias_underflow;
 } float_status;
 
 #endif /* SOFTFLOAT_TYPES_H */
-- 
2.31.1




[PATCH 2/2] target/ppc: Bugfix FP when OE/UE are set

2022-08-05 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

When an overflow exception occurs and OE is set, the intermediate result
should be adjusted (by subtracting from the exponent) to avoid rounding
to inf. The same applies to an underflow exception and UE (but adding to
the exponent). To do this, set fp_status.rebias_overflow when OE is set
and fp_status.rebias_underflow when UE is set, as the FPU will
recalculate the result on an overflow/underflow if the corresponding
rebias flag is set.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/cpu.c| 2 ++
 target/ppc/fpu_helper.c | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index 401b6f9e63..0ebac04bc4 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -120,6 +120,8 @@ void ppc_store_fpscr(CPUPPCState *env, target_ulong val)
 val |= FP_FEX;
 }
 env->fpscr = val;
+env->fp_status.rebias_overflow  = (FP_OE & env->fpscr) ? true : false;
+env->fp_status.rebias_underflow = (FP_UE & env->fpscr) ? true : false;
 if (tcg_enabled()) {
 fpscr_set_rounding_mode(env);
 }
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 134804628b..c17575de5d 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -344,7 +344,6 @@ static inline int float_overflow_excp(CPUPPCState *env)
 
 bool overflow_enabled = !!(env->fpscr & FP_OE);
 if (overflow_enabled) {
-/* XXX: should adjust the result */
 /* Update the floating-point enabled exception summary */
 env->fpscr |= FP_FEX;
 /* We must update the target FPR before raising the exception */
@@ -363,7 +362,6 @@ static inline void float_underflow_excp(CPUPPCState *env)
 /* Update the floating-point exception summary */
 env->fpscr |= FP_FX;
 if (env->fpscr & FP_UE) {
-/* XXX: should adjust the result */
 /* Update the floating-point enabled exception summary */
 env->fpscr |= FP_FEX;
 /* We must update the target FPR before raising the exception */
-- 
2.31.1




[PATCH 0/2] Floating-point OE/UE exception bug

2022-08-05 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Changes in v2:
- Completely reworked the solution:
* Created re_bias in FloatFmt; it is 3/4 of the total exponent
  range of an FP type
* Added rebias bools that dictate whether the result should have
  the re_bias value added to/subtracted from its exponent if an
  overflow/underflow occurs.
* ppc_store_fpscr sets/unsets rebias if OE/UE is set/unset

The PowerISA defines that if an overflow exception happens with FPSCR.OE
set, 1536 is subtracted from the exponent of the intermediate result in
double-precision operations, and 1536 is added to it on an underflow
exception. QEMU currently does not implement this behavior; this patch
series fixes that.

Currently there's no test in this patch series as there's no way to
disable MSR.FE0 and MSR.FE1 in linux-user, so any overflow/underflow
exception with OE/UE set causes a trapping exception.

Lucas Mateus Castro (alqotel) (2):
  fpu: Add rebias bool, value and operation
  target/ppc: Bugfix FP when OE/UE are set

 fpu/softfloat-parts.c.inc | 21 +++--
 fpu/softfloat.c   |  2 ++
 include/fpu/softfloat-types.h |  4 
 target/ppc/cpu.c  |  2 ++
 target/ppc/fpu_helper.c   |  2 --
 5 files changed, 27 insertions(+), 4 deletions(-)

-- 
2.31.1




[PATCH] tests/tcg/ppc64le: Added OE/UE enabled exception test

2022-08-03 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

DO NOT MERGE

This patch adds a test to check whether the exponent of the intermediate
result is adjusted when an overflow or underflow exception occurs with
the corresponding enabling bit set (i.e. OE/UE). However, linux-user
currently can't disable MSR.FE0 and MSR.FE1, so this will always result
in a trapping exception; to avoid that, the test should be run in a VM
or with Matheus' WIP patch at
https://github.com/PPC64/qemu/tree/alqotel-ferst-prctl-patch

The expected test values were obtained on a Power9 machine.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 tests/tcg/ppc64le/oe_ue_excp.c | 105 +
 1 file changed, 105 insertions(+)
 create mode 100644 tests/tcg/ppc64le/oe_ue_excp.c

diff --git a/tests/tcg/ppc64le/oe_ue_excp.c b/tests/tcg/ppc64le/oe_ue_excp.c
new file mode 100644
index 00..384219a366
--- /dev/null
+++ b/tests/tcg/ppc64le/oe_ue_excp.c
@@ -0,0 +1,105 @@
+#include <stdio.h>
+#include <float.h>
+#include <sys/prctl.h>
+
+#define FP_OE (1ull << 6)
+#define FP_UE (1ull << 5)
+
+typedef union {
+double d;
+long long ll;
+} ll_fp;
+
+double asm_fmul (double a, double b)
+{
+double t;
+asm (
+"lfd 0, %1\n\t"
+"lfd 1, %2\n\t"
+"fmul 2, 0, 1\n\t"
+"stfd 2, %0\n\t"
+:"=m"(t)
+:"m"(a),"m"(b)
+);
+return t;
+}
+
+double asm_fdiv (double a, double b)
+{
+double t;
+asm (
+"lfd 0, %1\n\t"
+"lfd 1, %2\n\t"
+"fdiv 2, 0, 1\n\t"
+"stfd 2, %0\n\t"
+:"=m"(t)
+:"m"(a),"m"(b)
+);
+return t;
+}
+
+int main ()
+{
+int i, ok = 1;
+ll_fp fpscr, t;
+
+prctl(PR_SET_FPEXC, PR_FP_EXC_DISABLED);
+
+fpscr.ll = FP_UE | FP_OE;
+__builtin_mtfsf (0b, fpscr.d);
+fpscr.d = __builtin_mffs ();
+printf("fpscr = %016llx\n", fpscr.ll);
+
+ll_fp ch[] =
+{
+{ .ll = 0x1b64f1c1b000ull },
+{ .ll = 0x1b64f1c1b001ull },
+{ .ll = 0x1b90de341000ull },
+{ .ll = 0x1b90de341000ull },
+{ .ll = 0x5fcfffe4965a17e0ull },
+{ .ll = 0x5fcfffe4965a17e0ull },
+{ .ll = 0x2003ull },
+{ .ll = 0x2003ull }
+};
+
+ll_fp a[] =
+{
+{ .ll = 0x5ca8ull },
+{ .ll = 0xbadcull },
+{ .ll = 0x7fdfffe816d77b00ull },
+{ .d  = DBL_MAX }
+};
+
+ll_fp b[] =
+{
+{ .ll = 0x1cefull },
+{ .ll = 0x5c70ull },
+{ .ll = 0x7fdfffFC7F7FFF00ull },
+{ .d  = 2.5 }
+};
+
+for (i = 0; i < 4; i++) {
+t.d = asm_fmul(a[i].d, b[i].d);
+if (t.ll != ch[2 * i].ll) {
+ok = 0;
+printf ("Mismatch on fmul n %d:\n\tresult:   %016llx\n\t"
+"expected: %016llx\n", i, t.ll, ch[2 * i].ll);
+} else {
+printf ("Ok on fmul n %d\n", i);
+}
+t.d = asm_fdiv(a[i].d, 1.0/b[i].d);
+if (t.ll != ch[2 * i + 1].ll) {
+ok = 0;
+printf ("Mismatch on fdiv n %d:\n\tresult:   %016llx\n\t"
+"expected: %016llx\n", i, t.ll, ch[2 * i + 1].ll);
+} else {
+printf ("Ok on fdiv n %d\n", i);
+}
+}
+fpscr.d = __builtin_mffs ();
+printf("fpscr = %016llx\n", fpscr.ll);
+if(!ok) {
+return -1;
+}
+return 0;
+}
-- 
2.31.1




[RFC PATCH 3/3] target/ppc: Bugfix fdiv result with OE/UE set

2022-08-03 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Change fdiv in the same way as fadd/fsub to handle overflow/underflow
when OE/UE is set (i.e. use a function that receives a value to add
to/subtract from the exponent if an overflow/underflow occurs).

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 fpu/softfloat.c | 30 ++
 include/fpu/softfloat.h |  1 +
 target/ppc/fpu_helper.c |  5 -
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e2b4ad4b63..0e9d2d2678 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2558,6 +2558,27 @@ soft_f64_div(float64 a, float64 b, float_status *status)
 return float64_round_pack_canonical(pr, status);
 }
 
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_div_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+ float_status *status)
+{
+FloatParts64 pa, pb, *pr;
+
+float64_unpack_canonical(&pa, a, status);
+float64_unpack_canonical(&pb, b, status);
+pr = parts_div(&pa, &pb, status);
+
+if (unlikely(oe_sub && (pr->exp > 1023))) {
+pr->exp -= oe_sub;
+float_raise(float_flag_overflow, status);
+} else if (unlikely(ue_sum && (pr->exp < -1022))) {
+pr->exp += ue_sum;
+float_raise(float_flag_underflow, status);
+}
+
+return float64_round_pack_canonical(pr, status);
+}
+
 static float hard_f32_div(float a, float b)
 {
 return a / b;
@@ -2616,6 +2637,15 @@ float64_div(float64 a, float64 b, float_status *s)
 f64_div_pre, f64_div_post);
 }
 
+float64 QEMU_FLATTEN
+float64_div_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+float_status *s)
+{
+return float64_gen2_excp(a, b, oe_sub, ue_sum, s, hard_f64_div,
+ soft_f64_div, soft_f64_div_excp_en, f64_div_pre,
+ f64_div_post);
+}
+
 float64 float64r32_div(float64 a, float64 b, float_status *status)
 {
 FloatParts64 pa, pb, *pr;
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 4ff56b0e10..a6c7885fcd 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -778,6 +778,7 @@ float64 float64_sub_excp_en(float64, float64, int, int, 
float_status *status);
 float64 float64_mul(float64, float64, float_status *status);
 float64 float64_mul_excp_en(float64, float64, int, int, float_status *status);
 float64 float64_div(float64, float64, float_status *status);
+float64 float64_div_excp_en(float64, float64, int, int, float_status *status);
 float64 float64_rem(float64, float64, float_status *status);
 float64 float64_muladd(float64, float64, float64, int, float_status *status);
 float64 float64_sqrt(float64, float_status *status);
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 18cf720743..1a6869a920 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -635,7 +635,10 @@ static void float_invalid_op_div(CPUPPCState *env, int 
flags,
 /* fdiv - fdiv. */
 float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2)
 {
-float64 ret = float64_div(arg1, arg2, &env->fp_status);
+int oe_sub = (FP_OE & env->fpscr) ? 1536 : 0;
+int ue_sum = (FP_UE & env->fpscr) ? 1536 : 0;
+float64 ret = float64_div_excp_en(arg1, arg2, oe_sub, ue_sum,
+  &env->fp_status);
 int flags = get_float_exception_flags(&env->fp_status);
 
 if (unlikely(flags & float_flag_invalid)) {
-- 
2.31.1




[RFC PATCH 2/3] target/ppc: Bugfix fmul result with OE/UE set

2022-08-03 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Change fmul in the same way as fadd/fsub to handle overflow/underflow
when OE/UE is set (i.e. use a function that receives a value to add
to/subtract from the exponent if an overflow/underflow occurs).

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 fpu/softfloat.c | 30 ++
 include/fpu/softfloat.h |  1 +
 target/ppc/fpu_helper.c |  5 -
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index a407129dcb..e2b4ad4b63 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2212,6 +2212,36 @@ float64_mul(float64 a, float64 b, float_status *s)
 f64_is_zon2, f64_addsubmul_post);
 }
 
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_mul_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+ float_status *s)
+{
+FloatParts64 pa, pb, *pr;
+
+float64_unpack_canonical(&pa, a, s);
+float64_unpack_canonical(&pb, b, s);
+pr = parts_mul(&pa, &pb, s);
+
+if (unlikely(oe_sub && (pr->exp > 1023))) {
+pr->exp -= oe_sub;
+float_raise(float_flag_overflow, s);
+} else if (unlikely(ue_sum && (pr->exp < -1022))) {
+pr->exp += ue_sum;
+float_raise(float_flag_underflow, s);
+}
+
+return float64_round_pack_canonical(pr, s);
+}
+
+float64 QEMU_FLATTEN
+float64_mul_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+float_status *status)
+{
+return float64_gen2_excp(a, b, oe_sub, ue_sum, status,
+ hard_f64_mul, soft_f64_mul, soft_f64_mul_excp_en,
+ f64_is_zon2, f64_addsubmul_post);
+}
+
 float64 float64r32_mul(float64 a, float64 b, float_status *status)
 {
 FloatParts64 pa, pb, *pr;
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 76bf628a29..4ff56b0e10 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -776,6 +776,7 @@ float64 float64_add_excp_en(float64, float64, int, int, 
float_status *status);
 float64 float64_sub(float64, float64, float_status *status);
 float64 float64_sub_excp_en(float64, float64, int, int, float_status *status);
 float64 float64_mul(float64, float64, float_status *status);
+float64 float64_mul_excp_en(float64, float64, int, int, float_status *status);
 float64 float64_div(float64, float64, float_status *status);
 float64 float64_rem(float64, float64, float_status *status);
 float64 float64_muladd(float64, float64, float64, int, float_status *status);
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index cb82c91340..18cf720743 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -595,7 +595,10 @@ static void float_invalid_op_mul(CPUPPCState *env, int 
flags,
 /* fmul - fmul. */
 float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2)
 {
-float64 ret = float64_mul(arg1, arg2, &env->fp_status);
+int oe_sub = (FP_OE & env->fpscr) ? 1536 : 0;
+int ue_sum = (FP_UE & env->fpscr) ? 1536 : 0;
+float64 ret = float64_mul_excp_en(arg1, arg2, oe_sub, ue_sum,
+  &env->fp_status);
 int flags = get_float_exception_flags(&env->fp_status);
 
 if (unlikely(flags & float_flag_invalid)) {
-- 
2.31.1




[RFC PATCH 1/3] target/ppc: Bugfix fadd/fsub result with OE/UE set

2022-08-03 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

As noted by the comments in float_overflow_excp and
float_underflow_excp, the result should be adjusted as specified by the
ISA (192/1536 subtracted from the exponent of the intermediate result if
an overflow occurs with OE set, and 192/1536 added to the exponent of
the intermediate result if an underflow occurs with UE set), but by the
time those functions run the result has already been rounded, so it is
no longer possible to adjust the intermediate result.

This patch creates a new function that receives the value that should be
subtracted from/added to the exponent if an overflow/underflow happens,
so as not to leave arbitrary PowerISA-specific numbers in the middle of
the FPU code. If these values are 0 the new functions just call the old
ones.

I used 2 separate values here for overflow and underflow; maybe it'd be
better to just use a single one. Any thoughts?
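
As a sketch of how a caller is expected to use the new entry points
(mirroring what the fmul/fdiv patches in this series do; this is an
illustration, not the literal fpu_helper.c hunk):

    float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2)
    {
        /* 1536 is the double-precision adjustment defined by the ISA */
        int oe_sub = (FP_OE & env->fpscr) ? 1536 : 0;
        int ue_sum = (FP_UE & env->fpscr) ? 1536 : 0;
        float64 ret = float64_add_excp_en(arg1, arg2, oe_sub, ue_sum,
                                          &env->fp_status);
        /* existing invalid-operation/overflow/underflow handling follows */
        return ret;
    }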

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
An alternative I considered was to always return the adjusted value if an
overflow or underflow occurs and then, in float_underflow_excp and
float_overflow_excp, turn it into inf/denormal/0 if OE/UE is 0, but I
didn't see many advantages to that approach.
---
 fpu/softfloat.c | 75 +
 include/fpu/softfloat.h |  2 ++
 target/ppc/fpu_helper.c | 10 --
 3 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 4a871ef2a1..a407129dcb 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -268,6 +268,8 @@ typedef bool (*f64_check_fn)(union_float64 a, union_float64 
b);
 
 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
+typedef float64 (*soft_f64_op2_int2_fn)(float64 a, float64 b, int c, int d,
+float_status *s);
 typedef float   (*hard_f32_op2_fn)(float a, float b);
 typedef double  (*hard_f64_op2_fn)(double a, double b);
 
@@ -401,6 +403,19 @@ float64_gen2(float64 xa, float64 xb, float_status *s,
 return soft(ua.s, ub.s, s);
 }
 
+static inline float64
+float64_gen2_excp(float64 xa, float64 xb, int xc, int xd, float_status *s,
+  hard_f64_op2_fn hard, soft_f64_op2_fn soft,
+  soft_f64_op2_int2_fn soft_excp, f64_check_fn pre,
+  f64_check_fn post)
+{
+if (xc || xd) {
+return soft_excp(xa, xb, xc, xd, s);
+} else {
+return float64_gen2(xa, xb, s, hard, soft, pre, post);
+}
+}
+
 /*
  * Classify a floating point number. Everything above float_class_qnan
  * is a NaN so cls >= float_class_qnan is any NaN.
@@ -1929,6 +1944,39 @@ static double hard_f64_sub(double a, double b)
 return a - b;
 }
 
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_addsub_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+float_status *status, bool subtract)
+{
+FloatParts64 pa, pb, *pr;
+
+float64_unpack_canonical(&pa, a, status);
+float64_unpack_canonical(&pb, b, status);
+pr = parts_addsub(&pa, &pb, status, subtract);
+
+if (unlikely(oe_sub && (pr->exp > 1023))) {
+pr->exp -= oe_sub;
+float_raise(float_flag_overflow, status);
+} else if (unlikely(ue_sum && (pr->exp < -1022))) {
+pr->exp += ue_sum;
+float_raise(float_flag_underflow, status);
+}
+
+return float64_round_pack_canonical(pr, status);
+}
+
+static float64 soft_f64_add_excp_en(float64 a, float64 b, int oe_sub,
+int ue_sum, float_status *status)
+{
+return soft_f64_addsub_excp_en(a, b, oe_sub, ue_sum, status, false);
+}
+
+static float64 soft_f64_sub_excp_en(float64 a, float64 b, int oe_sub,
+int ue_sum, float_status *status)
+{
+return soft_f64_addsub_excp_en(a, b, oe_sub, ue_sum, status, true);
+}
+
 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
 {
 if (QEMU_HARDFLOAT_2F32_USE_FP) {
@@ -1960,6 +2008,15 @@ static float64 float64_addsub(float64 a, float64 b, 
float_status *s,
 f64_is_zon2, f64_addsubmul_post);
 }
 
+static float64 float64_addsub_excp_en(float64 a, float64 b, int oe_sum,
+  int ue_sub, float_status *s,
+  hard_f64_op2_fn hard, soft_f64_op2_fn 
soft,
+  soft_f64_op2_int2_fn soft_excp)
+{
+return float64_gen2_excp(a, b, oe_sum, ue_sub, s, hard, soft, soft_excp,
+ f64_is_zon2, f64_addsubmul_post);
+}
+
 float32 QEMU_FLATTEN
 float32_add(float32 a, float32 b, float_status *s)
 {
@@ -1984,6 +2041,24 @@ float64_sub(float64 a, float64 b, float_status *s)
 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
 }
 
+float64 QEMU_FLATTEN
+float64_add_excp_en(float64 a, float64 b, int oe_sub, int ue_sum,
+

[RFC PATCH 8/8] tests/docker: Selective line reading by python script

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Building some images failed on ppc64le because the dockerfile tried to
install some packages that are only available on x86 and arm64. To solve
this while still keeping those packages available on those
architectures, a comment was put before the installation command to
instruct the python script to ignore those lines on some architectures
(in this case ppc64le).

Overall I'm not a big fan of the way I solved this problem, so I'd like
to know if anyone has a better way to make these dockerfiles work on
PPC64LE.

For context, the base images used here are available on PPC64LE but some
of the packages installed are not (in alpine's case it's Xen, which is
only available on x86 and ARM), so this patch creates an ignore_list
that is set on a per-architecture basis, and any package in a dockerfile
that is in this ignore_list will not be copied to the temporary
dockerfile used in the docker command.
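
For example, with the '# ignore ppc64le,ppc64 xen-dev' comment added to
alpine.docker below, _dockerfile_preprocess() drops any later line that
contains the xen-dev token when the build host is ppc64/ppc64le, while
leaving the package list untouched for every other architecture.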

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 tests/docker/docker.py | 15 ---
 tests/docker/dockerfiles/alpine.docker |  2 ++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/docker/docker.py b/tests/docker/docker.py
index d0af2861b8..9b962d1c78 100755
--- a/tests/docker/docker.py
+++ b/tests/docker/docker.py
@@ -14,6 +14,7 @@
 import os
 import sys
 import subprocess
+import platform
 import json
 import hashlib
 import atexit
@@ -207,8 +208,15 @@ def _read_qemu_dockerfile(img_name):
 
 def _dockerfile_preprocess(df):
 out = ""
+ignore_list = []
 for l in df.splitlines():
-if len(l.strip()) == 0 or l.startswith("#"):
+if len(l.strip()) == 0:
+continue
+if l.startswith("#"):
+if len(l.split()) >= 3:
+if l.split()[1] == "ignore":
+if platform.processor() in l.split()[2].split(','):
+ignore_list += l.split()[3].split(',')
 continue
 from_pref = "FROM qemu/"
 if l.startswith(from_pref):
@@ -219,7 +227,8 @@ def _dockerfile_preprocess(df):
 inlining = _read_qemu_dockerfile(l[len(from_pref):])
 out += _dockerfile_preprocess(inlining)
 continue
-out += l + "\n"
+if not any(x in l.split() for x in ignore_list):
+out += l + "\n"
 return out
 
 
@@ -330,7 +339,7 @@ def build_image(self, tag, docker_dir, dockerfile,
 tmp_df = tempfile.NamedTemporaryFile(mode="w+t",
  encoding='utf-8',
  dir=docker_dir, suffix=".docker")
-tmp_df.write(dockerfile)
+tmp_df.write(_dockerfile_preprocess(dockerfile))
 
 if user:
 uid = os.getuid()
diff --git a/tests/docker/dockerfiles/alpine.docker 
b/tests/docker/dockerfiles/alpine.docker
index 2943a99730..5cec46d8f2 100644
--- a/tests/docker/dockerfiles/alpine.docker
+++ b/tests/docker/dockerfiles/alpine.docker
@@ -6,6 +6,8 @@
 
 FROM docker.io/library/alpine:edge
 
+# Lines to be ignored when this file is read by the python script
+# ignore ppc64le,ppc64 xen-dev
 RUN apk update && \
 apk upgrade && \
 apk add \
-- 
2.25.1




[PATCH 6/8] scripts/ci/setup: Add Fedora to build-environment.yml

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Minicloud doesn't have a RHEL image, but it does have Fedora 34 and 35
images, and both use DNF as their package manager, so just change the
ansible facts check to accept either RHEL or Fedora.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/build-environment.yml | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 43cf8c759f..a7d53d0f70 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -165,8 +165,10 @@
   - zlib-devel
 state: present
   when:
-- ansible_facts['distribution_file_variety'] == 'RedHat'
-- ansible_facts['distribution_version'] == '8'
+- |
+   (ansible_facts['distribution'] == 'RedHat' and
+ansible_facts['distribution_version'] == '8') or
+ansible_facts['distribution'] == 'Fedora'
 
 - name: Install packages only available on x86 and aarch64
   dnf:
@@ -175,6 +177,8 @@
   - spice-server
 state: present
   when:
-- ansible_facts['distribution_file_variety'] == 'RedHat'
-- ansible_facts['distribution_version'] == '8'
+- |
+   (ansible_facts['distribution'] == 'RedHat' and
+ansible_facts['distribution_version'] == '8') or
+ansible_facts['distribution'] == 'Fedora'
 - ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
-- 
2.25.1




[PATCH 2/8] scripts/ci/setup: ninja missing from build-environment

2022-07-27 Thread Lucas Mateus Castro(alqotel)
ninja-build is missing from the RHEL environment, so a system prepared
with that script would still fail to compile QEMU.
Tested on Fedora 36.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/build-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 232525b91d..6df3e61d94 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -150,6 +150,7 @@
   - libepoxy-devel
   - libgcrypt-devel
   - lzo-devel
+  - ninja-build
   - make
   - mesa-libEGL-devel
   - nettle-devel
-- 
2.25.1




[PATCH 3/8] scripts/ci/setup: Fix libxen requirements

2022-07-27 Thread Lucas Mateus Castro(alqotel)
The Xen hypervisor is only available on ARM and x86, but the yaml only
checked that the architecture is different from s390x; change it to
a more accurate test.
Tested this change on Ubuntu 20.04 ppc64le.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/build-environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 6df3e61d94..7535228685 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -97,7 +97,7 @@
 state: present
   when:
 - ansible_facts['distribution'] == 'Ubuntu'
-- ansible_facts['architecture'] != 's390x'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
 
 - name: Install basic packages to build QEMU on Ubuntu 20.04
   package:
-- 
2.25.1




[PATCH 7/8] scripts/ci/setup: Added debian to build-environment.yml

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Minicloud has a PPC64 BE Debian11 image which can be used for the CI,
so add Debian to the build-environment.yml so it can be configured with
ansible-playbook.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/build-environment.yml | 31 +-
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index a7d53d0f70..b5d415496f 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -31,9 +31,11 @@
 update_cache: yes
 upgrade: yes
   when:
-- ansible_facts['distribution'] == 'Ubuntu'
+- |
+ansible_facts['distribution'] == 'Ubuntu' or
+ansible_facts['distribution'] == 'Debian'
 
-- name: Install basic packages to build QEMU on Ubuntu 20.04
+- name: Install basic packages to build QEMU on Ubuntu 20.04 or Debian11
   package:
 name:
   - ccache
@@ -56,7 +58,6 @@
   - libibverbs-dev
   - libiscsi-dev
   - libjemalloc-dev
-  - libjpeg-turbo8-dev
   - liblzo2-dev
   - libncurses5-dev
   - libncursesw5-dev
@@ -86,17 +87,37 @@
   - sparse
   - xfslibs-dev
 state: present
+  when:
+- |
+ansible_facts['distribution'] == 'Ubuntu' or
+ansible_facts['distribution'] == 'Debian'
+
+- name: Install Ubuntu exclusive packages to build QEMU
+  package:
+name:
+  - libjpeg-turbo8-dev
+state: present
   when:
 - ansible_facts['distribution'] == 'Ubuntu'
 
-- name: Install packages to build QEMU on Ubuntu 20.04 on non-s390x
+- name: Install Debian exclusive packages to build QEMU
+  package:
+name:
+  - libjpeg62-turbo-dev
+state: present
+  when:
+- ansible_facts['distribution'] == 'Debian'
+
+- name: Install packages to build QEMU on Ubuntu 20.04 or Debian11 on 
non-s390x
   package:
 name:
   - libspice-server-dev
   - libxen-dev
 state: present
   when:
-- ansible_facts['distribution'] == 'Ubuntu'
+- |
+ansible_facts['distribution'] == 'Ubuntu' or
+ansible_facts['distribution'] == 'Debian'
 - ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
 
 - name: Install basic packages to build QEMU on Ubuntu 20.04
-- 
2.25.1




[PATCH 5/8] scripts/ci/setup: Add ppc64le to vars.yml template

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Added ppc64le so that the gitlab-runner.yml could be used to set up
ppc64le runners.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/vars.yml.template | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/ci/setup/vars.yml.template 
b/scripts/ci/setup/vars.yml.template
index e48089761f..2c84698b87 100644
--- a/scripts/ci/setup/vars.yml.template
+++ b/scripts/ci/setup/vars.yml.template
@@ -8,5 +8,6 @@ ansible_to_gitlab_arch:
   x86_64: amd64
   aarch64: arm64
   s390x: s390x
+  ppc64le: ppc64le
 # A unique token made available by GitLab to your project for registering 
runners
 gitlab_runner_registration_token: PLEASE_PROVIDE_A_VALID_TOKEN
-- 
2.25.1




[PATCH 1/8] tests/docker: Fix alpine dockerfile

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Currently the run script uses 'readlink -e' but the image only has the
busybox readlink, so this commit adds the coreutils package, which
contains a readlink with the '-e' option.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 tests/docker/dockerfiles/alpine.docker | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/docker/dockerfiles/alpine.docker 
b/tests/docker/dockerfiles/alpine.docker
index 3f4c0f95cb..2943a99730 100644
--- a/tests/docker/dockerfiles/alpine.docker
+++ b/tests/docker/dockerfiles/alpine.docker
@@ -21,6 +21,7 @@ RUN apk update && \
 cdrkit \
 ceph-dev \
 clang \
+coreutils \
 ctags \
 curl-dev \
 cyrus-sasl-dev \
-- 
2.25.1




[PATCH 4/8] scripts/ci/setup: spice-server only on x86 aarch64

2022-07-27 Thread Lucas Mateus Castro(alqotel)
Changed build-environment.yml to only install spice-server on x86_64 and
aarch64 as this package is only available on those architectures.

Signed-off-by: Lucas Mateus Castro(alqotel) 
---
 scripts/ci/setup/build-environment.yml | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/scripts/ci/setup/build-environment.yml 
b/scripts/ci/setup/build-environment.yml
index 7535228685..43cf8c759f 100644
--- a/scripts/ci/setup/build-environment.yml
+++ b/scripts/ci/setup/build-environment.yml
@@ -160,7 +160,6 @@
   - python36
   - rdma-core-devel
   - spice-glib-devel
-  - spice-server
   - systemtap-sdt-devel
   - tar
   - zlib-devel
@@ -168,3 +167,14 @@
   when:
 - ansible_facts['distribution_file_variety'] == 'RedHat'
 - ansible_facts['distribution_version'] == '8'
+
+- name: Install packages only available on x86 and aarch64
+  dnf:
+# Spice server not available in ppc64le
+name:
+  - spice-server
+state: present
+  when:
+- ansible_facts['distribution_file_variety'] == 'RedHat'
+- ansible_facts['distribution_version'] == '8'
+- ansible_facts['architecture'] == 'aarch64' or 
ansible_facts['architecture'] == 'x86_64'
-- 
2.25.1




[PATCH 0/8] Patch series to set up a ppc64le CI

2022-07-27 Thread Lucas Mateus Castro(alqotel)
This patch series aims to make it easier to set up a compilation and CI
environment on PPC64 and PPC64LE machines.

The first 2 patches are fixes not related to ppc64.
Patches 3 and 4 also affect some other architectures.
Patches 5 to 7 add Power-specific changes.

Patch 8 is an RFC for a way to run the docker tests on PPC64LE.

Lucas Mateus Castro(alqotel) (8):
  tests/docker: Fix alpine dockerfile
  scripts/ci/setup: ninja missing from build-environment
  scripts/ci/setup: Fix libxen requirements
  scripts/ci/setup: spice-server only on x86 aarch64
  scripts/ci/setup: Add ppc64le to vars.yml template
  scripts/ci/setup: Add Fedora to build-environment.yml
  scripts/ci/setup: Added debian to build-environment.yml
  tests/docker: Selective line reading by python script

 scripts/ci/setup/build-environment.yml | 54 +-
 scripts/ci/setup/vars.yml.template |  1 +
 tests/docker/docker.py | 15 +--
 tests/docker/dockerfiles/alpine.docker |  3 ++
 4 files changed, 61 insertions(+), 12 deletions(-)

-- 
2.25.1




[RFC PATCH RESEND] scripts/checkpatch.pl: Change line limit warning

2022-06-06 Thread Lucas Mateus Castro(alqotel)
The QEMU documentation mentions that lines should be up to 80
characters and that the checkpatch script will warn at 100 characters,
but the script actually warns at 80 characters and throws an error at
90, so this commit changes it to warn at 100.

As to why extend the limit: the argument that resulted in the change to
the docs was that trying to always wrap at 80 columns can result in less
readable code, so sometimes not wrapping is the better choice, and in
those circumstances checkpatch could nudge people into creating less
readable code.

A 132-character error limit is added to catch overly long lines.

Based-on: 20201105154208.12442-1-ganqi...@huawei.com
Signed-off-by: Lucas Mateus Castro(alqotel) 
---
Currently there's a disagreement between the checkpatch code and the
documentation; this RFC just changes checkpatch to match the
documentation.
But there was a discussion in 2020 about the best way to deal with this,
and some alternatives mentioned were: change the warning to remind
people not to blindly wrap just because of the warning, or change it to
warn at 90 columns (which would mean changing the column limit for the
error as well). If any of those are preferred I'll send a new version
updating the documentation as well as changing checkpatch.pl to the
preferred behavior.
---
 scripts/checkpatch.pl | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index d900d18048..2c2d7b31ab 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1639,12 +1639,12 @@ sub process {
if ($line =~ /^\+/ &&
!($line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) &&
!($rawline =~ /^[^[:alnum:]]*https?:\S*$/) &&
-   $length > 80)
+   $length > 100)
{
-   if ($length > 90) {
-   ERROR("line over 90 characters\n" . $herecurr);
+   if ($length > 132) {
+   ERROR("line over 132 characters\n" . $herecurr);
} else {
-   WARN("line over 80 characters\n" . $herecurr);
+   WARN("line over 100 characters\n" . $herecurr);
}
}
 
@@ -1838,13 +1838,8 @@ sub process {
#print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n";
#print 
"pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n";
 
-   # The length of the "previous line" is checked against 
80 because it
-   # includes the + at the beginning of the line (if the 
actual line has
-   # 79 or 80 characters, it is no longer possible to add 
a space and an
-   # opening brace there)
if ($#ctx == 0 && $ctx !~ /{\s*/ &&
-   defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] 
=~ /^\+\s*\{/ &&
-   defined($lines[$ctx_ln - 2]) && 
length($lines[$ctx_ln - 2]) < 80) {
+   defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] 
=~ /^\+\s*\{/) {
ERROR("that open brace { should be on the 
previous line\n" .
"$here\n$ctx\n$rawlines[$ctx_ln - 
1]\n");
}
-- 
2.25.1




[PATCH RESEND v3 8/8] target/ppc: Implemented vector module quadword

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vmodsq: Vector Modulo Signed Quadword
vmoduq: Vector Modulo Unsigned Quadword

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/744
---
 target/ppc/helper.h |  2 ++
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c | 21 +
 target/ppc/translate/vmx-impl.c.inc |  2 ++
 4 files changed, 27 insertions(+)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index e7624300df..d627cfe6ed 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -181,6 +181,8 @@ DEF_HELPER_FLAGS_3(VDIVESD, TCG_CALL_NO_RWG, void, avr, 
avr, avr)
 DEF_HELPER_FLAGS_3(VDIVEUD, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VDIVESQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VDIVEUQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VMODSQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VMODUQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 75fa206b39..6ea48d5163 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -807,3 +807,5 @@ VMODSW  000100 . . . 0001011@VX
 VMODUW  000100 . . . 11010001011@VX
 VMODSD  000100 . . . 1001011@VX
 VMODUD  000100 . . . 11011001011@VX
+VMODSQ  000100 . . . 1111011@VX
+VMODUQ  000100 . . . 1101011@VX
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 42f0dcfc52..16357c0900 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1247,6 +1247,27 @@ void helper_VDIVEUQ(ppc_avr_t *t, ppc_avr_t *a, 
ppc_avr_t *b)
 }
 }
 
+void helper_VMODSQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+Int128 neg1 = int128_makes64(-1);
+Int128 int128_min = int128_make128(0, INT64_MIN);
+if (likely(int128_nz(b->s128) &&
+  (int128_ne(a->s128, int128_min) || int128_ne(b->s128, neg1)))) {
+t->s128 = int128_rems(a->s128, b->s128);
+} else {
+t->s128 = int128_zero(); /* Undefined behavior */
+}
+}
+
+void helper_VMODUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+if (likely(int128_nz(b->s128))) {
+t->s128 = int128_remu(a->s128, b->s128);
+} else {
+t->s128 = int128_zero(); /* Undefined behavior */
+}
+}
+
 void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
 {
 ppc_avr_t result;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 78277fb018..0b563bed37 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3381,6 +3381,8 @@ TRANS_FLAGS2(ISA310, VMODSW, do_vdiv_vmod, MO_32, 
do_modsw , NULL)
 TRANS_FLAGS2(ISA310, VMODUW, do_vdiv_vmod, MO_32, do_moduw, NULL)
 TRANS_FLAGS2(ISA310, VMODSD, do_vdiv_vmod, MO_64, NULL, do_modsd)
 TRANS_FLAGS2(ISA310, VMODUD, do_vdiv_vmod, MO_64, NULL, do_modud)
+TRANS_FLAGS2(ISA310, VMODSQ, do_vx_helper, gen_helper_VMODSQ)
+TRANS_FLAGS2(ISA310, VMODUQ, do_vx_helper, gen_helper_VMODUQ)
 
 #undef DIVS32
 #undef DIVU32
-- 
2.31.1




[PATCH RESEND v3 4/8] host-utils: Implemented unsigned 256-by-128 division

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Based on an already existing QEMU implementation, create an unsigned
256-bit by 128-bit division, needed to implement the vector divide
extended unsigned instruction from PowerISA 3.1.
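
A rough sketch of the intended use (the vector divide extended quadword
computes (VRA << 128) / VRB); this is only an illustration of the
divu256 contract, not the exact helper added later in the series:

    void helper_VDIVEUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
    {
        Int128 high = a->s128;       /* dividend is a << 128, ...   */
        Int128 low = int128_zero();  /* ... so the low half is zero */

        if (int128_nz(b->s128) && int128_ult(high, b->s128)) {
            /* quotient comes back through low/high, remainder is returned */
            divu256(&low, &high, b->s128);
            t->s128 = low;
        } else {
            t->s128 = int128_zero(); /* quotient does not fit: undefined */
        }
    }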

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
This patch had received a Reviewed-by from Richard Henderson pending the
placement of clz128 being moved to int128.h, but clz128 ended up being
changed to accommodate int128.h (i.e. the lack of clz64 there), so as a
precaution I'd like to request a review of the clz128 implementation.
---
 include/qemu/host-utils.h |   2 +
 include/qemu/int128.h |  38 +++
 util/host-utils.c | 129 ++
 3 files changed, 169 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index f19bd29105..9767af7573 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -32,6 +32,7 @@
 
 #include "qemu/compiler.h"
 #include "qemu/bswap.h"
+#include "qemu/int128.h"
 
 #ifdef CONFIG_INT128
 static inline void mulu64(uint64_t *plow, uint64_t *phigh,
@@ -849,4 +850,5 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 #endif
 }
 
+Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor);
 #endif
diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index ef71f56e3f..d2b76ca6ac 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -128,11 +128,21 @@ static inline bool int128_ge(Int128 a, Int128 b)
 return a >= b;
 }
 
+static inline bool int128_uge(Int128 a, Int128 b)
+{
+return ((__uint128_t)a) >= ((__uint128_t)b);
+}
+
 static inline bool int128_lt(Int128 a, Int128 b)
 {
 return a < b;
 }
 
+static inline bool int128_ult(Int128 a, Int128 b)
+{
+return (__uint128_t)a < (__uint128_t)b;
+}
+
 static inline bool int128_le(Int128 a, Int128 b)
 {
 return a <= b;
@@ -177,6 +187,15 @@ static inline Int128 bswap128(Int128 a)
 #endif
 }
 
+static inline int clz128(Int128 a)
+{
+if (a >> 64) {
+return __builtin_clzll(a >> 64);
+} else {
+return (a) ? __builtin_clzll((uint64_t)a) + 64 : 128;
+}
+}
+
 static inline Int128 int128_divu(Int128 a, Int128 b)
 {
 return (__uint128_t)a / (__uint128_t)b;
@@ -373,11 +392,21 @@ static inline bool int128_ge(Int128 a, Int128 b)
 return a.hi > b.hi || (a.hi == b.hi && a.lo >= b.lo);
 }
 
+static inline bool int128_uge(Int128 a, Int128 b)
+{
+return (uint64_t)a.hi > (uint64_t)b.hi || (a.hi == b.hi && a.lo >= b.lo);
+}
+
 static inline bool int128_lt(Int128 a, Int128 b)
 {
 return !int128_ge(a, b);
 }
 
+static inline bool int128_ult(Int128 a, Int128 b)
+{
+return !int128_uge(a, b);
+}
+
 static inline bool int128_le(Int128 a, Int128 b)
 {
 return int128_ge(b, a);
@@ -418,6 +447,15 @@ static inline Int128 bswap128(Int128 a)
 return int128_make128(bswap64(a.hi), bswap64(a.lo));
 }
 
+static inline int clz128(Int128 a)
+{
+if (a.hi) {
+return __builtin_clzll(a.hi);
+} else {
+return (a.lo) ? __builtin_clzll(a.lo) + 64 : 128;
+}
+}
+
 Int128 int128_divu(Int128, Int128);
 Int128 int128_remu(Int128, Int128);
 Int128 int128_divs(Int128, Int128);
diff --git a/util/host-utils.c b/util/host-utils.c
index 96d5dc0bed..93dfb1b6ab 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -266,3 +266,132 @@ void ulshift(uint64_t *plow, uint64_t *phigh, int32_t 
shift, bool *overflow)
 *plow = *plow << shift;
 }
 }
+
+/*
+ * Unsigned 256-by-128 division.
+ * Returns the remainder via r.
+ * Returns lower 128 bit of quotient.
+ * Needs a normalized divisor (most significant bit set to 1).
+ *
+ * Adapted from include/qemu/host-utils.h udiv_qrnnd,
+ * from the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static Int128 udiv256_qrnnd(Int128 *r, Int128 n1, Int128 n0, Int128 d)
+{
+Int128 d0, d1, q0, q1, r1, r0, m;
+uint64_t mp0, mp1;
+
+d0 = int128_make64(int128_getlo(d));
+d1 = int128_make64(int128_gethi(d));
+
+r1 = int128_remu(n1, d1);
+q1 = int128_divu(n1, d1);
+mp0 = int128_getlo(q1);
+mp1 = int128_gethi(q1);
+mulu128(&mp0, &mp1, int128_getlo(d0));
+m = int128_make128(mp0, mp1);
+r1 = int128_make128(int128_gethi(n0), int128_getlo(r1));
+if (int128_ult(r1, m)) {
+q1 = int128_sub(q1, int128_one());
+r1 = int128_add(r1, d);
+if (int128_uge(r1, d)) {
+if (int128_ult(r1, m)) {
+q1 = int128_sub(q1, int128_one());
+r1 = int128_add(r1, d);
+}
+}
+}
+r1 = int128_sub(r1, m);
+
+r0 = int128_remu(r1, d1);
+q0 = int128_divu(r1, d1);
+mp0 = int128_getlo(q0);
+mp1 = int128_gethi(q0);
+mulu128(&mp0, &mp1, int128_getlo(d0));
+m = int128_make128(mp0, mp1);
+r0 = int128_m

[PATCH RESEND v3 3/8] target/ppc: Implemented vector divide extended word

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vdivesw: Vector Divide Extended Signed Word
vdiveuw: Vector Divide Extended Unsigned Word
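
The "extended" divide uses (VRA || 0x00000000), i.e. a * 2^32, as the
dividend, which is what the (a << 32) / b sequence on a 64-bit temporary
implements below. As a quick sanity check (my own example): vdiveuw with
a = 1 and b = 2 yields (1 << 32) / 2 = 0x80000000; if the quotient does
not fit in 32 bits the result is undefined, as noted in the comments.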

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|  3 ++
 target/ppc/translate/vmx-impl.c.inc | 48 +
 2 files changed, 51 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 01bfde8c5e..f6d2d4b257 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -795,3 +795,6 @@ VDIVSD  000100 . . . 00111001011@VX
 VDIVUD  000100 . . . 00011001011@VX
 VDIVSQ  000100 . . . 0011011@VX
 VDIVUQ  000100 . . . 0001011@VX
+
+VDIVESW 000100 . . . 01110001011@VX
+VDIVEUW 000100 . . . 01010001011@VX
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 22572e6a79..8c542bcb29 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3320,6 +3320,54 @@ TRANS_FLAGS2(ISA310, VDIVUD, do_vdiv_vmod, MO_64, NULL, 
do_divud)
 TRANS_FLAGS2(ISA310, VDIVSQ, do_vx_helper, gen_helper_VDIVSQ)
 TRANS_FLAGS2(ISA310, VDIVUQ, do_vx_helper, gen_helper_VDIVUQ)
 
+static void do_dives_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
+{
+TCGv_i64 val1, val2;
+
+val1 = tcg_temp_new_i64();
+val2 = tcg_temp_new_i64();
+
+tcg_gen_ext_i32_i64(val1, a);
+tcg_gen_ext_i32_i64(val2, b);
+
+/* (a << 32)/b */
+tcg_gen_shli_i64(val1, val1, 32);
+tcg_gen_div_i64(val1, val1, val2);
+
+/* if quotient doesn't fit in 32 bits the result is undefined */
+tcg_gen_extrl_i64_i32(t, val1);
+
+tcg_temp_free_i64(val1);
+tcg_temp_free_i64(val2);
+}
+
+static void do_diveu_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
+{
+TCGv_i64 val1, val2;
+
+val1 = tcg_temp_new_i64();
+val2 = tcg_temp_new_i64();
+
+tcg_gen_extu_i32_i64(val1, a);
+tcg_gen_extu_i32_i64(val2, b);
+
+/* (a << 32)/b */
+tcg_gen_shli_i64(val1, val1, 32);
+tcg_gen_divu_i64(val1, val1, val2);
+
+/* if quotient doesn't fit in 32 bits the result is undefined */
+tcg_gen_extrl_i64_i32(t, val1);
+
+tcg_temp_free_i64(val1);
+tcg_temp_free_i64(val2);
+}
+
+DIVS32(do_divesw, do_dives_i32)
+DIVU32(do_diveuw, do_diveu_i32)
+
+TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL)
+TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL)
+
 #undef DIVS32
 #undef DIVU32
 #undef DIVS64
-- 
2.31.1




[PATCH RESEND v3 7/8] target/ppc: Implemented vector module word/doubleword

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vmodsw: Vector Modulo Signed Word
vmoduw: Vector Modulo Unsigned Word
vmodsd: Vector Modulo Signed Doubleword
vmodud: Vector Modulo Unsigned Doubleword

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|  5 +
 target/ppc/translate/vmx-impl.c.inc | 10 ++
 2 files changed, 15 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 5b2d7824a0..75fa206b39 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -802,3 +802,8 @@ VDIVESD 000100 . . . 0001011@VX
 VDIVEUD 000100 . . . 01011001011@VX
 VDIVESQ 000100 . . . 0111011@VX
 VDIVEUQ 000100 . . . 0101011@VX
+
+VMODSW  000100 . . . 0001011@VX
+VMODUW  000100 . . . 11010001011@VX
+VMODSD  000100 . . . 1001011@VX
+VMODUD  000100 . . . 11011001011@VX
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index f00aa64bf9..78277fb018 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3365,6 +3365,11 @@ static void do_diveu_i32(TCGv_i32 t, TCGv_i32 a, 
TCGv_i32 b)
 DIVS32(do_divesw, do_dives_i32)
 DIVU32(do_diveuw, do_diveu_i32)
 
+DIVS32(do_modsw, tcg_gen_rem_i32)
+DIVU32(do_moduw, tcg_gen_remu_i32)
+DIVS64(do_modsd, tcg_gen_rem_i64)
+DIVU64(do_modud, tcg_gen_remu_i64)
+
 TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL)
 TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL)
 TRANS_FLAGS2(ISA310, VDIVESD, do_vx_helper, gen_helper_VDIVESD)
@@ -3372,6 +3377,11 @@ TRANS_FLAGS2(ISA310, VDIVEUD, do_vx_helper, 
gen_helper_VDIVEUD)
 TRANS_FLAGS2(ISA310, VDIVESQ, do_vx_helper, gen_helper_VDIVESQ)
 TRANS_FLAGS2(ISA310, VDIVEUQ, do_vx_helper, gen_helper_VDIVEUQ)
 
+TRANS_FLAGS2(ISA310, VMODSW, do_vdiv_vmod, MO_32, do_modsw , NULL)
+TRANS_FLAGS2(ISA310, VMODUW, do_vdiv_vmod, MO_32, do_moduw, NULL)
+TRANS_FLAGS2(ISA310, VMODSD, do_vdiv_vmod, MO_64, NULL, do_modsd)
+TRANS_FLAGS2(ISA310, VMODUD, do_vdiv_vmod, MO_64, NULL, do_modud)
+
 #undef DIVS32
 #undef DIVU32
 #undef DIVS64
-- 
2.31.1




[PATCH RESEND v3 5/8] host-utils: Implemented signed 256-by-128 division

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Based on an already existing QEMU implementation, create a signed
256-bit by 128-bit division, needed to implement the vector divide
extended signed quadword instruction from PowerISA 3.1.
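
One note on the sign handling, since it is easy to misread: negating a
256-bit two's complement value split into two 128-bit halves follows
-(hi, lo) = (~hi, -lo) when lo != 0, and (-hi, 0) when lo == 0, which is
exactly what the int128_not/int128_neg pairs below implement before and
after the unsigned division.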

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 include/qemu/host-utils.h |  1 +
 util/host-utils.c | 51 +++
 2 files changed, 52 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index 9767af7573..bc743f5e32 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -851,4 +851,5 @@ static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 }
 
 Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor);
+Int128 divs256(Int128 *plow, Int128 *phigh, Int128 divisor);
 #endif
diff --git a/util/host-utils.c b/util/host-utils.c
index 93dfb1b6ab..fb91bcba82 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -395,3 +395,54 @@ Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor)
 return rem;
 }
 }
+
+/*
+ * Signed 256-by-128 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
+ */
+Int128 divs256(Int128 *plow, Int128 *phigh, Int128 divisor)
+{
+bool neg_quotient = false, neg_remainder = false;
+Int128 unsig_hi = *phigh, unsig_lo = *plow;
+Int128 rem;
+
+if (!int128_nonneg(*phigh)) {
+neg_quotient = !neg_quotient;
+neg_remainder = !neg_remainder;
+
+if (!int128_nz(unsig_lo)) {
+unsig_hi = int128_neg(unsig_hi);
+} else {
+unsig_hi = int128_not(unsig_hi);
+unsig_lo = int128_neg(unsig_lo);
+}
+}
+
+if (!int128_nonneg(divisor)) {
+neg_quotient = !neg_quotient;
+
+divisor = int128_neg(divisor);
+}
+
+rem = divu256(&unsig_lo, &unsig_hi, divisor);
+
+if (neg_quotient) {
+if (!int128_nz(unsig_lo)) {
+*phigh = int128_neg(unsig_hi);
+*plow = int128_zero();
+} else {
+*phigh = int128_not(unsig_hi);
+*plow = int128_neg(unsig_lo);
+}
+} else {
+*phigh = unsig_hi;
+*plow = unsig_lo;
+}
+
+if (neg_remainder) {
+return int128_neg(rem);
+} else {
+return rem;
+}
+}
-- 
2.31.1




[PATCH RESEND v3 1/8] target/ppc: Implemented vector divide instructions

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vdivsw: Vector Divide Signed Word
vdivuw: Vector Divide Unsigned Word
vdivsd: Vector Divide Signed Doubleword
vdivud: Vector Divide Unsigned Doubleword
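
One note on the DIVU32/DIVS32 (and 64-bit) macros below: the PowerISA
leaves the quotient undefined for division by zero and, for the signed
variants, for INT_MIN / -1, but the TCG division op must not be handed
those operands, so the setcond/movcond sequences replace such a divisor
with 1 before dividing and simply let the undefined cases return a / 1.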

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/insn32.decode|  7 +++
 target/ppc/translate/vmx-impl.c.inc | 85 +
 2 files changed, 92 insertions(+)

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 18a94fa3b5..6df405e398 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -786,3 +786,10 @@ XVF64GERPP  111011 ... --  0 . 00111010 ..-  
@XX3_at xa=%xx_xa_pair
 XVF64GERPN  111011 ... --  0 . 10111010 ..-  @XX3_at xa=%xx_xa_pair
 XVF64GERNP  111011 ... --  0 . 0010 ..-  @XX3_at xa=%xx_xa_pair
 XVF64GERNN  111011 ... --  0 . 1010 ..-  @XX3_at xa=%xx_xa_pair
+
+## Vector Division Instructions
+
+VDIVSW  000100 . . . 00110001011@VX
+VDIVUW  000100 . . . 00010001011@VX
+VDIVSD  000100 . . . 00111001011@VX
+VDIVUD  000100 . . . 00011001011@VX
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index d7524c3204..4c0b1a32ec 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3238,6 +3238,91 @@ TRANS(VMULHSD, do_vx_mulh, true , do_vx_vmulhd_i64)
 TRANS(VMULHUW, do_vx_mulh, false, do_vx_vmulhw_i64)
 TRANS(VMULHUD, do_vx_mulh, false, do_vx_vmulhd_i64)
 
+static bool do_vdiv_vmod(DisasContext *ctx, arg_VX *a, const int vece,
+ void (*func_32)(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b),
+ void (*func_64)(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b))
+{
+const GVecGen3 op = {
+.fni4 = func_32,
+.fni8 = func_64,
+.vece = vece
+};
+
+REQUIRE_VECTOR(ctx);
+
+tcg_gen_gvec_3(avr_full_offset(a->vrt), avr_full_offset(a->vra),
+   avr_full_offset(a->vrb), 16, 16, &op);
+
+return true;
+}
+
+#define DIVU32(NAME, DIV)   \
+static void NAME(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)\
+{   \
+TCGv_i32 zero = tcg_constant_i32(0);\
+TCGv_i32 one = tcg_constant_i32(1); \
+tcg_gen_movcond_i32(TCG_COND_EQ, b, b, zero, one, b);   \
+DIV(t, a, b);   \
+}
+
+#define DIVS32(NAME, DIV)   \
+static void NAME(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)\
+{   \
+TCGv_i32 t0 = tcg_temp_new_i32();   \
+TCGv_i32 t1 = tcg_temp_new_i32();   \
+tcg_gen_setcondi_i32(TCG_COND_EQ, t0, a, INT32_MIN);\
+tcg_gen_setcondi_i32(TCG_COND_EQ, t1, b, -1);   \
+tcg_gen_and_i32(t0, t0, t1);\
+tcg_gen_setcondi_i32(TCG_COND_EQ, t1, b, 0);\
+tcg_gen_or_i32(t0, t0, t1); \
+tcg_gen_movi_i32(t1, 0);\
+tcg_gen_movcond_i32(TCG_COND_NE, b, t0, t1, t0, b); \
+DIV(t, a, b);   \
+tcg_temp_free_i32(t0);  \
+tcg_temp_free_i32(t1);  \
+}
+
+#define DIVU64(NAME, DIV)   \
+static void NAME(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)\
+{   \
+TCGv_i64 zero = tcg_constant_i64(0);\
+TCGv_i64 one = tcg_constant_i64(1); \
+tcg_gen_movcond_i64(TCG_COND_EQ, b, b, zero, one, b);   \
+DIV(t, a, b);   \
+}
+
+#define DIVS64(NAME, DIV)   \
+static void NAME(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b)\
+{   \
+TCGv_i64 t0 = tcg_temp_new_i64();   \
+TCGv_i64 t1 = tcg_temp_new_i64();   \
+tcg_gen_setcondi_i64(TCG_COND_EQ, t0, a, INT64_MIN);\
+tcg_gen_setcondi_i64(TCG_COND_EQ, t1, b, -1);   \
+tcg_gen_and_i64(t0, t0, t1);\
+tcg_gen_setcondi_i64(TC

[PATCH RESEND v3 2/8] target/ppc: Implemented vector divide quadword

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vdivsq: Vector Divide Signed Quadword
vdivuq: Vector Divide Unsigned Quadword

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  2 ++
 target/ppc/insn32.decode|  2 ++
 target/ppc/int_helper.c | 21 +
 target/ppc/translate/vmx-impl.c.inc |  2 ++
 4 files changed, 27 insertions(+)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 6233e28d85..9f33e589e0 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -175,6 +175,8 @@ DEF_HELPER_FLAGS_3(VMULOSW, TCG_CALL_NO_RWG, void, avr, 
avr, avr)
 DEF_HELPER_FLAGS_3(VMULOUB, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VMULOUH, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VMULOUW, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVSQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVUQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 6df405e398..01bfde8c5e 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -793,3 +793,5 @@ VDIVSW  000100 . . . 00110001011@VX
 VDIVUW  000100 . . . 00010001011@VX
 VDIVSD  000100 . . . 00111001011@VX
 VDIVUD  000100 . . . 00011001011@VX
+VDIVSQ  000100 . . . 0011011@VX
+VDIVUQ  000100 . . . 0001011@VX
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 105b626d1b..033718dc0e 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1162,6 +1162,27 @@ void helper_XXPERMX(ppc_vsr_t *t, ppc_vsr_t *s0, 
ppc_vsr_t *s1, ppc_vsr_t *pcv,
 *t = tmp;
 }
 
+void helper_VDIVSQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+Int128 neg1 = int128_makes64(-1);
+Int128 int128_min = int128_make128(0, INT64_MIN);
+if (likely(int128_nz(b->s128) &&
+  (int128_ne(a->s128, int128_min) || int128_ne(b->s128, neg1)))) {
+t->s128 = int128_divs(a->s128, b->s128);
+} else {
+t->s128 = a->s128; /* Undefined behavior */
+}
+}
+
+void helper_VDIVUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+if (int128_nz(b->s128)) {
+t->s128 = int128_divu(a->s128, b->s128);
+} else {
+t->s128 = a->s128; /* Undefined behavior */
+}
+}
+
 void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
 {
 ppc_avr_t result;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 4c0b1a32ec..22572e6a79 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3317,6 +3317,8 @@ TRANS_FLAGS2(ISA310, VDIVSW, do_vdiv_vmod, MO_32, 
do_divsw, NULL)
 TRANS_FLAGS2(ISA310, VDIVUW, do_vdiv_vmod, MO_32, do_divuw, NULL)
 TRANS_FLAGS2(ISA310, VDIVSD, do_vdiv_vmod, MO_64, NULL, do_divsd)
 TRANS_FLAGS2(ISA310, VDIVUD, do_vdiv_vmod, MO_64, NULL, do_divud)
+TRANS_FLAGS2(ISA310, VDIVSQ, do_vx_helper, gen_helper_VDIVSQ)
+TRANS_FLAGS2(ISA310, VDIVUQ, do_vx_helper, gen_helper_VDIVUQ)
 
 #undef DIVS32
 #undef DIVU32
-- 
2.31.1




[PATCH RESEND v3 6/8] target/ppc: Implemented remaining vector divide extended

2022-05-25 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
vdivesd: Vector Divide Extended Signed Doubleword
vdiveud: Vector Divide Extended Unsigned Doubleword
vdivesq: Vector Divide Extended Signed Quadword
vdiveuq: Vector Divide Extended Unsigned Quadword

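For context (not part of the patch): the "extended" forms treat the source
element as the high half of a double-width dividend, i.e. they divide
a || 0^64 (a * 2^64) for the doubleword forms and a || 0^128 for the quadword
forms, keeping the low half of the quotient. That is why the helpers below
seed high = a, low = 0 before calling divs128/divu128 or divs256/divu256.
A hedged sketch of one unsigned doubleword lane, reusing QEMU's existing
divu128 from host-utils (the lane function name is made up):

#include "qemu/osdep.h"
#include "qemu/host-utils.h"

/* Illustrative only: per-lane semantics of vdiveud as implemented below. */
static uint64_t vdiveud_element(uint64_t a, uint64_t b)
{
    uint64_t lo = 0, hi = a; /* dividend is a:0, i.e. a * 2^64 */

    if (b == 0) {
        return a; /* undefined by the ISA; the helper keeps the dividend */
    }
    /* If the quotient overflows 64 bits the ISA result is also undefined;
     * like the helper, this sketch simply returns the low half. */
    divu128(&lo, &hi, b);
    return lo;
}
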
Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/helper.h |  4 ++
 target/ppc/insn32.decode|  4 ++
 target/ppc/int_helper.c | 64 +
 target/ppc/translate/vmx-impl.c.inc |  4 ++
 4 files changed, 76 insertions(+)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 9f33e589e0..e7624300df 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -177,6 +177,10 @@ DEF_HELPER_FLAGS_3(VMULOUH, TCG_CALL_NO_RWG, void, avr, 
avr, avr)
 DEF_HELPER_FLAGS_3(VMULOUW, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VDIVSQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(VDIVUQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVESD, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVEUD, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVESQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
+DEF_HELPER_FLAGS_3(VDIVEUQ, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index f6d2d4b257..5b2d7824a0 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -798,3 +798,7 @@ VDIVUQ  000100 . . . 0001011@VX
 
 VDIVESW 000100 . . . 01110001011@VX
 VDIVEUW 000100 . . . 01010001011@VX
+VDIVESD 000100 . . . 0001011@VX
+VDIVEUD 000100 . . . 01011001011@VX
+VDIVESQ 000100 . . . 0111011@VX
+VDIVEUQ 000100 . . . 0101011@VX
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 033718dc0e..42f0dcfc52 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1183,6 +1183,70 @@ void helper_VDIVUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t 
*b)
 }
 }
 
+void helper_VDIVESD(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+int i;
+int64_t high;
+uint64_t low;
+for (i = 0; i < 2; i++) {
+high = a->s64[i];
+low = 0;
+if (unlikely((high == INT64_MIN && b->s64[i] == -1) || !b->s64[i])) {
+t->s64[i] = a->s64[i]; /* Undefined behavior */
+} else {
+divs128(&low, &high, b->s64[i]);
+t->s64[i] = low;
+}
+}
+}
+
+void helper_VDIVEUD(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+int i;
+uint64_t high, low;
+for (i = 0; i < 2; i++) {
+high = a->u64[i];
+low = 0;
+if (unlikely(!b->u64[i])) {
+t->u64[i] = a->u64[i]; /* Undefined behavior */
+} else {
+divu128(&low, &high, b->u64[i]);
+t->u64[i] = low;
+}
+}
+}
+
+void helper_VDIVESQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+Int128 high, low;
+Int128 int128_min = int128_make128(0, INT64_MIN);
+Int128 neg1 = int128_makes64(-1);
+
+high = a->s128;
+low = int128_zero();
+if (unlikely(!int128_nz(b->s128) ||
+ (int128_eq(b->s128, neg1) && int128_eq(high, int128_min)))) {
+t->s128 = a->s128; /* Undefined behavior */
+} else {
+divs256(&low, &high, b->s128);
+t->s128 = low;
+}
+}
+
+void helper_VDIVEUQ(ppc_avr_t *t, ppc_avr_t *a, ppc_avr_t *b)
+{
+Int128 high, low;
+
+high = a->s128;
+low = int128_zero();
+if (unlikely(!int128_nz(b->s128))) {
+t->s128 = a->s128; /* Undefined behavior */
+} else {
+divu256(&low, &high, b->s128);
+t->s128 = low;
+}
+}
+
 void helper_VPERM(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
 {
 ppc_avr_t result;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 8c542bcb29..f00aa64bf9 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -3367,6 +3367,10 @@ DIVU32(do_diveuw, do_diveu_i32)
 
 TRANS_FLAGS2(ISA310, VDIVESW, do_vdiv_vmod, MO_32, do_divesw, NULL)
 TRANS_FLAGS2(ISA310, VDIVEUW, do_vdiv_vmod, MO_32, do_diveuw, NULL)
+TRANS_FLAGS2(ISA310, VDIVESD, do_vx_helper, gen_helper_VDIVESD)
+TRANS_FLAGS2(ISA310, VDIVEUD, do_vx_helper, gen_helper_VDIVEUD)
+TRANS_FLAGS2(ISA310, VDIVESQ, do_vx_helper, gen_helper_VDIVESQ)
+TRANS_FLAGS2(ISA310, VDIVEUQ, do_vx_helper, gen_helper_VDIVEUQ)
 
 #undef DIVS32
 #undef DIVU32
-- 
2.31.1




[PATCH v6 8/8] linux-user: Add PowerPC ISA 3.1 and MMA to hwcap

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: Joel Stanley 

These are new hwcap bits added for power10.

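As a usage note (not part of the patch), a guest program can probe these bits
through the auxiliary vector; the sketch below assumes glibc's getauxval()
and the kernel's PPC_FEATURE2_ARCH_3_1 / PPC_FEATURE2_MMA names from
asm/cputable.h, which mirror the QEMU_PPC_FEATURE2_* values added here:

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/cputable.h>   /* PPC_FEATURE2_ARCH_3_1, PPC_FEATURE2_MMA */

int main(void)
{
    unsigned long hwcap2 = getauxval(AT_HWCAP2);

    printf("ISA 3.1: %s\n", (hwcap2 & PPC_FEATURE2_ARCH_3_1) ? "yes" : "no");
    printf("MMA:     %s\n", (hwcap2 & PPC_FEATURE2_MMA) ? "yes" : "no");
    return 0;
}
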
Signed-off-by: Joel Stanley 
Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 linux-user/elfload.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 61063fd974..0908692e62 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -779,6 +779,8 @@ enum {
 QEMU_PPC_FEATURE2_DARN = 0x00200000, /* darn random number insn */
 QEMU_PPC_FEATURE2_SCV = 0x00100000, /* scv syscall */
 QEMU_PPC_FEATURE2_HTM_NO_SUSPEND = 0x00080000, /* TM w/o suspended state */
+QEMU_PPC_FEATURE2_ARCH_3_1 = 0x00040000, /* ISA 3.1 */
+QEMU_PPC_FEATURE2_MMA = 0x00020000, /* Matrix-Multiply Assist */
 };
 
 #define ELF_HWCAP get_elf_hwcap()
@@ -836,6 +838,8 @@ static uint32_t get_elf_hwcap2(void)
   QEMU_PPC_FEATURE2_VEC_CRYPTO);
 GET_FEATURE2(PPC2_ISA300, QEMU_PPC_FEATURE2_ARCH_3_00 |
  QEMU_PPC_FEATURE2_DARN | QEMU_PPC_FEATURE2_HAS_IEEE128);
+GET_FEATURE2(PPC2_ISA310, QEMU_PPC_FEATURE2_ARCH_3_1 |
+ QEMU_PPC_FEATURE2_MMA);
 
 #undef GET_FEATURE
 #undef GET_FEATURE2
-- 
2.31.1




[PATCH v6 7/8] target/ppc: Implemented [pm]xvbf16ger2*

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvbf16ger2:   VSX Vector bfloat16 GER (rank-2 update)
xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Negative accumulate
xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Positive accumulate
xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Negative accumulate
xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Positive accumulate
pmxvbf16ger2:   Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Negative accumulate
pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Positive accumulate
pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Negative accumulate
pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 40 +
 target/ppc/helper.h |  5 
 target/ppc/insn32.decode|  6 +
 target/ppc/insn64.decode| 11 
 target/ppc/translate/vsx-impl.c.inc | 12 +
 5 files changed, 74 insertions(+)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index a9b2ef370f..fed0ce420a 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3517,6 +3517,11 @@ static float64 extract_hf16(float16 in, float_status 
*fp_status)
 return float16_to_float64(in, true, fp_status);
 }
 
+static float64 extract_bf16(bfloat16 in, float_status *fp_status)
+{
+return bfloat16_to_float64(in, fp_status);
+}
+
 static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t  *at, uint32_t mask, bool acc,
  bool neg_mul, bool neg_acc, extract_f16 extract)
@@ -3639,6 +3644,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, 
ppc_vsr_t *b,
 vsxger_excp(env, GETPC());
 }
 
+QEMU_FLATTEN
+void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+   ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, false, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, true, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, true, extract_bf16);
+}
+
 QEMU_FLATTEN
 void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t *at, uint32_t mask)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 4070c0891c..6233e28d85 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index b8e317159c..18a94fa3b5 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -763,6 +763,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..-  
@XX3_at xa=%xx_xa
 XVI16GER2S  111011 ... -- . . 00101011 ..-  @XX3_at xa=%xx_xa
 XVI16GER2SPP111011 ... -- . . 00101010 ..-  @XX3_at xa=%xx_xa
 
+XVBF16GER2  111011 ... -- . . 00110011 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PP111011 ... -- . . 00110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PN111011 ... -- . . 10110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2NP111011 ... -- . . 01110010 ..-  @XX3_at xa=%xx_xa
+XV

[PATCH v6 3/8] target/ppc: Implemented pmxvi*ger* instructions

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvi4ger8: Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update)
pmxvi4ger8pp:   Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) Positive multiply, Positive accumulate
pmxvi8ger4: Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update)
pmxvi8ger4pp:   Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update) Positive multiply, Positive accumulate
pmxvi8ger4spp:  Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) with Saturate Positive multiply, Positive accumulate
pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update)
pmxvi16ger2pp:  Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvi16ger2s:   Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation
pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 30 +
 target/ppc/translate/vsx-impl.c.inc | 10 ++
 2 files changed, 40 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 691e8fe6c0..0eed35c8cd 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -68,6 +68,15 @@
 .. . . . . ..    \
 &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb 
xc=%8rr_xx_xc
 
+# Format MMIRR:XX3
+&MMIRR_XX3  !extern xa xb xt pmsk xmsk ymsk
+%xx3_xa 2:1 16:5
+%xx3_xb 1:1 11:5
+%xx3_at 23:3
+@MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
+.. ... .. . .  ...  \
+&MMIRR_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -115,6 +124,27 @@ PSTFS   01 10 0--.-- .. \
 PSTFD   01 10 0--.-- .. \
 110110 . .  @PLS_D
 
+## VSX GER instruction
+
+PMXVI4GER8  01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100011 ..-  @MMIRR_XX3
+PMXVI4GER8PP01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100010 ..-  @MMIRR_XX3
+PMXVI8GER4  01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0011 ..-  @MMIRR_XX3
+PMXVI8GER4PP01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0010 ..-  @MMIRR_XX3
+PMXVI16GER2 01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01001011 ..-  @MMIRR_XX3
+PMXVI16GER2PP   01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01101011 ..-  @MMIRR_XX3
+PMXVI8GER4SPP   01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 01100011 ..-  @MMIRR_XX3
+PMXVI16GER2S01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101011 ..-  @MMIRR_XX3
+PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101010 ..-  @MMIRR_XX3
+
 ### Prefixed No-operation Instruction
 
 @PNOP   01 11 -- 00 \
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 6026b203e0..b10eded1da 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2888,6 +2888,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
 TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S)
 TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
 
+TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8)
+TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP)
+TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4)
+TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP)
+TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP)
+TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2)
+TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
+TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S)
+TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1




[PATCH v6 6/8] target/ppc: Implemented pmxvf*ger*

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvf16ger2:   Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update)
pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Negative accumulate
pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Positive accumulate
pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Negative accumulate
pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update)
pmxvf32gernn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf32gernp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf32gerpn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf32gerpp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate
pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update)
pmxvf64gernn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf64gernp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf64gerpn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf64gerpp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 38 +
 target/ppc/translate/vsx-impl.c.inc | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 0eed35c8cd..5ecc5c85bf 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -73,10 +73,15 @@
 %xx3_xa 2:1 16:5
 %xx3_xb 1:1 11:5
 %xx3_at 23:3
+%xx3_xa_pair    2:1 17:4 !function=times_2
 @MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
 .. ... .. . .  ...  \
 &MMIRR_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
 
+@MMIRR_XX3_NO_P .. ..  .. . .  xmsk:4  \
+.. ... .. . .  ... \
+&MMIRR_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- 
   \
 PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
 111011 ... -- . . 00101010 ..-  @MMIRR_XX3
 
+PMXVF16GER2 01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010011 ..-  @MMIRR_XX3
+PMXVF16GER2PP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010010 ..-  @MMIRR_XX3
+PMXVF16GER2PN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 10010010 ..-  @MMIRR_XX3
+PMXVF16GER2NP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 01010010 ..-  @MMIRR_XX3
+PMXVF16GER2NN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 11010010 ..-  @MMIRR_XX3
+
+PMXVF32GER  01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 10011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 01011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 11011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+
+PMXVF64GER  01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPP01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPN01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 10111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERNP01 11 1001 -- - -   ymsk:2 -- \
+

[PATCH v6 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xxmfacc: VSX Move From Accumulator
xxmtacc: VSX Move To Accumulator
xxsetaccz: VSX Set Accumulator to Zero

The PowerISA 3.1 mentions that for the current version of the
architecture, "the hardware implementation provides the effect of ACC[i]
and VSRs 4*i to 4*i + 3 logically containing the same data" and "The
Accumulators introduce no new logical state at this time" (page 501).
For now it seems unnecessary to create new structures, so this patch
just uses ACC[i] as VSRs 4*i to 4*i+3, and moves to and from accumulators
are therefore no-ops.

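Concretely (an illustrative note, not part of the patch): accumulator i
simply overlays VSRs 4*i .. 4*i+3, so e.g. ACC[2] is VSRs 8..11 and clearing
an accumulator is just a 512-bit clear of those four registers. A trivial
standalone sketch of the index mapping behind acc_full_offset() below:

#include <assert.h>

static int acc_first_vsr(int i) { return 4 * i; }      /* ACC[i] starts at VSR 4*i */
static int acc_last_vsr(int i)  { return 4 * i + 3; }  /* ...and ends at VSR 4*i+3 */

int main(void)
{
    assert(acc_first_vsr(2) == 8);   /* ACC[2] aliases VSRs 8..11 */
    assert(acc_last_vsr(2) == 11);
    return 0;
}
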
Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|  5 +
 target/ppc/insn32.decode|  9 +
 target/ppc/translate/vsx-impl.c.inc | 31 +
 3 files changed, 45 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index bf8f8aad2c..c865206827 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2663,6 +2663,11 @@ static inline int vsr_full_offset(int i)
 return offsetof(CPUPPCState, vsr[i].u64[0]);
 }
 
+static inline int acc_full_offset(int i)
+{
+return vsr_full_offset(i * 4);
+}
+
 static inline int fpr_offset(int i)
 {
 return vsr64_offset(i, true);
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index f001c02a8c..c0f545ca38 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -154,6 +154,9 @@
 &X_vrt_frbp vrt frbp
 @X_vrt_frbp .. vrt:5 . 0 .. .   &X_vrt_frbp 
frbp=%x_frbp
 
+&X_a            ra
+@X_a            ...... ra:3 .. ..... ..... .......... .       &X_a
+
 %xx_xt  0:1 21:5
 %xx_xb  1:1 11:5
 %xx_xa  2:1 16:5
@@ -734,3 +737,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - 
@XX2_bf_xb
 &XL_s   s:uint8_t
 @XL_s   ..-- s:1 .. -   &XL_s
 RFEBB   010011-- .   0010010010 -   @XL_s
+
+## Accumulator Instructions
+
+XXMFACC 01 ... -- 0 - 0010110001 -   @X_a
+XXMTACC 01 ... -- 1 - 0010110001 -   @X_a
+XXSETACCZ   01 ... -- 00011 - 0010110001 -   @X_a
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 900c1a1ab2..235be360e2 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2816,6 +2816,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 
*a)
 return true;
 }
 
+/*
+ *  The PowerISA 3.1 mentions that for the current version of the
+ *  architecture, "the hardware implementation provides the effect of
+ *  ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data"
+ *  and "The Accumulators introduce no new logical state at this time"
+ *  (page 501). For now it seems unnecessary to create new structures,
+ *  so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore
+ *  move to and from accumulators are no-ops.
+ */
+static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0);
+return true;
+}
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1




[PATCH v6 4/8] target/ppc: Implemented xvf*ger*

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvf32ger:   VSX Vector 32-bit Floating-Point GER (rank-1 update)
xvf32gernn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative
multiply, Negative accumulate
xvf32gernp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative
multiply, Positive accumulate
xvf32gerpn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive
multiply, Negative accumulate
xvf32gerpp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive
multiply, Positive accumulate
xvf64ger:   VSX Vector 64-bit Floating-Point GER (rank-1 update)
xvf64gernn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative
multiply, Negative accumulate
xvf64gernp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative
multiply, Positive accumulate
xvf64gerpn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive
multiply, Negative accumulate
xvf64gerpp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive
multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|   4 +
 target/ppc/fpu_helper.c | 194 +++-
 target/ppc/helper.h |  10 ++
 target/ppc/insn32.decode|  13 ++
 target/ppc/translate/vsx-impl.c.inc |  12 ++
 5 files changed, 231 insertions(+), 2 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index dff3ca8222..40c779f246 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2643,6 +2643,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[i]
 #define VsrD(i) u64[i]
 #define VsrSD(i) s64[i]
+#define VsrSF(i) f32[i]
+#define VsrDF(i) f64[i]
 #else
 #define VsrB(i) u8[15 - (i)]
 #define VsrSB(i) s8[15 - (i)]
@@ -2652,6 +2654,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[3 - (i)]
 #define VsrD(i) u64[1 - (i)]
 #define VsrSD(i) s64[1 - (i)]
+#define VsrSF(i) f32[3 - (i)]
+#define VsrDF(i) f64[1 - (i)]
 #endif
 
 static inline int vsr64_offset(int i, bool high)
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 9489e06504..712c71162c 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -414,7 +414,7 @@ void helper_store_fpscr(CPUPPCState *env, uint64_t val, 
uint32_t nibbles)
 ppc_store_fpscr(env, val);
 }
 
-void helper_fpscr_check_status(CPUPPCState *env)
+static void do_fpscr_check_status(CPUPPCState *env, uintptr_t raddr)
 {
 CPUState *cs = env_cpu(env);
 target_ulong fpscr = env->fpscr;
@@ -455,13 +455,19 @@ void helper_fpscr_check_status(CPUPPCState *env)
 }
 cs->exception_index = POWERPC_EXCP_PROGRAM;
 env->error_code = error | POWERPC_EXCP_FP;
+env->fpscr |= error ? FP_FEX : 0;
 /* Deferred floating-point exception after target FPSCR update */
 if (fp_exceptions_enabled(env)) {
 raise_exception_err_ra(env, cs->exception_index,
-   env->error_code, GETPC());
+   env->error_code, raddr);
 }
 }
 
+void helper_fpscr_check_status(CPUPPCState *env)
+{
+do_fpscr_check_status(env, GETPC());
+}
+
 static void do_float_check_status(CPUPPCState *env, bool change_fi,
   uintptr_t raddr)
 {
@@ -3468,3 +3474,187 @@ void helper_xssubqp(CPUPPCState *env, uint32_t opcode,
 *xt = t;
 do_float_check_status(env, true, GETPC());
 }
+
+static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr)
+{
+/*
+ * XV*GER instructions execute and set the FPSCR as if exceptions
+ * are disabled and only at the end throw an exception
+ */
+target_ulong enable;
+enable = env->fpscr & (FP_ENABLES | FP_FI | FP_FR);
+env->fpscr &= ~(FP_ENABLES | FP_FI | FP_FR);
+int status = get_float_exception_flags(&env->fp_status);
+if (unlikely(status & float_flag_invalid)) {
+if (status & float_flag_invalid_snan) {
+float_invalid_op_vxsnan(env, 0);
+}
+if (status & float_flag_invalid_imz) {
+float_invalid_op_vximz(env, false, 0);
+}
+if (status & float_flag_invalid_isi) {
+float_invalid_op_vxisi(env, false, 0);
+}
+}
+do_float_check_status(env, false, retaddr);
+env->fpscr |= enable;
+do_fpscr_check_status(env, retaddr);
+}
+
+typedef void vsxger_zero(ppc_vsr_t *at, int, int);
+
+typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int,
+ int flags, float_status *s);
+
+static void vsxger_muladd32(ppc_vsr_t *at, ppc_vsr_t *a, ppc_vsr_t *b, int i,
+int j, int flags, float_status *s)
+{
+at[i].VsrSF(j) = float32_muladd(a->VsrSF(i), b->VsrSF(j),
+at[i].VsrSF(j), flags, s);
+}
+
+

[PATCH v6 5/8] target/ppc: Implemented xvf16ger*

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvf16ger2:   VSX Vector 16-bit Floating-Point GER (rank-2 update)
xvf16ger2nn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative
multiply, Negative accumulate
xvf16ger2np: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative
multiply, Positive accumulate
xvf16ger2pn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive
multiply, Negative accumulate
xvf16ger2pp: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive
multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|  3 +
 target/ppc/fpu_helper.c | 95 +
 target/ppc/helper.h |  5 ++
 target/ppc/insn32.decode|  6 ++
 target/ppc/translate/vsx-impl.c.inc |  6 ++
 5 files changed, 115 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 40c779f246..6d78078f37 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -227,6 +227,7 @@ typedef union _ppc_vsr_t {
 int16_t s16[8];
 int32_t s32[4];
 int64_t s64[2];
+float16 f16[8];
 float32 f32[4];
 float64 f64[2];
 float128 f128;
@@ -2643,6 +2644,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[i]
 #define VsrD(i) u64[i]
 #define VsrSD(i) s64[i]
+#define VsrHF(i) f16[i]
 #define VsrSF(i) f32[i]
 #define VsrDF(i) f64[i]
 #else
@@ -2654,6 +2656,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[3 - (i)]
 #define VsrD(i) u64[1 - (i)]
 #define VsrSD(i) s64[1 - (i)]
+#define VsrHF(i) f16[7 - (i)]
 #define VsrSF(i) f32[3 - (i)]
 #define VsrDF(i) f64[1 - (i)]
 #endif
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 712c71162c..a9b2ef370f 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -36,6 +36,15 @@ static inline float128 float128_snan_to_qnan(float128 x)
 #define float32_snan_to_qnan(x) ((x) | 0x00400000)
 #define float16_snan_to_qnan(x) ((x) | 0x0200)
 
+static inline float32 bfp32_neg(float32 a)
+{
+if (unlikely(float32_is_any_nan(a))) {
+return a;
+} else {
+return float32_chs(a);
+}
+}
+
 static inline bool fp_exceptions_enabled(CPUPPCState *env)
 {
 #ifdef CONFIG_USER_ONLY
@@ -3501,6 +3510,57 @@ static inline void vsxger_excp(CPUPPCState *env, 
uintptr_t retaddr)
 do_fpscr_check_status(env, retaddr);
 }
 
+typedef float64 extract_f16(float16, float_status *);
+
+static float64 extract_hf16(float16 in, float_status *fp_status)
+{
+return float16_to_float64(in, true, fp_status);
+}
+
+static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t  *at, uint32_t mask, bool acc,
+ bool neg_mul, bool neg_acc, extract_f16 extract)
+{
+float32 r, aux_acc;
+float64 psum, va, vb, vc, vd;
+int i, j, xmsk_bit, ymsk_bit;
+uint8_t pmsk = FIELD_EX32(mask, GER_MSK, PMSK),
+xmsk = FIELD_EX32(mask, GER_MSK, XMSK),
+ymsk = FIELD_EX32(mask, GER_MSK, YMSK);
+float_status *excp_ptr = &env->fp_status;
+for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {
+for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {
+if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {
+va = !(pmsk & 2) ? float64_zero :
+   extract(a->VsrHF(2 * i), excp_ptr);
+vb = !(pmsk & 2) ? float64_zero :
+   extract(b->VsrHF(2 * j), excp_ptr);
+vc = !(pmsk & 1) ? float64_zero :
+   extract(a->VsrHF(2 * i + 1), excp_ptr);
+vd = !(pmsk & 1) ? float64_zero :
+   extract(b->VsrHF(2 * j + 1), excp_ptr);
+psum = float64_mul(va, vb, excp_ptr);
+psum = float64r32_muladd(vc, vd, psum, 0, excp_ptr);
+r = float64_to_float32(psum, excp_ptr);
+if (acc) {
+aux_acc = at[i].VsrSF(j);
+if (neg_mul) {
+r = bfp32_neg(r);
+}
+if (neg_acc) {
+aux_acc = bfp32_neg(aux_acc);
+}
+r = float32_add(r, aux_acc, excp_ptr);
+}
+at[i].VsrSF(j) = r;
+} else {
+at[i].VsrSF(j) = float32_zero;
+}
+}
+}
+vsxger_excp(env, GETPC());
+}
+
 typedef void vsxger_zero(ppc_vsr_t *at, int, int);
 
 typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int,
@@ -3579,6 +3639,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, 
ppc_vsr_t *b,
 vsxger_excp(env, GETPC());

[PATCH v6 0/8] VSX MMA Implementation

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Based-on: https://gitlab.com/danielhb/qemu/-/tree/ppc-next

This patch series implements the Matrix-Multiply Assist (MMA) instructions
from PowerISA 3.1.

This patch series was created on top of the changes from Victor's
"target/ppc: Fix FPSCR.FI bit" series, as that series changed
do_check_float_status, which is called by the GER helper functions.

These and the VDIV/VMOD implementation are the last new PowerISA 3.1
instructions left to be implemented.

The XVF*GER instructions accumulate the exception status and only at the
end update the FPSCR and take a Program interrupt if a trap-enabled
exception occurred. Previous versions were based on Victor's rework of the
FPU exceptions, but as that patch was rejected, this version works around
the fact that OX/UX/XX and invalid-operation exceptions are handled in
different functions: all FPSCR enable bits are disabled while the operation
runs, then re-enabled, and the mtfsf deferred exception helper is called.

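A hedged sketch of that workaround (the real code is vsxger_excp() in patch
4/8 above; the names below are taken from it, and the invalid-operation
sub-cases are omitted for brevity):

static void ger_deferred_excp(CPUPPCState *env, uintptr_t retaddr)
{
    /* Save and clear the enable bits so intermediate exceptions only set
     * status bits instead of trapping mid-instruction. */
    target_ulong enables = env->fpscr & (FP_ENABLES | FP_FI | FP_FR);
    env->fpscr &= ~(FP_ENABLES | FP_FI | FP_FR);

    /* Fold the accumulated softfloat flags into the FPSCR without trapping. */
    do_float_check_status(env, false, retaddr);

    /* Restore the enables and take the deferred Program interrupt, if any,
     * the same way the mtfsf deferred-exception path does. */
    env->fpscr |= enables;
    do_fpscr_check_status(env, retaddr);
}
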
v6 changes:
- Rebased on ppc-next
- Wrapped lines to stay <= 80 characters

v5 changes:
- Changed VSXGER16 accumulation to negate the multiplication and
  accumulation in independent if's (if necessary) and sum their
  values.

v4 changes:
- Changed VSXGER16 accumulation to always use float32_sum and negate
  the elements according to the type of accumulation

v3 changes:
- GER helpers now use ppc_acc_t instead of ppc_vsr_t for passing acc
- Removed do_ger_XX3 and updated the decodetree to pass the masks in
  32 bits instructions
- Removed unnecessary rounding mode function
- Moved float32_neg to fpu_helper.c and renamed it bfp32_negate to
  make it clearer that it's a 32 bit version of the PowerISA
  bfp_NEGATE
- Negated accumulation now a subtraction
- Changed exception handling by disabling all enable FPSCR enable
  bits to set all FPSCR bits (except FEX) correctly, then re-enable
  them and call do_fpscr_check_status to raise the exception
  accordingly and set FEX if necessary

v2 changes:
- Changed VSXGER, VSXGER16 and XVIGER macros to functions
- Set rounding mode in floating-point instructions based on RN
  before operations
- Separated accumulate and with saturation instructions in
  different helpers
- Used FIELD, FIELD_EX32 and FIELD_DP32 for packing/unpacking masks


Joel Stanley (1):
  linux-user: Add PowerPC ISA 3.1 and MMA to hwcap

Lucas Mateus Castro (alqotel) (7):
  target/ppc: Implement xxm[tf]acc and xxsetaccz
  target/ppc: Implemented xvi*ger* instructions
  target/ppc: Implemented pmxvi*ger* instructions
  target/ppc: Implemented xvf*ger*
  target/ppc: Implemented xvf16ger*
  target/ppc: Implemented pmxvf*ger*
  target/ppc: Implemented [pm]xvbf16ger2*

 linux-user/elfload.c|   4 +
 target/ppc/cpu.h|  13 ++
 target/ppc/fpu_helper.c | 329 +++-
 target/ppc/helper.h |  33 +++
 target/ppc/insn32.decode|  52 +
 target/ppc/insn64.decode|  79 +++
 target/ppc/int_helper.c | 130 +++
 target/ppc/internal.h   |  15 ++
 target/ppc/translate/vsx-impl.c.inc | 130 +++
 9 files changed, 783 insertions(+), 2 deletions(-)

-- 
2.31.1




[PATCH v6 2/8] target/ppc: Implemented xvi*ger* instructions

2022-05-24 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvi4ger8: VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
xvi4ger8pp:   VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
Positive multiply, Positive accumulate
xvi8ger4: VSX Vector 4-bit Signed Integer GER (rank-8 update)
xvi8ger4pp:   VSX Vector 4-bit Signed Integer GER (rank-8 update)
Positive multiply, Positive accumulate
xvi8ger4spp:  VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
with Saturate Positive multiply, Positive accumulate
xvi16ger2:VSX Vector 16-bit Signed Integer GER (rank-2 update)
xvi16ger2pp:  VSX Vector 16-bit Signed Integer GER (rank-2 update)
Positive multiply, Positive accumulate
xvi16ger2s:   VSX Vector 16-bit Signed Integer GER (rank-2 update)
with Saturation
xvi16ger2spp: VSX Vector 16-bit Signed Integer GER (rank-2 update)
with Saturation Positive multiply, Positive accumulate

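For context (not part of the patch), a GER "rank-k update" computes, for each
element (i, j) of the 4x4 accumulator, the inner product of k packed values
from row i of XA with k packed values from row j of XB, optionally added to
the previous accumulator element. A hedged sketch of one xvi16ger2pp element
(the helper name and layout are illustrative, not the patch's code):

#include <stdint.h>

/* One element of xvi16ger2pp: rank-2 inner product of signed halfwords,
 * accumulated into the previous 32-bit accumulator element (the
 * non-saturating form wraps modulo 2^32). */
static int32_t xvi16ger2pp_element(const int16_t a[8], const int16_t b[8],
                                   int32_t acc, int i, int j)
{
    int64_t sum = acc;
    for (int k = 0; k < 2; k++) {
        sum += (int64_t)a[2 * i + k] * b[2 * j + k];
    }
    return (int32_t)sum;
}
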
Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|   1 +
 target/ppc/helper.h |  13 +++
 target/ppc/insn32.decode|  18 
 target/ppc/int_helper.c | 130 
 target/ppc/internal.h   |  15 
 target/ppc/translate/vsx-impl.c.inc |  41 +
 6 files changed, 218 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c865206827..dff3ca8222 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -238,6 +238,7 @@ typedef union _ppc_vsr_t {
 
 typedef ppc_vsr_t ppc_avr_t;
 typedef ppc_vsr_t ppc_fprp_t;
+typedef ppc_vsr_t ppc_acc_t;
 
 #if !defined(CONFIG_USER_ONLY)
 /* Software TLB cache */
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 5e43920b9e..1666797edf 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -133,6 +133,10 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 #define dh_ctype_vsr ppc_vsr_t *
 #define dh_typecode_vsr dh_typecode_ptr
 
+#define dh_alias_acc ptr
+#define dh_ctype_acc ppc_acc_t *
+#define dh_typecode_acc dh_typecode_ptr
+
 DEF_HELPER_FLAGS_3(vavgub, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vavguh, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vavguw, TCG_CALL_NO_RWG, void, avr, avr, avr)
@@ -537,6 +541,15 @@ DEF_HELPER_FLAGS_5(XXBLENDVB, TCG_CALL_NO_RWG, void, vsr, 
vsr, vsr, vsr, i32)
 DEF_HELPER_FLAGS_5(XXBLENDVH, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_FLAGS_5(XXBLENDVW, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_FLAGS_5(XXBLENDVD, TCG_CALL_NO_RWG, void, vsr, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVI4GER8, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI4GER8PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4SPP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2S, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2SPP, void, env, vsr, vsr, acc, i32)
 
 DEF_HELPER_2(efscfsi, i32, env, i32)
 DEF_HELPER_2(efscfui, i32, env, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index c0f545ca38..0e189fe2da 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -175,6 +175,12 @@
 &XX3 xt xa xb
 @XX3 .. . . .  ... &XX3 xt=%xx_xt 
xa=%xx_xa xb=%xx_xb
 
+# 32 bit GER instructions have all mask bits considered 1
+&MMIRR_XX3  xa xb xt pmsk xmsk ymsk
+%xx_at  23:3
+@XX3_at .. ... .. . .  ...  &MMIRR_XX3 
xt=%xx_at xb=%xx_xb \
+pmsk=255 
xmsk=15 ymsk=15
+
 &XX3_dm xt xa xb dm
 @XX3_dm .. . . . . dm:2 . ...   &XX3_dm 
xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
@@ -743,3 +749,15 @@ RFEBB   010011-- .   0010010010 -   
@XL_s
 XXMFACC 01 ... -- 0 - 0010110001 -   @X_a
 XXMTACC 01 ... -- 1 - 0010110001 -   @X_a
 XXSETACCZ   01 ... -- 00011 - 0010110001 -   @X_a
+
+## VSX GER instruction
+
+XVI4GER8111011 ... -- . . 00100011 ..-  @XX3_at xa=%xx_xa
+XVI4GER8PP  111011 ... -- . . 00100010 ..-  @XX3_at xa=%xx_xa
+XVI8GER4111011 ... -- . . 0011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4PP  111011 ... -- . . 0010 ..-  @XX3_at xa=%xx_xa
+XVI16GER2   111011 ... -- . . 01001011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2PP 111011 ... -- . . 01101011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4SPP 111011 ... -- . . 01100011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2S  111011 ... -- . . 00101011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2SPP111011 ... -- . . 00101010 ..-  @XX3_at xa=%xx_xa
diff --git a/target/ppc/int_

[PATCH v5 6/8] target/ppc: Implemented pmxvf*ger*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvf16ger2:   Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update)
pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Negative accumulate
pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Positive accumulate
pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Negative accumulate
pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update)
pmxvf32gernn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf32gernp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf32gerpn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf32gerpp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate
pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update)
pmxvf64gernn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf64gernp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf64gerpn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf64gerpp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 38 +
 target/ppc/translate/vsx-impl.c.inc | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 0eed35c8cd..5ecc5c85bf 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -73,10 +73,15 @@
 %xx3_xa 2:1 16:5
 %xx3_xb 1:1 11:5
 %xx3_at 23:3
+%xx3_xa_pair    2:1 17:4 !function=times_2
 @MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
 .. ... .. . .  ...  \
 &MMIRR_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
 
+@MMIRR_XX3_NO_P .. ..  .. . .  xmsk:4  \
+.. ... .. . .  ... \
+&MMIRR_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- 
   \
 PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
 111011 ... -- . . 00101010 ..-  @MMIRR_XX3
 
+PMXVF16GER2 01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010011 ..-  @MMIRR_XX3
+PMXVF16GER2PP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010010 ..-  @MMIRR_XX3
+PMXVF16GER2PN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 10010010 ..-  @MMIRR_XX3
+PMXVF16GER2NP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 01010010 ..-  @MMIRR_XX3
+PMXVF16GER2NN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 11010010 ..-  @MMIRR_XX3
+
+PMXVF32GER  01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 10011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 01011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 11011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+
+PMXVF64GER  01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPP01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPN01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 10111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERNP01 11 1001 -- - -   ymsk:2 -- \
+

[PATCH v5 5/8] target/ppc: Implemented xvf16ger*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvf16ger2:   VSX Vector 16-bit Floating-Point GER (rank-2 update)
xvf16ger2nn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative
multiply, Negative accumulate
xvf16ger2np: VSX Vector 16-bit Floating-Point GER (rank-2 update) Negative
multiply, Positive accumulate
xvf16ger2pn: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive
multiply, Negative accumulate
xvf16ger2pp: VSX Vector 16-bit Floating-Point GER (rank-2 update) Positive
multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|  3 +
 target/ppc/fpu_helper.c | 91 +
 target/ppc/helper.h |  5 ++
 target/ppc/insn32.decode|  6 ++
 target/ppc/translate/vsx-impl.c.inc |  6 ++
 5 files changed, 111 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index bdedf4138e..46769a5647 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -227,6 +227,7 @@ typedef union _ppc_vsr_t {
 int16_t s16[8];
 int32_t s32[4];
 int64_t s64[2];
+float16 f16[8];
 float32 f32[4];
 float64 f64[2];
 float128 f128;
@@ -2641,6 +2642,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[i]
 #define VsrD(i) u64[i]
 #define VsrSD(i) s64[i]
+#define VsrHF(i) f16[i]
 #define VsrSF(i) f32[i]
 #define VsrDF(i) f64[i]
 #else
@@ -2652,6 +2654,7 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[3 - (i)]
 #define VsrD(i) u64[1 - (i)]
 #define VsrSD(i) s64[1 - (i)]
+#define VsrHF(i) f16[7 - (i)]
 #define VsrSF(i) f32[3 - (i)]
 #define VsrDF(i) f64[1 - (i)]
 #endif
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 1766da5bcf..7a7aa03ac4 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -36,6 +36,15 @@ static inline float128 float128_snan_to_qnan(float128 x)
 #define float32_snan_to_qnan(x) ((x) | 0x00400000)
 #define float16_snan_to_qnan(x) ((x) | 0x0200)
 
+static inline float32 bfp32_neg(float32 a)
+{
+if (unlikely(float32_is_any_nan(a))) {
+return a;
+} else {
+return float32_chs(a);
+}
+}
+
 static inline bool fp_exceptions_enabled(CPUPPCState *env)
 {
 #ifdef CONFIG_USER_ONLY
@@ -3502,6 +3511,53 @@ static inline void vsxger_excp(CPUPPCState *env, 
uintptr_t retaddr)
 do_fpscr_check_status(env, retaddr);
 }
 
+typedef float64 extract_f16(float16, float_status *);
+
+static float64 extract_hf16(float16 in, float_status *fp_status)
+{
+return float16_to_float64(in, true, fp_status);
+}
+
+static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t  *at, uint32_t mask, bool acc,
+ bool neg_mul, bool neg_acc, extract_f16 extract)
+{
+float32 r, aux_acc;
+float64 psum, va, vb, vc, vd;
+int i, j, xmsk_bit, ymsk_bit;
+uint8_t pmsk = FIELD_EX32(mask, GER_MSK, PMSK),
+xmsk = FIELD_EX32(mask, GER_MSK, XMSK),
+ymsk = FIELD_EX32(mask, GER_MSK, YMSK);
+float_status *excp_ptr = &env->fp_status;
+for (i = 0, xmsk_bit = 1 << 3; i < 4; i++, xmsk_bit >>= 1) {
+for (j = 0, ymsk_bit = 1 << 3; j < 4; j++, ymsk_bit >>= 1) {
+if ((xmsk_bit & xmsk) && (ymsk_bit & ymsk)) {
+va = !(pmsk & 2) ? float64_zero : extract(a->VsrHF(2 * i), 
excp_ptr);
+vb = !(pmsk & 2) ? float64_zero : extract(b->VsrHF(2 * j), 
excp_ptr);
+vc = !(pmsk & 1) ? float64_zero : extract(a->VsrHF(2 * i + 1), 
excp_ptr);
+vd = !(pmsk & 1) ? float64_zero : extract(b->VsrHF(2 * j + 1), 
excp_ptr);
+psum = float64_mul(va, vb, excp_ptr);
+psum = float64r32_muladd(vc, vd, psum, 0, excp_ptr);
+r = float64_to_float32(psum, excp_ptr);
+if (acc) {
+aux_acc = at[i].VsrSF(j);
+if (neg_mul) {
+r = bfp32_neg(r);
+}
+if (neg_acc) {
+aux_acc = bfp32_neg(aux_acc);
+}
+r = float32_add(r, aux_acc, excp_ptr);
+}
+at[i].VsrSF(j) = r;
+} else {
+at[i].VsrSF(j) = float32_zero;
+}
+}
+}
+vsxger_excp(env, GETPC());
+}
+
 typedef void vsxger_zero(ppc_vsr_t *at, int, int);
 
 typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int,
@@ -3579,6 +3635,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, 
ppc_vsr_t *b, ppc_acc_t  *at,
 vsxger_excp(env, GETPC());
 }
 
+QEMU_FLATTEN
+void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at,

[PATCH v5 7/8] target/ppc: Implemented [pm]xvbf16ger2*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvbf16ger2:   VSX Vector bfloat16 GER (rank-2 update)
xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Negative accumulate
xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Positive accumulate
xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Negative accumulate
xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Positive accumulate
pmxvbf16ger2:   Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Negative accumulate
pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Positive accumulate
pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Negative accumulate
pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 40 +
 target/ppc/helper.h |  5 
 target/ppc/insn32.decode|  6 +
 target/ppc/insn64.decode| 11 
 target/ppc/translate/vsx-impl.c.inc | 12 +
 5 files changed, 74 insertions(+)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 7a7aa03ac4..20f134c1d6 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3518,6 +3518,11 @@ static float64 extract_hf16(float16 in, float_status 
*fp_status)
 return float16_to_float64(in, true, fp_status);
 }
 
+static float64 extract_bf16(bfloat16 in, float_status *fp_status)
+{
+return bfloat16_to_float64(in, fp_status);
+}
+
 static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t  *at, uint32_t mask, bool acc,
  bool neg_mul, bool neg_acc, extract_f16 extract)
@@ -3635,6 +3640,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, 
ppc_vsr_t *b, ppc_acc_t  *at,
 vsxger_excp(env, GETPC());
 }
 
+QEMU_FLATTEN
+void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+   ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, false, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, true, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, true, extract_bf16);
+}
+
 QEMU_FLATTEN
 void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t *at, uint32_t mask)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 7ab5ac8ee7..06203fd893 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index c774227d8c..dfd12e9801 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -739,6 +739,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..-  
@XX3_at xa=%xx_xa
 XVI16GER2S  111011 ... -- . . 00101011 ..-  @XX3_at xa=%xx_xa
 XVI16GER2SPP111011 ... -- . . 00101010 ..-  @XX3_at xa=%xx_xa
 
+XVBF16GER2  111011 ... -- . . 00110011 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PP111011 ... -- . . 00110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PN111011 ... -- . . 10110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2NP111011 ... -- . . 01110010 ..-  @XX3_at

[PATCH v5 8/8] linux-user: Add PowerPC ISA 3.1 and MMA to hwcap

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: Joel Stanley 

These are new hwcap bits added for power10.

Signed-off-by: Joel Stanley 
Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 linux-user/elfload.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 61063fd974..0908692e62 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -779,6 +779,8 @@ enum {
 QEMU_PPC_FEATURE2_DARN = 0x00200000, /* darn random number insn */
 QEMU_PPC_FEATURE2_SCV = 0x00100000, /* scv syscall */
 QEMU_PPC_FEATURE2_HTM_NO_SUSPEND = 0x00080000, /* TM w/o suspended state */
+QEMU_PPC_FEATURE2_ARCH_3_1 = 0x00040000, /* ISA 3.1 */
+QEMU_PPC_FEATURE2_MMA = 0x00020000, /* Matrix-Multiply Assist */
 };
 
 #define ELF_HWCAP get_elf_hwcap()
@@ -836,6 +838,8 @@ static uint32_t get_elf_hwcap2(void)
   QEMU_PPC_FEATURE2_VEC_CRYPTO);
 GET_FEATURE2(PPC2_ISA300, QEMU_PPC_FEATURE2_ARCH_3_00 |
  QEMU_PPC_FEATURE2_DARN | QEMU_PPC_FEATURE2_HAS_IEEE128);
+GET_FEATURE2(PPC2_ISA310, QEMU_PPC_FEATURE2_ARCH_3_1 |
+ QEMU_PPC_FEATURE2_MMA);
 
 #undef GET_FEATURE
 #undef GET_FEATURE2
-- 
2.31.1




[PATCH v5 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xxmfacc: VSX Move From Accumulator
xxmtacc: VSX Move To Accumulator
xxsetaccz: VSX Set Accumulator to Zero

The PowerISA 3.1 mentions that for the current version of the
architecture, "the hardware implementation provides the effect of ACC[i]
and VSRs 4*i to 4*i + 3 logically containing the same data" and "The
Accumulators introduce no new logical state at this time" (page 501).
For now it seems unnecessary to create new structures, so this patch
just uses ACC[i] as VSRs 4*i to 4*i+3; moves to and from the
accumulators are therefore no-ops.
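
As an illustration of that mapping (a sketch only, with a made-up
function name, not code from this patch), zeroing accumulator n under
this scheme amounts to zeroing the four underlying VSRs, which is what
the gvec dup_imm call in trans_XXSETACCZ below expands to:

    static void acc_set_zero(CPUPPCState *env, int n)
    {
        /* ACC[n] aliases vsr[4*n] .. vsr[4*n + 3]; no separate state. */
        for (int r = 4 * n; r < 4 * n + 4; r++) {
            env->vsr[r].u64[0] = 0;
            env->vsr[r].u64[1] = 0;
        }
    }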

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|  5 +
 target/ppc/insn32.decode|  9 +
 target/ppc/translate/vsx-impl.c.inc | 31 +
 3 files changed, 45 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 901ded79e9..2e80d0978f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2661,6 +2661,11 @@ static inline int vsr_full_offset(int i)
 return offsetof(CPUPPCState, vsr[i].u64[0]);
 }
 
+static inline int acc_full_offset(int i)
+{
+return vsr_full_offset(i * 4);
+}
+
 static inline int fpr_offset(int i)
 {
 return vsr64_offset(i, true);
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 39372fe673..7a76bedfa6 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -151,6 +151,9 @@
 _vrt_frbp vrt frbp
 @X_vrt_frbp .. vrt:5 . 0 .. .   _vrt_frbp 
frbp=%x_frbp
 
+_ara
+@X_a.. ra:3 .. . . .. . _a
+
 %xx_xt  0:1 21:5
 %xx_xb  1:1 11:5
 %xx_xa  2:1 16:5
@@ -710,3 +713,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - 
@XX2_bf_xb
 _s   s:uint8_t
 @XL_s   ..-- s:1 .. -   _s
 RFEBB   010011-- .   0010010010 -   @XL_s
+
+## Accumulator Instructions
+
+XXMFACC 01 ... -- 0 - 0010110001 -   @X_a
+XXMTACC 01 ... -- 1 - 0010110001 -   @X_a
+XXSETACCZ   01 ... -- 00011 - 0010110001 -   @X_a
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 3692740736..dc8875d5d3 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2787,6 +2787,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 
*a)
 return true;
 }
 
+/*
+ *  The PowerISA 3.1 mentions that for the current version of the
+ *  architecture, "the hardware implementation provides the effect of
+ *  ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data"
+ *  and "The Accumulators introduce no new logical state at this time"
+ *  (page 501). For now it seems unnecessary to create new structures,
+ *  so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore
+ *  move to and from accumulators are no-ops.
+ */
+static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0);
+return true;
+}
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1




[PATCH v5 2/8] target/ppc: Implemented xvi*ger* instructions

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvi4ger8:     VSX Vector 4-bit Signed Integer GER (rank-8 update)
xvi4ger8pp:   VSX Vector 4-bit Signed Integer GER (rank-8 update)
Positive multiply, Positive accumulate
xvi8ger4:     VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
xvi8ger4pp:   VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
Positive multiply, Positive accumulate
xvi8ger4spp:  VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update)
with Saturate Positive multiply, Positive accumulate
xvi16ger2:VSX Vector 16-bit Signed Integer GER (rank-2 update)
xvi16ger2pp:  VSX Vector 16-bit Signed Integer GER (rank-2 update)
Positive multiply, Positive accumulate
xvi16ger2s:   VSX Vector 16-bit Signed Integer GER (rank-2 update)
with Saturation
xvi16ger2spp: VSX Vector 16-bit Signed Integer GER (rank-2 update)
with Saturation Positive multiply, Positive accumulate
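
As a rough scalar model of the rank-4 semantics (the function name,
array layout and the signed-A/unsigned-B split are assumptions for
illustration, not the actual helper): each 32-bit element of the
accumulator takes a dot product of the matching byte rows of the two
source VSRs.

    #include <stdint.h>

    static void xvi8ger4pp_model(int32_t at[4][4], const int8_t a[4][4],
                                 const uint8_t b[4][4])
    {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                int32_t sum = at[i][j];
                for (int k = 0; k < 4; k++) {
                    sum += (int32_t)a[i][k] * b[j][k];  /* rank-4 dot product */
                }
                at[i][j] = sum;
            }
        }
    }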

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|   1 +
 target/ppc/helper.h |  13 +++
 target/ppc/insn32.decode|  18 
 target/ppc/int_helper.c | 130 
 target/ppc/internal.h   |  15 
 target/ppc/translate/vsx-impl.c.inc |  41 +
 6 files changed, 218 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 2e80d0978f..c8a12a3985 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -238,6 +238,7 @@ typedef union _ppc_vsr_t {
 
 typedef ppc_vsr_t ppc_avr_t;
 typedef ppc_vsr_t ppc_fprp_t;
+typedef ppc_vsr_t ppc_acc_t;
 
 #if !defined(CONFIG_USER_ONLY)
 /* Software TLB cache */
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index aa6773c4a5..29354276f0 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -133,6 +133,10 @@ DEF_HELPER_FLAGS_1(ftsqrt, TCG_CALL_NO_RWG_SE, i32, i64)
 #define dh_ctype_vsr ppc_vsr_t *
 #define dh_typecode_vsr dh_typecode_ptr
 
+#define dh_alias_acc ptr
+#define dh_ctype_acc ppc_acc_t *
+#define dh_typecode_acc dh_typecode_ptr
+
 DEF_HELPER_3(vavgub, void, avr, avr, avr)
 DEF_HELPER_3(vavguh, void, avr, avr, avr)
 DEF_HELPER_3(vavguw, void, avr, avr, avr)
@@ -537,6 +541,15 @@ DEF_HELPER_5(XXBLENDVB, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVH, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVW, void, vsr, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XXBLENDVD, void, vsr, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVI4GER8, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI4GER8PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI8GER4SPP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2S, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVI16GER2SPP, void, env, vsr, vsr, acc, i32)
 
 DEF_HELPER_2(efscfsi, i32, env, i32)
 DEF_HELPER_2(efscfui, i32, env, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 7a76bedfa6..899a04bf77 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -170,6 +170,12 @@
 xt xa xb
 @XX3.. . . .  ...xt=%xx_xt 
xa=%xx_xa xb=%xx_xb
 
+# 32 bit GER instructions have all mask bits considered 1
+_XX3  xa xb xt pmsk xmsk ymsk
+%xx_at  23:3
+@XX3_at .. ... .. . .  ...  _XX3 
xt=%xx_at xb=%xx_xb \
+pmsk=255 
xmsk=15 ymsk=15
+
 _dm xt xa xb dm
 @XX3_dm .. . . . . dm:2 . ...   _dm 
xt=%xx_xt xa=%xx_xa xb=%xx_xb
 
@@ -719,3 +725,15 @@ RFEBB   010011-- .   0010010010 -   
@XL_s
 XXMFACC 01 ... -- 0 - 0010110001 -   @X_a
 XXMTACC 01 ... -- 1 - 0010110001 -   @X_a
 XXSETACCZ   01 ... -- 00011 - 0010110001 -   @X_a
+
+## VSX GER instruction
+
+XVI4GER8111011 ... -- . . 00100011 ..-  @XX3_at xa=%xx_xa
+XVI4GER8PP  111011 ... -- . . 00100010 ..-  @XX3_at xa=%xx_xa
+XVI8GER4111011 ... -- . . 0011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4PP  111011 ... -- . . 0010 ..-  @XX3_at xa=%xx_xa
+XVI16GER2   111011 ... -- . . 01001011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2PP 111011 ... -- . . 01101011 ..-  @XX3_at xa=%xx_xa
+XVI8GER4SPP 111011 ... -- . . 01100011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2S  111011 ... -- . . 00101011 ..-  @XX3_at xa=%xx_xa
+XVI16GER2SPP111011 ... -- . . 00101010 ..-  @XX3_at xa=%xx_xa
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8c1674510b..32a7d99718 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -782,6 +782,136 @@ VCT(uxs, cv

[PATCH v5 4/8] target/ppc: Implemented xvf*ger*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvf32ger:   VSX Vector 32-bit Floating-Point GER (rank-1 update)
xvf32gernn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative
multiply, Negative accumulate
xvf32gernp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Negative
multiply, Positive accumulate
xvf32gerpn: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive
multiply, Negative accumulate
xvf32gerpp: VSX Vector 32-bit Floating-Point GER (rank-1 update) Positive
multiply, Positive accumulate
xvf64ger:   VSX Vector 64-bit Floating-Point GER (rank-1 update)
xvf64gernn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative
multiply, Negative accumulate
xvf64gernp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Negative
multiply, Positive accumulate
xvf64gerpn: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive
multiply, Negative accumulate
xvf64gerpp: VSX Vector 64-bit Floating-Point GER (rank-1 update) Positive
multiply, Positive accumulate
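
A rough model of the 32-bit rank-1 update (a sketch that ignores the
FPSCR status accumulation, rounding-mode selection and the negation
flags the real helpers pass to float32_muladd; the function name is
made up):

    static void xvf32gerpp_model(float at[4][4], const float a[4],
                                 const float b[4])
    {
        /* Outer-product accumulate over the four f32 elements of XA and XB. */
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                at[i][j] = a[i] * b[j] + at[i][j];  /* fused in the ISA */
            }
        }
    }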

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|   4 +
 target/ppc/fpu_helper.c | 193 +++-
 target/ppc/helper.h |  10 ++
 target/ppc/insn32.decode|  13 ++
 target/ppc/translate/vsx-impl.c.inc |  12 ++
 5 files changed, 230 insertions(+), 2 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c8a12a3985..bdedf4138e 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2641,6 +2641,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[i]
 #define VsrD(i) u64[i]
 #define VsrSD(i) s64[i]
+#define VsrSF(i) f32[i]
+#define VsrDF(i) f64[i]
 #else
 #define VsrB(i) u8[15 - (i)]
 #define VsrSB(i) s8[15 - (i)]
@@ -2650,6 +2652,8 @@ static inline bool lsw_reg_in_range(int start, int nregs, 
int rx)
 #define VsrSW(i) s32[3 - (i)]
 #define VsrD(i) u64[1 - (i)]
 #define VsrSD(i) s64[1 - (i)]
+#define VsrSF(i) f32[3 - (i)]
+#define VsrDF(i) f64[1 - (i)]
 #endif
 
 static inline int vsr64_offset(int i, bool high)
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 8592727792..1766da5bcf 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -414,7 +414,7 @@ void helper_store_fpscr(CPUPPCState *env, uint64_t val, 
uint32_t nibbles)
 ppc_store_fpscr(env, val);
 }
 
-void helper_fpscr_check_status(CPUPPCState *env)
+static void do_fpscr_check_status(CPUPPCState *env, uintptr_t raddr)
 {
 CPUState *cs = env_cpu(env);
 target_ulong fpscr = env->fpscr;
@@ -455,13 +455,19 @@ void helper_fpscr_check_status(CPUPPCState *env)
 }
 cs->exception_index = POWERPC_EXCP_PROGRAM;
 env->error_code = error | POWERPC_EXCP_FP;
+env->fpscr |= error ? FP_FEX : 0;
 /* Deferred floating-point exception after target FPSCR update */
 if (fp_exceptions_enabled(env)) {
 raise_exception_err_ra(env, cs->exception_index,
-   env->error_code, GETPC());
+   env->error_code, raddr);
 }
 }
 
+void helper_fpscr_check_status(CPUPPCState *env)
+{
+do_fpscr_check_status(env, GETPC());
+}
+
 static void do_float_check_status(CPUPPCState *env, bool change_fi,
   uintptr_t raddr)
 {
@@ -3469,3 +3475,186 @@ void helper_xssubqp(CPUPPCState *env, uint32_t opcode,
 *xt = t;
 do_float_check_status(env, true, GETPC());
 }
+
+static inline void vsxger_excp(CPUPPCState *env, uintptr_t retaddr)
+{
+/*
+ * XV*GER instructions execute and set the FPSCR as if exceptions
+ * are disabled and only at the end throw an exception
+ */
+target_ulong enable;
+enable = env->fpscr & (FP_ENABLES | FP_FI | FP_FR);
+env->fpscr &= ~(FP_ENABLES | FP_FI | FP_FR);
+int status = get_float_exception_flags(&env->fp_status);
+if (unlikely(status & float_flag_invalid)) {
+if (status & float_flag_invalid_snan) {
+float_invalid_op_vxsnan(env, 0);
+}
+if (status & float_flag_invalid_imz) {
+float_invalid_op_vximz(env, false, 0);
+}
+if (status & float_flag_invalid_isi) {
+float_invalid_op_vxisi(env, false, 0);
+}
+}
+do_float_check_status(env, false, retaddr);
+env->fpscr |= enable;
+do_fpscr_check_status(env, retaddr);
+}
+
+typedef void vsxger_zero(ppc_vsr_t *at, int, int);
+
+typedef void vsxger_muladd_f(ppc_vsr_t *, ppc_vsr_t *, ppc_vsr_t *, int, int,
+ int flags, float_status *s);
+
+static void vsxger_muladd32(ppc_vsr_t *at, ppc_vsr_t *a, ppc_vsr_t *b, int i,
+int j, int flags, float_status *s)
+{
+at[i].VsrSF(j) = float32_muladd(a->VsrSF(i), b->VsrSF(j),
+at[i].VsrSF(j), flags, s);
+}
+
+

[PATCH v5 3/8] target/ppc: Implemented pmxvi*ger* instructions

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvi4ger8:     Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update)
pmxvi4ger8pp:   Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update) Positive multiply, Positive accumulate
pmxvi8ger4:     Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update)
pmxvi8ger4pp:   Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) Positive multiply, Positive accumulate
pmxvi8ger4spp:  Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) with Saturate Positive multiply, Positive accumulate
pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update)
pmxvi16ger2pp:  Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvi16ger2s:   Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation
pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation Positive multiply, Positive accumulate
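
A scalar sketch of how the masks gate the update (the function name,
array layout and the bit ordering of pmsk/xmsk/ymsk are assumptions;
the real helpers unpack the masks from the i32 argument instead):

    #include <stdint.h>

    /* xmsk gates accumulator rows, ymsk gates columns, pmsk gates the two
     * rank-2 products; a masked-off element is written as zero. */
    static void pmxvi16ger2_model(int32_t at[4][4], const int16_t a[4][2],
                                  const int16_t b[4][2], uint32_t pmsk,
                                  uint32_t xmsk, uint32_t ymsk)
    {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                if (!((xmsk >> i) & 1) || !((ymsk >> j) & 1)) {
                    at[i][j] = 0;
                    continue;
                }
                int32_t sum = at[i][j];
                for (int k = 0; k < 2; k++) {
                    if ((pmsk >> k) & 1) {
                        sum += (int32_t)a[i][k] * b[j][k];
                    }
                }
                at[i][j] = sum;
            }
        }
    }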

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 30 +
 target/ppc/translate/vsx-impl.c.inc | 10 ++
 2 files changed, 40 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 691e8fe6c0..0eed35c8cd 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -68,6 +68,15 @@
 .. . . . . ..    \
 &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb 
xc=%8rr_xx_xc
 
+# Format MMIRR:XX3
+_XX3  !extern xa xb xt pmsk xmsk ymsk
+%xx3_xa 2:1 16:5
+%xx3_xb 1:1 11:5
+%xx3_at 23:3
+@MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
+.. ... .. . .  ...  \
+_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -115,6 +124,27 @@ PSTFS   01 10 0--.-- .. \
 PSTFD   01 10 0--.-- .. \
 110110 . .  @PLS_D
 
+## VSX GER instruction
+
+PMXVI4GER8  01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100011 ..-  @MMIRR_XX3
+PMXVI4GER8PP01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100010 ..-  @MMIRR_XX3
+PMXVI8GER4  01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0011 ..-  @MMIRR_XX3
+PMXVI8GER4PP01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0010 ..-  @MMIRR_XX3
+PMXVI16GER2 01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01001011 ..-  @MMIRR_XX3
+PMXVI16GER2PP   01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01101011 ..-  @MMIRR_XX3
+PMXVI8GER4SPP   01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 01100011 ..-  @MMIRR_XX3
+PMXVI16GER2S01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101011 ..-  @MMIRR_XX3
+PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101010 ..-  @MMIRR_XX3
+
 ### Prefixed No-operation Instruction
 
 @PNOP   01 11 -- 00 \
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 9d4309e841..c9ed898bb6 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2859,6 +2859,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
 TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S)
 TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
 
+TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8)
+TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP)
+TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4)
+TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP)
+TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP)
+TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2)
+TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
+TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S)
+TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1




[PATCH v5 0/8] VSX MMA Implementation

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Based-on: <20220517161522.36132-1-victor.colo...@eldorado.org.br>

This patch series implements the Matrix-Multiply Assist (MMA)
instructions from the PowerISA 3.1.

These and the VDIV/VMOD implementation are the last new PowerISA 3.1
instructions left to be implemented.

The XVFGER instructions accumulate the exception status and only set
the FPSCR and take a Program interrupt on a trap-enabled exception at
the end. Previous versions were based on Victor's rework of FPU
exceptions, but as that patch was rejected, this version works around
the fact that OX/UX/XX and invalid-operation exceptions are handled in
different functions by disabling all enable bits, then re-enabling them
and calling the mtfsf deferred exception helper.

v5 changes:
- Changed VSXGER16 accumulation to negate the multiplication and
  accumulation in independent if's (if necessary) and sum their
  values.

v4 changes:
- Changed VSXGER16 accumulation to always use float32_sum and negate
  the elements according to the type of accumulation

v3 changes:
- GER helpers now use ppc_acc_t instead of ppc_vsr_t for passing acc
- Removed do_ger_XX3 and updated the decodetree to pass the masks in
  32 bits instructions
- Removed unnecessary rounding mode function
- Moved float32_neg to fpu_helper.c and renamed it bfp32_negate to
  make it clearer that it's a 32 bit version of the PowerISA
  bfp_NEGATE
- Negated accumulation now a subtraction
- Changed exception handling by disabling all enable FPSCR enable
  bits to set all FPSCR bits (except FEX) correctly, then re-enable
  them and call do_fpscr_check_status to raise the exception
  accordingly and set FEX if necessary

v2 changes:
- Changed VSXGER, VSXGER16 and XVIGER macros to functions
- Set rounding mode in floating-point instructions based on RN
  before operations
- Separated accumulate and with saturation instructions in
  different helpers
- Used FIELD, FIELD_EX32 and FIELD_DP32 for packing/unpacking masks


Joel Stanley (1):
  linux-user: Add PowerPC ISA 3.1 and MMA to hwcap

Lucas Mateus Castro (alqotel) (7):
  target/ppc: Implement xxm[tf]acc and xxsetaccz
  target/ppc: Implemented xvi*ger* instructions
  target/ppc: Implemented pmxvi*ger* instructions
  target/ppc: Implemented xvf*ger*
  target/ppc: Implemented xvf16ger*
  target/ppc: Implemented pmxvf*ger*
  target/ppc: Implemented [pm]xvbf16ger2*

 linux-user/elfload.c|   4 +
 target/ppc/cpu.h|  13 ++
 target/ppc/fpu_helper.c | 324 +++-
 target/ppc/helper.h |  33 +++
 target/ppc/insn32.decode|  52 +
 target/ppc/insn64.decode|  79 +++
 target/ppc/int_helper.c | 130 +++
 target/ppc/internal.h   |  15 ++
 target/ppc/translate/vsx-impl.c.inc | 130 +++
 9 files changed, 778 insertions(+), 2 deletions(-)

-- 
2.31.1




[PATCH v4 6/8] target/ppc: Implemented pmxvf*ger*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvf16ger2:   Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update)
pmxvf16ger2nn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Negative accumulate
pmxvf16ger2np: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Negative multiply, Positive accumulate
pmxvf16ger2pn: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Negative accumulate
pmxvf16ger2pp: Prefixed Masked VSX Vector 16-bit Floating-Point GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvf32ger:Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update)
pmxvf32gernn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf32gernp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf32gerpn:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf32gerpp:  Prefixed Masked VSX Vector 32-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate
pmxvf64ger:Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update)
pmxvf64gernn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Negative accumulate
pmxvf64gernp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Negative multiply, Positive accumulate
pmxvf64gerpn:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Negative accumulate
pmxvf64gerpp:  Prefixed Masked VSX Vector 64-bit Floating-Point GER
(rank-1 update) Positive multiply, Positive accumulate
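
For the 64-bit forms, a sketch of the data layout (names illustrative
only): the accumulator is viewed as a 4x2 matrix of doubles, XA names
an even-aligned VSR pair contributing four doubles and XB a single VSR
contributing two, which is why the decode below introduces
%xx3_xa_pair.

    /* Rank-1 update over a 4x2 double-precision accumulator; FPSCR handling
     * and the pp/pn/np/nn sign variants are ignored in this model. */
    static void pmxvf64gerpp_model(double at[4][2], const double a[4],
                                   const double b[2])
    {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 2; j++) {
                at[i][j] = a[i] * b[j] + at[i][j];
            }
        }
    }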

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 38 +
 target/ppc/translate/vsx-impl.c.inc | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 0eed35c8cd..5ecc5c85bf 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -73,10 +73,15 @@
 %xx3_xa 2:1 16:5
 %xx3_xb 1:1 11:5
 %xx3_at 23:3
+%xx3_xa_pair2:1 17:4 !function=times_2
 @MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
 .. ... .. . .  ...  \
 _XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
 
+@MMIRR_XX3_NO_P .. ..  .. . .  xmsk:4  \
+.. ... .. . .  ... \
+_XX3 xb=%xx3_xb xt=%xx3_at pmsk=1
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -145,6 +150,39 @@ PMXVI16GER2S01 11 1001 -- - - pmsk:2 -- 
   \
 PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
 111011 ... -- . . 00101010 ..-  @MMIRR_XX3
 
+PMXVF16GER2 01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010011 ..-  @MMIRR_XX3
+PMXVF16GER2PP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 00010010 ..-  @MMIRR_XX3
+PMXVF16GER2PN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 10010010 ..-  @MMIRR_XX3
+PMXVF16GER2NP   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 01010010 ..-  @MMIRR_XX3
+PMXVF16GER2NN   01 11 1001 -- - - pmsk:2 --  \
+111011 ... -- . . 11010010 ..-  @MMIRR_XX3
+
+PMXVF32GER  01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 00011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERPN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 10011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNP01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 01011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+PMXVF32GERNN01 11 1001 -- - -   ymsk:4 \
+111011 ... -- . . 11011010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa
+
+PMXVF64GER  01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111011 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPP01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 00111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERPN01 11 1001 -- - -   ymsk:2 -- \
+111011 ... -- 0 . 10111010 ..-  @MMIRR_XX3_NO_P 
xa=%xx3_xa_pair
+PMXVF64GERNP01 11 1001 -- - -   ymsk:2 -- \
+

[PATCH v4 3/8] target/ppc: Implemented pmxvi*ger* instructions

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
pmxvi4ger8:     Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update)
pmxvi4ger8pp:   Prefixed Masked VSX Vector 4-bit Signed Integer GER
(rank-8 update) Positive multiply, Positive accumulate
pmxvi8ger4:     Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update)
pmxvi8ger4pp:   Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) Positive multiply, Positive accumulate
pmxvi8ger4spp:  Prefixed Masked VSX Vector 8-bit Signed/Unsigned Integer
GER (rank-4 update) with Saturate Positive multiply, Positive accumulate
pmxvi16ger2:Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update)
pmxvi16ger2pp:  Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) Positive multiply, Positive accumulate
pmxvi16ger2s:   Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation
pmxvi16ger2spp: Prefixed Masked VSX Vector 16-bit Signed Integer GER
(rank-2 update) with Saturation Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/insn64.decode| 30 +
 target/ppc/translate/vsx-impl.c.inc | 10 ++
 2 files changed, 40 insertions(+)

diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index 691e8fe6c0..0eed35c8cd 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -68,6 +68,15 @@
 .. . . . . ..    \
 &8RR_XX4_uim3 xt=%8rr_xx_xt xa=%8rr_xx_xa xb=%8rr_xx_xb 
xc=%8rr_xx_xc
 
+# Format MMIRR:XX3
+_XX3  !extern xa xb xt pmsk xmsk ymsk
+%xx3_xa 2:1 16:5
+%xx3_xb 1:1 11:5
+%xx3_at 23:3
+@MMIRR_XX3  .. ..  .. . .  xmsk:4 ymsk:4  \
+.. ... .. . .  ...  \
+_XX3 xa=%xx3_xa xb=%xx3_xb xt=%xx3_at
+
 ### Fixed-Point Load Instructions
 
 PLBZ01 10 0--.-- .. \
@@ -115,6 +124,27 @@ PSTFS   01 10 0--.-- .. \
 PSTFD   01 10 0--.-- .. \
 110110 . .  @PLS_D
 
+## VSX GER instruction
+
+PMXVI4GER8  01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100011 ..-  @MMIRR_XX3
+PMXVI4GER8PP01 11 1001 -- - - pmsk:8   \
+111011 ... -- . . 00100010 ..-  @MMIRR_XX3
+PMXVI8GER4  01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0011 ..-  @MMIRR_XX3
+PMXVI8GER4PP01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 0010 ..-  @MMIRR_XX3
+PMXVI16GER2 01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01001011 ..-  @MMIRR_XX3
+PMXVI16GER2PP   01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 01101011 ..-  @MMIRR_XX3
+PMXVI8GER4SPP   01 11 1001 -- - - pmsk:4   \
+111011 ... -- . . 01100011 ..-  @MMIRR_XX3
+PMXVI16GER2S01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101011 ..-  @MMIRR_XX3
+PMXVI16GER2SPP  01 11 1001 -- - - pmsk:2 --    \
+111011 ... -- . . 00101010 ..-  @MMIRR_XX3
+
 ### Prefixed No-operation Instruction
 
 @PNOP   01 11 -- 00 \
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 9d4309e841..c9ed898bb6 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2859,6 +2859,16 @@ TRANS(XVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
 TRANS(XVI16GER2S, do_ger, gen_helper_XVI16GER2S)
 TRANS(XVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
 
+TRANS64(PMXVI4GER8, do_ger, gen_helper_XVI4GER8)
+TRANS64(PMXVI4GER8PP, do_ger, gen_helper_XVI4GER8PP)
+TRANS64(PMXVI8GER4, do_ger, gen_helper_XVI8GER4)
+TRANS64(PMXVI8GER4PP, do_ger, gen_helper_XVI8GER4PP)
+TRANS64(PMXVI8GER4SPP, do_ger, gen_helper_XVI8GER4SPP)
+TRANS64(PMXVI16GER2, do_ger, gen_helper_XVI16GER2)
+TRANS64(PMXVI16GER2PP, do_ger, gen_helper_XVI16GER2PP)
+TRANS64(PMXVI16GER2S, do_ger, gen_helper_XVI16GER2S)
+TRANS64(PMXVI16GER2SPP, do_ger, gen_helper_XVI16GER2SPP)
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1




[PATCH v4 7/8] target/ppc: Implemented [pm]xvbf16ger2*

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xvbf16ger2:   VSX Vector bfloat16 GER (rank-2 update)
xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Negative accumulate
xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Positive accumulate
xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Negative accumulate
xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Positive accumulate
pmxvbf16ger2:   Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Negative accumulate
pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Positive accumulate
pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Negative accumulate
pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Positive accumulate
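
A note on the bfloat16 handling: bfloat16 is the upper half of an IEEE
binary32, so a model of the conversion performed by the new
extract_bf16 callback (done via softfloat in the real code) is just a
16-bit shift, ignoring NaN quieting; the helper name here is made up:

    #include <stdint.h>
    #include <string.h>

    static float bf16_to_f32_model(uint16_t bf)
    {
        uint32_t bits = (uint32_t)bf << 16;   /* bf16 is the high half of f32 */
        float f;

        memcpy(&f, &bits, sizeof(f));
        return f;
    }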

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/fpu_helper.c | 40 +
 target/ppc/helper.h |  5 
 target/ppc/insn32.decode|  6 +
 target/ppc/insn64.decode| 11 
 target/ppc/translate/vsx-impl.c.inc | 12 +
 5 files changed, 74 insertions(+)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index f7da92a51a..46e82b7b26 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3518,6 +3518,11 @@ static float64 extract_hf16(float16 in, float_status 
*fp_status)
 return float16_to_float64(in, true, fp_status);
 }
 
+static float64 extract_bf16(bfloat16 in, float_status *fp_status)
+{
+return bfloat16_to_float64(in, fp_status);
+}
+
 static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t  *at, uint32_t mask, bool acc,
  bool neg_mul, bool neg_acc, extract_f16 extract)
@@ -3637,6 +3642,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, 
ppc_vsr_t *b, ppc_acc_t  *at,
 vsxger_excp(env, GETPC());
 }
 
+QEMU_FLATTEN
+void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+   ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, false, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, false, true, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+ ppc_acc_t *at, uint32_t mask)
+{
+vsxger16(env, a, b, at, mask, true, true, true, extract_bf16);
+}
+
 QEMU_FLATTEN
 void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
  ppc_acc_t *at, uint32_t mask)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 7ab5ac8ee7..06203fd893 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -555,6 +555,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, acc, i32)
+DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, acc, i32)
 DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, acc, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index c774227d8c..dfd12e9801 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -739,6 +739,12 @@ XVI8GER4SPP 111011 ... -- . . 01100011 ..-  
@XX3_at xa=%xx_xa
 XVI16GER2S  111011 ... -- . . 00101011 ..-  @XX3_at xa=%xx_xa
 XVI16GER2SPP111011 ... -- . . 00101010 ..-  @XX3_at xa=%xx_xa
 
+XVBF16GER2  111011 ... -- . . 00110011 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PP111011 ... -- . . 00110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PN111011 ... -- . . 10110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2NP111011 ... -- . . 01110010 ..-  @XX3_at

[PATCH v4 1/8] target/ppc: Implement xxm[tf]acc and xxsetaccz

2022-05-20 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Implement the following PowerISA v3.1 instructions:
xxmfacc: VSX Move From Accumulator
xxmtacc: VSX Move To Accumulator
xxsetaccz: VSX Set Accumulator to Zero

The PowerISA 3.1 mentions that for the current version of the
architecture, "the hardware implementation provides the effect of ACC[i]
and VSRs 4*i to 4*i + 3 logically containing the same data" and "The
Accumulators introduce no new logical state at this time" (page 501).
For now it seems unnecessary to create new structures, so this patch
just uses ACC[i] as VSRs 4*i to 4*i+3; moves to and from the
accumulators are therefore no-ops.

Signed-off-by: Lucas Mateus Castro (alqotel) 
Reviewed-by: Richard Henderson 
---
 target/ppc/cpu.h|  5 +
 target/ppc/insn32.decode|  9 +
 target/ppc/translate/vsx-impl.c.inc | 31 +
 3 files changed, 45 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 901ded79e9..2e80d0978f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -2661,6 +2661,11 @@ static inline int vsr_full_offset(int i)
 return offsetof(CPUPPCState, vsr[i].u64[0]);
 }
 
+static inline int acc_full_offset(int i)
+{
+return vsr_full_offset(i * 4);
+}
+
 static inline int fpr_offset(int i)
 {
 return vsr64_offset(i, true);
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 39372fe673..7a76bedfa6 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -151,6 +151,9 @@
 _vrt_frbp vrt frbp
 @X_vrt_frbp .. vrt:5 . 0 .. .   _vrt_frbp 
frbp=%x_frbp
 
+_ara
+@X_a.. ra:3 .. . . .. . _a
+
 %xx_xt  0:1 21:5
 %xx_xb  1:1 11:5
 %xx_xa  2:1 16:5
@@ -710,3 +713,9 @@ XVTLSBB 00 ... -- 00010 . 111011011 . - 
@XX2_bf_xb
 _s   s:uint8_t
 @XL_s   ..-- s:1 .. -   _s
 RFEBB   010011-- .   0010010010 -   @XL_s
+
+## Accumulator Instructions
+
+XXMFACC 01 ... -- 0 - 0010110001 -   @X_a
+XXMTACC 01 ... -- 1 - 0010110001 -   @X_a
+XXSETACCZ   01 ... -- 00011 - 0010110001 -   @X_a
diff --git a/target/ppc/translate/vsx-impl.c.inc 
b/target/ppc/translate/vsx-impl.c.inc
index 3692740736..dc8875d5d3 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2787,6 +2787,37 @@ static bool trans_XVCVBF16SPN(DisasContext *ctx, arg_XX2 
*a)
 return true;
 }
 
+/*
+ *  The PowerISA 3.1 mentions that for the current version of the
+ *  architecture, "the hardware implementation provides the effect of
+ *  ACC[i] and VSRs 4*i to 4*i + 3 logically containing the same data"
+ *  and "The Accumulators introduce no new logical state at this time"
+ *  (page 501). For now it seems unnecessary to create new structures,
+ *  so ACC[i] is the same as VSRs 4*i to 4*i+3 and therefore
+ *  move to and from accumulators are no-ops.
+ */
+static bool trans_XXMFACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXMTACC(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+return true;
+}
+
+static bool trans_XXSETACCZ(DisasContext *ctx, arg_X_a *a)
+{
+REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+REQUIRE_VSX(ctx);
+tcg_gen_gvec_dup_imm(MO_64, acc_full_offset(a->ra), 64, 64, 0);
+return true;
+}
+
 #undef GEN_XX2FORM
 #undef GEN_XX3FORM
 #undef GEN_XX2IFORM
-- 
2.31.1



