From: "Lucas Mateus Castro (alqotel)"
Moved VPRTYBW and VPRTYBD to use gvec, and moved both of them, along with
VPRTYBQ, to decodetree.
vprtybw:
rept    loop    master        patch
8       12500   0,01215900    0,00705600 (-42.0%)
25      4000    0,01198700    0,00574400 (-52.1%)
100     1000    0,01307800    0,00692200 (-47.1%)
500     200     0,01794800    0,01558800 (-13.1%)
2500    40      0,04028200    0,05400800 (+34.1%)
8000    12      0,10127300    0,16744700 (+65.3%)
vprtybd:
rept    loop    master        patch
8       12500   0,00757400    0,00791600 (+4.5%)
25      4000    0,00651300    0,00673700 (+3.4%)
100     1000    0,00713400    0,00837700 (+17.4%)
500     200     0,01195400    0,01937400 (+62.1%)
2500    40      0,03478600    0,07005500 (+101.4%)
8000    12      0,09539600    0,21013500 (+120.3%)
vprtybq:
rept    loop    master        patch
8       12500   0,00065540    0,00066440 (+1.4%)
25      4000    0,00057720    0,00059850 (+3.7%)
100     1000    0,00066400    0,00069360 (+4.5%)
500     200     0,00115170    0,00127360 (+10.6%)
2500    40      0,00341890    0,00391550 (+14.5%)
8000    12      0,00951220    0,0480 (+16.8%)
I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
so I'm not sure it's worth moving those instructions. Comparing the
assembly of the helper with that of the TCG ops, they are pretty similar,
so I'm not sure why vprtybd took so much more time.
Signed-off-by: Lucas Mateus Castro (alqotel)
---
target/ppc/helper.h | 6 ++--
target/ppc/insn32.decode| 4 +++
target/ppc/int_helper.c | 6 ++--
target/ppc/translate/vmx-impl.c.inc | 55 +++--
target/ppc/translate/vmx-ops.c.inc | 3 --
5 files changed, 62 insertions(+), 12 deletions(-)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index feccf30bcb..6a43e32ad3 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -194,9 +194,9 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr,
avr)
DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBW, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBD, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM 000100 . . . 1000100@VX
VPDEPD 000100 . . . 10111001101@VX
VPEXTD 000100 . . . 10110001101@VX
+VPRTYBD 000100 . 01001 . 1100010@VX_tb
+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
## Vector Permute and Formatting Instruction
VEXTDUBVLX 000100 . . . . 011000 @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 338ebced22..64b2d44a66 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -502,7 +502,7 @@ void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t
*b, uint32_t v)
}
/* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBW(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
{
int i;
for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
@@ -513,7 +513,7 @@ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
}
/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBD(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
{
int i;
for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
@@ -525,7 +525,7 @@ void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
}
/* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
{
uint64_t res = b->u64[0] ^ b->u64[1];
res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc
b/target/ppc/translate/vmx-impl.c.inc
index 3f614097ac..06d91d1304 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,58 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 24);
-GEN_VXFORM_NOA(vprtybd, 1,