Re: [PATCH v2 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

2022-10-10 Thread Richard Henderson

On 10/10/22 12:13, Lucas Mateus Castro(alqotel) wrote:

From: "Lucas Mateus Castro (alqotel)" 

Moved VPRTYBW and VPRTYBD to use gvec, and moved both of them, along
with VPRTYBQ, to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and
.fni8, respectively.

vprtybw:
rept    loop    master      patch
8       12500   0,00991200  0,00626300 (-36.8%)
25      4000    0,01040600  0,00550600 (-47.1%)
100     1000    0,01084500  0,00601100 (-44.6%)
500     200     0,01490600  0,01394100 (-6.5%)
2500    40      0,03285100  0,05143000 (+56.6%)
8000    12      0,08971500  0,14662500 (+63.4%)

vprtybd:
rept    loop    master      patch
8       12500   0,00665800  0,00652800 (-2.0%)
25      4000    0,00589300  0,00670400 (+13.8%)
100     1000    0,00646800  0,00743900 (+15.0%)
500     200     0,01065800  0,01586400 (+48.8%)
2500    40      0,03497000  0,07180100 (+105.3%)
8000    12      0,09242200  0,21566600 (+133.3%)

vprtybq:
rept    loop    master      patch
8       12500   0,00656200  0,00665800 (+1.5%)
25      4000    0,00620500  0,00644900 (+3.9%)
100     1000    0,00707500  0,00764900 (+8.1%)
500     200     0,01203500  0,01349500 (+12.1%)
2500    40      0,03505700  0,04123100 (+17.6%)
8000    12      0,09590600  0,11586700 (+20.8%)

I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
and I'm not sure if it's worth moving those instructions. Comparing the
assembly of the helper with that of the TCG ops, they are pretty similar,
so I'm not sure why vprtybd took so much more time.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
  target/ppc/helper.h |  4 +-
  target/ppc/insn32.decode|  4 ++
  target/ppc/int_helper.c | 25 +
  target/ppc/translate/vmx-impl.c.inc | 80 +++--
  target/ppc/translate/vmx-ops.c.inc  |  3 --
  5 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b2e910b089..a06193bc67 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
  DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
  DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
  DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
  DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
  DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
  DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM  000100 . . . 1000100@VX
  VPDEPD  000100 . . . 10111001101@VX
  VPEXTD  000100 . . . 10110001101@VX
  
+VPRTYBD 000100 . 01001 . 1100010@VX_tb

+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
  ## Vector Permute and Formatting Instruction
  
  VEXTDUBVLX  000100 . . . . 011000   @VA

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c7fd0d1faa..c6ce4665fa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env)
  env->vscr_sat.u32[0] = 1;
  }
  
-/* vprtybw */

-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-uint64_t res = b->u32[i] ^ (b->u32[i] >> 16);
-res ^= res >> 8;
-r->u32[i] = res & 1;
-}
-}
-
-/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-uint64_t res = b->u64[i] ^ (b->u64[i] >> 32);
-res ^= res >> 16;
-res ^= res >> 8;
-r->u64[i] = res & 1;
-}
-}
-
  /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
  {
  uint64_t res = b->u64[0] ^ b->u64[1];
  res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index b9a9e83ab3..23601942bc 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
  GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
  GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
  GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 24);

[PATCH v2 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec

2022-10-10 Thread Lucas Mateus Castro(alqotel)
From: "Lucas Mateus Castro (alqotel)" 

Moved VPRTYBW and VPRTYBD to use gvec, and moved both of them, along
with VPRTYBQ, to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and
.fni8, respectively.

vprtybw:
rept    loop    master      patch
8       12500   0,00991200  0,00626300 (-36.8%)
25      4000    0,01040600  0,00550600 (-47.1%)
100     1000    0,01084500  0,00601100 (-44.6%)
500     200     0,01490600  0,01394100 (-6.5%)
2500    40      0,03285100  0,05143000 (+56.6%)
8000    12      0,08971500  0,14662500 (+63.4%)

vprtybd:
rept    loop    master      patch
8       12500   0,00665800  0,00652800 (-2.0%)
25      4000    0,00589300  0,00670400 (+13.8%)
100     1000    0,00646800  0,00743900 (+15.0%)
500     200     0,01065800  0,01586400 (+48.8%)
2500    40      0,03497000  0,07180100 (+105.3%)
8000    12      0,09242200  0,21566600 (+133.3%)

vprtybq:
rept    loop    master      patch
8       12500   0,00656200  0,00665800 (+1.5%)
25      4000    0,00620500  0,00644900 (+3.9%)
100     1000    0,00707500  0,00764900 (+8.1%)
500     200     0,01203500  0,01349500 (+12.1%)
2500    40      0,03505700  0,04123100 (+17.6%)
8000    12      0,09590600  0,11586700 (+20.8%)

I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
and I'm not sure if it's worth moving those instructions. Comparing the
assembly of the helper with that of the TCG ops, they are pretty similar,
so I'm not sure why vprtybd took so much more time.

Signed-off-by: Lucas Mateus Castro (alqotel) 
---
 target/ppc/helper.h |  4 +-
 target/ppc/insn32.decode|  4 ++
 target/ppc/int_helper.c | 25 +
 target/ppc/translate/vmx-impl.c.inc | 80 +++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index b2e910b089..a06193bc67 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM  000100 . . . 1000100@VX
 VPDEPD  000100 . . . 10111001101@VX
 VPEXTD  000100 . . . 10110001101@VX
 
+VPRTYBD 000100 . 01001 . 1100010@VX_tb
+VPRTYBQ 000100 . 01010 . 1100010@VX_tb
+VPRTYBW 000100 . 01000 . 1100010@VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX  000100 . . . . 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index c7fd0d1faa..c6ce4665fa 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env)
 env->vscr_sat.u32[0] = 1;
 }
 
-/* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
-uint64_t res = b->u32[i] ^ (b->u32[i] >> 16);
-res ^= res >> 8;
-r->u32[i] = res & 1;
-}
-}
-
-/* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
-{
-int i;
-for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-uint64_t res = b->u64[i] ^ (b->u64[i] >> 32);
-res ^= res >> 16;
-res ^= res >> 8;
-r->u64[i] = res & 1;
-}
-}
-
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
 uint64_t res = b->u64[0] ^ b->u64[1];
 res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index b9a9e83ab3..23601942bc 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 24);
-GEN_VXFORM_NOA(vprtybd, 1, 24);
-GEN_VXFORM_NOA(vprtybq, 1, 24);
+
+static void