Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-29 Thread Mark Cave-Ayland
On 27/01/2019 18:07, Richard Henderson wrote:

> On 1/27/19 9:45 AM, Mark Cave-Ayland wrote:
>>> I would expect the i < n/2 loop to be faster, because the assignments are
>>> unconditional.  FWIW.
>>
>> Do you have any idea as to how much faster? Is it something that would show
>> up as significant within the context of QEMU?
> 
> I don't have any numbers on that, no.
> 
>> As well as eliminating the HI_IDX/LO_IDX constants I do find the updated
>> version much easier to read, so I would prefer to keep it if possible.
>> What about unrolling the loop into 2 separate ones...
> 
> I doubt that would be helpful.
> 
> I would think that
> 
> #define VMRG_DO(name, access, ofs)
> ...
> int i, half = ARRAY_SIZE(r->access(0)) / 2;
> ...
> for (i = 0; i < half; i++) {
>     result.access(2 * i + 0) = a->access(i + ofs);
>     result.access(2 * i + 1) = b->access(i + ofs);
> }
> 
> where OFS = 0 for HI and half for LO is best.  I find it quite readable, and it
> avoids duplicating code between LO and HI as you're currently doing.

Attached is my test program which benchmarks the different approaches across
0x800 iterations and gives the following sample output here on an i7 when
compiled with gcc -O2:

$ ./mcatest
Benchmark 1 - existing merge high
Elapsed time: 1434735 us

Benchmark 2 - v3 merge high
Elapsed time: 2603553 us

Benchmark 3 - 2 loops merge high
Elapsed time: 2395434 us

Benchmark 4 - Richard's merge high
Elapsed time: 1318369 us


These indicate that the proposed v3 merge algorithm is over 80% slower than the
existing implementation - it wasn't something noticeable during emulation, but in a
benchmark situation the additional overhead is clearly visible.

TLDR: after playing around with the different approaches, Richard's proposed
algorithm is the fastest, and is actually slightly quicker than the current
implementation. Please go to the foyer after class where you can collect your prize :)

On this basis I'll redo v3 using Richard's algorithm and post a v4 later assuming
that it passes my local tests again.


ATB,

Mark.
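
For reference, a quick sanity check that could be appended to the attached program
(a sketch, not part of the original attachment; it assumes the benchmark functions
and ppc_avr_t defined below, plus #include <assert.h>; benchmark1 is deliberately
skipped because its raw HI_IDX/LO_IDX indexing produces a different byte order than
the VsrB-based variants on a little-endian host):

static void check_merge_high(void)
{
    ppc_avr_t a, b, r2, r3, r4;
    int i;

    /* Distinct byte patterns so any mis-merge is visible */
    for (i = 0; i < 16; i++) {
        a.u8[i] = i;            /* 0x00..0x0f */
        b.u8[i] = 0x10 + i;     /* 0x10..0x1f */
    }

    benchmark2(&r2, &a, &b);
    benchmark3(&r3, &a, &b);
    benchmark4(&r4, &a, &b);

    /* The v3, 2-loop and Richard variants should agree byte for byte */
    for (i = 0; i < 16; i++) {
        assert(r2.u8[i] == r3.u8[i]);
        assert(r2.u8[i] == r4.u8[i]);
    }
}

Calling this from main() before the timing runs would catch any divergence between
the three VsrB-based implementations.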
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/time.h>

/* VSX/Altivec registers (128 bits) */
typedef union _ppc_vsr_t {
    uint8_t u8[16];
    uint16_t u16[8];
    uint32_t u32[4];
    uint64_t u64[2];
    int8_t s8[16];
    int16_t s16[8];
    int32_t s32[4];
    int64_t s64[2];
    float f32[4];
    double f64[2];
} ppc_vsr_t;

typedef ppc_vsr_t ppc_avr_t;

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Note: a runtime IS_LITTLE_ENDIAN macro cannot drive #if, and testing
 * defined(IS_LITTLE_ENDIAN) would always be true; use the compiler's
 * byte-order macros so the big-endian branch can actually be selected.
 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HI_IDX 1
#define LO_IDX 0
#define VsrB(i) u8[15 - (i)]
#else
#define HI_IDX 0
#define LO_IDX 1
#define VsrB(i) u8[i]
#endif

#define ITERATIONS 0x800

void benchmark1(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    /* Existing algorithm */
    ppc_avr_t result;
    int i;
    size_t n_elems = ARRAY_SIZE(r->u8);

    for (i = 0; i < n_elems / 2; i++) {
        result.u8[i * 2 + HI_IDX] = a->u8[i];
        result.u8[i * 2 + LO_IDX] = b->u8[i];
    }
    *r = result;
}

void benchmark2(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    /* v3 patchset algorithm */
    ppc_avr_t result;
    int i;

    for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
        result.VsrB(i) = (i & 1) ? b->VsrB(i >> 1)
                                 : a->VsrB(i >> 1);
    }
    *r = result;
}

void benchmark3(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    /* Split loop into 2 halves */
    ppc_avr_t result;
    int i;

    for (i = 0; i < ARRAY_SIZE(r->u8); i += 2) {
        result.VsrB(i) = a->VsrB(i >> 1);
    }

    for (i = 1; i < ARRAY_SIZE(r->u8); i += 2) {
        result.VsrB(i) = b->VsrB(i >> 1);
    }
    *r = result;
}

void benchmark4(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    /* Richard's merge high */
    ppc_avr_t result;
    int i;
    int half = ARRAY_SIZE(r->u8) / 2;
    int ofs = 0;

    for (i = 0; i < half; i++) {
        result.VsrB(i * 2 + 0) = a->VsrB(i + ofs);
        result.VsrB(i * 2 + 1) = b->VsrB(i + ofs);
    }
    *r = result;
}

int elapsed_time(struct timeval *st, struct timeval *et)
{
    /* Difference in whole microseconds */
    return ((et->tv_sec - st->tv_sec) * 1000000) + (et->tv_usec - st->tv_usec);
}

int main(int argc, char *argv[])
{
    ppc_avr_t a, b, r;
    int i;
    struct timeval start_time, end_time;

    printf("Benchmark 1 - existing merge high\n");
    gettimeofday(&start_time, NULL);

    for (i = 0; i < ITERATIONS; i++) {
        benchmark1(&r, &a, &b);
    }

    gettimeofday(&end_time, NULL);
    printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time));


    printf("Benchmark 2 - v3 merge high\n");
    gettimeofday(&start_time, NULL);

    for (i = 0; i < ITERATIONS; i++) {
        benchmark2(&r, &a, &b);
    }

    gettimeofday(&end_time, NULL);
    printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time));


    printf("Benchmark 3 - 2 loops merge high\n");
    gettimeofday(&start_time, NULL);

    for (i = 0; i < ITERATIONS; i++) {
        benchmark3(&r, &a, &b);
    }

    gettimeofday(&end_time, NULL);
    printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time));


    printf("Benchmark 4 - Richard's merge high\n");
    gettimeofday(&start_time, NULL);

    for (i = 0; i < ITERATIONS; i++) {
        benchmark4(&r, &a, &b);
    }

    gettimeofday(&end_time, NULL);
    printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time));

    return 0;
}


Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-28 Thread Mark Cave-Ayland
On 29/01/2019 02:28, David Gibson wrote:

> On Sun, Jan 27, 2019 at 10:07:12AM -0800, Richard Henderson wrote:
>> On 1/27/19 9:45 AM, Mark Cave-Ayland wrote:
>>>> I would expect the i < n/2 loop to be faster, because the assignments are
>>>> unconditional.  FWIW.
>>>
>>> Do you have any idea as to how much faster? Is it something that would show
>>> up as significant within the context of QEMU?
>>
>> I don't have any numbers on that, no.
>>
>>> As well as eliminating the HI_IDX/LO_IDX constants I do find the updated
>>> version much easier to read, so I would prefer to keep it if possible.
>>> What about unrolling the loop into 2 separate ones...
>>
>> I doubt that would be helpful.
>>
>> I would think that
>>
>> #define VMRG_DO(name, access, ofs)
>> ...
>> int i, half = ARRAY_SIZE(r->access(0)) / 2;
>> ...
>> for (i = 0; i < half; i++) {
>>     result.access(2 * i + 0) = a->access(i + ofs);
>>     result.access(2 * i + 1) = b->access(i + ofs);
>> }
>>
>> where OFS = 0 for HI and half for LO is best.  I find it quite readable, and it
>> avoids duplicating code between LO and HI as you're currently doing.
> 
> Mark, Richard, where are we at with this?
> 
> Should I wait on a revised version of this patch before applying the
> series?

Certainly the v3 as posted is correct (I've tested this particular patch on both big
and little endian machines), so I believe the only question is whether this introduces
any noticeable performance penalty.

Let me try and run a few simple tests and report back.

BTW are you able to take my qemu-macppc queue posted yesterday at
https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg07263.html? There's no
functional change except for PPC MacOS users who explicitly enable the new QEMU EDID
support on the command line.


ATB,

Mark.



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-28 Thread David Gibson
On Sun, Jan 27, 2019 at 10:07:12AM -0800, Richard Henderson wrote:
> On 1/27/19 9:45 AM, Mark Cave-Ayland wrote:
> >> I would expect the i < n/2 loop to be faster, because the assignments are
> >> unconditional.  FWIW.
> > 
> > Do you have any idea as to how much faster? Is it something that would show
> > up as significant within the context of QEMU?
> 
> I don't have any numbers on that, no.
> 
> > As well as eliminating the HI_IDX/LO_IDX constants I do find the updated
> > version much easier to read, so I would prefer to keep it if possible.
> > What about unrolling the loop into 2 separate ones...
> 
> I doubt that would be helpful.
> 
> I would think that
> 
> #define VMRG_DO(name, access, ofs)
> ...
> int i, half = ARRAY_SIZE(r->access(0)) / 2;
> ...
> for (i = 0; i < half; i++) {
>     result.access(2 * i + 0) = a->access(i + ofs);
>     result.access(2 * i + 1) = b->access(i + ofs);
> }
> 
> where OFS = 0 for HI and half for LO is best.  I find it quite readable, and it
> avoids duplicating code between LO and HI as you're currently doing.

Mark, Richard, where are we at with this?

Should I wait on a revised version of this patch before applying the
series?

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread BALATON Zoltan

On Sun, 27 Jan 2019, BALATON Zoltan wrote:

On Sun, 27 Jan 2019, Mark Cave-Ayland wrote:

On 27/01/2019 17:26, Richard Henderson wrote:


On 1/27/19 7:19 AM, Mark Cave-Ayland wrote:

Could this make the loop slower? I certainly haven't noticed any obvious
performance difference during testing (OS X uses merge quite a bit for
display rendering), and I'd hope that with a good compiler and modern branch
prediction then any effect here would be negligible.


I would expect the i < n/2 loop to be faster, because the assignments are
unconditional.  FWIW.


Do you have any idea as to how much faster? Is it something that would show up as
significant within the context of QEMU?


I don't have numbers either but since these vector ops are meant to and are 
used for speeding up repetitive calculations I'd expect it to be run many 
times which means that even a small difference would add up. So I think it's 
worth trying to make these optimal also when host vector ops cannot be used.


I don't know about a good benchmark to measure this. Maybe you could try 
converting some video in Mac OS X or something similar that's known to use 
AltiVec/VMX. There are also these under MorphOS on mac99:

http://www.amiga-news.de/en/news/AN-2012-02-00011-EN.html
where the mplayer one is mostly VMX bound I think and lame is more dependent 
on floating point ops but that also has a VMX version (still mainly float I 
think). I'd copy input file to RAM: disk first to avoid overhead from IDE 
emulation. But these are probably too short to measure this.


I can't test this now but maybe someone reading this on the list who can try 
it with and without this series could help.


I've found these (untested and quite old but may work) so you don't need
MorphOS, only OS X:

https://tmkk.undo.jp/lame/index_e.html

Regards,
BALATON Zoltan



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread BALATON Zoltan

On Sun, 27 Jan 2019, Mark Cave-Ayland wrote:

On 27/01/2019 17:26, Richard Henderson wrote:


On 1/27/19 7:19 AM, Mark Cave-Ayland wrote:

Could this make the loop slower? I certainly haven't noticed any obvious
performance difference during testing (OS X uses merge quite a bit for
display rendering), and I'd hope that with a good compiler and modern branch
prediction then any effect here would be negligible.


I would expect the i < n/2 loop to be faster, because the assignments are
unconditional.  FWIW.


Do you have any idea as to how much faster? Is it something that would show up as
significant within the context of QEMU?


I don't have numbers either but since these vector ops are meant to and 
are used for speeding up repetitive calculations I'd expect it to be run 
many times which means that even a small difference would add up. So I 
think it's worth trying to make these optimal also when host vector ops 
cannot be used.


I don't know about a good benchmark to measure this. Maybe you could try 
converting some video in Mac OS X or something similar that's known to use 
AltiVec/VMX. There are also these under MorphOS on mac99:

http://www.amiga-news.de/en/news/AN-2012-02-00011-EN.html
where the mplayer one is mostly VMX bound I think and lame is more 
dependent on floating point ops but that also has a VMX version (still 
mainly float I think). I'd copy input file to RAM: disk first to avoid 
overhead from IDE emulation. But these are probably too short to measure 
this.


I can't test this now but maybe someone reading this on the list who can 
try it with and without this series could help.


Regards,
BALATON Zoltan



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread Richard Henderson
On 1/27/19 9:45 AM, Mark Cave-Ayland wrote:
>> I would expect the i < n/2 loop to be faster, because the assignments are
>> unconditional.  FWIW.
> 
> Do you have any idea as to how much faster? Is it something that would show
> up as significant within the context of QEMU?

I don't have any numbers on that, no.

> As well as eliminating the HI_IDX/LO_IDX constants I do find the updated
> version much easier to read, so I would prefer to keep it if possible.
> What about unrolling the loop into 2 separate ones...

I doubt that would be helpful.

I would think that

#define VMRG_DO(name, access, ofs)
...
int i, half = ARRAY_SIZE(r->access(0)) / 2;
...
for (i = 0; i < half; i++) {
    result.access(2 * i + 0) = a->access(i + ofs);
    result.access(2 * i + 1) = b->access(i + ofs);
}

where OFS = 0 for HI and half for LO is best.  I find it quite readable, and it
avoids duplicating code between LO and HI as you're currently doing.
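
A sketch of how that suggestion might be fleshed out in int_helper.c (the extra
element parameter is an assumption here, since access(0) expands to a single scalar
element and ARRAY_SIZE needs the underlying array; passing the bare token half as
ofs works because the macro body is expanded inside the function, where half is in
scope):

#define VMRG_DO(name, element, access, ofs)                         \
    void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
    {                                                               \
        ppc_avr_t result;                                           \
        int i, half = ARRAY_SIZE(r->element) / 2;                   \
                                                                    \
        for (i = 0; i < half; i++) {                                \
            result.access(i * 2 + 0) = a->access(i + ofs);          \
            result.access(i * 2 + 1) = b->access(i + ofs);          \
        }                                                           \
        *r = result;                                                \
    }

#define VMRG(suffix, element, access)                  \
    VMRG_DO(mrgl##suffix, element, access, half)       \
    VMRG_DO(mrgh##suffix, element, access, 0)
VMRG(b, u8, VsrB)
VMRG(h, u16, VsrH)
VMRG(w, u32, VsrW)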


r~



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread Mark Cave-Ayland
On 27/01/2019 17:26, Richard Henderson wrote:

> On 1/27/19 7:19 AM, Mark Cave-Ayland wrote:
>> Could this make the loop slower? I certainly haven't noticed any obvious
>> performance difference during testing (OS X uses merge quite a bit for
>> display rendering), and I'd hope that with a good compiler and modern branch
>> prediction then any effect here would be negligible.
>
> I would expect the i < n/2 loop to be faster, because the assignments are
> unconditional.  FWIW.

Do you have any idea as to how much faster? Is it something that would show up as
significant within the context of QEMU?

As well as eliminating the HI_IDX/LO_IDX constants I do find the updated version much
easier to read, so I would prefer to keep it if possible. What about unrolling the
loop into 2 separate ones e.g.

for (i = 0; i < ARRAY_SIZE(r->element); i += 2) {
    result.access(i) = a->access(i >> 1);
}

for (i = 1; i < ARRAY_SIZE(r->element); i += 2) {
    result.access(i) = b->access(i >> 1);
}

Would you expect this to perform better than the version proposed in the patchset?


ATB,

Mark.



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread Richard Henderson
On 1/27/19 7:19 AM, Mark Cave-Ayland wrote:
> Could this make the loop slower? I certainly haven't noticed any obvious
> performance difference during testing (OS X uses merge quite a bit for
> display rendering), and I'd hope that with a good compiler and modern branch
> prediction then any effect here would be negligible.

I would expect the i < n/2 loop to be faster, because the assignments are
unconditional.  FWIW.


r~



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread Mark Cave-Ayland
On 27/01/2019 12:07, BALATON Zoltan wrote:

> On Sun, 27 Jan 2019, Mark Cave-Ayland wrote:
>> The current implementations make use of the endian-specific macros 
>> MRGLO/MRGHI
>> and also reference HI_IDX and LO_IDX directly to calculate array offsets.
>>
>> Rework the implementation to use the Vsr* macros so that these per-endian
>> references can be removed.
>>
>> Signed-off-by: Mark Cave-Ayland 
>> ---
>> target/ppc/int_helper.c | 52 -
>> 1 file changed, 25 insertions(+), 27 deletions(-)
>>
>> diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
>> index 598731d47a..f084a706ee 100644
>> --- a/target/ppc/int_helper.c
>> +++ b/target/ppc/int_helper.c
>> @@ -976,43 +976,41 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
>>     }
>> }
>>
>> -#define VMRG_DO(name, element, highp)   \
>> +#define VMRG_DOLO(name, element, access)    \
>>     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
>>     {   \
>>     ppc_avr_t result;   \
>>     int i;  \
>> -    size_t n_elems = ARRAY_SIZE(r->element);    \
>> +    int m = ARRAY_SIZE(r->element) - 1; \
>>     \
>> -    for (i = 0; i < n_elems / 2; i++) { \
>> -    if (highp) {    \
>> -    result.element[i*2+HI_IDX] = a->element[i]; \
>> -    result.element[i*2+LO_IDX] = b->element[i]; \
>> -    } else {    \
>> -    result.element[n_elems - i * 2 - (1 + HI_IDX)] =    \
>> -    b->element[n_elems - i - 1];    \
>> -    result.element[n_elems - i * 2 - (1 + LO_IDX)] =    \
>> -    a->element[n_elems - i - 1];    \
>> -    }   \
>> +    for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
> 
> Isn't this a performance hit? You seem to do twice as many iterations now:
> 
> - before, the loop was to ARRAY_SIZE/2 and was called twice so it executed
> ARRAY_SIZE times
> 
> - after you have a loop to ARRAY_SIZE but still called twice so it executes
> 2*ARRAY_SIZE times
> 
> Or do I miss something? If these are replaced with hardware vector instructions at
> the end then it may not matter to those who have CPU with needed vector instructions
> but for others this may be slower than the previous hand optimised version. But I
> haven't tested it, just guessing.

One point to clarify here is that the HI and LO variants of vmrg{l,h}{b,h,w} are
separate instructions, so the input elements are being iterated over once, both
before and after the patch.

Simplifying the patch down then effectively what happens is that the patch has
changed the merge implementation from:

for (i = 0; i < ARRAY_SIZE / 2; i++) {
    result[f(2 * i)] = a->element[g(i)];
    result[f(2 * i + 1)] = b->element[g(i)];
}

to:

for (i = 0; i < ARRAY_SIZE; i++) {
    result[f(i)] = (i & 1) ? a->element[g(i)] : b->element[g(i)];
}

So you're still calculating the same number of result elements, even though the loop
executes twice as many times.
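
To make that concrete with a hypothetical 4-element merge high (identity f and g,
plain int arrays instead of the real 16-byte vectors; the odd-index operand
selection follows the VMRG_DOHI variant from the patch), both loop shapes produce
the same interleaved result:

#include <assert.h>

int main(void)
{
    int a[4] = {10, 11, 12, 13}, b[4] = {20, 21, 22, 23};
    int r1[4], r2[4], i;

    /* old shape: half the iterations, two unconditional stores each */
    for (i = 0; i < 4 / 2; i++) {
        r1[2 * i] = a[i];
        r1[2 * i + 1] = b[i];
    }

    /* v3 shape: one conditional store per iteration */
    for (i = 0; i < 4; i++) {
        r2[i] = (i & 1) ? b[i >> 1] : a[i >> 1];
    }

    /* both yield {10, 20, 11, 21} */
    for (i = 0; i < 4; i++) {
        assert(r1[i] == r2[i]);
    }
    return 0;
}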

Could this make the loop slower? I certainly haven't noticed any obvious performance
difference during testing (OS X uses merge quite a bit for display rendering), and
I'd hope that with a good compiler and modern branch prediction then any effect here
would be negligible.


ATB,

Mark.



Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros

2019-01-27 Thread BALATON Zoltan

On Sun, 27 Jan 2019, Mark Cave-Ayland wrote:

The current implementations make use of the endian-specific macros MRGLO/MRGHI
and also reference HI_IDX and LO_IDX directly to calculate array offsets.

Rework the implementation to use the Vsr* macros so that these per-endian
references can be removed.

Signed-off-by: Mark Cave-Ayland 
---
target/ppc/int_helper.c | 52 -
1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 598731d47a..f084a706ee 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -976,43 +976,41 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
}
}

-#define VMRG_DO(name, element, highp)   \
+#define VMRG_DOLO(name, element, access)\
void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
{   \
ppc_avr_t result;   \
int i;  \
-size_t n_elems = ARRAY_SIZE(r->element);\
+int m = ARRAY_SIZE(r->element) - 1; \
\
-for (i = 0; i < n_elems / 2; i++) { \
-if (highp) {\
-result.element[i*2+HI_IDX] = a->element[i]; \
-result.element[i*2+LO_IDX] = b->element[i]; \
-} else {\
-result.element[n_elems - i * 2 - (1 + HI_IDX)] =\
-b->element[n_elems - i - 1];\
-result.element[n_elems - i * 2 - (1 + LO_IDX)] =\
-a->element[n_elems - i - 1];\
-}   \
+for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \


Isn't this a performance hit? You seem to do twice as many iterations now:

- before, the loop was to ARRAY_SIZE/2 and was called twice so it 
executed ARRAY_SIZE times


- after you have a loop to ARRAY_SIZE but still called twice so it 
executes 2*ARRAY_SIZE times


Or do I miss something? If these are replaced with hardware vector 
instructions at the end then it may not matter to those who have CPU with 
needed vector instructions but for others this may be slower than the 
previous hand optimised version. But I haven't tested it, just guessing.


Regards,
BALATON Zoltan


+result.access(m - i) = (i & 1) ? a->access(m - (i >> 1))\
+   : b->access(m - (i >> 1));   \
}   \
*r = result;\
}
-#if defined(HOST_WORDS_BIGENDIAN)
-#define MRGHI 0
-#define MRGLO 1
-#else
-#define MRGHI 1
-#define MRGLO 0
-#endif
-#define VMRG(suffix, element)   \
-VMRG_DO(mrgl##suffix, element, MRGHI)   \
-VMRG_DO(mrgh##suffix, element, MRGLO)
-VMRG(b, u8)
-VMRG(h, u16)
-VMRG(w, u32)
+
+#define VMRG_DOHI(name, element, access)\
+void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)   \
+{   \
+ppc_avr_t result;   \
+int i;  \
+\
+for (i = 0; i < ARRAY_SIZE(r->element); i++) {  \
+result.access(i) = (i & 1) ? b->access(i >> 1)  \
+   : a->access(i >> 1); \
+}   \
+*r = result;\
+}
+
+#define VMRG(suffix, element, access)  \
+VMRG_DOLO(mrgl##suffix, element, access)   \
+VMRG_DOHI(mrgh##suffix, element, access)
+VMRG(b, u8, VsrB)
+VMRG(h, u16, VsrH)
+VMRG(w, u32, VsrW)
#undef VMRG_DO
#undef VMRG
-#undef MRGHI
-#undef MRGLO

void helper_vmsummbm(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a,
 ppc_avr_t *b, ppc_avr_t *c)