Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 27/01/2019 18:07, Richard Henderson wrote: > On 1/27/19 9:45 AM, Mark Cave-Ayland wrote: >>> I would expect the i < n/2 loop to be faster, because the assignments are >>> unconditional. FWIW. >> >> Do you have any idea as to how much faster? Is it something that would show >> up as significant within the context of QEMU? > > I don't have any numbers on that, no. > >> As well as eliminating the HI_IDX/LO_IDX constants I do find the updated >> version much easier to read, so I would prefer to keep it if possible. >> What about unrolling the loop into 2 separate ones... > > I doubt that would be helpful. > > I would think that > > #define VMRG_DO(name, access, ofs) > ... > int i, half = ARRAY_SIZE(r->access(0)) / 2; > ... > for (i = 0; i < half; i++) { > result.access(2 * i + 0) = a->access(i + ofs); > result.access(2 * i + 1) = b->access(i + ofs); > } > > where OFS = 0 for HI and half for LO is best. I find it quite readable, and > it > avoids duplicating code between LO and HI as you're currently doing. Attached is my test program which benchmarks the different approaches across 0x800 iterations and gives the following sample output here on an i7 when compiled with gcc -O2: $ ./mcatest Benchmark 1 - existing merge high Elapsed time: 1434735 us Benchmark 2 - v3 merge high Elapsed time: 2603553 us Benchmark 3 - 2 loops merge high Elapsed time: 2395434 us Benchmark 4 - Richard's merge high Elapsed time: 1318369 us These indicate that the proposed v3 merge algorithm is nearly 50% slower than the existing implementation - it wasn't something noticeable during emulation, but in a benchmark situation the additional overhead is clearly visible. TLDR: after playing around with the different approaches, Richard's proposed algorithm is the fastest, and is actually slightly quicker than the current implementation. 
Please go to the foyer after class where you can collect your prize :) On this basis I'll redo v3 using Richard's algorithm and post a v4 later assuming that it passes my local tests again. ATB, Mark. #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <sys/time.h> /* VSX/Altivec registers (128 bits) */ typedef union _ppc_vsr_t { uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; int8_t s8[16]; int16_t s16[8]; int32_t s32[4]; int64_t s64[2]; float f32[4]; double f64[2]; } ppc_vsr_t; typedef ppc_vsr_t ppc_avr_t; #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define IS_LITTLE_ENDIAN (1 == *(unsigned char *)&(const int){1}) #if defined(IS_LITTLE_ENDIAN) #define HI_IDX 1 #define LO_IDX 0 #define VsrB(i) u8[15 - (i)] #else #define HI_IDX 0 #define LO_IDX 1 #define VsrB(i) u8[i] #endif #define ITERATIONS 0x800 void benchmark1(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { /* Existing algorithm */ ppc_avr_t result; int i; size_t n_elems = ARRAY_SIZE(r->u8); for (i = 0; i < n_elems / 2; i++) { result.u8[i*2+HI_IDX] = a->u8[i]; result.u8[i*2+LO_IDX] = b->u8[i]; } *r = result; } void benchmark2(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { /* v3 patchset algorithm */ ppc_avr_t result; int i; for (i = 0; i < ARRAY_SIZE(r->u8); i++) { result.VsrB(i) = (i & 1) ? 
b->VsrB(i >> 1) : a->VsrB(i >> 1); } *r = result; } void benchmark3(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { /* Split loop into 2 halves */ ppc_avr_t result; int i; for (i = 0; i < ARRAY_SIZE(r->u8); i+=2) { result.VsrB(i) = a->VsrB(i >> 1); } for (i = 1; i < ARRAY_SIZE(r->u8); i+=2) { result.VsrB(i) = b->VsrB(i >> 1); } *r = result; } void benchmark4(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { /* Richard's merge high */ ppc_avr_t result; int i; int half = ARRAY_SIZE(r->u8) / 2; int ofs = 0; for (i = 0; i < half; i++) { result.VsrB(i * 2 + 0) = a->VsrB(i + ofs); result.VsrB(i * 2 + 1) = b->VsrB(i + ofs); } *r = result; } int elapsed_time(struct timeval *st, struct timeval *et) { return ((et->tv_sec - st->tv_sec) * 100) + (et->tv_usec - st->tv_usec); } int main(int argc, char *argv[]) { ppc_avr_t a, b, r; int i; struct timeval start_time, end_time; printf("Benchmark 1 - existing merge high\n"); gettimeofday(&start_time, NULL); for (i = 0; i < ITERATIONS; i++) { benchmark1(&r, &a, &b); } gettimeofday(&end_time, NULL); printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time)); printf("Benchmark 2 - v3 merge high\n"); gettimeofday(&start_time, NULL); for (i = 0; i < ITERATIONS; i++) { benchmark2(&r, &a, &b); } gettimeofday(&end_time, NULL); printf("Elapsed time: %d us\n\n", elapsed_time(&start_time, &end_time)); printf("Benchmark 3 - 2 loops merge high\n"); gettimeofday(&start_time, NULL); for (i = 0; i < ITERATIONS; i++) { benchmark3(&r, &a, &b); }
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 29/01/2019 02:28, David Gibson wrote: > On Sun, Jan 27, 2019 at 10:07:12AM -0800, Richard Henderson wrote: >> On 1/27/19 9:45 AM, Mark Cave-Ayland wrote: I would expect the i < n/2 loop to be faster, because the assignments are unconditional. FWIW. >>> >>> Do you have any idea as to how much faster? Is it something that would show >>> up as significant within the context of QEMU? >> >> I don't have any numbers on that, no. >> >>> As well as eliminating the HI_IDX/LO_IDX constants I do find the updated >>> version much easier to read, so I would prefer to keep it if possible. >>> What about unrolling the loop into 2 separate ones... >> >> I doubt that would be helpful. >> >> I would think that >> >> #define VMRG_DO(name, access, ofs) >> ... >> int i, half = ARRAY_SIZE(r->access(0)) / 2; >> ... >> for (i = 0; i < half; i++) { >> result.access(2 * i + 0) = a->access(i + ofs); >> result.access(2 * i + 1) = b->access(i + ofs); >> } >> >> where OFS = 0 for HI and half for LO is best. I find it quite readable, and >> it >> avoids duplicating code between LO and HI as you're currently doing. > > Marc, Richard, where are we at with this? > > Should I wait on a revised version of this patch before applying the > series? Certainly the v3 as posted is correct (I've tested this particular patch on both big and small endian machines), so I believe the only question is whether this introduces any noticeable performance penalty. Let me try and run a few simple tests and report back. BTW are you able to take my qemu-macppc queue posted yesterday at https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg07263.html? There's no functional change except for PPC MacOS users who explicitly enable the new QEMU EDID support on the command line. ATB, Mark.
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On Sun, Jan 27, 2019 at 10:07:12AM -0800, Richard Henderson wrote: > On 1/27/19 9:45 AM, Mark Cave-Ayland wrote: > >> I would expect the i < n/2 loop to be faster, because the assignments are > >> unconditional. FWIW. > > > > Do you have any idea as to how much faster? Is it something that would show > > up as significant within the context of QEMU? > > I don't have any numbers on that, no. > > > As well as eliminating the HI_IDX/LO_IDX constants I do find the updated > > version much easier to read, so I would prefer to keep it if possible. > > What about unrolling the loop into 2 separate ones... > > I doubt that would be helpful. > > I would think that > > #define VMRG_DO(name, access, ofs) > ... > int i, half = ARRAY_SIZE(r->access(0)) / 2; > ... > for (i = 0; i < half; i++) { > result.access(2 * i + 0) = a->access(i + ofs); > result.access(2 * i + 1) = b->access(i + ofs); > } > > where OFS = 0 for HI and half for LO is best. I find it quite readable, and > it > avoids duplicating code between LO and HI as you're currently doing. Marc, Richard, where are we at with this? Should I wait on a revised version of this patch before applying the series? -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On Sun, 27 Jan 2019, BALATON Zoltan wrote: On Sun, 27 Jan 2019, Mark Cave-Ayland wrote: On 27/01/2019 17:26, Richard Henderson wrote: On 1/27/19 7:19 AM, Mark Cave-Ayland wrote: Could this make the loop slower? I certainly haven't noticed any obvious performance difference during testing (OS X uses merge quite a bit for display rendering), and I'd hope that with a good compiler and modern branch prediction then any effect here would be negligible. I would expect the i < n/2 loop to be faster, because the assignments are unconditional. FWIW. Do you have any idea as to how much faster? Is it something that would show up as significant within the context of QEMU? I don't have numbers either but since these vector ops are meant to and are used for speeding up repetitive calculations I'd expect it to be run many times which means that even a small difference would add up. So I think it's worth trying to make these optimal also when host vector ops cannot be used. I don't know about a good benchmark to measure this. Maybe you could try converting some video in Mac OS X or something similar that's known to use AltiVec/VMX. There are also these under MorphOS on mac99: http://www.amiga-news.de/en/news/AN-2012-02-00011-EN.html where the mplayer one is mostly VMX bound I think and lame is more dependent on floating point ops but that also has a VMX version (still mainly float I think). I'd copy input file to RAM: disk first to avoid overhead from IDE emulation. But these are probably too short to measure this. I can't test this now but maybe someone reading this on the list who can try it with and without this series could help. I've found these (untested and quite old but may work) so you don't need MorphOS only OS X: https://tmkk.undo.jp/lame/index_e.html Regards, BALATON Zoltan
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On Sun, 27 Jan 2019, Mark Cave-Ayland wrote: On 27/01/2019 17:26, Richard Henderson wrote: On 1/27/19 7:19 AM, Mark Cave-Ayland wrote: Could this make the loop slower? I certainly haven't noticed any obvious performance difference during testing (OS X uses merge quite a bit for display rendering), and I'd hope that with a good compiler and modern branch prediction then any effect here would be negligible. I would expect the i < n/2 loop to be faster, because the assignments are unconditional. FWIW. Do you have any idea as to how much faster? Is it something that would show up as significant within the context of QEMU? I don't have numbers either but since these vector ops are meant to and are used for speeding up repetitive calculations I'd expect it to be run many times which means that even a small difference would add up. So I think it's worth trying to make these optimal also when host vector ops cannot be used. I don't know about a good benchmark to measure this. Maybe you could try converting some video in Mac OS X or something similar that's known to use AltiVec/VMX. There are also these under MorphOS on mac99: http://www.amiga-news.de/en/news/AN-2012-02-00011-EN.html where the mplayer one is mostly VMX bound I think and lame is more dependent on floating point ops but that also has a VMX version (still mainly float I think). I'd copy input file to RAM: disk first to avoid overhead from IDE emulation. But these are probably too short to measure this. I can't test this now but maybe someone reading this on the list who can try it with and without this series could help. Regards, BALATON Zoltan
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 1/27/19 9:45 AM, Mark Cave-Ayland wrote: >> I would expect the i < n/2 loop to be faster, because the assignments are >> unconditional. FWIW. > > Do you have any idea as to how much faster? Is it something that would show > up as significant within the context of QEMU? I don't have any numbers on that, no. > As well as eliminating the HI_IDX/LO_IDX constants I do find the updated > version much easier to read, so I would prefer to keep it if possible. > What about unrolling the loop into 2 separate ones... I doubt that would be helpful. I would think that #define VMRG_DO(name, access, ofs) ... int i, half = ARRAY_SIZE(r->access(0)) / 2; ... for (i = 0; i < half; i++) { result.access(2 * i + 0) = a->access(i + ofs); result.access(2 * i + 1) = b->access(i + ofs); } where OFS = 0 for HI and half for LO is best. I find it quite readable, and it avoids duplicating code between LO and HI as you're currently doing. r~
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 27/01/2019 17:26, Richard Henderson wrote: > On 1/27/19 7:19 AM, Mark Cave-Ayland wrote: >> Could this make the loop slower? I certainly haven't noticed any obvious >> performance difference during testing (OS X uses merge quite a bit for >> display rendering), and I'd hope that with a good compiler and modern branch >> prediction then any effect here would be negligible. > > I would expect the i < n/2 loop to be faster, because the assignments are > unconditional. FWIW. Do you have any idea as to how much faster? Is it something that would show up as significant within the context of QEMU? As well as eliminating the HI_IDX/LO_IDX constants I do find the updated version much easier to read, so I would prefer to keep it if possible. What about unrolling the loop into 2 separate ones e.g. for (i = 0; i < ARRAY_SIZE(r->element); i+=2) { result.access(i) = a->access(i >> 1); } for (i = 1; i < ARRAY_SIZE(r->element); i+=2) { result.access(i) = b->access(i >> 1); } Would you expect this to perform better than the version proposed in the patchset? ATB, Mark.
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 1/27/19 7:19 AM, Mark Cave-Ayland wrote: > Could this make the loop slower? I certainly haven't noticed any obvious > performance difference during testing (OS X uses merge quite a bit for > display rendering), and I'd hope that with a good compiler and modern branch > prediction then any effect here would be negligible. I would expect the i < n/2 loop to be faster, because the assignments are unconditional. FWIW. r~
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On 27/01/2019 12:07, BALATON Zoltan wrote: > On Sun, 27 Jan 2019, Mark Cave-Ayland wrote: >> The current implementations make use of the endian-specific macros >> MRGLO/MRGHI >> and also reference HI_IDX and LO_IDX directly to calculate array offsets. >> >> Rework the implementation to use the Vsr* macros so that these per-endian >> references can be removed. >> >> Signed-off-by: Mark Cave-Ayland >> --- >> target/ppc/int_helper.c | 52 >> - >> 1 file changed, 25 insertions(+), 27 deletions(-) >> >> diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c >> index 598731d47a..f084a706ee 100644 >> --- a/target/ppc/int_helper.c >> +++ b/target/ppc/int_helper.c >> @@ -976,43 +976,41 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, >> ppc_avr_t >> *b, ppc_avr_t *c) >> } >> } >> >> -#define VMRG_DO(name, element, highp) \ >> +#define VMRG_DOLO(name, element, access) \ >> void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ >> { \ >> ppc_avr_t result; \ >> int i; \ >> - size_t n_elems = ARRAY_SIZE(r->element); \ >> + int m = ARRAY_SIZE(r->element) - 1; \ >> \ >> - for (i = 0; i < n_elems / 2; i++) { \ >> - if (highp) { \ >> - result.element[i*2+HI_IDX] = a->element[i]; \ >> - result.element[i*2+LO_IDX] = b->element[i]; \ >> - } else { \ >> - result.element[n_elems - i * 2 - (1 + HI_IDX)] = \ >> - b->element[n_elems - i - 1]; \ >> - result.element[n_elems - i * 2 - (1 + LO_IDX)] = \ >> - a->element[n_elems - i - 1]; \ >> - } \ >> + for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ > > Isn't this a performance hit? You seem to do twice as many iterations now: > > - before, the loop was to ARRAY_SIZE/2 and was called twice so it executed > ARRAY_SIZE > times > > - after you have a loop to ARRAY_SIZE but still called twice so it executes > 2*ARRAY_SIZE times > > Or do I miss something? 
If these are replaced with hardware vector > instructions at > the end then it may not matter to those who have CPU with needed vector > instructions > but for others this may be slower than the previous hand optimised version. > But I > haven't tested it, just guessing. One point to clarify here is that the HI and LO variants of vmrg{l,h}{b,h,w} are separate instructions, so the input elements are being iterated over once, both before and after the patch. Simplifying the patch down then effectively what happens is that the patch has changed the merge implementation from: for (i = 0; i < ARRAY_SIZE / 2; i++) { result[f(2 * i)] = a->element[g(i)]; result[f(2 * i + 1)] = b->element[g(i)]; } to: for (i = 0; i < ARRAY_SIZE; i++) { result[f(i)] = (i & 1) ? a->element[g(i)] : b->element[g(i)]; } So you're still calculating the same number of result elements, even though the loop executes twice as many times. Could this make the loop slower? I certainly haven't noticed any obvious performance difference during testing (OS X uses merge quite a bit for display rendering), and I'd hope that with a good compiler and modern branch prediction then any effect here would be negligible. ATB, Mark.
Re: [Qemu-devel] [Qemu-ppc] [PATCH v3 2/8] target/ppc: rework vmrg{l, h}{b, h, w} instructions to use Vsr* macros
On Sun, 27 Jan 2019, Mark Cave-Ayland wrote: The current implementations make use of the endian-specific macros MRGLO/MRGHI and also reference HI_IDX and LO_IDX directly to calculate array offsets. Rework the implementation to use the Vsr* macros so that these per-endian references can be removed. Signed-off-by: Mark Cave-Ayland --- target/ppc/int_helper.c | 52 - 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 598731d47a..f084a706ee 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -976,43 +976,41 @@ void helper_vmladduhm(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c) } } -#define VMRG_DO(name, element, highp) \ +#define VMRG_DOLO(name, element, access)\ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ ppc_avr_t result; \ int i; \ -size_t n_elems = ARRAY_SIZE(r->element);\ +int m = ARRAY_SIZE(r->element) - 1; \ \ -for (i = 0; i < n_elems / 2; i++) { \ -if (highp) {\ -result.element[i*2+HI_IDX] = a->element[i]; \ -result.element[i*2+LO_IDX] = b->element[i]; \ -} else {\ -result.element[n_elems - i * 2 - (1 + HI_IDX)] =\ -b->element[n_elems - i - 1];\ -result.element[n_elems - i * 2 - (1 + LO_IDX)] =\ -a->element[n_elems - i - 1];\ -} \ +for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ Isn't this a performance hit? You seem to do twice as many iterations now: - before, the loop was to ARRAY_SIZE/2 and was called twice so it executed ARRAY_SIZE times - after you have a loop to ARRAY_SIZE but still called twice so it executes 2*ARRAY_SIZE times Or do I miss something? If these are replaced with hardware vector instructions at the end then it may not matter to those who have CPU with needed vector instructions but for others this may be slower than the previous hand optimised version. But I haven't tested it, just guessing. Regrads, BALATON Zoltan +result.access(m - i) = (i & 1) ? 
a->access(m - (i >> 1))\ + : b->access(m - (i >> 1)); \ } \ *r = result;\ } -#if defined(HOST_WORDS_BIGENDIAN) -#define MRGHI 0 -#define MRGLO 1 -#else -#define MRGHI 1 -#define MRGLO 0 -#endif -#define VMRG(suffix, element) \ -VMRG_DO(mrgl##suffix, element, MRGHI) \ -VMRG_DO(mrgh##suffix, element, MRGLO) -VMRG(b, u8) -VMRG(h, u16) -VMRG(w, u32) + +#define VMRG_DOHI(name, element, access)\ +void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ +{ \ +ppc_avr_t result; \ +int i; \ +\ +for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ +result.access(i) = (i & 1) ? b->access(i >> 1) \ + : a->access(i >> 1); \ +} \ +*r = result;\ +} + +#define VMRG(suffix, element, access) \ +VMRG_DOLO(mrgl##suffix, element, access) \ +VMRG_DOHI(mrgh##suffix, element, access) +VMRG(b, u8, VsrB) +VMRG(h, u16, VsrH) +VMRG(w, u32, VsrW) #undef VMRG_DO #undef VMRG -#undef MRGHI -#undef MRGLO void helper_vmsummbm(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)