>>+ pshufd m1, m1, 0 >>+ packssdw m1, m1 > packssdw is an expensive instruction; pshuflw+punpcklqdq is better.
revised, thanks. # HG changeset patch # User Satoshi Nakagawa <nakagawa...@oki.com> # Date 1392953002 -32400 # Fri Feb 21 12:23:22 2014 +0900 # Node ID e4a80e46bd80e7d516dc881da7f38737c0071ccf # Parent 894bde574bc1678471e0c23ceb381a806768ea95 asm: update count_nonzero, add testbench diff -r 894bde574bc1 -r e4a80e46bd80 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Thu Feb 20 17:18:42 2014 -0600 +++ b/source/common/x86/pixel-util8.asm Fri Feb 21 12:23:22 2014 +0900 @@ -1240,11 +1240,12 @@ ; int count_nonzero(const int32_t *quantCoeff, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal count_nonzero, 2,3,4 +cglobal count_nonzero, 2,2,4 pxor m0, m0 - pxor m1, m1 - mov r2d, r1d shr r1d, 3 + movd m1, r1d + pshuflw m1, m1, 0 + punpcklqdq m1, m1 .loop mova m2, [r0] @@ -1252,16 +1253,13 @@ add r0, 32 packssdw m2, m3 pcmpeqw m2, m0 - psrlw m2, 15 - packsswb m2, m2 - psadbw m2, m0 - paddd m1, m2 + paddw m1, m2 dec r1d - jnz .loop - - movd r1d, m1 - sub r2d, r1d - mov eax, r2d + jnz .loop + + packuswb m1, m1 + psadbw m1, m0 + movd eax, m1 RET diff -r 894bde574bc1 -r e4a80e46bd80 source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Thu Feb 20 17:18:42 2014 -0600 +++ b/source/test/mbdstharness.cpp Fri Feb 21 12:23:22 2014 +0900 @@ -380,6 +380,41 @@ return true; } +bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) +{ + ALIGN_VAR_32(int32_t, qcoeff[32 * 32]); + + for (int i = 0; i < 4; i++) + { + int log2TrSize = i + 2; + int num = 1 << (log2TrSize * 2); + int mask = num - 1; + + for (int n = 0; n <= num; n++) + { + memset(qcoeff, 0, num * sizeof(int32_t)); + + for (int j = 0; j < n; j++) + { + int k = rand() & mask; + while (qcoeff[k]) + { + k = (k + 11) & mask; + } + qcoeff[k] = rand() - RAND_MAX / 2; + } + + int refval = ref(qcoeff, num); + int optval = opt(qcoeff, num); + + if (refval != optval) + return false; + } + } + + return 
true; +} + bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) { for (int i = 0; i < NUM_DCTS; i++) @@ -424,6 +459,15 @@ } } + if (opt.count_nonzero) + { + if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero)) + { + printf("count_nonzero: Failed!\n"); + return false; + } + } + return true; } @@ -465,4 +509,13 @@ int dummy = -1; REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy); } + + if (opt.count_nonzero) + { + for (int i = 4; i <= 32; i <<= 1) + { + printf("count_nonzero[%dx%d]", i, i); + REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbufidct, i * i) + } + } } diff -r 894bde574bc1 -r e4a80e46bd80 source/test/mbdstharness.h --- a/source/test/mbdstharness.h Thu Feb 20 17:18:42 2014 -0600 +++ b/source/test/mbdstharness.h Fri Feb 21 12:23:22 2014 +0900 @@ -43,6 +43,7 @@ bool check_quant_primitive(quant_t ref, quant_t opt); bool check_dct_primitive(dct_t ref, dct_t opt, int width); bool check_idct_primitive(idct_t ref, idct_t opt, int width); + bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt); public: _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel