# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1436252372 -19800 # Tue Jul 07 12:29:32 2015 +0530 # Node ID 25a8323b886f480347f4b0813f7ded18e579704a # Parent 235930aae11da04863e3fb13905e2d1d95e3dc0a asm: sse4 code for saoCuStatsE1, improved 320369c->151086c
diff -r 235930aae11d -r 25a8323b886f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:29:32 2015 +0530
@@ -2499,6 +2499,7 @@
 #if X86_64
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:29:32 2015 +0530
@@ -2159,3 +2159,122 @@
     add         [r1 + 4 * 4], r6d
     RET
 %endif
+
+;-------------------------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+;-------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE1, 4,11,9,0-32    ; Stack (32 bytes): 5 word pixel counters at [rsp], then 5 dword (fenc, rec) sums at [rsp + 5 * 2]
+    mov         r4d, r4m                ; r4d = endX
+    mov         r5d, r5m                ; r5d = endY (r5 is reused as the upBuff1 cursor below; rows are counted through r5m)
+
+    ; clear internal temporary buffer (both 16-byte halves of the stack area)
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]            ; presumably 0x80 in every byte: bias so the signed pcmpgtb orders unsigned pixels - confirm constant
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+    mova        m8, [hmul_16p + 16]     ; pmaddubsw multiplier; presumably +1/-1 pairs yielding fenc - rec - confirm constant
+    movh        m7, [r3 + r4]           ; save the low 8 bytes of upBuff1 at index endX; written back after the row loop
+
+.loopH:
+    mov         r6d, r4d                ; pixels left in this row
+    mov         r9, r0                  ; fenc cursor
+    mov         r10, r1                 ; rec cursor
+    mov         r5, r3                  ; upBuff1 cursor
+
+.loopW:
+    movu        m1, [r10]               ; current rec row
+    movu        m2, [r10 + r2]          ; rec row one stride below
+
+    ; signDown
+    pxor        m1, m0
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2              ; 0xFF where current > below
+    pand        m3, m5                  ; -> +1
+    pcmpgtb     m2, m1                  ; 0xFF (-1) where below > current
+    por         m2, m3                  ; m2 = signDown in {-1, 0, +1} per byte
+    pxor        m3, m3
+    psubb       m3, m2                  ; -signDown
+
+    ; edgeType
+    movu        m4, [r5]
+    paddb       m4, m6
+    paddb       m2, m4                  ; edgeType = signDown + upBuff1 + 2
+
+    ; update upBuff1
+    movu        [r5], m3                ; stores all 16 bytes, including lanes at or beyond endX
+
+    ; stats[edgeType]
+    pxor        m1, m0                  ; undo the bias to recover the rec bytes
+    movu        m3, [r9]
+    punpckhbw   m4, m3, m1              ; interleave (fenc, rec) bytes, pixels 8..15
+    punpcklbw   m3, m1                  ; interleave (fenc, rec) bytes, pixels 0..7
+    pmaddubsw   m3, m8
+    pmaddubsw   m4, m8
+
+    ; 16 pixels (unrolled; exits early through .next once endX pixels are done)
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x              ; edgeType of pixel x
+    inc         word [rsp + r7 * 2]     ; bump the local counter for that edgeType
+
+  %if (x < 8)
+    pextrw      r8d, m3, (x % 8)
+  %else
+    pextrw      r8d, m4, (x % 8)
+  %endif
+    movsx       r8d, r8w                ; sign-extend the 16-bit pmaddubsw result
+    add         [rsp + 5 * 2 + r7 * 4], r8d
+
+    dec         r6d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r9, 16
+    add         r10, 16
+    add         r5, 16
+    jmp        .loopW
+
+.next:
+    ; advance fenc and rec to the next row (the upBuff1 cursor r5 is reloaded from r3 at .loopH)
+    add         r0, r2
+    add         r1, r2
+
+    dec         byte r5m                ; byte-sized decrement of the endY argument slot; NOTE(review): confirm r5m is addressable this way on every target ABI
+    jg         .loopH
+
+    ; restore unavailable pixels (the upBuff1 bytes saved in m7 before the loops)
+    movh        [r3 + r4], m7
+
+    ; sum to global buffer
+    mov         r1, r6m                 ; r1 = stats
+    mov         r0, r7m                 ; r0 = count
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jul 07 12:29:32 2015 +0530
@@ -37,6 +37,7 @@
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t 
*upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 07 12:29:32 2015 +0530
@@ -1089,6 +1089,52 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt) // true when opt matches ref on every iteration
+{
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1; // one guard byte on each side of the buffer handed to the primitive
+    int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
+
+    int j = 0; // offset into pbuf2/pbuf3, advanced by INCR per iteration so each pass sees different pixels
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // seed the accumulators with random values; the dynamic range is unrealistic but sufficient to verify the asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        // initial sign: every byte (guards included) is -1, 0 or +1
+        for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+            _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
+
+        intptr_t stride = 16 * (rand() % 4 + 1);
+        int endX = MAX_CU_SIZE - (rand() % 5);
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref); // NOTE(review): assumes pbuf2/pbuf3 carry INCR * ITERS elements of slack, as the j += INCR idiom implies - confirm
+        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+
+        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref)) // compares the guard bytes too
+            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
+            || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, 
saoCuStatsE2_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -2184,6 +2230,15 @@
         }
     }
 
+    if (opt.saoCuStatsE1)
+    {
+        if (!check_saoCuStatsE1_t(ref.saoCuStatsE1, opt.saoCuStatsE1))
+        {
+            printf("saoCuStatsE1 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE2)
     {
         if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2630,6 +2685,15 @@
         REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
+    if (opt.saoCuStatsE1)
+    {
+        int32_t stats[5] = { 0 }, count[5] = { 0 }; // zeroed: the primitive adds into both arrays, so they must not start indeterminate
+        int8_t upBuff1[MAX_CU_SIZE + 2];
+        memset(upBuff1, 1, sizeof(upBuff1));
+        HEADER0("saoCuStatsE1");
+        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
+    }
+
     if (opt.saoCuStatsE2)
     {
         int32_t stats[5], count[5];
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.h Tue Jul 07 12:29:32 2015 +0530
@@ -102,6 +102,7 @@
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
     bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
+    bool check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt);
     bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel