# HG changeset patch # User Dnyaneshwar G <dnyanesh...@multicorewareinc.com> # Date 1436251628 -19800 # Tue Jul 07 12:17:08 2015 +0530 # Node ID 235930aae11da04863e3fb13905e2d1d95e3dc0a # Parent e0166f09f332af72a83eb059d878044db15f59bd asm: sse4 code for saoCuStatsE0, improved 250341c->147284c
diff -r e0166f09f332 -r 235930aae11d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530
@@ -2498,6 +2498,7 @@
 
 #if X86_64
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530
@@ -2043,3 +2043,131 @@
     jnz        .loopH
     RET
 %endif
+
+;-----------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;
+; Accumulates, over endY rows of endX pixels, the number of pixels of each edgeType and the sum of
+; (fenc[x] - rec[x]) for each edgeType, where edgeType = signLeft + signRight + 2 comes from comparing
+; rec[x] with rec[x - 1] and rec[x + 1]. Totals are gathered in a 32-byte stack scratch area and added
+; to count[]/stats[] through s_eoTable once at the end.
+;
+; Both loops decrement-then-test, so endX and endY must be > 0. rec[-1] is read at the start of every
+; row, and rec is loaded 16 bytes at a time from offset +1, so up to 16 bytes past the last pixel are read.
+;-----------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE0, 5,9,8, 0-32
+    mov         r3d, r3m                ; r3d = endX
+    mov         r8, r5mp                ; save stats: on UNIX64 it arrives in r5, which is reused as the row counter
+
+    ; clear internal temporary buffer: tmp_count = 5 words at [rsp], tmp_stats = 5 dwords at [rsp + 5 * 2]
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m4, [pb_128]            ; presumably 0x80 bytes: flips the sign bit so pcmpgtb orders unsigned pixels
+    mova        m5, [hmul_16p + 16]     ; presumably +1/-1 byte pairs, so pmaddubsw yields fenc - rec words
+    mova        m6, [pb_2]              ; bias added to signLeft + signRight to form edgeType
+    xor         r7d, r7d
+
+.loopH:
+    mov         r5d, r3d                ; r5d = pixels left in this row
+
+    ; calculate signLeft
+    mov         r7b, [r1]
+    sub         r7b, [r1 - 1]           ; flags of rec[0] - rec[-1]
+    seta        r7b                     ; 1 if rec[0] > rec[-1]
+    setb        r6b                     ; 1 if rec[0] < rec[-1]
+    sub         r7b, r6b                ; sign(rec[0] - rec[-1])
+    neg         r7b                     ; stored negated, psignb below negates it back
+    pinsrb      m0, r7d, 15             ; lane 15 of m0 acts as the "previous signRight" for pixel 0
+
+.loopL:
+    movu        m7, [r1]                ; rec[x .. x+15]
+    movu        m2, [r1 + 1]            ; rec[x+1 .. x+16]
+
+    pxor        m1, m7, m4
+    pxor        m3, m2, m4
+    pcmpgtb     m2, m1, m3              ; 0xFF where rec[x] > rec[x+1]
+    pcmpgtb     m3, m1                  ; 0xFF (-1) where rec[x+1] > rec[x]
+    pand        m2, [pb_1]
+    por         m2, m3                  ; signRight
+
+    palignr     m3, m2, m0, 15          ; previous vector's lane 15 followed by signRight lanes 0-14
+    psignb      m3, m4                  ; signLeft
+
+    mova        m0, m2                  ; keep signRight for the next 16 pixels
+    paddb       m2, m3
+    paddb       m2, m6                  ; edgeType
+
+    ; stats[edgeType]
+    movu        m3, [r0]                ; fenc[0-15]
+    punpckhbw   m1, m3, m7              ; fenc/rec byte pairs, pixels 8-15
+    punpcklbw   m3, m7                  ; fenc/rec byte pairs, pixels 0-7
+    pmaddubsw   m1, m5
+    pmaddubsw   m3, m5
+
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x              ; r7 = edgeType of pixel x
+
+%if (x < 8)
+    pextrw      r6d, m3, (x % 8)
+%else
+    pextrw      r6d, m1, (x % 8)
+%endif
+    movsx       r6d, r6w
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+    dec         r5d
+    jz         .next                    ; row finished, possibly in the middle of a vector
+%assign x x+1
+%endrep
+
+    add         r0q, 16
+    add         r1q, 16
+    jmp        .loopL
+
+.next:
+    ; Step to the next row. r0/r1 moved 16 bytes for every vector except the one that ended the row,
+    ; i.e. (endX - 1) & ~15 bytes in total. Rounding endX itself down would leave both pointers 16 bytes
+    ; short of the next row whenever endX is a multiple of 16.
+    lea         r6d, [r3 - 1]
+    and         r6d, -16
+
+    neg         r6
+    add         r6, r2                  ; stride - bytes already advanced
+    add         r0, r6
+    add         r1, r6
+
+    dec         r4d
+    jnz        .loopH
+
+    ; sum to global buffer
+    mov         r1, r8                  ; r1 = stats (saved at entry)
+    mov         r0, r6mp                ; r0 = count
+
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    movzx       r5d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r5d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r5d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r5d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r5d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r5d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r5d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r5d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r5d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r5d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jul 07 12:17:08 2015 +0530
@@ -36,6 +36,7 @@
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, 
intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 07 12:17:08 2015 +0530
@@ -1053,6 +1053,42 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt) // true when opt matches ref on every iteration
+{
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int j = 0; // offset into pbuf2/pbuf3, advanced by INCR each iteration
+    for (int i = 0; i < ITERS; i++)
+    {
+        // seed both accumulator sets with the same random values; the range is unrealistic but enough to compare ref against asm
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1); // 16, 32, 48 or 64
+        int endX = MAX_CU_SIZE - (rand() % 5) - 1; // MAX_CU_SIZE - 5 .. MAX_CU_SIZE - 1; NOTE(review): if MAX_CU_SIZE is 64 this never yields a multiple of 16
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1; // MAX_CU_SIZE - 4 .. MAX_CU_SIZE - 1
+
+        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref); // + 1: presumably keeps the kernel's rec[-1] read inside pbuf3
+        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -2139,6 +2175,15 @@
         }
     }
 
+    if (opt.saoCuStatsE0) // only verify when an optimized saoCuStatsE0 is registered
+    {
+        if (!check_saoCuStatsE0_t(ref.saoCuStatsE0, opt.saoCuStatsE0))
+        {
+            printf("saoCuStatsE0 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE2)
     {
         if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2578,6 +2623,13 @@
         REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
+    if (opt.saoCuStatsE0)
+    {
+        int32_t stats[5] = { 0 }, count[5] = { 0 }; // five edge types, as in the E2 benchmark; zeroed because the kernels accumulate into them
+        HEADER0("saoCuStatsE0");
+        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
+    }
+
     if (opt.saoCuStatsE2)
     {
         int32_t stats[5], count[5];
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.h Tue Jul 07 12:17:08 2015 +0530
@@ -101,6 +101,7 @@
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
+    bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
     bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel