There was a small problem on x64; I have fixed it. Of course, applying this patch will cause some conflicts, because it depends on the previous three patches.
在 2013-10-29 02:25:21,"Steve Borho" <[email protected]> 写道: On Mon, Oct 28, 2013 at 9:24 AM, Min Chen <[email protected]> wrote: # HG changeset patch # User Min Chen <[email protected]> # Date 1382970234 -28800 # Node ID 41425f18efe14be468715bfa68fdebbb9a49145f # Parent 5f7b3d06d94c6aec44bfd4a7bfb6f6751182b4ed asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8] I'm getting link errors on x86_64 from this series: error LNK2017: 'ADDR32' relocation to 'tab_LumaCoeffV' invalid without /LARGEADDRESSAWARE:NO In general, I think we should drop all of the interpolation merging while we get all the assembly completed for motion compensation. When the assembly is all together, we can experiment and figure out if it makes sense to re-merge some of them back together.
From 06a4e39a9981a388226419d246c915589a567988 Mon Sep 17 00:00:00 2001 From: Min Chen <[email protected]> Date: Tue, 29 Oct 2013 12:48:02 +0800 Subject: [PATCH] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8] --- source/common/x86/asm-primitives.cpp | 1 + source/common/x86/ipfilter8.asm | 124 ++++++++++++++++++++++++++++++++++ source/common/x86/ipfilter8.h | 1 + source/test/ipfilterharness.cpp | 10 ++- 4 files changed, 132 insertions(+), 4 deletions(-) diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp index 0a3174f..439c532 100644 --- a/source/common/x86/asm-primitives.cpp +++ b/source/common/x86/asm-primitives.cpp @@ -282,6 +282,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask) p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3; p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3; + p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3; } if (cpuMask & X265_CPU_SSE4) { diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index d559986..dbd6036 100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -774,3 +774,127 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 jnz .loopV RET + + +;----------------------------------------------------------------------------- +; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx); +;----------------------------------------------------------------------------- +INIT_XMM ssse3 + +%if ARCH_X86_64 +cglobal interp_8tap_v_sp, 4, 7+5, 8 +%define tmp_r0 r7 +%define tmp_r2 r8 +%define tmp_r3 r9 +%define tmp_r4d r10d +%define tmp_6rows r11 + +%else ; ARCH_X86_64 = 0 + +cglobal interp_8tap_v_sp, 4, 7, 8, 0-(5*4) +%define tmp_r0 [(rsp + 0 * 4)] +%define tmp_r2 [(rsp + 1 * 4)] +%define tmp_r3 [(rsp + 2 * 4)] +%define tmp_r4d [(rsp + 3 * 4)] +%define tmp_6rows [(rsp + 4 * 4)] +%endif ; ARCH_X86_64 + + mov r4d, r4m + mov r5d, r5m + + 
mov tmp_r4d, r4d + mov tmp_r2, r2 + + ; load coeff table + mov r6d, r6m + shl r6, 6 + lea r4, [tab_LumaCoeffV] + lea r6, [r4 + r6] + + ; move to -3 + lea r1, [r1 * 2] + lea r4, [r1 + r1 * 2] + sub r0, r4 + lea r4, [r4 * 2] + mov tmp_6rows, r4 + +.loopH: + + ; load width + mov r4d, tmp_r4d + + ; save old src + mov tmp_r0, r0 + +.loopW: + + movu m0, [r0] + movu m1, [r0 + r1] + lea r0, [r0 + r1 * 2] + punpcklwd m2, m0, m1 + pmaddwd m2, [r6 + 0 * 16] + punpckhwd m0, m1 + pmaddwd m0, [r6 + 0 * 16] + + movu m3, [r0] + movu m4, [r0 + r1] + lea r0, [r0 + r1 * 2] + punpcklwd m1, m3, m4 + pmaddwd m1, [r6 + 1 * 16] + paddd m2, m1 + punpckhwd m3, m4 + pmaddwd m3, [r6 + 1 * 16] + paddd m0, m3 + + movu m3, [r0] + movu m4, [r0 + r1] + lea r0, [r0 + r1 * 2] + punpcklwd m1, m3, m4 + pmaddwd m1, [r6 + 2 * 16] + paddd m2, m1 + punpckhwd m3, m4 + pmaddwd m3, [r6 + 2 * 16] + paddd m0, m3 + + movu m3, [r0] + movu m4, [r0 + r1] + punpcklwd m1, m3, m4 + pmaddwd m1, [r6 + 3 * 16] + paddd m2, m1 + punpckhwd m3, m4 + pmaddwd m3, [r6 + 3 * 16] + paddd m0, m3 + + paddd m2, [tab_c_526336] + paddd m0, [tab_c_526336] + psrad m2, 12 + psrad m0, 12 + packssdw m2, m0 + packuswb m2, m2 + + ; move to next 8 col + sub r0, tmp_6rows + + sub r4, 8 + jl .width4 + movq [r2], m2 + je .nextH + lea r0, [r0 + 16] + lea r2, [r2 + 8] + jmp .loopW + +.width4: + movd [r2], m2 + lea r0, [r0 + 4] + +.nextH: + ; move to next row + mov r0, tmp_r0 + lea r0, [r0 + r1] + add tmp_r2, r3 + mov r2, tmp_r2 + + dec r5d + jnz .loopH + + RET diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h index 7f93ad8..9ce0188 100644 --- a/source/common/x86/ipfilter8.h +++ b/source/common/x86/ipfilter8.h @@ -89,6 +89,7 @@ CHROMA_FILTERS(_sse4); LUMA_FILTERS(_sse4); void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY); +void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const 
int coeffIdx); #undef SETUP_CHROMA_FUNC_DEF #undef SETUP_LUMA_FUNC_DEF diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp index b7eda18..4a8f19f 100644 --- a/source/test/ipfilterharness.cpp +++ b/source/test/ipfilterharness.cpp @@ -164,6 +164,8 @@ bool IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t int rand_width = rand() % 100; // Randomly generated Width int16_t rand_val, rand_srcStride, rand_dstStride; + rand_width &= ~3; + for (int i = 0; i <= 100; i++) { memset(IPF_vec_output_p, 0, ipf_t_size); // Initialize output buffer to zero @@ -173,16 +175,16 @@ bool IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t rand_srcStride = rand() % 100; // Randomly generated srcStride rand_dstStride = rand() % 100; // Randomly generated dstStride - opt(short_buff + 3 * rand_srcStride, + ref(short_buff + 3 * rand_srcStride, rand_srcStride, - IPF_vec_output_p, + IPF_C_output_p, rand_dstStride, rand_width, rand_height, rand_val ); - ref(short_buff + 3 * rand_srcStride, + opt(short_buff + 3 * rand_srcStride, rand_srcStride, - IPF_C_output_p, + IPF_vec_output_p, rand_dstStride, rand_width, rand_height, rand_val -- 1.7.9.msysgit.0
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
