Re: [x265] [PATCH 4 of 4] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

chen Mon, 28 Oct 2013 22:13:54 -0700

There was a small problem on x64; I have fixed it.
Of course, applying this patch will hit some conflicts, because it depends on 
the previous three patches.



在 2013-10-29 02:25:21，"Steve Borho" <[email protected]> 写道：



On Mon, Oct 28, 2013 at 9:24 AM, Min Chen <[email protected]> wrote:
# HG changeset patch
# User Min Chen <[email protected]>
# Date 1382970234 -28800
# Node ID 41425f18efe14be468715bfa68fdebbb9a49145f
# Parent  5f7b3d06d94c6aec44bfd4a7bfb6f6751182b4ed
asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]





I'm getting link errors on x86_64 from this series:


error LNK2017: 'ADDR32' relocation to 'tab_LumaCoeffV' invalid without 
/LARGEADDRESSAWARE:NO



In general, I think we should drop all of the interpolation merging while we 
get all the assembly completed for motion compensation.  When the assembly is 
all together, we can experiment and figure out if it makes sense to re-merge 
some of them back together.

From 06a4e39a9981a388226419d246c915589a567988 Mon Sep 17 00:00:00 2001
From: Min Chen <[email protected]>
Date: Tue, 29 Oct 2013 12:48:02 +0800
Subject: [PATCH] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

---
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/ipfilter8.asm      |  124 ++++++++++++++++++++++++++++++++++
 source/common/x86/ipfilter8.h        |    1 +
 source/test/ipfilterharness.cpp      |   10 ++-
 4 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/source/common/x86/asm-primitives.cpp 
b/source/common/x86/asm-primitives.cpp
index 0a3174f..439c532 100644
--- a/source/common/x86/asm-primitives.cpp
+++ b/source/common/x86/asm-primitives.cpp
@@ -282,6 +282,7 @@ void Setup_Assembly_Primitives(EncoderPrimitives &p, int 
cpuMask)
         p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+        p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm
index d559986..dbd6036 100644
--- a/source/common/x86/ipfilter8.asm
+++ b/source/common/x86/ipfilter8.asm
@@ -774,3 +774,127 @@ cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
     jnz         .loopV
 
     RET
+
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t 
dstStride, int width, int height, const int coeffIdx);
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+
+%if ARCH_X86_64
+cglobal interp_8tap_v_sp, 4, 7+5, 8
+%define tmp_r0      r7
+%define tmp_r2      r8
+%define tmp_r3      r9
+%define tmp_r4d     r10d
+%define tmp_6rows   r11
+
+%else ; ARCH_X86_64 = 0
+
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(5*4)
+%define tmp_r0      [(rsp + 0 * 4)]
+%define tmp_r2      [(rsp + 1 * 4)]
+%define tmp_r3      [(rsp + 2 * 4)]
+%define tmp_r4d     [(rsp + 3 * 4)]
+%define tmp_6rows   [(rsp + 4 * 4)]
+%endif ; ARCH_X86_64
+
+    mov         r4d,        r4m
+    mov         r5d,        r5m
+
+    mov         tmp_r4d, r4d
+    mov         tmp_r2, r2
+
+    ; load coeff table
+    mov         r6d,        r6m
+    shl         r6,         6
+    lea         r4,         [tab_LumaCoeffV]
+    lea         r6,         [r4 + r6]
+
+    ; move to -3
+    lea         r1, [r1 * 2]
+    lea         r4, [r1 + r1 * 2]
+    sub         r0, r4
+    lea         r4, [r4 * 2]
+    mov         tmp_6rows, r4
+
+.loopH:
+
+    ; load width
+    mov         r4d, tmp_r4d
+
+    ; save old src
+    mov         tmp_r0, r0
+
+.loopW:
+
+    movu        m0, [r0]
+    movu        m1, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m2, m0, m1
+    pmaddwd     m2, [r6 + 0 * 16]
+    punpckhwd   m0, m1
+    pmaddwd     m0, [r6 + 0 * 16]
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 1 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 1 * 16]
+    paddd       m0, m3
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 2 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 2 * 16]
+    paddd       m0, m3
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 3 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 3 * 16]
+    paddd       m0, m3
+
+    paddd       m2, [tab_c_526336]
+    paddd       m0, [tab_c_526336]
+    psrad       m2, 12
+    psrad       m0, 12
+    packssdw    m2, m0
+    packuswb    m2, m2
+
+    ; move to next 8 col
+    sub         r0, tmp_6rows
+
+    sub         r4, 8
+    jl          .width4
+    movq        [r2], m2
+    je          .nextH
+    lea         r0, [r0 + 16]
+    lea         r2, [r2 + 8]
+    jmp         .loopW
+
+.width4:
+    movd        [r2], m2
+    lea         r0, [r0 + 4]
+
+.nextH:
+    ; move to next row
+    mov         r0, tmp_r0
+    lea         r0, [r0 + r1]
+    add         tmp_r2, r3
+    mov         r2, tmp_r2
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
diff --git a/source/common/x86/ipfilter8.h b/source/common/x86/ipfilter8.h
index 7f93ad8..9ce0188 100644
--- a/source/common/x86/ipfilter8.h
+++ b/source/common/x86/ipfilter8.h
@@ -89,6 +89,7 @@ CHROMA_FILTERS(_sse4);
 LUMA_FILTERS(_sse4);
 
 void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * 
dst, intptr_t dstStride, int idxX, int idxY);
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int width, int height, const int coeffIdx);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_LUMA_FUNC_DEF
diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp
index b7eda18..4a8f19f 100644
--- a/source/test/ipfilterharness.cpp
+++ b/source/test/ipfilterharness.cpp
@@ -164,6 +164,8 @@ bool 
IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t
     int rand_width = rand() % 100;                  // Randomly generated Width
     int16_t rand_val, rand_srcStride, rand_dstStride;
 
+    rand_width &= ~3;
+
     for (int i = 0; i <= 100; i++)
     {
         memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize output 
buffer to zero
@@ -173,16 +175,16 @@ bool 
IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t
         rand_srcStride = rand() % 100;              // Randomly generated 
srcStride
         rand_dstStride = rand() % 100;              // Randomly generated 
dstStride
 
-        opt(short_buff + 3 * rand_srcStride,
+        ref(short_buff + 3 * rand_srcStride,
             rand_srcStride,
-            IPF_vec_output_p,
+            IPF_C_output_p,
             rand_dstStride,
             rand_width,
             rand_height, rand_val
             );
-        ref(short_buff + 3 * rand_srcStride,
+        opt(short_buff + 3 * rand_srcStride,
             rand_srcStride,
-            IPF_C_output_p,
+            IPF_vec_output_p,
             rand_dstStride,
             rand_width,
             rand_height, rand_val
-- 
1.7.9.msysgit.0

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH 4 of 4] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

Reply via email to