On 05/06/2015 01:29 PM, chen wrote:
At 2015-05-07 03:45:35,[email protected] wrote:
># HG changeset patch
># User David T Yuen <[email protected]>
># Date 1430940440 25200
># Node ID 4690c9aa24caa1adb665355803d4c308a124ec96
># Parent  87d6724649df0157786c4210f0caebf961b31341
>asm: interp_4tap_vert_pp sse2
>
>This replaces c code for 2x4, 2x8 and 2x16
>
>64-bit
>
>./test/TestBench --testbench interp | grep vpp
>chroma_vpp[  2x4]   1.76x    659.96          1159.98
>chroma_vpp[  2x8]   1.68x    1232.42         2067.47
>chroma_vpp[  2x8]   1.69x    1226.56         2067.48
>chroma_vpp[ 2x16]   1.92x    2352.47         4509.99
>
>32-bit
>
>./test/TestBench --testbench interp | grep vpp
>chroma_vpp[  2x4]   2.00x    809.98          1617.42
>chroma_vpp[  2x8]   2.13x    1324.95         2817.42
>chroma_vpp[  2x8]   2.13x    1324.99         2817.45
>chroma_vpp[ 2x16]   2.61x    2439.97         6358.08
>
>diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp  Tue May 05 17:05:22 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp  Wed May 06 12:27:20 2015 -0700
>@@ -1356,6 +1356,10 @@
>         ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
>         p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
>         p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = 
x265_interp_4tap_vert_pp_2x4_sse2;
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = 
x265_interp_4tap_vert_pp_2x8_sse2;
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vpp = 
x265_interp_4tap_vert_pp_2x8_sse2;
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = 
x265_interp_4tap_vert_pp_2x16_sse2;
>
>         //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm       Tue May 05 17:05:22 2015 +0530
>+++ b/source/common/x86/ipfilter8.asm       Wed May 06 12:27:20 2015 -0700
>@@ -74,6 +74,15 @@
>                         dw -2, 16, 54, -4
>                         dw -2, 10, 58, -2
>
>+const tabw_ChromaCoeffV, times 2 dw  0,  0, 64,  0
If you modify the instructions, you can share this table with tab_ChromaCoeff
(and rename it to pw_*).
>+                         times 2 dw -2, 10, 58, -2
>+                         times 2 dw -4, 16, 54, -2
>+                         times 2 dw -6, 28, 46, -4
>+                         times 2 dw -4, 36, 36, -4
>+                         times 2 dw -4, 46, 28, -6
>+                         times 2 dw -2, 54, 16, -4
>+                         times 2 dw -2, 58, 10, -2
>+
> const tab_ChromaCoeff_V, times 8 db 0, 64
>                          times 8 db 0,  0
>
>@@ -296,6 +305,7 @@
>
> SECTION .text
>
>+cextern pb_0
> cextern pb_128
> cextern pw_1
> cextern pw_32
>@@ -1043,6 +1053,129 @@
>     IPFILTER_LUMA_sse2 16, 64, ps
>
> ;-----------------------------------------------------------------------------
>+; void interp_4tap_vert_pp_2xn(pixel *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_V4_W2_H4_sse2 1
>+INIT_XMM sse2
>+%if ARCH_X86_64
>+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 9
>+%define PB_0 m8
>+    pxor        m8,        m8
>+%else
>+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 8
>+%define PB_0 [pb_0]
Maybe this can be removed by using some of the instruction magic shown below.
>+%endif
>+
>+    mov         r4d,       r4m
>+    sub         r0,        r1
>+    add         r4d,       r4d
>+
>+%ifdef PIC
>+    lea         r5,        [tabw_ChromaCoeffV]
>+    mova        m0,        [r5 + r4 * 8]
>+%else
>+    mova        m0,        [tabw_ChromaCoeffV + r4 * 8]
>+%endif
>+
>+    mova        m1,        [pw_32]
>+    lea         r5,        [3 * r1]
>+
>+%assign x 1
>+%rep %1/4
>+    movd        m2,        [r0]
>+    movd        m3,        [r0 + r1]
>+    movd        m4,        [r0 + 2 * r1]
>+    movd        m5,        [r0 + r5]
>+
>+    punpcklbw   m2,        m3
>+    punpcklbw   m6,        m4,        m5
>+    punpcklbw   m2,        m6
If you use punpcklwd, you can share the table above.
If you are referring to something like the avx2 version of this primitive, that version also uses pshufb, which is an ssse3 instruction. Adapting this to <ssse3 is going to be slower.

By the way, the sse4 version could be improved by using a byte version of the table I added.
>+
>+    punpcklbw   m2,        PB_0
This converts bytes to words; it can be replaced by PUNPCKLBW+PSRLW.
This performs significantly slower in the testbench.
>+    pmaddwd     m2,        m0
>+
>+    lea         r0,        [r0 + 4 * r1]
>+    movd        m6,        [r0]
>+
>+    punpcklbw   m3,        m4
>+    punpcklbw   m7,        m5,        m6
>+    punpcklbw   m3,        m7
>+
>+    punpcklbw   m3,        PB_0
>+    pmaddwd     m3,        m0
>+
>+    packssdw    m2,        m3
>+    pshuflw     m3,        m2,          q2301
>+    pshufhw     m3,        m3,          q2301
>+    paddw       m2,        m3
>+    psrld       m2,        16
>+
>+    movd        m7,        [r0 + r1]
>+
>+    punpcklbw   m4,        m5
>+    punpcklbw   m3,        m6,        m7
>+    punpcklbw   m4,        m3
>+
>+    punpcklbw   m4,        PB_0
>+    pmaddwd     m4,        m0
>+
>+    movd        m3,        [r0 + 2 * r1]
>+
>+    punpcklbw   m5,        m6
>+    punpcklbw   m7,        m3
>+    punpcklbw   m5,        m7
>+
>+    punpcklbw   m5,        PB_0
>+    pmaddwd     m5,        m0
>+
>+    packssdw    m4,        m5
>+    pshuflw     m5,        m4,          q2301
>+    pshufhw     m5,        m5,          q2301
>+    paddw       m4,        m5
>+    psrld       m4,        16
>+
>+    packssdw    m2,        m4
>+    paddw       m2,        m1
>+    psraw       m2,        6
>+    packuswb    m2,        m2
>+
>+%if ARCH_X86_64
>+    movq        r4,        m2
>+    mov         [r2],      r4w
>+    shr         r4,        16
>+    mov         [r2 + r3], r4w
>+    lea         r2,        [r2 + 2 * r3]
>+    shr         r4,        16
>+    mov         [r2],      r4w
>+    shr         r4,        16
>+    mov         [r2 + r3], r4w
>+%else
>+    movd        r4,        m2
>+    mov         [r2],      r4w
>+    shr         r4,        16
>+    mov         [r2 + r3], r4w
>+    lea         r2,        [r2 + 2 * r3]
>+    psrldq      m2,        4
>+    movd        r4,        m2
>+    mov         [r2],      r4w
>+    shr         r4,        16
>+    mov         [r2 + r3], r4w
>+%endif
>+
>+%if x < %1/4
>+    lea         r2,        [r2 + 2 * r3]
>+%endif
>+%assign x x+1
>+%endrep
>+    RET
>+
>+%endmacro
>+
>+    FILTER_V4_W2_H4_sse2 4
>+    FILTER_V4_W2_H4_sse2 8
>+    FILTER_V4_W2_H4_sse2 16
>+
>+;-----------------------------------------------------------------------------
> ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
>diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h Tue May 05 17:05:22 2015 +0530
>+++ b/source/common/x86/ipfilter8.h Wed May 06 12:27:20 2015 -0700
>@@ -905,6 +905,9 @@
> void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t 
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t 
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, 
pixel* dst, intptr_t dstStride, int idxX, int idxY);
>+void x265_interp_4tap_vert_pp_2x4_sse2(const pixel *src, intptr_t srcStride, 
pixel *dst, intptr_t dstStride, int coeffIdx);
>+void x265_interp_4tap_vert_pp_2x8_sse2(const pixel *src, intptr_t srcStride, 
pixel *dst, intptr_t dstStride, int coeffIdx);
>+void x265_interp_4tap_vert_pp_2x16_sse2(const pixel *src, intptr_t srcStride, 
pixel *dst, intptr_t dstStride, int coeffIdx);
> #undef LUMA_FILTERS
> #undef LUMA_SP_FILTERS
> #undef LUMA_SS_FILTERS
>_______________________________________________
>x265-devel mailing list
>[email protected]
>https://mailman.videolan.org/listinfo/x265-devel


_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to