# HG changeset patch
# User Nabajit Deka
# Date 1384517817 -19800
# Fri Nov 15 17:46:57 2013 +0530
# Node ID 351229c80f52d580d24853f64f79e42d47617f87
# Parent b918110fd337178a1cf3616989c65a1e0ed14776
asm: routines for luma vss filter functions for all block sizes.

diff -r b918110fd337 -r 351229c80f52 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Nov 15 17:45:00 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Fri Nov 15 17:46:57 2013 +0530
@@ -5101,3 +5101,166 @@
     FILTER_VER_CHROMA_SS_W8_H2 8, 8
     FILTER_VER_CHROMA_SS_W8_H2 8, 16
     FILTER_VER_CHROMA_SS_W8_H2 8, 32
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; 8-tap vertical filter over int16_t samples for a block %1 columns wide and %2 rows tall.
+; Each inner iteration produces one 4x4 tile, so %1 and %2 are consumed in units of 4.
+; Every output is the sum of 8 tap products shifted right by 6 (arithmetic, no rounding
+; term) and packed to int16_t with signed saturation.
+;
+; Register use:
+;   r0 = src read pointer (moved up 3 rows on entry), r1 = srcStride in bytes
+;   r2 = dst write pointer, r3 = dstStride in bytes
+;   r4 = coeffIdx * 64 on entry, then the number of 4-column tiles left in the row group
+;   r5 = scratch, r6 = address of the coefficient rows selected by coeffIdx
+;   byte [rsp] = number of 4-row groups left
+;   m0-m3 = accumulators for output rows 1-4, m4/m5 = interleaved source row pairs, m6 = temp
+;
+; NOTE(review): tab_LumaCoeffV is defined outside this hunk. It is read as four 16-byte rows
+; per coeffIdx used directly as pmaddwd memory operands, so each row presumably holds one tap
+; pair replicated four times and the table must be 16-byte aligned -- confirm at its definition.
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SS 2
+INIT_XMM sse2
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+
+    ; The strides are doubled to turn int16_t element counts into byte offsets. Full-width
+    ; registers are used because both strides are intptr_t (a 32-bit add would drop the sign).
+    add       r1, r1
+    add       r3, r3
+    lea       r5, [r1 + 2 * r1]
+    sub       r0, r5                           ;start 3 rows above the first output row
+    shl       r4d, 6                           ;coeffIdx * 64 = byte offset of its 4 x 16-byte rows
+
+%ifdef PIC
+    lea       r5, [tab_LumaCoeffV]
+    lea       r6, [r5 + r4]
+%else
+    lea       r6, [tab_LumaCoeffV + r4]
+%endif
+
+    mov       byte [rsp], %2/4                 ;4-row groups to process
+.loopH:
+    mov       r4d, (%1/4)                      ;4-column tiles per row group
+.loopW:
+    movq      m0, [r0]
+    movq      m1, [r0 + r1]
+    punpcklwd m0, m1                           ;m0=[0 1]
+    pmaddwd   m0, [r6 + 0 *16]                 ;m0=[0+1]  Row1
+
+    movq      m4, [r0 + 2 * r1]
+    punpcklwd m1, m4                           ;m1=[1 2]
+    pmaddwd   m1, [r6 + 0 *16]                 ;m1=[1+2]  Row2
+
+    lea       r0, [r0 + 2 * r1]
+    movq      m5, [r0 + r1]
+    punpcklwd m4, m5                           ;m4=[2 3]
+    pmaddwd   m2, m4, [r6 + 0 *16]             ;m2=[2+3]  Row3
+    pmaddwd   m4, [r6 + 1 * 16]
+    paddd     m0, m4                           ;m0=[0+1+2+3]  Row1
+
+    movq      m4, [r0 + 2 * r1]
+    punpcklwd m5, m4                           ;m5=[3 4]
+    pmaddwd   m3, m5, [r6 + 0 *16]             ;m3=[3+4]  Row4
+    pmaddwd   m5, [r6 + 1 * 16]
+    paddd     m1, m5                           ;m1 = [1+2+3+4]  Row2
+
+    lea       r0, [r0 + 2 * r1]
+    movq      m5, [r0 + r1]
+    punpcklwd m4, m5                           ;m4=[4 5]
+    pmaddwd   m6, m4, [r6 + 1 * 16]
+    paddd     m2, m6                           ;m2=[2+3+4+5]  Row3
+    pmaddwd   m4, [r6 + 2 * 16]
+    paddd     m0, m4                           ;m0=[0+1+2+3+4+5]  Row1
+
+    movq      m4, [r0 + 2 * r1]
+    punpcklwd m5, m4                           ;m5=[5 6]
+    pmaddwd   m6, m5, [r6 + 1 * 16]
+    paddd     m3, m6                           ;m3=[3+4+5+6]  Row4
+    pmaddwd   m5, [r6 + 2 * 16]
+    paddd     m1, m5                           ;m1=[1+2+3+4+5+6]  Row2
+
+    lea       r0, [r0 + 2 * r1]
+    movq      m5, [r0 + r1]
+    punpcklwd m4, m5                           ;m4=[6 7]
+    pmaddwd   m6, m4, [r6 + 2 * 16]
+    paddd     m2, m6                           ;m2=[2+3+4+5+6+7]  Row3
+    pmaddwd   m4, [r6 + 3 * 16]
+    paddd     m0, m4                           ;m0=[0+1+2+3+4+5+6+7]  Row1 end
+    psrad     m0, 6
+
+    movq      m4, [r0 + 2 * r1]
+    punpcklwd m5, m4                           ;m5=[7 8]
+    pmaddwd   m6, m5, [r6 + 2 * 16]
+    paddd     m3, m6                           ;m3=[3+4+5+6+7+8]  Row4
+    pmaddwd   m5, [r6 + 3 * 16]
+    paddd     m1, m5                           ;m1=[1+2+3+4+5+6+7+8]  Row2 end
+    psrad     m1, 6
+
+    packssdw  m0, m1                           ;low half = Row1, high half = Row2
+
+    movlps    [r2], m0
+    movhps    [r2 + r3], m0
+
+    lea       r0, [r0 + 2 * r1]
+    movq      m5, [r0 + r1]
+    punpcklwd m4, m5                           ;m4=[8 9]
+    pmaddwd   m4, [r6 + 3 * 16]
+    paddd     m2, m4                           ;m2=[2+3+4+5+6+7+8+9]  Row3 end
+    psrad     m2, 6
+
+    movq      m4, [r0 + 2 * r1]
+    punpcklwd m5, m4                           ;m5=[9 10]
+    pmaddwd   m5, [r6 + 3 * 16]
+    paddd     m3, m5                           ;m3=[3+4+5+6+7+8+9+10]  Row4 end
+    psrad     m3, 6
+
+    packssdw  m2, m3                           ;low half = Row3, high half = Row4
+
+    movlps    [r2 + 2 * r3], m2
+    lea       r5, [r3 + 2 * r3]
+    movhps    [r2 + r5], m2
+
+    lea       r5, [8 * r1 - 2 * 4]             ;r0 moved down 8 rows in this tile
+    sub       r0, r5                           ;back to the tile's first row, 4 samples to the right
+    add       r2, 2 * 4                        ;next 4-column tile in dst
+
+    dec       r4d
+    jnz       .loopW
+
+    lea       r0, [r0 + 4 * r1 - 2 * %1]       ;down 4 rows, back to the first column
+    lea       r2, [r2 + 4 * r3 - 2 * %1]
+
+    dec       byte [rsp]
+    jnz       .loopH
+
+    RET
+%endmacro
+
+    FILTER_VER_LUMA_SS 4, 4
+    FILTER_VER_LUMA_SS 8, 8
+    FILTER_VER_LUMA_SS 8, 4
+    FILTER_VER_LUMA_SS 4, 8
+    FILTER_VER_LUMA_SS 16, 16
+    FILTER_VER_LUMA_SS 16, 8
+    FILTER_VER_LUMA_SS 8, 16
+    FILTER_VER_LUMA_SS 16, 12
+    FILTER_VER_LUMA_SS 12, 16
+    FILTER_VER_LUMA_SS 16, 4
+    FILTER_VER_LUMA_SS 4, 16
+    FILTER_VER_LUMA_SS 32, 32
+    FILTER_VER_LUMA_SS 32, 16
+    FILTER_VER_LUMA_SS 16, 32
+    FILTER_VER_LUMA_SS 32, 24
+    FILTER_VER_LUMA_SS 24, 32
+    FILTER_VER_LUMA_SS 32, 8
+    FILTER_VER_LUMA_SS 8, 32
+    FILTER_VER_LUMA_SS 64, 64
+    FILTER_VER_LUMA_SS 64, 32
+    FILTER_VER_LUMA_SS 32, 64
+    FILTER_VER_LUMA_SS 64, 48
+    FILTER_VER_LUMA_SS 48, 64
+    FILTER_VER_LUMA_SS 64, 16
+    FILTER_VER_LUMA_SS 16, 64
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
