# HG changeset patch
# User Nabajit Deka
# Date 1384339787 -19800
#      Wed Nov 13 16:19:47 2013 +0530
# Node ID 31192cf36593bce97071d9f252ca9a2c14ca406d
# Parent  b5be1a9259e686aa8d0bc9351cb35477c0ab5b0e
asm: routines for chroma vps filter functions for 6x8 and 12x16 block sizes.

diff -r b5be1a9259e6 -r 31192cf36593 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Nov 13 16:02:48 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Nov 13 16:19:47 2013 +0530
@@ -4333,3 +4333,211 @@
 FILTER_V_PS_W8_H8_H16_H32 8, 8
 FILTER_V_PS_W8_H8_H16_H32 8, 16
 FILTER_V_PS_W8_H8_H16_H32 8, 32
+
+;------------------------------------------------------------------------------------------------------------
+;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; 4-tap vertical filter for a 6x8 block of 8-bit pixels. Each result is the
+; tap-weighted sum of four vertically adjacent rows minus tab_c_8192, stored
+; as int16_t. Filtering starts one row above src.
+;
+; r0 = src row pointer, r1 = srcStride, r2 = dst, r3 = dstStride in bytes
+; r4 = coeffIdx during setup, then the loop counter; r5/r6 = scratch pointers
+; m6/m5 = multipliers for the first/second row pair, m4 = tab_c_8192, m7 = scratch
+; NOTE(review): every row is loaded with movq, so 8 source bytes are read per
+; row although only 6 results are stored - confirm callers allow the over-read.
+;------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_6x8, 4, 7, 8
+
+    mov         r4d, r4m                    ; r4d = coeffIdx
+    sub         r0, r1                      ; step back one row: taps span rows -1..+2
+    add         r3d, r3d                    ; elements -> bytes; 32-bit add, assumes dstStride >= 0
+
+%ifdef PIC
+    lea         r6, [tab_ChromaCoeff]
+    movd        m5, [r6 + r4 * 4]           ; 4 bytes of taps selected by coeffIdx
+%else
+    movd        m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    pshufb      m6, m5, [tab_Vm]            ; presumably taps 0/1 replicated (tab_Vm not shown here)
+    pshufb      m5, [tab_Vm + 16]           ; presumably taps 2/3 replicated
+    mova        m4, [tab_c_8192]            ; constant subtracted from every result
+
+    mov         r4d, 8/4                    ; 8 rows, 4 rows per iteration
+
+.loop:
+    ; output row 0 <- source rows 0..3 (rows counted from r0)
+    movq        m0, [r0]
+    movq        m1, [r0 + r1]
+    movq        m2, [r0 + 2 * r1]
+    lea         r5, [r0 + 2 * r1]
+    movq        m3, [r5 + r1]
+
+    punpcklbw   m0, m1                      ; interleave rows 0/1 byte-wise
+    punpcklbw   m7, m2, m3                  ; interleave rows 2/3 byte-wise
+
+    pmaddubsw   m0, m6                      ; row0*tap + row1*tap per word
+    pmaddubsw   m7, m5                      ; row2*tap + row3*tap per word
+
+    paddw       m0, m7
+    psubw       m0, m4
+
+    movh        [r2], m0                    ; results 0-3
+    pshufd      m0, m0, 2                   ; move results 4-5 into the low dword
+    movd        [r2 + 8], m0                ; results 4-5
+
+    ; output row 1 <- source rows 1..4
+    movq        m0, [r0 + 4 * r1]
+
+    punpcklbw   m1, m2
+    punpcklbw   m7, m3, m0
+
+    pmaddubsw   m1, m6
+    pmaddubsw   m7, m5
+
+    paddw       m1, m7
+    psubw       m1, m4
+
+    movh        [r2 + r3], m1
+    pshufd      m1, m1, 2
+    movd        [r2 + r3 + 8], m1
+
+    ; output row 2 <- source rows 2..5
+    lea         r6, [r0 + 4 * r1]
+    movq        m1, [r6 + r1]
+
+    punpcklbw   m2, m3
+    punpcklbw   m7, m0, m1
+
+    pmaddubsw   m2, m6
+    pmaddubsw   m7, m5
+
+    paddw       m2, m7
+    psubw       m2, m4
+
+    movh        [r2 + 2 * r3], m2
+    pshufd      m2, m2, 2
+    movd        [r2 + 2 * r3 + 8], m2
+
+    ; output row 3 <- source rows 3..6
+    movq        m2, [r6 + 2 * r1]
+
+    punpcklbw   m3, m0
+    punpcklbw   m1, m2
+
+    pmaddubsw   m3, m6
+    pmaddubsw   m1, m5
+
+    paddw       m3, m1
+    psubw       m3, m4
+
+    lea         r5, [r2 + 2 * r3]
+    movh        [r5 + r3], m3
+    pshufd      m3, m3, 2
+    movd        [r5 + r3 + 8], m3
+
+    lea         r0, [r0 + 4 * r1]           ; advance 4 source rows
+    lea         r2, [r2 + 4 * r3]           ; advance 4 destination rows
+
+    dec         r4d
+    jnz         .loop
+    RET
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; 4-tap vertical filter for a 12x16 block of 8-bit pixels. Each result is the
+; tap-weighted sum of four vertically adjacent rows minus tab_c_8192, stored
+; as int16_t. Filtering starts one row above src.
+; r0 = src row pointer, r1 = srcStride, r2 = dst, r3 = dstStride in bytes
+; r4 = coeffIdx during setup, then the loop counter; r5 = scratch pointer
+; m1/m0 = multipliers for the first/second row pair, m7 = tab_c_8192
+; NOTE(review): every row is loaded with movu, so 16 source bytes are read per
+; row although only 12 results are stored - confirm callers allow the over-read.
+;---------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_12x16, 4, 6, 8
+
+    mov         r4d, r4m                    ; r4d = coeffIdx
+    sub         r0, r1                      ; step back one row: taps span rows -1..+2
+    add         r3d, r3d                    ; elements -> bytes; 32-bit add, assumes dstStride >= 0
+
+%ifdef PIC
+    lea         r5, [tab_ChromaCoeff]
+    movd        m0, [r5 + r4 * 4]           ; 4 bytes of taps selected by coeffIdx
+%else
+    movd        m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    pshufb      m1, m0, [tab_Vm]            ; presumably taps 0/1 replicated (tab_Vm not shown here)
+    pshufb      m0, [tab_Vm + 16]           ; presumably taps 2/3 replicated
+    mova        m7, [tab_c_8192]            ; constant subtracted from every result
+
+    mov         r4d, 16/2                   ; 16 rows, 2 rows per iteration
+
+.loop:
+    ; output row 0 <- source rows 0..3 (rows counted from r0)
+    movu        m2, [r0]
+    movu        m3, [r0 + r1]
+
+    punpcklbw   m4, m2, m3                  ; rows 0/1, columns 0-7
+    punpckhbw   m2, m3                      ; rows 0/1, columns 8-15
+
+    pmaddubsw   m4, m1
+    pmaddubsw   m2, m1
+
+    movu        m5, [r0 + 2 * r1]
+    lea         r5, [r0 + 2 * r1]           ; r5 = row 2, reused for the second output row
+    movu        m3, [r5 + r1]
+
+    punpcklbw   m6, m5, m3                  ; rows 2/3, columns 0-7
+    punpckhbw   m5, m3                      ; rows 2/3, columns 8-15
+
+    pmaddubsw   m6, m0
+    pmaddubsw   m5, m0
+
+    paddw       m4, m6
+    paddw       m2, m5
+
+    psubw       m4, m7
+    psubw       m2, m7
+
+    movu        [r2], m4                    ; results 0-7
+    movh        [r2 + 16], m2               ; results 8-11
+
+    ; output row 1 <- source rows 1..4
+    movu        m2, [r0 + r1]
+    movu        m3, [r0 + 2 * r1]
+
+    punpcklbw   m4, m2, m3
+    punpckhbw   m2, m3
+
+    pmaddubsw   m4, m1
+    pmaddubsw   m2, m1
+
+    movu        m5, [r5 + r1]               ; r5 still equals r0 + 2 * r1 from above
+    movu        m3, [r5 + 2 * r1]
+
+    punpcklbw   m6, m5, m3
+    punpckhbw   m5, m3
+
+    pmaddubsw   m6, m0
+    pmaddubsw   m5, m0
+
+    paddw       m4, m6
+    paddw       m2, m5
+
+    psubw       m4, m7
+    psubw       m2, m7
+
+    movu        [r2 + r3], m4
+    movh        [r2 + r3 + 16], m2
+
+    lea         r0, [r0 + 2 * r1]           ; advance 2 source rows
+    lea         r2, [r2 + 2 * r3]           ; advance 2 destination rows
+
+    dec         r4d
+    jnz         .loop
+    RET
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel