Right.
At 2015-06-10 22:55:20,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1433948100 25200 ># Node ID c9debeec039e01c501884ab10dc9e32f55092b73 ># Parent 6245476add8f0562e3ccb657f572ff94fe96adf0 >asm: dst4 sse2 8bpp and 10bpp > >This replaces c code. > >64-bit > >dst4x4 1.43x 1575.01 2249.96 > >32-bit > >dst4x4 2.10x 1452.65 3052.47 > >10bpp > >dst4x4 1.40x 1567.49 2192.50 > >diff -r 6245476add8f -r c9debeec039e source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Wed Jun 10 11:54:27 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Wed Jun 10 07:55:00 2015 -0700 >@@ -930,6 +930,7 @@ > p.cu[BLOCK_8x8].idct = x265_idct8_sse2; > > p.idst4x4 = x265_idst4_sse2; >+ p.dst4x4 = x265_dst4_sse2; > > LUMA_VSS_FILTERS(sse2); > >@@ -2049,6 +2050,7 @@ > p.cu[BLOCK_8x8].idct = x265_idct8_sse2; > #endif > p.idst4x4 = x265_idst4_sse2; >+ p.dst4x4 = x265_dst4_sse2; > > p.planecopy_sp = x265_downShift_16_sse2; > ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2); >diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.asm >--- a/source/common/x86/dct8.asm Wed Jun 10 11:54:27 2015 +0530 >+++ b/source/common/x86/dct8.asm Wed Jun 10 07:55:00 2015 -0700 >@@ -582,6 +582,146 @@ > ;------------------------------------------------------ > ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) > ;------------------------------------------------------ >+INIT_XMM sse2 >+%if ARCH_X86_64 >+cglobal dst4, 3, 4, 8+4 >+ %define coef0 m8 >+ %define coef1 m9 >+ %define coef2 m10 >+ %define coef3 m11 >+%else ; ARCH_X86_64 = 0 >+cglobal dst4, 3, 4, 8 >+ %define coef0 [r3 + 0 * 16] >+ %define coef1 [r3 + 1 * 16] >+ %define coef2 [r3 + 2 * 16] >+ %define coef3 [r3 + 3 * 16] >+%endif ; ARCH_X86_64 >+ >+%if BIT_DEPTH == 8 >+ %define DST_SHIFT 1 >+ mova m5, [pd_1] >+%elif BIT_DEPTH == 10 >+ %define DST_SHIFT 3 >+ mova m5, [pd_4] >+%endif >+ add r2d, r2d >+ lea r3, [tab_dst4] >+%if ARCH_X86_64 >+ mova coef0, [r3 + 0 * 
16] >+ mova coef1, [r3 + 1 * 16] >+ mova coef2, [r3 + 2 * 16] >+ mova coef3, [r3 + 3 * 16] >+%endif >+ movh m0, [r0 + 0 * r2] ; load >+ movhps m0, [r0 + 1 * r2] >+ lea r0, [r0 + 2 * r2] >+ movh m1, [r0] >+ movhps m1, [r0 + r2] >+ pmaddwd m2, m0, coef0 ; DST1 >+ pmaddwd m3, m1, coef0 >+ pshufd m6, m2, q2301 >+ pshufd m7, m3, q2301 >+ paddd m2, m6 >+ paddd m3, m7 >+ pshufd m2, m2, q3120 >+ pshufd m3, m3, q3120 >+ punpcklqdq m2, m3 >+ paddd m2, m5 >+ psrad m2, DST_SHIFT >+ pmaddwd m3, m0, coef1 >+ pmaddwd m4, m1, coef1 >+ pshufd m6, m4, q2301 >+ pshufd m7, m3, q2301 >+ paddd m4, m6 >+ paddd m3, m7 >+ pshufd m4, m4, q3120 >+ pshufd m3, m3, q3120 >+ punpcklqdq m3, m4 >+ paddd m3, m5 >+ psrad m3, DST_SHIFT >+ packssdw m2, m3 ; m2 = T70 >+ pmaddwd m3, m0, coef2 >+ pmaddwd m4, m1, coef2 >+ pshufd m6, m4, q2301 >+ pshufd m7, m3, q2301 >+ paddd m4, m6 >+ paddd m3, m7 >+ pshufd m4, m4, q3120 >+ pshufd m3, m3, q3120 >+ punpcklqdq m3, m4 >+ paddd m3, m5 >+ psrad m3, DST_SHIFT >+ pmaddwd m0, coef3 >+ pmaddwd m1, coef3 >+ pshufd m6, m0, q2301 >+ pshufd m7, m1, q2301 >+ paddd m0, m6 >+ paddd m1, m7 >+ pshufd m0, m0, q3120 >+ pshufd m1, m1, q3120 >+ punpcklqdq m0, m1 >+ paddd m0, m5 >+ psrad m0, DST_SHIFT >+ packssdw m3, m0 ; m3 = T71 >+ mova m5, [pd_128] >+ >+ pmaddwd m0, m2, coef0 ; DST2 >+ pmaddwd m1, m3, coef0 >+ pshufd m6, m0, q2301 >+ pshufd m7, m1, q2301 >+ paddd m0, m6 >+ paddd m1, m7 >+ pshufd m0, m0, q3120 >+ pshufd m1, m1, q3120 >+ punpcklqdq m0, m1 >+ paddd m0, m5 >+ psrad m0, 8 >+ >+ pmaddwd m4, m2, coef1 >+ pmaddwd m1, m3, coef1 >+ pshufd m6, m4, q2301 >+ pshufd m7, m1, q2301 >+ paddd m4, m6 >+ paddd m1, m7 >+ pshufd m4, m4, q3120 >+ pshufd m1, m1, q3120 >+ punpcklqdq m4, m1 >+ paddd m4, m5 >+ psrad m4, 8 >+ packssdw m0, m4 >+ movu [r1 + 0 * 16], m0 >+ >+ pmaddwd m0, m2, coef2 >+ pmaddwd m1, m3, coef2 >+ pshufd m6, m0, q2301 >+ pshufd m7, m1, q2301 >+ paddd m0, m6 >+ paddd m1, m7 >+ pshufd m0, m0, q3120 >+ pshufd m1, m1, q3120 >+ punpcklqdq m0, m1 >+ paddd m0, m5 >+ 
psrad m0, 8 >+ >+ pmaddwd m2, coef3 >+ pmaddwd m3, coef3 >+ pshufd m6, m2, q2301 >+ pshufd m7, m3, q2301 >+ paddd m2, m6 >+ paddd m3, m7 >+ pshufd m2, m2, q3120 >+ pshufd m3, m3, q3120 >+ punpcklqdq m2, m3 >+ paddd m2, m5 >+ psrad m2, 8 >+ packssdw m0, m2 >+ movu [r1 + 1 * 16], m0 >+ >+ RET >+ >+;------------------------------------------------------ >+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) >+;------------------------------------------------------ > INIT_XMM ssse3 > %if ARCH_X86_64 > cglobal dst4, 3, 4, 8+2 >diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.h >--- a/source/common/x86/dct8.h Wed Jun 10 11:54:27 2015 +0530 >+++ b/source/common/x86/dct8.h Wed Jun 10 07:55:00 2015 -0700 >@@ -25,6 +25,7 @@ > #define X265_DCT8_H > void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride); > void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride); >+void x265_dst4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride); > void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride); > void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); > void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride); >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
