At 2015-02-24 00:23:58,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1424706375 28800 ># Node ID c2eb94770f9b98d5bc5cf0e96d635e26c01ca5c6 ># Parent d179686d7b8d79a125b51fc3f8799152add0fd9f >asm: intrapred dc4 sse2 > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc4 sse4 > >./test/TestBench --testbench intrapred >Testing only harnesses that match name <intrapred> >Using random seed 54EB498B 8bpp >Testing primitives: SSE2 >Testing primitives: SSE3 > >Test performance improvement with full optimizations >== intrapred primitives == >intra_dc_4x4[f=0] 1.25x 193.40 242.35 >intra_dc_4x4[f=1] 1.23x 418.06 513.03 > >diff -r d179686d7b8d -r c2eb94770f9b source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Thu Feb 19 14:44:16 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Mon Feb 23 07:46:15 2015 -0800 >@@ -1070,6 +1070,7 @@ > ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2); > ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2); > ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); >+ p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; > > p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2; > p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2; >diff -r d179686d7b8d -r c2eb94770f9b source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Thu Feb 19 14:44:16 2015 -0800 >+++ b/source/common/x86/intrapred.h Mon Feb 23 07:46:15 2015 -0800 >@@ -26,6 +26,7 @@ > #ifndef X265_INTRAPRED_H > #define X265_INTRAPRED_H > >+void x265_intra_pred_dc4_sse2 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); > void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const > pixel*srcPix, int, int filter); > void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int 
filter); >diff -r d179686d7b8d -r c2eb94770f9b source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Thu Feb 19 14:44:16 2015 -0800 >+++ b/source/common/x86/intrapred8.asm Mon Feb 23 07:46:15 2015 -0800 >@@ -65,6 +65,8 @@ > pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24 > pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16 > >+INT257: dd 257, 0, 0, 0 >+INT4: dw 4, 0, 0, 0, 0, 0, 0, 0 It would be better to rename these constants to pdq_*. > const ang_table > %assign x 0 > %rep 32 >@@ -89,6 +91,71 @@ > ;--------------------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int > dirMode, int bFilter) > ;--------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc4, 5,5,3 >+ inc r2 >+ pxor m0, m0 >+ movu m1, [r2] >+ pshufd m1, m1, 0xF8 >+ psadbw m1, m0 ; m1 = sum >+ >+ test r4d, r4d >+ >+ paddw m1, [INT4] >+ psraw m1, 3 >+ movd r4d, m1 ; r4d = dc_val >+ pmullw m1, [INT257] >+ pshuflw m1, m1, 0x00 >+ >+ ; store DC 4x4 >+ lea r3, [r1 * 3] >+ movd [r0], m1 >+ movd [r0 + r1], m1 >+ movd [r0 + r1 * 2], m1 >+ movd [r0 + r3], m1 >+ >+ ; do DC filter >+ jz .end >+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2 >+ add r4d, r3d ; r4d = DC * 3 + 2 >+ movd m1, r4d >+ pshuflw m1, m1, 0 ; m1 = pixDCx3 >+ >+ ; filter top >+ movq m2, [r2] Are you using only the low 4 pixels here? 
>+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 >+ packuswb m2, m2 >+ movd [r0], m2 ; overwrite top-left pixel, we will >update it later >+ >+ ; filter top-left >+ movzx r4d, byte [r2 + 8] >+ add r3d, r4d >+ movzx r4d, byte [r2] >+ add r3d, r4d >+ shr r3d, 2 >+ mov [r0], r3b >+ >+ ; filter left >+ add r0, r1 >+ movq m2, [r2 + 9] >+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 >+ packuswb m2, m2 >+ movq r4, m2 >+ mov [r0], r4b >+ shr r4, 8 >+ mov [r0 + r1], r4b >+ shr r4, 8 >+ mov [r0 + r1 * 2], r4b >+.end: >+ RET >+ >+;--------------------------------------------------------------------------------------------- >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int >dirMode, int bFilter) >+;--------------------------------------------------------------------------------------------- > INIT_XMM sse4 > cglobal intra_pred_dc4, 5,5,3 > inc r2 >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
