At 2015-03-06 08:19:57,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1425594719 28800 ># Node ID 912c42dcb4d9b399515e6c1ed6be70db3bf5f675 ># Parent c5fa433ffda0a95889e99f4df787f3edc5880d0f >asm:intra pred dc32 sse2 > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc32 sse4 > >64-bit > >./test/TestBench --testbench intrapred | grep intra_dc_32x32 >intra_dc_32x32[f=0] 4.53x 1650.00 7474.94 > >32-bit > >./test/TestBench --testbench intrapred | grep intra_dc_32x32 >intra_dc_32x32[f=0] 7.79x 1749.94 13627.45 > >diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 13:54:48 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 14:31:59 2015 -0800 >@@ -1213,6 +1213,7 @@ > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; > p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2; >+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2; > > p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2; > >diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Thu Mar 05 13:54:48 2015 -0800 >+++ b/source/common/x86/intrapred.h Thu Mar 05 14:31:59 2015 -0800 >@@ -29,6 +29,7 @@ > void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const > pixel*srcPix, int, int filter); > void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const > pixel*srcPix, int, int filter); > void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const > pixel*srcPix, int, int filter); >+void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); > void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const > pixel*srcPix, int, int filter); > void x265_intra_pred_dc8_sse4(pixel* dst, 
intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); >diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Thu Mar 05 13:54:48 2015 -0800 >+++ b/source/common/x86/intrapred8.asm Thu Mar 05 14:31:59 2015 -0800 >@@ -495,6 +495,46 @@ > ;--------------------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int > dirMode, int bFilter) > ;--------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc32, 3, 3, 5 >+ pxor m0, m0 >+ movu m1, [r2 + 1] >+ movu m2, [r2 + 17] >+ movu m3, [r2 + 65] >+ movu m4, [r2 + 81] >+ psadbw m1, m0 >+ psadbw m2, m0 >+ psadbw m3, m0 >+ psadbw m4, m0 >+ paddw m1, m2 >+ paddw m3, m4 >+ paddw m1, m3 >+ pshufd m2, m1, 2 >+ paddw m1, m2 >+ >+ paddw m1, [pw_32] >+ psraw m1, 6 >+ pmullw m1, [pw_257] >+ pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] >+ pshufd m1, m1, 0x00 >+ >+%assign x 0 >+%rep 16 >+ ; store DC 16x16 >+ movu [r0], m1 >+ movu [r0 + r1], m1 >+ movu [r0 + 16], m1 >+ movu [r0 + r1 + 16], m1 >+%if x < 16 >+ lea r0, [r0 + 2 * r1] >+%endif Reviewer note: we could precompute r1*3 in a register to reduce the number of LEA instructions.
>+%assign x x+1 >+%endrep >+ RET >+
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
