At 2015-02-25 08:15:59,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1424818021 28800 ># Node ID 644d27ca0b197455393171ba705b1190f3d9b420 ># Parent 1a703601f7c8b85f1e6e680caa281b9edada89ab >asm: intrapred dc8 sse2 > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc8 sse4 > >./test/TestBench --testbench intrapred | grep 8x8 >intra_dc_8x8[f=0] 3.86x 235.09 906.94 >intra_dc_8x8[f=1] 2.44x 525.19 1280.05 > >Also a white space nit > >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Tue Feb 24 14:47:01 2015 -0800 >@@ -1074,6 +1074,7 @@ > ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); > > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; >+ p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > > p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2; > p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2; >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/intrapred.h Tue Feb 24 14:47:01 2015 -0800 >@@ -26,8 +26,9 @@ > #ifndef X265_INTRAPRED_H > #define X265_INTRAPRED_H > >-void x265_intra_pred_dc4_sse2 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >-void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); > void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void 
x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/intrapred8.asm Tue Feb 24 14:47:01 2015 -0800 >@@ -76,6 +76,7 @@ > > SECTION .text > >+cextern pw_2 > cextern pw_4 > cextern pw_8 > cextern pw_16 >@@ -156,6 +157,90 @@ > ;--------------------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int > dirMode, int bFilter) > ;--------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc8, 5, 7, 3
The maximum register is r6 (only 7 GPRs are declared above). >+ pxor m0, m0 >+ movh m1, [r2 + 1] >+ movh m2, [r2 + 17] >+ punpcklqdq m1, m2 >+ psadbw m1, m0 >+ pshufd m2, m1, 2 >+ paddw m1, m2 >+ >+ paddw m1, [pw_8] >+ psraw m1, 4 >+ pmullw m1, [pw_257] >+ pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] >+ >+ test r4d, r4d >+ >+ ; store DC 8x8 >+ lea r6, [r1 + r1 * 2] >+ lea r7, [r6 + r1 * 2] >+ lea r8, [r6 + r1 * 4] r7 and r8 are invalid here (only r0-r6 are declared). >+ movh [r0], m1 >+ movh [r0 + r1], m1 >+ movh [r0 + r1 * 2], m1 >+ movh [r0 + r6], m1 >+ movh [r0 + r1 * 4], m1 >+ movh [r0 + r7], m1 >+ movh [r0 + r6 * 2], m1 >+ movh [r0 + r8], m1 >+ >+ ; Do DC Filter >+ jz .end >+ psrlw m1, 8 >+ mova m2, [pw_2] >+ pmullw m2, m1 Why not use a shift (e.g. PSLLW) instead of multiplying by 2? >+ paddw m2, [pw_2] >+ movd r4d, m2 ; r4d = DC * 2 + 2 >+ paddw m1, m2 ; m1 = DC * 3 + 2 >+ pshuflw m1, m1, 0 ; m1 = pixDCx3 >+ pshufd m1, m1, 0 >+ >+ ; filter top >+ movq m2, [r2 + 1] >+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 ; sum = sum / 16 >+ packuswb m2, m2 >+ movh [r0], m2 >+ >+ ; filter top-left >+ movzx r5d, byte [r2 + 17] >+ add r4d, r5d >+ movzx r3d, byte [r2 + 1] >+ add r3d, r4d >+ shr r3d, 2 >+ mov [r0], r3b >+ >+ ; filter left >+ movq m2, [r2 + 18] >+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 >+ packuswb m2, m2 >+ movq r3, m2 >+ mov [r0 + r1], r3b >+ shr r3, 8 >+ mov [r0 + r1 * 2], r3b >+ shr r3, 8 >+ mov [r0 + r6], r3b >+ shr r3, 8 >+ mov [r0 + r1 * 4], r3b >+ shr r3, 8 >+ mov [r0 + r7], r3b >+ shr r3, 8 >+ mov [r0 + r6 * 2], r3b >+ shr r3, 8 >+ mov [r0 + r8], r3b >+ >+.end: >+ RET >+ >+;--------------------------------------------------------------------------------------------- >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int >dirMode, int bFilter) >+;--------------------------------------------------------------------------------------------- > INIT_XMM sse4 > cglobal intra_pred_dc4, 5,5,3 > inc r2 >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
