At 2015-02-27 13:04:07,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1425005958 28800 ># Node ID fb806a6bebfa6312a2657b1978162f0c3e4ef2b9 ># Parent 49f5199f9f2599bd9acc69c8236313806bbb6142 >asm: intrapred dc8 sse2 > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc8 sse4 > >./test/TestBench --testbench intrapred | grep 8x8 >intra_dc_8x8[f=0] 3.86x 235.11 906.98 >intra_dc_8x8[f=1] 2.40x 532.49 1280.03 > >Also a white space nit > >diff -r 49f5199f9f25 -r fb806a6bebfa source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Thu Feb 26 18:04:35 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Thu Feb 26 18:59:18 2015 -0800 >@@ -1207,6 +1207,7 @@ > ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); > > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; >+ p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > > p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2; > p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2; >diff -r 49f5199f9f25 -r fb806a6bebfa source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Thu Feb 26 18:04:35 2015 -0800 >+++ b/source/common/x86/intrapred.h Thu Feb 26 18:59:18 2015 -0800 >@@ -26,8 +26,9 @@ > #ifndef X265_INTRAPRED_H > #define X265_INTRAPRED_H > >-void x265_intra_pred_dc4_sse2 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >-void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel* >srcPix, int, int filter); >+void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); > void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void 
x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); >diff -r 49f5199f9f25 -r fb806a6bebfa source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Thu Feb 26 18:04:35 2015 -0800 >+++ b/source/common/x86/intrapred8.asm Thu Feb 26 18:59:18 2015 -0800 >@@ -117,6 +117,7 @@ > > SECTION .text > >+cextern pw_2 > cextern pw_4 > cextern pw_8 > cextern pw_16 >@@ -207,6 +208,90 @@ > ;--------------------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int > dirMode, int bFilter) > ;--------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc8, 5, 9, 3 >+ pxor m0, m0 >+ movh m1, [r2 + 1] >+ movh m2, [r2 + 17] >+ punpcklqdq m1, m2 >+ psadbw m1, m0 >+ pshufd m2, m1, 2 >+ paddw m1, m2 >+ >+ paddw m1, [pw_8] >+ psraw m1, 4 >+ pmullw m1, [pw_257] >+ pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] 
>+ >+ test r4d, r4d >+ >+ ; store DC 8x8 >+ lea r6, [r1 + r1 * 2] >+ lea r7, [r6 + r1 * 2] >+ lea r8, [r6 + r1 * 4] >+ movh [r0], m1 >+ movh [r0 + r1], m1 >+ movh [r0 + r1 * 2], m1 >+ movh [r0 + r6], m1 >+ movh [r0 + r1 * 4], m1 >+ movh [r0 + r7], m1 >+ movh [r0 + r6 * 2], m1 >+ movh [r0 + r8], m1 >+ >+ ; Do DC Filter >+ jz .end >+ psrlw m1, 8 >+ movq m2, [pw_2] >+ pmullw m2, m1 >+ paddw m2, [pw_2] >+ movd r4d, m2 ; r4d = DC * 2 + 2 >+ paddw m1, m2 ; m1 = DC * 3 + 2 >+ pshuflw m1, m1, 0 ; m1 = pixDCx3 >+ pshufd m1, m1, 0 >+ >+ ; filter top >+ movq m2, [r2 + 1] >+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 ; sum = sum / 16 >+ packuswb m2, m2 >+ movh [r0], m2 >+ >+ ; filter top-left >+ movzx r5d, byte [r2 + 17] >+ add r4d, r5d >+ movzx r3d, byte [r2 + 1] >+ add r3d, r4d >+ shr r3d, 2 >+ mov [r0], r3b >+ >+ ; filter left >+ movq m2, [r2 + 18] >+ punpcklbw m2, m0 >+ paddw m2, m1 >+ psraw m2, 2 >+ packuswb m2, m2 >+ movq r3, m2 [Reviewer comment: this instruction and the code below it are not compatible with 32-bit x86 builds; a movq from an XMM register to a general-purpose register is only encodable in 64-bit mode.] >+ mov [r0 + r1], r3b >+ shr r3, 8 >+ mov [r0 + r1 * 2], r3b >+ shr r3, 8 >+ mov [r0 + r6], r3b >+ shr r3, 8 >+ mov [r0 + r1 * 4], r3b >+ shr r3, 8 >+ mov [r0 + r7], r3b >+ shr r3, 8 >+ mov [r0 + r6 * 2], r3b >+ shr r3, 8 >+ mov [r0 + r8], r3b >+ >+.end: >+ RET >+ >+;--------------------------------------------------------------------------------------------- >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int >dirMode, int bFilter) >+;--------------------------------------------------------------------------------------------- > INIT_XMM sse4 > cglobal intra_pred_dc4, 5,5,3 > inc r2 >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
