在 2015-02-26 07:06:06,dave <[email protected]> 写道: On 02/25/2015 05:55 AM, chen wrote: At 2015-02-25 08:15:59,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1424818021 28800 ># Node ID 644d27ca0b197455393171ba705b1190f3d9b420 ># Parent 1a703601f7c8b85f1e6e680caa281b9edada89ab >asm: intrapred dc8 sse2 > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc8 sse4 > >./test/TestBench --testbench intrapred | grep 8x8 >intra_dc_8x8[f=0] 3.86x 235.09 906.94 >intra_dc_8x8[f=1] 2.44x 525.19 1280.05 > >Also a white space nit > >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Tue Feb 24 14:47:01 2015 -0800 >@@ -1074,6 +1074,7 @@ > ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); > > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; >+ p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > > p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2; > p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2; >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/intrapred.h >--- a/source/common/x86/intrapred.h Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/intrapred.h Tue Feb 24 14:47:01 2015 -0800 >@@ -26,8 +26,9 @@ > #ifndef X265_INTRAPRED_H > #define X265_INTRAPRED_H > >-void x265_intra_pred_dc4_sse2 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >-void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); >+void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const >pixel*srcPix, int, int filter); > void 
x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); > void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* > srcPix, int, int filter); >diff -r 1a703601f7c8 -r 644d27ca0b19 source/common/x86/intrapred8.asm >--- a/source/common/x86/intrapred8.asm Tue Feb 24 12:51:43 2015 -0800 >+++ b/source/common/x86/intrapred8.asm Tue Feb 24 14:47:01 2015 -0800 >@@ -76,6 +76,7 @@ > > SECTION .text > >+cextern pw_2 > cextern pw_4 > cextern pw_8 > cextern pw_16 >@@ -156,6 +157,90 @@ > ;--------------------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int > dirMode, int bFilter) > ;--------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc8, 5, 7, 3 maximum register is r6 I will adjust 7 to 9 >+ pxor m0, m0 >+ movh m1, [r2 + 1] >+ movh m2, [r2 + 17] >+ punpcklqdq m1, m2 >+ psadbw m1, m0 >+ pshufd m2, m1, 2 >+ paddw m1, m2 >+ >+ paddw m1, [pw_8] >+ psraw m1, 4 >+ pmullw m1, [pw_257] >+ pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] >+ >+ test r4d, r4d >+ >+ ; store DC 8x8 >+ lea r6, [r1 + r1 * 2] >+ lea r7, [r6 + r1 * 2] >+ lea r8, [r6 + r1 * 4] r7, r8 is invalid >+ movh [r0], m1 >+ movh [r0 + r1], m1 >+ movh [r0 + r1 * 2], m1 >+ movh [r0 + r6], m1 >+ movh [r0 + r1 * 4], m1 >+ movh [r0 + r7], m1 >+ movh [r0 + r6 * 2], m1 >+ movh [r0 + r8], m1 >+ >+ ; Do DC Filter >+ jz .end >+ psrlw m1, 8 >+ mova m2, [pw_2] >+ pmullw m2, m1 why not PSLL? Do you mean something like pslldq m1, 56 mova m2, [pdq_2] pmuludq m2, m1 paddq m2, [pdq_2] ? once pshuflw m1, m1, 0 happens everything above 16 bits is wiped out and we are only interested in the bottom 8 bits. 
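Besides, I tried this and saw no difference in performance, and 16 bits should be enough to handle it. I can change it if that is how you want it. [MC] My point is that multiplying by 2 does not need a multiply at all: "pmullw m2,m1" = "pmullw pw_2, m1" = "paddw m2, m1, m1" or "psllw m2, m1, 1" (i.e. add the value to itself, or shift each word left by one bit), which also avoids loading the pw_2 constant.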
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
