At 2015-02-25 08:16:44,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1424823199 28800 ># Node ID bd1d713da87bb1e3022c80f462398bd78a95ce48 ># Parent 644d27ca0b197455393171ba705b1190f3d9b420 >asm: intrapred dc8 sse2 high bit > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc8 sse4 high bit > >./test/TestBench --testbench intrapred | grep 8x8 >intra_dc_8x8[f=0] 1.62x 443.94 717.32 >intra_dc_8x8[f=1] 1.59x 729.98 1157.66 > >diff -r 644d27ca0b19 -r bd1d713da87b source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Feb 24 14:47:01 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Tue Feb 24 16:13:19 2015 -0800 >@@ -864,6 +864,7 @@ > ALL_LUMA_TU_S(transpose, transpose, sse2); > > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; >+ p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > > p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2; > ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2); >diff -r 644d27ca0b19 -r bd1d713da87b source/common/x86/intrapred16.asm >--- a/source/common/x86/intrapred16.asm Tue Feb 24 14:47:01 2015 -0800 >+++ b/source/common/x86/intrapred16.asm Tue Feb 24 16:13:19 2015 -0800 >@@ -86,6 +86,7 @@ > cextern pw_8 > cextern pw_16 > cextern pw_1023 >+cextern pd_2 > cextern pd_16 > cextern pd_32 > cextern pw_4096 >@@ -166,6 +167,89 @@ > ;----------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int > filter) > ;----------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc8, 5, 8, 2 >+ add r2, 2 >+ add r1, r1 >+ movu m0, [r2 + 32] >+ movu m1, [r2] >+ >+ paddw m0, m1 >+ movhlps m1, m0 >+ paddw m0, m1 >+ pshufd m1, m0, 1 >+ paddw m0, m1 >+ pmaddwd m0, [pw_1] >+ >+ paddw m0, [pw_8] >+ psraw m0, 4 ; sum = sum / 16 >+ pshuflw m0, m0, 0 >+ 
pshufd m0, m0, 0 ; m0 = word [dc_val ...] >+ >+ test r4d, r4d >+ >+ ; store DC 8x8 >+ lea r6, [r1 + r1 * 2] >+ lea r5, [r6 + r1 * 2] >+ lea r7, [r6 + r1 * 4] >+ movu [r0], m0 >+ movu [r0 + r1], m0 >+ movu [r0 + r1 * 2], m0 >+ movu [r0 + r6], m0 >+ movu [r0 + r1 * 4], m0 >+ movu [r0 + r5], m0 >+ movu [r0 + r6 * 2], m0 >+ movu [r0 + r7], m0 >+ >+ ; Do DC Filter >+ jz .end >+ mova m1, [pd_2] >+ pmullw m1, m0 [Reviewer: [x x x x] * [0 2 0 2] = [0 dc*2 0 dc*2], am I right? Why not use PSLL instead?]
>+ paddw m1, [pd_2] >+ movd r4d, m1 ; r4d = DC * 2 + 2 [Reviewer: the MOVD fixes the previous issue.] >+ paddw m1, m0 ; m1 = DC * 3 + 2 >+ pshuflw m1, m1, 0 >+ pshufd m1, m1, 0 ; m1 = pixDCx3 >+ >+ ; filter top >+ movu m0, [r2] >+ paddw m0, m1 >+ psraw m0, 2 >+ movu [r0], m0 >+ >+ ; filter top-left >+ movzx r3d, word [r2 + 32] >+ add r4d, r3d >+ movzx r3d, word [r2] >+ add r3d, r4d >+ shr r3d, 2 >+ mov [r0], r3w >+ >+ ; filter left >+ movu m0, [r2 + 34] >+ paddw m0, m1 >+ psraw m0, 2 >+ movh r3, m0 >+ mov [r0 + r1], r3w >+ shr r3, 16 >+ mov [r0 + r1 * 2], r3w >+ shr r3, 16 >+ mov [r0 + r6], r3w >+ shr r3, 16 >+ mov [r0 + r1 * 4], r3w >+ pshufd m0, m0, 0x6E >+ movh r3, m0 >+ mov [r0 + r5], r3w >+ shr r3, 16 >+ mov [r0 + r6 * 2], r3w >+ shr r3, 16 >+ mov [r0 + r7], r3w >+.end: >+ RET >+ >+;----------------------------------------------------------------------------------- >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int >filter) >+;----------------------------------------------------------------------------------- > INIT_XMM sse4 > cglobal intra_pred_dc4, 5,6,2 > lea r3, [r2 + 18] >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
