Right, with some comments below. At 2015-03-03 12:03:23, [email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1425355260 28800 ># Node ID 48ca9a0c131c99d54778515dc5a6a5a7a9197153 ># Parent 4641827f98c935603f608425de8c76785aef1114 >asm: intrapred dc32 sse2 high bit > >This replaces c code for systems using ssse3 to sse2 processors >The code is backported from intrapred dc32 sse4 high bit > >./test/TestBench --testbench intrapred | grep 32x32 >intra_dc_32x32[f=0] 2.83x 3080.15 8729.59 > >diff -r 4641827f98c9 -r 48ca9a0c131c source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Mon Mar 02 14:46:46 2015 -0800 >+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 20:01:00 2015 -0800 >@@ -871,6 +871,7 @@ > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; > p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2; >+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2; > > p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2; > ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2); >diff -r 4641827f98c9 -r 48ca9a0c131c source/common/x86/intrapred16.asm >--- a/source/common/x86/intrapred16.asm Mon Mar 02 14:46:46 2015 -0800 >+++ b/source/common/x86/intrapred16.asm Mon Mar 02 20:01:00 2015 -0800 >@@ -386,6 +386,69 @@ > .end: > RET > >+;------------------------------------------------------------------------------------------- >+; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t >dstStride, int filter) >+;------------------------------------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal intra_pred_dc32, 3, 5, 6 >+ lea r3, [r2 + 130]
I guess 130 is 32*sizeof(pixel)*2 + 1*sizeof(pixel); please add a comment here to make it more readable. >+ add r2, 2 >+ add r1, r1 >+ movu m0, [r3] >+ movu m1, [r3 + 16] >+ movu m2, [r3 + 32] >+ movu m3, [r3 + 48] >+ paddw m0, m1 >+ paddw m2, m3 >+ paddw m0, m2 >+ movu m1, [r2] >+ movu m3, [r2 + 16] >+ movu m4, [r2 + 32] >+ movu m5, [r2 + 48] >+ paddw m1, m3 >+ paddw m4, m5 >+ paddw m1, m4 >+ paddw m0, m1 >+ movhlps m1, m0 >+ paddw m0, m1 >+ pshuflw m1, m0, 0x6E >+ paddw m0, m1 >+ pmaddwd m0, [pw_1] >+ >+ paddd m0, [pd_32] ; sum = sum + 32 >+ psrld m0, 6 ; sum = sum / 64 >+ pshuflw m0, m0, 0 >+ pshufd m0, m0, 0 >+ >+ lea r2, [r1 * 3] >+ >+ ; store DC 32x32 >+%assign x 1 >+%rep 8 >+ movu [r0 + 0], m0 >+ movu [r0 + 16], m0 >+ movu [r0 + 32], m0 >+ movu [r0 + 48], m0 >+ movu [r0 + r1 + 0], m0 >+ movu [r0 + r1 + 16], m0 >+ movu [r0 + r1 + 32], m0 >+ movu [r0 + r1 + 48], m0 >+ movu [r0 + r1 * 2 + 0], m0 >+ movu [r0 + r1 * 2 + 16], m0 >+ movu [r0 + r1 * 2 + 32], m0 >+ movu [r0 + r1 * 2 + 48], m0 >+ movu [r0 + r2 + 0], m0 >+ movu [r0 + r2 + 16], m0 >+ movu [r0 + r2 + 32], m0 >+ movu [r0 + r2 + 48], m0 >+ %if x < 8 >+ lea r0, [r0 + r1 * 4] >+ %endif >+ %assign x x + 1 Don't put extra spaces here; sometimes yasm has problems with them. >+%endrep >+ >+ RET >+ > ;----------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int > filter) > ;----------------------------------------------------------------------------------- >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
