On 03/03, [email protected] wrote: > # HG changeset patch > # User David T Yuen <[email protected]> > # Date 1425405520 28800 > # Node ID 79c396c5cd8e990528a67ad029be55a4ff1f723e > # Parent 4641827f98c935603f608425de8c76785aef1114 > asm: intrapred dc32 sse2 high bit > > This replaces c code for systems using ssse3 to sse2 processors > The code is backported from intrapred dc32 sse4 high bit > > ./test/TestBench --testbench intrapred | grep 32x32 > intra_dc_32x32[f=0] 2.80x 3082.50 8635.09
This one isn't building for me. I hope to clear up the backlog of asm patches soon, so you should be able to re-send patches based on a more recent tip. > diff -r 4641827f98c9 -r 79c396c5cd8e source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Mon Mar 02 14:46:46 2015 -0800 > +++ b/source/common/x86/asm-primitives.cpp Tue Mar 03 09:58:40 2015 -0800 > @@ -871,6 +871,7 @@ > p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; > p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; > p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2; > + p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2; > > p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2; > ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2); > diff -r 4641827f98c9 -r 79c396c5cd8e source/common/x86/intrapred16.asm > --- a/source/common/x86/intrapred16.asm Mon Mar 02 14:46:46 2015 -0800 > +++ b/source/common/x86/intrapred16.asm Tue Mar 03 09:58:40 2015 -0800 > @@ -386,6 +386,67 @@ > .end: > RET > > +;------------------------------------------------------------------------------------------- > +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t > dstStride, int filter) > +;------------------------------------------------------------------------------------------- > +INIT_XMM sse2 > +cglobal intra_pred_dc32, 3, 4, 6 > + lea r3, [r2 + 130] ;130 = > 32*sizeof(pixel)*2 + 1*sizeof(pixel) > + add r2, 2 > + add r1, r1 > + movu m0, [r3] > + movu m1, [r3 + 16] > + movu m2, [r3 + 32] > + movu m3, [r3 + 48] > + paddw m0, m1 > + paddw m2, m3 > + paddw m0, m2 > + movu m1, [r2] > + movu m3, [r2 + 16] > + movu m4, [r2 + 32] > + movu m5, [r2 + 48] > + paddw m1, m3 > + paddw m4, m5 > + paddw m1, m4 > + paddw m0, m1 > + movhlps m1, m0 > + paddw m0, m1 > + pshuflw m1, m0, 0x6E > + paddw m0, m1 > + pmaddwd m0, [pw_1] > + > + paddd m0, [pd_32] ; sum = sum + 32 > + psrld m0, 6 ; sum = sum / 64 > + pshuflw m0, m0, 0 > + pshufd m0, m0, 0 > + > + lea r2, 
[r1 * 3] > + ; store DC 32x32 > +%assign x 1 > +%rep 8 > + movu [r0 + 0], m0 > + movu [r0 + 16], m0 > + movu [r0 + 32], m0 > + movu [r0 + 48], m0 > + movu [r0 + r1 + 0], m0 > + movu [r0 + r1 + 16], m0 > + movu [r0 + r1 + 32], m0 > + movu [r0 + r1 + 48], m0 > + movu [r0 + r1 * 2 + 0], m0 > + movu [r0 + r1 * 2 + 16], m0 > + movu [r0 + r1 * 2 + 32], m0 > + movu [r0 + r1 * 2 + 48], m0 > + movu [r0 + r2 + 0], m0 > + movu [r0 + r2 + 16], m0 > + movu [r0 + r2 + 32], m0 > + movu [r0 + r2 + 48], m0 > + %if x < 8 > + lea r0, [r0 + r1 * 4] > + %endif > +%assign x x + 1 > +%endrep > + RET > + > > ;----------------------------------------------------------------------------------- > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int > filter) > > ;----------------------------------------------------------------------------------- > @@ -668,7 +729,7 @@ > > ;------------------------------------------------------------------------------------------- > INIT_XMM sse4 > cglobal intra_pred_dc32, 3, 5, 6 > - lea r3, [r2 + 130] > + lea r3, [r2 + 130] ;130 = > 32*sizeof(pixel)*2 + 1*sizeof(pixel) > add r2, 2 > add r1, r1 > movu m0, [r3] > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel -- Steve Borho _______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
