# HG changeset patch # User Min Chen <[email protected]> # Date 1425344040 28800 # Node ID 64214b2faa324d91a015190b8dc69716ebab41f8 # Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc asm: improve pixel_add_ps_8xN with loop unroll, [8x8] from 278c to 245c --- source/common/x86/pixeladd8.asm | 81 ++++++++++++++++++++------------------- 1 files changed, 41 insertions(+), 40 deletions(-)
diff -r 018e8bbaa854 -r 64214b2faa32 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Fri Feb 27 11:46:09 2015 +0530 +++ b/source/common/x86/pixeladd8.asm Mon Mar 02 16:54:00 2015 -0800 @@ -188,47 +188,48 @@ %macro PIXEL_ADD_PS_W8_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 - mova m5, [pw_pixel_max] - pxor m4, m4 - mov r6d, %2/4 - add r4, r4 - add r5, r5 - add r1, r1 -.loop: - movu m0, [r2] - movu m2, [r2 + r4] - movu m1, [r3] - movu m3, [r3 + r5] - lea r2, [r2 + r4 * 2] - lea r3, [r3 + r5 * 2] +cglobal pixel_add_ps_8x%2, 6,6,6 + FIX_STRIDES r4, r5, r1 ; register order affects instruction-decode performance + pxor m0, m0 + mova m1, [pw_pixel_max] - paddw m0, m1 - paddw m2, m3 - CLIPW2 m0, m2, m4, m5 +%assign x 0 +%rep %2/4 + movu m2, [r2] ; row 0 of src0 + movu m3, [r2 + r4] ; row 1 of src0 + movu m4, [r3] ; row 0 of src1 + movu m5, [r3 + r5] ; row 1 of src1 + paddw m2, m4 + paddw m3, m5 + CLIPW m2, m0, m1 + CLIPW m3, m0, m1 + movu [r0], m2 ; row 0 of dst + movu [r0 + r1], m3 ; row 1 of dst - movu [r0], m0 - movu [r0 + r1], m2 + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + movu m2, [r2] ; row 2 of src0 + movu m3, [r2 + r4] ; row 3 of src0 + movu m4, [r3] ; row 2 of src1 + movu m5, [r3 + r5] ; row 3 of src1 + paddw m2, m4 + paddw m3, m5 + CLIPW m2, m0, m1 + CLIPW m3, m0, m1 + movu [r0], m2 ; row 2 of dst + movu [r0 + r1], m3 ; row 3 of dst - movu m0, [r2] - movu m2, [r2 + r4] - movu m1, [r3] - movu m3, [r3 + r5] - dec r6d - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r4 * 2] - lea r3, [r3 + r5 * 2] + ; skip the pointer advance on the last unroll pass + %if x != (%2/4)-1 + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + %endif +%assign x x+1 +%endrep + RET - paddw m0, m1 - paddw m2, m3 - CLIPW2 m0, m2, m4, m5 - - movu [r0], m0 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - - jnz .loop - RET %else INIT_XMM sse4 cglobal 
pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 _______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
