8xN is right
At 2015-02-03 21:07:25, [email protected] wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1422968834 -19800
># Node ID 6aba648bfada606d14f20e0a7cdb667d043069ae
># Parent a7dff1040961c2c17254c2e2bb0bf5b7857c8187
>blockcopy_pp_8x64: sse2 asm code optimization
>
>improved, 800.38c -> 752.18c
>
>diff -r a7dff1040961 -r 6aba648bfada source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Tue Feb 03 18:26:22 2015 +0530
>+++ b/source/common/x86/blockcopy8.asm Tue Feb 03 18:37:14 2015 +0530
>@@ -482,48 +482,38 @@
> RET
>
> ;-----------------------------------------------------------------------------
>-; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src,
>intptr_t srcStride)
>+; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src,
>intptr_t srcStride)
> ;-----------------------------------------------------------------------------
>-%macro BLOCKCOPY_PP_W8_H8 2
> INIT_XMM sse2
>-cglobal blockcopy_pp_%1x%2, 4, 5, 6
>- mov r4d, %2/8
>-
>-.loop:
>- movh m0, [r2]
>- movh m1, [r2 + r3]
>- lea r2, [r2 + 2 * r3]
>- movh m2, [r2]
>- movh m3, [r2 + r3]
>- lea r2, [r2 + 2 * r3]
>- movh m4, [r2]
>- movh m5, [r2 + r3]
>-
>- movh [r0], m0
>- movh [r0 + r1], m1
>- lea r0, [r0 + 2 * r1]
>- movh [r0], m2
>- movh [r0 + r1], m3
>- lea r0, [r0 + 2 * r1]
>- movh [r0], m4
>- movh [r0 + r1], m5
>-
>- lea r2, [r2 + 2 * r3]
>- movh m4, [r2]
>- movh m5, [r2 + r3]
>- lea r0, [r0 + 2 * r1]
>- movh [r0], m4
>- movh [r0 + r1], m5
>-
>- dec r4d
>- lea r0, [r0 + 2 * r1]
>- lea r2, [r2 + 2 * r3]
>- jnz .loop
>-RET
>-%endmacro
>-
>-
>-BLOCKCOPY_PP_W8_H8 8, 64
>+cglobal blockcopy_pp_8x64, 4, 6, 4
>+
>+ lea r4, [3 * r3]
>+ lea r5, [3 * r1]
>+
>+ movh m0, [r2]
>+ movh m1, [r2 + r3]
>+ movh m2, [r2 + 2 * r3]
>+ movh m3, [r2 + r4]
>+
>+ movh [r0], m0
>+ movh [r0 + r1], m1
>+ movh [r0 + 2 * r1], m2
>+ movh [r0 + r5], m3
>+
>+ %rep 15
>+ lea r2, [r2 + 4 * r3]
>+ movh m0, [r2]
>+ movh m1, [r2 + r3]
>+ movh m2, [r2 + 2 * r3]
>+ movh m3, [r2 + r4]
>+
>+ lea r0, [r0 + 4 * r1]
>+ movh [r0], m0
>+ movh [r0 + r1], m1
>+ movh [r0 + 2 * r1], m2
>+ movh [r0 + r5], m3
>+ %endrep
>+ RET
>
> ;-----------------------------------------------------------------------------
> ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src,
> intptr_t srcStride)
>_______________________________________________
>x265-devel mailing list
>[email protected]
>https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
