# HG changeset patch # User Murugan Vairavel <[email protected]> # Date 1383908744 -19800 # Fri Nov 08 16:35:44 2013 +0530 # Node ID c91092ea787273e89dbd2475e7f52f8b35bc5467 # Parent a3bfef2278016a3daaa83a932659ed84a2e01e8f asm: pixelsub_ps routine for 12x16 blocks
diff -r a3bfef227801 -r c91092ea7872 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Fri Nov 08 16:30:31 2013 +0530
+++ b/source/common/x86/pixel-a.asm Fri Nov 08 16:35:44 2013 +0530
@@ -5623,3 +5623,90 @@
 PIXELSUB_PS_W8_H4 8, 8
 PIXELSUB_PS_W8_H4 8, 16
 PIXELSUB_PS_W8_H4 8, 32
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_%1x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+%macro PIXELSUB_PS_W12_H4 2
+INIT_XMM sse4
+cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+; Writes src0 - src1 as 16-bit words, 12 per row. r0..r5 hold the six arguments in declaration order (x86inc cglobal convention); r6d counts passes.
+add         r1,    r1                       ; double deststride: presumably passed in 16-bit elements, used below as a byte offset -- TODO confirm
+mov         r6d,   %2/4                     ; each pass of the loop handles four rows
+; NOTE(review): the label below has no trailing colon; NASM's orphan-labels warning flags this form -- confirm the build is clean
+.loop
+    ; ---- rows 0 and 1 of this pass: m0-m3 serve row 0, m4-m7 serve row 1 ----
+    movh        m0,    [r2]                 ; src0 row 0, pixels 0-7
+    movd        m1,    [r2 + 8]             ; src0 row 0, pixels 8-11
+    movh        m2,    [r3]                 ; src1 row 0, pixels 0-7
+    movd        m3,    [r3 + 8]             ; src1 row 0, pixels 8-11
+
+    movh        m4,    [r2 + r4]            ; src0 row 1, pixels 0-7
+    movd        m5,    [r2 + r4 + 8]        ; src0 row 1, pixels 8-11
+    movh        m6,    [r3 + r5]            ; src1 row 1, pixels 0-7
+    movd        m7,    [r3 + r5 + 8]        ; src1 row 1, pixels 8-11
+    ; pack the 4-pixel tails of both rows into one register: dword 0 = row 0, dword 1 = row 1
+    punpckldq   m1,    m5                   ; src0 tails
+    punpckldq   m3,    m7                   ; src1 tails
+    pmovzxbw    m0,    m0                   ; zero-extend the low 8 bytes to 8 words
+    pmovzxbw    m1,    m1                   ; low 4 words = row 0 tail, high 4 words = row 1 tail
+    pmovzxbw    m2,    m2
+    pmovzxbw    m3,    m3
+    pmovzxbw    m4,    m4
+    pmovzxbw    m6,    m6
+
+    psubw       m0,    m2                   ; row 0, columns 0-7:  src0 - src1
+    psubw       m1,    m3                   ; rows 0/1, columns 8-11
+    psubw       m4,    m6                   ; row 1, columns 0-7
+    ; each output row is 12 words = 16 bytes (movu) + 8 bytes (movlps/movhps)
+    movu        [r0],              m0       ; dest row 0, columns 0-7
+    movlps      [r0 + 16],         m1       ; dest row 0, columns 8-11 (low half of m1)
+    movu        [r0 + r1],         m4       ; dest row 1, columns 0-7
+    movhps      [r0 + r1 + 16],    m1       ; dest row 1, columns 8-11 (high half of m1)
+    ; ---- rows 2 and 3 of this pass: same sequence, with the base pointers advanced part-way through ----
+    movh        m0,    [r2 + 2 * r4]        ; src0 row 2, pixels 0-7
+    movd        m1,    [r2 + 2 * r4 + 8]    ; src0 row 2, pixels 8-11
+    movh        m2,    [r3 + 2 * r5]        ; src1 row 2, pixels 0-7
+    movd        m3,    [r3 + 2 * r5 + 8]    ; src1 row 2, pixels 8-11
+
+    lea         r2,    [r2 + 2 * r4]        ; src0 now points at row 2, so [r2 + r4] below is row 3
+    lea         r3,    [r3 + 2 * r5]        ; same for src1
+
+    movh        m4,    [r2 + r4]            ; src0 row 3, pixels 0-7
+    movd        m5,    [r2 + r4 + 8]        ; src0 row 3, pixels 8-11
+    movh        m6,    [r3 + r5]            ; src1 row 3, pixels 0-7
+    movd        m7,    [r3 + r5 + 8]        ; src1 row 3, pixels 8-11
+
+    punpckldq   m1,    m5                   ; src0 tails of rows 2 and 3
+    punpckldq   m3,    m7                   ; src1 tails of rows 2 and 3
+    pmovzxbw    m0,    m0
+    pmovzxbw    m1,    m1
+    pmovzxbw    m2,    m2
+    pmovzxbw    m3,    m3
+    pmovzxbw    m4,    m4
+    pmovzxbw    m6,    m6
+
+    psubw       m0,    m2                   ; row 2, columns 0-7
+    psubw       m1,    m3                   ; rows 2/3, columns 8-11
+    psubw       m4,    m6                   ; row 3, columns 0-7
+
+    movu        [r0 + 2 * r1],         m0   ; dest row 2, columns 0-7
+    movlps      [r0 + 2 * r1 + 16],    m1   ; dest row 2, columns 8-11
+
+    lea         r0,    [r0 + 2 * r1]        ; dest now points at row 2, so [r0 + r1] below is row 3
+
+    movu        [r0 + r1],         m4       ; dest row 3, columns 0-7
+    movhps      [r0 + r1 + 16],    m1       ; dest row 3, columns 8-11
+
+    lea         r2,    [r2 + 2 * r4]        ; second two-row step: all three pointers end four rows past the start of this pass
+    lea         r3,    [r3 + 2 * r5]
+    lea         r0,    [r0 + 2 * r1]
+
+    dec         r6d
+
+jnz .loop
+
+RET
+%endmacro
+
+PIXELSUB_PS_W12_H4 12, 16
_______________________________________________
x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
