# HG changeset patch # User Murugan Vairavel <muru...@multicorewareinc.com> # Date 1384868970 -19800 # Tue Nov 19 19:19:30 2013 +0530 # Node ID 435c48eb30e1789cd1271a35fe48fe7bef49ab56 # Parent 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50 asm: code for transpose_16x16 routine
diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 19 19:19:30 2013 +0530
@@ -547,6 +547,7 @@
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
         p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
+        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Nov 19 19:19:30 2013 +0530
@@ -8401,3 +8401,93 @@
     movu         [r0 + 48],     m3
 
     RET
+
+;-----------------------------------------------------------------
+; transpose_8x8: transpose one 8x8 block of bytes
+;   in:       r1 = source block,      r2 = source row pitch
+;             r0 = destination block, r3 = destination row pitch
+;   out:      r1 advanced by 6 * r2,  r0 advanced by 6 * r3
+;   clobbers: m0-m7
+;-----------------------------------------------------------------
+%macro transpose_8x8 0
+
+    ; load eight 8-byte source rows into the low halves of m0-m7
+    movh         m0,    [r1]
+    movh         m1,    [r1 + r2]
+    movh         m2,    [r1 + 2 * r2]
+    lea          r1,    [r1 + 2 * r2]
+    movh         m3,    [r1 + r2]
+    movh         m4,    [r1 + 2 * r2]
+    lea          r1,    [r1 + 2 * r2]
+    movh         m5,    [r1 + r2]
+    movh         m6,    [r1 + 2 * r2]
+    lea          r1,    [r1 + 2 * r2]
+    movh         m7,    [r1 + r2]
+
+    ; interleave bytes of adjacent rows: m0 = rows 0/1, m2 = 2/3, m4 = 4/5, m6 = 6/7
+    punpcklbw    m0,    m1
+    punpcklbw    m2,    m3
+    punpcklbw    m4,    m5
+    punpcklbw    m6,    m7
+
+    ; interleave words, then dwords; afterwards each 64-bit half is one output row:
+    ; m0 = columns 0|1, m2 = columns 2|3, m1 = columns 4|5, m3 = columns 6|7
+    punpckhwd    m1,    m0,    m2
+    punpcklwd    m0,    m2
+    punpckhwd    m5,    m4,    m6
+    punpcklwd    m4,    m6
+    punpckhdq    m2,    m0,    m4
+    punpckldq    m0,    m4
+    punpckhdq    m3,    m1,    m5
+    punpckldq    m1,    m5
+
+    ; store the eight transposed rows, r3 bytes apart
+    movlps       [r0],             m0
+    movhps       [r0 + r3],        m0
+    movlps       [r0 + 2 * r3],    m2
+    lea          r0,               [r0 + 2 * r3]
+    movhps       [r0 + r3],        m2
+    movlps       [r0 + 2 * r3],    m1
+    lea          r0,               [r0 + 2 * r3]
+    movhps       [r0 + r3],        m1
+    movlps       [r0 + 2 * r3],    m3
+    lea          r0,               [r0 + 2 * r3]
+    movhps       [r0 + r3],        m3
+
+%endmacro
+
+
+;-----------------------------------------------------------------
+; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
+;
+; Processed as four 8x8 byte tiles; dst rows are 16 bytes apart (r3).
+; r4/r5 keep the original dst/src pointers because transpose_8x8
+; advances r0/r1.  Registers r0-r5 are used, so six GPRs are declared
+; to let the prologue preserve r5 where it is callee-saved (x86-32).
+;-----------------------------------------------------------------
+INIT_XMM sse2
+cglobal transpose16, 3, 6, 8, dest, src, stride
+
+    mov          r4,    r0                      ; r4 = dst base
+    mov          r5,    r1                      ; r5 = src base
+    mov          r3,    16                      ; dst row pitch
+
+    ; src rows 0-7, cols 0-7   -> dst rows 0-7, cols 0-7
+    transpose_8x8
+
+    ; src rows 8-15, cols 0-7  -> dst rows 0-7, cols 8-15
+    lea          r1,    [r1 + 2 * r2]           ; macro left r1 at row 6
+    lea          r0,    [r4 + 8]
+    transpose_8x8
+
+    ; src rows 0-7, cols 8-15  -> dst rows 8-15, cols 0-7
+    lea          r1,    [r5 + 8]
+    lea          r0,    [r4 + r3 * 8]
+    transpose_8x8
+
+    ; src rows 8-15, cols 8-15 -> dst rows 8-15, cols 8-15
+    lea          r1,    [r1 + 2 * r2]           ; macro left r1 at row 6
+    lea          r0,    [r4 + r3 * 8 + 8]
+    transpose_8x8
+
+    RET
diff -r 3a94cc365533 -r 435c48eb30e1 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Nov 19 11:53:09 2013 +0530
+++ b/source/common/x86/pixel.h	Tue Nov 19 19:19:30 2013 +0530
@@ -367,5 +367,6 @@
 void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
+void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
 
 #endif // ifndef X265_I386_PIXEL_H
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel