At 2013-11-19 14:23:41,[email protected] wrote: ># HG changeset patch ># User Murugan Vairavel <[email protected]> ># Date 1384842189 -19800 ># Tue Nov 19 11:53:09 2013 +0530 ># Node ID 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50 ># Parent f6a050b79cfa400aa432f49ee8a4c2b9f20cf930 >asm: code for transpose_8x8 routine > >diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Nov 19 11:25:00 2013 +0530 >+++ b/source/common/x86/asm-primitives.cpp Tue Nov 19 11:53:09 2013 +0530 >@@ -546,6 +546,7 @@ > p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2; > p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2; > p.transpose[BLOCK_4x4] = x265_transpose4_sse2; >+ p.transpose[BLOCK_8x8] = x265_transpose8_sse2; > } > if (cpuMask & X265_CPU_SSSE3) > { >diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel-a.asm >--- a/source/common/x86/pixel-a.asm Tue Nov 19 11:25:00 2013 +0530 >+++ b/source/common/x86/pixel-a.asm Tue Nov 19 11:53:09 2013 +0530 >@@ -8359,3 +8359,45 @@ > movu [r0], m0 > > RET >+ >+;----------------------------------------------------------------- >+; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) >+;----------------------------------------------------------------- >+INIT_XMM sse2 >+cglobal transpose8, 3, 3, 8, dest, src, stride >+ >+ movh m0, [r1] >+ movh m1, [r1 + r2] >+ movh m2, [r1 + 2 * r2] >+ lea r1, [r1 + 2 * r2] >+ movh m3, [r1 + r2] >+ movh m4, [r1 + 2 * r2] >+ lea r1, [r1 + 2 * r2] >+ movh m5, [r1 + r2] >+ movh m6, [r1 + 2 * r2] >+ lea r1, [r1 + 2 * r2] >+ movh m7, [r1 + r2] >+ >+ punpcklbw m0, m1 >+ punpcklbw m2, m3 >+ punpcklbw m4, m5 >+ punpcklbw m6, m7 >+ movu m1, m0 For a register-to-register copy, mova is better than movu; of course, the best way is to use the three-operand form "punpckhwd m1, m0, m2".
>+ punpcklwd m0, m2 >+ punpckhwd m1, m2 >+ movu m5, m4 >+ punpcklwd m4, m6 >+ punpckhwd m5, m6 >+ movu m2, m0 >+ punpckldq m0, m4 >+ punpckhdq m2, m4 >+ movu m3, m1 >+ punpckldq m1, m5 >+ punpckhdq m3, m5 >+ >+ movu [r0], m0 >+ movu [r0 + 16], m2 >+ movu [r0 + 32], m1 >+ movu [r0 + 48], m3 >+ >+ RET >diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel.h >--- a/source/common/x86/pixel.h Tue Nov 19 11:25:00 2013 +0530 >+++ b/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530 >@@ -366,5 +366,6 @@ > void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, > intptr_t stride); > void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, > intptr_t stride); > void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride); >+void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride); > > #endif // ifndef X265_I386_PIXEL_H >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
