# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1396469570 25200 # Node ID 4348a3ed1b3201bc18d80ed51bfc0fccc24d3fcf # Parent 0206822d9fea295c199a0ad192e8fc5e1f2b9124 remove unused parameter *recon from assembly code
diff -r 0206822d9fea -r 4348a3ed1b32 source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Tue Apr 01 23:28:32 2014 +0530 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Apr 02 13:12:50 2014 -0700 @@ -465,7 +465,7 @@ assert(width <= 32); //===== reconstruction ===== - primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride); + primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride); //===== update distortion ===== outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride); } @@ -587,7 +587,7 @@ assert(((intptr_t)residual & (width - 1)) == 0); assert(width <= 32); //===== reconstruction ===== - primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride); + primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride); //===== update distortion ===== uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride); if (ttype == TEXT_CHROMA_U) diff -r 0206822d9fea -r 4348a3ed1b32 source/common/pixel.cpp --- a/source/common/pixel.cpp Tue Apr 01 23:28:32 2014 +0530 +++ b/source/common/pixel.cpp Wed Apr 02 13:12:50 2014 -0700 @@ -460,9 +460,7 @@ } template<int blockSize> -void calcRecons(pixel* pred, int16_t* residual, - pixel*, - int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride) +void calcRecons(pixel* pred, int16_t* residual, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride) { for (int y = 0; y < blockSize; y++) { diff -r 0206822d9fea -r 4348a3ed1b32 source/common/primitives.h --- a/source/common/primitives.h Tue Apr 01 23:28:32 2014 +0530 +++ b/source/common/primitives.h Wed Apr 02 13:12:50 2014 -0700 @@ -125,7 +125,7 @@ typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride); typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride); typedef void 
(*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); -typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride); typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos); typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util.h --- a/source/common/x86/pixel-util.h Tue Apr 01 23:28:32 2014 +0530 +++ b/source/common/x86/pixel-util.h Wed Apr 02 13:12:50 2014 -0700 @@ -24,12 +24,12 @@ #ifndef X265_PIXEL_UTIL_H #define X265_PIXEL_UTIL_H -void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); -void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons4_sse2(pixel* pred, int16_t* 
residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); +void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred); void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride); diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Apr 01 23:28:32 2014 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Apr 02 13:12:50 2014 -0700 @@ -58,590 +58,452 @@ cextern pw_pixel_max ;----------------------------------------------------------------------------- -; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) +; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal calcRecons4 %if HIGH_BIT_DEPTH %if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,6 +cglobal calcRecons4, 5,8,4 + %define t7b r7b %else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,6 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d +cglobal calcRecons4, 5,7,4,0-1 
+ %define t7b byte [rsp] %endif - - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d - add t7, t7 -%endif + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d pxor m4, m4 mova m5, [pw_pixel_max] - add t5, t5 - mov t8d, 4/2 + mov t7b, 4/2 .loop: - movh m0, [t0] - movh m1, [t0 + t5] + movh m0, [r0] + movh m1, [r0 + r4] punpcklqdq m0, m1 - movh m2, [t1] - movh m3, [t1 + t5] + movh m2, [r1] + movh m3, [r1 + r4] punpcklqdq m2, m3 paddw m0, m2 CLIPW m0, m4, m5 - ; store recon[] and recipred[] - movh [t4], m0 -%if ARCH_X86_64 == 0 - add t4, t7 - add t4, t7 - movhps [t4], m0 - add t4, t7 - add t4, t7 + ; store recipred[] + movh [r3], m0 + movhps [r3 + r6], m0 + + ; store recqt[] + movh [r2], m0 + movhps [r2 + r5], m0 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH + +%if ARCH_X86_64 == 1 +cglobal calcRecons4, 5,8,4 + %define t7b r7b %else - movhps [t4 + t7], m0 - lea t4, [t4 + t7 * 2] +cglobal calcRecons4, 5,7,4,0-1 + %define t7b byte [rsp] %endif - - ; store recqt[] - movh [t3], m0 - add t3, t6 - movhps [t3], m0 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 2] - - dec t8d - jnz .loop - -%else ;HIGH_BIT_DEPTH -%if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,4 -%else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,4 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d -%endif - - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d -%endif + mov r6d, r6m + add r5d, r5d pxor m0, m0 - mov t8d, 4/2 + mov t7b, 4/2 .loop: - movd m1, [t0] - movd m2, [t0 + t5] + movd m1, [r0] + movd m2, [r0 + r4] punpckldq m1, m2 punpcklbw m1, m0 - movh m2, [t1] - movh m3, [t1 + t5 * 2] + movh m2, [r1] + movh m3, [r1 + r4 * 2] punpcklqdq m2, m3 paddw m1, m2 packuswb m1, m1 ; store 
recon[] and recipred[] - movd [t4], m1 - add t4, t7 + movd [r3], m1 pshufd m2, m1, 1 - movd [t4], m2 - add t4, t7 + movd [r3 + r6], m2 ; store recqt[] punpcklbw m1, m0 - movlps [t3], m1 - add t3, t6 - movhps [t3], m1 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 4] - - dec t8d + movlps [r2], m1 + movhps [r2 + r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 4] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b jnz .loop + RET %endif ;HIGH_BIT_DEPTH - RET INIT_XMM sse2 -cglobal calcRecons8 +%if ARCH_X86_64 == 1 +cglobal calcRecons8, 5,8,4 + %define t7b r7b +%else +cglobal calcRecons8, 5,7,4,0-1 + %define t7b byte [rsp] +%endif + %if HIGH_BIT_DEPTH -%if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,6 -%else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,6 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d -%endif - - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d - add t7, t7 -%endif + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d pxor m4, m4 mova m5, [pw_pixel_max] - add t5, t5 - mov t8d, 8/2 + mov t7b, 8/2 .loop: - movu m0, [t0] - movu m1, [t0 + t5] - movu m2, [t1] - movu m3, [t1 + t5] + movu m0, [r0] + movu m1, [r0 + r4] + movu m2, [r1] + movu m3, [r1 + r4] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 - ; store recon[] and recipred[] - movu [t4], m0 -%if ARCH_X86_64 == 0 - add t4, t7 - add t4, t7 - movu [t4], m1 - add t4, t7 - add t4, t7 -%else - movu [t4 + t7], m1 - lea t4, [t4 + t7 * 2] -%endif + ; store recipred[] + movu [r3], m0 + movu [r3 + r6], m1 ; store recqt[] - movu [t3], m0 - add t3, t6 - movu [t3], m1 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 2] - - dec t8d + movu [r2], m0 + movu [r2 + r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b jnz .loop + RET %else ;HIGH_BIT_DEPTH -%if ARCH_X86_64 == 1 
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,5 -%else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,5 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d -%endif - - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d -%endif + mov r6d, r6m + add r5d, r5d pxor m0, m0 - mov t8d, 8/2 + mov t7b, 8/2 .loop: - movh m1, [t0] - movh m2, [t0 + t5] + movh m1, [r0] + movh m2, [r0 + r4] punpcklbw m1, m0 punpcklbw m2, m0 - movu m3, [t1] - movu m4, [t1 + t5 * 2] + movu m3, [r1] + movu m4, [r1 + r4 * 2] paddw m1, m3 paddw m2, m4 packuswb m1, m2 ; store recon[] and recipred[] - movlps [t4], m1 -%if ARCH_X86_64 == 0 - add t4, t7 - movhps [t4], m1 - add t4, t7 -%else - movhps [t4 + t7], m1 - lea t4, [t4 + t7 * 2] -%endif + movlps [r3], m1 + movhps [r3 + r6], m1 ; store recqt[] punpcklbw m2, m1, m0 punpckhbw m1, m0 - movu [t3], m2 - add t3, t6 - movu [t3], m1 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 4] - - dec t8d + movu [r2], m2 + movu [r2 + r5], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 4] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b jnz .loop + RET %endif ;HIGH_BIT_DEPTH - RET %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal calcRecons16 %if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,6 +cglobal calcRecons16, 5,8,4 + %define t7b r7b %else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,6 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d +cglobal calcRecons16, 5,7,4,0-1 + %define t7b byte [rsp] %endif - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d - add t7, t7 -%endif + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d pxor m4, m4 mova m5, [pw_pixel_max] - add t5, t5 - mov t8d, 16/2 + mov t7b, 16/2 .loop: - movu m0, [t0] - movu m1, [t0 + 16] - movu m2, [t1] - movu m3, [t1 + 16] + movu m0, [r0] + movu m1, [r0 + 16] + 
movu m2, [r1] + movu m3, [r1 + 16] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 - ; store recon[] and recipred[] - movu [t4], m0 - movu [t4 + 16], m1 -%if ARCH_X86_64 == 0 - add t4, t7 - add t4, t7 -%endif + ; store recipred[] + movu [r3], m0 + movu [r3 + 16], m1 ; store recqt[] - movu [t3], m0 - movu [t3 + 16], m1 - add t3, t6 - - movu m0, [t0 + t5] - movu m1, [t0 + t5 + 16] - movu m2, [t1 + t5] - movu m3, [t1 + t5 + 16] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + r4] + movu m1, [r0 + r4 + 16] + movu m2, [r1 + r4] + movu m3, [r1 + r4 + 16] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 ; store recon[] and recipred[] -%if ARCH_X86_64 == 0 - movu [t4], m0 - movu [t4 + 16], m1 - add t4, t7 - add t4, t7 + movu [r3 + r6], m0 + movu [r3 + r6 + 16], m1 + + ; store recqt[] + movu [r2 + r5], m0 + movu [r2 + r5 + 16], m1 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + lea r2, [r2 + r5 * 2] + lea r3, [r3 + r6 * 2] + + dec t7b + jnz .loop + RET +%else ;HIGH_BIT_DEPTH + +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal calcRecons16, 5,8,4 + %define t7b r7b %else - movu [t4 + t7], m0 - movu [t4 + t7 + 16], m1 - lea t4, [t4 + t7 * 2] +cglobal calcRecons16, 5,7,4,0-1 + %define t7b byte [rsp] %endif - ; store recqt[] - movu [t3], m0 - movu [t3 + 16], m1 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 2] - - dec t8d - jnz .loop -%else ;HIGH_BIT_DEPTH -INIT_XMM sse4 -cglobal calcRecons16 -%if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,3 -%else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,3 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d -%endif - - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d -%endif + mov r6d, r6m + add r5d, r5d pxor m0, m0 - mov t8d, 16 + mov t7b, 16 .loop: - movu m2, [t0] + movu m2, [r0] pmovzxbw m1, m2 punpckhbw m2, m0 - paddw m1, [t1] - paddw m2, [t1 + 16] + paddw m1, [r1] + paddw 
m2, [r1 + 16] packuswb m1, m2 ; store recon[] and recipred[] - movu [t4], m1 + movu [r3], m1 ; store recqt[] pmovzxbw m2, m1 punpckhbw m1, m0 - movu [t3], m2 - movu [t3 + 16], m1 - - add t3, t6 - add t4, t7 - add t0, t5 - lea t1, [t1 + t5 * 2] - - dec t8d + movu [r2], m2 + movu [r2 + 16], m1 + + add r2, r5 + add r3, r6 + add r0, r4 + lea r1, [r1 + r4 * 2] + + dec t7b jnz .loop + RET %endif ;HIGH_BIT_DEPTH - RET %if HIGH_BIT_DEPTH INIT_XMM sse2 -cglobal calcRecons32 %if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,6 +cglobal calcRecons32, 5,8,4 + %define t7b r7b %else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,6 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d +cglobal calcRecons32, 5,7,4,0-1 + %define t7b byte [rsp] %endif - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d - add t7, t7 -%endif + mov r6d, r6m + add r4d, r4d + add r5d, r5d + add r6d, r6d pxor m4, m4 mova m5, [pw_pixel_max] - add t5, t5 - mov t8d, 32/2 + mov t7b, 32/2 .loop: - movu m0, [t0] - movu m1, [t0 + 16] - movu m2, [t1] - movu m3, [t1 + 16] + movu m0, [r0] + movu m1, [r0 + 16] + movu m2, [r1] + movu m3, [r1 + 16] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 - ; store recon[] and recipred[] - movu [t4], m0 - movu [t4 + 16], m1 + ; store recipred[] + movu [r3], m0 + movu [r3 + 16], m1 ; store recqt[] - movu [t3], m0 - movu [t3 + 16], m1 - - movu m0, [t0 + 32] - movu m1, [t0 + 48] - movu m2, [t1 + 32] - movu m3, [t1 + 48] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + 32] + movu m1, [r0 + 48] + movu m2, [r1 + 32] + movu m3, [r1 + 48] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 ; store recon[] and recipred[] - movu [t4 + 32], m0 - movu [t4 + 48], m1 -%if ARCH_X86_64 == 0 - add t4, t7 - add t4, t7 -%endif + movu [r3 + 32], m0 + movu [r3 + 48], m1 ; store recqt[] - movu [t3 + 32], m0 - movu [t3 + 48], m1 - add t3, t6 - - movu m0, [t0 + 
t5] - movu m1, [t0 + t5 + 16] - movu m2, [t1 + t5] - movu m3, [t1 + t5 + 16] + movu [r2 + 32], m0 + movu [r2 + 48], m1 + add r2, r5 + + movu m0, [r0 + r4] + movu m1, [r0 + r4 + 16] + movu m2, [r1 + r4] + movu m3, [r1 + r4 + 16] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 ; store recon[] and recipred[] -%if ARCH_X86_64 == 0 - movu [t4], m0 - movu [t4 + 16], m1 -%else - movu [t4 + t7], m0 - movu [t4 + t7 + 16], m1 -%endif + movu [r3 + r6], m0 + movu [r3 + r6 + 16], m1 ; store recqt[] - movu [t3], m0 - movu [t3 + 16], m1 - - movu m0, [t0 + t5 + 32] - movu m1, [t0 + t5 + 48] - movu m2, [t1 + t5 + 32] - movu m3, [t1 + t5 + 48] + movu [r2], m0 + movu [r2 + 16], m1 + + movu m0, [r0 + r4 + 32] + movu m1, [r0 + r4 + 48] + movu m2, [r1 + r4 + 32] + movu m3, [r1 + r4 + 48] paddw m0, m2 paddw m1, m3 CLIPW m0, m4, m5 CLIPW m1, m4, m5 ; store recon[] and recipred[] -%if ARCH_X86_64 == 0 - movu [t4 + 32], m0 - movu [t4 + 48], m1 - add t4, t7 - add t4, t7 -%else - movu [t4 + t7 + 32], m0 - movu [t4 + t7 + 48], m1 - lea t4, [t4 + t7 * 2] -%endif + movu [r3 + r6 + 32], m0 + movu [r3 + r6 + 48], m1 + lea r3, [r3 + r6 * 2] ; store recqt[] - movu [t3 + 32], m0 - movu [t3 + 48], m1 - add t3, t6 - - lea t0, [t0 + t5 * 2] - lea t1, [t1 + t5 * 2] - - dec t8d + movu [r2 + 32], m0 + movu [r2 + 48], m1 + add r2, r5 + + lea r0, [r0 + r4 * 2] + lea r1, [r1 + r4 * 2] + + dec t7b jnz .loop + RET %else ;HIGH_BIT_DEPTH INIT_XMM sse4 -cglobal calcRecons32 %if ARCH_X86_64 == 1 - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 - PROLOGUE 6,9,5 +cglobal calcRecons32, 5,8,4 + %define t7b r7b %else - DECLARE_REG_TMP 0,1,2,3,4,5 - PROLOGUE 6,7,5 - %define t6 r6m - %define t6d r6d - %define t7 r7m - %define t8d r6d +cglobal calcRecons32, 5,7,4,0-1 + %define t7b byte [rsp] %endif - mov t6d, r6m -%if ARCH_X86_64 == 0 - add t6d, t6d - mov r6m, t6d -%else + mov r4d, r4m mov r5d, r5m - mov r7d, r7m - add t6d, t6d -%endif + mov r6d, r6m + add r5d, r5d pxor m0, m0 - mov t8d, 32 + mov t7b, 32 .loop: - movu m2, 
[t0] - movu m4, [t0 + 16] + movu m2, [r0] + movu m4, [r0 + 16] pmovzxbw m1, m2 punpckhbw m2, m0 pmovzxbw m3, m4 punpckhbw m4, m0 - paddw m1, [t1 + 0 * 16] - paddw m2, [t1 + 1 * 16] + paddw m1, [r1 + 0 * 16] + paddw m2, [r1 + 1 * 16] packuswb m1, m2 - paddw m3, [t1 + 2 * 16] - paddw m4, [t1 + 3 * 16] + paddw m3, [r1 + 2 * 16] + paddw m4, [r1 + 3 * 16] packuswb m3, m4 ; store recon[] and recipred[] - movu [t4], m1 - movu [t4 + 16], m3 + movu [r3], m1 + movu [r3 + 16], m3 ; store recqt[] pmovzxbw m2, m1 punpckhbw m1, m0 - movu [t3 + 0 * 16], m2 - movu [t3 + 1 * 16], m1 + movu [r2 + 0 * 16], m2 + movu [r2 + 1 * 16], m1 pmovzxbw m4, m3 punpckhbw m3, m0 - movu [t3 + 2 * 16], m4 - movu [t3 + 3 * 16], m3 - - add t3, t6 - add t4, t7 - add t0, t5 - lea t1, [t1 + t5 * 2] - - dec t8d + movu [r2 + 2 * 16], m4 + movu [r2 + 3 * 16], m3 + + add r2, r5 + add r3, r6 + add r0, r4 + lea r1, [r1 + r4 * 2] + + dec t7b jnz .loop + RET %endif ;HIGH_BIT_DEPTH - RET ;----------------------------------------------------------------------------- diff -r 0206822d9fea -r 4348a3ed1b32 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Tue Apr 01 23:28:32 2014 +0530 +++ b/source/test/pixelharness.cpp Wed Apr 02 13:12:50 2014 -0700 @@ -354,10 +354,8 @@ int stride = STRIDE; int index1 = rand() % TEST_CASES; int index2 = rand() % TEST_CASES; - ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j, - ref_reco, ref_recq, ref_pred, stride, stride, stride); - opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j, - opt_reco, opt_recq, opt_pred, stride, stride, stride); + ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j, ref_recq, ref_pred, stride, stride, stride); + opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j, opt_recq, opt_pred, stride, stride, stride); if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(int16_t))) { @@ -1609,7 +1607,7 @@ if (opt.calcrecon[i]) { HEADER("recon[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.calcrecon[i], 
ref.calcrecon[i], pbuf1, sbuf1, pbuf2, sbuf1, pbuf1, 64, 64, 64); + REPORT_SPEEDUP(opt.calcrecon[i], ref.calcrecon[i], pbuf1, sbuf1, sbuf1, pbuf1, 64, 64, 64); } if (opt.blockfill_s[i]) _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel