# HG changeset patch
# User Min Chen <chenm...@163.com>
# Date 1384337983 -28800
# Node ID 8962d6fc534ae0597f7f4547f3be3a732bda439d
# Parent  90c2763ee0272247dabce845f4fcc3c3e73316fb
asm: assembly code for calcrecon[]
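For reference while reviewing, here is a minimal scalar sketch of what these
kernels compute, as read from the assembly below. It is not the encoder's
actual C primitive: blockSize is passed explicitly here only for illustration
(the real primitives are specialized per 4/8/16/32/64 block size), 'pixel' is
x265's 8-bit pixel typedef, and the 0..255 clamp matches the packuswb
saturation used in the 8-bit build.

    static void calcRecons_sketch(pixel* pred, int16_t* residual, pixel* recon,
                                  int16_t* reconqt, pixel* reconipred,
                                  int stride, int strideqt, int strideipred,
                                  int blockSize)  /* 4, 8, 16, 32 or 64 */
    {
        for (int y = 0; y < blockSize; y++)
        {
            for (int x = 0; x < blockSize; x++)
            {
                int sum = pred[x] + residual[x];
                pixel p = (pixel)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); /* packuswb-style clamp */
                recon[x]      = p;           /* reconstructed pixels */
                reconqt[x]    = (int16_t)p;  /* same values widened back to int16_t */
                reconipred[x] = p;           /* copy kept for intra prediction */
            }
            pred       += stride;
            residual   += stride;            /* residual rows advance by 'stride' elements */
            recon      += stride;
            reconqt    += strideqt;
            reconipred += strideipred;
        }
    }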
diff -r 90c2763ee027 -r 8962d6fc534a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 13 18:19:43 2013 +0800
@@ -449,6 +449,8 @@
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
         p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
+        p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
+        p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -524,6 +526,9 @@
         p.chroma_copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
         p.chroma_copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
         p.chroma_copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
+        p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
+        p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
+        p.calcrecon[BLOCK_64x64] = x265_calcRecons64_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 90c2763ee027 -r 8962d6fc534a source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm  Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/pixel-util.asm  Wed Nov 13 18:19:43 2013 +0800
@@ -1,103 +1,475 @@
-;*****************************************************************************
-;* Copyright (C) 2013 x265 project
-;*
-;* Authors: Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licens...@multicorewareinc.com.
-;*****************************************************************************/
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-SECTION .text
-
-
-;-----------------------------------------------------------------------------
-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
-%define rnd m7
-%define shift m6
-
-    ; make shift
-    mov         r5d, r3m
-    movd        shift, r5d
-
-    ; make round
-    dec         r5
-    xor         r6, r6
-    bts         r6, r5
-
-    movd        rnd, r6d
-    pshufd      rnd, rnd, 0
-
-    ; register alloc
-    ; r0 - dst
-    ; r1 - src
-    ; r2 - stride * 2 (short*)
-    ; r3 - lx
-    ; r4 - size
-    ; r5 - ly
-    ; r6 - diff
-    lea         r2, [r2 * 2]
-
-    mov         r4d, r4m
-    mov         r5, r4
-    mov         r6, r2
-    sub         r6, r4
-    lea         r6, [r6 * 2]
-
-    shr         r5, 1
-.loop_row:
-
-    mov         r3, r4
-    shr         r3, 2
-.loop_col:
-    ; row 0
-    movu        m0, [r1]
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0], m0
-
-    ; row 1
-    movu        m0, [r1 + r4 * 4]
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0 + r2], m0
-
-    ; move col pointer
-    add         r1, 16
-    add         r0, 8
-
-    dec         r3
-    jg          .loop_col
-
-    ; update pointer
-    lea         r1, [r1 + r4 * 4]
-    add         r0, r6
-
-    ; end of loop_row
-    dec         r5
-    jg          .loop_row
-
-    RET
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licens...@multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+
+;-----------------------------------------------------------------------------
+; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
+%define rnd m7
+%define shift m6
+
+    ; make shift
+    mov         r5d, r3m
+    movd        shift, r5d
+
+    ; make round
+    dec         r5
+    xor         r6, r6
+    bts         r6, r5
+
+    movd        rnd, r6d
+    pshufd      rnd, rnd, 0
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - stride * 2 (short*)
+    ; r3 - lx
+    ; r4 - size
+    ; r5 - ly
+    ; r6 - diff
+    lea         r2, [r2 * 2]
+
+    mov         r4d, r4m
+    mov         r5, r4
+    mov         r6, r2
+    sub         r6, r4
+    lea         r6, [r6 * 2]
+
+    shr         r5, 1
+.loop_row:
+
+    mov         r3, r4
+    shr         r3, 2
+.loop_col:
+    ; row 0
+    movu        m0, [r1]
+    paddd       m0, rnd
+    psrad       m0, shift
+    packssdw    m0, m0
+    movh        [r0], m0
+
+    ; row 1
+    movu        m0, [r1 + r4 * 4]
+    paddd       m0, rnd
+    psrad       m0, shift
+    packssdw    m0, m0
+    movh        [r0 + r2], m0
+
+    ; move col pointer
+    add         r1, 16
+    add         r0, 8
+
+    dec         r3
+    jg          .loop_col
+
+    ; update pointer
+    lea         r1, [r1 + r4 * 4]
+    add         r0, r6
+
+    ; end of loop_row
+    dec         r5
+    jg          .loop_row
+
+    RET
+
+
+;-----------------------------------------------------------------------------
+; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal calcRecons4
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,4
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,4
+    %define t6  r6m
+    %define t6d r6d
+    %define t7  r7m
+    %define t8d r6d
+%endif
+
+    mov         t6d, r6m
+%if ARCH_X86_64 == 0
+    add         t6d, t6d
+    mov         r6m, t6d
+%else
+    mov         r5d, r5m
+    mov         r7d, r7m
+    add         t6d, t6d
+%endif
+
+    pxor        m0, m0
+    mov         t8d, 4/2
+.loop:
+    movd        m1, [t0]
+    movd        m2, [t0 + t5]
+    punpckldq   m1, m2
+    punpcklbw   m1, m0
+    movh        m2, [t1]
+    movh        m3, [t1 + t5 * 2]
+    punpcklqdq  m2, m3
+    paddw       m1, m2
+    packuswb    m1, m1
+
+    ; store recon[] and recipred[]
+    movd        [t2], m1
+    movd        [t4], m1
+    add         t4, t7
+    pshufd      m2, m1, 1
+    movd        [t2 + t5], m2
+    movd        [t4], m2
+    add         t4, t7
+
+    ; store recqt[]
+    punpcklbw   m1, m0
+    movlps      [t3], m1
+    add         t3, t6
+    movhps      [t3], m1
+    add         t3, t6
+
+    lea         t0, [t0 + t5 * 2]
+    lea         t1, [t1 + t5 * 4]
+    lea         t2, [t2 + t5 * 2]
+
+    dec         t8d
+    jnz         .loop
+    RET
+
+
+INIT_XMM sse2
+cglobal calcRecons8
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,5
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,5
+    %define t6  r6m
+    %define t6d r6d
+    %define t7  r7m
+    %define t8d r6d
+%endif
+
+    mov         t6d, r6m
+%if ARCH_X86_64 == 0
+    add         t6d, t6d
+    mov         r6m, t6d
+%else
+    mov         r5d, r5m
+    mov         r7d, r7m
+    add         t6d, t6d
+%endif
+
+    pxor        m0, m0
+    mov         t8d, 8/2
+.loop:
+    movh        m1, [t0]
+    movh        m2, [t0 + t5]
+    punpcklbw   m1, m0
+    punpcklbw   m2, m0
+    movu        m3, [t1]
+    movu        m4, [t1 + t5 * 2]
+    paddw       m1, m3
+    paddw       m2, m4
+    packuswb    m1, m2
+
+    ; store recon[] and recipred[]
+    movlps      [t2], m1
+    movhps      [t2 + t5], m1
+    movlps      [t4], m1
+%if ARCH_X86_64 == 0
+    add         t4, t7
+    movhps      [t4], m1
+    add         t4, t7
+%else
+    movhps      [t4 + t7], m1
+    lea         t4, [t4 + t7 * 2]
+%endif
+
+    ; store recqt[]
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m0
+    movu        [t3], m2
+    add         t3, t6
+    movu        [t3], m1
+    add         t3, t6
+
+    lea         t0, [t0 + t5 * 2]
+    lea         t1, [t1 + t5 * 4]
+    lea         t2, [t2 + t5 * 2]
+
+    dec         t8d
+    jnz         .loop
+    RET
+
+
+INIT_XMM sse4
+cglobal calcRecons16
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,5
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,5
+    %define t6  r6m
+    %define t6d r6d
+    %define t7  r7m
+    %define t8d r6d
+%endif
+
+    mov         t6d, r6m
+%if ARCH_X86_64 == 0
+    add         t6d, t6d
+    mov         r6m, t6d
+%else
+    mov         r5d, r5m
+    mov         r7d, r7m
+    add         t6d, t6d
+%endif
+
+    pxor        m0, m0
+    mov         t8d, 16
+.loop:
+    movu        m2, [t0]
+    pmovzxbw    m1, m2
+    punpckhbw   m2, m0
+    movu        m3, [t1]
+    movu        m4, [t1 + 16]
+    paddw       m1, m3
+    paddw       m2, m4
+    packuswb    m1, m2
+
+    ; store recon[] and recipred[]
+    movu        [t2], m1
+    movu        [t4], m1
+
+    ; store recqt[]
+    pmovzxbw    m2, m1
+    punpckhbw   m1, m0
+    movu        [t3], m2
+    movu        [t3 + 16], m1
+
+    add         t3, t6
+    add         t4, t7
+    add         t0, t5
+    lea         t1, [t1 + t5 * 2]
+    add         t2, t5
+
+    dec         t8d
+    jnz         .loop
+    RET
+
+
+INIT_XMM sse4
+cglobal calcRecons32
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,7
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,7
+    %define t6  r6m
+    %define t6d r6d
+    %define t7  r7m
+    %define t8d r6d
+%endif
+
+    mov         t6d, r6m
+%if ARCH_X86_64 == 0
+    add         t6d, t6d
+    mov         r6m, t6d
+%else
+    mov         r5d, r5m
+    mov         r7d, r7m
+    add         t6d, t6d
+%endif
+
+    pxor        m0, m0
+    mov         t8d, 32
+.loop:
+    movu        m2, [t0]
+    movu        m4, [t0 + 16]
+    pmovzxbw    m1, m2
+    punpckhbw   m2, m0
+    pmovzxbw    m3, m4
+    punpckhbw   m4, m0
+
+    movu        m5, [t1 + 0 * 16]
+    movu        m6, [t1 + 1 * 16]
+    paddw       m1, m5
+    paddw       m2, m6
+    packuswb    m1, m2
+
+    movu        m5, [t1 + 2 * 16]
+    movu        m6, [t1 + 3 * 16]
+    paddw       m3, m5
+    paddw       m4, m6
+    packuswb    m3, m4
+
+    ; store recon[] and recipred[]
+    movu        [t2], m1
+    movu        [t2 + 16], m3
+    movu        [t4], m1
+    movu        [t4 + 16], m3
+
+    ; store recqt[]
+    pmovzxbw    m2, m1
+    punpckhbw   m1, m0
+    movu        [t3 + 0 * 16], m2
+    movu        [t3 + 1 * 16], m1
+    pmovzxbw    m4, m3
+    punpckhbw   m3, m0
+    movu        [t3 + 2 * 16], m4
+    movu        [t3 + 3 * 16], m3
+
+    add         t3, t6
+    add         t4, t7
+    add         t0, t5
+    lea         t1, [t1 + t5 * 2]
+    add         t2, t5
+
+    dec         t8d
+    jnz         .loop
+    RET
+
+
+INIT_XMM sse4
+cglobal calcRecons64
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,7
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,7
+    %define t6  r6m
+    %define t6d r6d
+    %define t7  r7m
+    %define t8d r6d
+%endif
+
+    mov         t6d, r6m
+%if ARCH_X86_64 == 0
+    add         t6d, t6d
+    mov         r6m, t6d
+%else
+    mov         r5d, r5m
+    mov         r7d, r7m
+    add         t6d, t6d
+%endif
+
+    pxor        m0, m0
+    mov         t8d, 64
+.loop:
+    ; left 32 pixel
+    movu        m2, [t0 + 0 * 16]
+    movu        m4, [t0 + 1 * 16]
+    pmovzxbw    m1, m2
+    punpckhbw   m2, m0
+    pmovzxbw    m3, m4
+    punpckhbw   m4, m0
+
+    movu        m5, [t1 + 0 * 16]
+    movu        m6, [t1 + 1 * 16]
+    paddw       m1, m5
+    paddw       m2, m6
+    packuswb    m1, m2
+
+    movu        m5, [t1 + 2 * 16]
+    movu        m6, [t1 + 3 * 16]
+    paddw       m3, m5
+    paddw       m4, m6
+    packuswb    m3, m4
+
+    ; store recon[] and recipred[]
+    movu        [t2 + 0 * 16], m1
+    movu        [t2 + 1 * 16], m3
+    movu        [t4 + 0 * 16], m1
+    movu        [t4 + 1 * 16], m3
+
+    ; store recqt[]
+    pmovzxbw    m2, m1
+    punpckhbw   m1, m0
+    movu        [t3 + 0 * 16], m2
+    movu        [t3 + 1 * 16], m1
+    pmovzxbw    m4, m3
+    punpckhbw   m3, m0
+    movu        [t3 + 2 * 16], m4
+    movu        [t3 + 3 * 16], m3
+
+    ; right 32 pixel
+    movu        m2, [t0 + 2 * 16]
+    movu        m4, [t0 + 3 * 16]
+    pmovzxbw    m1, m2
+    punpckhbw   m2, m0
+    pmovzxbw    m3, m4
+    punpckhbw   m4, m0
+
+    movu        m5, [t1 + 4 * 16]
+    movu        m6, [t1 + 5 * 16]
+    paddw       m1, m5
+    paddw       m2, m6
+    packuswb    m1, m2
+
+    movu        m5, [t1 + 6 * 16]
+    movu        m6, [t1 + 7 * 16]
+    paddw       m3, m5
+    paddw       m4, m6
+    packuswb    m3, m4
+
+    ; store recon[] and recipred[]
+    movu        [t2 + 2 * 16], m1
+    movu        [t2 + 3 * 16], m3
+    movu        [t4 + 2 * 16], m1
+    movu        [t4 + 3 * 16], m3
+
+    ; store recqt[]
+    pmovzxbw    m2, m1
+    punpckhbw   m1, m0
+    movu        [t3 + 4 * 16], m2
+    movu        [t3 + 5 * 16], m1
+    pmovzxbw    m4, m3
+    punpckhbw   m3, m0
+    movu        [t3 + 6 * 16], m4
+    movu        [t3 + 7 * 16], m3
+
+    add         t3, t6
+    add         t4, t7
+    add         t0, t5
+    lea         t1, [t1 + t5 * 2]
+    add         t2, t5
+
+    dec         t8d
+    jnz         .loop
+    RET
diff -r 90c2763ee027 -r 8962d6fc534a source/common/x86/pixel.h
--- a/source/common/x86/pixel.h  Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/pixel.h  Wed Nov 13 18:19:43 2013 +0800
@@ -289,4 +289,10 @@
 #undef DECL_X4
 #undef DECL_ADS
 
+void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons64_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+
 #endif // ifndef X265_I386_PIXEL_H
diff -r 90c2763ee027 -r 8962d6fc534a source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp  Tue Nov 12 16:55:09 2013 +0530
+++ b/source/test/pixelharness.cpp  Wed Nov 13 18:19:43 2013 +0800
@@ -286,8 +286,8 @@
     for (int i = 0; i < ITERS; i++)
     {
         int stride = STRIDE;
+        ref(pbuf1 + j, sbuf1 + j, ref_reco, ref_recq, ref_pred, stride, stride, stride);
         opt(pbuf1 + j, sbuf1 + j, opt_reco, opt_recq, opt_pred, stride, stride, stride);
-        ref(pbuf1 + j, sbuf1 + j, ref_reco, ref_recq, ref_pred, stride, stride, stride);
 
         if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(int16_t)))
             return false;
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel