From: Pierre Edouard Lepere <[email protected]>

Integration to Libav by Josh de Kock <josh at itanimul.li>.
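This patch adds SIMD versions of the HEVC add_residual functions, the
reconstruction step that adds the inverse-transformed residual back onto
the predicted pixels already in dst: dst[x] = clip(dst[x] + res[x]),
clamped to the valid range of the bit depth (0..255 at 8-bit, 0..1023 at
10-bit, hence the max_pixels_10 constant). For reference, a minimal
scalar sketch of the 8-bit behaviour (the function name and open-coded
clamp below are illustrative only, not taken from libav):

    #include <stddef.h>
    #include <stdint.h>

    /* Add a size x size block of 16-bit residuals onto 8-bit pixels,
     * clamping each sum to [0, 255]. The SSE2/AVX kernels get the same
     * effect branch-free: they split the residual into its positive and
     * negative parts and use saturating byte adds/subs
     * (paddusb/psubusb). */
    static void add_residual_8_c(uint8_t *dst, const int16_t *res,
                                 ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : (v > 255 ? 255 : v);
            }
            dst += stride;   /* dst rows are stride bytes apart */
            res += size;     /* residuals are stored contiguously */
        }
    }

The 10-bit kernels do the same on 16-bit pixels; since x86 has no
saturating word add with an arbitrary upper bound, they use a plain
paddw followed by CLIPW against [0, max_pixels_10].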
Authors: James Almer <[email protected]>
         Pierre Edouard Lepere <[email protected]>
---
 libavcodec/x86/Makefile         |   7 +-
 libavcodec/x86/hevc_res_add.asm | 376 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp_init.c   |  33 ++++
 3 files changed, 413 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/hevc_res_add.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 1460197..11f3b9c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -114,9 +114,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o            \
-                                          x86/hevc_mc.o                 \
-                                          x86/hevc_idct.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_res_add.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 0000000..0faa7af
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,376 @@
+; /*
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10: times 16 dw ((1 << 10)-1)
+
+
+SECTION .text
+
+; the res_add macros and functions were largely inspired by the x264 project's
+; code in h264_idct.asm
+%macro RES_ADD_MMX_4_8 0
+    mova       m2, [coeffsq]
+    mova       m4, [coeffsq + 8]
+    pxor       m3, m3
+    psubw      m3, m2
+    packuswb   m2, m2
+    packuswb   m3, m3
+    pxor       m5, m5
+    psubw      m5, m4
+    packuswb   m4, m4
+    packuswb   m5, m5
+
+    movh       m0, [dstq]
+    movh       m1, [dstq + strideq]
+    paddusb    m0, m2
+    paddusb    m1, m4
+    psubusb    m0, m3
+    psubusb    m1, m5
+    movh       [dstq], m0
+    movh       [dstq + strideq], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 5, 6, dst, coeffs, stride
+    RES_ADD_MMX_4_8
+    add        coeffsq, 16
+    lea        dstq, [dstq + strideq * 2]
+    RES_ADD_MMX_4_8
+    RET
+
+%macro RES_ADD_SSE_8_8 0
+    pxor       m3, m3
+    mova       m4, [coeffsq]
+    mova       m6, [coeffsq + 16]
+    mova       m0, [coeffsq + 32]
+    mova       m2, [coeffsq + 48]
+    psubw      m5, m3, m4
+    psubw      m7, m3, m6
+    psubw      m1, m3, m0
+    packuswb   m4, m0
+    packuswb   m5, m1
+    psubw      m3, m2
+    packuswb   m6, m2
+    packuswb   m7, m3
+
+    movq       m0, [dstq]
+    movq       m1, [dstq + strideq]
+    movhps     m0, [dstq + strideq * 2]
+    movhps     m1, [dstq + r3]
+    paddusb    m0, m4
+    paddusb    m1, m6
+    psubusb    m0, m5
+    psubusb    m1, m7
+    movq       [dstq], m0
+    movq       [dstq + strideq], m1
+    movhps     [dstq + 2 * strideq], m0
+    movhps     [dstq + r3], m1
+%endmacro
+
+%macro RES_ADD_SSE_16_32_8 3
+    mova       m2, [coeffsq + %1]
+    mova       m6, [coeffsq + %1 + 16]
+%if cpuflag(avx)
+    psubw      m1, m0, m2
+    psubw      m5, m0, m6
+%else
+    mova       m1, m0
+    mova       m5, m0
+    psubw      m1, m2
+    psubw      m5, m6
+%endif
+    packuswb   m2, m6
+    packuswb   m1, m5
+
+    mova       m4, [coeffsq + %1 + 32]
+    mova       m6, [coeffsq + %1 + 48]
+%if cpuflag(avx)
+    psubw      m3, m0, m4
+    psubw      m5, m0, m6
+%else
+    mova       m3, m0
+    mova       m5, m0
+    psubw      m3, m4
+    psubw      m5, m6
+%endif
+    packuswb   m4, m6
+    packuswb   m3, m5
+
+    paddusb    m2, [%2]
+    paddusb    m4, [%3]
+    psubusb    m2, m1
+    psubusb    m4, m3
+    mova       [%2], m2
+    mova       [%3], m4
+%endmacro
+
+
+%macro RESIDUAL_ADD_8 0
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 5, 8, dst, coeffs, stride
+    lea        r3, [strideq * 3]
+    RES_ADD_SSE_8_8
+    add        coeffsq, 64
+    lea        dstq, [dstq + strideq * 4]
+    RES_ADD_SSE_8_8
+    RET
+
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 5, 7, dst, coeffs, stride
+    pxor       m0, m0
+    lea        r3, [strideq * 3]
+    RES_ADD_SSE_16_32_8 0, dstq, dstq + strideq
+    RES_ADD_SSE_16_32_8 64, dstq + strideq * 2, dstq + r3
+    mov        r4d, 3
+.loop:
+    add        coeffsq, 128
+    lea        dstq, [dstq + strideq * 4]
+    RES_ADD_SSE_16_32_8 0, dstq, dstq + strideq
+    RES_ADD_SSE_16_32_8 64, dstq + strideq * 2, dstq + r3
+    dec        r4d
+    jnz        .loop
+    RET
+
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7, dst, coeffs, stride
+    pxor       m0, m0
+    RES_ADD_SSE_16_32_8 0, dstq, dstq + 16
+    RES_ADD_SSE_16_32_8 64, dstq + strideq, dstq + strideq + 16
+    mov        r4d, 15
+.loop:
+    add        coeffsq, 128
+    lea        dstq, [dstq + strideq * 2]
+    RES_ADD_SSE_16_32_8 0, dstq, dstq + 16
+    RES_ADD_SSE_16_32_8 64, dstq + strideq, dstq + strideq + 16
+    dec        r4d
+    jnz        .loop
+    RET
+
+%endmacro
+
+INIT_XMM sse2
+RESIDUAL_ADD_8
+INIT_XMM avx
+RESIDUAL_ADD_8
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *coeffs, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro RES_ADD_SSE_8_10 4
+    mova       m0, [%4]
+    mova       m1, [%4 + 16]
+    mova       m2, [%4 + 32]
+    mova       m3, [%4 + 48]
+    paddw      m0, [%1 + 0]
+    paddw      m1, [%1 + %2]
+    paddw      m2, [%1 + %2 * 2]
+    paddw      m3, [%1 + %3]
+    CLIPW      m0, m4, m5
+    CLIPW      m1, m4, m5
+    CLIPW      m2, m4, m5
+    CLIPW      m3, m4, m5
+    mova       [%1 + 0], m0
+    mova       [%1 + %2], m1
+    mova       [%1 + %2 * 2], m2
+    mova       [%1 + %3], m3
+%endmacro
+
+%macro RES_ADD_MMX4_10 3
+    mova       m0, [%1 + 0]
+    mova       m1, [%1 + %2]
+    paddw      m0, [%3]
+    paddw      m1, [%3 + 8]
+    CLIPW      m0, m2, m3
+    CLIPW      m1, m2, m3
+    mova       [%1 + 0], m0
+    mova       [%1 + %2], m1
+%endmacro
+
+%macro RES_ADD_SSE_16_10 3
+    mova       m0, [%3]
+    mova       m1, [%3 + 16]
+    mova       m2, [%3 + 32]
+    mova       m3, [%3 + 48]
+    paddw      m0, [%1]
+    paddw      m1, [%1 + 16]
+    paddw      m2, [%1 + %2]
+    paddw      m3, [%1 + %2 + 16]
+    CLIPW      m0, m4, m5
+    CLIPW      m1, m4, m5
+    CLIPW      m2, m4, m5
+    CLIPW      m3, m4, m5
+    mova       [%1], m0
+    mova       [%1 + 16], m1
+    mova       [%1 + %2], m2
+    mova       [%1 + %2 + 16], m3
+%endmacro
+
+%macro RES_ADD_SSE_32_10 2
+    mova       m0, [%2]
+    mova       m1, [%2 + 16]
+    mova       m2, [%2 + 32]
+    mova       m3, [%2 + 48]
+
+    paddw      m0, [%1]
+    paddw      m1, [%1 + 16]
+    paddw      m2, [%1 + 32]
+    paddw      m3, [%1 + 48]
+    CLIPW      m0, m4, m5
+    CLIPW      m1, m4, m5
+    CLIPW      m2, m4, m5
+    CLIPW      m3, m4, m5
+    mova       [%1], m0
+    mova       [%1 + 16], m1
+    mova       [%1 + 32], m2
+    mova       [%1 + 48], m3
+%endmacro
+
+%macro RES_ADD16_AVX2 4
+    mova       m0, [%4]
+    mova       m1, [%4 + 32]
+    mova       m2, [%4 + 64]
+    mova       m3, [%4 + 96]
+
+    paddw      m0, [%1 + 0]
+    paddw      m1, [%1 + %2]
+    paddw      m2, [%1 + %2 * 2]
+    paddw      m3, [%1 + %3]
+
+    CLIPW      m0, m4, m5
+    CLIPW      m1, m4, m5
+    CLIPW      m2, m4, m5
+    CLIPW      m3, m4, m5
+    mova       [%1 + 0], m0
+    mova       [%1 + %2], m1
+    mova       [%1 + %2 * 2], m2
+    mova       [%1 + %3], m3
+%endmacro
+
+%macro RES_ADD32_AVX2 3
+    mova       m0, [%3]
+    mova       m1, [%3 + 32]
+    mova       m2, [%3 + 64]
+    mova       m3, [%3 + 96]
+
+    paddw      m0, [%1]
+    paddw      m1, [%1 + 32]
+    paddw      m2, [%1 + %2]
+    paddw      m3, [%1 + %2 + 32]
+
+    CLIPW      m0, m4, m5
+    CLIPW      m1, m4, m5
+    CLIPW      m2, m4, m5
+    CLIPW      m3, m4, m5
+    mova       [%1], m0
+    mova       [%1 + 32], m1
+    mova       [%1 + %2], m2
+    mova       [%1 + %2 + 32], m3
+%endmacro
+
+
+INIT_MMX mmxext
+cglobal hevc_add_residual_4_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m2, m2
+    mova       m3, [max_pixels_10]
+    RES_ADD_MMX4_10 dstq, strideq, coeffsq
+    add        coeffsq, 16
+    lea        dstq, [dstq + 2 * strideq]
+    RES_ADD_MMX4_10 dstq, strideq, coeffsq
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_add_residual_10(pixel *dst, int16_t *coeffs, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal hevc_add_residual_8_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m4, m4
+    mova       m5, [max_pixels_10]
+    lea        r3, [strideq * 3]
+
+    RES_ADD_SSE_8_10 dstq, strideq, r3, coeffsq
+    lea        dstq, [dstq + strideq * 4]
+    add        coeffsq, 64
+    RES_ADD_SSE_8_10 dstq, strideq, r3, coeffsq
+    RET
+
+cglobal hevc_add_residual_16_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m4, m4
+    mova       m5, [max_pixels_10]
+
+    RES_ADD_SSE_16_10 dstq, strideq, coeffsq
+    mov        r4d, 7
+.loop:
+    lea        dstq, [dstq + strideq * 2]
+    add        coeffsq, 64
+    RES_ADD_SSE_16_10 dstq, strideq, coeffsq
+    dec        r4d
+    jnz        .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m4, m4
+    mova       m5, [max_pixels_10]
+
+    RES_ADD_SSE_32_10 dstq, coeffsq
+    mov        r4d, 31
+.loop:
+    lea        dstq, [dstq + strideq]
+    add        coeffsq, 64
+    RES_ADD_SSE_32_10 dstq, coeffsq
+    dec        r4d
+    jnz        .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+cglobal hevc_add_residual_16_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m4, m4
+    mova       m5, [max_pixels_10]
+    lea        r3, [strideq * 3]
+
+    RES_ADD16_AVX2 dstq, strideq, r3, coeffsq
+    mov        r4d, 3
+.loop:
+    lea        dstq, [dstq + strideq * 4]
+    add        coeffsq, 128
+    RES_ADD16_AVX2 dstq, strideq, r3, coeffsq
+    dec        r4d
+    jnz        .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6, dst, coeffs, stride
+    pxor       m4, m4
+    mova       m5, [max_pixels_10]
+
+    RES_ADD32_AVX2 dstq, strideq, coeffsq
+    mov        r4d, 15
+.loop:
+    lea        dstq, [dstq + strideq * 2]
+    add        coeffsq, 128
+    RES_ADD32_AVX2 dstq, strideq, coeffsq
+    dec        r4d
+    jnz        .loop
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f754038..b97f4c8 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -78,6 +78,23 @@ IDCT_FUNCS(32x32, sse2);
 IDCT_FUNCS(16x16, avx2);
 IDCT_FUNCS(32x32, avx2);
 
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
 #define GET_PIXELS(width, depth, cf)                                                   \
 void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
                                                            uint8_t *src, ptrdiff_t srcstride, \
@@ -273,6 +290,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
+
+        c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+        c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+        c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+
         SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
         SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 
@@ -301,6 +323,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
 
+        c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+        c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+        c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
+
         SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
         SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 
@@ -329,6 +355,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 #if HAVE_AVX_EXTERNAL
         SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
         SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
+
+        c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+        c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+        c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
 #endif /* HAVE_AVX_EXTERNAL */
     }
     if (EXTERNAL_AVX2(cpu_flags)) {
@@ -359,6 +389,9 @@
     if (EXTERNAL_AVX2(cpu_flags)) {
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
+
+        c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+        c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
     }
 }
 #endif /* ARCH_X86_64 */
-- 
2.7.4 (Apple Git-66)
