Re: [libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations
On Wed, Oct 19, 2016 at 10:18 AM, Diego Biurrun wrote: > +%macro ADD_RES_MMX_4_8 0 > +mova m2, [r1] > +mova m4, [r1+8] > +pxor m3, m3 > +psubw m3, m2 > +packuswb m2, m2 > +packuswb m3, m3 > +pxor m5, m5 > +psubw m5, m4 > +packuswb m4, m4 > +packuswb m5, m5 > + > +movh m0, [r0] > +movh m1, [r0+r2] > +paddusb m0, m2 > +paddusb m1, m4 > +psubusb m0, m3 > +psubusb m1, m5 > +movh [r0], m0 > +movh [r0+r2], m1 > +%endmacro mova m0, [r1] mova m2, [r1+8] pxor m1, m1 pxor m3, m3 psubw m1, m0 psubw m3, m2 packuswb m0, m2 packuswb m1, m3 movd m2, [r0] movd m3, [r0+r2] punpckldq m2, m3 paddusb m0, m2 psubusb m0, m1 movd [r0], m0 psrlq m0, 32 movd [r0+r2], m0 [...] > +cglobal hevc_add_residual_4_8, 3, 4, 6 r3 isn't used, no need to reserve it. [...] > +%if cpuflag(avx) > +psubw m3, m0, m4 > +psubw m5, m0, m6 > +%else > +mova m3, m0 > +mova m5, m0 > +psubw m3, m4 > +psubw m5, m6 > +%endif Pointless %else. x86inc will do this automatically for non-AVX when 3-arg syntax is used. [...] > +dec r4d > +jnz .loop Nit: jg .loop [...] > +cglobal hevc_add_residual_4_10, 3, 4, 6 r3 isn't used. [...] > +cglobal hevc_add_residual_8_10, 3, 5, 6 r4 isn't used. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations
On 19/10/2016 10:18, Diego Biurrun wrote: > Currently on its last run through oracle, should be good to go. Ok if it survives. lu ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations
From: Pierre Edouard LepereInitially written by Pierre Edouard Lepere , extended by James Almer . Signed-off-by: Alexandra Hájková Signed-off-by: Diego Biurrun --- consistent macro names, indentation, cosmetics Currently on its last run through oracle, should be good to go. libavcodec/x86/Makefile | 7 +- libavcodec/x86/hevc_add_res.asm | 385 libavcodec/x86/hevcdsp_init.c | 42 + 3 files changed, 431 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/hevc_add_res.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a38535b..094c1fa 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)+= x86/sbrdsp.o YASM-OBJS-$(CONFIG_APE_DECODER)+= x86/apedsp.o YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o -YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o\ - x86/hevc_mc.o \ - x86/hevc_idct.o +YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ + x86/hevc_deblock.o\ + x86/hevc_idct.o \ + x86/hevc_mc.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm new file mode 100644 index 000..b6d0c7a --- /dev/null +++ b/libavcodec/x86/hevc_add_res.asm @@ -0,0 +1,385 @@ +; * +; * Provide SIMD optimizations for add_residual functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of Libav. +; * +; * Libav is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. 
+; * +; * Libav is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with Libav; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; ** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) + +SECTION .text + +; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project +%macro ADD_RES_MMX_4_8 0 +mova m2, [r1] +mova m4, [r1+8] +pxor m3, m3 +psubw m3, m2 +packuswb m2, m2 +packuswb m3, m3 +pxor m5, m5 +psubw m5, m4 +packuswb m4, m4 +packuswb m5, m5 + +movh m0, [r0] +movh m1, [r0+r2] +paddusb m0, m2 +paddusb m1, m4 +psubusb m0, m3 +psubusb m1, m5 +movh[r0], m0 +movh [r0+r2], m1 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 4, 6 +ADD_RES_MMX_4_8 +add r1, 16 +lea r0, [r0+r2*2] +ADD_RES_MMX_4_8 +RET + +%macro ADD_RES_SSE_8_8 0 +pxor m3, m3 +mova m4, [r1] +mova m6, [r1+16] +mova m0, [r1+32] +mova m2, [r1+48] +psubw m5, m3, m4 +psubw m7, m3, m6 +psubw m1, m3, m0 +packuswb m4, m0 +packuswb m5, m1 +psubw m3, m2 +packuswb m6, m2 +packuswb m7, m3 + +movq m0, [r0] +movq m1, [r0+r2] +movhpsm0, [r0+r2*2] +movhpsm1, [r0+r3] +paddusb m0, m4 +paddusb m1, m6 +psubusb m0, m5 +psubusb m1, m7 +movq[r0], m0 +movq [r0+r2], m1 +movhps [r0+2*r2], m0 +movhps [r0+r3], m1 +%endmacro + +%macro ADD_RES_SSE_16_32_8 3 +mova
[libav-devel] [PATCH 2/2] hevc: x86: Add add_residual optimizations
From: Pierre Edouard LepereInitially written by Pierre Edouard Lepere , extended by James Almer . Signed-off-by: Alexandra Hájková Signed-off-by: Luca Barbato --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/hevc_add_res.asm | 391 libavcodec/x86/hevcdsp_init.c | 40 3 files changed, 433 insertions(+), 1 deletion(-) create mode 100644 libavcodec/x86/hevc_add_res.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a38535b..7574085 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o\ x86/hevc_mc.o \ - x86/hevc_idct.o + x86/hevc_idct.o \ + x86/hevc_add_res.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm new file mode 100644 index 000..0e6706b --- /dev/null +++ b/libavcodec/x86/hevc_add_res.asm @@ -0,0 +1,391 @@ +; * +; * Provide SIMD optimizations for add_residual functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of Libav. +; * +; * Libav is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * Libav is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with Libav; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; ** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) + +SECTION .text + +; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file +%macro ADD_RES_MMX_4_8 0 +mova m2, [r1] +mova m4, [r1+8] +pxor m3, m3 +psubw m3, m2 +packuswb m2, m2 +packuswb m3, m3 +pxor m5, m5 +psubw m5, m4 +packuswb m4, m4 +packuswb m5, m5 + +movh m0, [r0 ] +movh m1, [r0+r2 ] +paddusb m0, m2 +paddusb m1, m4 +psubusb m0, m3 +psubusb m1, m5 +movh [r0 ], m0 +movh [r0+r2 ], m1 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 4, 6 +ADD_RES_MMX_4_8 +add r1, 16 +lea r0, [r0+r2*2] +ADD_RES_MMX_4_8 +RET + +%macro ADD_RES_SSE_8_8 0 +pxor m3, m3 +mova m4, [r1] +mova m6, [r1+16] +mova m0, [r1+32] +mova m2, [r1+48] +psubw m5, m3, m4 +psubw m7, m3, m6 +psubw m1, m3, m0 +packuswb m4, m0 +packuswb m5, m1 +psubw m3, m2 +packuswb m6, m2 +packuswb m7, m3 + +movqm0, [r0 ] +movqm1, [r0+r2 ] +movhps m0, [r0+r2*2] +movhps m1, [r0+r3 ] +paddusb m0, m4 +paddusb m1, m6 +psubusb m0, m5 +psubusb m1, m7 +movq [r0 ], m0 +movq [r0+r2 ], m1 +movhps [r0+2*r2], m0 +movhps [r0+r3 ], m1 +%endmacro + +%macro ADD_RES_SSE_16_32_8 3 +mova xm2, [r1+%1 ] +mova xm6, [r1+%1+16] +%if cpuflag(avx2) +vinserti128 m2, m2, [r1+%1+32], 1 +vinserti128 m6, m6, [r1+%1+48], 1 +%endif +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 +%else +mova m1, m0 +mova