On 31/07/14 11:58 AM, Pierre Edouard Lepere wrote:
> Hi,
> Here's a new version of the patch with the feedback provided.
>
> Best Regards,
>
> Pierre-Edouard Lepere
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 7469293..658ad5e 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
> YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
> YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \
> x86/hevc_deblock.o \
> - x86/hevc_idct.o
> + x86/hevc_idct.o \
> + x86/hevc_res_add.o
> YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
> YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
> YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
> new file mode 100644
> index 0000000..2bfd35d
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,396 @@
> +; /*
> +; * Provide SSE optimizations for transform_add functions for HEVC decoding

nit: SIMD instead of SSE.

> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of FFmpeg.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or
> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> +; */
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
> +max_pixels_10: times 16 dw ((1 << 10)-1)
> +tr_add_10: times 4 dd ((1 << 14-10) + 1)

This constant seems unused.
> +
> +
> +SECTION .text
> +
> +;the tr_add macros and functions were largely inspired by x264 project's
> code in the h264_idct.asm file
> +%macro TR_ADD_MMX_4_8 0
> + mova m2, [r1]
> + mova m4, [r1+8]
> + pxor m3, m3
> + psubw m3, m2
> + packuswb m2, m2
> + packuswb m3, m3
> + pxor m5, m5
> + psubw m5, m4
> + packuswb m4, m4
> + packuswb m5, m5
> +
> + movh m0, [r0 ]
> + movh m1, [r0+r2 ]
> + paddusb m0, m2
> + paddusb m1, m4
> + psubusb m0, m3
> + psubusb m1, m5
> + movh [r0 ], m0
> + movh [r0+r2 ], m1
> +%endmacro
> +
> +
> +INIT_MMX mmxext
> +; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride)
> +cglobal hevc_transform_add4_8, 3, 4, 6
> + TR_ADD_MMX_4_8
> + add r1, 16
> + lea r0, [r0+r2*2]
> + TR_ADD_MMX_4_8
> + RET
> +
> +%macro TR_ADD_SSE_8_8 0
> + pxor m3, m3
> + mova m4, [r1]
> + mova m6, [r1+16]
> + mova m0, [r1+32]
> + mova m2, [r1+48]
> + psubw m5, m3, m4
> + psubw m7, m3, m6
> + psubw m1, m3, m0
> + packuswb m4, m0
> + packuswb m5, m1
> + psubw m3, m2
> + packuswb m6, m2
> + packuswb m7, m3
> +
> + movq m0, [r0 ]
> + movq m1, [r0+r2 ]
> + movhps m0, [r0+r2*2]
> + movhps m1, [r0+r3 ]
> + paddusb m0, m4
> + paddusb m1, m6
> + psubusb m0, m5
> + psubusb m1, m7
> + movq [r0 ], m0
> + movq [r0+r2 ], m1
> + movhps [r0+2*r2], m0
> + movhps [r0+r3 ], m1
> +%endmacro
> +
> +%macro TR_ADD_INIT_SSE_8 0
> + pxor m0, m0
> +
> + mova m4, [r1]
> + mova m1, [r1+16]
> + psubw m2, m0, m1
> + psubw m5, m0, m4

Add avx versions of add16 and add32 like you originally did for the 10bit functions (also with different instruction order depending on avx_enabled if possible). This macro is used up to 16 times in a single function, so all the saved movas will make a difference.
> + packuswb m4, m1
> + packuswb m5, m2
> +
> + mova m6, [r1+32]
> + mova m1, [r1+48]
> + psubw m2, m0, m1
> + psubw m7, m0, m6
> + packuswb m6, m1
> + packuswb m7, m2
> +
> + mova m8, [r1+64]
> + mova m1, [r1+80]
> + psubw m2, m0, m1
> + psubw m9, m0, m8
> + packuswb m8, m1
> + packuswb m9, m2
> +
> + mova m10, [r1+96]
> + mova m1, [r1+112]
> + psubw m2, m0, m1
> + psubw m11, m0, m10
> + packuswb m10, m1
> + packuswb m11, m2
> +%endmacro
> +
> +
> +%macro TR_ADD_SSE_16_8 0
> + TR_ADD_INIT_SSE_8
> +
> + mova m0, [r0 ]
> + mova m1, [r0+r2 ]
> + mova m2, [r0+r2*2]
> + mova m3, [r0+r3 ]

No need for these. The paddusb below can fetch the data from memory.
With some refactoring you can get this and the add32 function to use 10 or maybe even 8 xmm registers. Getting it down to 10 will barely make a difference, but 8 will allow the functions to work on x86_32.
It's not really important, and I can give it a go if you'd rather work on other dsp functions. This can be committed with the above changes alone.

> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + mova [r0 ], m0
> + mova [r0+r2 ], m1
> + mova [r0+2*r2], m2
> + mova [r0+r3 ], m3
> +%endmacro
> +
> +%macro TR_ADD_SSE_32_8 0
> + TR_ADD_INIT_SSE_8
> +
> + mova m0, [r0 ]
> + mova m1, [r0+16 ]
> + mova m2, [r0+r2 ]
> + mova m3, [r0+r2+16]

Same as above.

> + paddusb m0, m4
> + paddusb m1, m6
> + paddusb m2, m8
> + paddusb m3, m10
> + psubusb m0, m5
> + psubusb m1, m7
> + psubusb m2, m9
> + psubusb m3, m11
> + mova [r0 ], m0
> + mova [r0+16 ], m1
> + mova [r0+r2 ], m2
> + mova [r0+r2+16], m3
> +%endmacro

[...]
> diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
> index 4bcc8dc..5c3a51c 100644
> --- a/libavcodec/x86/hevcdsp.h
> +++ b/libavcodec/x86/hevcdsp.h
> @@ -131,4 +131,25 @@ WEIGHTING_PROTOTYPES(8, sse4);
> WEIGHTING_PROTOTYPES(10, sse4);
> WEIGHTING_PROTOTYPES(12, sse4);
>
> +///////////////////////////////////////////////////////////////////////////////
> +// TRANSFORM_ADD
> +///////////////////////////////////////////////////////////////////////////////
> +void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +
> +void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs,
> ptrdiff_t stride);
> +
> +
> +void ff_hevc_transform_add8_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +void ff_hevc_transform_add16_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);
> +void ff_hevc_transform_add32_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t
> stride);

These three are not needed anymore.

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel