On 26 July 2018 at 12:28, James Darnley <jdarn...@obe.tv> wrote: > Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the > relevant transform. > C: 119fps > SSE2: 204fps > AVX: 206fps > AVX2: 221fps > > timer measurements, haar horizontal compose: > sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C > avx: 3.68x faster (45143 vs. 12275 decicycles) compared with C > avx2: 5.16x faster (45143 vs. 8742 decicycles) compared with C > haar vertical compose: > sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C > avx: 1.58x faster (31792 vs. 20090 decicycles) compared with C > avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C > --- > libavcodec/dirac_dwt.c | 7 +- > libavcodec/dirac_dwt.h | 1 + > libavcodec/x86/Makefile | 6 +- > libavcodec/x86/dirac_dwt_10bit.asm | 160 ++++++++++++++++++++++++++ > libavcodec/x86/dirac_dwt_init_10bit.c | 76 ++++++++++++ > 5 files changed, 247 insertions(+), 3 deletions(-) > create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm > create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c > > diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c > index cc08f8865a..86bee5bb9b 100644 > --- a/libavcodec/dirac_dwt.c > +++ b/libavcodec/dirac_dwt.c > @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, > enum dwt_type type, > return AVERROR_INVALIDDATA; > } > > - if (ARCH_X86 && bit_depth == 8) > +#if ARCH_X86 > + if (bit_depth == 8) > ff_spatial_idwt_init_x86(d, type); > + else if (bit_depth == 10) > + ff_spatial_idwt_init_10bit_x86(d, type); > +#endif > + > return 0; > } > > diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h > index 994dc21d70..1ad7b9a821 100644 > --- a/libavcodec/dirac_dwt.h > +++ b/libavcodec/dirac_dwt.h > @@ -88,6 +88,7 @@ enum dwt_type { > int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, > int decomposition_count, int bit_depth); > void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); > +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); > > void ff_spatial_idwt_slice2(DWTContext *d, int y); > > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index 2350c8bbee..590d83c167 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += > x86/blockdsp_init.o > OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o > OBJS-$(CONFIG_DCT) += x86/dct_init.o > OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ > - x86/dirac_dwt_init.o > + x86/dirac_dwt_init.o \ > + x86/dirac_dwt_init_10bit.o > OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o > OBJS-$(CONFIG_FFT) += x86/fft_init.o > OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o > @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o > X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o > X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o > X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ > - x86/dirac_dwt.o > + x86/dirac_dwt.o \ > + x86/dirac_dwt_10bit.o > X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o > X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o > X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o > diff --git a/libavcodec/x86/dirac_dwt_10bit.asm > b/libavcodec/x86/dirac_dwt_10bit.asm > new file mode 100644 > index 0000000000..baea91329e > --- /dev/null > +++ b/libavcodec/x86/dirac_dwt_10bit.asm > @@ -0,0 +1,160 @@ > +;********************************************************** > ******************** > +;* x86 optimized discrete 10-bit wavelet trasnform > +;* Copyright (c) 2018 James Darnley > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > +;********************************************************** > ******************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +cextern pd_1 > + > +SECTION .text > + > +%macro HAAR_VERTICAL 0 > + >
See below. +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w > + DECLARE_REG_TMP 4,5 > + > + mova m2, [pd_1] > + mov r3d, wd > + and wd, ~(mmsize/4 - 1) > + shl wd, 2 > + add b0q, wq > + add b1q, wq > + neg wq > + > + ALIGN 16 > + .loop_simd: > + mova m0, [b0q + wq] > + mova m1, [b1q + wq] > + paddd m3, m1, m2 > + psrad m3, 1 > + psubd m0, m3 > + paddd m1, m0 > + mova [b0q + wq], m0 > + mova [b1q + wq], m1 > + add wq, mmsize > + jl .loop_simd > + > + and r3d, mmsize/4 - 1 > + jz .end > + .loop_scalar: > + mov t0d, [b0q] > + mov t1d, [b1q] > + mov r2d, t1d > + add r2d, 1 > + sar r2d, 1 > + sub t0d, r2d > + add t1d, t0d > + mov [b0q], t0d > + mov [b1q], t1d > + > + add b0q, 4 > + add b1q, 4 > + sub r3d, 1 > + jg .loop_scalar > + > + .end: > +RET > + > +%endmacro > + > +%macro HAAR_HORIZONTAL 0 > + > Could you remove this newline from every patch? All asm I've written and seen keep them without a newline. It made me think there's something in the asm which checked the value of the macro, not that the entire function is macro'd. +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, > x, b2 > + DECLARE_REG_TMP 2,5 > + %if ARCH_X86_64 > + %define tail r6d > + %else > + %define tail dword wm > + %endif > + > + mova m2, [pd_1] > + xor xd, xd > + shr wd, 1 > + mov tail, wd > + lea b2q, [bq + 4*wq] > + > + ALIGN 16 > + .loop_lo: > + mova m0, [bq + 4*xq] > + movu m1, [b2q + 4*xq] > + paddd m1, m2 > + psrad m1, 1 > + psubd m0, m1 > + mova [temp_q + 4*xq], m0 > + add xd, mmsize/4 > + cmp xd, wd > + jl .loop_lo > + > + xor xd, xd > + and wd, ~(mmsize/4 - 1) > + > + ALIGN 16 > + .loop_hi: > + mova m0, [temp_q + 4*xq] > + movu m1, [b2q + 4*xq] > + paddd m1, m0 > + paddd m0, m2 > + paddd m1, m2 > + psrad m0, 1 > + psrad m1, 1 > + SBUTTERFLY dq, 0,1,3 > + %if cpuflag(avx2) > + SBUTTERFLY dqqq, 0,1,3 > + %endif > + mova [bq + 8*xq], m0 > + mova [bq + 8*xq + mmsize], m1 > + add xd, mmsize/4 > + cmp xd, wd > + jl .loop_hi > + > + and tail, mmsize/4 - 1 > + jz .end > + .loop_scalar: > + mov t0d, [temp_q + 4*xq] > + mov t1d, [b2q + 4*xq] > + add t1d, t0d > + add t0d, 1 > + add t1d, 1 > + sar t0d, 1 > + sar t1d, 1 > + mov [bq + 8*xq], t0d > + mov [bq + 8*xq + 4], t1d > + add xq, 1 > + sub tail, 1 > + jg .loop_scalar > + > + .end: > +REP_RET > + > +%endmacro > + > +INIT_XMM sse2 > +HAAR_HORIZONTAL > +HAAR_VERTICAL > + > +INIT_XMM avx > +HAAR_HORIZONTAL > +HAAR_VERTICAL > You're not using any avx functions in that version, not unless a macro'd instruction inserts one for you. I think you should remove the avx version then. Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per version you can just make a single macro to do both versions at the same time. + > +INIT_YMM avx2 > +HAAR_HORIZONTAL > +HAAR_VERTICAL > diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c > b/libavcodec/x86/dirac_dwt_init_10bit.c > new file mode 100644 > index 0000000000..289862d728 > --- /dev/null > +++ b/libavcodec/x86/dirac_dwt_init_10bit.c > @@ -0,0 +1,76 @@ > +/* > + * x86 optimized discrete wavelet transform > + * Copyright (c) 2018 James Darnley > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/dirac_dwt.h" > + > +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int > width_align); > +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int > width_align); > +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int > width_align); > + > +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int > width_align); > +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int > width_align); > +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int > width_align); > + > +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type > type) > +{ > +#if HAVE_X86ASM > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE2(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_sse2; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_sse2; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_sse2; > + break; > + } > + } > + > + if (EXTERNAL_AVX(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_avx; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx; > + break; > + } > We keep switches and cases on the same line. + } > + > + if (EXTERNAL_AVX2(cpu_flags)) { > + switch (type) { > + case DWT_DIRAC_HAAR0: > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx2; > + break; > + case DWT_DIRAC_HAAR1: > + d->horizontal_compose = (void*)ff_horizontal_compose_ > haar_10bit_avx2; > + d->vertical_compose = (void*)ff_vertical_compose_ > haar_10bit_avx2; > + break; > + } Same. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel