On 9/17/2017 3:22 PM, Martin Vignali wrote: > From 338f96a7f3c0f97cfafc0deda2322695a4006b5a Mon Sep 17 00:00:00 2001 > From: Martin Vignali <martin.vign...@gmail.com> > Date: Sun, 17 Sep 2017 20:05:16 +0200 > Subject: [PATCH] libavcodec/exr : add X86 64 SIMD for reorder_pixels > > --- > libavcodec/Makefile | 2 +- > libavcodec/exr.c | 44 ++++++++++++++-------------- > libavcodec/exrdsp.c | 45 +++++++++++++++++++++++++++++ > libavcodec/exrdsp.h | 32 ++++++++++++++++++++ > libavcodec/x86/Makefile | 2 ++ > libavcodec/x86/exrdsp.asm | 69 > ++++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/exrdsp_init.c | 43 +++++++++++++++++++++++++++ > 7 files changed, 213 insertions(+), 24 deletions(-) > create mode 100644 libavcodec/exrdsp.c > create mode 100644 libavcodec/exrdsp.h > create mode 100644 libavcodec/x86/exrdsp.asm > create mode 100644 libavcodec/x86/exrdsp_init.c > > diff --git a/libavcodec/Makefile b/libavcodec/Makefile > index 943e5db511..fad56129a3 100644 > --- a/libavcodec/Makefile > +++ b/libavcodec/Makefile > @@ -286,7 +286,7 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER) += 8svx.o > OBJS-$(CONFIG_ESCAPE124_DECODER) += escape124.o > OBJS-$(CONFIG_ESCAPE130_DECODER) += escape130.o > OBJS-$(CONFIG_EVRC_DECODER) += evrcdec.o acelp_vectors.o lsp.o > -OBJS-$(CONFIG_EXR_DECODER) += exr.o > +OBJS-$(CONFIG_EXR_DECODER) += exr.o exrdsp.o > OBJS-$(CONFIG_FFV1_DECODER) += ffv1dec.o ffv1.o > OBJS-$(CONFIG_FFV1_ENCODER) += ffv1enc.o ffv1.o > OBJS-$(CONFIG_FFWAVESYNTH_DECODER) += ffwavesynth.o > diff --git a/libavcodec/exr.c b/libavcodec/exr.c > index 759880756d..478c127ebe 100644 > --- a/libavcodec/exr.c > +++ b/libavcodec/exr.c > @@ -40,6 +40,7 @@ > #include "libavutil/avassert.h" > #include "libavutil/common.h" > #include "libavutil/imgutils.h" > +#include "libavutil/timer.h"
Not needed. > #include "libavutil/intfloat.h" > #include "libavutil/opt.h" > #include "libavutil/color_utils.h" > @@ -55,6 +56,7 @@ > #include "internal.h" > #include "mathops.h" > #include "thread.h" > +#include "exrdsp.h" Add this one above get_bits.h, to keep the includes in alphabetical order. > > enum ExrCompr { > EXR_RAW, > @@ -121,6 +123,7 @@ typedef struct EXRContext { > AVClass *class; > AVFrame *picture; > AVCodecContext *avctx; > + ExrDSPContext dsp; > > #if HAVE_BIGENDIAN > BswapDSPContext bbdsp; > @@ -275,23 +278,7 @@ static void predictor(uint8_t *src, int size) > } > } > > -static void reorder_pixels(uint8_t *src, uint8_t *dst, int size) > -{ > - const uint8_t *t1 = src; > - int half_size = size / 2; > - const uint8_t *t2 = src + half_size; > - uint8_t *s = dst; > - int i; > - > - av_assert1(size % 2 == 0); > - > - for (i = 0; i < half_size; i++) { > - *(s++) = *(t1++); > - *(s++) = *(t2++); > - } > -} > - > -static int zip_uncompress(const uint8_t *src, int compressed_size, > +static int zip_uncompress(EXRContext *s, const uint8_t *src, int > compressed_size, > int uncompressed_size, EXRThreadData *td) > { > unsigned long dest_len = uncompressed_size; > @@ -300,13 +287,18 @@ static int zip_uncompress(const uint8_t *src, int > compressed_size, > dest_len != uncompressed_size) > return AVERROR_INVALIDDATA; > > + av_assert1(uncompressed_size % 2 == 0); > + > predictor(td->tmp, uncompressed_size); > - reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size); > + > + //START_TIMER; Don't add dead benchmarking/debug code. 
> + s->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size); > + //STOP_TIMER("reorder_pixels_zip"); > > return 0; > } > > -static int rle_uncompress(const uint8_t *src, int compressed_size, > +static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int > compressed_size, > int uncompressed_size, EXRThreadData *td) > { > uint8_t *d = td->tmp; > @@ -345,8 +337,10 @@ static int rle_uncompress(const uint8_t *src, int > compressed_size, > if (dend != d) > return AVERROR_INVALIDDATA; > > + av_assert1(uncompressed_size % 2 == 0); > + > predictor(td->tmp, uncompressed_size); > - reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size); > + ctx->dsp.reorder_pixels(td->tmp, td->uncompressed_data, > uncompressed_size); > > return 0; > } > @@ -954,6 +948,7 @@ static void unpack_14(const uint8_t b[14], uint16_t s[16]) > } > } > > + Stray new line. > static void unpack_3(const uint8_t b[3], uint16_t s[16]) > { > int i; > @@ -1000,6 +995,7 @@ static int b44_uncompress(EXRContext *s, const uint8_t > *src, int compressed_size > > if (src[compressed_size - stay_to_uncompress + 2] == > 0xfc) { /* B44A block */ > unpack_3(sr, tmp_buffer); > + Same. 
> sr += 3; > stay_to_uncompress -= 3; > } else {/* B44 Block */ > @@ -1152,7 +1148,7 @@ static int decode_block(AVCodecContext *avctx, void > *tdata, > > if (data_size < uncompressed_size) { > av_fast_padded_malloc(&td->uncompressed_data, > - &td->uncompressed_size, uncompressed_size); > + &td->uncompressed_size, uncompressed_size + > 64);/* Force 64 padding for AVX2 reorder_pixels dst */ > > if (!td->uncompressed_data) > return AVERROR(ENOMEM); > @@ -1161,7 +1157,7 @@ static int decode_block(AVCodecContext *avctx, void > *tdata, > switch (s->compression) { > case EXR_ZIP1: > case EXR_ZIP16: > - ret = zip_uncompress(src, data_size, uncompressed_size, td); > + ret = zip_uncompress(s, src, data_size, uncompressed_size, td); > break; > case EXR_PIZ: > ret = piz_uncompress(s, src, data_size, uncompressed_size, td); > @@ -1170,7 +1166,7 @@ static int decode_block(AVCodecContext *avctx, void > *tdata, > ret = pxr24_uncompress(s, src, data_size, uncompressed_size, td); > break; > case EXR_RLE: > - ret = rle_uncompress(src, data_size, uncompressed_size, td); > + ret = rle_uncompress(s, src, data_size, uncompressed_size, td); > break; > case EXR_B44: > case EXR_B44A: > @@ -1804,6 +1800,8 @@ static av_cold int decode_init(AVCodecContext *avctx) > > s->avctx = avctx; > > + ff_exrdsp_init(&s->dsp); > + > #if HAVE_BIGENDIAN > ff_bswapdsp_init(&s->bbdsp); > #endif > diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c > new file mode 100644 > index 0000000000..af47a6f8df > --- /dev/null > +++ b/libavcodec/exrdsp.c > @@ -0,0 +1,45 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include <stdint.h> > + > +#include "libavutil/attributes.h" > +#include "exrdsp.h" > +#include "config.h" > + > +static void reorder_pixels_scalar(uint8_t *src, uint8_t *dst, ptrdiff_t size) > +{ > + const uint8_t *t1 = src; > + int half_size = size / 2; > + const uint8_t *t2 = src + half_size; > + uint8_t *s = dst; > + int i; > + > + for (i = 0; i < half_size; i++) { > + *(s++) = *(t1++); > + *(s++) = *(t2++); > + } > +} > + > +av_cold void ff_exrdsp_init(ExrDSPContext *c) > +{ > + c->reorder_pixels = reorder_pixels_scalar; > + > + if (ARCH_X86) > + ff_exrdsp_init_x86(c); > +} > diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h > new file mode 100644 > index 0000000000..09a76a518e > --- /dev/null > +++ b/libavcodec/exrdsp.h > @@ -0,0 +1,32 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVCODEC_EXRDSP_H > +#define AVCODEC_EXRDSP_H > + > +#include <stdint.h> > +#include "libavutil/common.h" > + > +typedef struct ExrDSPContext { > + void (*reorder_pixels)(uint8_t *src, uint8_t *dst, ptrdiff_t size); > +} ExrDSPContext; > + > +void ff_exrdsp_init(ExrDSPContext *c); > +void ff_exrdsp_init_x86(ExrDSPContext *c); > + > +#endif /* AVCODEC_EXRDSP_H */ > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index e36644c72a..a805cd37b4 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o > OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o > OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o > x86/synth_filter_init.o > OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o > +OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o > OBJS-$(CONFIG_OPUS_DECODER) += x86/opus_dsp_init.o > OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o > OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o > @@ -153,6 +154,7 @@ X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o > x86/synth_filter.o > X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ > x86/dirac_dwt.o > X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o > +X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o > X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o > ifdef CONFIG_GPL > X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o > diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm > new file mode 100644 > index 0000000000..f609c055b0 > --- /dev/null > +++ b/libavcodec/x86/exrdsp.asm > @@ -0,0 +1,69 @@ > +;****************************************************************************** > +;* X86 Optimized functions for Open Exr Decoder > +;* Copyright (c) 2006 
Industrial Light & Magic, a division of Lucas Digital > Ltd. LLC > +;* > +;* reorder_pixels based on patch by John Loy > +;* port to ASM by Jokyo Images support by CNC - French National Center for > Cinema > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text > + > +;------------------------------------------------------------------------------ > +; void ff_reorder_pixels(uint8_t *src, uint8_t *dst, ptrdiff_t size) > +;------------------------------------------------------------------------------ > + > +%macro REORDER_PIXELS 0 > +cglobal reorder_pixels, 3,4,3, src1, dst, size, src2 > + lea src2q, [src1q+sizeq] ; src2 = src + 2 * > half_size > + add dstq, sizeq ; dst offset by > size > + shr sizeq, 1 ; half_size > + add src1q, sizeq ; offset src by > half_size > + neg sizeq ; size = offset > for dst, src1, src2 > +.loop: > + > +%if cpuflag(avx2) > + vpermq m0, [src1q + sizeq], 0xd8; load > first part > + vpermq m1, [src2q + sizeq], 0xd8; load > second part > + > + vpunpcklbw m2, m0, m1 ; > interleaved part 1 > + vmovdqa [dstq + sizeq*2], m2 ; copy to > dst > + > + vpunpckhbw m0, 
m0, m1 ; > interleaved part 2 > + vmovdqa [dstq + sizeq*2 + mmsize], m0 ; copy to > dst > +%else > + mova m0, [src1q+sizeq] ; load > first part > + movu m1, [src2q+sizeq] ; load > second part > + SBUTTERFLY bw, 0, 1, 2 ; > interleaved > + mova [dstq+2*sizeq ], m0 ; copy to > dst > + mova [dstq+2*sizeq+mmsize], m1 > +%endif > + add sizeq, mmsize > + jl .loop > + RET You can reuse the SBUTTERFLY + 2 store mova in the avx2 version as well. The resulting assembly is essentially the same, and it will look much cleaner here. %if cpuflag(avx2) vpermq m0, [src1q+sizeq], 0xd8 ; load first part vpermq m1, [src2q+sizeq], 0xd8 ; load second part %else mova m0, [src1q+sizeq] ; load first part movu m1, [src2q+sizeq] ; load second part %endif SBUTTERFLY bw, 0, 1, 2 ; interleaved mova [dstq+2*sizeq ], m0 ; copy to dst mova [dstq+2*sizeq+mmsize], m1 > +%endmacro > + > +INIT_XMM sse2 > +REORDER_PIXELS > + > +%if HAVE_AVX2_EXTERNAL > +INIT_YMM avx2 > +REORDER_PIXELS > +%endif > diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c > new file mode 100644 > index 0000000000..49fd00e640 > --- /dev/null > +++ b/libavcodec/x86/exrdsp_init.c > @@ -0,0 +1,43 @@ > +/* > + * OpenEXR (.exr) image decoder > + * > + * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital > Ltd. LLC > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/exrdsp.h" > + > +void ff_reorder_pixels_sse2(uint8_t *src, uint8_t *dst, ptrdiff_t size); > + > +void ff_reorder_pixels_avx2(uint8_t *src, uint8_t *dst, ptrdiff_t size); > + > +av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp) > +{ > +#if ARCH_X86_64 The functions are being assembled on x86_32, and they should work just fine with such targets. So why limit the initialization to x86_64 only here? > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE2(cpu_flags)) { > + dsp->reorder_pixels = ff_reorder_pixels_sse2; > + } > + if (EXTERNAL_AVX2(cpu_flags)) { EXTERNAL_AVX2_FAST(cpu_flags) The AVX2 function uses YMM registers, meaning it will be slow on certain AMD CPUs. The _FAST version of the macro makes sure it will not be used with those. > + dsp->reorder_pixels = ff_reorder_pixels_avx2; > + } > +#endif /* ARCH_X86_64 */ > +} > -- > 2.11.0 (Apple Git-81) > fate-exr passes on mingw-w64 as well. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel