On 5/30/19, Ruiling Song <ruiling.s...@intel.com> wrote: > For details of the implementation, please refer to the comment > inlined in the assembly code. It improves the horizontal pass > performance about 100% under single thread. > > Tested overall performance using the command(avx2 enabled): > ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null > ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null > For single thread, the fps improves from 43 to 60, about 40%. > For multi-thread, the fps improves from 110 to 130, about 20%. > > Signed-off-by: Ruiling Song <ruiling.s...@intel.com> > --- > libavfilter/gblur.h | 54 ++++++++++ > libavfilter/vf_gblur.c | 66 +++++------- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_gblur.asm | 182 ++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_gblur_init.c | 36 +++++++ > 5 files changed, 302 insertions(+), 38 deletions(-) > create mode 100644 libavfilter/gblur.h > create mode 100644 libavfilter/x86/vf_gblur.asm > create mode 100644 libavfilter/x86/vf_gblur_init.c > > diff --git a/libavfilter/gblur.h b/libavfilter/gblur.h > new file mode 100644 > index 0000000000..97217044d0 > --- /dev/null > +++ b/libavfilter/gblur.h > @@ -0,0 +1,54 @@ > +/* > + * Copyright (c) 2011 Pascal Getreuer > + * Copyright (c) 2016 Paul B Mahol > + * > + * Redistribution and use in source and binary forms, with or without > modification, > + * are permitted provided that the following conditions are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above > + * copyright notice, this list of conditions and the following > + * disclaimer in the documentation and/or other materials provided > + * with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > + * HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, > + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, > + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#ifndef AVFILTER_GBLUR_H > +#define AVFILTER_GBLUR_H > +#include "avfilter.h" > + > +typedef struct GBlurContext { > + const AVClass *class; > + > + float sigma; > + float sigmaV; > + int steps; > + int planes; > + > + int depth; > + int planewidth[4]; > + int planeheight[4]; > + float *buffer; > + float boundaryscale; > + float boundaryscaleV; > + float postscale; > + float postscaleV; > + float nu; > + float nuV; > + int nb_planes; > + void (*horiz_slice)(float *buffer, int width, int height, int steps, > float nu, float bscale); > +} GBlurContext; > +void ff_gblur_init_x86(GBlurContext *s); > +#endif > diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c > index b91a8c074a..4e876bca05 100644 > --- a/libavfilter/vf_gblur.c > +++ b/libavfilter/vf_gblur.c > @@ -30,29 +30,11 @@ > #include "libavutil/pixdesc.h" > #include "avfilter.h" > #include "formats.h" > +#include "gblur.h" > #include "internal.h" > #include "video.h" > +#include <immintrin.h>
Is this header (<immintrin.h>) really needed? The C code added by this patch does not appear to use any intrinsics, and unconditionally including an x86-only header in generic filter code would presumably break non-x86 builds. > > -typedef struct GBlurContext { > - const AVClass *class; > - > - float sigma; > - float sigmaV; > - int steps; > - int planes; > - > - int depth; > - int planewidth[4]; > - int planeheight[4]; > - float *buffer; > - float boundaryscale; > - float boundaryscaleV; > - float postscale; > - float postscaleV; > - float nu; > - float nuV; > - int nb_planes; > -} GBlurContext; > > #define OFFSET(x) offsetof(GBlurContext, x) > #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM > @@ -72,39 +54,44 @@ typedef struct ThreadData { > int width; > } ThreadData; > > -static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, > int nb_jobs) > +static void horiz_slice_c(float *buffer, int width, int height, int steps, > + float nu, float bscale) > { > - GBlurContext *s = ctx->priv; > - ThreadData *td = arg; > - const int height = td->height; > - const int width = td->width; > - const int slice_start = (height * jobnr ) / nb_jobs; > - const int slice_end = (height * (jobnr+1)) / nb_jobs; > - const float boundaryscale = s->boundaryscale; > - const int steps = s->steps; > - const float nu = s->nu; > - float *buffer = s->buffer; > - int y, x, step; > + int step, x, y; > float *ptr; > - > - /* Filter horizontally along each row */ > - for (y = slice_start; y < slice_end; y++) { > + for (y = 0; y < height; y++) { > for (step = 0; step < steps; step++) { > ptr = buffer + width * y; > - ptr[0] *= boundaryscale; > + ptr[0] *= bscale; > > /* Filter rightwards */ > for (x = 1; x < width; x++) > ptr[x] += nu * ptr[x - 1]; > - > - ptr[x = width - 1] *= boundaryscale; > + ptr[x = width - 1] *= bscale; > > /* Filter leftwards */ > for (; x > 0; x--) > ptr[x - 1] += nu * ptr[x]; > } > } > +} > + > +static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, > int nb_jobs) > +{ > + GBlurContext *s = ctx->priv; > + ThreadData *td = arg; > + const int height = td->height; > + const int width = td->width; > + const int 
slice_start = (height * jobnr ) / nb_jobs; > + const int slice_end = (height * (jobnr+1)) / nb_jobs; > + const float boundaryscale = s->boundaryscale; > + const int steps = s->steps; > + const float nu = s->nu; > + float *buffer = s->buffer; > > + s->horiz_slice(buffer + width * slice_start, width, slice_end - > slice_start, > + steps, nu, boundaryscale); > + emms_c(); > return 0; > } > > @@ -251,6 +238,9 @@ static int config_input(AVFilterLink *inlink) > if (s->sigmaV < 0) { > s->sigmaV = s->sigma; > } > + s->horiz_slice = horiz_slice_c; > + if (ARCH_X86_64) > + ff_gblur_init_x86(s); > > return 0; > } > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 17499f14da..6b0361bed2 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -7,6 +7,7 @@ OBJS-$(CONFIG_BWDIF_FILTER) += > x86/vf_bwdif_init.o > OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o > OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o > OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o > +OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur_init.o > OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o > OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate_init.o > OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip_init.o > @@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += > x86/vf_bwdif.o > X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o > X86ASM-OBJS-$(CONFIG_FRAMERATE_FILTER) += x86/vf_framerate.o > X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o > +X86ASM-OBJS-$(CONFIG_GBLUR_FILTER) += x86/vf_gblur.o > X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o > X86ASM-OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip.o > X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o > diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm > new file mode 100644 > index 0000000000..79e56a32a7 > --- /dev/null > +++ b/libavfilter/x86/vf_gblur.asm > @@ -0,0 +1,182 @@ > +;***************************************************************************** 
> +;* x86-optimized functions for gblur filter > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text > + > +; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, > +; float nu, float bscale) > + > +%macro HORIZ_SLICE 0 > +%if UNIX64 > +cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, > stride, remain > +%else > +cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, > step, stride, remain > +%endif > +%if WIN64 > + movss m0, num > + movss m1, bscalem > + DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain > +%endif > + mulss m2, m0, m0 ; nu ^ 2 > + mulss m3, m2, m0 ; nu ^ 3 > + mulss m4, m3, m0 ; nu ^ 4 > + xor xq, xq > + xor yq, yq > + xor stepq, stepq > + mov strideq, widthq > + ; stride = width * 4 > + shl strideq, 2 > + ; w = w - ((w - 1) & 3) > + mov remainq, widthq > + sub remainq, 1 > + and remainq, 3 > + sub widthq, remainq > + > + shufps m0, m0, 0 > + shufps m2, m2, 0 > + shufps m3, m3, 0 > + shufps m4, m4, 0 > + > +.loop_y: > + .loop_step: > + ; p0 *= bscale > + mulss m5, m1, [ptrq + xq * 4] > + 
movss [ptrq + xq * 4], m5 > + inc xq > + ; filter rightwards > + ; Here we are vectorizing the c version by 4 > + ; for (x = 1; x < width; x++) > + ; ptr[x] += nu * ptr[x - 1]; > + ; let p0 stands for ptr[x-1], the data from last loop > + ; and [p1,p2,p3,p4] be the vector data for this loop. > + ; Unrolling the loop, we get: > + ; p1' = p1 + p0*nu > + ; p2' = p2 + p1*nu + p0*nu^2 > + ; p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3 > + ; p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4 > + ; so we can do it in simd: > + ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu + > + ; [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 + > + ; [0,0,0,p0]*nu^4 > + > + .loop_x: > + movu m6, [ptrq + xq * 4] ; s = [p1,p2,p3,p4] > + pslldq m7, m6, 4 ; [0, p1,p2,p3] > + movss m7, m5 ; [p0,p1,p2,p3] > + FMULADD_PS m6, m7, m0, m6, m8 ; s += [p0,p1,p2,p3] * nu > + pslldq m7, 4 ; [0,p0,p1,p2] > + FMULADD_PS m6, m7, m2, m6, m8 ; s += [0,p0,p1,p2] * nu^2 > + pslldq m7, 4 > + FMULADD_PS m6, m7, m3, m6, m8 ; s += [0,0,p0,p1] * nu^3 > + pslldq m7, 4 > + FMULADD_PS m6, m7, m4, m6, m8 ; s += [0,0,0,p0] * nu^4 > + movu [ptrq + xq * 4], m6 > + shufps m5, m6, m6, q3333 > + add xq, 4 > + cmp xq, widthq > + jl .loop_x > + > + add widthq, remainq > + cmp xq, widthq > + je .end_scalar > + > + .loop_scalar: > + ; ptr[x] += nu * ptr[x-1] > + movss m5, [ptrq + 4*xq - 4] > + mulss m5, m0 > + addss m5, [ptrq + 4*xq] > + movss [ptrq + 4*xq], m5 > + inc xq > + cmp xq, widthq > + jl .loop_scalar > + .end_scalar: > + ; ptr[width - 1] *= bscale > + dec xq > + mulss m5, m1, [ptrq + 4*xq] > + movss [ptrq + 4*xq], m5 > + shufps m5, m5, 0 > + > + ; filter leftwards > + ; for (; x > 0; x--) > + ; ptr[x - 1] += nu * ptr[x]; > + ; The idea here is basically the same as filter rightwards. > + ; But we need to take care as the data layout is different. > + ; Let p0 stands for the ptr[x], which is the data from last loop. 
> + ; The way we do it in simd as below: > + ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1] > + ; + [p-3, p-2, p-1, p0] * nu > + ; + [p-2, p-1, p0, 0] * nu^2 > + ; + [p-1, p0, 0, 0] * nu^3 > + ; + [p0, 0, 0, 0] * nu^4 > + .loop_x_back: > + sub xq, 4 > + movu m6, [ptrq + xq * 4] ; s = [p-4, p-3, p-2, p-1] > + psrldq m7, m6, 4 ; [p-3, p-2, p-1, 0 ] > + blendps m7, m5, 0x8 ; [p-3, p-2, p-1, p0 ] > + FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu > + psrldq m7, 4 ; > + FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0, 0] * nu^2 > + psrldq m7, 4 > + FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0, 0, 0] * nu^3 > + psrldq m7, 4 > + FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0, 0, 0, 0] * nu^4 > + movu [ptrq + xq * 4], m6 > + shufps m5, m6, m6, 0 ; m5 = [p-4', p-4', p-4', p-4'] > + cmp xq, remainq > + jg .loop_x_back > + > + cmp xq, 0 > + je .end_scalar_back > + > + .loop_scalar_back: > + ; ptr[x-1] += nu * ptr[x] > + movss m5, [ptrq + 4*xq] > + mulss m5, m0 > + addss m5, [ptrq + 4*xq - 4] > + movss [ptrq + 4*xq - 4], m5 > + dec xq > + cmp xq, 0 > + jg .loop_scalar_back > + .end_scalar_back: > + > + ; reset aligned width for next line > + sub widthq, remainq > + > + inc stepq > + cmp stepq, stepsq > + jl .loop_step > + > + add ptrq, strideq > + inc yq > + cmp yq, heightq > + jl .loop_y > + > + RET > +%endmacro > + > +%if ARCH_X86_64 > +INIT_XMM sse4 > +HORIZ_SLICE > + > +INIT_XMM avx2 > +HORIZ_SLICE > +%endif > diff --git a/libavfilter/x86/vf_gblur_init.c > b/libavfilter/x86/vf_gblur_init.c > new file mode 100644 > index 0000000000..b068edc598 > --- /dev/null > +++ b/libavfilter/x86/vf_gblur_init.c > @@ -0,0 +1,36 @@ > +/* > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/gblur.h" > + > +void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, > float nu, float bscale); > +void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, > float nu, float bscale); > + > +av_cold void ff_gblur_init_x86(GBlurContext *s) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_SSE4(cpu_flags)) > + s->horiz_slice = ff_horiz_slice_sse4; > + if (EXTERNAL_AVX2(cpu_flags)) > + s->horiz_slice = ff_horiz_slice_avx2; > +} > -- > 2.17.1 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".