On Tue, Jun 7, 2016 at 10:36 AM, James Almer <jamr...@gmail.com> wrote: > On 6/4/2016 4:36 AM, Muhammad Faiz wrote: >> benchmark on x86_64 >> cqt_time: >> plain = 3.292 s >> SSE = 1.640 s >> SSE3 = 1.631 s >> AVX = 1.395 s >> FMA3 = 1.271 s >> FMA4 = not available > > Try using the START_TIMER and STOP_TIMER macros to wrap the s->cqt_calc > call in libavfilter/avf_showcqt.c > It will potentially give more accurate results than the current > UPDATE_TIME(s->cqt_time) check. > OK, but probably I will check it privately (not sending patch)
>> >> untested on x86_32 > > Do you have a sample command to test this? As Michael said FATE doesn't > cover showcqt. > I check it using psnr filter (avg psnr above 90dB means OK) #!/bin/bash # example usage: ./psnr-check audio.mp3 yuv420p "-cpuflags -fma3-fma4-avx-sse3" mkfifo in0.y4m mkfifo in1.y4m # this is new ffmpeg build_path=$HOME/Documents/sources/ffmpeg/ffmpeg-build LD_LIBRARY_PATH=$build_path/libavcodec:$build_path/libavdevice:$build_path/libavfilter LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libavformat:$build_path/libavutil LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libpostproc:$build_path/libswresample LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libswscale export LD_LIBRARY_PATH $build_path/ffmpeg $3 -i "$1" -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" -f yuv4mpegpipe -y in0.y4m >/dev/null 2>&1 </dev/null & # this is old ffmpeg unset LD_LIBRARY_PATH ffmpeg $3 -i "$1" -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" -f yuv4mpegpipe -y in1.y4m >/dev/null 2>&1 </dev/null & ffmpeg -i $dir/in0.y4m -i $dir/in1.y4m -filter_complex "psnr=f=-" -f null -y /dev/null >> >> Signed-off-by: Muhammad Faiz <mfc...@gmail.com> >> --- >> libavfilter/avf_showcqt.c | 7 ++ >> libavfilter/avf_showcqt.h | 3 + >> libavfilter/x86/Makefile | 2 + >> libavfilter/x86/avf_showcqt.asm | 206 >> +++++++++++++++++++++++++++++++++++++ >> libavfilter/x86/avf_showcqt_init.c | 63 ++++++++++++ >> 5 files changed, 281 insertions(+) >> create mode 100644 libavfilter/x86/avf_showcqt.asm >> create mode 100644 libavfilter/x86/avf_showcqt_init.c >> >> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c >> index b88c83c..62d5b09 100644 >> --- a/libavfilter/avf_showcqt.c >> +++ b/libavfilter/avf_showcqt.c >> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s) >> w *= sign * (1.0 / s->fft_len); >> s->coeffs[m].val[x - s->coeffs[m].start] = w; >> } >> + >> + if (s->permute_coeffs) >> + s->permute_coeffs(s->coeffs[m].val, 
s->coeffs[m].len); >> } >> >> av_expr_free(expr); >> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink) >> >> s->cqt_align = 1; >> s->cqt_calc = cqt_calc; >> + s->permute_coeffs = NULL; >> s->draw_sono = draw_sono; >> if (s->format == AV_PIX_FMT_RGB24) { >> s->draw_bar = draw_bar_rgb; >> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink) >> s->update_sono = update_sono_yuv; >> } >> >> + if (ARCH_X86) >> + ff_showcqt_init_x86(s); >> + >> if ((ret = init_cqt(s)) < 0) >> return ret; >> >> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h >> index b945f49..588830f 100644 >> --- a/libavfilter/avf_showcqt.h >> +++ b/libavfilter/avf_showcqt.h >> @@ -74,6 +74,7 @@ typedef struct { >> /* callback */ >> void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, >> const Coeffs *coeffs, >> int len, int fft_len); >> + void (*permute_coeffs)(float *v, int len); >> void (*draw_bar)(AVFrame *out, const float *h, const >> float *rcp_h, >> const ColorFloat *c, int bar_h); >> void (*draw_axis)(AVFrame *out, AVFrame *axis, const >> ColorFloat *c, int off); >> @@ -112,4 +113,6 @@ typedef struct { >> int axis; >> } ShowCQTContext; >> >> +void ff_showcqt_init_x86(ShowCQTContext *s); >> + >> #endif >> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >> index 4486b79..b6195f8 100644 >> --- a/libavfilter/x86/Makefile >> +++ b/libavfilter/x86/Makefile >> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += >> x86/vf_pp7_init.o >> OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o >> OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o >> OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o >> +OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o >> OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o >> OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o >> OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o >> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += >> x86/vf_pullup.o >> ifdef CONFIG_GPL >> 
YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o >> endif >> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o >> YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o >> YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o >> YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o >> diff --git a/libavfilter/x86/avf_showcqt.asm >> b/libavfilter/x86/avf_showcqt.asm >> new file mode 100644 >> index 0000000..ba30786 >> --- /dev/null >> +++ b/libavfilter/x86/avf_showcqt.asm >> @@ -0,0 +1,206 @@ >> +;***************************************************************************** >> +;* x86-optimized functions for showcqt filter >> +;* >> +;* Copyright (C) 2016 Muhammad Faiz <mfc...@gmail.com> >> +;* >> +;* This file is part of FFmpeg. >> +;* >> +;* FFmpeg is free software; you can redistribute it and/or >> +;* modify it under the terms of the GNU Lesser General Public >> +;* License as published by the Free Software Foundation; either >> +;* version 2.1 of the License, or (at your option) any later version. >> +;* >> +;* FFmpeg is distributed in the hope that it will be useful, >> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +;* Lesser General Public License for more details. 
>> +;* >> +;* You should have received a copy of the GNU Lesser General Public >> +;* License along with FFmpeg; if not, write to the Free Software >> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> +;****************************************************************************** >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +%if ARCH_X86_64 >> +%define pointer resq >> +%else >> +%define pointer resd >> +%endif >> + >> +struc Coeffs >> + .val: pointer 1 >> + .start: resd 1 >> + .len: resd 1 >> + .sizeof: >> +endstruc >> + >> +%macro EMULATE_HADDPS 3 ; dst, src, tmp >> +%if cpuflag(sse3) >> + haddps %1, %2 >> +%else >> + movaps %3, %1 >> + shufps %1, %2, q2020 >> + shufps %3, %2, q3131 >> + addps %1, %3 > > This is great. Much better and more efficient than other attempts to > emulate haddps scattered across the codebase. > It also makes me wonder if haddps, a ~5 cycles latency instruction is > really better than the combination of a mostly free mov, two 1 cycle > shuffles and one 3 cycle add to justify extra functions with it as the > only difference, at least in cases where there are no register > constrains. > > Your benchmarks above suggest it is although barely, so I'm curious > about what the timer.h macros will show. > Even probably haddps is slower than shufps+addps on some machine But, there is haddps and I use it. 
Updated using SSE3_FAST >> +%endif >> +%endmacro ; EMULATE_HADDPS >> + >> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp >> +%if cpuflag(fma3) || cpuflag(fma4) >> + fmaddps %1, %2, %3, %4 >> +%else >> + mulps %5, %2, %3 >> + addps %1, %4, %5 >> +%endif >> +%endmacro ; EMULATE_FMADDPS >> + >> +%macro CQT_CALC 9 >> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im >> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset >> + mov id, xd >> + add id, [coeffsq + Coeffs.start + %9] >> + movaps m%5, [srcq + 8 * iq] >> + movaps m%7, [srcq + 8 * iq + mmsize] >> + shufps m%6, m%5, m%7, q3131 >> + shufps m%5, m%5, m%7, q2020 >> + sub id, fft_lend >> + EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 >> + neg id > > Is this supposed to turn a positive value negative? If so then it should > be "neg iq", otherwise on x86_64 the high 32 bits of iq used in the > effective addresses below would be zero. > This is the intended behavior (i = fft_len - i) evaluated with int not int64_t >> + EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 >> + movups m%5, [srcq + 8 * iq - mmsize + 8] >> + movups m%7, [srcq + 8 * iq - 2*mmsize + 8] >> + %if mmsize == 32 >> + vperm2f128 m%5, m%5, m%5, 1 >> + vperm2f128 m%7, m%7, m%7, 1 >> + %endif >> + shufps m%6, m%5, m%7, q1313 >> + shufps m%5, m%5, m%7, q0202 >> + EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 >> + EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 >> +%endmacro ; CQT_CALC >> + >> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2 >> + addps m%5, m%4, m%2 >> + subps m%6, m%3, m%1 >> + addps m%1, m%3 >> + subps m%2, m%4 >> + EMULATE_HADDPS m%5, m%6, m%3 >> + EMULATE_HADDPS m%1, m%2, m%3 >> + EMULATE_HADDPS m%1, m%5, m%2 >> + %if mmsize == 32 >> + vextractf128 xmm%2, m%1, 1 >> + addps xmm%1, xmm%2 >> + %endif >> +%endmacro ; CQT_SEPARATE >> + >> +%macro DECLARE_CQT_CALC 0 >> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len) >> +%if ARCH_X86_64 >> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, >> coeffs_val, 
coeffs_val2, i, coeffs_len >> + align 16 >> + .loop_k: >> + mov xd, [coeffsq + Coeffs.len] >> + xorps m0, m0 >> + movaps m1, m0 >> + movaps m2, m0 >> + mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] >> + movaps m3, m0 >> + movaps m8, m0 >> + cmp coeffs_lend, xd >> + movaps m9, m0 >> + movaps m10, m0 >> + movaps m11, m0 >> + cmova coeffs_lend, xd >> + xor xd, xd >> + test coeffs_lend, coeffs_lend >> + jz .check_loop_b >> + mov coeffs_valq, [coeffsq + Coeffs.val] >> + mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] >> + align 16 >> + .loop_ab: >> + movaps m7, [coeffs_valq + 4 * xq] >> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 >> + movaps m7, [coeffs_val2q + 4 * xq] >> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof >> + add xd, mmsize/4 >> + cmp xd, coeffs_lend >> + jb .loop_ab >> + .check_loop_b: >> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] >> + jae .check_loop_a >> + align 16 >> + .loop_b: >> + movaps m7, [coeffs_val2q + 4 * xq] >> + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof >> + add xd, mmsize/4 >> + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] >> + jb .loop_b >> + .loop_end: >> + CQT_SEPARATE 0, 1, 2, 3, 4, 5 >> + CQT_SEPARATE 8, 9, 10, 11, 4, 5 >> + mulps xmm0, xmm0 >> + mulps xmm8, xmm8 >> + EMULATE_HADDPS xmm0, xmm8, xmm1 >> + movaps [dstq], xmm0 >> + sub lend, 2 >> + lea dstq, [dstq + 16] > > Use add > >> + lea coeffsq, [coeffsq + 2*Coeffs.sizeof] > > Same, assuming sizeof is an immediate. > This is optimization to separate sub and jnz with lea. Using add will clobber flag register. 
Also lea does not need rex prefix >> + jnz .loop_k >> + REP_RET >> + align 16 >> + .check_loop_a: >> + cmp xd, [coeffsq + Coeffs.len] >> + jae .loop_end >> + align 16 >> + .loop_a: >> + movaps m7, [coeffs_valq + 4 * xq] >> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 >> + add xd, mmsize/4 >> + cmp xd, [coeffsq + Coeffs.len] >> + jb .loop_a >> + jmp .loop_end >> +%else >> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i >> +%define fft_lend r4m >> + align 16 >> + .loop_k: >> + mov xd, [coeffsq + Coeffs.len] >> + xorps m0, m0 >> + movaps m1, m0 >> + movaps m2, m0 >> + movaps m3, m0 >> + test xd, xd >> + jz .store >> + mov coeffs_valq, [coeffsq + Coeffs.val] >> + xor xd, xd >> + align 16 >> + .loop_x: >> + movaps m7, [coeffs_valq + 4 * xq] >> + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 >> + add xd, mmsize/4 >> + cmp xd, [coeffsq + Coeffs.len] >> + jb .loop_x >> + CQT_SEPARATE 0, 1, 2, 3, 4, 5 >> + mulps xmm0, xmm0 >> + EMULATE_HADDPS xmm0, xmm0, xmm1 >> + .store: >> + movlps [dstq], xmm0 >> + sub lend, 1 >> + lea dstq, [dstq + 8] >> + lea coeffsq, [coeffsq + Coeffs.sizeof] > > Same as above for both of these leas. > Same answer. >> + jnz .loop_k >> + REP_RET >> +%endif ; ARCH_X86_64 >> +%endmacro ; DECLARE_CQT_CALC >> + >> +INIT_XMM sse >> +DECLARE_CQT_CALC >> +INIT_XMM sse3 >> +DECLARE_CQT_CALC >> +INIT_YMM avx >> +DECLARE_CQT_CALC >> +INIT_YMM fma3 >> +DECLARE_CQT_CALC >> +INIT_YMM fma4 > > All CPUs supporting FMA4 underperform in functions using ymm registers. > Make it xmm instead. > OK, updated >> +DECLARE_CQT_CALC >> diff --git a/libavfilter/x86/avf_showcqt_init.c >> b/libavfilter/x86/avf_showcqt_init.c >> new file mode 100644 >> index 0000000..664c6ac >> --- /dev/null >> +++ b/libavfilter/x86/avf_showcqt_init.c >> @@ -0,0 +1,63 @@ >> +/* >> + * Copyright (c) 2016 Muhammad Faiz <mfc...@gmail.com> >> + * >> + * This file is part of FFmpeg. 
>> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "libavutil/attributes.h" >> +#include "libavutil/cpu.h" >> +#include "libavutil/x86/cpu.h" >> +#include "libavfilter/avf_showcqt.h" >> + >> +#define DECLARE_CQT_CALC(type) \ >> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \ >> + const Coeffs *coeffs, int len, int fft_len) >> + >> +DECLARE_CQT_CALC(sse); >> +DECLARE_CQT_CALC(sse3); >> +DECLARE_CQT_CALC(avx); >> +DECLARE_CQT_CALC(fma3); >> +DECLARE_CQT_CALC(fma4); >> + >> +#define permute_coeffs_0 NULL >> + >> +static void permute_coeffs_01452367(float *v, int len) >> +{ >> + int k; >> + for (k = 0; k < len; k += 8) { >> + FFSWAP(float, v[k+2], v[k+4]); >> + FFSWAP(float, v[k+3], v[k+5]); >> + } >> +} >> + >> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s) >> +{ >> + int cpuflags = av_get_cpu_flags(); >> + >> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \ >> +if (EXTERNAL_##TYPE(cpuflags)) { \ >> + s->cqt_calc = ff_showcqt_cqt_calc_##type; \ >> + s->cqt_align = align; \ >> + s->permute_coeffs = permute_coeffs_##perm; \ >> +} >> + >> + SELECT_CQT_CALC(sse, SSE, 4, 0); >> + SELECT_CQT_CALC(sse3, SSE3, 4, 0); >> + SELECT_CQT_CALC(avx, AVX, 8, 01452367); > > Use AVX_FAST, so this function 
will not be used on CPUs that set the > AV_CPU_FLAG_AVXSLOW flag. > >> + SELECT_CQT_CALC(fma3, FMA3, 8, 01452367); > > Same, use FMA3_FAST. The result will then be the FMA3 version used by > Intel CPUs and hopefully AMD Zen, and the FMA4 one by Bulldozer based > CPUs. > >> + SELECT_CQT_CALC(fma4, FMA4, 8, 01452367); >> +} >> > OK, also reordered (FMA4 before AVX, because AVX/ymm without FMA4 is faster than FMA4/xmm). Modified patch attached. Thanks
From ab5ef722d358e4bfa1e9795d3703c2e31e3a6a74 Mon Sep 17 00:00:00 2001 From: Muhammad Faiz <mfc...@gmail.com> Date: Sat, 4 Jun 2016 14:33:05 +0700 Subject: [PATCH] avfilter/avf_showcqt: cqt_calc optimization on x86 benchmark on x86_64 cqt_time: plain = 3.292 s SSE = 1.640 s SSE3 = 1.631 s AVX = 1.395 s FMA3 = 1.271 s FMA4 = not available untested on x86_32 Signed-off-by: Muhammad Faiz <mfc...@gmail.com> --- libavfilter/avf_showcqt.c | 7 ++ libavfilter/avf_showcqt.h | 3 + libavfilter/x86/Makefile | 2 + libavfilter/x86/avf_showcqt.asm | 206 +++++++++++++++++++++++++++++++++++++ libavfilter/x86/avf_showcqt_init.c | 63 ++++++++++++ 5 files changed, 281 insertions(+) create mode 100644 libavfilter/x86/avf_showcqt.asm create mode 100644 libavfilter/x86/avf_showcqt_init.c diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c index b88c83c..62d5b09 100644 --- a/libavfilter/avf_showcqt.c +++ b/libavfilter/avf_showcqt.c @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s) w *= sign * (1.0 / s->fft_len); s->coeffs[m].val[x - s->coeffs[m].start] = w; } + + if (s->permute_coeffs) + s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len); } av_expr_free(expr); @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink) s->cqt_align = 1; s->cqt_calc = cqt_calc; + s->permute_coeffs = NULL; s->draw_sono = draw_sono; if (s->format == AV_PIX_FMT_RGB24) { s->draw_bar = draw_bar_rgb; @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink) s->update_sono = update_sono_yuv; } + if (ARCH_X86) + ff_showcqt_init_x86(s); + if ((ret = init_cqt(s)) < 0) return ret; diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h index b945f49..588830f 100644 --- a/libavfilter/avf_showcqt.h +++ b/libavfilter/avf_showcqt.h @@ -74,6 +74,7 @@ typedef struct { /* callback */ void (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs, int len, int fft_len); + void (*permute_coeffs)(float *v, int len); void (*draw_bar)(AVFrame *out, const 
float *h, const float *rcp_h, const ColorFloat *c, int bar_h); void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off); @@ -112,4 +113,6 @@ typedef struct { int axis; } ShowCQTContext; +void ff_showcqt_init_x86(ShowCQTContext *s); + #endif diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 4486b79..b6195f8 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o +OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt_init.o OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o ifdef CONFIG_GPL YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o endif +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += x86/avf_showcqt.o YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o YASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o YASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm new file mode 100644 index 0000000..ba30786 --- /dev/null +++ b/libavfilter/x86/avf_showcqt.asm @@ -0,0 +1,206 @@ +;***************************************************************************** +;* x86-optimized functions for showcqt filter +;* +;* Copyright (C) 2016 Muhammad Faiz <mfc...@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc Coeffs + .val: pointer 1 + .start: resd 1 + .len: resd 1 + .sizeof: +endstruc + +%macro EMULATE_HADDPS 3 ; dst, src, tmp +%if cpuflag(sse3) + haddps %1, %2 +%else + movaps %3, %1 + shufps %1, %2, q2020 + shufps %3, %2, q3131 + addps %1, %3 +%endif +%endmacro ; EMULATE_HADDPS + +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp +%if cpuflag(fma3) || cpuflag(fma4) + fmaddps %1, %2, %3, %4 +%else + mulps %5, %2, %3 + addps %1, %4, %5 +%endif +%endmacro ; EMULATE_FMADDPS + +%macro CQT_CALC 9 +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset + mov id, xd + add id, [coeffsq + Coeffs.start + %9] + movaps m%5, [srcq + 8 * iq] + movaps m%7, [srcq + 8 * iq + mmsize] + shufps m%6, m%5, m%7, q3131 + shufps m%5, m%5, m%7, q2020 + sub id, fft_lend + EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 + neg id + EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 + movups m%5, [srcq + 8 * iq - mmsize + 8] + movups m%7, [srcq + 8 * iq - 2*mmsize + 8] + %if mmsize == 32 + vperm2f128 m%5, m%5, m%5, 1 + vperm2f128 m%7, m%7, m%7, 1 + %endif + shufps m%6, m%5, m%7, q1313 + shufps m%5, m%5, m%7, q0202 + EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 + EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 +%endmacro ; CQT_CALC + +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2 + addps 
m%5, m%4, m%2 + subps m%6, m%3, m%1 + addps m%1, m%3 + subps m%2, m%4 + EMULATE_HADDPS m%5, m%6, m%3 + EMULATE_HADDPS m%1, m%2, m%3 + EMULATE_HADDPS m%1, m%5, m%2 + %if mmsize == 32 + vextractf128 xmm%2, m%1, 1 + addps xmm%1, xmm%2 + %endif +%endmacro ; CQT_SEPARATE + +%macro DECLARE_CQT_CALC 0 +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len) +%if ARCH_X86_64 +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len + align 16 + .loop_k: + mov xd, [coeffsq + Coeffs.len] + xorps m0, m0 + movaps m1, m0 + movaps m2, m0 + mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] + movaps m3, m0 + movaps m8, m0 + cmp coeffs_lend, xd + movaps m9, m0 + movaps m10, m0 + movaps m11, m0 + cmova coeffs_lend, xd + xor xd, xd + test coeffs_lend, coeffs_lend + jz .check_loop_b + mov coeffs_valq, [coeffsq + Coeffs.val] + mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] + align 16 + .loop_ab: + movaps m7, [coeffs_valq + 4 * xq] + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 + movaps m7, [coeffs_val2q + 4 * xq] + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof + add xd, mmsize/4 + cmp xd, coeffs_lend + jb .loop_ab + .check_loop_b: + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] + jae .check_loop_a + align 16 + .loop_b: + movaps m7, [coeffs_val2q + 4 * xq] + CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof + add xd, mmsize/4 + cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] + jb .loop_b + .loop_end: + CQT_SEPARATE 0, 1, 2, 3, 4, 5 + CQT_SEPARATE 8, 9, 10, 11, 4, 5 + mulps xmm0, xmm0 + mulps xmm8, xmm8 + EMULATE_HADDPS xmm0, xmm8, xmm1 + movaps [dstq], xmm0 + sub lend, 2 + lea dstq, [dstq + 16] + lea coeffsq, [coeffsq + 2*Coeffs.sizeof] + jnz .loop_k + REP_RET + align 16 + .check_loop_a: + cmp xd, [coeffsq + Coeffs.len] + jae .loop_end + align 16 + .loop_a: + movaps m7, [coeffs_valq + 4 * xq] + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 + add xd, mmsize/4 + cmp xd, [coeffsq + Coeffs.len] + jb .loop_a + jmp .loop_end +%else +cglobal 
showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i +%define fft_lend r4m + align 16 + .loop_k: + mov xd, [coeffsq + Coeffs.len] + xorps m0, m0 + movaps m1, m0 + movaps m2, m0 + movaps m3, m0 + test xd, xd + jz .store + mov coeffs_valq, [coeffsq + Coeffs.val] + xor xd, xd + align 16 + .loop_x: + movaps m7, [coeffs_valq + 4 * xq] + CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0 + add xd, mmsize/4 + cmp xd, [coeffsq + Coeffs.len] + jb .loop_x + CQT_SEPARATE 0, 1, 2, 3, 4, 5 + mulps xmm0, xmm0 + EMULATE_HADDPS xmm0, xmm0, xmm1 + .store: + movlps [dstq], xmm0 + sub lend, 1 + lea dstq, [dstq + 8] + lea coeffsq, [coeffsq + Coeffs.sizeof] + jnz .loop_k + REP_RET +%endif ; ARCH_X86_64 +%endmacro ; DECLARE_CQT_CALC + +INIT_XMM sse +DECLARE_CQT_CALC +INIT_XMM sse3 +DECLARE_CQT_CALC +INIT_YMM avx +DECLARE_CQT_CALC +INIT_YMM fma3 +DECLARE_CQT_CALC +INIT_XMM fma4 +DECLARE_CQT_CALC diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c new file mode 100644 index 0000000..664c6ac --- /dev/null +++ b/libavfilter/x86/avf_showcqt_init.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016 Muhammad Faiz <mfc...@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/avf_showcqt.h" + +#define DECLARE_CQT_CALC(type) \ +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \ + const Coeffs *coeffs, int len, int fft_len) + +DECLARE_CQT_CALC(sse); +DECLARE_CQT_CALC(sse3); +DECLARE_CQT_CALC(avx); +DECLARE_CQT_CALC(fma3); +DECLARE_CQT_CALC(fma4); + +#define permute_coeffs_0 NULL + +static void permute_coeffs_01452367(float *v, int len) +{ + int k; + for (k = 0; k < len; k += 8) { + FFSWAP(float, v[k+2], v[k+4]); + FFSWAP(float, v[k+3], v[k+5]); + } +} + +av_cold void ff_showcqt_init_x86(ShowCQTContext *s) +{ + int cpuflags = av_get_cpu_flags(); + +#define SELECT_CQT_CALC(type, TYPE, align, perm) \ +if (EXTERNAL_##TYPE(cpuflags)) { \ + s->cqt_calc = ff_showcqt_cqt_calc_##type; \ + s->cqt_align = align; \ + s->permute_coeffs = permute_coeffs_##perm; \ +} + + SELECT_CQT_CALC(sse, SSE, 4, 0); + SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0); + SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm + SELECT_CQT_CALC(avx, AVX_FAST, 8, 01452367); + SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367); +} -- 2.5.0
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel