On Tue, Jun 7, 2016 at 4:18 PM, Muhammad Faiz <mfc...@gmail.com> wrote:
> On Tue, Jun 7, 2016 at 10:36 AM, James Almer <jamr...@gmail.com> wrote:
>> On 6/4/2016 4:36 AM, Muhammad Faiz wrote:
>>> benchmark on x86_64
>>> cqt_time:
>>> plain = 3.292 s
>>> SSE   = 1.640 s
>>> SSE3  = 1.631 s
>>> AVX   = 1.395 s
>>> FMA3  = 1.271 s
>>> FMA4  = not available
>>
>> Try using the START_TIMER and STOP_TIMER macros to wrap the s->cqt_calc
>> call in libavfilter/avf_showcqt.c.
>> It will potentially give more accurate results than the current
>> UPDATE_TIME(s->cqt_time) check.
>>
> OK, but I will probably check it privately (not sending a patch).
>
>>>
>>> untested on x86_32
>>
>> Do you have a sample command to test this? As Michael said, FATE doesn't
>> cover showcqt.
>>
>
> I check it using the psnr filter (average PSNR above 90 dB means OK):
>
> #!/bin/bash
> # example usage: ./psnr-check audio.mp3 yuv420p "-cpuflags -fma3-fma4-avx-sse3"
>
> mkfifo in0.y4m
> mkfifo in1.y4m
>
> # this is the new ffmpeg
> build_path=$HOME/Documents/sources/ffmpeg/ffmpeg-build
> LD_LIBRARY_PATH=$build_path/libavcodec:$build_path/libavdevice:$build_path/libavfilter
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libavformat:$build_path/libavutil
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libpostproc:$build_path/libswresample
> LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$build_path/libswscale
>
> export LD_LIBRARY_PATH
>
> $build_path/ffmpeg $3 -i "$1" -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" -f yuv4mpegpipe -y in0.y4m >/dev/null 2>&1 </dev/null &
>
> # this is the old ffmpeg
> unset LD_LIBRARY_PATH
> ffmpeg $3 -i "$1" -filter_complex "showcqt, format=$2, format=yuv444p|yuv422p|yuv420p" -f yuv4mpegpipe -y in1.y4m >/dev/null 2>&1 </dev/null &
>
> ffmpeg -i in0.y4m -i in1.y4m -filter_complex "psnr=f=-" -f null -y /dev/null
>
>
>>>
>>> Signed-off-by: Muhammad Faiz <mfc...@gmail.com>
>>> ---
>>>  libavfilter/avf_showcqt.c          |   7 ++
>>>  libavfilter/avf_showcqt.h          |   3 +
>>>  libavfilter/x86/Makefile           |   2 +
>>>  libavfilter/x86/avf_showcqt.asm    | 206 +++++++++++++++++++++++++++++++++++++
>>>  libavfilter/x86/avf_showcqt_init.c |  63 ++++++++++++
>>>  5 files changed, 281 insertions(+)
>>>  create mode 100644 libavfilter/x86/avf_showcqt.asm
>>>  create mode 100644 libavfilter/x86/avf_showcqt_init.c
>>>
>>> diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
>>> index b88c83c..62d5b09 100644
>>> --- a/libavfilter/avf_showcqt.c
>>> +++ b/libavfilter/avf_showcqt.c
>>> @@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
>>>              w *= sign * (1.0 / s->fft_len);
>>>              s->coeffs[m].val[x - s->coeffs[m].start] = w;
>>>          }
>>> +
>>> +        if (s->permute_coeffs)
>>> +            s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
>>>      }
>>>
>>>      av_expr_free(expr);
>>> @@ -1230,6 +1233,7 @@ static int config_output(AVFilterLink *outlink)
>>>
>>>      s->cqt_align = 1;
>>>      s->cqt_calc = cqt_calc;
>>> +    s->permute_coeffs = NULL;
>>>      s->draw_sono = draw_sono;
>>>      if (s->format == AV_PIX_FMT_RGB24) {
>>>          s->draw_bar = draw_bar_rgb;
>>> @@ -1241,6 +1245,9 @@ static int config_output(AVFilterLink *outlink)
>>>          s->update_sono = update_sono_yuv;
>>>      }
>>>
>>> +    if (ARCH_X86)
>>> +        ff_showcqt_init_x86(s);
>>> +
>>>      if ((ret = init_cqt(s)) < 0)
>>>          return ret;
>>>
>>> diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
>>> index b945f49..588830f 100644
>>> --- a/libavfilter/avf_showcqt.h
>>> +++ b/libavfilter/avf_showcqt.h
>>> @@ -74,6 +74,7 @@ typedef struct {
>>>      /* callback */
>>>      void                (*cqt_calc)(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
>>>                                      int len, int fft_len);
>>> +    void                (*permute_coeffs)(float *v, int len);
>>>      void                (*draw_bar)(AVFrame *out, const float *h, const float *rcp_h,
>>>                                      const ColorFloat *c, int bar_h);
>>>      void                (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
>>> @@ -112,4 +113,6 @@ typedef struct {
>>>      int axis;
>>>  } ShowCQTContext;
>>>
>>> +void ff_showcqt_init_x86(ShowCQTContext *s);
>>> +
>>>  #endif
>>> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>>> index 4486b79..b6195f8 100644
>>> --- a/libavfilter/x86/Makefile
>>> +++ b/libavfilter/x86/Makefile
>>> @@ -13,6 +13,7 @@ OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
>>>  OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
>>>  OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
>>>  OBJS-$(CONFIG_REMOVEGRAIN_FILTER)            += x86/vf_removegrain_init.o
>>> +OBJS-$(CONFIG_SHOWCQT_FILTER)                += x86/avf_showcqt_init.o
>>>  OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
>>>  OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
>>>  OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>>> @@ -37,6 +38,7 @@ YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
>>>  ifdef CONFIG_GPL
>>>  YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER)       += x86/vf_removegrain.o
>>>  endif
>>> +YASM-OBJS-$(CONFIG_SHOWCQT_FILTER)           += x86/avf_showcqt.o
>>>  YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
>>>  YASM-OBJS-$(CONFIG_STEREO3D_FILTER)          += x86/vf_stereo3d.o
>>>  YASM-OBJS-$(CONFIG_TBLEND_FILTER)            += x86/vf_blend.o
>>> diff --git a/libavfilter/x86/avf_showcqt.asm b/libavfilter/x86/avf_showcqt.asm
>>> new file mode 100644
>>> index 0000000..ba30786
>>> --- /dev/null
>>> +++ b/libavfilter/x86/avf_showcqt.asm
>>> @@ -0,0 +1,206 @@
>>> +;*****************************************************************************
>>> +;* x86-optimized functions for showcqt filter
>>> +;*
>>> +;* Copyright (C) 2016 Muhammad Faiz <mfc...@gmail.com>
>>> +;*
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +%if ARCH_X86_64
>>> +%define pointer resq
>>> +%else
>>> +%define pointer resd
>>> +%endif
>>> +
>>> +struc Coeffs
>>> +    .val:    pointer 1
>>> +    .start:  resd 1
>>> +    .len:    resd 1
>>> +    .sizeof:
>>> +endstruc
>>> +
>>> +%macro EMULATE_HADDPS 3 ; dst, src, tmp
>>> +%if cpuflag(sse3)
>>> +    haddps  %1, %2
>>> +%else
>>> +    movaps  %3, %1
>>> +    shufps  %1, %2, q2020
>>> +    shufps  %3, %2, q3131
>>> +    addps   %1, %3
>>
>> This is great. Much better and more efficient than other attempts to
>> emulate haddps scattered across the codebase.
>> It also makes me wonder if haddps, a ~5 cycle latency instruction, is
>> really better than the combination of a mostly free mov, two 1 cycle
>> shuffles and one 3 cycle add to justify extra functions with it as the
>> only difference, at least in cases where there are no register
>> constraints.
>>
>> Your benchmarks above suggest it is, although barely, so I'm curious
>> about what the timer.h macros will show.
>>
> haddps may even be slower than shufps+addps on some machines,
> but haddps is there and I use it.
> Updated to use SSE3_FAST.
>
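
For reference, a scalar model of what both branches of EMULATE_HADDPS
produce; this is illustrative C only, not code from the patch, and the
function name haddps_model is made up for the example:

    /* haddps a, b (and the SSE fallback above) packs the pairwise sums
     * of each input into one register */
    static void haddps_model(float dst[4], const float a[4], const float b[4])
    {
        dst[0] = a[0] + a[1];   /* pairwise sums of the first source  */
        dst[1] = a[2] + a[3];
        dst[2] = b[0] + b[1];   /* pairwise sums of the second source */
        dst[3] = b[2] + b[3];
    }

The SSE fallback reaches the same result by gathering the even lanes
{a0,a2,b0,b2} and the odd lanes {a1,a3,b1,b3} with the two shufps and
adding them, which is the mov + shufps + shufps + addps sequence discussed
above.
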
>>> +%endif
>>> +%endmacro ; EMULATE_HADDPS
>>> +
>>> +%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
>>> +%if cpuflag(fma3) || cpuflag(fma4)
>>> +    fmaddps %1, %2, %3, %4
>>> +%else
>>> +    mulps   %5, %2, %3
>>> +    addps   %1, %4, %5
>>> +%endif
>>> +%endmacro ; EMULATE_FMADDPS
>>> +
>>> +%macro CQT_CALC 9
>>> +; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
>>> +; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
>>> +    mov     id, xd
>>> +    add     id, [coeffsq + Coeffs.start + %9]
>>> +    movaps  m%5, [srcq + 8 * iq]
>>> +    movaps  m%7, [srcq + 8 * iq + mmsize]
>>> +    shufps  m%6, m%5, m%7, q3131
>>> +    shufps  m%5, m%5, m%7, q2020
>>> +    sub     id, fft_lend
>>> +    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
>>> +    neg     id
>>
>> Is this supposed to turn a positive value negative? If so then it should
>> be "neg iq", otherwise on x86_64 the high 32 bits of iq used in the
>> effective addresses below would be zero.
>>
> This is the intended behavior (i = fft_len - i) evaluated with int not int64_t
>
>>> +    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
>>> +    movups  m%5, [srcq + 8 * iq - mmsize + 8]
>>> +    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
>>> +    %if mmsize == 32
>>> +    vperm2f128 m%5, m%5, m%5, 1
>>> +    vperm2f128 m%7, m%7, m%7, 1
>>> +    %endif
>>> +    shufps  m%6, m%5, m%7, q1313
>>> +    shufps  m%5, m%5, m%7, q0202
>>> +    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
>>> +    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
>>> +%endmacro ; CQT_CALC
>>> +
>>> +%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
>>> +    addps   m%5, m%4, m%2
>>> +    subps   m%6, m%3, m%1
>>> +    addps   m%1, m%3
>>> +    subps   m%2, m%4
>>> +    EMULATE_HADDPS m%5, m%6, m%3
>>> +    EMULATE_HADDPS m%1, m%2, m%3
>>> +    EMULATE_HADDPS m%1, m%5, m%2
>>> +    %if mmsize == 32
>>> +    vextractf128 xmm%2, m%1, 1
>>> +    addps   xmm%1, xmm%2
>>> +    %endif
>>> +%endmacro ; CQT_SEPARATE
>>> +
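
For reference, the computation that CQT_CALC and CQT_SEPARATE implement
together corresponds roughly to the scalar C below. This is an illustrative
sketch, not code from the patch: cqt_calc_model, CplxModel and CoeffsModel
are made-up stand-ins for the real FFTComplex/Coeffs types, and it only
shows the register roles (a_re/a_im accumulated from src[i], b_re/b_im from
the mirrored src[fft_len - i] discussed above):

    typedef struct { float re, im; } CplxModel;                  /* stand-in for FFTComplex */
    typedef struct { float *val; int start, len; } CoeffsModel;  /* stand-in for Coeffs     */

    static void cqt_calc_model(CplxModel *dst, const CplxModel *src,
                               const CoeffsModel *coeffs, int len, int fft_len)
    {
        int k, x;
        for (k = 0; k < len; k++) {
            float a_re = 0, a_im = 0, b_re = 0, b_im = 0;
            float l_re, l_im, r_re, r_im;
            for (x = 0; x < coeffs[k].len; x++) {
                float u = coeffs[k].val[x];
                int   i = coeffs[k].start + x;
                int   j = fft_len - i;   /* the "i = fft_len - i" index from the review */
                a_re += u * src[i].re;
                a_im += u * src[i].im;
                b_re += u * src[j].re;
                b_im += u * src[j].im;
            }
            /* CQT_SEPARATE: untangle the two channels packed into the one FFT
             * and store their squared magnitudes in dst[k].re / dst[k].im */
            l_re = a_re + b_re;
            l_im = a_im - b_im;
            r_re = b_im + a_im;
            r_im = b_re - a_re;
            dst[k].re = l_re * l_re + l_im * l_im;
            dst[k].im = r_re * r_re + r_im * r_im;
        }
    }

The ymm variants also explain the 0,1,4,5,2,3,6,7 coefficient permutation
used later in the init file: shufps on ymm registers shuffles within each
128-bit lane, so the deinterleaved re/im values come out in that lane-wise
order, and the coefficient array is pre-permuted to match.
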
>>> +%macro DECLARE_CQT_CALC 0
>>> +; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
>>> +%if ARCH_X86_64
>>> +cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
>>> +    align 16
>>> +    .loop_k:
>>> +        mov     xd, [coeffsq + Coeffs.len]
>>> +        xorps   m0, m0
>>> +        movaps  m1, m0
>>> +        movaps  m2, m0
>>> +        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +        movaps  m3, m0
>>> +        movaps  m8, m0
>>> +        cmp     coeffs_lend, xd
>>> +        movaps  m9, m0
>>> +        movaps  m10, m0
>>> +        movaps  m11, m0
>>> +        cmova   coeffs_lend, xd
>>> +        xor     xd, xd
>>> +        test    coeffs_lend, coeffs_lend
>>> +        jz      .check_loop_b
>>> +        mov     coeffs_valq, [coeffsq + Coeffs.val]
>>> +        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
>>> +        align 16
>>> +        .loop_ab:
>>> +            movaps  m7, [coeffs_valq + 4 * xq]
>>> +            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +            movaps  m7, [coeffs_val2q + 4 * xq]
>>> +            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, coeffs_lend
>>> +            jb      .loop_ab
>>> +        .check_loop_b:
>>> +        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +        jae     .check_loop_a
>>> +        align 16
>>> +        .loop_b:
>>> +            movaps  m7, [coeffs_val2q + 4 * xq]
>>> +            CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
>>> +            jb      .loop_b
>>> +    .loop_end:
>>> +        CQT_SEPARATE 0, 1, 2, 3, 4, 5
>>> +        CQT_SEPARATE 8, 9, 10, 11, 4, 5
>>> +        mulps   xmm0, xmm0
>>> +        mulps   xmm8, xmm8
>>> +        EMULATE_HADDPS xmm0, xmm8, xmm1
>>> +        movaps  [dstq], xmm0
>>> +        sub     lend, 2
>>> +        lea     dstq, [dstq + 16]
>>
>> Use add.
>>
>>> +        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
>>
>> Same, assuming sizeof is an immediate.
>>
> This is an optimization, to separate the sub and the jnz with an lea.
> Using add would clobber the flags register.
> Also, lea does not need a REX prefix.
>
>>> +        jnz     .loop_k
>>> +        REP_RET
>>> +    align 16
>>> +    .check_loop_a:
>>> +        cmp     xd, [coeffsq + Coeffs.len]
>>> +        jae     .loop_end
>>> +    align 16
>>> +    .loop_a:
>>> +        movaps  m7, [coeffs_valq + 4 * xq]
>>> +        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +        add     xd, mmsize/4
>>> +        cmp     xd, [coeffsq + Coeffs.len]
>>> +        jb      .loop_a
>>> +        jmp     .loop_end
>>> +%else
>>> +cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
>>> +%define fft_lend r4m
>>> +    align 16
>>> +    .loop_k:
>>> +        mov     xd, [coeffsq + Coeffs.len]
>>> +        xorps   m0, m0
>>> +        movaps  m1, m0
>>> +        movaps  m2, m0
>>> +        movaps  m3, m0
>>> +        test    xd, xd
>>> +        jz      .store
>>> +        mov     coeffs_valq, [coeffsq + Coeffs.val]
>>> +        xor     xd, xd
>>> +        align 16
>>> +        .loop_x:
>>> +            movaps  m7, [coeffs_valq + 4 * xq]
>>> +            CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
>>> +            add     xd, mmsize/4
>>> +            cmp     xd, [coeffsq + Coeffs.len]
>>> +            jb      .loop_x
>>> +        CQT_SEPARATE 0, 1, 2, 3, 4, 5
>>> +        mulps   xmm0, xmm0
>>> +        EMULATE_HADDPS xmm0, xmm0, xmm1
>>> +    .store:
>>> +        movlps  [dstq], xmm0
>>> +        sub     lend, 1
>>> +        lea     dstq, [dstq + 8]
>>> +        lea     coeffsq, [coeffsq + Coeffs.sizeof]
>>
>> Same as above for both of these leas.
>>
> Same answer.
>
>>> +        jnz     .loop_k
>>> +        REP_RET
>>> +%endif ; ARCH_X86_64
>>> +%endmacro ; DECLARE_CQT_CALC
>>> +
>>> +INIT_XMM sse
>>> +DECLARE_CQT_CALC
>>> +INIT_XMM sse3
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM avx
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM fma3
>>> +DECLARE_CQT_CALC
>>> +INIT_YMM fma4
>>
>> All CPUs supporting FMA4 underperform in functions using ymm registers.
>> Make it xmm instead.
>>
> OK, updated.
>
>>> +DECLARE_CQT_CALC
>>> diff --git a/libavfilter/x86/avf_showcqt_init.c b/libavfilter/x86/avf_showcqt_init.c
>>> new file mode 100644
>>> index 0000000..664c6ac
>>> --- /dev/null
>>> +++ b/libavfilter/x86/avf_showcqt_init.c
>>> @@ -0,0 +1,63 @@
>>> +/*
>>> + * Copyright (c) 2016 Muhammad Faiz <mfc...@gmail.com>
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/attributes.h"
>>> +#include "libavutil/cpu.h"
>>> +#include "libavutil/x86/cpu.h"
>>> +#include "libavfilter/avf_showcqt.h"
>>> +
>>> +#define DECLARE_CQT_CALC(type) \
>>> +void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
>>> +                                const Coeffs *coeffs, int len, int fft_len)
>>> +
>>> +DECLARE_CQT_CALC(sse);
>>> +DECLARE_CQT_CALC(sse3);
>>> +DECLARE_CQT_CALC(avx);
>>> +DECLARE_CQT_CALC(fma3);
>>> +DECLARE_CQT_CALC(fma4);
>>> +
>>> +#define permute_coeffs_0 NULL
>>> +
>>> +static void permute_coeffs_01452367(float *v, int len)
>>> +{
>>> +    int k;
>>> +    for (k = 0; k < len; k += 8) {
>>> +        FFSWAP(float, v[k+2], v[k+4]);
>>> +        FFSWAP(float, v[k+3], v[k+5]);
>>> +    }
>>> +}
>>> +
>>> +av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
>>> +{
>>> +    int cpuflags = av_get_cpu_flags();
>>> +
>>> +#define SELECT_CQT_CALC(type, TYPE, align, perm) \
>>> +if (EXTERNAL_##TYPE(cpuflags)) { \
>>> +    s->cqt_calc = ff_showcqt_cqt_calc_##type; \
>>> +    s->cqt_align = align; \
>>> +    s->permute_coeffs = permute_coeffs_##perm; \
>>> +}
>>> +
>>> +    SELECT_CQT_CALC(sse, SSE, 4, 0);
>>> +    SELECT_CQT_CALC(sse3, SSE3, 4, 0);
>>> +    SELECT_CQT_CALC(avx, AVX, 8, 01452367);
>>
>> Use AVX_FAST, so this function will not be used on CPUs that set the
>> AV_CPU_FLAG_AVXSLOW flag.
>>
>>> +    SELECT_CQT_CALC(fma3, FMA3, 8, 01452367);
>>
>> Same, use FMA3_FAST. The result will then be the FMA3 version used by
>> Intel CPUs and hopefully AMD Zen, and the FMA4 one by Bulldozer-based
>> CPUs.
>>
>>> +    SELECT_CQT_CALC(fma4, FMA4, 8, 01452367);
>>> +}
>>>
>>
>
> OK, also reordered (FMA4 before AVX, because AVX/ymm without FMA4 is faster
> than FMA4/xmm).
>
> Modified patch attached.
>
> Thanks
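
To recap what that reordering means concretely: with the _FAST suggestions
above and FMA4 moved before AVX, the selection presumably ends up along
these lines (a sketch of the ff_showcqt_init_x86() body only, based on the
changes described in this thread; the attached/committed patch is
authoritative):

    /* later matching entries override earlier ones, so the last
     * supported variant wins */
    SELECT_CQT_CALC(sse,  SSE,       4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    SELECT_CQT_CALC(fma4, FMA4,      8, 01452367); /* xmm version */
    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367); /* ymm, preferred over FMA4/xmm */
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
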
Applied with a modified commit message.

Thanks

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel