Signed-off-by: Paul B Mahol <one...@gmail.com> --- libavfilter/vf_nlmeans.c | 3 ++ libavfilter/vf_nlmeans.h | 1 + libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_nlmeans.asm | 89 ++++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+) create mode 100644 libavfilter/x86/vf_nlmeans.asm
diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c index 93a14bcf19..16171d830a 100644 --- a/libavfilter/vf_nlmeans.c +++ b/libavfilter/vf_nlmeans.c @@ -513,6 +513,9 @@ void ff_nlmeans_init(NLMeansDSPContext *dsp) if (ARCH_AARCH64) ff_nlmeans_init_aarch64(dsp); + + if (ARCH_X86) + ff_nlmeans_init_x86(dsp); } static av_cold int init(AVFilterContext *ctx) diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h index d0d0056163..ae9f450dbf 100644 --- a/libavfilter/vf_nlmeans.h +++ b/libavfilter/vf_nlmeans.h @@ -45,5 +45,6 @@ typedef struct NLMeansDSPContext { void ff_nlmeans_init(NLMeansDSPContext *dsp); void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp); +void ff_nlmeans_init_x86(NLMeansDSPContext *dsp); #endif /* AVFILTER_NLMEANS_H */ diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index a29941eaeb..e87481bd7a 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -20,6 +20,7 @@ OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o +OBJS-$(CONFIG_NLMEANS_FILTER) += x86/vf_nlmeans_init.o OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o @@ -61,6 +62,7 @@ X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o +X86ASM-OBJS-$(CONFIG_NLMEANS_FILTER) += x86/vf_nlmeans.o X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm new file mode 100644 index 0000000000..aebcc59b54 --- /dev/null 
+++ b/libavfilter/x86/vf_nlmeans.asm
@@ -0,0 +1,105 @@
+;*****************************************************************************
+;* x86-optimized functions for nlmeans filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+
+%include "libavutil/x86/x86util.asm"
+
+; The routine below uses 11 general-purpose registers, so build it on x86-64 only.
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+
+SECTION_RODATA
+
+SECTION .text
+
+; void ff_compute_weights_line(const uint32_t *const iia,
+;                              const uint32_t *const iib,
+;                              const uint32_t *const iid,
+;                              const uint32_t *const iie,
+;                              const uint8_t *const src,
+;                              struct weighted_avg *wa,
+;                              const float *const lut,
+;                              int max,
+;                              int startx, int endx);
+;
+; For every x in [startx, endx), four pixels per iteration:
+;     diff   = min(iie[x] - iid[x] - iib[x] + iia[x], max)   (unsigned, 32-bit)
+;     weight = lut[diff]
+;     wa[x] += { weight, weight * src[x] }
+;
+; NOTE(review): wa is assumed to hold two packed floats per pixel (weight
+; accumulator first, weighted-sum accumulator second); confirm against the
+; definition of struct weighted_avg.
+; NOTE(review): there is no scalar tail. When (endx - startx) is not a
+; multiple of 4 the last iteration touches up to 3 elements past endx in
+; every array, so the caller must round the range or pad the buffers.
+
+; XMM width on purpose: the 16-byte stride, the "+ 4 * 4" store offset and the
+; unpack-based interleave below all handle exactly 4 pixels per iteration.
+INIT_XMM avx2
+cglobal compute_weights_line, 10, 11, 7, iia, iib, iid, iie, src, wa, lut, max, startx, endx, x
+    movsxdifnidn startxq, startxd
+    movsxdifnidn   endxq, endxd
+
+    cmp          startxq, endxq               ; empty range: nothing to accumulate
+    jge .end
+
+    mov               xq, startxq
+    shl               xq, 2                   ; x = byte offset into the uint32_t arrays
+    shl            endxq, 2                   ; loop bound as a byte offset as well
+    vpbroadcastd      m4, maxm                ; splat the 32-bit clamp value to every lane
+    pcmpeqd           m5, m5                  ; all-ones template for the gather mask
+
+    .loop:
+        movu          m0, [iieq + xq]
+        movu          m1, [iidq + xq]
+        movu          m2, [iibq + xq]
+        movu          m3, [iiaq + xq]
+        vpmovzxbd     m6, [srcq + startxq]    ; 4 source bytes -> 4 dwords
+        vcvtdq2ps     m6, m6                  ; ... -> 4 floats
+
+        psubd         m0, m1
+        psubd         m0, m2
+        paddd         m0, m3                  ; e - d - b + a
+        pminud        m0, m4                  ; unsigned clamp to max
+        pslld         m0, 2                   ; float index -> byte offset into lut
+        mova          m3, m5                  ; the gather clears its mask, so reload it
+        vpgatherdd    m1, [lutq + m0], m3     ; m1 = 4 weights
+
+        vmulps        m2, m1, m6              ; weight * src
+        vunpcklps     m0, m1, m2              ; { w0, w0*s0, w1, w1*s1 }
+        vunpckhps     m1, m1, m2              ; { w2, w2*s2, w3, w3*s3 }
+
+        movu          m2, [waq + xq * 2]
+        movu          m3, [waq + xq * 2 + 4 * 4]
+
+        vaddps        m0, m0, m2
+        vaddps        m1, m1, m3
+
+        movu          [waq + xq * 2], m0
+        movu          [waq + xq * 2 + 4 * 4], m1
+
+        add      startxq, 1 * 4
+        add           xq, 4 * 4
+        cmp           xq, endxq
+        jl .loop
+.end:
+    RET
+
+%endif
-- 
2.33.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".