On Mon, Jan 12, 2026 at 1:52 PM Andy Wu via ffmpeg-devel <[email protected]> wrote: > > Add an AVX2 implementation of compute_safe_ssd_integral_image used by > vf_nlmeans. > > checkasm: vf_nlmeans > > bench: (x86_64, Linux) ssd_integral_image 1.93x > > bench: (x86_64, Windows/MSVC) ssd_integral_image 1.71x > > Signed-off-by: Andy Wu <[email protected]> > --- > libavfilter/x86/vf_nlmeans.asm | 114 ++++++++++++++++++++++++++++++ > libavfilter/x86/vf_nlmeans_init.c | 9 ++- > 2 files changed, 122 insertions(+), 1 deletion(-) > > diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm > index 8f57801035..c61593b916 100644 > --- a/libavfilter/x86/vf_nlmeans.asm > +++ b/libavfilter/x86/vf_nlmeans.asm > @@ -37,6 +37,120 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\ > > SECTION .text > > +; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t > dst_linesize_32, > +; const uint8_t *s1, ptrdiff_t > linesize1, > +; const uint8_t *s2, ptrdiff_t > linesize2, > +; int w, int h); > +; > +; Assumptions (see C version): > +; - w is multiple of 16 and w >= 16 > +; - h >= 1 > +; - dst[-1] and dst_top[-1] are readable > + > +INIT_YMM avx2 > +cglobal compute_safe_ssd_integral_image, 8, 14, 6, 0, dst, dst_lz, s1, ls1, > s2, ls2, w, h, dst_top, dst_stride, x, carry, tmp > + mov wd, dword wm > + mov hd, dword hm > + movsxd wq, wd > + > + mov dst_strideq, dst_lzq > + shl dst_strideq, 2 > + mov dst_topq, dstq > + sub dst_topq, dst_strideq > + > +.yloop: > + xor xq, xq > + mov carryd, [dstq - 4] > + > +.xloop: > + ; ---- process 8 pixels ---- > + pmovzxbd m0, [s1q + xq] > + pmovzxbd m1, [s2q + xq] > + psubd m0, m1 > + pmulld m0, m0 > + > + movu m1, [dst_topq + xq*4] > + movu m2, [dst_topq + xq*4 - 4] > + psubd m1, m2 > + paddd m0, m1 > + > + mova m5, m0 > + pslldq m5, 4 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 8 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 16 > + paddd m0, m5 > + > + vextracti128 xm5, m0, 0 > + pshufd xm5, xm5, 0xff > + pxor m4, 
m4 > + vinserti128 m4, m4, xm5, 1 > + paddd m0, m4 > + > + movd xm5, carryd > + vpbroadcastd m4, xm5 > + paddd m0, m4 > + > + movu [dstq + xq*4], m0 > + > + vextracti128 xm5, m0, 1 > + pshufd xm5, xm5, 0xff > + movd carryd, xm5 > + > + add xq, 8 > + > + ; ---- process 8 pixels ---- > + pmovzxbd m0, [s1q + xq] > + pmovzxbd m1, [s2q + xq] > + psubd m0, m1 > + pmulld m0, m0 > + > + movu m1, [dst_topq + xq*4] > + movu m2, [dst_topq + xq*4 - 4] > + psubd m1, m2 > + paddd m0, m1 > + > + mova m5, m0 > + pslldq m5, 4 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 8 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 16 > + paddd m0, m5 > + > + vextracti128 xm5, m0, 0 > + pshufd xm5, xm5, 0xff > + pxor m4, m4 > + vinserti128 m4, m4, xm5, 1 > + paddd m0, m4 > + > + movd xm5, carryd > + vpbroadcastd m4, xm5 > + paddd m0, m4 > + > + movu [dstq + xq*4], m0 > + > + vextracti128 xm5, m0, 1 > + pshufd xm5, xm5, 0xff > + movd carryd, xm5 > + > + add xq, 8 The duplicated block of code for processing 8 pixels could be wrapped in a macro; that would make the code shorter and more maintainable. 
> + cmp xq, wq > + jl .xloop > + > + add s1q, ls1q > + add s2q, ls2q > + add dstq, dst_strideq > + add dst_topq, dst_strideq > + dec hd > + jg .yloop > + RET > + > ; void ff_compute_weights_line(const uint32_t *const iia, > ; const uint32_t *const iib, > ; const uint32_t *const iid, > diff --git a/libavfilter/x86/vf_nlmeans_init.c > b/libavfilter/x86/vf_nlmeans_init.c > index 0adb2c7e8a..5bfdc7e028 100644 > --- a/libavfilter/x86/vf_nlmeans_init.c > +++ b/libavfilter/x86/vf_nlmeans_init.c > @@ -20,6 +20,11 @@ > #include "libavutil/x86/cpu.h" > #include "libavfilter/vf_nlmeans.h" > > +void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t > dst_linesize_32, > + const uint8_t *s1, ptrdiff_t > linesize1, > + const uint8_t *s2, ptrdiff_t > linesize2, > + int w, int h); > + > void ff_compute_weights_line_avx2(const uint32_t *const iia, > const uint32_t *const iib, > const uint32_t *const iid, > @@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp) > #if ARCH_X86_64 > int cpu_flags = av_get_cpu_flags(); > > - if (EXTERNAL_AVX2_FAST(cpu_flags)) > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > + dsp->compute_safe_ssd_integral_image = > ff_compute_safe_ssd_integral_image_avx2; > dsp->compute_weights_line = ff_compute_weights_line_avx2; > + } > #endif > } > -- > 2.43.0 > > _______________________________________________ > ffmpeg-devel mailing list -- [email protected] > To unsubscribe send an email to [email protected]
-- ======================================= Jun zhao/赵军 +++++++++++++++++++++++++++++++++++++++ _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
