Add an AVX2 implementation of compute_safe_ssd_integral_image used by 
vf_nlmeans.

checkasm: vf_nlmeans

bench: (x86_64, Linux) ssd_integral_image 1.93x

bench: (x86_64, Windows/MSVC) ssd_integral_image 1.71x

Signed-off-by: Andy Wu <[email protected]>
---
 libavfilter/x86/vf_nlmeans.asm    | 114 ++++++++++++++++++++++++++++++
 libavfilter/x86/vf_nlmeans_init.c |   9 ++-
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm
index 8f57801035..c61593b916 100644
--- a/libavfilter/x86/vf_nlmeans.asm
+++ b/libavfilter/x86/vf_nlmeans.asm
@@ -37,6 +37,120 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
 
 SECTION .text
 
+; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t dst_linesize_32,
+;                                         const uint8_t *s1, ptrdiff_t linesize1,
+;                                         const uint8_t *s2, ptrdiff_t linesize2,
+;                                         int w, int h);
+;
+; Assumptions (see C version):
+; - w is a multiple of 16 and w >= 16
+; - h >= 1
+; - dst[-1] and dst_top[-1] are readable
+
+INIT_YMM avx2
+; Per row: dst[x] = dst[x-1] + dst_top[x] - dst_top[x-1] + (s1[x] - s2[x])^2,
+; evaluated 8 pixels at a time as a prefix sum of the per-pixel increments,
+; seeded with the running value kept in carry.
+;
+; GPRs: the 8 arguments plus dst_top (row above dst), dst_stride (row stride in
+; bytes), x (column index) and carry (last dst value written) = 12 in total.
+cglobal compute_safe_ssd_integral_image, 8, 12, 6, 0, dst, dst_lz, s1, ls1, s2, ls2, w, h, dst_top, dst_stride, x, carry
+    mov            wd, dword wm
+    mov            hd, dword hm
+    movsxd         wq, wd
+
+    mov   dst_strideq, dst_lzq
+    shl   dst_strideq, 2                      ; uint32_t elements -> bytes
+    mov      dst_topq, dstq
+    sub      dst_topq, dst_strideq
+
+.yloop:
+    xor           xq, xq
+    mov       carryd, [dstq - 4]              ; carry = dst[-1]
+
+.xloop:
+    ; ---- process 8 pixels ----
+    pmovzxbd      m0, [s1q + xq]
+    pmovzxbd      m1, [s2q + xq]
+    psubd         m0, m1
+    pmulld        m0, m0                      ; (s1 - s2)^2
+
+    movu          m1, [dst_topq + xq*4]
+    movu          m2, [dst_topq + xq*4 - 4]
+    psubd         m1, m2                      ; dst_top[x] - dst_top[x-1]
+    paddd         m0, m1
+
+    ; Inclusive prefix sum inside each 128-bit lane. pslldq on a ymm register
+    ; shifts the two lanes independently, so two steps cover a lane's 4 dwords.
+    pslldq        m5, m0, 4
+    paddd         m0, m5
+    pslldq        m5, m0, 8
+    paddd         m0, m5
+
+    ; Add the low lane's total (its dword 3) to every dword of the high lane.
+    pshufd       xm5, xm0, 0xff
+    pxor          m4, m4
+    vinserti128   m4, m4, xm5, 1
+    paddd         m0, m4
+
+    movd         xm5, carryd
+    vpbroadcastd  m4, xm5
+    paddd         m0, m4                      ; + row sum of everything left of x
+
+    movu [dstq + xq*4], m0
+
+    vextracti128 xm5, m0, 1
+    pshufd       xm5, xm5, 0xff
+    movd      carryd, xm5                     ; carry = last dword just stored
+
+    add           xq, 8
+
+    ; ---- process 8 pixels ----
+    pmovzxbd      m0, [s1q + xq]
+    pmovzxbd      m1, [s2q + xq]
+    psubd         m0, m1
+    pmulld        m0, m0                      ; (s1 - s2)^2
+
+    movu          m1, [dst_topq + xq*4]
+    movu          m2, [dst_topq + xq*4 - 4]
+    psubd         m1, m2                      ; dst_top[x] - dst_top[x-1]
+    paddd         m0, m1
+
+    ; Inclusive prefix sum inside each 128-bit lane (same two-step scheme as
+    ; above; a shift by 16 bytes would zero the register and add nothing).
+    pslldq        m5, m0, 4
+    paddd         m0, m5
+    pslldq        m5, m0, 8
+    paddd         m0, m5
+
+    ; Add the low lane's total (its dword 3) to every dword of the high lane.
+    pshufd       xm5, xm0, 0xff
+    pxor          m4, m4
+    vinserti128   m4, m4, xm5, 1
+    paddd         m0, m4
+
+    movd         xm5, carryd
+    vpbroadcastd  m4, xm5
+    paddd         m0, m4                      ; + row sum of everything left of x
+
+    movu [dstq + xq*4], m0
+
+    vextracti128 xm5, m0, 1
+    pshufd       xm5, xm5, 0xff
+    movd      carryd, xm5                     ; carry = last dword just stored
+
+    add           xq, 8
+    cmp           xq, wq
+    jl .xloop
+
+    add          s1q, ls1q
+    add          s2q, ls2q
+    add         dstq, dst_strideq
+    add     dst_topq, dst_strideq
+    dec           hd
+    jg .yloop
+    RET
+
 ; void ff_compute_weights_line(const uint32_t *const iia,
 ;                              const uint32_t *const iib,
 ;                              const uint32_t *const iid,
diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
index 0adb2c7e8a..5bfdc7e028 100644
--- a/libavfilter/x86/vf_nlmeans_init.c
+++ b/libavfilter/x86/vf_nlmeans_init.c
@@ -20,6 +20,11 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_nlmeans.h"
 
+void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t 
dst_linesize_32,
+                                             const uint8_t *s1, ptrdiff_t 
linesize1,
+                                             const uint8_t *s2, ptrdiff_t 
linesize2,
+                                             int w, int h);
+
 void ff_compute_weights_line_avx2(const uint32_t *const iia,
                                   const uint32_t *const iib,
                                   const uint32_t *const iid,
@@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_AVX2_FAST(cpu_flags))
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        dsp->compute_safe_ssd_integral_image = 
ff_compute_safe_ssd_integral_image_avx2;
         dsp->compute_weights_line = ff_compute_weights_line_avx2;
+    }
 #endif
 }
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to