[FFmpeg-devel] [PR] lavfi/nlmeans: add aarch64 neon for compute_weights_line (PR #21393)

Jun Zhao via ffmpeg-devel Tue, 06 Jan 2026 05:24:12 -0800

PR #21393 opened by Jun Zhao (mypopydev)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393.patch


Implement NEON optimization for compute_weights_line.

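Also update the function signature to use ptrdiff_t for stack arguments
(max_meaningful_diff, startx, endx). The assembly reads each of them from
its own 8-byte stack slot ([sp], [sp+8], [sp+16]). The standard AAPCS64
already gives every stack argument an 8-byte slot, but Apple's AArch64 ABI
packs 32-bit stack arguments at 4-byte alignment, so with int parameters
the offsets would not match there. Using ptrdiff_t enforces 8-byte
alignment/size on every AArch64 target, consistent with the assembly.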

The x86 AVX2 prototype is updated to match the new signature.

Performance benchmark (AArch64):
./tests/checkasm/checkasm --test=vf_nlmeans --bench
compute_weights_line_c:     579.2 ( 1.00x)
compute_weights_line_neon:  110.4 ( 5.25x)

Signed-off-by: Jun Zhao <[email protected]>


From 46b2331ef11e14dd06d5849a16a63546522ba5f0 Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 6 Jan 2026 17:44:01 +0800
Subject: [PATCH] lavfi/nlmeans: add aarch64 neon for compute_weights_line

Implement NEON optimization for compute_weights_line.

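Also update the function signature to use ptrdiff_t for stack arguments
(max_meaningful_diff, startx, endx). The assembly reads each of them from
its own 8-byte stack slot ([sp], [sp+8], [sp+16]). The standard AAPCS64
already gives every stack argument an 8-byte slot, but Apple's AArch64 ABI
packs 32-bit stack arguments at 4-byte alignment, so with int parameters
the offsets would not match there. Using ptrdiff_t enforces 8-byte
alignment/size on every AArch64 target, consistent with the assembly.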

The x86 AVX2 prototype is updated to match the new signature.

Performance benchmark (AArch64):
./tests/checkasm/checkasm --test=vf_nlmeans --bench
compute_weights_line_c:     579.2 ( 1.00x)
compute_weights_line_neon:  110.4 ( 5.25x)

Signed-off-by: Jun Zhao <[email protected]>
---
 libavfilter/aarch64/vf_nlmeans_init.c |  15 ++-
 libavfilter/aarch64/vf_nlmeans_neon.S | 128 ++++++++++++++++++++++++++
 libavfilter/vf_nlmeans.h              |   4 +-
 libavfilter/vf_nlmeans_init.h         |   4 +-
 libavfilter/x86/vf_nlmeans_init.c     |   4 +-
 tests/checkasm/vf_nlmeans.c           | 108 ++++++++++++++++++++++
 6 files changed, 256 insertions(+), 7 deletions(-)

diff --git a/libavfilter/aarch64/vf_nlmeans_init.c b/libavfilter/aarch64/vf_nlmeans_init.c
index 6793370a4a..fbee336322 100644
--- a/libavfilter/aarch64/vf_nlmeans_init.c
+++ b/libavfilter/aarch64/vf_nlmeans_init.c
@@ -25,10 +25,23 @@ void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst, ptrdiff_t dst_linesi
                                              const uint8_t *s2, ptrdiff_t linesize2,
                                              int w, int h);
 
+void ff_compute_weights_line_neon(const uint32_t *const iia,
+                                  const uint32_t *const iib,
+                                  const uint32_t *const iid,
+                                  const uint32_t *const iie,
+                                  const uint8_t *const src,
+                                  float *total_weight,
+                                  float *sum,
+                                  const float *const weight_lut, /* last register argument (x7) */
+                                  ptrdiff_t max_meaningful_diff, /* stack: asm reads it at [sp+0] */
+                                  ptrdiff_t startx, ptrdiff_t endx); /* stack: [sp+8] and [sp+16] */
+
 av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_neon(cpu_flags))
+    if (have_neon(cpu_flags)) {
         dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_neon;
+        dsp->compute_weights_line = ff_compute_weights_line_neon;
+    }
 }
diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S
index a788cffd85..c913ae4e21 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -78,3 +78,131 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
         b.ne            1b
         ret
 endfunc
+
+function ff_compute_weights_line_neon, export=1
+        // x0 = iia, x1 = iib, x2 = iid, x3 = iie
+        // x4 = src, x5 = total_weight, x6 = sum, x7 = weight_lut
+        // stack (ptrdiff_t each): [sp+0] = max_meaningful_diff, [sp+8] = startx, [sp+16] = endx
+
+        ldr             x8,  [sp, #0]                   // max_meaningful_diff (full 64-bit slot)
+        ldr             x9,  [sp, #8]                   // startx
+        ldr             x10, [sp, #16]                  // endx
+
+        cmp             x9, x10
+        b.ge            9f                              // if startx >= endx return
+
+        // Offset pointers
+        lsl             x11, x9, #2                     // startx * 4 (for uint32/float)
+        add             x0, x0, x11                     // iia += startx
+        add             x1, x1, x11                     // iib += startx
+        add             x2, x2, x11                     // iid += startx
+        add             x3, x3, x11                     // iie += startx
+        add             x5, x5, x11                     // total_weight += startx
+        add             x6, x6, x11                     // sum += startx
+
+        // src is uint8, so offset is just startx
+        add             x4, x4, x9                      // src += startx
+
+        dup             v30.4s, w8                      // v30 = max_meaningful_diff (low 32 bits, all lanes)
+
+        sub             x10, x10, x9                    // count = endx - startx
+
+1:      // Main loop: 4 pixels per iteration
+        cmp             x10, #4
+        b.lt            2f                              // fewer than 4 left: handle leftovers
+
+        // Load integral image values
+        ld1             {v0.4s}, [x0], #16              // iia
+        ld1             {v1.4s}, [x1], #16              // iib
+        ld1             {v2.4s}, [x2], #16              // iid
+        ld1             {v3.4s}, [x3], #16              // iie
+
+        // diff = e - d - b + a
+        sub             v3.4s, v3.4s, v2.4s             // e - d
+        sub             v3.4s, v3.4s, v1.4s             // e - d - b
+        add             v3.4s, v3.4s, v0.4s             // e - d - b + a
+
+        // min(diff, max), unsigned
+        umin            v3.4s, v3.4s, v30.4s
+
+        // Lookup weights
+        // Move to scalar registers to address lut (overwrites w8/w9; the limit stays in v30)
+        mov             w8,  v3.s[0]
+        mov             w9,  v3.s[1]
+        mov             w11, v3.s[2]
+        mov             w12, v3.s[3]
+
+        // Load 4 float weights
+        // LDR S register from x7 + index*4
+        ldr             s0, [x7, w8, uxtw #2]
+        ldr             s1, [x7, w9, uxtw #2]
+        ldr             s2, [x7, w11, uxtw #2]
+        ldr             s3, [x7, w12, uxtw #2]
+
+        // Merge into v0.4s (s0 already in v0.s[0] from ldr s0)
+        mov             v0.s[1], v1.s[0]
+        mov             v0.s[2], v2.s[0]
+        mov             v0.s[3], v3.s[0]
+
+        // Load src pixels (uint8)
+        ld1             {v1.s}[0], [x4], #4             // Load 4 bytes
+        uxtl            v1.8h, v1.8b                    // u8 -> u16 (lower half)
+        uxtl            v1.4s, v1.4h                    // u16 -> u32 (lower half)
+        ucvtf           v1.4s, v1.4s                    // int -> float
+
+        // Load total_weight and sum
+        ld1             {v2.4s}, [x5]                   // total_weight
+        ld1             {v3.4s}, [x6]                   // sum
+
+        // Update
+        fadd            v2.4s, v2.4s, v0.4s             // total_weight += weight
+        fmla            v3.4s, v1.4s, v0.4s             // sum += src * weight (fused multiply-add)
+
+        // Store back
+        st1             {v2.4s}, [x5], #16
+        st1             {v3.4s}, [x6], #16
+
+        sub             x10, x10, #4
+        b               1b
+
+2:      // Leftovers: one pixel per iteration
+        cmp             x10, #0
+        b.le            9f
+
+        // Single pixel handling
+        ldr             w13, [x0], #4                   // a
+        ldr             w14, [x1], #4                   // b
+        ldr             w15, [x2], #4                   // d
+        ldr             w16, [x3], #4                   // e
+
+        sub             w16, w16, w15                   // e - d
+        sub             w16, w16, w14                   // e - d - b
+        add             w16, w16, w13                   // e - d - b + a
+
+        // min (unsigned comparison)
+        ldr             x8,  [sp, #0]                   // reload limit: w8 was overwritten in the main loop
+        cmp             w16, w8
+        csel            w16, w16, w8, ls                // unsigned lower or same
+
+        // Load weight
+        ldr             s0, [x7, w16, uxtw #2]
+
+        // Load src
+        ldrb            w13, [x4], #1
+        ucvtf           s1, w13
+
+        // Load acc
+        ldr             s2, [x5]
+        ldr             s3, [x6]
+
+        fadd            s2, s2, s0                      // total_weight += weight
+        fmadd           s3, s1, s0, s3                  // sum += src * weight
+
+        str             s2, [x5], #4
+        str             s3, [x6], #4
+
+        sub             x10, x10, #1
+        b               2b
+
+9:      ret
+endfunc
diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h
index 61377f8c69..4d6ab47f54 100644
--- a/libavfilter/vf_nlmeans.h
+++ b/libavfilter/vf_nlmeans.h
@@ -35,8 +35,8 @@ typedef struct NLMeansDSPContext {
                                  float *total_weight,
                                  float *sum,
                                  const float *const weight_lut,
-                                 int max_meaningful_diff,
-                                 int startx, int endx);
+                                 ptrdiff_t max_meaningful_diff,
+                                 ptrdiff_t startx, ptrdiff_t endx);
 } NLMeansDSPContext;
 
 void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp);
diff --git a/libavfilter/vf_nlmeans_init.h b/libavfilter/vf_nlmeans_init.h
index cf31e74bd7..58ba0fdd83 100644
--- a/libavfilter/vf_nlmeans_init.h
+++ b/libavfilter/vf_nlmeans_init.h
@@ -79,8 +79,8 @@ static void compute_weights_line_c(const uint32_t *const iia,
                                    float *total_weight,
                                    float *sum,
                                    const float *const weight_lut,
-                                   int max_meaningful_diff,
-                                   int startx, int endx)
+                                   ptrdiff_t max_meaningful_diff,
+                                   ptrdiff_t startx, ptrdiff_t endx)
 {
     for (int x = startx; x < endx; x++) {
         /*
diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
index 5d67090a98..0adb2c7e8a 100644
--- a/libavfilter/x86/vf_nlmeans_init.c
+++ b/libavfilter/x86/vf_nlmeans_init.c
@@ -28,8 +28,8 @@ void ff_compute_weights_line_avx2(const uint32_t *const iia,
                                   float *total_weight,
                                   float *sum,
                                   const float *const weight_lut,
-                                  int max_meaningful_diff,
-                                  int startx, int endx);
+                                  ptrdiff_t max_meaningful_diff,
+                                  ptrdiff_t startx, ptrdiff_t endx);
 
 av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
 {
diff --git a/tests/checkasm/vf_nlmeans.c b/tests/checkasm/vf_nlmeans.c
index e61a2efae6..23b82cbd54 100644
--- a/tests/checkasm/vf_nlmeans.c
+++ b/tests/checkasm/vf_nlmeans.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <math.h>
 #include "checkasm.h"
 #include "libavfilter/vf_nlmeans_init.h"
 #include "libavutil/avassert.h"
@@ -29,6 +30,11 @@
         ((uint32_t *)buf)[i] = rnd();       \
 } while (0)
 
+static int float_almost_equal(float a, float b, float eps)
+{
+    return fabsf(a - b) < eps; /* absolute tolerance, strict '<': nonzero only if |a - b| < eps */
+}
+
 void checkasm_check_nlmeans(void)
 {
     NLMeansDSPContext dsp = {0};
@@ -110,5 +116,107 @@ void checkasm_check_nlmeans(void)
         av_freep(&src);
     }
 
+    if (check_func(dsp.compute_weights_line, "compute_weights_line")) {
+        const int test_w = 256;
+        const int max_meaningful_diff = 255;
+        const int startx = 10;
+        const int endx = 200; // 190 elements: not a multiple of 4 or 8, so tail handling runs
+
+        // Allocate buffers with 16 elements of padding past test_w
+        uint32_t *iia     = av_malloc_array(test_w + 16, sizeof(uint32_t));
+        uint32_t *iib     = av_malloc_array(test_w + 16, sizeof(uint32_t));
+        uint32_t *iid     = av_malloc_array(test_w + 16, sizeof(uint32_t));
+        uint32_t *iie     = av_malloc_array(test_w + 16, sizeof(uint32_t));
+        uint8_t  *src     = av_malloc(test_w + 16);
+        float    *tw_ref  = av_calloc(test_w + 16, sizeof(float));
+        float    *tw_new  = av_calloc(test_w + 16, sizeof(float));
+        float    *sum_ref = av_calloc(test_w + 16, sizeof(float));
+        float    *sum_new = av_calloc(test_w + 16, sizeof(float));
+        float    *lut     = av_malloc_array(max_meaningful_diff + 1, sizeof(float));
+
+        declare_func(void, const uint32_t *const iia,
+                     const uint32_t *const iib,
+                     const uint32_t *const iid,
+                     const uint32_t *const iie,
+                     const uint8_t *const src,
+                     float *total_weight,
+                     float *sum,
+                     const float *const weight_lut,
+                     ptrdiff_t max_meaningful_diff,
+                     ptrdiff_t startx, ptrdiff_t endx);
+
+        if (!iia || !iib || !iid || !iie || !src || !tw_ref || !tw_new ||
+            !sum_ref || !sum_new || !lut)
+            goto cleanup_weights;
+
+        // Initialize LUT: weight = exp(-diff * scale)
+        // Using scale = 0.01 for testing
+        for (int i = 0; i <= max_meaningful_diff; i++)
+            lut[i] = expf(-i * 0.01f);
+
+        // Initialize source pixels
+        for (int i = 0; i < test_w; i++)
+            src[i] = rnd() & 0xff;
+
+        // Initialize integral images
+        // We need to ensure diff = e - d - b + a is non-negative and within range
+        // Set up as if computing real integral image values
+        for (int i = 0; i < test_w; i++) {
+            uint32_t base = rnd() % 1000;
+            iia[i] = base;
+            iib[i] = base + (rnd() % 100);
+            iid[i] = base + (rnd() % 100);
+            // e = a + (b - a) + (d - a) + diff
+            // So diff = e - d - b + a will be in range [0, max_meaningful_diff]
+            uint32_t diff = rnd() % (max_meaningful_diff + 1);
+            iie[i] = iia[i] + (iib[i] - iia[i]) + (iid[i] - iia[i]) + diff;
+        }
+
+        // Clear output buffers
+        memset(tw_ref,  0, (test_w + 16) * sizeof(float));
+        memset(tw_new,  0, (test_w + 16) * sizeof(float));
+        memset(sum_ref, 0, (test_w + 16) * sizeof(float));
+        memset(sum_new, 0, (test_w + 16) * sizeof(float));
+
+        call_ref(iia, iib, iid, iie, src, tw_ref, sum_ref, lut,
+                 max_meaningful_diff, startx, endx);
+        call_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+                 max_meaningful_diff, startx, endx);
+
+        // Compare the whole buffers (padding included) to also catch writes outside [startx, endx)
+        for (int i = 0; i < test_w + 16; i++) {
+            if (!float_almost_equal(tw_ref[i], tw_new[i], 1e-5f)) {
+                fprintf(stderr, "total_weight mismatch at %d: ref=%f new=%f\n",
+                        i, tw_ref[i], tw_new[i]);
+                fail();
+                break;
+            }
+            if (!float_almost_equal(sum_ref[i], sum_new[i], 1e-4f)) {
+                fprintf(stderr, "sum mismatch at %d: ref=%f new=%f\n",
+                        i, sum_ref[i], sum_new[i]);
+                fail();
+                break;
+            }
+        }
+
+        // Benchmark (outputs are reset first; they accumulate across bench iterations)
+        memset(tw_new,  0, (test_w + 16) * sizeof(float));
+        memset(sum_new, 0, (test_w + 16) * sizeof(float));
+        bench_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+                  max_meaningful_diff, startx, endx);
+
+cleanup_weights:
+        av_freep(&iia);
+        av_freep(&iib);
+        av_freep(&iid);
+        av_freep(&iie);
+        av_freep(&src);
+        av_freep(&tw_ref);
+        av_freep(&tw_new);
+        av_freep(&sum_ref);
+        av_freep(&sum_new);
+        av_freep(&lut);
+    }
+
     report("dsp");
 }
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] lavfi/nlmeans: add aarch64 neon for compute_weights_line (PR #21393)

Reply via email to