PR #21393 opened by Jun Zhao (mypopydev) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393.patch
Implement NEON optimization for compute_weights_line. Also update the function signature to use ptrdiff_t for stack arguments (max_meaningful_diff, startx, endx). This is necessary on AArch64 ABIs that pack 32-bit stack arguments at their natural 4-byte alignment (e.g. Apple's arm64 ABI; standard AAPCS64 already gives each stack argument an 8-byte slot), because the assembly implementation expects 8-byte slots. Using ptrdiff_t makes each of these arguments 8 bytes in size and alignment, consistent with the assembly. The x86 AVX2 prototype is updated to match the new signature. Performance benchmark (AArch64): ./tests/checkasm/checkasm --test=vf_nlmeans --bench compute_weights_line_c: 579.2 ( 1.00x) compute_weights_line_neon: 110.4 ( 5.25x) Signed-off-by: Jun Zhao <[email protected]> >From 46b2331ef11e14dd06d5849a16a63546522ba5f0 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Tue, 6 Jan 2026 17:44:01 +0800 Subject: [PATCH] lavfi/nlmeans: add aarch64 neon for compute_weights_line Implement NEON optimization for compute_weights_line. Also update the function signature to use ptrdiff_t for stack arguments (max_meaningful_diff, startx, endx). This is necessary on AArch64 ABIs that pack 32-bit stack arguments at their natural 4-byte alignment (e.g. Apple's arm64 ABI; standard AAPCS64 already gives each stack argument an 8-byte slot), because the assembly implementation expects 8-byte slots. Using ptrdiff_t makes each of these arguments 8 bytes in size and alignment, consistent with the assembly. The x86 AVX2 prototype is updated to match the new signature. 
Performance benchmark (AArch64): ./tests/checkasm/checkasm --test=vf_nlmeans --bench compute_weights_line_c: 579.2 ( 1.00x) compute_weights_line_neon: 110.4 ( 5.25x) Signed-off-by: Jun Zhao <[email protected]> --- libavfilter/aarch64/vf_nlmeans_init.c | 15 ++- libavfilter/aarch64/vf_nlmeans_neon.S | 128 ++++++++++++++++++++++++++ libavfilter/vf_nlmeans.h | 4 +- libavfilter/vf_nlmeans_init.h | 4 +- libavfilter/x86/vf_nlmeans_init.c | 4 +- tests/checkasm/vf_nlmeans.c | 108 ++++++++++++++++++++++ 6 files changed, 256 insertions(+), 7 deletions(-) diff --git a/libavfilter/aarch64/vf_nlmeans_init.c b/libavfilter/aarch64/vf_nlmeans_init.c index 6793370a4a..fbee336322 100644 --- a/libavfilter/aarch64/vf_nlmeans_init.c +++ b/libavfilter/aarch64/vf_nlmeans_init.c @@ -25,10 +25,23 @@ void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst, ptrdiff_t dst_linesi const uint8_t *s2, ptrdiff_t linesize2, int w, int h); +void ff_compute_weights_line_neon(const uint32_t *const iia, + const uint32_t *const iib, + const uint32_t *const iid, + const uint32_t *const iie, + const uint8_t *const src, + float *total_weight, + float *sum, + const float *const weight_lut, + ptrdiff_t max_meaningful_diff, + ptrdiff_t startx, ptrdiff_t endx); + av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); - if (have_neon(cpu_flags)) + if (have_neon(cpu_flags)) { dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_neon; + dsp->compute_weights_line = ff_compute_weights_line_neon; + } } diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S index a788cffd85..c913ae4e21 100644 --- a/libavfilter/aarch64/vf_nlmeans_neon.S +++ b/libavfilter/aarch64/vf_nlmeans_neon.S @@ -78,3 +78,131 @@ function ff_compute_safe_ssd_integral_image_neon, export=1 b.ne 1b ret endfunc + +function ff_compute_weights_line_neon, export=1 + // x0 = iia, x1 = iib, x2 = iid, x3 = iie + // x4 = src, x5 = total_weight, x6 = sum, x7 
= weight_lut + // stack: [sp+0] = max_meaningful_diff, [sp+8] = startx, [sp+16] = endx + + ldr w8, [sp, #0] // max_meaningful_diff + ldr w9, [sp, #8] // startx + ldr w10, [sp, #16] // endx + + cmp w9, w10 + b.ge 9f // if startx >= endx return + + // Offset pointers + lsl x11, x9, #2 // startx * 4 (for uint32/float) + add x0, x0, x11 // iia += startx + add x1, x1, x11 // iib += startx + add x2, x2, x11 // iid += startx + add x3, x3, x11 // iie += startx + add x5, x5, x11 // total_weight += startx + add x6, x6, x11 // sum += startx + + // src is uint8, so offset is just startx + add x4, x4, x9 // src += startx + + dup v30.4s, w8 // v30 = max_meaningful_diff + + sub w10, w10, w9 // count = endx - startx + +1: // Main loop + cmp w10, #4 + b.lt 2f // Handle leftovers + + // Load integral image values + ld1 {v0.4s}, [x0], #16 // iia + ld1 {v1.4s}, [x1], #16 // iib + ld1 {v2.4s}, [x2], #16 // iid + ld1 {v3.4s}, [x3], #16 // iie + + // diff = e - d - b + a + sub v3.4s, v3.4s, v2.4s // e - d + sub v3.4s, v3.4s, v1.4s // e - d - b + add v3.4s, v3.4s, v0.4s // e - d - b + a + + // min(diff, max) + umin v3.4s, v3.4s, v30.4s + + // Lookup weights + // Move to scalar registers to address lut + mov w8, v3.s[0] + mov w9, v3.s[1] + mov w11, v3.s[2] + mov w12, v3.s[3] + + // Load 4 float weights + // LDR S register from x7 + offset*4 + ldr s0, [x7, w8, uxtw #2] + ldr s1, [x7, w9, uxtw #2] + ldr s2, [x7, w11, uxtw #2] + ldr s3, [x7, w12, uxtw #2] + + // Merge into v0.4s (s0 already in v0.s[0] from ldr s0) + mov v0.s[1], v1.s[0] + mov v0.s[2], v2.s[0] + mov v0.s[3], v3.s[0] + + // Load src pixels (uint8) + ld1 {v1.s}[0], [x4], #4 // Load 4 bytes + uxtl v1.8h, v1.8b // 8b -> 16b (lower half) + uxtl v1.4s, v1.4h // 16b -> 32b (lower half) + ucvtf v1.4s, v1.4s // int -> float + + // Load total_weight and sum + ld1 {v2.4s}, [x5] // total_weight + ld1 {v3.4s}, [x6] // sum + + // Update + fadd v2.4s, v2.4s, v0.4s // total_weight += weight + fmla v3.4s, v1.4s, v0.4s // sum += src * weight + 
+ // Store back + st1 {v2.4s}, [x5], #16 + st1 {v3.4s}, [x6], #16 + + sub w10, w10, #4 + b 1b + +2: // Leftovers + cmp w10, #0 + b.le 9f + + // Single pixel handling + ldr w13, [x0], #4 + ldr w14, [x1], #4 + ldr w15, [x2], #4 + ldr w16, [x3], #4 + + sub w16, w16, w15 + sub w16, w16, w14 + add w16, w16, w13 + + // min (unsigned comparison) + ldr w8, [sp, #0] + cmp w16, w8 + csel w16, w16, w8, ls // unsigned lower or same + + // Load weight + ldr s0, [x7, w16, uxtw #2] + + // Load src + ldrb w13, [x4], #1 + ucvtf s1, w13 + + // Load acc + ldr s2, [x5] + ldr s3, [x6] + + fadd s2, s2, s0 + fmadd s3, s1, s0, s3 + + str s2, [x5], #4 + str s3, [x6], #4 + + sub w10, w10, #1 + b 2b + +9: ret +endfunc diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h index 61377f8c69..4d6ab47f54 100644 --- a/libavfilter/vf_nlmeans.h +++ b/libavfilter/vf_nlmeans.h @@ -35,8 +35,8 @@ typedef struct NLMeansDSPContext { float *total_weight, float *sum, const float *const weight_lut, - int max_meaningful_diff, - int startx, int endx); + ptrdiff_t max_meaningful_diff, + ptrdiff_t startx, ptrdiff_t endx); } NLMeansDSPContext; void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp); diff --git a/libavfilter/vf_nlmeans_init.h b/libavfilter/vf_nlmeans_init.h index cf31e74bd7..58ba0fdd83 100644 --- a/libavfilter/vf_nlmeans_init.h +++ b/libavfilter/vf_nlmeans_init.h @@ -79,8 +79,8 @@ static void compute_weights_line_c(const uint32_t *const iia, float *total_weight, float *sum, const float *const weight_lut, - int max_meaningful_diff, - int startx, int endx) + ptrdiff_t max_meaningful_diff, + ptrdiff_t startx, ptrdiff_t endx) { for (int x = startx; x < endx; x++) { /* diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c index 5d67090a98..0adb2c7e8a 100644 --- a/libavfilter/x86/vf_nlmeans_init.c +++ b/libavfilter/x86/vf_nlmeans_init.c @@ -28,8 +28,8 @@ void ff_compute_weights_line_avx2(const uint32_t *const iia, float *total_weight, float *sum, const float *const 
weight_lut, - int max_meaningful_diff, - int startx, int endx); + ptrdiff_t max_meaningful_diff, + ptrdiff_t startx, ptrdiff_t endx); av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp) { diff --git a/tests/checkasm/vf_nlmeans.c b/tests/checkasm/vf_nlmeans.c index e61a2efae6..23b82cbd54 100644 --- a/tests/checkasm/vf_nlmeans.c +++ b/tests/checkasm/vf_nlmeans.c @@ -18,6 +18,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <math.h> #include "checkasm.h" #include "libavfilter/vf_nlmeans_init.h" #include "libavutil/avassert.h" @@ -29,6 +30,11 @@ ((uint32_t *)buf)[i] = rnd(); \ } while (0) +static int float_almost_equal(float a, float b, float eps) +{ + return fabsf(a - b) < eps; +} + void checkasm_check_nlmeans(void) { NLMeansDSPContext dsp = {0}; @@ -110,5 +116,107 @@ void checkasm_check_nlmeans(void) av_freep(&src); } + if (check_func(dsp.compute_weights_line, "compute_weights_line")) { + const int test_w = 256; + const int max_meaningful_diff = 255; + const int startx = 10; + const int endx = 200; + + // Allocate aligned buffers + uint32_t *iia = av_malloc_array(test_w + 16, sizeof(uint32_t)); + uint32_t *iib = av_malloc_array(test_w + 16, sizeof(uint32_t)); + uint32_t *iid = av_malloc_array(test_w + 16, sizeof(uint32_t)); + uint32_t *iie = av_malloc_array(test_w + 16, sizeof(uint32_t)); + uint8_t *src = av_malloc(test_w + 16); + float *tw_ref = av_calloc(test_w + 16, sizeof(float)); + float *tw_new = av_calloc(test_w + 16, sizeof(float)); + float *sum_ref = av_calloc(test_w + 16, sizeof(float)); + float *sum_new = av_calloc(test_w + 16, sizeof(float)); + float *lut = av_malloc_array(max_meaningful_diff + 1, sizeof(float)); + + declare_func(void, const uint32_t *const iia, + const uint32_t *const iib, + const uint32_t *const iid, + const uint32_t *const iie, + const uint8_t *const src, + float *total_weight, + float *sum, + const float *const weight_lut, + ptrdiff_t max_meaningful_diff, + ptrdiff_t startx, 
ptrdiff_t endx); + + if (!iia || !iib || !iid || !iie || !src || !tw_ref || !tw_new || + !sum_ref || !sum_new || !lut) + goto cleanup_weights; + + // Initialize LUT: weight = exp(-diff * scale) + // Using scale = 0.01 for testing + for (int i = 0; i <= max_meaningful_diff; i++) + lut[i] = expf(-i * 0.01f); + + // Initialize source pixels + for (int i = 0; i < test_w; i++) + src[i] = rnd() & 0xff; + + // Initialize integral images + // We need to ensure diff = e - d - b + a is non-negative and within range + // Set up as if computing real integral image values + for (int i = 0; i < test_w; i++) { + uint32_t base = rnd() % 1000; + iia[i] = base; + iib[i] = base + (rnd() % 100); + iid[i] = base + (rnd() % 100); + // e = a + (b - a) + (d - a) + diff + // So diff = e - d - b + a will be in range [0, max_meaningful_diff] + uint32_t diff = rnd() % (max_meaningful_diff + 1); + iie[i] = iia[i] + (iib[i] - iia[i]) + (iid[i] - iia[i]) + diff; + } + + // Clear output buffers + memset(tw_ref, 0, (test_w + 16) * sizeof(float)); + memset(tw_new, 0, (test_w + 16) * sizeof(float)); + memset(sum_ref, 0, (test_w + 16) * sizeof(float)); + memset(sum_new, 0, (test_w + 16) * sizeof(float)); + + call_ref(iia, iib, iid, iie, src, tw_ref, sum_ref, lut, + max_meaningful_diff, startx, endx); + call_new(iia, iib, iid, iie, src, tw_new, sum_new, lut, + max_meaningful_diff, startx, endx); + + // Compare results with small tolerance for floating point + for (int i = startx; i < endx; i++) { + if (!float_almost_equal(tw_ref[i], tw_new[i], 1e-5f)) { + fprintf(stderr, "total_weight mismatch at %d: ref=%f new=%f\n", + i, tw_ref[i], tw_new[i]); + fail(); + break; + } + if (!float_almost_equal(sum_ref[i], sum_new[i], 1e-4f)) { + fprintf(stderr, "sum mismatch at %d: ref=%f new=%f\n", + i, sum_ref[i], sum_new[i]); + fail(); + break; + } + } + + // Benchmark + memset(tw_new, 0, (test_w + 16) * sizeof(float)); + memset(sum_new, 0, (test_w + 16) * sizeof(float)); + bench_new(iia, iib, iid, iie, src, 
tw_new, sum_new, lut, + max_meaningful_diff, startx, endx); + +cleanup_weights: + av_freep(&iia); + av_freep(&iib); + av_freep(&iid); + av_freep(&iie); + av_freep(&src); + av_freep(&tw_ref); + av_freep(&tw_new); + av_freep(&sum_ref); + av_freep(&sum_new); + av_freep(&lut); + } + report("dsp"); } -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
