PR #23448 opened by DROO AMOR (DROOdotFOO)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23448
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23448.patch
NEON paths for `vf_threshold`. depth==8 routes to threshold8_neon (16
bytes/iter); depth > 8 to threshold16_neon (8 shorts/iter, covering
9/10/12/14/16).
C is not auto-vectorized because the four independent input streams (in,
threshold, min, max) defeat clang's vectorizer.
Per chunk the NEON kernel is two instructions: cmhs builds the in <= threshold
mask, bsl selects min or max.
Test Name M1-clang
-----------------------------------
threshold8_neon 26.7 (12.84x)
threshold10_neon 26.9 ( 6.89x)
threshold12_neon 26.7 ( 6.99x)
threshold16_neon 26.7 ( 7.03x)
Tested on Apple M1 (clang, -O3):
- checkasm --test=vf_threshold across 5 seeds {1, 42, 999, 314159, 271828}: all
4 depths pass each run. The test iterates every width i in 1..w-1 with
0xAA-sentinel output buffers, so both correctness in [0, i*step) and overwrites
past i*step are checked. Multi-row + distinct-linesizes phases cover the
per-pointer row-advance cascade.
- full checkasm: 7845/7845 pass
- fate-checkasm-vf_threshold: passes
- v8-v15 not touched; only x19/x20 spilled as scalar-tail scratch (callee-saved
per AAPCS-64), balanced ldp before ret.
--
Follow-up (separate patch): ff_threshold_init_x86 currently gates the 16-bit
SSE4/AVX2 paths on s->depth == 16; depths 9/10/12/14 fall through to scalar C
even though the kernel (pminuw + pcmpeqw + PBLENDVB) is correct for any
unsigned 16-bit value. One-line dispatch fix to be sent after this lands.
>From ea3e2ce6242c73a471f8a23d5eb52c9638fd2827 Mon Sep 17 00:00:00 2001
From: DROOdotFOO <[email protected]>
Date: Sat, 30 May 2026 00:14:40 +0200
Subject: [PATCH] avfilter/aarch64: NEON threshold filter
depth==8 routes to threshold8_neon (16 bytes/iter); depth > 8 to
threshold16_neon (8 shorts/iter, covering 9/10/12/14/16). C is not
auto-vectorized: four independent input streams defeat clang's
vectorizer. Per chunk the NEON kernel is cmhs+bsl.
Test Name M1-clang
-------------------------------
threshold8_neon 26.7 (12.84x)
threshold10_neon 26.9 ( 6.89x)
threshold12_neon 26.7 ( 6.99x)
threshold16_neon 26.7 ( 7.03x)
Signed-off-by: DROOdotFOO <[email protected]>
---
libavfilter/aarch64/Makefile | 2 +
libavfilter/aarch64/vf_threshold_init.c | 51 ++++++++++
libavfilter/aarch64/vf_threshold_neon.S | 119 ++++++++++++++++++++++++
libavfilter/threshold.h | 1 +
libavfilter/vf_threshold_init.h | 2 +
tests/checkasm/vf_threshold.c | 82 ++++++++++------
6 files changed, 231 insertions(+), 26 deletions(-)
create mode 100644 libavfilter/aarch64/vf_threshold_init.c
create mode 100644 libavfilter/aarch64/vf_threshold_neon.S
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index c7b7e18467..d09f7e4080 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o
OBJS-$(CONFIG_COLORDETECT_FILTER) += aarch64/vf_colordetect_init.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
+OBJS-$(CONFIG_THRESHOLD_FILTER) += aarch64/vf_threshold_init.o
NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o
NEON-OBJS-$(CONFIG_COLORDETECT_FILTER) += aarch64/vf_colordetect_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
+NEON-OBJS-$(CONFIG_THRESHOLD_FILTER) += aarch64/vf_threshold_neon.o
diff --git a/libavfilter/aarch64/vf_threshold_init.c b/libavfilter/aarch64/vf_threshold_init.c
new file mode 100644
index 0000000000..9b14d644a9
--- /dev/null
+++ b/libavfilter/aarch64/vf_threshold_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2026 DROOdotFOO <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/threshold.h"
+
+void ff_threshold8_neon(const uint8_t *in, const uint8_t *threshold,
+ const uint8_t *min, const uint8_t *max,
+ uint8_t *out, /* out[x] = in[x] <= threshold[x] ? min[x] : max[x] */
+ ptrdiff_t ilinesize, ptrdiff_t tlinesize, /* per-row byte strides of in, threshold */
+ ptrdiff_t flinesize, ptrdiff_t slinesize, /* per-row byte strides of min, max */
+ ptrdiff_t olinesize, /* per-row byte stride of out */
+ int w, int h); /* NEON kernel installed for depth == 8 (byte samples) */
+
+void ff_threshold16_neon(const uint8_t *in, const uint8_t *threshold,
+ const uint8_t *min, const uint8_t *max,
+ uint8_t *out, /* out[x] = in[x] <= threshold[x] ? min[x] : max[x] */
+ ptrdiff_t ilinesize, ptrdiff_t tlinesize, /* per-row byte strides of in, threshold */
+ ptrdiff_t flinesize, ptrdiff_t slinesize, /* per-row byte strides of min, max */
+ ptrdiff_t olinesize, /* per-row byte stride of out */
+ int w, int h); /* NEON kernel installed for every depth other than 8 (16-bit samples) */
+
+av_cold void ff_threshold_init_aarch64(ThresholdContext *s) /* install the NEON kernel into s->threshold */
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) { /* without NEON, s->threshold is left as the caller set it */
+ if (s->depth == 8)
+ s->threshold = ff_threshold8_neon; /* one byte per sample */
+ else
+ s->threshold = ff_threshold16_neon; /* any other depth: two bytes per sample */
+ }
+}
diff --git a/libavfilter/aarch64/vf_threshold_neon.S b/libavfilter/aarch64/vf_threshold_neon.S
new file mode 100644
index 0000000000..6d76d7c5e9
--- /dev/null
+++ b/libavfilter/aarch64/vf_threshold_neon.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2026 DROOdotFOO <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Size of the stack slot holding an int argument, used to locate h after w.
+// Apple packs stacked ints to 4 bytes; standard AAPCS-64 pads each to 8.
+#ifdef __APPLE__
+.set SP_INT, 4
+#else
+.set SP_INT, 8
+#endif
+
+// THRESHOLD <bits> <vecsz> <shape> <ldop> <stop> <step>
+// bits: name suffix; vecsz: elements per q-reg; shape: cmhs arrangement;
+// ldop/stop: scalar-tail load/store; step: bytes per element. Generates:
+// void ff_threshold<bits>_neon(const uint8_t *in, const uint8_t *threshold,
+// const uint8_t *min, const uint8_t *max,
+// uint8_t *out,
+// ptrdiff_t ilinesize, ptrdiff_t tlinesize,
+// ptrdiff_t flinesize, ptrdiff_t slinesize,
+// ptrdiff_t olinesize, int w, int h);
+//
+// Per-pixel: out[x] = in[x] <= threshold[x] ? min[x] : max[x].
+//
+// Stack layout after the x19/x20 spill:
+// [sp + 0]: saved x19, x20
+// [sp + 16]: slinesize (ptrdiff_t, 8 bytes)
+// [sp + 24]: olinesize (ptrdiff_t, 8 bytes)
+// [sp + 32]: w (int; 4 or 8 byte slot per SP_INT)
+// [sp + 32 + SP_INT]: h (int)
+//
+// Register allocation:
+// x0..x4 : base pointers (in, threshold, min, max, out) - kept across rows
+// x5..x7 : ilinesize, tlinesize, flinesize - caller-passed, kept
+// x8, x9 : slinesize, olinesize (loaded from stack)
+// w10, w11 : w, h (loaded from stack)
+// x12..x16: per-row working pointers (post-incremented in the row body)
+// w17 : column counter
+// w19, w20: scalar-tail scratch (caller-saved x0-x18 minus x18 are all
+// held by row state, so the tail needs two callee-saved GPRs).
+// v0..v4 : load buffers + result
+.macro THRESHOLD bits, vecsz, shape, ldop, stop, step
+function ff_threshold\bits\()_neon, export=1
+ stp x19, x20, [sp, #-16]! // save tail scratch; stack args now start at sp+16
+ ldr x8, [sp, #16] // slinesize
+ ldr x9, [sp, #24] // olinesize
+ ldr w10, [sp, #32] // w
+ ldr w11, [sp, #32+SP_INT] // h
+ cmp w11, #0
+ b.le 9f // h <= 0: nothing to do
+ cmp w10, #0
+ b.le 9f // w <= 0: nothing to do
+1: // row_loop:
+ mov x12, x0
+ mov x13, x1
+ mov x14, x2
+ mov x15, x3
+ mov x16, x4
+ mov w17, w10
+2: // vec_loop: one 16-byte q-reg (vecsz elements) per pass
+ cmp w17, #\vecsz
+ b.lt 3f
+ ld1 {v0.16b}, [x12], #16
+ ld1 {v1.16b}, [x13], #16
+ ld1 {v2.16b}, [x14], #16
+ ld1 {v3.16b}, [x15], #16
+ cmhs v4.\shape, v1.\shape, v0.\shape // thr >= in (= in <= thr)
+ bsl v4.16b, v2.16b, v3.16b // mask ? min : max
+ st1 {v4.16b}, [x16], #16
+ sub w17, w17, #\vecsz
+ b 2b
+3: // tail:
+ cbz w17, 5f
+4:
+ // cmp below sets NZCV; the two loads that follow it (min, max) do not
+ // touch flags, so the csel still reads the cmp's result.
+ \ldop w19, [x12], #\step
+ \ldop w20, [x13], #\step
+ cmp w19, w20 // in vs threshold
+ \ldop w19, [x14], #\step // overwrites in -> min
+ \ldop w20, [x15], #\step // overwrites thr -> max
+ csel w19, w19, w20, ls // ls = unsigned in <= thr
+ \stop w19, [x16], #\step
+ subs w17, w17, #1
+ b.gt 4b
+5: // end_row:
+ add x0, x0, x5 // in += ilinesize
+ add x1, x1, x6 // threshold += tlinesize
+ add x2, x2, x7 // min += flinesize
+ add x3, x3, x8 // max += slinesize
+ add x4, x4, x9 // out += olinesize
+ subs w11, w11, #1
+ b.gt 1b
+9:
+ ldp x19, x20, [sp], #16
+ ret
+endfunc
+.endm
+// Instantiate: 8-bit (16 bytes per vector) and 16-bit (8 halfwords per vector).
+THRESHOLD 8, 16, 16b, ldrb, strb, 1
+THRESHOLD 16, 8, 8h, ldrh, strh, 2
diff --git a/libavfilter/threshold.h b/libavfilter/threshold.h
index 8b55ad6ba1..1efdc9d5e9 100644
--- a/libavfilter/threshold.h
+++ b/libavfilter/threshold.h
@@ -47,5 +47,6 @@ typedef struct ThresholdContext {
} ThresholdContext;
void ff_threshold_init_x86(ThresholdContext *s);
+void ff_threshold_init_aarch64(ThresholdContext *s);
#endif /* AVFILTER_THRESHOLD_H */
diff --git a/libavfilter/vf_threshold_init.h b/libavfilter/vf_threshold_init.h
index fb319c6cf8..87e2ef20f9 100644
--- a/libavfilter/vf_threshold_init.h
+++ b/libavfilter/vf_threshold_init.h
@@ -86,6 +86,8 @@ av_unused static void ff_threshold_init(ThresholdContext *s)
#if ARCH_X86 && HAVE_X86ASM
ff_threshold_init_x86(s);
+#elif ARCH_AARCH64
+ ff_threshold_init_aarch64(s);
#endif
}
diff --git a/tests/checkasm/vf_threshold.c b/tests/checkasm/vf_threshold.c
index e6a425edfe..36fdda167c 100644
--- a/tests/checkasm/vf_threshold.c
+++ b/tests/checkasm/vf_threshold.c
@@ -22,8 +22,11 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/mem_internal.h"
-#define WIDTH 256
-#define WIDTH_PADDED 256 + 32
+#define WIDTH 256
+#define HEIGHT 3
+#define WIDTH_PADDED (WIDTH + 32)
+#define LINESIZE_MAX (WIDTH_PADDED + 32)
+#define BUF_SIZE (LINESIZE_MAX * HEIGHT)
#define randomize_buffers(buf, size) \
do { \
@@ -34,14 +37,15 @@
} while (0)
static void check_threshold(int depth){
- LOCAL_ALIGNED_32(uint8_t, in , [WIDTH_PADDED]);
- LOCAL_ALIGNED_32(uint8_t, threshold, [WIDTH_PADDED]);
- LOCAL_ALIGNED_32(uint8_t, min , [WIDTH_PADDED]);
- LOCAL_ALIGNED_32(uint8_t, max , [WIDTH_PADDED]);
- LOCAL_ALIGNED_32(uint8_t, out_ref , [WIDTH_PADDED]);
- LOCAL_ALIGNED_32(uint8_t, out_new , [WIDTH_PADDED]);
+ LOCAL_ALIGNED_32(uint8_t, in , [BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, threshold, [BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, min , [BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, max , [BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, out_ref , [BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, out_new , [BUF_SIZE]);
ptrdiff_t line_size = WIDTH_PADDED;
- int w = WIDTH;
+ int step = depth > 8 ? 2 : 1;
+ int w = WIDTH / step;
declare_func(void, const uint8_t *in, const uint8_t *threshold,
const uint8_t *min, const uint8_t *max, uint8_t *out,
@@ -53,26 +57,46 @@ static void check_threshold(int depth){
s.depth = depth;
ff_threshold_init(&s);
- memset(in, 0, WIDTH_PADDED);
- memset(threshold, 0, WIDTH_PADDED);
- memset(min, 0, WIDTH_PADDED);
- memset(max, 0, WIDTH_PADDED);
- memset(out_ref, 0, WIDTH_PADDED);
- memset(out_new, 0, WIDTH_PADDED);
- randomize_buffers(in, WIDTH);
- randomize_buffers(threshold, WIDTH);
- randomize_buffers(min, WIDTH);
- randomize_buffers(max, WIDTH);
-
- if (depth == 16)
- w /= 2;
+ memset(in, 0, BUF_SIZE);
+ memset(threshold, 0, BUF_SIZE);
+ memset(min, 0, BUF_SIZE);
+ memset(max, 0, BUF_SIZE);
+ randomize_buffers(in, BUF_SIZE);
+ randomize_buffers(threshold, BUF_SIZE);
+ randomize_buffers(min, BUF_SIZE);
+ randomize_buffers(max, BUF_SIZE);
if (check_func(s.threshold, "threshold%d", depth)) {
- call_ref(in, threshold, min, max, out_ref, line_size, line_size, line_size, line_size, line_size, w, 1);
- call_new(in, threshold, min, max, out_new, line_size, line_size, line_size, line_size, line_size, w, 1);
- if (memcmp(out_ref, out_new, WIDTH))
+ for (int i = 1; i < w; i++) {
+ memset(out_ref, 0xAA, BUF_SIZE);
+ memset(out_new, 0xAA, BUF_SIZE);
+ call_ref(in, threshold, min, max, out_ref,
+ line_size, line_size, line_size, line_size, line_size, i, 1);
+ call_new(in, threshold, min, max, out_new,
+ line_size, line_size, line_size, line_size, line_size, i, 1);
+ if (memcmp(out_ref, out_new, BUF_SIZE))
+ fail();
+ }
+ memset(out_ref, 0xAA, BUF_SIZE);
+ memset(out_new, 0xAA, BUF_SIZE);
+ call_ref(in, threshold, min, max, out_ref,
+ line_size, line_size, line_size, line_size, line_size, w, HEIGHT);
+ call_new(in, threshold, min, max, out_new,
+ line_size, line_size, line_size, line_size, line_size, w, HEIGHT);
+ if (memcmp(out_ref, out_new, BUF_SIZE))
fail();
- bench_new(in, threshold, min, max, out_new, line_size, line_size, line_size, line_size, line_size, w, 1);
+ memset(out_ref, 0xAA, BUF_SIZE);
+ memset(out_new, 0xAA, BUF_SIZE);
+ call_ref(in, threshold, min, max, out_ref,
+ line_size + 0, line_size + 8, line_size + 16,
+ line_size + 24, line_size + 32, w, HEIGHT);
+ call_new(in, threshold, min, max, out_new,
+ line_size + 0, line_size + 8, line_size + 16,
+ line_size + 24, line_size + 32, w, HEIGHT);
+ if (memcmp(out_ref, out_new, BUF_SIZE))
+ fail();
+ bench_new(in, threshold, min, max, out_new,
+ line_size, line_size, line_size, line_size, line_size, w, 1);
}
}
@@ -81,6 +105,12 @@ void checkasm_check_vf_threshold(void)
check_threshold(8);
report("threshold8");
+ check_threshold(10);
+ report("threshold10");
+
+ check_threshold(12);
+ report("threshold12");
+
check_threshold(16);
report("threshold16");
}
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]