Signed-off-by: John Cox <j...@kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++
libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++
2 files changed, 236 insertions(+)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int
mrefs,
int prefs3, int mrefs3, int parity, int
clip_max);
+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void
*next1,
+ int w, int prefs, int mrefs, int prefs2, int
mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
+
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void
*next1,
+ int w, int prefs, int mrefs, int prefs2, int
mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+{ // Filter one line of width w: NEON routine for the leading w0, C routine for the remaining w - w0.
+ const int w0 = clip_max != 255 ? 0 : w & ~15; // NEON share: w rounded down to a multiple of 16, or 0 unless clip_max is 255 (presumably the 8-bit case - confirm)
+ // The NEON routine is called unconditionally, so it may be handed a width of 0.
+ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
+ w0, prefs, mrefs, prefs2, mrefs2, prefs3,
mrefs3, prefs4, mrefs4, parity, clip_max);
+ // Whatever the NEON call did not cover goes to the C routine, with all four buffers advanced by w0 bytes.
+ if (w0 < w)
+ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char
*)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3,
mrefs3, prefs4, mrefs4, parity, clip_max);
+}
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int
mrefs2,
@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index a33b235882..675e97d966 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -128,6 +128,221 @@ coeffs:
.hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5]
.hword 5077, 981 // sp[0] = v0.h[6]
+// ===========================================================================
+//
+// void filter_line(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int prefs3, // [sp, #8]
+// int mrefs3, // [sp, #16]
+// int prefs4, // [sp, #24]
+// int mrefs4, // [sp, #32]
+// int parity, // [sp, #40]
+// int clip_max) // [sp, #48]
+
+function ff_bwdif_filter_line_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+ // Rearrange regs to be the same as line3 for ease of debug!
+ mov w10, w4 // w10 = loop count
+ mov w9, w6 // w9 = mref
+ mov w12, w7 // w12 = pref2
+ mov w11, w5 // w11 = pref
+ ldr w8, [sp, #0] // w8 = mref2
+ ldr w7, [sp, #16] // w7 = mref3
+ ldr w6, [sp, #32] // w6 = mref4
+ ldr w13, [sp, #8] // w13 = pref3
+ ldr w14, [sp, #24] // w14 = pref4
+
+ mov x4, x3
+ mov x3, x2
+ mov x2, x1
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ ldr w17, [sp, #40] // parity
+ cmp w17, #0
+ csel x17, x2, x4, ne
+
+ // We want all the V registers - save all the ones we must
+ stp d14, d15, [sp, #-64]!
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #16]