On Thu, 29 Jun 2023, John Cox wrote:

Signed-off-by: John Cox <j...@kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
2 files changed, 236 insertions(+)

diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c 
b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void 
*cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int 
mrefs,
                                int prefs3, int mrefs3, int parity, int 
clip_max);

+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void 
*next1,
+                               int w, int prefs, int mrefs, int prefs2, int 
mrefs2,
+                               int prefs3, int mrefs3, int prefs4, int mrefs4,
+                               int parity, int clip_max);
+
+
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void 
*next1,
+                               int w, int prefs, int mrefs, int prefs2, int 
mrefs2,
+                               int prefs3, int mrefs3, int prefs4, int mrefs4,
+                               int parity, int clip_max)
+{
+    const int w0 = clip_max != 255 ? 0 : w & ~15; // width handed to NEON: 0 unless clip_max == 255, else w rounded down to a multiple of 16
+
+    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
+                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, 
mrefs3, prefs4, mrefs4, parity, clip_max); // the asm entry check branches out when w0 <= 0
+
+    if (w0 < w) // columns [w0, w) not passed to NEON are processed by the C implementation
+        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char 
*)cur1 + w0, (char *)next1 + w0,
+                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, 
mrefs3, prefs4, mrefs4, parity, clip_max);
+}

static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
                               int w, int prefs, int mrefs, int prefs2, int 
mrefs2,
@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
        return;

    s->filter_intra = filter_intra_helper;
+    s->filter_line  = filter_line_helper;
    s->filter_edge  = filter_edge_helper;
}

diff --git a/libavfilter/aarch64/vf_bwdif_neon.S 
b/libavfilter/aarch64/vf_bwdif_neon.S
index a33b235882..675e97d966 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -128,6 +128,221 @@ coeffs:
        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], 
-hf[1] = v0.h[5]
        .hword          5077, 981                       // sp[0] = v0.h[6]

+// ===========================================================================
+//
+// void filter_line(
+//      void *dst1,     // x0
+//      void *prev1,    // x1
+//      void *cur1,     // x2
+//      void *next1,    // x3
+//      int w,          // w4
+//      int prefs,      // w5
+//      int mrefs,      // w6
+//      int prefs2,     // w7
+//      int mrefs2,     // [sp, #0]
+//      int prefs3,     // [sp, #8]
+//      int mrefs3,     // [sp, #16]
+//      int prefs4,     // [sp, #24]
+//      int mrefs4,     // [sp, #32]
+//      int parity,     // [sp, #40]
+//      int clip_max)   // [sp, #48]
+
+function ff_bwdif_filter_line_neon, export=1
+        // Sanity check w
+        cmp             w4, #0
+        ble             99f
+
+        // Rearrange regs to be the same as line3 for ease of debug!
+        mov             w10, w4                         // w10 = loop count
+        mov             w9,  w6                         // w9  = mref
+        mov             w12, w7                         // w12 = pref2
+        mov             w11, w5                         // w11 = pref
+        ldr             w8,  [sp, #0]                   // w8 =  mref2
+        ldr             w7,  [sp, #16]                  // w7  = mref3
+        ldr             w6,  [sp, #32]                  // w6  = mref4
+        ldr             w13, [sp, #8]                   // w13 = pref3
+        ldr             w14, [sp, #24]                  // w14 = pref4

Btw, remember that you can load two arguments from the stack at once with ldp, e.g. "ldp x8, x13, [sp, #0]". If the parameters are declared as intptr_t/ptrdiff_t instead of int, you won't have an issue with garbage in the upper 32 bits of the loaded 64-bit registers either.



+
+        mov             x4,  x3
+        mov             x3,  x2
+        mov             x2,  x1
+
+// #define prev2 cur
+//        const uint8_t * restrict next2 = parity ? prev : next;
+        ldr             w17, [sp, #40]                  // parity
+        cmp             w17, #0
+        csel            x17, x2, x4, ne
+
+        // We want all the V registers - save all the ones we must
+        stp             d14, d15, [sp, #-64]!
+        stp             d8,  d9,  [sp, #48]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #16]

The order looks a bit weird here, even if the registers end up sequential on the stack. If you fill it from the bottom up, e.g.

stp d8, d9, [sp, #-64]!
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

they're sequential both in code and on the stack.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to