PR #23110 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23110
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23110.patch

The NEON loop filter always runs the full filter, so it loses to C in
the bestcase. Add a C wrapper that checks and returns early when all
groups skip; otherwise it falls through to NEON.

checkasm vc1dsp bestcase speedup vs C, NEON -> wrapper:

  filter             M1            Pi 5 (A76)
  v_loop_filter4    0.44 -> 2.57  0.78 -> 3.86
  v_loop_filter8    1.09 -> 2.09  0.89 -> 2.40
  v_loop_filter16   0.88 -> 2.16  0.61 -> 1.87
  h_loop_filter4    0.96 -> 2.61  0.80 -> 2.59
  h_loop_filter8    0.78 -> 2.30  0.61 -> 2.54
  h_loop_filter16   0.66 -> 1.83  0.46 -> 1.88

Worstcase speedup is preserved.


>From bba72d4c424d566916e61fcbb22eb66ff31361c1 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Sat, 16 May 2026 13:13:40 +0800
Subject: [PATCH] aarch64/vc1dsp: add early-return fast path for loop filter

The NEON loop filter always runs the full filter, so it loses to C in
the bestcase. Add a C wrapper that checks and returns early when all
groups skip; otherwise it falls through to NEON.

checkasm vc1dsp bestcase speedup vs C, NEON -> wrapper:

  filter             M1            Pi 5 (A76)
  v_loop_filter4    0.44 -> 2.57  0.78 -> 3.86
  v_loop_filter8    1.09 -> 2.09  0.89 -> 2.40
  v_loop_filter16   0.88 -> 2.16  0.61 -> 1.87
  h_loop_filter4    0.96 -> 2.61  0.80 -> 2.59
  h_loop_filter8    0.78 -> 2.30  0.61 -> 2.54
  h_loop_filter16   0.66 -> 1.83  0.46 -> 1.88

Worstcase speedup is preserved.
---
 libavcodec/aarch64/vc1dsp_init_aarch64.c | 55 +++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 3bc0bd17ee..3acb8c77e3 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -54,6 +54,49 @@ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stri
 
 int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
 
+static av_always_inline int vc1_loop_filter_all_skip(uint8_t *src,
+                                                     ptrdiff_t step,   /* offset between positions */
+                                                     ptrdiff_t stride, /* offset between taps */
+                                                     int len, int pq)  /* len counts positions */
+{   /* Return 1 if every group of 4 positions has |a0| >= pq at its third position, else 0. */
+    for (int i = 0; i < len; i += 4) {
+        uint8_t *s = src + 2 * step;    /* only the third position of each group is probed */
+        int a0 = (2 * (s[-2 * stride] - s[stride]) -
+                  5 * (s[-stride] - s[0]) + 4) >> 3;
+        int a0_sign = a0 >> 31;         /* 0 or -1; assumes 32-bit int and arithmetic shift */
+        a0 = (a0 ^ a0_sign) - a0_sign;  /* branchless a0 = abs(a0) */
+        if (a0 < pq)
+            return 0;                   /* this group fails the skip test: stop early */
+        src += step * 4;                /* advance to the next group of 4 positions */
+    }
+    return 1;                           /* no group had |a0| < pq */
+}
+
+#define VC1_V_LOOP_FILTER_WRAPPER(len)                                       \
+static void vc1_v_loop_filter##len##_aarch64(uint8_t *src, ptrdiff_t stride, \
+                                             int pq)                         \
+{   /* Pre-check with step = 1, taps stride bytes apart; NEON otherwise. */  \
+    if (vc1_loop_filter_all_skip(src, 1, stride, len, pq))                   \
+        return;                                                              \
+    ff_vc1_v_loop_filter##len##_neon(src, stride, pq);                       \
+}
+
+#define VC1_H_LOOP_FILTER_WRAPPER(len)                                       \
+static void vc1_h_loop_filter##len##_aarch64(uint8_t *src, ptrdiff_t stride, \
+                                             int pq)                         \
+{   /* Pre-check with step = stride, taps 1 byte apart; NEON otherwise. */   \
+    if (vc1_loop_filter_all_skip(src, stride, 1, len, pq))                   \
+        return;                                                              \
+    ff_vc1_h_loop_filter##len##_neon(src, stride, pq);                       \
+}
+
+VC1_V_LOOP_FILTER_WRAPPER(4)   /* defines vc1_v_loop_filter4_aarch64()  */
+VC1_H_LOOP_FILTER_WRAPPER(4)   /* defines vc1_h_loop_filter4_aarch64()  */
+VC1_V_LOOP_FILTER_WRAPPER(8)   /* defines vc1_v_loop_filter8_aarch64()  */
+VC1_H_LOOP_FILTER_WRAPPER(8)   /* defines vc1_h_loop_filter8_aarch64()  */
+VC1_V_LOOP_FILTER_WRAPPER(16)  /* defines vc1_v_loop_filter16_aarch64() */
+VC1_H_LOOP_FILTER_WRAPPER(16)  /* defines vc1_h_loop_filter16_aarch64() */
+
 static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
 {
     /* Dealing with starting and stopping, and removing escape bytes, are
@@ -124,12 +167,12 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
-        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
-        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
-        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
-        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
-        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
-        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+        dsp->vc1_v_loop_filter4  = vc1_v_loop_filter4_aarch64;
+        dsp->vc1_h_loop_filter4  = vc1_h_loop_filter4_aarch64;
+        dsp->vc1_v_loop_filter8  = vc1_v_loop_filter8_aarch64;
+        dsp->vc1_h_loop_filter8  = vc1_h_loop_filter8_aarch64;
+        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_aarch64;
+        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_aarch64;
 
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to