PR #20519 opened by welder
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20519
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20519.patch

I hope it's not overkill: I unrolled the 16-width variant and interleaved the
loads, stores and arithmetic ops to the best of my ability. Additionally, I got
rid of the internal loop and the mov/add preamble and epilogue.


From 668ddf2d4a0b9213403f7468ab9d7542a0119afb Mon Sep 17 00:00:00 2001
From: Krzysztof Pyrkosz <ffm...@szaka.eu>
Date: Sun, 14 Sep 2025 19:13:24 +0200
Subject: [PATCH] avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon

Before and after (for each CPU, the first group is before, the second is after):
A53:
apply_bdof_8_16x8_neon:                               2733.1 ( 4.88x)
apply_bdof_8_16x16_neon:                              5458.6 ( 4.86x)
apply_bdof_10_16x8_neon:                              2789.8 ( 4.64x)
apply_bdof_10_16x16_neon:                             5523.8 ( 4.68x)
apply_bdof_12_16x8_neon:                              2792.8 ( 4.58x)
apply_bdof_12_16x16_neon:                             5519.5 ( 4.63x)

apply_bdof_8_16x8_neon:                               2571.8 ( 5.12x)
apply_bdof_8_16x16_neon:                              5173.3 ( 5.12x)
apply_bdof_10_16x8_neon:                              2635.1 ( 4.87x)
apply_bdof_10_16x16_neon:                             5243.0 ( 4.89x)
apply_bdof_12_16x8_neon:                              2613.0 ( 4.89x)
apply_bdof_12_16x16_neon:                             5231.7 ( 4.90x)

A78:
apply_bdof_8_16x8_neon:                                565.3 ( 8.43x)
apply_bdof_8_16x16_neon:                              1109.5 ( 8.60x)
apply_bdof_10_16x8_neon:                               568.2 ( 7.92x)
apply_bdof_10_16x16_neon:                             1114.1 ( 8.08x)
apply_bdof_12_16x8_neon:                               570.2 ( 7.87x)
apply_bdof_12_16x16_neon:                             1116.3 ( 8.03x)

apply_bdof_8_16x8_neon:                                541.4 ( 8.81x)
apply_bdof_8_16x16_neon:                              1065.9 ( 8.97x)
apply_bdof_10_16x8_neon:                               543.2 ( 8.32x)
apply_bdof_10_16x16_neon:                             1071.5 ( 8.39x)
apply_bdof_12_16x8_neon:                               544.2 ( 8.25x)
apply_bdof_12_16x16_neon:                             1074.1 ( 8.37x)
---
 libavcodec/aarch64/vvc/inter.S | 85 +++++++++++++++++++++++++++++++---
 1 file changed, 78 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 01d2ff155c..47810ec3c1 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -827,10 +827,10 @@ function vvc_bdof_grad_filter_8x_neon, export=0
         src1            .req x5
         width           .req w6
         height          .req w7
+        tbnz            w6, #4, 16f
 
 1:
         mov             x10, src0
-        mov             w11, width
         mov             x12, gh0
         mov             x13, gv0
         mov             x14, src1
@@ -863,16 +863,11 @@ function vvc_bdof_grad_filter_8x_neon, export=0
         // results of gradient_v1
         sub             v6.8h, v6.8h, v7.8h
 
-        add             x10, x10, #16
-        add             x14, x14, #16
-
         // (gradient_h0 + gradient_h1) >> 1
         shadd           v1.8h, v0.8h, v4.8h
         // gradient_h0 - gradient_h1
         sub             v5.8h, v0.8h, v4.8h
 
-        subs            w11, w11, #8
-
         // (gradient_v0 + gradient_v1) >> 1
         shadd           v3.8h, v2.8h, v6.8h
         // gradient_v0 - gradient_v1
@@ -882,7 +877,6 @@ function vvc_bdof_grad_filter_8x_neon, export=0
         st1             {v5.8h}, [x15], #16
         st1             {v3.8h}, [x13], #16
         st1             {v7.8h}, [x16], #16
-        b.ne            2b
 
         subs            height, height, #1
         add             gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
@@ -894,6 +888,83 @@ function vvc_bdof_grad_filter_8x_neon, export=0
         b.ne            1b
         ret
 
+16:
+        ldur            q0, [x4, #2]
+        ldur            q1, [x4, #18]
+        ldur            q16, [x4, #-2]
+        sshr            v0.8h, v0.8h, #6
+        ldur            q17, [x4, #14]
+        sshr            v1.8h, v1.8h, #6
+        ldp             q18, q19, [x4, #-(VVC_MAX_PB_SIZE << 1)]
+        sshr            v16.8h, v16.8h, #6
+        ldp             q2, q3, [x4, #(VVC_MAX_PB_SIZE << 1)]!
+        ldur            q20, [x5, #2]
+        sshr            v17.8h, v17.8h, #6
+        ldur            q21, [x5, #18]
+        sshr            v2.8h, v2.8h, #6
+        ldur            q22, [x5, #-2]
+        sshr            v3.8h, v3.8h, #6
+        ldur            q23, [x5, #14]
+        sshr            v18.8h, v18.8h, #6
+        ldp             q26, q27, [x5, #-(VVC_MAX_PB_SIZE << 1)]
+        sshr            v19.8h, v19.8h, #6
+        ldp             q24, q25, [x5, #(VVC_MAX_PB_SIZE << 1)]!
+
+        // results of gradient_h0
+        sub             v0.8h, v0.8h, v16.8h
+        sub             v1.8h, v1.8h, v17.8h
+
+        // results of gradient_v0
+        sub             v2.8h, v2.8h, v18.8h
+        sub             v3.8h, v3.8h, v19.8h
+
+        sshr            v20.8h, v20.8h, #6
+        sshr            v21.8h, v21.8h, #6
+        sshr            v22.8h, v22.8h, #6
+        sshr            v23.8h, v23.8h, #6
+
+        // results of gradient_h1
+        sub             v20.8h, v20.8h, v22.8h
+        sub             v21.8h, v21.8h, v23.8h
+
+        sshr            v24.8h, v24.8h, #6
+        sshr            v25.8h, v25.8h, #6
+
+        // gradient_h0 - gradient_h1
+        sub             v22.8h, v0.8h, v20.8h
+        sub             v23.8h, v1.8h, v21.8h
+
+        // (gradient_h0 + gradient_h1) >> 1
+        shadd           v16.8h, v0.8h, v20.8h
+        shadd           v17.8h, v1.8h, v21.8h
+
+        st1             {v22.8h, v23.8h}, [gh1], #32
+
+        sshr            v26.8h, v26.8h, #6
+        sshr            v27.8h, v27.8h, #6
+
+        st1             {v16.8h, v17.8h}, [gh0], #32
+
+        // results of gradient_v1
+        sub             v24.8h, v24.8h, v26.8h
+        sub             v25.8h, v25.8h, v27.8h
+
+        // (gradient_v0 + gradient_v1) >> 1
+        shadd           v18.8h, v2.8h, v24.8h
+        shadd           v19.8h, v3.8h, v25.8h
+
+        // gradient_v0 - gradient_v1
+        sub             v26.8h, v2.8h, v24.8h
+        sub             v27.8h, v3.8h, v25.8h
+
+        st1             {v18.8h,v19.8h}, [gv0], #32
+
+        subs            height, height, #1
+        st1             {v26.8h,v27.8h}, [gv1], #32
+
+        b.ne            16b
+        ret
+
 .unreq gh0
 .unreq gh1
 .unreq gv0
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-le...@ffmpeg.org

Reply via email to