PR #23209 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23209
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23209.patch


>From 54ea35984d19a51cb522aa3bc1471006308d9767 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Thu, 21 May 2026 02:12:30 +0800
Subject: [PATCH 1/2] lavu/aarch64: split FMLA chain in scalarproduct_float

Unroll to 16 floats per iteration with four independent accumulators
and reduce them once after the loop.

scalarproduct_float_neon:  before          after
  Apple M1 (clang 16):      0.9 (3.56x)    0.4 (9.18x)
  Cortex-A76 (gcc 12.4):  118.7 (4.43x)   85.3 (6.15x)

Signed-off-by: Zhao Zhili <[email protected]>
---
 libavutil/aarch64/float_dsp_neon.S | 38 +++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index 35e2715b87..fee47cb474 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -190,13 +190,35 @@ function ff_butterflies_float_neon, export=1
 endfunc
 
 function ff_scalarproduct_float_neon, export=1
-        movi            v2.4s,  #0
-1:      ld1             {v0.4s}, [x0],   #16
-        ld1             {v1.4s}, [x1],   #16
-        subs            w2,      w2,     #4
-        fmla            v2.4s,   v0.4s,  v1.4s
-        b.gt            1b
-        faddp           v0.4s,   v2.4s,  v2.4s
-        faddp           s0,      v0.2s
+        // 16 elements per iteration; w3 = len / 16.
+        lsr             w3, w2, #4              // w3 = number of full 16-float groups
+        movi            v2.4s, #0               // v2-v5: four zeroed accumulators, one per FMLA chain
+        movi            v3.4s, #0
+        movi            v4.4s, #0
+        movi            v5.4s, #0
+        cbz             w3, 2f                  // no full group of 16: go straight to the tail
+1:      ld1             {v16.4s, v17.4s}, [x0], #32     // v16-v19 <- 16 floats from x0 (two loads)
+        ld1             {v20.4s, v21.4s}, [x1], #32     // v20-v23 <- 16 floats from x1 (two loads)
+        ld1             {v18.4s, v19.4s}, [x0], #32
+        ld1             {v22.4s, v23.4s}, [x1], #32
+        subs            w3, w3, #1              // sets the flags tested by b.ne; fmla leaves them alone
+        fmla            v2.4s, v16.4s, v20.4s   // each fmla feeds only its own accumulator
+        fmla            v3.4s, v17.4s, v21.4s
+        fmla            v4.4s, v18.4s, v22.4s
+        fmla            v5.4s, v19.4s, v23.4s
+        b.ne            1b
+2:      // len is a multiple of 4
+        and             w2, w2, #12             // w2 = leftover element count: 0, 4, 8 or 12
+        cbz             w2, 4f                  // nothing left over
+3:      ld1             {v0.4s}, [x0], #16      // tail: 4 floats per pass
+        ld1             {v1.4s}, [x1], #16
+        subs            w2, w2, #4
+        fmla            v2.4s, v0.4s, v1.4s     // tail products are folded into v2
+        b.ne            3b
+4:      fadd            v2.4s, v2.4s, v3.4s     // combine the four accumulators into v2
+        fadd            v4.4s, v4.4s, v5.4s
+        fadd            v2.4s, v2.4s, v4.4s
+        faddp           v0.4s, v2.4s, v2.4s     // pairwise add: lanes 0 and 1 hold the two half sums
+        faddp           s0, v0.2s               // s0 = sum of all four lanes of v2
         ret
 endfunc
-- 
2.52.0


>From 060d97864dc3ca274a055411b622e3503a56b86f Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Thu, 21 May 2026 15:01:50 +0800
Subject: [PATCH 2/2] lavu/aarch64: unroll butterflies_float to 8 floats/iter

butterflies_float_neon:   before           after
  Cortex-A76 (gcc 12.4):  163.1 (3.95x)    144.0 (4.47x)
  Apple M1 (clang 16):      0.7 (0.85x)      0.6 (0.99x)

Signed-off-by: Zhao Zhili <[email protected]>
---
 libavutil/aarch64/float_dsp_neon.S | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index fee47cb474..67d61a93b2 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -178,15 +178,28 @@ function ff_vector_fmul_reverse_neon, export=1
 endfunc
 
 function ff_butterflies_float_neon, export=1
-1:      ld1             {v0.4s}, [x0]
+        subs            w2,  w2,  #8            // bias the count by -8; negative means no full group of 8
+        b.lt            2f
+1:      ldp             q0,  q1,  [x0]          // q0, q1 <- 8 floats at x0
+        ldp             q2,  q3,  [x1]          // q2, q3 <- 8 floats at x1
+        subs            w2,  w2,  #8            // flags are used by b.ge below; nothing in between writes them
+        fadd            v4.4s,   v0.4s,  v2.4s  // v4, v5 = sums
+        fadd            v5.4s,   v1.4s,  v3.4s
+        fsub            v0.4s,   v0.4s,  v2.4s  // v0, v1 = differences (x0 data minus x1 data)
+        fsub            v1.4s,   v1.4s,  v3.4s
+        stp             q4,  q5,  [x0]          // sums are written back through x0
+        stp             q0,  q1,  [x1]          // differences are written back through x1
+        add             x0,  x0,  #32           // plain add (not adds): advances 8 floats, keeps the flags
+        add             x1,  x1,  #32
+        b.ge            1b
+2:      tbz             w2,  #2,  3f            // only 8s were subtracted, so bit 2 of w2 == bit 2 of len; clear: no group of 4 left
+        ld1             {v0.4s}, [x0]
         ld1             {v1.4s}, [x1]
-        subs            w2,  w2,  #4
         fsub            v2.4s,   v0.4s,  v1.4s
         fadd            v3.4s,   v0.4s,  v1.4s
-        st1             {v2.4s}, [x1],   #16
-        st1             {v3.4s}, [x0],   #16
-        b.gt            1b
-        ret
+        st1             {v2.4s}, [x1]           // last group: no post-increment, the pointers are not used again
+        st1             {v3.4s}, [x0]
+3:      ret                                     // NOTE(review): a remainder of 1-3 elements is never processed; presumably len is a multiple of 4 - confirm with callers
 endfunc
 
 function ff_scalarproduct_float_neon, export=1
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to