This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 1f66f9041bc73b47f610afa844348e6c3654956f
Author:     Zhao Zhili <[email protected]>
AuthorDate: Thu May 21 02:12:30 2026 +0800
Commit:     Zhao Zhili <[email protected]>
CommitDate: Wed Jun 3 09:36:59 2026 +0000

    lavu/aarch64: split FMLA chain in scalarproduct_float
    
    Unroll to 16 floats per iteration with four independent accumulators
    and reduce them once after the loop.
    
    scalarproduct_float_neon:  before          after
      Apple M1 (clang 16):      0.9 (3.56x)    0.4 (9.18x)
      Cortex-A76 (gcc 12.4):  118.7 (4.43x)   85.3 (6.15x)
    
    Signed-off-by: Zhao Zhili <[email protected]>
---
 libavutil/aarch64/float_dsp_neon.S | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index 35e2715b87..fee47cb474 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -190,13 +190,35 @@ function ff_butterflies_float_neon, export=1
 endfunc
 
 function ff_scalarproduct_float_neon, export=1
-        movi            v2.4s,  #0
-1:      ld1             {v0.4s}, [x0],   #16
-        ld1             {v1.4s}, [x1],   #16
-        subs            w2,      w2,     #4
-        fmla            v2.4s,   v0.4s,  v1.4s
-        b.gt            1b
-        faddp           v0.4s,   v2.4s,  v2.4s
-        faddp           s0,      v0.2s
+        // Dot product of the float arrays at x0 and x1, w2 = len; main loop: 16 elements per iteration; w3 = len / 16.
+        lsr             w3, w2, #4                      // w3 = number of full 16-element iterations
+        movi            v2.4s, #0                       // v2..v5: four independent 4-lane accumulators,
+        movi            v3.4s, #0                       //   zeroed so that each fmla chain in the loop
+        movi            v4.4s, #0                       //   depends only on its own previous result
+        movi            v5.4s, #0
+        cbz             w3, 2f                          // fewer than 16 elements: skip the unrolled loop
+1:      ld1             {v16.4s, v17.4s}, [x0], #32     // first 8 floats from x0, x0 += 32 bytes
+        ld1             {v20.4s, v21.4s}, [x1], #32     // matching 8 floats from x1, x1 += 32 bytes
+        ld1             {v18.4s, v19.4s}, [x0], #32     // next 8 floats from x0
+        ld1             {v22.4s, v23.4s}, [x1], #32     // next 8 floats from x1
+        subs            w3, w3, #1                      // one iteration done; flags feed the b.ne below
+        fmla            v2.4s, v16.4s, v20.4s           // v2 += v16 * v20 (lane-wise)
+        fmla            v3.4s, v17.4s, v21.4s           // v3 += v17 * v21
+        fmla            v4.4s, v18.4s, v22.4s           // v4 += v18 * v22
+        fmla            v5.4s, v19.4s, v23.4s           // v5 += v19 * v23
+        b.ne            1b                              // repeat until w3 reaches 0
+2:      // len is a multiple of 4, so the elements left over after the 16-wide loop number 0, 4, 8 or 12
+        and             w2, w2, #12                     // w2 = len & 12 = leftover element count
+        cbz             w2, 4f                          // nothing left over: go straight to the reduction
+3:      ld1             {v0.4s}, [x0], #16              // tail loop: 4 floats from x0, x0 += 16 bytes
+        ld1             {v1.4s}, [x1], #16              // 4 floats from x1, x1 += 16 bytes
+        subs            w2, w2, #4                      // 4 elements consumed
+        fmla            v2.4s, v0.4s, v1.4s             // tail products are folded into accumulator v2
+        b.ne            3b                              // repeat until the leftover count reaches 0
+4:      fadd            v2.4s, v2.4s, v3.4s             // reduce the four accumulators: v2 = v2 + v3
+        fadd            v4.4s, v4.4s, v5.4s             // v4 = v4 + v5
+        fadd            v2.4s, v2.4s, v4.4s             // v2 = lane-wise sum of all four accumulators
+        faddp           v0.4s, v2.4s, v2.4s             // pairwise add: v0[0] = v2[0]+v2[1], v0[1] = v2[2]+v2[3]
+        faddp           s0, v0.2s                       // s0 = v0[0] + v0[1] = sum of all four lanes (the result)
         ret
 endfunc

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to