This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 1e86a92a1cd752f64b47a8a09a44626320cd4e27
Author:     Zhao Zhili <[email protected]>
AuthorDate: Thu May 21 15:01:50 2026 +0800
Commit:     Zhao Zhili <[email protected]>
CommitDate: Wed Jun 3 09:36:59 2026 +0000

    lavu/aarch64: unroll butterflies_float to 8 floats/iter
    
    butterflies_float_neon:   before           after
      Cortex-A76 (gcc 12.4):  163.1 (3.95x)    147.0 (4.37x)
      Apple M1 (clang 16):      0.7 (0.85x)      0.6 (0.99x)
    
    Signed-off-by: Zhao Zhili <[email protected]>
---
 libavutil/aarch64/float_dsp_neon.S | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S
index fee47cb474..aa6b5189f5 100644
--- a/libavutil/aarch64/float_dsp_neon.S
+++ b/libavutil/aarch64/float_dsp_neon.S
@@ -178,15 +178,26 @@ function ff_vector_fmul_reverse_neon, export=1
 endfunc
 
 function ff_butterflies_float_neon, export=1
-1:      ld1             {v0.4s}, [x0]
+        subs            w2,  w2,  #8
+        b.lt            2f
+1:      ldp             q0,  q1,  [x0]
+        ldp             q2,  q3,  [x1]
+        subs            w2,  w2,  #8
+        fadd            v4.4s,   v0.4s,  v2.4s
+        fadd            v5.4s,   v1.4s,  v3.4s
+        fsub            v0.4s,   v0.4s,  v2.4s
+        fsub            v1.4s,   v1.4s,  v3.4s
+        st1             {v4.4s,  v5.4s}, [x0], #32
+        st1             {v0.4s,  v1.4s}, [x1], #32
+        b.ge            1b
+2:      tbz             w2,  #2,  3f
+        ld1             {v0.4s}, [x0]
         ld1             {v1.4s}, [x1]
-        subs            w2,  w2,  #4
         fsub            v2.4s,   v0.4s,  v1.4s
         fadd            v3.4s,   v0.4s,  v1.4s
-        st1             {v2.4s}, [x1],   #16
-        st1             {v3.4s}, [x0],   #16
-        b.gt            1b
-        ret
+        st1             {v2.4s}, [x1]
+        st1             {v3.4s}, [x0]
+3:      ret
 endfunc
 
 function ff_scalarproduct_float_neon, export=1

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to