PR #23209 opened by Zhao Zhili (quink) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23209 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23209.patch
From 54ea35984d19a51cb522aa3bc1471006308d9767 Mon Sep 17 00:00:00 2001 From: Zhao Zhili <[email protected]> Date: Thu, 21 May 2026 02:12:30 +0800 Subject: [PATCH 1/2] lavu/aarch64: split FMLA chain in scalarproduct_float Unroll to 16 floats per iteration with four independent accumulators and reduce them once after the loop. scalarproduct_float_neon: before after Apple M1 (clang 16): 0.9 (3.56x) 0.4 (9.18x) Cortex-A76 (gcc 12.4): 118.7 (4.43x) 85.3 (6.15x) Signed-off-by: Zhao Zhili <[email protected]> --- libavutil/aarch64/float_dsp_neon.S | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S index 35e2715b87..fee47cb474 100644 --- a/libavutil/aarch64/float_dsp_neon.S +++ b/libavutil/aarch64/float_dsp_neon.S @@ -190,13 +190,35 @@ function ff_butterflies_float_neon, export=1 endfunc function ff_scalarproduct_float_neon, export=1 - movi v2.4s, #0 -1: ld1 {v0.4s}, [x0], #16 - ld1 {v1.4s}, [x1], #16 - subs w2, w2, #4 - fmla v2.4s, v0.4s, v1.4s - b.gt 1b - faddp v0.4s, v2.4s, v2.4s - faddp s0, v0.2s + // 16 elements per iteration; w3 = len / 16. 
+ lsr w3, w2, #4 + movi v2.4s, #0 + movi v3.4s, #0 + movi v4.4s, #0 + movi v5.4s, #0 + cbz w3, 2f +1: ld1 {v16.4s, v17.4s}, [x0], #32 + ld1 {v20.4s, v21.4s}, [x1], #32 + ld1 {v18.4s, v19.4s}, [x0], #32 + ld1 {v22.4s, v23.4s}, [x1], #32 + subs w3, w3, #1 + fmla v2.4s, v16.4s, v20.4s + fmla v3.4s, v17.4s, v21.4s + fmla v4.4s, v18.4s, v22.4s + fmla v5.4s, v19.4s, v23.4s + b.ne 1b +2: // len is a multiple of 4 + and w2, w2, #12 + cbz w2, 4f +3: ld1 {v0.4s}, [x0], #16 + ld1 {v1.4s}, [x1], #16 + subs w2, w2, #4 + fmla v2.4s, v0.4s, v1.4s + b.ne 3b +4: fadd v2.4s, v2.4s, v3.4s + fadd v4.4s, v4.4s, v5.4s + fadd v2.4s, v2.4s, v4.4s + faddp v0.4s, v2.4s, v2.4s + faddp s0, v0.2s ret endfunc -- 2.52.0 From 060d97864dc3ca274a055411b622e3503a56b86f Mon Sep 17 00:00:00 2001 From: Zhao Zhili <[email protected]> Date: Thu, 21 May 2026 15:01:50 +0800 Subject: [PATCH 2/2] lavu/aarch64: unroll butterflies_float to 8 floats/iter butterflies_float_neon: before after Cortex-A76 (gcc 12.4): 163.1 (3.95x) 144.0 (4.47x) Apple M1 (clang 16): 0.7 (0.85x) 0.6 (0.99x) Signed-off-by: Zhao Zhili <[email protected]> --- libavutil/aarch64/float_dsp_neon.S | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/libavutil/aarch64/float_dsp_neon.S b/libavutil/aarch64/float_dsp_neon.S index fee47cb474..67d61a93b2 100644 --- a/libavutil/aarch64/float_dsp_neon.S +++ b/libavutil/aarch64/float_dsp_neon.S @@ -178,15 +178,28 @@ function ff_vector_fmul_reverse_neon, export=1 endfunc function ff_butterflies_float_neon, export=1 -1: ld1 {v0.4s}, [x0] + subs w2, w2, #8 + b.lt 2f +1: ldp q0, q1, [x0] + ldp q2, q3, [x1] + subs w2, w2, #8 + fadd v4.4s, v0.4s, v2.4s + fadd v5.4s, v1.4s, v3.4s + fsub v0.4s, v0.4s, v2.4s + fsub v1.4s, v1.4s, v3.4s + stp q4, q5, [x0] + stp q0, q1, [x1] + add x0, x0, #32 + add x1, x1, #32 + b.ge 1b +2: tbz w2, #2, 3f + ld1 {v0.4s}, [x0] ld1 {v1.4s}, [x1] - subs w2, w2, #4 fsub v2.4s, v0.4s, v1.4s fadd v3.4s, v0.4s, v1.4s - st1 {v2.4s}, [x1], #16 - 
st1 {v3.4s}, [x0], #16 - b.gt 1b - ret + st1 {v2.4s}, [x1] + st1 {v3.4s}, [x0] +3: ret endfunc function ff_scalarproduct_float_neon, export=1 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
