PR #23601 opened by kjg0724
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23601
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23601.patch

The fixed-point LFE FIR polyphase interpolation filter in DCA has no SIMD
implementation on any architecture. This adds an AArch64 NEON version
modelled after the existing lfe_fir_float NEON functions.

The inner loop processes two outputs at a time: forward and reverse FIR
paths share the same set of eight LFE history samples, loaded once and
kept in v2/v3 (original) and v4/v5 (reversed via EXT+REV64). Each path
accumulates eight int32 x int32 products into int64 using SMULL/SMLAL pairs.
Three ADDP instructions reduce the four accumulator registers — the third
ADDP combines both dot-product totals into a single 2D register so that
one SQRSHRN applies norm23 to both outputs simultaneously. clip23 follows
with SQSHL+SSHR.

Tested with checkasm. Measured ~1.41x speedup on Apple M1.


From 9aac704b44058999747c6523caa7395dfa97845e Mon Sep 17 00:00:00 2001
From: Jeongkeun Kim <[email protected]>
Date: Tue, 28 Apr 2026 13:51:25 +0900
Subject: [PATCH 1/2] tests/checkasm/llviddsp: fix add_left_pred_int16 buffer
 compare size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

dst0/dst1 are uint16_t* allocated as width * sizeof(uint16_t), but
the memcmp at the end of check_add_left_pred_16 only compared `width`
bytes — missing the second half of each buffer. Same pattern used
correctly in tests/checkasm/huffyuvdsp.c (memcmp with width * sizeof()).

While at it, fix missing whitespace around & and || on the same line.

Fixes: fbe91487797c ("checkasm/llviddsp : add test for other dsp func")
Signed-off-by: Jeongkeun Kim <[email protected]>
---
 tests/checkasm/llviddsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index a8245b0d94..1094638229 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -145,7 +145,7 @@ static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width,
 
     res0 = call_ref(dst0, src0, mask, width, acc);
     res1 = call_new(dst1, src1, mask, width, acc);
-    if ((res0 &0xFFFF) != (res1 &0xFFFF)|| memcmp(dst0, dst1, width))
+    if ((res0 & 0xFFFF) != (res1 & 0xFFFF) || memcmp(dst0, dst1, width * sizeof(*dst0)))
         fail();
     bench_new(dst1, src1, mask, width, acc);
 
-- 
2.52.0


From 15a7bc1c79bb0ba33806b9ae3ff01b2cee266288 Mon Sep 17 00:00:00 2001
From: Jeongkeun Kim <[email protected]>
Date: Fri, 26 Jun 2026 09:30:09 +0900
Subject: [PATCH 2/2] avcodec/aarch64: add NEON lfe_fir_fixed for DCA DSP

Implement the fixed-point LFE FIR polyphase interpolation filter using
AArch64 NEON SIMD. The inner loop computes two 8-tap int32 dot products
simultaneously using SMULL/SMLAL chains with separate accumulator pairs,
then reduces both to scalars with three ADDP instructions and applies
norm23 via SQRSHRN and clip23 via SQSHL/SSHR.

Apple M1: ~1.41x speedup vs scalar.
---
 libavcodec/aarch64/dcadsp_init_aarch64.c |  3 ++
 libavcodec/aarch64/dcadsp_neon.S         | 46 ++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/libavcodec/aarch64/dcadsp_init_aarch64.c b/libavcodec/aarch64/dcadsp_init_aarch64.c
index 1fc6ec7a32..4286643802 100644
--- a/libavcodec/aarch64/dcadsp_init_aarch64.c
+++ b/libavcodec/aarch64/dcadsp_init_aarch64.c
@@ -30,6 +30,8 @@ void ff_lfe_fir0_float_neon(float *pcm_samples, const int32_t *lfe_samples,
                             const float *filter_coeff, ptrdiff_t npcmblocks);
 void ff_lfe_fir1_float_neon(float *pcm_samples, const int32_t *lfe_samples,
                             const float *filter_coeff, ptrdiff_t npcmblocks);
+void ff_lfe_fir_fixed_neon(int32_t *pcm_samples, const int32_t *lfe_samples,
+                           const int32_t *filter_coeff, ptrdiff_t npcmblocks);
 
 av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
 {
@@ -38,5 +40,6 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
     if (have_neon(cpu_flags)) {
         s->lfe_fir_float[0] = ff_lfe_fir0_float_neon;
         s->lfe_fir_float[1] = ff_lfe_fir1_float_neon;
+        s->lfe_fir_fixed    = ff_lfe_fir_fixed_neon;
     }
 }
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
index af149767fd..fd07ed3e95 100644
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ b/libavcodec/aarch64/dcadsp_neon.S
@@ -99,3 +99,49 @@ function ff_lfe_fir1_float_neon, export=1
         b.gt            .Louter1
         ret
 endfunc
+
+function ff_lfe_fir_fixed_neon, export=1
+        lsr             x3, x3, #1                      // x3 = npcmblocks >> 1 = outer iteration count
+        sub             x1, x1, #(7*4)                  // x1 = lfe_samples - 7: start of the 8-sample window
+.Louter_fixed:                                          // each pass: read 8 LFE samples, write 2 x 32 outputs
+        ld1             {v2.4s, v3.4s}, [x1]            // v2 = lfe[-7..-4], v3 = lfe[-3..0]
+
+        ext             v4.16b, v3.16b, v3.16b, #8      // swap the 64-bit halves of v3 ...
+        rev64           v4.4s,  v4.4s                   // ... then the words in each half: v4 = lfe[0], [-1], [-2], [-3]
+        ext             v5.16b, v2.16b, v2.16b, #8      // same reversal applied to v2 ...
+        rev64           v5.4s,  v5.4s                   // ... v5 = lfe[-4], [-5], [-6], [-7]
+
+        mov             x4, x2                          // x4 = &filter_coeff[0], walks forwards
+        add             x5, x2, #(248*4)                // x5 = &filter_coeff[248], walks backwards
+        mov             x6, x0                          // x6 = output pointer for result a (pcm_samples + 0)
+        add             x7, x0, #(32*4)                 // x7 = output pointer for result b (pcm_samples + 32)
+        mov             w8, #32                         // 32 inner iterations, two outputs each
+.Linner_fixed:
+        ld1             {v0.4s,  v1.4s},  [x4], #32     // v0/v1 = next 8 coefficients, x4 advances by 8
+        ld1             {v16.4s, v17.4s}, [x5]          // v16/v17 = 8 coefficients at x5 (first pass: coeff[248..255])
+        sub             x5, x5, #32                     // x5 steps back by 8 coefficients
+        subs            w8, w8, #1                      // flags are consumed by b.gt at the end of the loop
+        smull           v18.2d, v0.2s,  v4.2s           // a: v0[0..1] * lfe[0], lfe[-1], widened to 64 bit
+        smull           v20.2d, v16.2s, v2.2s           // b: v16[0..1] * lfe[-7], lfe[-6], widened to 64 bit
+        smull2          v19.2d, v0.4s,  v4.4s           // a: v0[2..3] * lfe[-2], lfe[-3]
+        smull2          v21.2d, v16.4s, v2.4s           // b: v16[2..3] * lfe[-5], lfe[-4]
+        smlal           v18.2d, v1.2s,  v5.2s           // a += v1[0..1] * lfe[-4], lfe[-5]
+        smlal           v20.2d, v17.2s, v3.2s           // b += v17[0..1] * lfe[-3], lfe[-2]
+        smlal2          v19.2d, v1.4s,  v5.4s           // a += v1[2..3] * lfe[-6], lfe[-7]
+        smlal2          v21.2d, v17.4s, v3.4s           // b += v17[2..3] * lfe[-1], lfe[0]
+        addp            v18.2d, v18.2d, v19.2d          // fold the four partial sums of a down to two
+        addp            v20.2d, v20.2d, v21.2d          // fold the four partial sums of b down to two
+        addp            v18.2d, v18.2d, v20.2d          // v18 = { total a, total b }
+        sqrshrn         v22.2s, v18.2d, #23             // rounding >> 23, saturating narrow to 32 bit
+        sqshl           v22.2s, v22.2s, #8              // saturating << 8 followed by ...
+        sshr            v22.2s, v22.2s, #8              // ... arithmetic >> 8 clamps to the signed 24-bit range
+        st1             {v22.s}[0], [x6], #4            // store a, advance first output pointer
+        st1             {v22.s}[1], [x7], #4            // store b, advance second output pointer
+        b.gt            .Linner_fixed                   // repeat while the inner counter is > 0
+
+        subs            x3, x3, #1                      // one outer iteration done
+        add             x1, x1, #4                      // slide the input window by one 32-bit sample
+        add             x0, x0, #(64*4)                 // 64 outputs were written in this pass
+        b.gt            .Louter_fixed                   // test is at the bottom: body always runs at least once
+        ret
+endfunc
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to