hevc: add aarch64 neon for 10-bit dequant

Jun Zhao via ffmpeg-cvslog Sat, 24 Jan 2026 22:56:19 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit ce89d974c8764002f127829dc0ecf43725994ff0
Author:     Jun Zhao <[email protected]>
AuthorDate: Fri Jan 23 00:41:11 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jan 25 06:55:26 2026 +0000

    lavc/hevc: add aarch64 neon for 10-bit dequant
    
    Implement NEON optimization for HEVC dequant at 10-bit depth.
    
    For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size
    
    Performance benchmark on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_dequant --bench
    hevc_dequant_4x4_10_c:                                  16.6 ( 1.00x)
    hevc_dequant_4x4_10_neon:                                7.4 ( 2.23x)
    
    hevc_dequant_8x8_10_c:                                  39.7 ( 1.00x)
    hevc_dequant_8x8_10_neon:                                7.5 ( 5.28x)
    
    hevc_dequant_16x16_10_c:                               168.7 ( 1.00x)
    hevc_dequant_16x16_10_neon:                             10.2 (16.56x)
    
    hevc_dequant_32x32_10_c:                                 1.9 ( 1.00x)
    hevc_dequant_32x32_10_neon:                              1.9 ( 1.01x)
    
    Note: 32x32 shift=0 is identity transform (no-op), so NEON has no
    advantage over C which is also optimized away by the compiler.
    
    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcdsp_dequant_neon.S | 77 +++++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 17 +++++++
 2 files changed, 94 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S
index a757bac6b3..63230afb28 100644
--- a/libavcodec/aarch64/hevcdsp_dequant_neon.S
+++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S
@@ -167,3 +167,80 @@ function ff_hevc_dequant_32x32_8_neon, export=1
         b.ne            1b
         ret
 endfunc
+
+// --------------------------------------------------------------------------
+// HEVC dequant for 10-bit depth
+//
+// For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size
+//
+// Block size | log2_size | shift | operation
+// 4x4        | 2         | 3     | srshr #3
+// 8x8        | 3         | 2     | srshr #2
+// 16x16      | 4         | 1     | srshr #1
+// 32x32      | 5         | 0     | no-op (identity)
+// --------------------------------------------------------------------------
+
+// void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs)
+// In-place 10-bit dequant of a 4x4 block: 16 int16_t coeffs, shift=3
+function ff_hevc_dequant_4x4_10_neon, export=1
+        ldp             q0, q1, [x0]                // load all 16 coeffs (2 x 8 halfwords)
+        srshr           v0.8h, v0.8h, #3            // signed rounding shift right by 3
+        srshr           v1.8h, v1.8h, #3
+        stp             q0, q1, [x0]                // write the results back over the input
+        ret
+endfunc
+
+// void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs)
+// In-place 10-bit dequant of an 8x8 block: 64 int16_t coeffs, shift=2
+// Fully unrolled - all 64 coeffs fit in v0-v7, so no loop is needed
+function ff_hevc_dequant_8x8_10_neon, export=1
+        ld1             {v0.16b-v3.16b}, [x0], #64  // first 32 coeffs, x0 += 64 bytes
+        ld1             {v4.16b-v7.16b}, [x0]       // remaining 32 coeffs
+        sub             x0, x0, #64                 // rewind x0 to the block start for the stores
+        srshr           v0.8h, v0.8h, #2            // signed rounding shift right by 2
+        srshr           v1.8h, v1.8h, #2
+        srshr           v2.8h, v2.8h, #2
+        srshr           v3.8h, v3.8h, #2
+        srshr           v4.8h, v4.8h, #2
+        srshr           v5.8h, v5.8h, #2
+        srshr           v6.8h, v6.8h, #2
+        srshr           v7.8h, v7.8h, #2
+        st1             {v0.16b-v3.16b}, [x0], #64  // store first half, x0 += 64 bytes
+        st1             {v4.16b-v7.16b}, [x0]       // store second half
+        ret
+endfunc
+
+// void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs)
+// 16x16 = 256 coeffs, shift=1
+// Pipelined implementation: interleave load/compute/store to hide memory latency
+// Uses .irp macro to unroll 4 iterations, processing 64 coeffs per iteration
+// x0 = load pointer, x1 = store pointer (both advance through the buffer)
+function ff_hevc_dequant_16x16_10_neon, export=1
+        mov             x1, x0                      // x1 = store pointer, trails the load pointer x0
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // preload the first 32 coeffs
+.irp i, 0, 1, 2, 3
+        srshr           v0.8h, v0.8h, #1            // signed rounding shift right by 1
+        srshr           v1.8h, v1.8h, #1
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 // second half of this iteration
+        srshr           v2.8h, v2.8h, #1
+        srshr           v3.8h, v3.8h, #1
+        srshr           v4.8h, v4.8h, #1
+        srshr           v5.8h, v5.8h, #1
+        st1             {v0.16b - v3.16b}, [x1], #64 // store first half, freeing v0-v3 for the next load
+        srshr           v6.8h, v6.8h, #1
+.if \i < 3
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // preload next iteration; omitted on the last pass
+.endif
+        srshr           v7.8h, v7.8h, #1
+        st1             {v4.16b - v7.16b}, [x1], #64 // store second half
+.endr
+        ret
+endfunc
+
+// void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs)
+// 32x32 = 1024 coeffs, shift=0
+// With shift=0: output = (input + 0) >> 0 = input (identity transform)
+// The buffer is neither read nor written - the function just returns
+function ff_hevc_dequant_32x32_10_neon, export=1
+        ret                                         // coeffs are left untouched
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 80c9d1e2d2..ec62285ddb 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -103,6 +103,11 @@ void ff_hevc_dequant_8x8_8_neon(int16_t *coeffs);
 void ff_hevc_dequant_16x16_8_neon(int16_t *coeffs);
 void ff_hevc_dequant_32x32_8_neon(int16_t *coeffs);
 
+void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs);
+
 static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
 {
     switch (log2_size) {
@@ -114,6 +119,17 @@ static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
     }
 }
 
+static void hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size)
+{
+    switch (log2_size) { // dispatch to the fixed-size 10-bit NEON routine for this block size
+    case 2: ff_hevc_dequant_4x4_10_neon(coeffs); break;   // 4x4
+    case 3: ff_hevc_dequant_8x8_10_neon(coeffs); break;   // 8x8
+    case 4: ff_hevc_dequant_16x16_10_neon(coeffs); break; // 16x16
+    case 5: ff_hevc_dequant_32x32_10_neon(coeffs); break; // 32x32
+    default: av_unreachable("log2_size must be 2, 3, 4 or 5"); // no other sizes are expected here
+    }
+}
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
         member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
@@ -292,6 +308,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_10_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
+        c->dequant                     = hevc_dequant_10_neon;
     }
     if (bit_depth == 12) {
         c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_12_neon;

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/04: lavc/hevc: add aarch64 neon for 10-bit dequant

Reply via email to