hevc: add aarch64 neon for 12-bit dequant

Jun Zhao via ffmpeg-cvslog Sat, 24 Jan 2026 22:56:32 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 8966101fa6b2b921bb395de9d9deaceca0f6d501
Author:     Jun Zhao <[email protected]>
AuthorDate: Fri Jan 23 00:48:32 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jan 25 06:55:26 2026 +0000

    lavc/hevc: add aarch64 neon for 12-bit dequant
    
    Implement NEON optimization for HEVC dequant at 12-bit depth.
    
    For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size. When shift
    is negative, we use shl (shift left) instead of srshr.
    
    Performance benchmark on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_dequant --bench
    hevc_dequant_4x4_12_c:                                   9.9 ( 1.00x)
    hevc_dequant_4x4_12_neon:                                5.7 ( 1.74x)
    
    hevc_dequant_8x8_12_c:                                   1.7 ( 1.00x)
    hevc_dequant_8x8_12_neon:                                1.3 ( 1.30x)
    
    hevc_dequant_16x16_12_c:                               131.1 ( 1.00x)
    hevc_dequant_16x16_12_neon:                              7.9 (16.52x)
    
    hevc_dequant_32x32_12_c:                                69.7 ( 1.00x)
    hevc_dequant_32x32_12_neon:                             28.4 ( 2.46x)
    
    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcdsp_dequant_neon.S | 125 ++++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  17 ++++
 2 files changed, 142 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S
index 63230afb28..af2b01ac4b 100644
--- a/libavcodec/aarch64/hevcdsp_dequant_neon.S
+++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S
@@ -244,3 +244,128 @@ endfunc
 function ff_hevc_dequant_32x32_10_neon, export=1
         ret
 endfunc
+
+// --------------------------------------------------------------------------
+// HEVC dequant for 12-bit depth
+//
+// For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size
+//
+// Block size | log2_size | shift | operation
+// 4x4        | 2         | 1     | srshr #1 (shift right)
+// 8x8        | 3         | 0     | no-op (identity)
+// 16x16      | 4         | -1    | shl #1 (shift left)
+// 32x32      | 5         | -2    | shl #2 (shift left)
+// --------------------------------------------------------------------------
+
+// void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs)
+// 4x4 = 16 coeffs, shift=1
+function ff_hevc_dequant_4x4_12_neon, export=1
+        ldp             q0, q1, [x0]                // x0 = coeffs; 32 bytes = all 16 int16 coeffs
+        srshr           v0.8h, v0.8h, #1            // signed rounding shift right by 1 on each 16-bit lane
+        srshr           v1.8h, v1.8h, #1
+        stp             q0, q1, [x0]                // results overwrite the input in place
+        ret
+endfunc
+
+// void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs)
+// 8x8 = 64 coeffs, shift=0
+// When shift=0: output = input (identity transform)
+// No operation needed - just return immediately
+function ff_hevc_dequant_8x8_12_neon, export=1
+        ret                                         // shift == 0: the buffer at x0 is neither read nor written
+endfunc
+
+// void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs)
+// 16x16 = 256 coeffs, shift=-1 (left shift by 1)
+// Pipelined implementation: interleave load/compute/store to hide memory latency
+// Uses .irp macro to unroll 4 iterations, processing 64 coeffs per iteration
+// x0 = load pointer, x1 = store pointer (both advance through the buffer)
+function ff_hevc_dequant_16x16_12_neon, export=1
+        mov             x1, x0                      // x1 = store pointer; it trails the load pointer x0
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // prime the pipeline: first 64 bytes into v0-v3
+.irp i, 0, 1, 2, 3
+        shl             v0.8h, v0.8h, #1
+        shl             v1.8h, v1.8h, #1
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 // fetch the next 64 bytes while v0-v3 are being shifted
+        shl             v2.8h, v2.8h, #1
+        shl             v3.8h, v3.8h, #1
+        shl             v4.8h, v4.8h, #1
+        shl             v5.8h, v5.8h, #1
+        st1             {v0.16b - v3.16b}, [x1], #64 // v0-v3 are finished; storing them frees the registers for the reload
+        shl             v6.8h, v6.8h, #1
+.if \i < 3
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // omitted on the last pass: all 512 bytes are already loaded
+.endif
+        shl             v7.8h, v7.8h, #1
+        st1             {v4.16b - v7.16b}, [x1], #64 // second half of this pass (64 coeffs stored per pass)
+.endr
+        ret
+endfunc
+
+// void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs)
+// 32x32 = 1024 coeffs, shift=-2 (left shift by 2)
+// Process 128 coeffs per iteration (8 iterations)
+// Using pipelined load/compute/store for better performance
+function ff_hevc_dequant_32x32_12_neon, export=1
+        mov             x2, #8                     // x2 = passes left; 8 passes x 256 bytes = 2048 bytes
+1:
+        // Group A: q0-q3 (64 bytes / 32 coeffs)
+        ldp             q0, q1, [x0]
+        ldp             q2, q3, [x0, #32]
+        // Group B: q4-q7 (64 bytes / 32 coeffs)
+        ldp             q4, q5, [x0, #64]
+        ldp             q6, q7, [x0, #96]
+        subs            x2, x2, #1                 // Decrement loop counter early for better pipelining
+
+        // Calc Group A (shift left by 2)
+        shl             v0.8h, v0.8h, #2
+        shl             v1.8h, v1.8h, #2
+        shl             v2.8h, v2.8h, #2
+        shl             v3.8h, v3.8h, #2
+
+        // Group C: q16-q19 (64 bytes / 32 coeffs)
+        ldp             q16, q17, [x0, #128]
+        ldp             q18, q19, [x0, #160]
+
+        // Calc Group B
+        shl             v4.8h, v4.8h, #2
+        shl             v5.8h, v5.8h, #2
+        shl             v6.8h, v6.8h, #2
+        shl             v7.8h, v7.8h, #2
+
+        // Store Group A
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+
+        // Group D: q20-q23 (64 bytes / 32 coeffs)
+        ldp             q20, q21, [x0, #192]
+        ldp             q22, q23, [x0, #224]
+
+        // Calc Group C
+        shl             v16.8h, v16.8h, #2
+        shl             v17.8h, v17.8h, #2
+        shl             v18.8h, v18.8h, #2
+        shl             v19.8h, v19.8h, #2
+
+        // Store Group B
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+
+        // Calc Group D
+        shl             v20.8h, v20.8h, #2
+        shl             v21.8h, v21.8h, #2
+        shl             v22.8h, v22.8h, #2
+        shl             v23.8h, v23.8h, #2
+
+        // Store Group C
+        stp             q16, q17, [x0, #128]
+        stp             q18, q19, [x0, #160]
+
+        // Store Group D
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+
+        add             x0, x0, #256               // next 256-byte chunk; add (not adds) keeps the flags from subs
+        b.ne            1b                         // loop until the subs above has counted x2 down to zero
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index ec62285ddb..8ff7f632af 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -108,6 +108,11 @@ void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs);
 void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs);
 void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs);
 
+void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs);
+
 static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
 {
     switch (log2_size) {
@@ -130,6 +135,17 @@ static void hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size)
     }
 }
 
+static void hevc_dequant_12_neon(int16_t *coeffs, int16_t log2_size)
+{
+    switch (log2_size) { /* 2..5 pick the 4x4..32x32 12-bit NEON routine; each works on coeffs in place */
+    case 2: ff_hevc_dequant_4x4_12_neon(coeffs); break;
+    case 3: ff_hevc_dequant_8x8_12_neon(coeffs); break;
+    case 4: ff_hevc_dequant_16x16_12_neon(coeffs); break;
+    case 5: ff_hevc_dequant_32x32_12_neon(coeffs); break;
+    default: av_unreachable("log2_size must be 2, 3, 4 or 5");
+    }
+}
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
         member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
@@ -323,5 +339,6 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_12_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_12_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_12_neon;
+        c->dequant                     = hevc_dequant_12_neon;
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 04/04: lavc/hevc: add aarch64 neon for 12-bit dequant

Reply via email to