This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 8966101fa6b2b921bb395de9d9deaceca0f6d501
Author:     Jun Zhao <[email protected]>
AuthorDate: Fri Jan 23 00:48:32 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jan 25 06:55:26 2026 +0000

    lavc/hevc: add aarch64 neon for 12-bit dequant

    Implement NEON optimization for HEVC dequant at 12-bit depth.

    For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size.
    When shift is negative, we use shl (shift left) instead of srshr.

    Performance benchmark on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_dequant --bench

    hevc_dequant_4x4_12_c:        9.9 ( 1.00x)
    hevc_dequant_4x4_12_neon:     5.7 ( 1.74x)
    hevc_dequant_8x8_12_c:        1.7 ( 1.00x)
    hevc_dequant_8x8_12_neon:     1.3 ( 1.30x)
    hevc_dequant_16x16_12_c:    131.1 ( 1.00x)
    hevc_dequant_16x16_12_neon:   7.9 (16.52x)
    hevc_dequant_32x32_12_c:     69.7 ( 1.00x)
    hevc_dequant_32x32_12_neon:  28.4 ( 2.46x)

    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcdsp_dequant_neon.S | 125 ++++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  17 ++++
 2 files changed, 142 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S
index 63230afb28..af2b01ac4b 100644
--- a/libavcodec/aarch64/hevcdsp_dequant_neon.S
+++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S
@@ -244,3 +244,128 @@ endfunc
 function ff_hevc_dequant_32x32_10_neon, export=1
         ret
 endfunc
+
+// --------------------------------------------------------------------------
+// HEVC dequant for 12-bit depth
+//
+// For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size
+//
+// Block size | log2_size | shift | operation
+// 4x4        | 2         | 1     | srshr #1 (shift right)
+// 8x8        | 3         | 0     | no-op (identity)
+// 16x16      | 4         | -1    | shl #1 (shift left)
+// 32x32      | 5         | -2    | shl #2 (shift left)
+// --------------------------------------------------------------------------
+
+// void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs)
+// 4x4 = 16 coeffs, shift=1
+function ff_hevc_dequant_4x4_12_neon, export=1
+        ldp             q0, q1, [x0]
+        srshr           v0.8h, v0.8h, #1
+        srshr           v1.8h, v1.8h, #1
+        stp             q0, q1, [x0]
+        ret
+endfunc
+
+// void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs)
+// 8x8 = 64 coeffs, shift=0
+// When shift=0: output = input (identity transform)
+// No operation needed - just return immediately
+function ff_hevc_dequant_8x8_12_neon, export=1
+        ret
+endfunc
+
+// void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs)
+// 16x16 = 256 coeffs, shift=-1 (left shift by 1)
+// Pipelined implementation: interleave load/compute/store to hide memory latency
+// Uses .irp macro to unroll 4 iterations, processing 64 coeffs per iteration
+// x0 = load pointer, x1 = store pointer (both advance through the buffer)
+function ff_hevc_dequant_16x16_12_neon, export=1
+        mov             x1, x0
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+.irp i, 0, 1, 2, 3
+        shl             v0.8h, v0.8h, #1
+        shl             v1.8h, v1.8h, #1
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+        shl             v2.8h, v2.8h, #1
+        shl             v3.8h, v3.8h, #1
+        shl             v4.8h, v4.8h, #1
+        shl             v5.8h, v5.8h, #1
+        st1             {v0.16b - v3.16b}, [x1], #64
+        shl             v6.8h, v6.8h, #1
+.if \i < 3
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+.endif
+        shl             v7.8h, v7.8h, #1
+        st1             {v4.16b - v7.16b}, [x1], #64
+.endr
+        ret
+endfunc
+
+// void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs)
+// 32x32 = 1024 coeffs, shift=-2 (left shift by 2)
+// Process 128 coeffs per iteration (8 iterations)
+// Using pipelined load/compute/store for better performance
+function ff_hevc_dequant_32x32_12_neon, export=1
+        mov             x2, #8
+1:
+        // Group A: q0-q3 (64 bytes / 32 coeffs)
+        ldp             q0, q1, [x0]
+        ldp             q2, q3, [x0, #32]
+        // Group B: q4-q7 (64 bytes / 32 coeffs)
+        ldp             q4, q5, [x0, #64]
+        ldp             q6, q7, [x0, #96]
+        subs            x2, x2, #1              // Decrement loop counter early for better pipelining
+
+        // Calc Group A (shift left by 2)
+        shl             v0.8h, v0.8h, #2
+        shl             v1.8h, v1.8h, #2
+        shl             v2.8h, v2.8h, #2
+        shl             v3.8h, v3.8h, #2
+
+        // Group C: q16-q19 (64 bytes / 32 coeffs)
+        ldp             q16, q17, [x0, #128]
+        ldp             q18, q19, [x0, #160]
+
+        // Calc Group B
+        shl             v4.8h, v4.8h, #2
+        shl             v5.8h, v5.8h, #2
+        shl             v6.8h, v6.8h, #2
+        shl             v7.8h, v7.8h, #2
+
+        // Store Group A
+        stp             q0, q1, [x0]
+        stp             q2, q3, [x0, #32]
+
+        // Group D: q20-q23 (64 bytes / 32 coeffs)
+        ldp             q20, q21, [x0, #192]
+        ldp             q22, q23, [x0, #224]
+
+        // Calc Group C
+        shl             v16.8h, v16.8h, #2
+        shl             v17.8h, v17.8h, #2
+        shl             v18.8h, v18.8h, #2
+        shl             v19.8h, v19.8h, #2
+
+        // Store Group B
+        stp             q4, q5, [x0, #64]
+        stp             q6, q7, [x0, #96]
+
+        // Calc Group D
+        shl             v20.8h, v20.8h, #2
+        shl             v21.8h, v21.8h, #2
+        shl             v22.8h, v22.8h, #2
+        shl             v23.8h, v23.8h, #2
+
+        // Store Group C
+        stp             q16, q17, [x0, #128]
+        stp             q18, q19, [x0, #160]
+
+        // Store Group D
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+
+        add             x0, x0, #256
+        b.ne            1b
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index ec62285ddb..8ff7f632af 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -108,6 +108,11 @@ void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs);
 void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs);
 void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs);
 
+void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs);
+void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs);
+
 static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
 {
     switch (log2_size) {
@@ -130,6 +135,17 @@ static void hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size)
     }
 }
 
+static void hevc_dequant_12_neon(int16_t *coeffs, int16_t log2_size)
+{
+    switch (log2_size) {
+    case 2: ff_hevc_dequant_4x4_12_neon(coeffs); break;
+    case 3: ff_hevc_dequant_8x8_12_neon(coeffs); break;
+    case 4: ff_hevc_dequant_16x16_12_neon(coeffs); break;
+    case 5: ff_hevc_dequant_32x32_12_neon(coeffs); break;
+    default: av_unreachable("log2_size must be 2, 3, 4 or 5");
+    }
+}
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
         member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
@@ -323,5 +339,6 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_neon;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_neon;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_neon;
+        c->dequant = hevc_dequant_12_neon;
     }
 }
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
