This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit ce89d974c8764002f127829dc0ecf43725994ff0
Author:     Jun Zhao <[email protected]>
AuthorDate: Fri Jan 23 00:41:11 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jan 25 06:55:26 2026 +0000

    lavc/hevc: add aarch64 neon for 10-bit dequant

    Implement NEON optimization for HEVC dequant at 10-bit depth.

    For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size

    Performance benchmark on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_dequant --bench

    hevc_dequant_4x4_10_c:        16.6 ( 1.00x)
    hevc_dequant_4x4_10_neon:      7.4 ( 2.23x)
    hevc_dequant_8x8_10_c:        39.7 ( 1.00x)
    hevc_dequant_8x8_10_neon:      7.5 ( 5.28x)
    hevc_dequant_16x16_10_c:     168.7 ( 1.00x)
    hevc_dequant_16x16_10_neon:   10.2 (16.56x)
    hevc_dequant_32x32_10_c:       1.9 ( 1.00x)
    hevc_dequant_32x32_10_neon:    1.9 ( 1.01x)

    Note: 32x32 shift=0 is identity transform (no-op), so NEON has no
    advantage over C which is also optimized away by the compiler.

    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcdsp_dequant_neon.S | 77 +++++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 17 +++++++
 2 files changed, 94 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S
index a757bac6b3..63230afb28 100644
--- a/libavcodec/aarch64/hevcdsp_dequant_neon.S
+++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S
@@ -167,3 +167,80 @@ function ff_hevc_dequant_32x32_8_neon, export=1
         b.ne            1b
         ret
 endfunc
+
+// --------------------------------------------------------------------------
+// HEVC dequant for 10-bit depth
+//
+// For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size
+//
+// Block size | log2_size | shift | operation
+// 4x4        | 2         | 3     | srshr #3
+// 8x8        | 3         | 2     | srshr #2
+// 16x16      | 4         | 1     | srshr #1
+// 32x32      | 5         | 0     | no-op (identity)
+// --------------------------------------------------------------------------
+
+// void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs)
+// 4x4 = 16 coeffs, shift=3
+function ff_hevc_dequant_4x4_10_neon, export=1
+        ldp             q0, q1, [x0]
+        srshr           v0.8h, v0.8h, #3
+        srshr           v1.8h, v1.8h, #3
+        stp             q0, q1, [x0]
+        ret
+endfunc
+
+// void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs)
+// 8x8 = 64 coeffs, shift=2
+// Fully unrolled - no loop needed for 64 coeffs
+function ff_hevc_dequant_8x8_10_neon, export=1
+        ld1             {v0.16b-v3.16b}, [x0], #64
+        ld1             {v4.16b-v7.16b}, [x0]
+        sub             x0, x0, #64
+        srshr           v0.8h, v0.8h, #2
+        srshr           v1.8h, v1.8h, #2
+        srshr           v2.8h, v2.8h, #2
+        srshr           v3.8h, v3.8h, #2
+        srshr           v4.8h, v4.8h, #2
+        srshr           v5.8h, v5.8h, #2
+        srshr           v6.8h, v6.8h, #2
+        srshr           v7.8h, v7.8h, #2
+        st1             {v0.16b-v3.16b}, [x0], #64
+        st1             {v4.16b-v7.16b}, [x0]
+        ret
+endfunc
+
+// void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs)
+// 16x16 = 256 coeffs, shift=1
+// Pipelined implementation: interleave load/compute/store to hide memory latency
+// Uses .irp macro to unroll 4 iterations, processing 64 coeffs per iteration
+// x0 = load pointer, x1 = store pointer (both advance through the buffer)
+function ff_hevc_dequant_16x16_10_neon, export=1
+        mov             x1, x0
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+.irp i, 0, 1, 2, 3
+        srshr           v0.8h, v0.8h, #1
+        srshr           v1.8h, v1.8h, #1
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
+        srshr           v2.8h, v2.8h, #1
+        srshr           v3.8h, v3.8h, #1
+        srshr           v4.8h, v4.8h, #1
+        srshr           v5.8h, v5.8h, #1
+        st1             {v0.16b - v3.16b}, [x1], #64
+        srshr           v6.8h, v6.8h, #1
+.if \i < 3
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
+.endif
+        srshr           v7.8h, v7.8h, #1
+        st1             {v4.16b - v7.16b}, [x1], #64
+.endr
+        ret
+endfunc
+
+// void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs)
+// 32x32 = 1024 coeffs, shift=0
+// When shift=0: output = (input + 0) >> 0 = input (identity transform)
+// No operation needed - just return immediately
+function ff_hevc_dequant_32x32_10_neon, export=1
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 80c9d1e2d2..ec62285ddb 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -103,6 +103,11 @@ void ff_hevc_dequant_8x8_8_neon(int16_t *coeffs);
 void ff_hevc_dequant_16x16_8_neon(int16_t *coeffs);
 void ff_hevc_dequant_32x32_8_neon(int16_t *coeffs);
 
+void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs);
+void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs);
+
 static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
 {
     switch (log2_size) {
@@ -114,6 +119,17 @@ static void hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size)
     }
 }
 
+static void hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size)
+{
+    switch (log2_size) {
+    case 2: ff_hevc_dequant_4x4_10_neon(coeffs); break;
+    case 3: ff_hevc_dequant_8x8_10_neon(coeffs); break;
+    case 4: ff_hevc_dequant_16x16_10_neon(coeffs); break;
+    case 5: ff_hevc_dequant_32x32_10_neon(coeffs); break;
+    default: av_unreachable("log2_size must be 2, 3, 4 or 5");
+    }
+}
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
     member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
     member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
@@ -292,6 +308,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon;
+        c->dequant = hevc_dequant_10_neon;
     }
     if (bit_depth == 12) {
         c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_neon;

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
