PR #21495 opened by Jun Zhao (mypopydev) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21495 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21495.patch
This patch series adds NEON-optimized HEVC dequantization for AArch64, covering 8-bit, 10-bit, and 12-bit depths. All implementations are benchmarked on Apple M4, with performance data verified through 10-run averages using checkasm. >From 51c1e4c124494fecf53aab8b89679dd4d4ce993b Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Wed, 7 Jan 2026 07:50:13 +0800 Subject: [PATCH 1/4] lavc/hevc: add aarch64 neon for 8-bit dequant Implement NEON optimization for HEVC dequant at 8-bit depth. The NEON implementation uses srshr (Signed Rounding Shift Right) which does both the add with offset and right shift in a single instruction. Optimization details: - 4x4 (16 coeffs): Single load-process-store sequence - 8x8 (64 coeffs): Fully unrolled, no loop overhead - 16x16 (256 coeffs): Loop with 64 coeffs per iteration - 32x32 (1024 coeffs): Loop with 128 coeffs per iteration, using 16 caller-saved NEON registers (v0-v7 and v16-v23) Performance benchmark on Apple M4: ./tests/checkasm/checkasm --test=hevc_dequant --bench hevc_dequant_4x4_8_c: 12.0 ( 1.00x) hevc_dequant_4x4_8_neon: 6.9 ( 1.73x) hevc_dequant_8x8_8_c: 37.1 ( 1.00x) hevc_dequant_8x8_8_neon: 7.3 ( 5.11x) hevc_dequant_16x16_8_c: 165.4 ( 1.00x) hevc_dequant_16x16_8_neon: 13.3 (12.42x) hevc_dequant_32x32_8_c: 86.0 ( 1.00x) hevc_dequant_32x32_8_neon: 35.0 ( 2.45x) Note on Performance Anomaly: hevc_dequant_32x32_8_c is faster than hevc_dequant_16x16_8_c (86.0 vs 165.4) because Clang auto-vectorizes the C loop only for sizes >= 32x32. 
Compiler: Apple clang version 17.0.0 (clang-1700.6.3.2) Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/Makefile | 1 + libavcodec/aarch64/hevcdsp_dequant_neon.S | 177 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 ++ tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + tests/checkasm/hevc_dequant.c | 76 ++++++++++ 7 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 libavcodec/aarch64/hevcdsp_dequant_neon.S create mode 100644 tests/checkasm/hevc_dequant.c diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 2bf48dfa28..1e838ad901 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -71,6 +71,7 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_deblock_neon.o \ + aarch64/hevcdsp_dequant_neon.o \ aarch64/hevcdsp_idct_neon.o \ aarch64/hevcdsp_init_aarch64.o \ aarch64/h26x/epel_neon.o \ diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S new file mode 100644 index 0000000000..f352f43e15 --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S @@ -0,0 +1,177 @@ +/* + * ARM NEON optimised dequant functions for HEVC decoding + * + * Copyright (c) 2026 FFmpeg contributors + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// HEVC dequant for 8-bit depth +// +// Algorithm (from dsp_template.c): +// shift = 15 - BIT_DEPTH - log2_size +// offset = 1 << (shift - 1) +// output = (input + offset) >> shift +// +// This is equivalent to: output = round(input / 2^shift), with ties rounded toward +infinity +// NEON srshr (Signed Rounding Shift Right) does exactly this in one instruction! +// +// For 8-bit: shift = 15 - 8 - log2_size = 7 - log2_size +// +// Block size | log2_size | shift | operation +// 4x4 | 2 | 5 | srshr #5 +// 8x8 | 3 | 4 | srshr #4 +// 16x16 | 4 | 3 | srshr #3 +// 32x32 | 5 | 2 | srshr #2 + +// void ff_hevc_dequant_4x4_8_neon(int16_t *coeffs, int16_t log2_size) +// 4x4 = 16 coeffs, shift=5 +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_4x4_8_neon, export=1 + ldp q0, q1, [x0] // load 16 int16_t (32 bytes) + srshr v0.8h, v0.8h, #5 // rounding shift right by 5 + srshr v1.8h, v1.8h, #5 + stp q0, q1, [x0] // store + ret +endfunc + +// void ff_hevc_dequant_8x8_8_neon(int16_t *coeffs, int16_t log2_size) +// 8x8 = 64 coeffs, shift=4 +// Fully unrolled - no loop needed for 64 coeffs +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_8x8_8_neon, export=1 + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + srshr v0.8h, v0.8h, #4 + srshr v1.8h, v1.8h, #4 + srshr v2.8h, v2.8h, #4 + srshr v3.8h, v3.8h, #4 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr v6.8h, v6.8h, #4 + srshr v7.8h, v7.8h, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + ret +endfunc + +// void ff_hevc_dequant_16x16_8_neon(int16_t *coeffs, int16_t log2_size) +// 16x16 = 
256 coeffs, shift=3 +// Process 64 coeffs per iteration (4 iterations) +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_16x16_8_neon, export=1 + mov x2, #4 // loop 4 times (64 coeffs each) +1: + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + srshr v0.8h, v0.8h, #3 + srshr v1.8h, v1.8h, #3 + srshr v2.8h, v2.8h, #3 + srshr v3.8h, v3.8h, #3 + srshr v4.8h, v4.8h, #3 + srshr v5.8h, v5.8h, #3 + srshr v6.8h, v6.8h, #3 + srshr v7.8h, v7.8h, #3 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 + subs x2, x2, #1 + b.ne 1b + ret +endfunc + +// void ff_hevc_dequant_32x32_8_neon(int16_t *coeffs, int16_t log2_size) +// 32x32 = 1024 coeffs, shift=2 +// Process 128 coeffs per iteration (8 iterations) +// Using 16 NEON registers per iteration to interleave loads, shifts and stores +// AAPCS64: v0-v7 and v16-v31 are volatile (caller-saved) +// We use v0-v7 and v16-v23 to avoid touching callee-saved v8-v15 +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_32x32_8_neon, export=1 + mov x2, #8 // loop 8 times (128 coeffs each) +1: + // Group A: q0-q3 (64 bytes / 32 coeffs) + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + // Group B: q4-q7 (64 bytes / 32 coeffs) + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + + // Calc Group A (shift right with rounding) + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v1.8h, #2 + srshr v2.8h, v2.8h, #2 + srshr v3.8h, v3.8h, #2 + + // Group C: q16-q19 (64 bytes / 32 coeffs) + // Load into volatile high registers to maximize pipeline usage + ldp q16, q17, [x0, #128] + ldp q18, q19, [x0, #160] + + // Calc Group B + srshr v4.8h, v4.8h, #2 + srshr v5.8h, v5.8h, #2 + srshr v6.8h, v6.8h, #2 + srshr v7.8h, v7.8h, #2 + + // Store Group A (Write back results to memory) + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + + // Group D: q20-q23 (64 bytes / 32 coeffs) + ldp 
q20, q21, [x0, #192] + ldp q22, q23, [x0, #224] + + // Calc Group C + srshr v16.8h, v16.8h, #2 + srshr v17.8h, v17.8h, #2 + srshr v18.8h, v18.8h, #2 + srshr v19.8h, v19.8h, #2 + + // Store Group B + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + + // Calc Group D + srshr v20.8h, v20.8h, #2 + srshr v21.8h, v21.8h, #2 + srshr v22.8h, v22.8h, #2 + srshr v23.8h, v23.8h, #2 + + // Store Group C + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + + // Store Group D + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + + add x0, x0, #256 // Advance pointer by 128 coeffs (256 bytes) + subs x2, x2, #1 + b.ne 1b + ret +endfunc diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 8dec58bc7f..0d1e1dcfeb 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -97,6 +97,21 @@ void ff_hevc_idct_16x16_dc_12_neon(int16_t *coeffs); void ff_hevc_idct_32x32_dc_12_neon(int16_t *coeffs); void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); +void ff_hevc_dequant_4x4_8_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_8x8_8_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_16x16_8_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_32x32_8_neon(int16_t *coeffs, int16_t log2_size); + +static void ff_hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size) +{ + switch (log2_size) { + case 2: ff_hevc_dequant_4x4_8_neon(coeffs, log2_size); break; + case 3: ff_hevc_dequant_8x8_8_neon(coeffs, log2_size); break; + case 4: ff_hevc_dequant_16x16_8_neon(coeffs, log2_size); break; + case 5: ff_hevc_dequant_32x32_8_neon(coeffs, log2_size); break; + } +} + #define NEON8_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \ @@ -168,6 +183,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = 
ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + c->dequant = ff_hevc_dequant_8_neon; c->sao_band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon; c->sao_band_filter[1] = c->sao_band_filter[2] = diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 48f358d40d..f4ac5a940f 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -42,7 +42,7 @@ AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o -AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o +AVCODECOBJS-$(CONFIG_HEVC_DECODER) += hevc_add_res.o hevc_deblock.o hevc_dequant.o hevc_idct.o hevc_sao.o hevc_pel.o AVCODECOBJS-$(CONFIG_RV34DSP) += rv34dsp.o AVCODECOBJS-$(CONFIG_RV40_DECODER) += rv40dsp.o AVCODECOBJS-$(CONFIG_SVQ1_ENCODER) += svq1enc.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 7dcdaeb2a4..f9ccb30ce9 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -187,6 +187,7 @@ static const struct { #if CONFIG_HEVC_DECODER { "hevc_add_res", checkasm_check_hevc_add_res }, { "hevc_deblock", checkasm_check_hevc_deblock }, + { "hevc_dequant", checkasm_check_hevc_dequant }, { "hevc_idct", checkasm_check_hevc_idct }, { "hevc_pel", checkasm_check_hevc_pel }, { "hevc_sao", checkasm_check_hevc_sao }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e3addec21e..b246d0c4da 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -110,6 +110,7 @@ void checkasm_check_h264pred(void); void checkasm_check_h264qpel(void); void checkasm_check_hevc_add_res(void); void checkasm_check_hevc_deblock(void); +void checkasm_check_hevc_dequant(void); void checkasm_check_hevc_idct(void); void checkasm_check_hevc_pel(void); void 
checkasm_check_hevc_sao(void); diff --git a/tests/checkasm/hevc_dequant.c b/tests/checkasm/hevc_dequant.c new file mode 100644 index 0000000000..20e322994a --- /dev/null +++ b/tests/checkasm/hevc_dequant.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 FFmpeg contributors + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <string.h> + +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +#include "libavcodec/hevc/dsp.h" + +#include "checkasm.h" + +#define randomize_buffers(buf, size) \ + do { \ + int j; \ + for (j = 0; j < size; j++) { \ + int16_t r = rnd() & 0x7FFF; \ + if (rnd() & 1) r = -r; \ + AV_WN16A(buf + j, r); \ + } \ + } while (0) + +static void check_dequant(HEVCDSPContext *h, int bit_depth) +{ + int i; + LOCAL_ALIGNED(32, int16_t, coeffs0, [32 * 32]); + LOCAL_ALIGNED(32, int16_t, coeffs1, [32 * 32]); + + for (i = 2; i <= 5; i++) { + int block_size = 1 << i; + int size = block_size * block_size; + declare_func(void, int16_t *coeffs, int16_t log2_size); + + randomize_buffers(coeffs0, size); + memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size); + + if (check_func(h->dequant, "hevc_dequant_%dx%d_%d", + block_size, block_size, bit_depth)) { + call_ref(coeffs0, i); + call_new(coeffs1, i); + if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size)) + fail(); + bench_new(coeffs1, i); + } + } +} + +void checkasm_check_hevc_dequant(void) +{ + int bit_depth; + + for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { + HEVCDSPContext h; + + ff_hevc_dsp_init(&h, bit_depth); + check_dequant(&h, bit_depth); + } + report("dequant"); +} -- 2.52.0 >From 100ad43bb82078c23be654cb03f296ff033ad7c0 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Fri, 16 Jan 2026 21:38:27 +0800 Subject: [PATCH 2/4] lavc/hevc: optimize dequant for shift=0 case (identity transform) The HEVC dequantization uses: shift = 15 - bit_depth - log2_size When shift equals 0, the operation becomes an identity transform: - For shift > 0: output = (input + offset) >> shift - For shift < 0: output = input << (-shift) - For shift = 0: output = input << 0 = input (no change) This occurs in the following cases: - 10-bit, 32x32 block: shift = 15 - 10 - 5 = 0 - 12-bit, 8x8 block: shift = 15 - 12 - 3 = 0 Previously, the code would still iterate through all coefficients and 
perform redundant read-modify-write operations even when shift=0. This patch adds an early return for shift=0, avoiding unnecessary memory operations. checkasm benchmarks on Apple M4 show: - 10-bit 32x32: 69.1 -> 1.6 cycles (43x faster) - 12-bit 8x8: 30.9 -> 1.7 cycles (18x faster) Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/hevc/dsp_template.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/libavcodec/hevc/dsp_template.c b/libavcodec/hevc/dsp_template.c index a0f79c2673..ed0bd63d71 100644 --- a/libavcodec/hevc/dsp_template.c +++ b/libavcodec/hevc/dsp_template.c @@ -106,6 +106,26 @@ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) } } +/** + * HEVC transform dequantization (ITU-T H.265 8.6.3) + * + * @param coeffs transform coefficient buffer (in-place) + * @param log2_size log2 of transform block size, range: 2..5 (4x4 to 32x32) + * This value comes from recursive split_transform_flag parsing + * in the bitstream, bounded by log2_min_tb_size (min 2) and + * log2_max_trafo_size (max 5) from SPS. + * + * Formula: shift = 15 - BIT_DEPTH - log2_size + * + * bit_depth | 4x4 (2) | 8x8 (3) | 16x16 (4) | 32x32 (5) + * ----------+---------+---------+-----------+---------- + * 8-bit | 5 | 4 | 3 | 2 (shift right) + * 10-bit | 3 | 2 | 1 | 0 (shift right / no-op) + * 12-bit | 1 | 0 | -1 | -2 (no-op / shift left) + * + * When shift == 0, output equals input (identity transform), so we skip + * the loop entirely for better performance. 
+ */ static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) { int shift = 15 - BIT_DEPTH - log2_size; @@ -120,7 +140,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) coeffs++; } } - } else { + } else if (shift < 0) { for (y = 0; y < size; y++) { for (x = 0; x < size; x++) { *coeffs = *(uint16_t*)coeffs << -shift; @@ -128,6 +148,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) } } } + /* shift == 0: no operation needed (identity transform) */ } #define SET(dst, x) (dst) = (x) -- 2.52.0 >From 4b372d119aeaa405281b93aa9c7e5a207304c842 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Fri, 16 Jan 2026 22:05:58 +0800 Subject: [PATCH 3/4] lavc/hevc: add aarch64 neon for 10-bit dequant Add NEON optimized dequantization for 10-bit HEVC. For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size Block size | shift | operation -----------+-------+----------- 4x4 | 3 | srshr #3 8x8 | 2 | srshr #2 16x16 | 1 | srshr #1 32x32 | 0 | no-op (identity, just return) Performance benchmark on Apple M4: ./tests/checkasm/checkasm --test=hevc_dequant --bench hevc_dequant_4x4_10_c: 9.7 ( 1.00x) hevc_dequant_4x4_10_neon: 5.6 ( 1.73x) hevc_dequant_8x8_10_c: 30.3 ( 1.00x) hevc_dequant_8x8_10_neon: 5.9 ( 5.14x) hevc_dequant_16x16_10_c: 129.9 ( 1.00x) hevc_dequant_16x16_10_neon: 10.6 (12.25x) hevc_dequant_32x32_10_c: 1.6 ( 1.00x) hevc_dequant_32x32_10_neon: 1.7 ( 0.94x) Note: 32x32 shows no improvement because shift=0 triggers the identity transform optimization (no processing needed). 
Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/hevcdsp_dequant_neon.S | 85 +++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 +++++ 2 files changed, 101 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S index f352f43e15..3b016b839b 100644 --- a/libavcodec/aarch64/hevcdsp_dequant_neon.S +++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S @@ -175,3 +175,88 @@ function ff_hevc_dequant_32x32_8_neon, export=1 b.ne 1b ret endfunc + +// -------------------------------------------------------------------------- +// HEVC dequant for 10-bit depth +// +// For 10-bit: shift = 15 - 10 - log2_size = 5 - log2_size +// +// Block size | log2_size | shift | operation +// 4x4 | 2 | 3 | srshr #3 +// 8x8 | 3 | 2 | srshr #2 +// 16x16 | 4 | 1 | srshr #1 +// 32x32 | 5 | 0 | no-op (identity) +// -------------------------------------------------------------------------- + +// void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs, int16_t log2_size) +// 4x4 = 16 coeffs, shift=3 +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_4x4_10_neon, export=1 + ldp q0, q1, [x0] + srshr v0.8h, v0.8h, #3 + srshr v1.8h, v1.8h, #3 + stp q0, q1, [x0] + ret +endfunc + +// void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs, int16_t log2_size) +// 8x8 = 64 coeffs, shift=2 +// Fully unrolled - no loop needed for 64 coeffs +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_8x8_10_neon, export=1 + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v1.8h, #2 + srshr v2.8h, v2.8h, #2 + srshr v3.8h, v3.8h, #2 + srshr v4.8h, v4.8h, #2 + srshr v5.8h, v5.8h, #2 + srshr v6.8h, v6.8h, #2 + srshr v7.8h, v7.8h, #2 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + ret +endfunc + +// void 
ff_hevc_dequant_16x16_10_neon(int16_t *coeffs, int16_t log2_size) +// 16x16 = 256 coeffs, shift=1 +// Process 64 coeffs per iteration (4 iterations) +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_16x16_10_neon, export=1 + mov x2, #4 +1: + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + srshr v0.8h, v0.8h, #1 + srshr v1.8h, v1.8h, #1 + srshr v2.8h, v2.8h, #1 + srshr v3.8h, v3.8h, #1 + srshr v4.8h, v4.8h, #1 + srshr v5.8h, v5.8h, #1 + srshr v6.8h, v6.8h, #1 + srshr v7.8h, v7.8h, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 + subs x2, x2, #1 + b.ne 1b + ret +endfunc + +// void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs, int16_t log2_size) +// 32x32 = 1024 coeffs, shift=0 +// When shift=0: output = (input + 0) >> 0 = input (identity transform) +// No operation needed - just return immediately +// Note: log2_size parameter (x1) is unused +function ff_hevc_dequant_32x32_10_neon, export=1 + ret +endfunc diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 0d1e1dcfeb..d5eb218e64 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -102,6 +102,11 @@ void ff_hevc_dequant_8x8_8_neon(int16_t *coeffs, int16_t log2_size); void ff_hevc_dequant_16x16_8_neon(int16_t *coeffs, int16_t log2_size); void ff_hevc_dequant_32x32_8_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_4x4_10_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs, int16_t log2_size); + static void ff_hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size) { switch (log2_size) { @@ -112,6 +117,16 @@ static void 
ff_hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size) } } +static void ff_hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size) +{ + switch (log2_size) { + case 2: ff_hevc_dequant_4x4_10_neon(coeffs, log2_size); break; + case 3: ff_hevc_dequant_8x8_10_neon(coeffs, log2_size); break; + case 4: ff_hevc_dequant_16x16_10_neon(coeffs, log2_size); break; + case 5: ff_hevc_dequant_32x32_10_neon(coeffs, log2_size); break; + } +} + #define NEON8_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \ @@ -290,6 +305,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; + c->dequant = ff_hevc_dequant_10_neon; } if (bit_depth == 12) { c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_neon; -- 2.52.0 >From d02c79573c29519ce83fe20969b90fa0b1f702f6 Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Fri, 16 Jan 2026 22:19:27 +0800 Subject: [PATCH 4/4] lavc/hevc: add aarch64 neon for 12-bit dequant Add NEON optimizations for HEVC 12-bit dequantization. 
For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size Block size | log2_size | shift | operation -----------+-----------+-------+----------- 4x4 | 2 | 1 | srshr #1 (shift right with rounding) 8x8 | 3 | 0 | identity (no-op, just return) 16x16 | 4 | -1 | shl #1 (shift left) 32x32 | 5 | -2 | shl #2 (shift left) Performance benchmark on Apple M4: ./tests/checkasm/checkasm --test=hevc_dequant --bench hevc_dequant_4x4_12_c: 9.8 ( 1.00x) hevc_dequant_4x4_12_neon: 5.8 ( 1.70x) hevc_dequant_8x8_12_c: 1.7 ( 1.00x) hevc_dequant_8x8_12_neon: 1.3 ( 1.34x) hevc_dequant_16x16_12_c: 133.9 ( 1.00x) hevc_dequant_16x16_12_neon: 11.0 (12.20x) hevc_dequant_32x32_12_c: 70.1 ( 1.00x) hevc_dequant_32x32_12_neon: 29.0 ( 2.42x) Note: 8x8 shows minimal improvement because shift=0 makes dequant an identity transform, so both the C and the NEON versions return without touching the coefficients (no processing needed). Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/hevcdsp_dequant_neon.S | 130 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 +++ 2 files changed, 146 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_dequant_neon.S b/libavcodec/aarch64/hevcdsp_dequant_neon.S index 3b016b839b..c0b5f3fab4 100644 --- a/libavcodec/aarch64/hevcdsp_dequant_neon.S +++ b/libavcodec/aarch64/hevcdsp_dequant_neon.S @@ -260,3 +260,133 @@ endfunc function ff_hevc_dequant_32x32_10_neon, export=1 ret endfunc + +// -------------------------------------------------------------------------- +// HEVC dequant for 12-bit depth +// +// For 12-bit: shift = 15 - 12 - log2_size = 3 - log2_size +// +// Block size | log2_size | shift | operation +// 4x4 | 2 | 1 | srshr #1 (shift right) +// 8x8 | 3 | 0 | no-op (identity) +// 16x16 | 4 | -1 | shl #1 (shift left) +// 32x32 | 5 | -2 | shl #2 (shift left) +// -------------------------------------------------------------------------- + +// void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs, int16_t log2_size) +// 4x4 = 16 coeffs, shift=1 +// Note: log2_size parameter (x1) is unused; shift value is 
hardcoded for performance +function ff_hevc_dequant_4x4_12_neon, export=1 + ldp q0, q1, [x0] + srshr v0.8h, v0.8h, #1 + srshr v1.8h, v1.8h, #1 + stp q0, q1, [x0] + ret +endfunc + +// void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs, int16_t log2_size) +// 8x8 = 64 coeffs, shift=0 +// When shift=0: output = input (identity transform) +// No operation needed - just return immediately +// Note: log2_size parameter (x1) is unused +function ff_hevc_dequant_8x8_12_neon, export=1 + ret +endfunc + +// void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs, int16_t log2_size) +// 16x16 = 256 coeffs, shift=-1 (left shift by 1) +// Process 64 coeffs per iteration (4 iterations) +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_16x16_12_neon, export=1 + mov x2, #4 +1: + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + shl v0.8h, v0.8h, #1 + shl v1.8h, v1.8h, #1 + shl v2.8h, v2.8h, #1 + shl v3.8h, v3.8h, #1 + shl v4.8h, v4.8h, #1 + shl v5.8h, v5.8h, #1 + shl v6.8h, v6.8h, #1 + shl v7.8h, v7.8h, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 + subs x2, x2, #1 + b.ne 1b + ret +endfunc + +// void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs, int16_t log2_size) +// 32x32 = 1024 coeffs, shift=-2 (left shift by 2) +// Process 128 coeffs per iteration (8 iterations) +// Using pipelined load/compute/store for better performance +// Note: log2_size parameter (x1) is unused; shift value is hardcoded for performance +function ff_hevc_dequant_32x32_12_neon, export=1 + mov x2, #8 +1: + // Group A: q0-q3 (64 bytes / 32 coeffs) + ldp q0, q1, [x0] + ldp q2, q3, [x0, #32] + // Group B: q4-q7 (64 bytes / 32 coeffs) + ldp q4, q5, [x0, #64] + ldp q6, q7, [x0, #96] + + // Calc Group A (shift left by 2) + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + shl v2.8h, v2.8h, #2 + shl v3.8h, v3.8h, #2 + + // Group C: q16-q19 (64 bytes / 32 
coeffs) + ldp q16, q17, [x0, #128] + ldp q18, q19, [x0, #160] + + // Calc Group B + shl v4.8h, v4.8h, #2 + shl v5.8h, v5.8h, #2 + shl v6.8h, v6.8h, #2 + shl v7.8h, v7.8h, #2 + + // Store Group A + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + + // Group D: q20-q23 (64 bytes / 32 coeffs) + ldp q20, q21, [x0, #192] + ldp q22, q23, [x0, #224] + + // Calc Group C + shl v16.8h, v16.8h, #2 + shl v17.8h, v17.8h, #2 + shl v18.8h, v18.8h, #2 + shl v19.8h, v19.8h, #2 + + // Store Group B + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + + // Calc Group D + shl v20.8h, v20.8h, #2 + shl v21.8h, v21.8h, #2 + shl v22.8h, v22.8h, #2 + shl v23.8h, v23.8h, #2 + + // Store Group C + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + + // Store Group D + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + + add x0, x0, #256 + subs x2, x2, #1 + b.ne 1b + ret +endfunc diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index d5eb218e64..376074fb47 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -107,6 +107,11 @@ void ff_hevc_dequant_8x8_10_neon(int16_t *coeffs, int16_t log2_size); void ff_hevc_dequant_16x16_10_neon(int16_t *coeffs, int16_t log2_size); void ff_hevc_dequant_32x32_10_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_4x4_12_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_8x8_12_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_16x16_12_neon(int16_t *coeffs, int16_t log2_size); +void ff_hevc_dequant_32x32_12_neon(int16_t *coeffs, int16_t log2_size); + static void ff_hevc_dequant_8_neon(int16_t *coeffs, int16_t log2_size) { switch (log2_size) { @@ -127,6 +132,16 @@ static void ff_hevc_dequant_10_neon(int16_t *coeffs, int16_t log2_size) } } +static void ff_hevc_dequant_12_neon(int16_t *coeffs, int16_t log2_size) +{ + switch (log2_size) { + case 2: ff_hevc_dequant_4x4_12_neon(coeffs, log2_size); break; + case 3: 
ff_hevc_dequant_8x8_12_neon(coeffs, log2_size); break; + case 4: ff_hevc_dequant_16x16_12_neon(coeffs, log2_size); break; + case 5: ff_hevc_dequant_32x32_12_neon(coeffs, log2_size); break; + } +} + #define NEON8_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \ @@ -320,5 +335,6 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_neon; + c->dequant = ff_hevc_dequant_12_neon; } } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
