This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 24f296c7a1032accf28c35437e0212a2a8cf5032
Author:     Jun Zhao <[email protected]>
AuthorDate: Fri Jan 23 00:32:40 2026 +0800
Commit:     Jun Zhao <[email protected]>
CommitDate: Sun Jan 25 06:55:26 2026 +0000

    lavc/hevc: optimize dequant for shift=0 case (identity transform)

    The HEVC dequantization uses:
      shift = 15 - bit_depth - log2_size

    When shift equals 0, the operation becomes an identity transform:
      - For shift > 0: output = (input + offset) >> shift
      - For shift < 0: output = input << (-shift)
      - For shift = 0: output = input << 0 = input (no change)

    This occurs in the following cases:
      - 10-bit, 32x32 block: shift = 15 - 10 - 5 = 0
      - 12-bit, 8x8 block: shift = 15 - 12 - 3 = 0

    Previously, the code would still iterate through all coefficients
    and perform redundant read-modify-write operations even when shift=0.

    This patch adds an early return for shift=0, avoiding unnecessary
    memory operations.

    checkasm benchmarks on Apple M4 show:
      - 10-bit 32x32: 69.1 -> 1.6 cycles (43x faster)
      - 12-bit 8x8: 30.9 -> 1.7 cycles (18x faster)

    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/hevc/dsp_template.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/libavcodec/hevc/dsp_template.c b/libavcodec/hevc/dsp_template.c
index a0f79c2673..573cf9ee1e 100644
--- a/libavcodec/hevc/dsp_template.c
+++ b/libavcodec/hevc/dsp_template.c
@@ -106,6 +106,26 @@ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
     }
 }
 
+/**
+ * HEVC transform dequantization (ITU-T H.265 8.6.3)
+ *
+ * @param coeffs    transform coefficient buffer (in-place)
+ * @param log2_size log2 of transform block size, range: 2..5 (4x4 to 32x32)
+ *                  This value comes from recursive split_transform_flag parsing
+ *                  in the bitstream, bounded by log2_min_tb_size (min 2) and
+ *                  log2_max_trafo_size (max 5) from SPS.
+ *
+ * Formula: shift = 15 - BIT_DEPTH - log2_size
+ *
+ *   bit_depth | 4x4 (2) | 8x8 (3) | 16x16 (4) | 32x32 (5)
+ *   ----------+---------+---------+-----------+----------
+ *   8-bit     |    5    |    4    |     3     |     2     (shift right)
+ *   10-bit    |    3    |    2    |     1     |     0     (shift right / no-op)
+ *   12-bit    |    1    |    0    |    -1     |    -2     (shift right / no-op / shift left)
+ *
+ * When shift == 0, output equals input (identity transform), so we skip
+ * the loop entirely for better performance.
+ */
 static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
 {
     int shift = 15 - BIT_DEPTH - log2_size;
@@ -120,7 +140,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
                 coeffs++;
             }
         }
-    } else {
+    } else if (shift < 0) {
         for (y = 0; y < size; y++) {
             for (x = 0; x < size; x++) {
                 *coeffs = *(uint16_t*)coeffs << -shift;
@@ -128,6 +148,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
             }
         }
     }
+    /* shift == 0: no operation needed (identity transform) */
 }
 
 #define SET(dst, x) (dst) = (x)
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
