PR #22447 opened by michaelni URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22447 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22447.patch
This reverts commit 89f984e3d1d8f1f009c616e0c6425094395fdbdc. For H.264 conformance, the speed regression does not seem necessary. The code is under if (bit_depth == 8). The relevant normative text is clause 8.5.10, "Scaling and transformation process for DC transform coefficients for Intra_16x16 macroblock type." It imposes two separate bounds: first on the inverse-transform result fij, and then again on the final scaled output dcYij. Both bounds are [-2^(7+bitDepth), 2^(7+bitDepth)-1]. For ordinary 8-bit H.264, bitDepthY = 8, so the final dcYij range is exactly [-32768, 32767], i.e. it fits in a signed 16-bit value. See: T-REC-H.264-202108 Signed-off-by: Michael Niedermayer <[email protected]> From afefecf0371f5099b4e01b2369a920ad4d046839 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer <[email protected]> Date: Sun, 8 Mar 2026 19:06:21 +0100 Subject: [PATCH] Revert "avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures" This reverts commit 89f984e3d1d8f1f009c616e0c6425094395fdbdc. For H.264 conformance, the speed regression does not seem necessary. The code is under if (bit_depth == 8). The relevant normative text is clause 8.5.10, "Scaling and transformation process for DC transform coefficients for Intra_16x16 macroblock type." It imposes two separate bounds: first on the inverse-transform result fij, and then again on the final scaled output dcYij. Both bounds are [-2^(7+bitDepth), 2^(7+bitDepth)-1]. For ordinary 8-bit H.264, bitDepthY = 8, so the final dcYij range is exactly [-32768, 32767], i.e. it fits in a signed 16-bit value. 
See: T-REC-H.264-202108 Signed-off-by: Michael Niedermayer <[email protected]> --- libavcodec/x86/h264_idct.asm | 62 ++++++++++++------------------------ 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 6ae8202748..b3cbbd533c 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -592,58 +592,36 @@ RET psrad m1, %1 psrad m2, %1 psrad m3, %1 + packssdw m0, m1 + packssdw m2, m3 %endmacro -%macro STORE_WORDS 10 -%if ARCH_X86_64 - movq t0, %1 - movq t1, %2 - psrldq %1, 8 - psrldq %2, 8 +%macro STORE_WORDS 9 + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 + psrldq %1, 4 + mov [t2+%2*32], t0w + mov [t2+%4*32], t1w + shr t0d, 16 + shr t1d, 16 mov [t2+%3*32], t0w - mov [t2+%7*32], t1w - shr t0, 32 - shr t1, 32 - mov [t2+%4*32], t0w - mov [t2+%8*32], t1w - movq t0, %1 - movq t1, %2 - mov [t2+%5*32], t0w - mov [t2+%9*32], t1w - shr t0, 32 - shr t1, 32 + mov [t2+%5*32], t1w + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 mov [t2+%6*32], t0w - mov [t2+%10*32], t1w -%else - movd t0d, %1 - movd t1d, %2 - psrldq %1, 4 - psrldq %2, 4 - mov [t2+%3*32], t0w - mov [t2+%7*32], t1w - movd t0d, %1 - movd t1d, %2 - psrldq %1, 4 - psrldq %2, 4 - mov [t2+%4*32], t0w mov [t2+%8*32], t1w - movd t0d, %1 - movd t1d, %2 - psrldq %1, 4 - psrldq %2, 4 - mov [t2+%5*32], t0w + shr t0d, 16 + shr t1d, 16 + mov [t2+%7*32], t0w mov [t2+%9*32], t1w - movd t0d, %1 - movd t1d, %2 - mov [t2+%6*32], t0w - mov [t2+%10*32], t1w -%endif %endmacro %macro DEQUANT_STORE 1 DEQUANT %1 - STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7 - STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15 + STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15 %endmacro INIT_XMM sse2 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
