[FFmpeg-devel] [PR] Revert "avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures" (PR #22447)

michaelni via ffmpeg-devel Sun, 08 Mar 2026 12:15:11 -0700

PR #22447 opened by michaelni
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22447
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22447.patch


This reverts commit 89f984e3d1d8f1f009c616e0c6425094395fdbdc.

The speed regression does not seem to be necessary for H.264 conformance:
the affected code is only used under if (bit_depth == 8).

The relevant normative text is clause 8.5.10, "Scaling and transformation 
process for DC transform coefficients for Intra_16x16 macroblock type."
It imposes two separate bounds: first on the inverse-transform result fij, and 
then again on the final scaled output dcYij.
Both bounds are [-2^(7+bitDepth), 2^(7+bitDepth)-1]. For ordinary 8-bit H.264, 
bitDepthY = 8, so the final dcYij range is exactly [-32768, 32767], i.e. it 
fits in a signed 16-bit value.

See: T-REC-H.264-202108
Signed-off-by: Michael Niedermayer <[email protected]>


From afefecf0371f5099b4e01b2369a920ad4d046839 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <[email protected]>
Date: Sun, 8 Mar 2026 19:06:21 +0100
Subject: [PATCH] Revert "avcodec/x86/h264_idct: Fix
 ff_h264_luma_dc_dequant_idct_sse2 checkasm failures"

This reverts commit 89f984e3d1d8f1f009c616e0c6425094395fdbdc.

The speed regression does not seem to be necessary for H.264 conformance:
the affected code is only used under if (bit_depth == 8).

The relevant normative text is clause 8.5.10, "Scaling and transformation 
process for DC transform coefficients for Intra_16x16 macroblock type."
It imposes two separate bounds: first on the inverse-transform result fij, and 
then again on the final scaled output dcYij.
Both bounds are [-2^(7+bitDepth), 2^(7+bitDepth)-1]. For ordinary 8-bit H.264, 
bitDepthY = 8, so the final dcYij range is exactly [-32768, 32767], i.e. it 
fits in a signed 16-bit value.

See: T-REC-H.264-202108
Signed-off-by: Michael Niedermayer <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 62 ++++++++++++------------------------
 1 file changed, 20 insertions(+), 42 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 6ae8202748..b3cbbd533c 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -592,58 +592,36 @@ RET
     psrad       m1, %1
     psrad       m2, %1
     psrad       m3, %1
+    packssdw    m0, m1
+    packssdw    m2, m3
 %endmacro
 
-%macro STORE_WORDS 10
-%if ARCH_X86_64
-    movq        t0, %1
-    movq        t1, %2
-    psrldq      %1, 8
-    psrldq      %2, 8
+%macro STORE_WORDS 9
+    movd  t0d, %1
+    psrldq  %1, 4
+    movd  t1d, %1
+    psrldq  %1, 4
+    mov [t2+%2*32], t0w
+    mov [t2+%4*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
     mov [t2+%3*32], t0w
-    mov [t2+%7*32], t1w
-    shr         t0, 32
-    shr         t1, 32
-    mov [t2+%4*32], t0w
-    mov [t2+%8*32], t1w
-    movq        t0, %1
-    movq        t1, %2
-    mov [t2+%5*32], t0w
-    mov [t2+%9*32], t1w
-    shr         t0, 32
-    shr         t1, 32
+    mov [t2+%5*32], t1w
+    movd  t0d, %1
+    psrldq  %1, 4
+    movd  t1d, %1
     mov [t2+%6*32], t0w
-    mov [t2+%10*32], t1w
-%else
-    movd       t0d, %1
-    movd       t1d, %2
-    psrldq      %1, 4
-    psrldq      %2, 4
-    mov [t2+%3*32], t0w
-    mov [t2+%7*32], t1w
-    movd       t0d, %1
-    movd       t1d, %2
-    psrldq      %1, 4
-    psrldq      %2, 4
-    mov [t2+%4*32], t0w
     mov [t2+%8*32], t1w
-    movd       t0d, %1
-    movd       t1d, %2
-    psrldq      %1, 4
-    psrldq      %2, 4
-    mov [t2+%5*32], t0w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%7*32], t0w
     mov [t2+%9*32], t1w
-    movd       t0d, %1
-    movd       t1d, %2
-    mov [t2+%6*32], t0w
-    mov [t2+%10*32], t1w
-%endif
 %endmacro
 
 %macro DEQUANT_STORE 1
     DEQUANT     %1
-    STORE_WORDS m0, m1,  0,  1,  4,  5,  2,  3,  6,  7
-    STORE_WORDS m2, m3,  8,  9, 12, 13, 10, 11, 14, 15
+    STORE_WORDS m0,  0,  1,  4,  5,  2,  3,  6,  7
+    STORE_WORDS m2,  8,  9, 12, 13, 10, 11, 14, 15
 %endmacro
 
 INIT_XMM sse2
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] Revert "avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures" (PR #22447)

Reply via email to