PR #21027 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21027
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21027.patch

The last patch addresses #20835.


From a78016a3dd55654baef6ecdd51192a78843e9a6d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 12:26:37 +0100
Subject: [PATCH 1/9] avcodec/x86/h264_idct: Remove dead MMX macros

Forgotten in 4618f36a2424a3a4d5760afabc2e9dd18d73f0a4.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 105 +----------------------------------
 1 file changed, 3 insertions(+), 102 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index d9c3c9c862..985955d96a 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -145,61 +145,6 @@ SECTION .text
     IDCT8_1D   [%1], [%1+ 64]
 %endmacro
 
-; %1=int16_t *block, %2=int16_t *dstblock
-%macro IDCT8_ADD_MMX_START 2
-    IDCT8_1D_FULL %1
-    mova       [%1], m7
-    TRANSPOSE4x4W 0, 1, 2, 3, 7
-    mova         m7, [%1]
-    mova    [%2   ], m0
-    mova    [%2+16], m1
-    mova    [%2+32], m2
-    mova    [%2+48], m3
-    TRANSPOSE4x4W 4, 5, 6, 7, 3
-    mova    [%2+ 8], m4
-    mova    [%2+24], m5
-    mova    [%2+40], m6
-    mova    [%2+56], m7
-%endmacro
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_MMX_END 3-4
-    IDCT8_1D_FULL %2
-    mova    [%2   ], m5
-    mova    [%2+16], m6
-    mova    [%2+32], m7
-
-    pxor         m7, m7
-%if %0 == 4
-    movq   [%4+  0], m7
-    movq   [%4+  8], m7
-    movq   [%4+ 16], m7
-    movq   [%4+ 24], m7
-    movq   [%4+ 32], m7
-    movq   [%4+ 40], m7
-    movq   [%4+ 48], m7
-    movq   [%4+ 56], m7
-    movq   [%4+ 64], m7
-    movq   [%4+ 72], m7
-    movq   [%4+ 80], m7
-    movq   [%4+ 88], m7
-    movq   [%4+ 96], m7
-    movq   [%4+104], m7
-    movq   [%4+112], m7
-    movq   [%4+120], m7
-%endif
-    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
-    lea          %1, [%1+%3*2]
-    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
-    mova         m0, [%2   ]
-    mova         m1, [%2+16]
-    mova         m2, [%2+32]
-    lea          %1, [%1+%3*2]
-    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
-    lea          %1, [%1+%3*2]
-    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
-%endmacro
-
 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
 %macro IDCT8_ADD_SSE 4
     IDCT8_1D_FULL %2
@@ -612,7 +557,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
     add8_sse2_cycle 3, 0x64
 RET
 
-;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
+;void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul)
 
 %macro WALSH4_1D 5
     SUMSUB_BADC w, %4, %3, %2, %1, %5
@@ -620,8 +565,7 @@ RET
     SWAP %1, %4, %3
 %endmacro
 
-%macro DEQUANT 1-3
-%if cpuflag(sse2)
+%macro DEQUANT 1
     movd      xmm4, t3d
     movq      xmm5, [pw_1]
     pshufd    xmm4, xmm4, 0
@@ -643,31 +587,9 @@ RET
     psrad     xmm3, %1
     packssdw  xmm0, xmm1
     packssdw  xmm2, xmm3
-%else
-    mova        m7, [pw_1]
-    mova        m4, %1
-    punpcklwd   %1, m7
-    punpckhwd   m4, m7
-    mova        m5, %2
-    punpcklwd   %2, m7
-    punpckhwd   m5, m7
-    movd        m7, t3d
-    punpckldq   m7, m7
-    pmaddwd     %1, m7
-    pmaddwd     %2, m7
-    pmaddwd     m4, m7
-    pmaddwd     m5, m7
-    psrad       %1, %3
-    psrad       %2, %3
-    psrad       m4, %3
-    psrad       m5, %3
-    packssdw    %1, m4
-    packssdw    %2, m5
-%endif
 %endmacro
 
-%macro STORE_WORDS 5-9
-%if cpuflag(sse)
+%macro STORE_WORDS 9
     movd  t0d, %1
     psrldq  %1, 4
     movd  t1d, %1
@@ -687,33 +609,12 @@ RET
     shr   t1d, 16
     mov [t2+%7*32], t0w
     mov [t2+%9*32], t1w
-%else
-    movd  t0d, %1
-    psrlq  %1, 32
-    movd  t1d, %1
-    mov [t2+%2*32], t0w
-    mov [t2+%4*32], t1w
-    shr   t0d, 16
-    shr   t1d, 16
-    mov [t2+%3*32], t0w
-    mov [t2+%5*32], t1w
-%endif
 %endmacro
 
 %macro DEQUANT_STORE 1
-%if cpuflag(sse2)
     DEQUANT     %1
     STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
     STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
-%else
-    DEQUANT     m0, m1, %1
-    STORE_WORDS m0,  0,  1,  4,  5
-    STORE_WORDS m1,  2,  3,  6,  7
-
-    DEQUANT     m2, m3, %1
-    STORE_WORDS m2,  8,  9, 12, 13
-    STORE_WORDS m3, 10, 11, 14, 15
-%endif
 %endmacro
 
 INIT_XMM sse2
-- 
2.49.1


From 956a47473728b5a45f29d1bc17a7cdd8010dfcb6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 12:38:58 +0100
Subject: [PATCH 2/9] avcodec/x86/h264_idct: Remove redundant movsxdifnidn

Only exported (i.e. cglobal) functions need it; stride is already
sign-extended when it reaches any of the internal functions used here,
so don't sign-extend again.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 985955d96a..6863dbcb4d 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -55,7 +55,7 @@ cextern pw_1
 
 SECTION .text
 
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
 %macro IDCT4_ADD 3
     ; Load dct coeffs
     movq         m0, [%2]
@@ -145,7 +145,7 @@ SECTION .text
     IDCT8_1D   [%1], [%1+ 64]
 %endmacro
 
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
 %macro IDCT8_ADD_SSE 4
     IDCT8_1D_FULL %2
 %if ARCH_X86_64
@@ -317,7 +317,6 @@ INIT_XMM cpuname
 
 INIT_MMX mmx
 h264_idct_add8_mmx_plane:
-    movsxdifnidn r3, r3d
 .nextblock:
     movzx        r6, byte [scan8+r5]
     movzx        r6, byte [r4+r6]
@@ -372,9 +371,8 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
 
     RET ; TODO: check rep ret after a function call
 
-; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
+; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
-    movsxdifnidn r3, r3d
     movd         m0, [r2   ]          ;  0 0 X D
     mov word [r2+ 0], 0
     punpcklwd    m0, [r2+32]          ;  x X d D
@@ -393,9 +391,8 @@ h264_idct_dc_add8_mmxext:
 
 ALIGN 16
 INIT_XMM sse2
-; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
+; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = ptrdiff_t stride
 h264_add8x4_idct_sse2:
-    movsxdifnidn r3, r3d
     movq   m0, [r2+ 0]
     movq   m1, [r2+ 8]
     movq   m2, [r2+16]
-- 
2.49.1


From b206204fee005e39a13b642609534dd1a5bf8763 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 14:09:57 +0100
Subject: [PATCH 3/9] avcodec/x86/h264_idct: Avoid call where possible

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 49 ++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 6863dbcb4d..9405aa848a 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -316,29 +316,6 @@ INIT_XMM cpuname
     RET
 
 INIT_MMX mmx
-h264_idct_add8_mmx_plane:
-.nextblock:
-    movzx        r6, byte [scan8+r5]
-    movzx        r6, byte [r4+r6]
-    or          r6w, word [r2]
-    test         r6, r6
-    jz .skipblock
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
-    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
-    mov          r0, [r0]
-    add          r0, dword [r1+r5*4]
-%endif
-    IDCT4_ADD    r0, r2, r3
-.skipblock:
-    inc          r5
-    add          r2, 32
-    test         r5, 3
-    jnz .nextblock
-    rep ret
-
 cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
 ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
     movsxdifnidn r3, r3d
@@ -367,9 +344,31 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
 
     call         h264_idct_add8_mmx_plane
     add r5, 4
-    call         h264_idct_add8_mmx_plane
+    TAIL_CALL    h264_idct_add8_mmx_plane, 0
+
+h264_idct_add8_mmx_plane:
+.nextblock:
+    movzx       r6d, byte [scan8+r5]
+    movzx       r6d, byte [r4+r6]
+    or          r6w, word [r2]
+    test        r6d, r6d
+    jz .skipblock
+%if ARCH_X86_64
+    mov         r0d, dword [r1+r5*4]
+    add          r0, [dst2q]
+%else
+    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
+    mov          r0, [r0]
+    add          r0, dword [r1+r5*4]
+%endif
+    IDCT4_ADD    r0, r2, r3
+.skipblock:
+    inc         r5d
+    add          r2, 32
+    test        r5d, 3
+    jnz .nextblock
+    rep ret
 
-    RET ; TODO: check rep ret after a function call
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
-- 
2.49.1


From 64c493d604e5ebdbe188a276697b777e35d53952 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 14:57:45 +0100
Subject: [PATCH 4/9] avutil/x86/x86inc: Use parentheses in has_epilogue

Prevents operator-precedence surprises when has_epilogue is expanded
inside a larger expression.
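
A minimal illustration of the hazard, assuming the define is used under
%if (as a later patch in this series does):

    ; Without the parentheses, a use such as
    ;     %if !has_epilogue
    ; expands to
    ;     %if !regs_used > 7 || stack_size > 0 || ...
    ; where the ! binds only to regs_used instead of to the whole condition.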

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/x86/x86inc.asm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e61d924bc1..0e80ebed43 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -609,7 +609,7 @@ DECLARE_REG 14, R13, 120
     RESET_STACK_STATE
 %endmacro
 
-%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
+%define has_epilogue (regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs)
 
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL
@@ -658,7 +658,7 @@ DECLARE_REG 14, R13, 72
     %endif
 %endmacro
 
-%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
+%define has_epilogue (regs_used > 9 || stack_size > 0 || vzeroupper_required)
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -722,7 +722,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %endif
 %endmacro
 
-%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
+%define has_epilogue (regs_used > 3 || stack_size > 0 || vzeroupper_required)
 
 %macro RET 0
     %if stack_size_padded > 0
-- 
2.49.1


From 0a577a5927331f1e58973fb652e2c30445743593 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 15:23:31 +0100
Subject: [PATCH 5/9] avcodec/x86/h264_idct: Use tail call where advantageous

A tail call is only possible when the calling function needs no
epilogue of its own; that is the case on UNIX64 here, and the new
TAIL_CALL_IF_LAST macro checks this via has_epilogue.
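
Sketch of the effect for the last loop iteration (labels as in this file):

    ; before:
    call  h264_add8x4_idct_sse2
    jmp   .cycle7end                  ; ...which soon reaches RET
    ; after (only emitted when no epilogue is needed):
    jmp   h264_add8x4_idct_sse2       ; the helper's ret returns directly
                                      ; to this function's caller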

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 9405aa848a..4b9efd6d6d 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -55,6 +55,19 @@ cextern pw_1
 
 SECTION .text
 
+; %1=callee, %2=dst to jump to if tail call is impossible (can be empty,
+; then no jmp is performed), %3=current iteration, %4=last iteration
+%macro TAIL_CALL_IF_LAST 4
+%if (%3 == %4) && !has_epilogue
+    jmp         %1
+%else
+    call        %1
+    %ifnempty %2
+        jmp      %2
+    %endif
+%endif
+%endmacro
+
 ; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
 %macro IDCT4_ADD 3
     ; Load dct coeffs
@@ -424,7 +437,7 @@ h264_add8x4_idct_sse2:
 %else
     add         r0, r0m
 %endif
-    call        h264_add8x4_idct_sse2
+    TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, , %1, 7
 .cycle%1end:
 %if %1 < 7
     add         r2, 64
@@ -461,8 +474,7 @@ RET
 %else
     add         r0, r0m
 %endif
-    call        h264_add8x4_idct_sse2
-    jmp .cycle%1end
+    TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 7
 .try%1dc:
     movsx       r0, word [r2   ]
     or         r0w, word [r2+32]
@@ -473,7 +485,7 @@ RET
 %else
     add         r0, r0m
 %endif
-    call        h264_idct_dc_add8_mmxext
+    TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 7
 .cycle%1end:
 %if %1 < 7
     add         r2, 64
@@ -510,8 +522,7 @@ RET
     mov         r0, [r0]
     add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
-    call        h264_add8x4_idct_sse2
-    jmp .cycle%1end
+    TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 3
 .try%1dc:
     movsx       r0, word [r2   ]
     or         r0w, word [r2+32]
@@ -524,7 +535,7 @@ RET
     mov         r0, [r0]
     add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
-    call        h264_idct_dc_add8_mmxext
+    TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 3
 .cycle%1end:
 %if %1 == 1
     add         r2, 384+64
-- 
2.49.1


From 84eaebab60bf3e832e8a8465103b8425f1773c15 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 15:59:03 +0100
Subject: [PATCH 6/9] avcodec/x86/h264_idct: Zero with full-width stores

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 4b9efd6d6d..50647f2454 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -90,10 +90,15 @@ SECTION .text
     paddw        m0, m6
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     pxor         m7, m7
-    movq    [%2+ 0], m7
-    movq    [%2+ 8], m7
-    movq    [%2+16], m7
-    movq    [%2+24], m7
+    %if mmsize == 16
+        mova    [%2+ 0], m7
+        mova    [%2+16], m7
+    %else
+        movq    [%2+ 0], m7
+        movq    [%2+ 8], m7
+        movq    [%2+16], m7
+        movq    [%2+24], m7
+    %endif
 
     STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
     lea          %1, [%1+%3*2]
-- 
2.49.1


From 23a00e85b832766596352b6e391effb3e7348870 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 17:26:47 +0100
Subject: [PATCH 7/9] avcodec/x86/h264_idct: Don't use MMX registers in
 ff_h264_luma_dc_dequant_idct_sse2

Avoiding the MMX registers is ABI compliant (the function no longer
leaves MMX/x87 state dirty) and gives a tiny speedup here; the code is
also 16B smaller.
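
Roughly, what goes away (a sketch of the old mixed MMX/XMM code):

    ; old: the rows were computed in MMX registers and transferred over
    ;     movq2dq xmm0, m0        ; m0 was an MMX register (INIT_MMX cpuname)
    ; new: the rows are loaded into and stay in xmm0-xmm3, so the transfers,
    ;      the dirty MMX/x87 state and checkasm's declare_func_emms go away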

Old benchmarks:
h264_luma_dc_dequant_idct_8_c:                          33.2 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2:                       16.0 ( 2.07x)

New benchmarks:
h264_luma_dc_dequant_idct_8_c:                          33.0 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2:                       15.0 ( 2.20x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 56 +++++++++++++++++++-----------------
 tests/checkasm/h264dsp.c     |  2 +-
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 50647f2454..fe46107867 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -578,27 +578,23 @@ RET
 %endmacro
 
 %macro DEQUANT 1
-    movd      xmm4, t3d
-    movq      xmm5, [pw_1]
-    pshufd    xmm4, xmm4, 0
-    movq2dq   xmm0, m0
-    movq2dq   xmm1, m1
-    movq2dq   xmm2, m2
-    movq2dq   xmm3, m3
-    punpcklwd xmm0, xmm5
-    punpcklwd xmm1, xmm5
-    punpcklwd xmm2, xmm5
-    punpcklwd xmm3, xmm5
-    pmaddwd   xmm0, xmm4
-    pmaddwd   xmm1, xmm4
-    pmaddwd   xmm2, xmm4
-    pmaddwd   xmm3, xmm4
-    psrad     xmm0, %1
-    psrad     xmm1, %1
-    psrad     xmm2, %1
-    psrad     xmm3, %1
-    packssdw  xmm0, xmm1
-    packssdw  xmm2, xmm3
+    movd        m4, t3d
+    movq        m5, [pw_1]
+    pshufd      m4, m4, 0
+    punpcklwd   m0, m5
+    punpcklwd   m1, m5
+    punpcklwd   m2, m5
+    punpcklwd   m3, m5
+    pmaddwd     m0, m4
+    pmaddwd     m1, m4
+    pmaddwd     m2, m4
+    pmaddwd     m3, m4
+    psrad       m0, %1
+    psrad       m1, %1
+    psrad       m2, %1
+    psrad       m3, %1
+    packssdw    m0, m1
+    packssdw    m2, m3
 %endmacro
 
 %macro STORE_WORDS 9
@@ -625,19 +621,25 @@ RET
 
 %macro DEQUANT_STORE 1
     DEQUANT     %1
-    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
-    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
+    STORE_WORDS m0,  0,  1,  4,  5,  2,  3,  6,  7
+    STORE_WORDS m2,  8,  9, 12, 13, 10, 11, 14, 15
 %endmacro
 
 INIT_XMM sse2
 cglobal h264_luma_dc_dequant_idct, 3, 4, 7
-INIT_MMX cpuname
     movq        m3, [r1+24]
     movq        m2, [r1+16]
     movq        m1, [r1+ 8]
     movq        m0, [r1+ 0]
     WALSH4_1D    0,1,2,3,4
-    TRANSPOSE4x4W 0,1,2,3,4
+    punpcklwd   m0, m1
+    punpcklwd   m2, m3
+    mova        m4, m0
+    punpckldq   m0, m2
+    punpckhdq   m4, m2
+    movhlps     m1, m0
+    movhlps     m3, m4
+    SWAP 2, 4
     WALSH4_1D    0,1,2,3,4
 
 ; shift, tmp, output, qmul
@@ -665,8 +667,8 @@ INIT_MMX cpuname
     inc        t1d
     shr        t3d, t0b
     sub        t1d, t0d
-    movd      xmm6, t1d
-    DEQUANT_STORE xmm6
+    movd        m6, t1d
+    DEQUANT_STORE m6
     RET
 
 %ifdef __NASM_VER__
diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c
index f05ae419fc..acf4f61ebb 100644
--- a/tests/checkasm/h264dsp.c
+++ b/tests/checkasm/h264dsp.c
@@ -336,7 +336,7 @@ static void check_idct_dequant(void)
     LOCAL_ALIGNED_16(int32_t, dst1_32, [16 * 16]);
     H264DSPContext h;
     int bit_depth, i, qmul;
-    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_SSE2, void, int16_t *output, int16_t *input, int qmul);
+    declare_func(void, int16_t *output, int16_t *input, int qmul);
 
     qmul = rnd() % 4096;
 
-- 
2.49.1


From a091ad3cc0af5c1ad14fbfdf4fe2b8fd93bd0dcc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 17:48:01 +0100
Subject: [PATCH 8/9] avcodec/x86/h264_idct: Deduplicate generating constant

pw_1 is currently loaded from memory in both codepaths. Generate it in
a register earlier instead. This gives a tiny speedup (15 vs. 14.5
cycles) and reduces code size.
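
For reference, the idiom now used to materialize the constant in a register:

    pcmpeqw     m5, m5             ; every word = 0xFFFF
    psrlw       m5, 15             ; every word = 0x0001, i.e. pw_1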

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index fe46107867..d35d583ce7 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -51,7 +51,6 @@ scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
 %endif
 
 cextern pw_32
-cextern pw_1
 
 SECTION .text
 
@@ -577,9 +576,9 @@ RET
     SWAP %1, %4, %3
 %endmacro
 
+; requires m5 to contain pw_1
 %macro DEQUANT 1
     movd        m4, t3d
-    movq        m5, [pw_1]
     pshufd      m4, m4, 0
     punpcklwd   m0, m5
     punpcklwd   m1, m5
@@ -635,6 +634,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7
     punpcklwd   m0, m1
     punpcklwd   m2, m3
     mova        m4, m0
+    pcmpeqw     m5, m5
     punpckldq   m0, m2
     punpckhdq   m4, m2
     movhlps     m1, m0
@@ -652,6 +652,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7
 %else
     DECLARE_REG_TMP 1,3,0,2
 %endif
+    psrlw       m5, 15
 
     cmp        t3d, 32767
     jg .big_qmul
-- 
2.49.1


From 67bcf93f397f3c6bba08f6872b2723c946ae3aed Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 26 Nov 2025 20:15:55 +0100
Subject: [PATCH 9/9] avcodec/x86/h264_idct: Fix
 ff_h264_luma_dc_dequant_idct_sse2 checkasm failures

ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain
seeds, because the input to packssdw does not always fit into an
int16_t, leading to saturation where the C code just truncates. I don't
know whether the spec contains provisions ensuring that valid input
must not exceed 16 bits, or whether such inputs (even if invalid) can
be triggered by the actual code and not only by the test.

This commit adapts the behavior of the function to the C reference code
in order to fix the test: packssdw is avoided; instead, the low words
are transferred directly to GPRs to be written out. This has
unfortunately led to a slight performance regression here (from 14.5 to
15.1 cycles).
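
A sketch of the difference with an example value (assuming a product
that overflows 16 bits):

    ; dword result after the multiply and shift:        0x00012345
    ; packssdw saturates when narrowing to a word:          0x7fff
    ; the C code casts to int16_t, i.e. truncates:          0x2345
    ; a 16-bit GPR store (mov [t2+...], t0w) also truncates, matching C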

Fixes issue #20835.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h264_idct.asm | 64 ++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index d35d583ce7..47e4116f42 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -592,36 +592,58 @@ RET
     psrad       m1, %1
     psrad       m2, %1
     psrad       m3, %1
-    packssdw    m0, m1
-    packssdw    m2, m3
 %endmacro
 
-%macro STORE_WORDS 9
-    movd  t0d, %1
-    psrldq  %1, 4
-    movd  t1d, %1
-    psrldq  %1, 4
-    mov [t2+%2*32], t0w
-    mov [t2+%4*32], t1w
-    shr   t0d, 16
-    shr   t1d, 16
+%macro STORE_WORDS 10
+%if ARCH_X86_64
+    movq        t0, %1
+    movq        t1, %2
+    psrldq      %1, 8
+    psrldq      %2, 8
     mov [t2+%3*32], t0w
-    mov [t2+%5*32], t1w
-    movd  t0d, %1
-    psrldq  %1, 4
-    movd  t1d, %1
-    mov [t2+%6*32], t0w
+    mov [t2+%7*32], t1w
+    shr         t0, 32
+    shr         t1, 32
+    mov [t2+%4*32], t0w
     mov [t2+%8*32], t1w
-    shr   t0d, 16
-    shr   t1d, 16
-    mov [t2+%7*32], t0w
+    movq        t0, %1
+    movq        t1, %2
+    mov [t2+%5*32], t0w
     mov [t2+%9*32], t1w
+    shr         t0, 32
+    shr         t1, 32
+    mov [t2+%6*32], t0w
+    mov [t2+%10*32], t1w
+%else
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%3*32], t0w
+    mov [t2+%7*32], t1w
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%4*32], t0w
+    mov [t2+%8*32], t1w
+    movd       t0d, %1
+    movd       t1d, %2
+    psrldq      %1, 4
+    psrldq      %2, 4
+    mov [t2+%5*32], t0w
+    mov [t2+%9*32], t1w
+    movd       t0d, %1
+    movd       t1d, %2
+    mov [t2+%6*32], t0w
+    mov [t2+%10*32], t1w
+%endif
 %endmacro
 
 %macro DEQUANT_STORE 1
     DEQUANT     %1
-    STORE_WORDS m0,  0,  1,  4,  5,  2,  3,  6,  7
-    STORE_WORDS m2,  8,  9, 12, 13, 10, 11, 14, 15
+    STORE_WORDS m0, m1,  0,  1,  4,  5,  2,  3,  6,  7
+    STORE_WORDS m2, m3,  8,  9, 12, 13, 10, 11, 14, 15
 %endmacro
 
 INIT_XMM sse2
-- 
2.49.1
