PR #21027 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21027 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21027.patch
The last patch addresses #20835. >From a78016a3dd55654baef6ecdd51192a78843e9a6d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 12:26:37 +0100 Subject: [PATCH 1/9] avcodec/x86/h264idct: Remove dead MMX macros Forgotten in 4618f36a2424a3a4d5760afabc2e9dd18d73f0a4. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 105 +---------------------------------- 1 file changed, 3 insertions(+), 102 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index d9c3c9c862..985955d96a 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -145,61 +145,6 @@ SECTION .text IDCT8_1D [%1], [%1+ 64] %endmacro -; %1=int16_t *block, %2=int16_t *dstblock -%macro IDCT8_ADD_MMX_START 2 - IDCT8_1D_FULL %1 - mova [%1], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - mova m7, [%1] - mova [%2 ], m0 - mova [%2+16], m1 - mova [%2+32], m2 - mova [%2+48], m3 - TRANSPOSE4x4W 4, 5, 6, 7, 3 - mova [%2+ 8], m4 - mova [%2+24], m5 - mova [%2+40], m6 - mova [%2+56], m7 -%endmacro - -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride -%macro IDCT8_ADD_MMX_END 3-4 - IDCT8_1D_FULL %2 - mova [%2 ], m5 - mova [%2+16], m6 - mova [%2+32], m7 - - pxor m7, m7 -%if %0 == 4 - movq [%4+ 0], m7 - movq [%4+ 8], m7 - movq [%4+ 16], m7 - movq [%4+ 24], m7 - movq [%4+ 32], m7 - movq [%4+ 40], m7 - movq [%4+ 48], m7 - movq [%4+ 56], m7 - movq [%4+ 64], m7 - movq [%4+ 72], m7 - movq [%4+ 80], m7 - movq [%4+ 88], m7 - movq [%4+ 96], m7 - movq [%4+104], m7 - movq [%4+112], m7 - movq [%4+120], m7 -%endif - STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 - mova m0, [%2 ] - mova m1, [%2+16] - mova m2, [%2+32] - lea %1, [%1+%3*2] - STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 - lea %1, [%1+%3*2] - STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 -%endmacro - ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride %macro IDCT8_ADD_SSE 4 IDCT8_1D_FULL %2 @@ 
-612,7 +557,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8 add8_sse2_cycle 3, 0x64 RET -;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul) +;void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul) %macro WALSH4_1D 5 SUMSUB_BADC w, %4, %3, %2, %1, %5 @@ -620,8 +565,7 @@ RET SWAP %1, %4, %3 %endmacro -%macro DEQUANT 1-3 -%if cpuflag(sse2) +%macro DEQUANT 1 movd xmm4, t3d movq xmm5, [pw_1] pshufd xmm4, xmm4, 0 @@ -643,31 +587,9 @@ RET psrad xmm3, %1 packssdw xmm0, xmm1 packssdw xmm2, xmm3 -%else - mova m7, [pw_1] - mova m4, %1 - punpcklwd %1, m7 - punpckhwd m4, m7 - mova m5, %2 - punpcklwd %2, m7 - punpckhwd m5, m7 - movd m7, t3d - punpckldq m7, m7 - pmaddwd %1, m7 - pmaddwd %2, m7 - pmaddwd m4, m7 - pmaddwd m5, m7 - psrad %1, %3 - psrad %2, %3 - psrad m4, %3 - psrad m5, %3 - packssdw %1, m4 - packssdw %2, m5 -%endif %endmacro -%macro STORE_WORDS 5-9 -%if cpuflag(sse) +%macro STORE_WORDS 9 movd t0d, %1 psrldq %1, 4 movd t1d, %1 @@ -687,33 +609,12 @@ RET shr t1d, 16 mov [t2+%7*32], t0w mov [t2+%9*32], t1w -%else - movd t0d, %1 - psrlq %1, 32 - movd t1d, %1 - mov [t2+%2*32], t0w - mov [t2+%4*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%3*32], t0w - mov [t2+%5*32], t1w -%endif %endmacro %macro DEQUANT_STORE 1 -%if cpuflag(sse2) DEQUANT %1 STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7 STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15 -%else - DEQUANT m0, m1, %1 - STORE_WORDS m0, 0, 1, 4, 5 - STORE_WORDS m1, 2, 3, 6, 7 - - DEQUANT m2, m3, %1 - STORE_WORDS m2, 8, 9, 12, 13 - STORE_WORDS m3, 10, 11, 14, 15 -%endif %endmacro INIT_XMM sse2 -- 2.49.1 >From 956a47473728b5a45f29d1bc17a7cdd8010dfcb6 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 12:38:58 +0100 Subject: [PATCH 2/9] avcodec/x86/h264_idct: Remove redundant movsxdifnidn Only exported (i.e. 
cglobal) functions need it; stride is already sign-extended when it reaches any of the internal functions used here, so don't sign-extend again. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 985955d96a..6863dbcb4d 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -55,7 +55,7 @@ cextern pw_1 SECTION .text -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride %macro IDCT4_ADD 3 ; Load dct coeffs movq m0, [%2] @@ -145,7 +145,7 @@ SECTION .text IDCT8_1D [%1], [%1+ 64] %endmacro -; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride %macro IDCT8_ADD_SSE 4 IDCT8_1D_FULL %2 %if ARCH_X86_64 @@ -317,7 +317,6 @@ INIT_XMM cpuname INIT_MMX mmx h264_idct_add8_mmx_plane: - movsxdifnidn r3, r3d .nextblock: movzx r6, byte [scan8+r5] movzx r6, byte [r4+r6] @@ -372,9 +371,8 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str RET ; TODO: check rep ret after a function call -; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered +; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered h264_idct_dc_add8_mmxext: - movsxdifnidn r3, r3d movd m0, [r2 ] ; 0 0 X D mov word [r2+ 0], 0 punpcklwd m0, [r2+32] ; x X d D @@ -393,9 +391,8 @@ h264_idct_dc_add8_mmxext: ALIGN 16 INIT_XMM sse2 -; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride +; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = ptrdiff_t stride h264_add8x4_idct_sse2: - movsxdifnidn r3, r3d movq m0, [r2+ 0] movq m1, [r2+ 8] movq m2, [r2+16] -- 2.49.1 >From b206204fee005e39a13b642609534dd1a5bf8763 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 14:09:57 +0100 Subject: 
[PATCH 3/9] avcodec/x86/h264_idct: Avoid call where possible Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 49 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 6863dbcb4d..9405aa848a 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -316,29 +316,6 @@ INIT_XMM cpuname RET INIT_MMX mmx -h264_idct_add8_mmx_plane: -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - or r6w, word [r2] - test r6, r6 - jz .skipblock -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - IDCT4_ADD r0, r2, r3 -.skipblock: - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret - cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg movsxdifnidn r3, r3d @@ -367,9 +344,31 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str call h264_idct_add8_mmx_plane add r5, 4 - call h264_idct_add8_mmx_plane + TAIL_CALL h264_idct_add8_mmx_plane, 0 + +h264_idct_add8_mmx_plane: +.nextblock: + movzx r6d, byte [scan8+r5] + movzx r6d, byte [r4+r6] + or r6w, word [r2] + test r6d, r6d + jz .skipblock +%if ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [dst2q] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + IDCT4_ADD r0, r2, r3 +.skipblock: + inc r5d + add r2, 32 + test r5d, 3 + jnz .nextblock + rep ret - RET ; TODO: check rep ret after a function call ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered h264_idct_dc_add8_mmxext: -- 2.49.1 >From 64c493d604e5ebdbe188a276697b777e35d53952 Mon Sep 17 00:00:00 2001 From: 
Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 14:57:45 +0100 Subject: [PATCH 4/9] avutil/x86/x86inc: Use parentheses in has_epilogue Prevents surprises. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/x86/x86inc.asm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index e61d924bc1..0e80ebed43 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -609,7 +609,7 @@ DECLARE_REG 14, R13, 120 RESET_STACK_STATE %endmacro -%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs +%define has_epilogue (regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs) %macro RET 0 WIN64_RESTORE_XMM_INTERNAL @@ -658,7 +658,7 @@ DECLARE_REG 14, R13, 72 %endif %endmacro -%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required +%define has_epilogue (regs_used > 9 || stack_size > 0 || vzeroupper_required) %macro RET 0 %if stack_size_padded > 0 @@ -722,7 +722,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif %endmacro -%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required +%define has_epilogue (regs_used > 3 || stack_size > 0 || vzeroupper_required) %macro RET 0 %if stack_size_padded > 0 -- 2.49.1 >From 0a577a5927331f1e58973fb652e2c30445743593 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 15:23:31 +0100 Subject: [PATCH 5/9] avcodec/x86/h264_idct: Use tail call where advantageous It is possible on UNIX64. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 9405aa848a..4b9efd6d6d 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -55,6 +55,19 @@ cextern pw_1 SECTION .text +; %1=callee, %2=dst to jump to if tail call is impossible (can be empty, +; then no jmp is performed), %3=current iteration, %4=last iteration +%macro TAIL_CALL_IF_LAST 4 +%if (%3 == %4) && !has_epilogue + jmp %1 +%else + call %1 + %ifnempty %2 + jmp %2 + %endif +%endif +%endmacro + ; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride %macro IDCT4_ADD 3 ; Load dct coeffs @@ -424,7 +437,7 @@ h264_add8x4_idct_sse2: %else add r0, r0m %endif - call h264_add8x4_idct_sse2 + TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, , %1, 7 .cycle%1end: %if %1 < 7 add r2, 64 @@ -461,8 +474,7 @@ RET %else add r0, r0m %endif - call h264_add8x4_idct_sse2 - jmp .cycle%1end + TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 7 .try%1dc: movsx r0, word [r2 ] or r0w, word [r2+32] @@ -473,7 +485,7 @@ RET %else add r0, r0m %endif - call h264_idct_dc_add8_mmxext + TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 7 .cycle%1end: %if %1 < 7 add r2, 64 @@ -510,8 +522,7 @@ RET mov r0, [r0] add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] %endif - call h264_add8x4_idct_sse2 - jmp .cycle%1end + TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 3 .try%1dc: movsx r0, word [r2 ] or r0w, word [r2+32] @@ -524,7 +535,7 @@ RET mov r0, [r0] add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] %endif - call h264_idct_dc_add8_mmxext + TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 3 .cycle%1end: %if %1 == 1 add r2, 384+64 -- 2.49.1 >From 84eaebab60bf3e832e8a8465103b8425f1773c15 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 15:59:03 +0100 Subject: [PATCH 6/9] 
avcodec/x86/h264_idct: Zero with full-width stores Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 4b9efd6d6d..50647f2454 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -90,10 +90,15 @@ SECTION .text paddw m0, m6 IDCT4_1D w, 0, 1, 2, 3, 4, 5 pxor m7, m7 - movq [%2+ 0], m7 - movq [%2+ 8], m7 - movq [%2+16], m7 - movq [%2+24], m7 + %if mmsize == 16 + mova [%2+ 0], m7 + mova [%2+16], m7 + %else + movq [%2+ 0], m7 + movq [%2+ 8], m7 + movq [%2+16], m7 + movq [%2+24], m7 + %endif STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 lea %1, [%1+%3*2] -- 2.49.1 >From 23a00e85b832766596352b6e391effb3e7348870 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 17:26:47 +0100 Subject: [PATCH 7/9] avcodec/x86/h264_idct: Don't use MMX registers in ff_h264_luma_dc_dequant_idct_sse2 It is ABI compliant and gives a tiny speedup here (and is 16B smaller). 
Old benchmarks: h264_luma_dc_dequant_idct_8_c: 33.2 ( 1.00x) h264_luma_dc_dequant_idct_8_sse2: 16.0 ( 2.07x) New benchmarks: h264_luma_dc_dequant_idct_8_c: 33.0 ( 1.00x) h264_luma_dc_dequant_idct_8_sse2: 15.0 ( 2.20x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 56 +++++++++++++++++++----------------- tests/checkasm/h264dsp.c | 2 +- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 50647f2454..fe46107867 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -578,27 +578,23 @@ RET %endmacro %macro DEQUANT 1 - movd xmm4, t3d - movq xmm5, [pw_1] - pshufd xmm4, xmm4, 0 - movq2dq xmm0, m0 - movq2dq xmm1, m1 - movq2dq xmm2, m2 - movq2dq xmm3, m3 - punpcklwd xmm0, xmm5 - punpcklwd xmm1, xmm5 - punpcklwd xmm2, xmm5 - punpcklwd xmm3, xmm5 - pmaddwd xmm0, xmm4 - pmaddwd xmm1, xmm4 - pmaddwd xmm2, xmm4 - pmaddwd xmm3, xmm4 - psrad xmm0, %1 - psrad xmm1, %1 - psrad xmm2, %1 - psrad xmm3, %1 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 + movd m4, t3d + movq m5, [pw_1] + pshufd m4, m4, 0 + punpcklwd m0, m5 + punpcklwd m1, m5 + punpcklwd m2, m5 + punpcklwd m3, m5 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m4 + pmaddwd m3, m4 + psrad m0, %1 + psrad m1, %1 + psrad m2, %1 + psrad m3, %1 + packssdw m0, m1 + packssdw m2, m3 %endmacro %macro STORE_WORDS 9 @@ -625,19 +621,25 @@ RET %macro DEQUANT_STORE 1 DEQUANT %1 - STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7 - STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15 + STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15 %endmacro INIT_XMM sse2 cglobal h264_luma_dc_dequant_idct, 3, 4, 7 -INIT_MMX cpuname movq m3, [r1+24] movq m2, [r1+16] movq m1, [r1+ 8] movq m0, [r1+ 0] WALSH4_1D 0,1,2,3,4 - TRANSPOSE4x4W 0,1,2,3,4 + punpcklwd m0, m1 + punpcklwd m2, m3 + mova m4, m0 + punpckldq m0, m2 + punpckhdq m4, m2 + movhlps m1, m0 + movhlps m3, m4 + SWAP 2, 4 WALSH4_1D 
0,1,2,3,4 ; shift, tmp, output, qmul @@ -665,8 +667,8 @@ INIT_MMX cpuname inc t1d shr t3d, t0b sub t1d, t0d - movd xmm6, t1d - DEQUANT_STORE xmm6 + movd m6, t1d + DEQUANT_STORE m6 RET %ifdef __NASM_VER__ diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c index f05ae419fc..acf4f61ebb 100644 --- a/tests/checkasm/h264dsp.c +++ b/tests/checkasm/h264dsp.c @@ -336,7 +336,7 @@ static void check_idct_dequant(void) LOCAL_ALIGNED_16(int32_t, dst1_32, [16 * 16]); H264DSPContext h; int bit_depth, i, qmul; - declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_SSE2, void, int16_t *output, int16_t *input, int qmul); + declare_func(void, int16_t *output, int16_t *input, int qmul); qmul = rnd() % 4096; -- 2.49.1 >From a091ad3cc0af5c1ad14fbfdf4fe2b8fd93bd0dcc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 17:48:01 +0100 Subject: [PATCH 8/9] avcodec/x86/h264_idct: Deduplicate generating constant pw_1 is currently loaded in both codepaths. Generate it earlier instead. Gives tiny speedups (15 vs 14.5 cycles) and reduces codesize. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index fe46107867..d35d583ce7 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -51,7 +51,6 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 %endif cextern pw_32 -cextern pw_1 SECTION .text @@ -577,9 +576,9 @@ RET SWAP %1, %4, %3 %endmacro +; requires m5 to contain pw_1 %macro DEQUANT 1 movd m4, t3d - movq m5, [pw_1] pshufd m4, m4, 0 punpcklwd m0, m5 punpcklwd m1, m5 @@ -635,6 +634,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7 punpcklwd m0, m1 punpcklwd m2, m3 mova m4, m0 + pcmpeqw m5, m5 punpckldq m0, m2 punpckhdq m4, m2 movhlps m1, m0 @@ -652,6 +652,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7 %else DECLARE_REG_TMP 1,3,0,2 %endif + psrlw m5, 15 cmp t3d, 32767 jg .big_qmul -- 2.49.1 >From 67bcf93f397f3c6bba08f6872b2723c946ae3aed Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 26 Nov 2025 20:15:55 +0100 Subject: [PATCH 9/9] avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain seeds, because the input to packssdw no longer fits into an int16_t, leading to saturation, where the C code just truncates. I don't know whether the spec contains provisions that ensure that valid input must not exceed 16 bits or whether such inputs (even if invalid) can be triggered by the actual code and not only the test. This commit adapts the behavior of the function to the C reference code to fix the test. packssdw is avoided; instead, the lower words are directly transferred to GPRs to be written out. This has unfortunately led to a slight performance regression here (14.5 vs 15.1 cycles). Fixes issue #20835.
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_idct.asm | 64 ++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index d35d583ce7..47e4116f42 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -592,36 +592,58 @@ RET psrad m1, %1 psrad m2, %1 psrad m3, %1 - packssdw m0, m1 - packssdw m2, m3 %endmacro -%macro STORE_WORDS 9 - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 - psrldq %1, 4 - mov [t2+%2*32], t0w - mov [t2+%4*32], t1w - shr t0d, 16 - shr t1d, 16 +%macro STORE_WORDS 10 +%if ARCH_X86_64 + movq t0, %1 + movq t1, %2 + psrldq %1, 8 + psrldq %2, 8 mov [t2+%3*32], t0w - mov [t2+%5*32], t1w - movd t0d, %1 - psrldq %1, 4 - movd t1d, %1 - mov [t2+%6*32], t0w + mov [t2+%7*32], t1w + shr t0, 32 + shr t1, 32 + mov [t2+%4*32], t0w mov [t2+%8*32], t1w - shr t0d, 16 - shr t1d, 16 - mov [t2+%7*32], t0w + movq t0, %1 + movq t1, %2 + mov [t2+%5*32], t0w mov [t2+%9*32], t1w + shr t0, 32 + shr t1, 32 + mov [t2+%6*32], t0w + mov [t2+%10*32], t1w +%else + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%3*32], t0w + mov [t2+%7*32], t1w + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%4*32], t0w + mov [t2+%8*32], t1w + movd t0d, %1 + movd t1d, %2 + psrldq %1, 4 + psrldq %2, 4 + mov [t2+%5*32], t0w + mov [t2+%9*32], t1w + movd t0d, %1 + movd t1d, %2 + mov [t2+%6*32], t0w + mov [t2+%10*32], t1w +%endif %endmacro %macro DEQUANT_STORE 1 DEQUANT %1 - STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7 - STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15 + STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15 %endmacro INIT_XMM sse2 -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
