PR #22403 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22403 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22403.patch
>From eb9b2b83e71e56463ebc5516fa4912bbc7e84b68 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 18:39:28 +0100 Subject: [PATCH 1/3] avcodec/x86/pngdsp: Don't use mmx register in ff_add_bytes_l2_sse2() This change has no measurable impact on performance here; it is intended to avoid unpredictable behavior with floating point operations like the one that led to commit 57a29f2e7dd2374a1df27316c6cf7c0225e86758. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/pngdsp.asm | 7 ++++--- tests/checkasm/png.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index 8ff49565d3..5fa3780185 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -57,9 +57,10 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i and waq, ~7 jmp .end_l .loop_l: - movq mm0, [src1q+iq] - paddb mm0, [src2q+iq] - movq [dstq+iq ], mm0 + movq m0, [src2q+iq] + movq m1, [src1q+iq] + paddb m0, m1 + movq [dstq+iq ], m0 add iq, 8 .end_l: cmp iq, waq diff --git a/tests/checkasm/png.c b/tests/checkasm/png.c index 0807d3ab7b..0fe049cf9c 100644 --- a/tests/checkasm/png.c +++ b/tests/checkasm/png.c @@ -38,8 +38,8 @@ static void check_add_bytes_l2(const PNGDSPContext *c) LOCAL_ALIGNED_16(uint8_t, dst1, [BUF_SIZE]); LOCAL_ALIGNED_16(uint8_t, src, [2], [BUF_SIZE]); - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t * dst, const uint8_t *src1, - const uint8_t *src2, int w); + declare_func(void, uint8_t * dst, const uint8_t *src1, + const uint8_t *src2, int w); randomize_buf(dst0, BUF_SIZE); memcpy(dst1, dst0, BUF_SIZE); -- 2.52.0 >From dd164f059d54fb9409b7854181abacb6e77990e6 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 19:55:35 +0100 Subject: [PATCH 2/3] avcodec/x86/pngdsp: Avoid jump Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/pngdsp.asm | 5 +++-- 1 file changed, 3 insertions(+), 
2 deletions(-) diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index 5fa3780185..009e1034df 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -39,7 +39,8 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i ; vector loop mov wq, waq and waq, ~(mmsize*2-1) - jmp .end_v + jz .tail + .loop_v: movu m0, [src2q+iq] movu m1, [src2q+iq+mmsize] @@ -48,11 +49,11 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i movu [dstq+iq ], m0 movu [dstq+iq+mmsize], m1 add iq, mmsize*2 -.end_v: cmp iq, waq jl .loop_v ; vector loop +.tail: mov waq, wq and waq, ~7 jmp .end_l -- 2.52.0 >From ceda1d2389ec7ee622fcb0bb85cafb98827f0304 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 20:03:11 +0100 Subject: [PATCH 3/3] avcodec/x86/pngdsp: Don't use 64bit unnecessarily The automatic zero-extensions when assigning a 32bit register make using 64bits unnecessary. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/pngdsp.asm | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index 009e1034df..e0a3b602de 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -31,14 +31,11 @@ SECTION .text INIT_XMM sse2 cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i -%if ARCH_X86_64 - movsxd waq, wad -%endif - xor iq, iq + xor id, id ; vector loop - mov wq, waq - and waq, ~(mmsize*2-1) + mov wd, wad + and wad, ~(mmsize*2-1) jz .tail .loop_v: @@ -48,23 +45,23 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i paddb m1, [src1q+iq+mmsize] movu [dstq+iq ], m0 movu [dstq+iq+mmsize], m1 - add iq, mmsize*2 - cmp iq, waq + add id, mmsize*2 + cmp id, wad jl .loop_v ; vector loop .tail: - mov waq, wq - and waq, ~7 + mov wad, wd + and wad, ~7 jmp .end_l .loop_l: movq m0, [src2q+iq] movq m1, [src1q+iq] paddb m0, m1 movq [dstq+iq ], m0 - add iq, 8 + add id, 8 .end_l: - cmp 
iq, waq + cmp id, wad jl .loop_l ; scalar loop for leftover @@ -73,9 +70,9 @@ cglobal add_bytes_l2, 4, 6, 2, dst, src1, src2, wa, w, i mov wab, [src1q+iq] add wab, [src2q+iq] mov [dstq+iq], wab - inc iq + inc id .end_s: - cmp iq, wq + cmp id, wd jl .loop_s RET -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
