PR #22307 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307.patch
>From 451d53eb3db21189d9ca66a3a3b6684eb8e34efb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Fri, 27 Feb 2026 13:19:47 +0100 Subject: [PATCH 1/3] avcodec/x86/bswapdsp: Avoid register copies No change in benchmarks here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/bswapdsp.asm | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm index 31c6c48a21..12fd494ffe 100644 --- a/libavcodec/x86/bswapdsp.asm +++ b/libavcodec/x86/bswapdsp.asm @@ -33,10 +33,10 @@ SECTION .text ; %1 = aligned/unaligned %macro BSWAP_LOOPS 1 mov r3d, r2d - sar r2d, 3 + sar r3d, 3 jz .left4_%1 %if cpuflag(avx2) - sar r2d, 1 + sar r3d, 1 jz .left8_%1 %endif .loop8_%1: @@ -65,12 +65,11 @@ SECTION .text %endif add r0, mmsize*2 add r1, mmsize*2 - dec r2d + dec r3d jnz .loop8_%1 %if cpuflag(avx2) .left8_%1: - mov r2d, r3d - test r3d, 8 + test r2d, 8 jz .left4_%1 mov%1 m0, [r1] pshufb m0, m2 @@ -79,8 +78,7 @@ SECTION .text add r0, mmsize %endif .left4_%1: - mov r2d, r3d - test r3d, 4 + test r2d, 4 jz .left mov%1 xm0, [r1] %if cpuflag(ssse3) -- 2.52.0 >From 3db6adc772ebfadf0537390740883ab6feed2841 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Fri, 27 Feb 2026 13:24:04 +0100 Subject: [PATCH 2/3] avcodec/x86/bswapdsp: combine shifting, avoid check for AVX2 This avoids a check and a shift if >=8 elements are processed; it adds a check if < 8 elements are processed (which should be rare). No change in benchmarks here. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/bswapdsp.asm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm index 12fd494ffe..f89ca76cf1 100644 --- a/libavcodec/x86/bswapdsp.asm +++ b/libavcodec/x86/bswapdsp.asm @@ -33,11 +33,12 @@ SECTION .text ; %1 = aligned/unaligned %macro BSWAP_LOOPS 1 mov r3d, r2d +%if cpuflag(avx2) + sar r3d, 4 + jz .left8_%1 +%else sar r3d, 3 jz .left4_%1 -%if cpuflag(avx2) - sar r3d, 1 - jz .left8_%1 %endif .loop8_%1: mov%1 m0, [r1 + 0] -- 2.52.0 >From 311a587c7f2b90f54a04bb19505736cf9f304a48 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Fri, 27 Feb 2026 13:54:21 +0100 Subject: [PATCH 3/3] avcodec/x86/bswapdsp: Avoid aligned vs unaligned codepaths for AVX2 For modern CPUs (like those supporting AVX2) loads and stores using the unaligned versions of instructions are as fast as aligned ones if the address is aligned, so remove the aligned AVX2 version (and the alignment check) and just use the unaligned one. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/bswapdsp.asm | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm index f89ca76cf1..2b80d8a75e 100644 --- a/libavcodec/x86/bswapdsp.asm +++ b/libavcodec/x86/bswapdsp.asm @@ -100,10 +100,15 @@ SECTION .text ; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w); %macro BSWAP32_BUF 0 -%if cpuflag(ssse3)||cpuflag(avx2) +%if cpuflag(avx2) +cglobal bswap32_buf, 3,4,3 + vbroadcasti128 m2, [pb_bswap32] + BSWAP_LOOPS u +%else +%if cpuflag(ssse3) cglobal bswap32_buf, 3,4,3 mov r3, r1 - VBROADCASTI128 m2, [pb_bswap32] + mova m2, [pb_bswap32] %else cglobal bswap32_buf, 3,4,5 mov r3, r1 @@ -115,6 +120,7 @@ cglobal bswap32_buf, 3,4,5 jmp .left .start_align: BSWAP_LOOPS a +%endif .left: %if cpuflag(ssse3) test r2d, 2 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
