bswapdsp: Minor improvements (PR #22307)

mkver via ffmpeg-devel Fri, 27 Feb 2026 05:04:30 -0800

PR #22307 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307.patch



>From 451d53eb3db21189d9ca66a3a3b6684eb8e34efb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 27 Feb 2026 13:19:47 +0100
Subject: [PATCH 1/3] avcodec/x86/bswapdsp: Avoid register copies

No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/bswapdsp.asm | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 31c6c48a21..12fd494ffe 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -33,10 +33,10 @@ SECTION .text
 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS  1
     mov      r3d, r2d
-    sar      r2d, 3
+    sar      r3d, 3
     jz       .left4_%1
 %if cpuflag(avx2)
-    sar      r2d, 1
+    sar      r3d, 1
     jz       .left8_%1
 %endif
 .loop8_%1:
@@ -65,12 +65,11 @@ SECTION .text
 %endif
     add      r0, mmsize*2
     add      r1, mmsize*2
-    dec      r2d
+    dec      r3d
     jnz      .loop8_%1
 %if cpuflag(avx2)
 .left8_%1:
-    mov      r2d, r3d
-    test     r3d, 8
+    test     r2d, 8
     jz       .left4_%1
     mov%1    m0, [r1]
     pshufb   m0, m2
@@ -79,8 +78,7 @@ SECTION .text
     add r0, mmsize
 %endif
 .left4_%1:
-    mov      r2d, r3d
-    test     r3d, 4
+    test     r2d, 4
     jz       .left
     mov%1    xm0, [r1]
 %if cpuflag(ssse3)
-- 
2.52.0


>From 3db6adc772ebfadf0537390740883ab6feed2841 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 27 Feb 2026 13:24:04 +0100
Subject: [PATCH 2/3] avcodec/x86/bswapdsp: combine shifting, avoid check for
 AVX2

This avoids a check and a shift if >=8 elements are processed;
it adds a check if < 8 elements are processed (which should
be rare).
No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/bswapdsp.asm | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 12fd494ffe..f89ca76cf1 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -33,11 +33,12 @@ SECTION .text
 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS  1
     mov      r3d, r2d
+%if cpuflag(avx2)
+    sar      r3d, 4
+    jz       .left8_%1
+%else
     sar      r3d, 3
     jz       .left4_%1
-%if cpuflag(avx2)
-    sar      r3d, 1
-    jz       .left8_%1
 %endif
 .loop8_%1:
     mov%1    m0, [r1 +  0]
-- 
2.52.0


>From 311a587c7f2b90f54a04bb19505736cf9f304a48 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 27 Feb 2026 13:54:21 +0100
Subject: [PATCH 3/3] avcodec/x86/bswapdsp: Avoid aligned vs unaligned
 codepaths for AVX2

For modern cpus (like those supporting AVX2) loads and stores
using the unaligned versions of instructions are as fast
as aligned ones if the address is aligned, so remove
the aligned AVX2 version (and the alignment check) and just
remove the unaligned one.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/bswapdsp.asm | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index f89ca76cf1..2b80d8a75e 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -100,10 +100,15 @@ SECTION .text
 
 ; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
 %macro BSWAP32_BUF 0
-%if cpuflag(ssse3)||cpuflag(avx2)
+%if cpuflag(avx2)
+cglobal bswap32_buf, 3,4,3
+    vbroadcasti128  m2, [pb_bswap32]
+    BSWAP_LOOPS  u
+%else
+%if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
     mov      r3, r1
-    VBROADCASTI128  m2, [pb_bswap32]
+    mova     m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
     mov      r3, r1
@@ -115,6 +120,7 @@ cglobal bswap32_buf, 3,4,5
     jmp      .left
 .start_align:
     BSWAP_LOOPS  a
+%endif
 .left:
 %if cpuflag(ssse3)
     test     r2d, 2
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avcodec/x86/bswapdsp: Minor improvements (PR #22307)

Reply via email to