pixelutils: Remove pointless AVX2 sad32x32 functions

Andreas Rheinhardt via ffmpeg-cvslog Mon, 09 Mar 2026 03:06:55 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit c9e056bc85c2364f8d4df9408acad6da501ace99
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Mar 4 19:08:21 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Mar 9 10:17:26 2026 +0100

    avutil/x86/pixelutils: Remove pointless AVX2 sad32x32 functions
    
    Memory operands of VEX encoded instructions generally have
    no alignment requirement and so can be used in the case where
    both inputs are unaligned, too. Furthermore, unaligned load
    instructions are as fast as aligned loads (from aligned addresses)
    for modern cpus, in particular those with AVX2.
    
    Therefore it makes no sense to have three different AVX2 sad32x32
    functions. So remove two of them (the remaining one is the same
    as the old one where src1 was aligned and src2 was not).
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/x86/pixelutils.asm    | 60 ++++-------------------------------------
 libavutil/x86/pixelutils_init.c | 10 +------
 2 files changed, 6 insertions(+), 64 deletions(-)

diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 0bcccb51f5..a80202ef75 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -241,70 +241,24 @@ SAD_XMM_32x32 u
 ;                                  const uint8_t *src2, ptrdiff_t stride2);
 
 ;-------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
-    pxor            m0, m0
-    mov             r4d, 32/4
-    lea             r5, [stride1q * 3]
-    lea             r6, [stride2q * 3]
-
-.loop:
-    movu           m1, [src1q]               ; row 0 of pix0
-    movu           m2, [src2q]               ; row 0 of pix1
-    movu           m3, [src1q + stride1q]    ; row 1 of pix0
-    movu           m4, [src2q + stride2q]    ; row 1 of pix1
-
-    psadbw         m1, m2
-    psadbw         m3, m4
-    paddd          m0, m1
-    paddd          m0, m3
-
-    movu           m1, [src1q + 2 * stride1q] ; row 2 of pix0
-    movu           m2, [src2q + 2 * stride2q] ; row 2 of pix1
-    movu           m3, [src1q + r5]           ; row 3 of pix0
-    movu           m4, [src2q + r6]           ; row 3 of pix1
-
-    psadbw         m1, m2
-    psadbw         m3, m4
-    paddd          m0, m1
-    paddd          m0, m3
-
-    lea            src2q,     [src2q + 4 * stride2q]
-    lea            src1q,     [src1q + 4 * stride1q]
-
-    dec            r4d
-    jnz           .loop
-
-    vextracti128   xm1, m0, 1
-    paddd          xm0, xm1
-    pshufd         xm1, xm0, 2
-    paddd          xm0, xm1
-    movd           eax, xm0
-    RET
-
-;-------------------------------------------------------------------------------
-; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-;                                       const uint8_t *src2, ptrdiff_t stride2);
-;-------------------------------------------------------------------------------
-%macro SAD_AVX2_32x32 1
-INIT_YMM avx2
-cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
+cglobal pixelutils_sad_32x32, 4,7,3, src1, stride1, src2, stride2
     pxor           m0, m0
     mov            r4d, 32/4
     lea            r5, [stride1q * 3]
     lea            r6, [stride2q * 3]
 
 .loop:
-    mov%1          m1, [src2q]                ; row 0 of pix1
+    movu           m1, [src2q]                ; row 0 of pix1
     psadbw         m1, [src1q]
-    mov%1          m2, [src2q + stride2q]     ; row 1 of pix1
+    movu           m2, [src2q + stride2q]     ; row 1 of pix1
     psadbw         m2, [src1q + stride1q]
 
     paddd          m0, m1
     paddd          m0, m2
 
-    mov%1          m1, [src2q + 2 * stride2q] ; row 2 of pix1
+    movu           m1, [src2q + 2 * stride2q] ; row 2 of pix1
     psadbw         m1, [src1q + 2 * stride1q]
-    mov%1          m2, [src2q + r6]           ; row 3 of pix1
+    movu           m2, [src2q + r6]           ; row 3 of pix1
     psadbw         m2, [src1q + r5]
 
     paddd          m0, m1
@@ -322,8 +276,4 @@ cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
     paddd          xm0, xm1
     movd           eax, xm0
     RET
-%endmacro
-
-SAD_AVX2_32x32 a
-SAD_AVX2_32x32 u
 %endif
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
index c3c0662414..57bdeb8cdb 100644
--- a/libavutil/x86/pixelutils_init.c
+++ b/libavutil/x86/pixelutils_init.c
@@ -40,10 +40,6 @@ int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
 
 int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
                                  const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
 
 void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
 {
@@ -76,10 +72,6 @@ void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-        switch (aligned) {
-        case 0: sad[4] = ff_pixelutils_sad_32x32_avx2;   break; // src1 unaligned, src2 unaligned
-        case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1   aligned, src2 unaligned
-        case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1   aligned, src2   aligned
-        }
+        sad[4] = ff_pixelutils_sad_32x32_avx2;
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/08: avutil/x86/pixelutils: Remove pointless AVX2 sad32x32 functions

Reply via email to