This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit c9e056bc85c2364f8d4df9408acad6da501ace99 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Wed Mar 4 19:08:21 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Mar 9 10:17:26 2026 +0100 avutil/x86/pixelutils: Remove pointless AVX2 sad32x32 functions Memory operands of VEX encoded instructions generally have no alignment requirement and so can be used in the case where both inputs are unaligned, too. Furthermore, unaligned load instructions are as fast as aligned loads (from aligned addresses) for modern cpus, in particular those with AVX2. Therefore it makes no sense to have three different AVX2 sad32x32 functions. So remove two of them (the remaining one is the same as the old one where src1 was aligned and src2 was not). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/x86/pixelutils.asm | 60 ++++------------------------------------- libavutil/x86/pixelutils_init.c | 10 +------ 2 files changed, 6 insertions(+), 64 deletions(-) diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm index 0bcccb51f5..a80202ef75 100644 --- a/libavutil/x86/pixelutils.asm +++ b/libavutil/x86/pixelutils.asm @@ -241,70 +241,24 @@ SAD_XMM_32x32 u ; const uint8_t *src2, ptrdiff_t stride2); ;------------------------------------------------------------------------------- INIT_YMM avx2 -cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2 - pxor m0, m0 - mov r4d, 32/4 - lea r5, [stride1q * 3] - lea r6, [stride2q * 3] - -.loop: - movu m1, [src1q] ; row 0 of pix0 - movu m2, [src2q] ; row 0 of pix1 - movu m3, [src1q + stride1q] ; row 1 of pix0 - movu m4, [src2q + stride2q] ; row 1 of pix1 - - psadbw m1, m2 - psadbw m3, m4 - paddd m0, m1 - paddd m0, m3 - - movu m1, [src1q + 2 * stride1q] ; row 2 of pix0 - movu m2, [src2q + 2 * stride2q] ; row 2 of pix1 - movu m3, [src1q + r5] ; row 3 of pix0 - movu m4, [src2q + r6] ; row 3 of pix1 - - psadbw m1, m2 - psadbw m3, m4 - paddd m0, m1 - paddd m0, m3 - - lea src2q, [src2q + 4 * stride2q] - lea src1q, [src1q + 4 * stride1q] - - dec r4d - jnz .loop - - vextracti128 xm1, m0, 1 - paddd xm0, xm1 - pshufd xm1, xm0, 2 - paddd xm0, xm1 - movd eax, xm0 - RET - -;------------------------------------------------------------------------------- -; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, -; const uint8_t *src2, ptrdiff_t stride2); -;------------------------------------------------------------------------------- -%macro SAD_AVX2_32x32 1 -INIT_YMM avx2 -cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 +cglobal pixelutils_sad_32x32, 4,7,3, src1, stride1, src2, stride2 pxor m0, m0 mov r4d, 32/4 lea r5, [stride1q * 3] lea r6, [stride2q * 3] .loop: - mov%1 m1, [src2q] ; row 0 of pix1 + movu m1, [src2q] ; row 0 of pix1 psadbw m1, [src1q] - mov%1 m2, [src2q + stride2q] ; row 1 of pix1 + movu m2, [src2q + stride2q] ; row 1 of pix1 psadbw m2, [src1q + stride1q] paddd m0, m1 paddd m0, m2 - mov%1 m1, [src2q + 2 * stride2q] ; row 2 of pix1 + movu m1, [src2q + 2 * stride2q] ; row 2 of pix1 psadbw m1, [src1q + 2 * stride1q] - mov%1 m2, [src2q + r6] ; row 3 of pix1 + movu m2, [src2q + r6] ; row 3 of pix1 psadbw m2, [src1q + r5] paddd m0, m1 @@ -322,8 +276,4 @@ cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 paddd xm0, xm1 movd eax, xm0 RET -%endmacro - -SAD_AVX2_32x32 a -SAD_AVX2_32x32 u %endif diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c index c3c0662414..57bdeb8cdb 100644 --- a/libavutil/x86/pixelutils_init.c +++ b/libavutil/x86/pixelutils_init.c @@ -40,10 +40,6 @@ int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) { @@ -76,10 +72,6 @@ void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) } if (EXTERNAL_AVX2_FAST(cpu_flags)) { - switch (aligned) { - case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned - case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned - case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned - } + sad[4] = ff_pixelutils_sad_32x32_avx2; } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
