The branch, master has been updated
via 81362b319ea7244d8d17110adfa59f10c7e78268 (commit)
via 23720df371d7da9ea8074b01753497f7194973f4 (commit)
via 6588bf368613a7fdc1970c77030a94f31fcfa970 (commit)
from f3346ca6f7d8249738ae0d9544b62be5c51ccfcd (commit)
- Log -----------------------------------------------------------------
commit 81362b319ea7244d8d17110adfa59f10c7e78268
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 16:28:51 2025 +0100
Commit: James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000
avcodec/x86/me_cmp: Avoid call on UNIX64
The internal functions for calculating the hadamard difference
of two 8x8 blocks have no epilogue on UNIX64, so one can avoid
the call altogether by placing the 8x8 function so that it directly
falls into the internal function.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 2a196d03bb..3ac8acee2c 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -121,12 +121,8 @@ SECTION .text
movd %3, %1
%endmacro
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
- call hadamard8x8_diff %+ SUFFIX
- RET
-
-cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
+%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff16, 5, 6, %1, 2*mmsize*ARCH_X86_32
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
@@ -151,9 +147,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
.done:
mov eax, r5d
RET
-%endmacro
-%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff, 4, 4, %1, 2*mmsize*ARCH_X86_32
+ TAIL_CALL hadamard8x8_diff %+ SUFFIX, 0
+
; r1, r2 and r3 are not clobbered in this function, so 16x16 can
; simply call this 2x2x (and that's why we access rsp+gprsize
; everywhere, which is rsp of calling function)
@@ -171,8 +168,6 @@ hadamard8x8_diff %+ SUFFIX:
HSUM m0, m1, eax
and eax, 0xFFFF
ret
-
-hadamard8_16_wrapper %1, 2*ARCH_X86_32
%endmacro
INIT_XMM sse2
commit 23720df371d7da9ea8074b01753497f7194973f4
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 15:45:44 2025 +0100
Commit: James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000
avcodec/me_cmp: Remove MMXEXT hadamard diff functions
The SSE2 and SSSE3 functions are now available everywhere,
making the MMXEXT functions irrelevant.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index e123089ba3..2a196d03bb 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -112,7 +112,6 @@ SECTION .text
; about 100k on extreme inputs. But that's very unlikely to occur in natural
video,
; and it's even more unlikely to not have any alternative mvs/modes with lower
cost.
%macro HSUM 3
-%if cpuflag(sse2)
movhlps %2, %1
paddusw %1, %2
pshuflw %2, %1, 0xE
@@ -120,35 +119,6 @@ SECTION .text
pshuflw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
-%elif cpuflag(mmxext)
- pshufw %2, %1, 0xE
- paddusw %1, %2
- pshufw %2, %1, 0x1
- paddusw %1, %2
- movd %3, %1
-%elif cpuflag(mmx)
- mova %2, %1
- psrlq %1, 32
- paddusw %1, %2
- mova %2, %1
- psrlq %1, 16
- paddusw %1, %2
- movd %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
- mova [%1+mmsize*0], %2
- mova [%1+mmsize*1], %3
- mova [%1+mmsize*2], %4
- mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
- mova %2, [%1+mmsize*0]
- mova %3, [%1+mmsize*1]
- mova %4, [%1+mmsize*2]
- mova %5, [%1+mmsize*3]
%endmacro
%macro hadamard8_16_wrapper 2
@@ -183,8 +153,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
RET
%endmacro
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
+%macro HADAMARD8_DIFF 1
+; r1, r2 and r3 are not clobbered in this function, so 16x16 can
+; simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling function)
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
@@ -201,60 +173,8 @@ hadamard8x8_diff %+ SUFFIX:
ret
hadamard8_16_wrapper %1, 2*ARCH_X86_32
-%elif cpuflag(mmx)
-ALIGN 16
-; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
-; const uint8_t *src2, ptrdiff_t stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
-; can simply call this 2x2x (and that's why we access rsp+gprsize
-; everywhere, which is rsp of calling func
-hadamard8x8_diff %+ SUFFIX:
- lea r0, [r3*3]
-
- ; first 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
- STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
-
- ; second 4x8 pixels
- DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
- HADAMARD8
- mova [rsp+gprsize+0x60], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
- mova m7, [rsp+gprsize+0x60]
- TRANSPOSE4x4W 4, 5, 6, 7, 0
-
- LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize+0x60
- mova [rsp+gprsize+0x60], m0
-
- LOAD4 rsp+gprsize , m0, m1, m2, m3
- LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
- HADAMARD8
- ABS_SUM_8x8_32 rsp+gprsize
- paddusw m0, [rsp+gprsize+0x60]
-
- HSUM m0, m1, eax
- and rax, 0xFFFF
- ret
-
-hadamard8_16_wrapper 0, 13
-%endif
%endmacro
-%if HAVE_ALIGNED_STACK == 0
-INIT_MMX mmxext
-HADAMARD8_DIFF
-%endif
-
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 35abbbf7f5..3a8b46f4e1 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -77,7 +77,6 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t
*pix1, const uint8_t
int ff_hadamard8_diff16_ ## cpu(MPVEncContext *s, const uint8_t *src1,
\
const uint8_t *src2, ptrdiff_t stride, int
h);
-hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
@@ -116,11 +115,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if !HAVE_ALIGNED_STACK
- c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
- c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-#endif
-
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
commit 6588bf368613a7fdc1970c77030a94f31fcfa970
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 15:29:19 2025 +0100
Commit: James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000
avcodec/x86/me_cmp: Avoid manual stack handling
Use x86inc's stack alignment feature instead of allocating the stack
manually*; this means that this code now also automatically supports
unaligned stacks, so that the SSE2 and SSSE3 functions will now be
available everywhere.
*: The code for this was also buggy: it resulted in the stack pointer
being 4 mod 8 on x64 for the mmxext version before it was disabled
in 542765ce3eccbca587d54262a512cbdb1407230d, because it hardcoded 4
instead of using gprsize.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 4545eae276..e123089ba3 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -152,23 +152,11 @@ SECTION .text
%endmacro
%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
+cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
- ADD rsp, pad
-%endif
RET
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
- %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
- SUB rsp, pad
-%endif
-
+cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
@@ -192,9 +180,6 @@ cglobal hadamard8_diff16, 5, 6, %1
.done:
mov eax, r5d
-%ifndef m8
- ADD rsp, pad
-%endif
RET
%endmacro
@@ -215,7 +200,7 @@ hadamard8x8_diff %+ SUFFIX:
and eax, 0xFFFF
ret
-hadamard8_16_wrapper %1, 3
+hadamard8_16_wrapper %1, 2*ARCH_X86_32
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
@@ -261,7 +246,7 @@ hadamard8x8_diff %+ SUFFIX:
and rax, 0xFFFF
ret
-hadamard8_16_wrapper 0, 14
+hadamard8_16_wrapper 0, 13
%endif
%endmacro
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d4503eef3b..35abbbf7f5 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -146,10 +146,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->pix_abs[0][3] = ff_sad16_xy2_sse2;
-#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
if (avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
@@ -179,10 +177,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
c->nsse[1] = nsse8_ssse3;
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
-#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
-#endif
}
#endif
}
-----------------------------------------------------------------------
Summary of changes:
libavcodec/x86/me_cmp.asm | 116 +++----------------------------------------
libavcodec/x86/me_cmp_init.c | 10 ----
2 files changed, 8 insertions(+), 118 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]