The branch, master has been updated
       via  81362b319ea7244d8d17110adfa59f10c7e78268 (commit)
       via  23720df371d7da9ea8074b01753497f7194973f4 (commit)
       via  6588bf368613a7fdc1970c77030a94f31fcfa970 (commit)
      from  f3346ca6f7d8249738ae0d9544b62be5c51ccfcd (commit)


- Log -----------------------------------------------------------------
commit 81362b319ea7244d8d17110adfa59f10c7e78268
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 16:28:51 2025 +0100
Commit:     James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000

    avcodec/x86/me_cmp: Avoid call on UNIX64
    
    The internal functions for calculating the Hadamard difference
    of two 8x8 blocks have no epilogue on UNIX64, so one can avoid
    the call altogether by placing the 8x8 function so that it falls
    straight through into the internal function.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
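
Below is a minimal standalone NASM sketch of the fall-through idea described
above (it is not the FFmpeg code; add_one and add_one_internal are
hypothetical names): the public entry point is placed directly in front of
the internal routine, so execution simply continues into it and no call/RET
pair is needed.  Assumes the SysV x86-64 ABI; assembles with nasm -felf64.

    section .text

    global add_one                ; hypothetical: int add_one(int x)
    add_one:                      ; public entry point, no prologue/epilogue of its own
            ; no "call add_one_internal" + "ret" here: execution simply
            ; falls through into the internal body placed right below
    add_one_internal:             ; internal label, still reachable via "call" elsewhere
            lea     eax, [rdi + 1]  ; argument in edi (low half of rdi), result in eax
            ret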

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 2a196d03bb..3ac8acee2c 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -121,12 +121,8 @@ SECTION .text
     movd            %3, %1
 %endmacro
 
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
-    call hadamard8x8_diff %+ SUFFIX
-    RET
-
-cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
+%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff16, 5, 6, %1, 2*mmsize*ARCH_X86_32
     call hadamard8x8_diff %+ SUFFIX
     mov            r5d, eax
 
@@ -151,9 +147,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
 .done:
     mov            eax, r5d
     RET
-%endmacro
 
-%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff, 4, 4, %1, 2*mmsize*ARCH_X86_32
+    TAIL_CALL hadamard8x8_diff %+ SUFFIX, 0
+
 ; r1, r2 and r3 are not clobbered in this function, so 16x16 can
 ; simply call this 2x2x (and that's why we access rsp+gprsize
 ; everywhere, which is rsp of calling function)
@@ -171,8 +168,6 @@ hadamard8x8_diff %+ SUFFIX:
     HSUM                        m0, m1, eax
     and                         eax, 0xFFFF
     ret
-
-hadamard8_16_wrapper %1, 2*ARCH_X86_32
 %endmacro
 
 INIT_XMM sse2

commit 23720df371d7da9ea8074b01753497f7194973f4
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 15:45:44 2025 +0100
Commit:     James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000

    avcodec/me_cmp: Remove MMXEXT hadamard diff functions
    
    The SSE2 and SSSE3 functions are now available everywhere,
    making the MMXEXT functions irrelevant.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index e123089ba3..2a196d03bb 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -112,7 +112,6 @@ SECTION .text
 ; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 ; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
 %macro HSUM 3
-%if cpuflag(sse2)
     movhlps         %2, %1
     paddusw         %1, %2
     pshuflw         %2, %1, 0xE
@@ -120,35 +119,6 @@ SECTION .text
     pshuflw         %2, %1, 0x1
     paddusw         %1, %2
     movd            %3, %1
-%elif cpuflag(mmxext)
-    pshufw          %2, %1, 0xE
-    paddusw         %1, %2
-    pshufw          %2, %1, 0x1
-    paddusw         %1, %2
-    movd            %3, %1
-%elif cpuflag(mmx)
-    mova            %2, %1
-    psrlq           %1, 32
-    paddusw         %1, %2
-    mova            %2, %1
-    psrlq           %1, 16
-    paddusw         %1, %2
-    movd            %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
-    mova [%1+mmsize*0], %2
-    mova [%1+mmsize*1], %3
-    mova [%1+mmsize*2], %4
-    mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
-    mova            %2, [%1+mmsize*0]
-    mova            %3, [%1+mmsize*1]
-    mova            %4, [%1+mmsize*2]
-    mova            %5, [%1+mmsize*3]
 %endmacro
 
 %macro hadamard8_16_wrapper 2
@@ -183,8 +153,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
     RET
 %endmacro
 
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
+%macro HADAMARD8_DIFF 1
+; r1, r2 and r3 are not clobbered in this function, so 16x16 can
+; simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling function)
 hadamard8x8_diff %+ SUFFIX:
     lea                          r0, [r3*3]
     DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
@@ -201,60 +173,8 @@ hadamard8x8_diff %+ SUFFIX:
     ret
 
 hadamard8_16_wrapper %1, 2*ARCH_X86_32
-%elif cpuflag(mmx)
-ALIGN 16
-; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
-;                               const uint8_t *src2, ptrdiff_t stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
-; can simply call this 2x2x (and that's why we access rsp+gprsize
-; everywhere, which is rsp of calling func
-hadamard8x8_diff %+ SUFFIX:
-    lea                          r0, [r3*3]
-
-    ; first 4x8 pixels
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova         [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
-    STORE4              rsp+gprsize, m0, m1, m2, m3
-    mova                         m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
-    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
-
-    ; second 4x8 pixels
-    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova         [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
-    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
-    mova                         m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
-
-    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize+0x60
-    mova         [rsp+gprsize+0x60], m0
-
-    LOAD4          rsp+gprsize     , m0, m1, m2, m3
-    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize
-    paddusw                      m0, [rsp+gprsize+0x60]
-
-    HSUM                         m0, m1, eax
-    and                         rax, 0xFFFF
-    ret
-
-hadamard8_16_wrapper 0, 13
-%endif
 %endmacro
 
-%if HAVE_ALIGNED_STACK == 0
-INIT_MMX mmxext
-HADAMARD8_DIFF
-%endif
-
 INIT_XMM sse2
 %if ARCH_X86_64
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 35abbbf7f5..3a8b46f4e1 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -77,7 +77,6 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t
     int ff_hadamard8_diff16_ ## cpu(MPVEncContext *s, const uint8_t *src1,      \
                                     const uint8_t *src2, ptrdiff_t stride, int h);
 
-hadamard_func(mmxext)
 hadamard_func(sse2)
 hadamard_func(ssse3)
 
@@ -116,11 +115,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if !HAVE_ALIGNED_STACK
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-#endif
-
         c->sad[1] = ff_sad8_mmxext;
 
         c->pix_abs[1][0] = ff_sad8_mmxext;

commit 6588bf368613a7fdc1970c77030a94f31fcfa970
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 25 15:29:19 2025 +0100
Commit:     James Almer <[email protected]>
CommitDate: Wed Nov 26 00:01:09 2025 +0000

    avcodec/x86/me_cmp: Avoid manual stack handling
    
    Use x86inc's stack alignment feature instead of allocating the stack
    manually*; this means that the code now automatically supports
    unaligned stacks, so that the SSE2 and SSSE3 functions become
    available everywhere.
    
    *: The code for this was also buggy: it left the stack pointer
    at 4 mod 8 on x64 for the mmxext version (before that version was
    disabled in 542765ce3eccbca587d54262a512cbdb1407230d), because it
    hardcoded 4 instead of using gprsize.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
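
Below is a minimal standalone NASM sketch of the alignment arithmetic behind
the removed manual handling (it is not the x86inc code; scratch_demo is a
hypothetical void function): on x86-64 the call instruction pushes an 8-byte
return address, so rsp is 8 mod 16 on entry, and the padding must account
for gprsize (8 bytes), not a hardcoded 4, before aligned SSE accesses to the
scratch space are safe.  Assembles with nasm -felf64.

    section .text

    global scratch_demo           ; hypothetical: void scratch_demo(void)
    scratch_demo:
            sub     rsp, 0x28     ; 0x20 bytes of scratch + 8 (gprsize) for the
                                  ; pushed return address: rsp is now 16-byte aligned
            movdqa  [rsp], xmm0   ; aligned store is safe; padding with 4 instead of 8
                                  ; would leave rsp at 4 mod 8 and this would fault
            add     rsp, 0x28
            ret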

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 4545eae276..e123089ba3 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -152,23 +152,11 @@ SECTION .text
 %endmacro
 
 %macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB            rsp, pad
-%endif
+cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
     call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
-    ADD            rsp, pad
-%endif
     RET
 
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB            rsp, pad
-%endif
-
+cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
     call hadamard8x8_diff %+ SUFFIX
     mov            r5d, eax
 
@@ -192,9 +180,6 @@ cglobal hadamard8_diff16, 5, 6, %1
 
 .done:
     mov            eax, r5d
-%ifndef m8
-    ADD            rsp, pad
-%endif
     RET
 %endmacro
 
@@ -215,7 +200,7 @@ hadamard8x8_diff %+ SUFFIX:
     and                         eax, 0xFFFF
     ret
 
-hadamard8_16_wrapper %1, 3
+hadamard8_16_wrapper %1, 2*ARCH_X86_32
 %elif cpuflag(mmx)
 ALIGN 16
 ; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
@@ -261,7 +246,7 @@ hadamard8x8_diff %+ SUFFIX:
     and                         rax, 0xFFFF
     ret
 
-hadamard8_16_wrapper 0, 14
+hadamard8_16_wrapper 0, 13
 %endif
 %endmacro
 
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d4503eef3b..35abbbf7f5 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -146,10 +146,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][2] = ff_sad16_y2_sse2;
         c->pix_abs[0][3] = ff_sad16_xy2_sse2;
 
-#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
         if (avctx->codec_id != AV_CODEC_ID_SNOW) {
             c->sad[0]        = ff_sad16_sse2;
 
@@ -179,10 +177,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->nsse[1]           = nsse8_ssse3;
 
         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
-#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
-#endif
     }
 #endif
 }

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/x86/me_cmp.asm    | 116 +++----------------------------------------
 libavcodec/x86/me_cmp_init.c |  10 ----
 2 files changed, 8 insertions(+), 118 deletions(-)


hooks/post-receive
-- 
