me_cmp: Reduce amount of registers used

Andreas Rheinhardt via ffmpeg-cvslog Fri, 03 Jul 2026 07:58:53 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 67dfae9c5fad698ea4a7fa4f81b1fe4be1f1f4fd
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Jun 30 15:08:34 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Jul 3 16:09:57 2026 +0200

    avcodec/x86/me_cmp: Reduce amount of registers used
    
    One can avoid one scratch register in MEDIAN_ABS_ACC
    (by clobbering topleft which isn't needed later on).
    Furthermore, the earlier code requested more registers
    in cglobal than it actually used: median_sad8 only needs
    seven registers, median_sad16 only 11. One of these
    registers is actually unclobbered (read-only), so
    median_sad8 does not need to save and restore any
    non-volatile registers on Win64.
    
    Furthermore, the restriction of median_sad8 to x64 can now
    be lifted.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 90 ++++++++++++++++++++++----------------------
 libavcodec/x86/me_cmp_init.c |  2 +-
 2 files changed, 45 insertions(+), 47 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 5408946245..f06d703020 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -814,7 +814,6 @@ VSAD_APPROX 16, u
 ;int ff_median_sad_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
 ;                        ptrdiff_t stride, int h);
 ;---------------------------------------------------------------------
-%if ARCH_X86_64
 
 ; Load one row of 16 pixels from pix1/pix2 and compute V = pix1 - pix2 as
 ; int16 words.  No zero register is needed: both byte vectors are unpacked
@@ -849,22 +848,22 @@ VSAD_APPROX 16, u
 %endmacro
 
 ; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
-; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).  The top predictor
-; %2 is not needed afterwards and is clobbered.
-; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
-; %6, %7: temporaries
-%macro MEDIAN_ABS_ACC 7
+; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).
+; %1: accumulator, %2: top (clobbered), %3: left, %4: topleft (clobbered),
+; %5: values being predicted, %6: scratch register
+%macro MEDIAN_ABS_ACC 6
     paddw     %6, %2, %3        ; top + left
     psubw     %6, %4            ; top + left - topleft
-    pminsw    %7, %2, %3        ; min(top, left)
+    pminsw    %4, %2, %3        ; min(top, left)
     pmaxsw    %2, %3            ; max(top, left)
     pminsw    %2, %6
-    pmaxsw    %7, %2            ; mid_pred(top, left, top + left - topleft)
-    psubw     %6, %5, %7
-    pabsw     %6, %6
-    paddw     %1, %6
+    pmaxsw    %4, %2            ; mid_pred(top, left, top + left - topleft)
+    psubw     %4, %5
+    pabsw     %4, %4
+    paddw     %1, %4
 %endmacro
 
+%if ARCH_X86_64
 ; Accumulate one row's cost from the previous and current row vectors.
 ; %1-%4: previous row V (columns 0-7, 8-15, 0-6, 7-14)
 ; %5-%8: current  row V (columns 0-7, 8-15, 0-6, 7-14), loaded here
@@ -872,114 +871,113 @@ VSAD_APPROX 16, u
 ; predictors %3/%4 are consumed by MEDIAN_ABS_ACC, but they belong to the
 ; previous row and are reloaded before being needed again.
 %macro PROCESS_ROW16 8
-    LOAD_V16  %5, %6, %7, %8, m14
+    LOAD_V16  %5, %6, %7, %8, m10
     add       pix1q, strideq
     add       pix2q, strideq
     ; columns 0-7; no special case for the first element lacking
     ; left and top-left predictors is needed here: The left vectors
     ; have 0 as first element which leads to the desired result.
-    MEDIAN_ABS_ACC m0, %1, %7, %3, %5, m11, m12
+    MEDIAN_ABS_ACC m0, %1, %7, %3, %5, m9
     ; columns 8-15
-    MEDIAN_ABS_ACC m0, %2, %8, %4, %6, m11, m12
+    MEDIAN_ABS_ACC m0, %2, %8, %4, %6, m9
 %endmacro
 
 ; Register layout:
 ;   m0  accumulator
-;   m3-m6   one row's V (columns 0-7, 8-15, 0-6, 7-14)
-;   m7-m10  the other row's V (columns 0-7, 8-15, 0-6, 7-14)
-;   m11, m12 temporaries
-;   m14 scratch register for LOAD_V16
+;   m1-m4   one row's V (columns 0-7, 8-15, 0-6, 7-14)
+;   m5-m8  the other row's V (columns 0-7, 8-15, 0-6, 7-14)
+;   m9 scratch register
+;   m10 dummy register (unclobbered)
 ; The loop is unrolled by two so the two register sets alternate the roles of
 ; previous and current row, which removes the per-row register copies.
 %macro MEDIAN_SAD16 0
-cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
-    LOAD_V16  m3, m4, m5, m6, m14
+cglobal median_sad16, 5, 5, 10, v, pix1, pix2, stride, h
+    LOAD_V16  m1, m2, m3, m4, m10
     add       pix1q, strideq
     add       pix2q, strideq
 
     ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
-    psubw     m0, m5, m3
+    psubw     m0, m3, m1
+    psubw     m5, m4, m2
     pabsw     m0, m0
-    psubw     m1, m6, m4
-    pabsw     m1, m1
-    paddw     m0, m1
+    pabsw     m5, m5
+    paddw     m0, m5
 
     sub       hd, 1
     jle       .end
 .loop:
-    PROCESS_ROW16 m3, m4, m5, m6, m7, m8, m9, m10
+    PROCESS_ROW16 m1, m2, m3, m4, m5, m6, m7, m8
     sub       hd, 1
     jle       .end
-    PROCESS_ROW16 m7, m8, m9, m10, m3, m4, m5, m6
+    PROCESS_ROW16 m5, m6, m7, m8, m1, m2, m3, m4
     sub       hd, 1
     jg        .loop
 .end:
     ; the per-word sums are at most 2 * 16 * 510, but their total may need
     ; more than 16 bits: widen to dwords before the horizontal sum
     pxor      m1, m1
-    punpckhwd m12, m0, m1
+    punpckhwd m2, m0, m1
     punpcklwd m0, m1
-    paddd     m0, m12
-    HADDD     m0, m12
+    paddd     m0, m2
+    HADDD     m0, m2
     movd      eax, m0
     RET
 %endmacro
 
 INIT_XMM ssse3
 MEDIAN_SAD16
+%endif ; ARCH_X86_64
 
 ; Accumulate one row's cost from the previous and current row vectors.
 ; %1: previous row V columns 0-7, %2: previous row V columns 0-6
 ; %3: current  row V columns 0-7, %4: current  row V columns 0-6 (loaded here)
-; m0 is the accumulator, m7/m8 temporaries, m9 scratch.
+; m0 is the accumulator, m5 scratch register, m7 unclobbered dummy.
 %macro PROCESS_ROW8 4
-    LOAD_V8   %3, %4, m9
+    LOAD_V8   %3, %4, m7
     add       pix1q, strideq
     add       pix2q, strideq
     ; No special case for the first element lacking left and top-left
     ; predictors is needed here: The left vectors have 0 as first element
     ; which leads to the desired result.
-    MEDIAN_ABS_ACC m0, %1, %4, %2, %3, m7, m8
+    MEDIAN_ABS_ACC m0, %1, %4, %2, %3, m5
 %endmacro
 
 ; Register layout:
 ;   m0  accumulator for columns 0-7
-;   m2, m3  one row's V (columns 0-7, 0-6)
-;   m5, m6  the other row's V (columns 0-7, 0-6)
-;   m7, m8 temporaries
-;   m9  scratch register for LOAD_V8
+;   m1, m2  one row's V (columns 0-7, 0-6)
+;   m3, m4  the other row's V (columns 0-7, 0-6)
+;   m5  scratch register
+;   m7  dummy register, unclobbered
 ; As in median_sad16 the loop is unrolled by two so the two register sets
 ; alternate the roles of previous and current row.
 %macro MEDIAN_SAD8 0
-cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h
-    LOAD_V8   m2, m3, m9
+cglobal median_sad8, 5, 5, 6, v, pix1, pix2, stride, h
+    LOAD_V8   m1, m2, m7
     add       pix1q, strideq
     add       pix2q, strideq
 
     ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
-    psubw     m0, m2, m3
+    psubw     m0, m1, m2
     pabsw     m0, m0
 
     sub       hd, 1
     jle       .end
 .loop:
-    PROCESS_ROW8 m2, m3, m5, m6
+    PROCESS_ROW8 m1, m2, m3, m4
     sub       hd, 1
     jle       .end
-    PROCESS_ROW8 m5, m6, m2, m3
+    PROCESS_ROW8 m3, m4, m1, m2
     sub       hd, 1
     jg        .loop
 .end:
     pxor      m4, m4
-    punpckhwd m7, m0, m4
+    punpckhwd m1, m0, m4
     punpcklwd m0, m4
-    paddd     m0, m7
-    HADDD     m0, m7
+    paddd     m0, m1
+    HADDD     m0, m1
     movd      eax, m0
     RET
 %endmacro
 
 INIT_XMM ssse3
 MEDIAN_SAD8
-
-%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 3d41f56874..1ba4003923 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -177,7 +177,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 
 #if ARCH_X86_64
         c->median_sad[0] = ff_median_sad16_ssse3;
-        c->median_sad[1] = ff_median_sad8_ssse3;
 #endif
+        c->median_sad[1] = ff_median_sad8_ssse3;
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/04: avcodec/x86/me_cmp: Reduce amount of registers used

Reply via email to