This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 67dfae9c5fad698ea4a7fa4f81b1fe4be1f1f4fd Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Jun 30 15:08:34 2026 +0200 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Fri Jul 3 16:09:57 2026 +0200 avcodec/x86/me_cmp: Reduce amount of registers used One can avoid one scratch register in MEDIAN_ABS_ACC (by clobbering topleft which isn't needed later on). Furthermore, the earlier code requested more registers in cglobal than it actually used: median_sad8 only needs seven registers, median_sad16 only 11. One of these registers is actually unclobbered (read-only), so median_sad8 does not need to save and restore any non-volatile registers on Win64. Furthermore, the restriction of median_sad8 to x64 can now be lifted. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/me_cmp.asm | 90 ++++++++++++++++++++++---------------------- libavcodec/x86/me_cmp_init.c | 2 +- 2 files changed, 45 insertions(+), 47 deletions(-) diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm index 5408946245..f06d703020 100644 --- a/libavcodec/x86/me_cmp.asm +++ b/libavcodec/x86/me_cmp.asm @@ -814,7 +814,6 @@ VSAD_APPROX 16, u ;int ff_median_sad_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ; ptrdiff_t stride, int h); ;--------------------------------------------------------------------- -%if ARCH_X86_64 ; Load one row of 16 pixels from pix1/pix2 and compute V = pix1 - pix2 as ; int16 words. No zero register is needed: both byte vectors are unpacked @@ -849,22 +848,22 @@ VSAD_APPROX 16, u %endmacro ; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using -; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)). The top predictor -; %2 is not needed afterwards and is clobbered. -; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted -; %6, %7: temporaries -%macro MEDIAN_ABS_ACC 7 +; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)). 
+; %1: accumulator, %2: top (clobbered), %3: left, %4: topleft (clobbered), +; %5: values being predicted, %6 scratch register +%macro MEDIAN_ABS_ACC 6 paddw %6, %2, %3 ; top + left psubw %6, %4 ; top + left - topleft - pminsw %7, %2, %3 ; min(top, left) + pminsw %4, %2, %3 ; min(top, left) pmaxsw %2, %3 ; max(top, left) pminsw %2, %6 - pmaxsw %7, %2 ; mid_pred(top, left, top + left - topleft) - psubw %6, %5, %7 - pabsw %6, %6 - paddw %1, %6 + pmaxsw %4, %2 ; mid_pred(top, left, top + left - topleft) + psubw %4, %5 + pabsw %4, %4 + paddw %1, %4 %endmacro +%if ARCH_X86_64 ; Accumulate one row's cost from the previous and current row vectors. ; %1-%4: previous row V (columns 0-7, 8-15, 0-6, 7-14) ; %5-%8: current row V (columns 0-7, 8-15, 0-6, 7-14), loaded here @@ -872,114 +871,113 @@ VSAD_APPROX 16, u ; predictors %3/%4 are consumed by MEDIAN_ABS_ACC, but they belong to the ; previous row and are reloaded before being needed again. %macro PROCESS_ROW16 8 - LOAD_V16 %5, %6, %7, %8, m14 + LOAD_V16 %5, %6, %7, %8, m10 add pix1q, strideq add pix2q, strideq ; columns 0-7; no special case for the first element lacking ; left and top-left predictors is needed here: The left vectors ; have 0 as first element which leads to the desired result. - MEDIAN_ABS_ACC m0, %1, %7, %3, %5, m11, m12 + MEDIAN_ABS_ACC m0, %1, %7, %3, %5, m9 ; columns 8-15 - MEDIAN_ABS_ACC m0, %2, %8, %4, %6, m11, m12 + MEDIAN_ABS_ACC m0, %2, %8, %4, %6, m9 %endmacro ; Register layout: ; m0 accumulator -; m3-m6 one row's V (columns 0-7, 8-15, 0-6, 7-14) -; m7-m10 the other row's V (columns 0-7, 8-15, 0-6, 7-14) -; m11, m12 temporaries -; m14 scratch register for LOAD_V16 +; m1-m4 one row's V (columns 0-7, 8-15, 0-6, 7-14) +; m5-m8 the other row's V (columns 0-7, 8-15, 0-6, 7-14) +; m9 scratch register +; m10 dummy register (unclobbered) ; The loop is unrolled by two so the two register sets alternate the roles of ; previous and current row, which removes the per-row register copies. 
%macro MEDIAN_SAD16 0 -cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h - LOAD_V16 m3, m4, m5, m6, m14 +cglobal median_sad16, 5, 5, 10, v, pix1, pix2, stride, h + LOAD_V16 m1, m2, m3, m4, m10 add pix1q, strideq add pix2q, strideq ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1)) - psubw m0, m5, m3 + psubw m0, m3, m1 + psubw m5, m4, m2 pabsw m0, m0 - psubw m1, m6, m4 - pabsw m1, m1 - paddw m0, m1 + pabsw m5, m5 + paddw m0, m5 sub hd, 1 jle .end .loop: - PROCESS_ROW16 m3, m4, m5, m6, m7, m8, m9, m10 + PROCESS_ROW16 m1, m2, m3, m4, m5, m6, m7, m8 sub hd, 1 jle .end - PROCESS_ROW16 m7, m8, m9, m10, m3, m4, m5, m6 + PROCESS_ROW16 m5, m6, m7, m8, m1, m2, m3, m4 sub hd, 1 jg .loop .end: ; the per-word sums are at most 2 * 16 * 510, but their total may need ; more than 16 bits: widen to dwords before the horizontal sum pxor m1, m1 - punpckhwd m12, m0, m1 + punpckhwd m2, m0, m1 punpcklwd m0, m1 - paddd m0, m12 - HADDD m0, m12 + paddd m0, m2 + HADDD m0, m2 movd eax, m0 RET %endmacro INIT_XMM ssse3 MEDIAN_SAD16 +%endif ; ARCH_X86_64 ; Accumulate one row's cost from the previous and current row vectors. ; %1: previous row V columns 0-7, %2: previous row V columns 0-6 ; %3: current row V columns 0-7, %4: current row V columns 0-6 (loaded here) -; m0 is the accumulator, m7/m8 temporaries, m9 scratch. +; m0 is the accumulator, m5 scratch register, m7 unclobbered dummy. %macro PROCESS_ROW8 4 - LOAD_V8 %3, %4, m9 + LOAD_V8 %3, %4, m7 add pix1q, strideq add pix2q, strideq ; No special case for the first element lacking left and top-left ; predictors is needed here: The left vectors have 0 as first element ; which leads to the desired result. 
- MEDIAN_ABS_ACC m0, %1, %4, %2, %3, m7, m8 + MEDIAN_ABS_ACC m0, %1, %4, %2, %3, m5 %endmacro ; Register layout: ; m0 accumulator for columns 0-7 -; m2, m3 one row's V (columns 0-7, 0-6) -; m5, m6 the other row's V (columns 0-7, 0-6) -; m7, m8 temporaries -; m9 scratch register for LOAD_V8 +; m1, m2 one row's V (columns 0-7, 0-6) +; m3, m4 the other row's V (columns 0-7, 0-6) +; m5 scratch register +; m7 dummy register, unclobbered ; As in median_sad16 the loop is unrolled by two so the two register sets ; alternate the roles of previous and current row. %macro MEDIAN_SAD8 0 -cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h - LOAD_V8 m2, m3, m9 +cglobal median_sad8, 5, 5, 6, v, pix1, pix2, stride, h + LOAD_V8 m1, m2, m7 add pix1q, strideq add pix2q, strideq ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1)) - psubw m0, m2, m3 + psubw m0, m1, m2 pabsw m0, m0 sub hd, 1 jle .end .loop: - PROCESS_ROW8 m2, m3, m5, m6 + PROCESS_ROW8 m1, m2, m3, m4 sub hd, 1 jle .end - PROCESS_ROW8 m5, m6, m2, m3 + PROCESS_ROW8 m3, m4, m1, m2 sub hd, 1 jg .loop .end: pxor m4, m4 - punpckhwd m7, m0, m4 + punpckhwd m1, m0, m4 punpcklwd m0, m4 - paddd m0, m7 - HADDD m0, m7 + paddd m0, m1 + HADDD m0, m1 movd eax, m0 RET %endmacro INIT_XMM ssse3 MEDIAN_SAD8 - -%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c index 3d41f56874..1ba4003923 100644 --- a/libavcodec/x86/me_cmp_init.c +++ b/libavcodec/x86/me_cmp_init.c @@ -177,7 +177,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) #if ARCH_X86_64 c->median_sad[0] = ff_median_sad16_ssse3; - c->median_sad[1] = ff_median_sad8_ssse3; #endif + c->median_sad[1] = ff_median_sad8_ssse3; } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
