me_cmp: add SSE2 and SSSE3 median_sad implementations (PR #23315)

Marcos Ashton via ffmpeg-devel Tue, 02 Jun 2026 12:18:05 -0700

PR #23315 opened by Marcos Ashton (MarcosAsh)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23315
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23315.patch


## Summary

Adds SSE2 and SSSE3 implementations of both median_sad comparison functions 
(-cmp msad) to libavcodec/x86/me_cmp. These have had aarch64 NEON 
implementations since 2022 (0ee535b1db, b2732115dd) but no x86 SIMD, so x86 
always used the C fallback.

The approach matches the NEON code: widen V = pix1 - pix2 to int16 words, 
compute the median predictor branchlessly with mid_pred(a, b, c) == max(min(a, 
b), min(max(a, b), c)), and accumulate absolute differences per column. Shifted 
column vectors are produced in-register with psrldq so no out-of-bounds loads 
are made. Three-operand instruction forms are used throughout, so an AVX 
version only needs another INIT_XMM instantiation.

checkasm --bench on an Intel Core Ultra 7 155H:

      median_sad_0_c:     801.3 ( 1.00x)
      median_sad_0_sse2:  160.1 ( 5.00x)
      median_sad_0_ssse3: 115.9 ( 6.92x)
      median_sad_1_c:     349.4 ( 1.00x)
      median_sad_1_sse2:   92.0 ( 3.80x)
      median_sad_1_ssse3:  68.4 ( 5.11x)

End to end, mpeg4 encoding of 720p with -cmp msad -subcmp msad -mbcmp msad 
-precmp msad uses 41% less CPU time (16.30s -> 9.65s user time, 300 frames). 
Output is bit-exact with the C implementation, verified with framemd5 against 
-cpuflags 0 under -flags +bitexact.

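Both functions are built for x86_64 only because of register pressure: the 
16-pixel version uses 15 XMM registers and the 8-pixel version 10, while x86_32 
only has 8. This mirrors the NEON versions, which are aarch64 only. x86_32 keeps 
using the C code. The new functions are covered by the existing checkasm motion 
test, and the full FATE suite passes.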


>From 544ba24e8b7676496d5375da17d7a557e8163187 Mon Sep 17 00:00:00 2001
From: marcos ashton <[email protected]>
Date: Tue, 2 Jun 2026 19:49:41 +0100
Subject: [PATCH 1/2] avcodec/x86/me_cmp: add SSE2 and SSSE3 median_sad16

The median_sad functions have NEON implementations but no x86 ones,
so x86 always used the C code. x86_64 only due to register pressure.

median_sad_0_c:     801.3 ( 1.00x)
median_sad_0_sse2:  160.1 ( 5.00x)
median_sad_0_ssse3: 115.9 ( 6.92x)

Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.

Signed-off-by: marcos ashton <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 121 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/me_cmp_init.c |  13 ++++
 2 files changed, 134 insertions(+)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 314b091fc8..3c2cb416ea 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -809,3 +809,124 @@ VSAD_APPROX 8,  a
 INIT_XMM sse2
 VSAD_APPROX 16, a
 VSAD_APPROX 16, u
+
+;---------------------------------------------------------------------
+;int ff_median_sad16_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+;                          ptrdiff_t stride, int h);
+;---------------------------------------------------------------------
+%if ARCH_X86_64
+
+; Load one row of 16 pixels from pix1/pix2 and compute V = pix1 - pix2 as
+; int16 words, both unshifted (columns 0-15) and shifted by one column
+; (columns 1-16; lane 16 is a zero shifted in by psrldq, the caller drops it).
+; %1: V columns 0-7, %2: V columns 8-15, %3: V columns 1-8, %4: V columns 9-16
+; %5, %6: temporaries (clobbered), %7: zero register (read only)
+%macro LOAD_V16 7
+    movu      %1, [pix1q]       ; pix1 columns 0-15 as bytes
+    movu      %5, [pix2q]       ; pix2 columns 0-15 as bytes
+    psrldq    %3, %1, 1         ; pix1 columns 1-15, zero in the top byte
+    psrldq    %6, %5, 1         ; pix2 columns 1-15, zero in the top byte
+    punpckhbw %2, %1, %7        ; pix1 columns 8-15 widened to words
+    punpcklbw %1, %7            ; pix1 columns 0-7 widened to words
+    punpckhbw %4, %5, %7        ; pix2 columns 8-15 as words (%4 used as scratch)
+    punpcklbw %5, %7            ; pix2 columns 0-7 as words
+    psubw     %1, %5            ; V columns 0-7
+    psubw     %2, %4            ; V columns 8-15
+    punpckhbw %4, %3, %7        ; shifted pix1, upper half, as words
+    punpcklbw %3, %7            ; shifted pix1, lower half, as words
+    punpckhbw %5, %6, %7        ; shifted pix2, upper half (%5 reused as scratch)
+    punpcklbw %6, %7            ; shifted pix2, lower half, as words
+    psubw     %3, %6            ; V columns 1-8
+    psubw     %4, %5
+%endmacro
+
+; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
+; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).
+; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
+; %6, %7, %8: temporaries
+%macro MEDIAN_ABS_ACC 8
+    paddw     %6, %2, %3        ; top + left
+    psubw     %6, %4            ; top + left - topleft
+    pminsw    %7, %2, %3        ; min(top, left)
+    pmaxsw    %8, %2, %3        ; max(top, left)
+    pminsw    %8, %6            ; min(max(top, left), top + left - topleft)
+    pmaxsw    %7, %8            ; mid_pred(top, left, top + left - topleft)
+    psubw     %6, %5, %7        ; value - prediction (signed words)
+    ABS1      %6, %8            ; abs(value - prediction); %8 is passed as scratch
+    paddw     %1, %6            ; accumulate per column; %2-%5 are left unmodified
+%endmacro
+
+; median_sad16: median-predicted SAD over a 16-pixel-wide block. Register layout:
+;   m0  accumulator for columns 1-8
+;   m1  accumulator for columns 9-16 (the last word is discarded at the end)
+;   m2  accumulator for column 0 (only the first word is used)
+;   m3  previous row V, columns 0-7  (topleft predictors)
+;   m4  previous row V, columns 8-15
+;   m5  previous row V, columns 1-8  (top predictors)
+;   m6  previous row V, columns 9-16
+;   m7  zero
+;   m8  current row V, columns 0-7   (left predictors)
+;   m9  current row V, columns 8-15
+;   m10 current row V, columns 1-8   (values being predicted)
+;   m11 current row V, columns 9-16
+;   m12-m14 temporaries
+%macro MEDIAN_SAD16 0
+cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
+    pxor      m7, m7
+    LOAD_V16  m3, m4, m5, m6, m12, m13, m7
+
+    ; first row has no row above: abs(V(0)) + sum of abs(V(j) - V(j-1))
+    mova      m2, m3
+    ABS1      m2, m12           ; column 0: abs(V(0)) in word 0
+    psubw     m0, m5, m3
+    ABS1      m0, m12           ; columns 1-8: abs(V(j) - V(j-1))
+    psubw     m1, m6, m4
+    ABS1      m1, m12           ; columns 9-16: abs(V(j) - V(j-1))
+
+    add       pix1q, strideq
+    add       pix2q, strideq
+    sub       hd, 1
+    jle       .end              ; h <= 1: only the first row contributes
+.loop:
+    LOAD_V16  m8, m9, m10, m11, m12, m13, m7
+    ; column 0 has no left neighbour: abs(V(0) - V(-stride))
+    psubw     m12, m8, m3
+    ABS1      m12, m13
+    paddw     m2, m12
+    ; columns 1-8 and 9-16: abs(V(j) - mid_pred(top, left, top + left - topleft))
+    MEDIAN_ABS_ACC m0, m5, m8, m3, m10, m12, m13, m14
+    MEDIAN_ABS_ACC m1, m6, m9, m4, m11, m12, m13, m14
+    ; the current row becomes the previous row
+    mova      m3, m8
+    mova      m4, m9
+    mova      m5, m10
+    mova      m6, m11
+    add       pix1q, strideq
+    add       pix2q, strideq
+    sub       hd, 1
+    jg        .loop
+.end:
+    ; column 16 lies outside of the block (clear the top word of m1) and
+    ; column 0 only contributes its first word (keep only word 0 of m2)
+    pslldq    m1, 2
+    psrldq    m1, 2
+    pslldq    m2, 14
+    psrldq    m2, 14
+    paddw     m0, m1
+    paddw     m0, m2
+    ; assuming h <= 16, each word is now at most 3 * 16 * 510 = 24480 and fits
+    ; in 16 bits, but the total may not: zero-extend to dwords before summing
+    punpckhwd m12, m0, m7
+    punpcklwd m0, m7
+    paddd     m0, m12
+    HADDD     m0, m12
+    movd      eax, m0           ; the total is returned in eax
+    RET
+%endmacro
+
+INIT_XMM sse2
+MEDIAN_SAD16
+INIT_XMM ssse3
+MEDIAN_SAD16
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index dbb4ef96bb..d7d30c3235 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
@@ -70,6 +71,10 @@ int ff_vsad16_approx_sse2(MPVEncContext *v, const uint8_t 
*pix1, const uint8_t *
                    ptrdiff_t stride, int h);
 int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const 
uint8_t *pix2,
                            ptrdiff_t stride, int h);
+int ff_median_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                         ptrdiff_t stride, int h);
+int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                          ptrdiff_t stride, int h);
 
 #define hadamard_func(cpu)                                                     
  \
     int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,       
  \
@@ -160,6 +165,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
         } else {
             c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
         }
+
+#if ARCH_X86_64
+        c->median_sad[0] = ff_median_sad16_sse2;
+#endif
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
@@ -169,5 +178,9 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+
+#if ARCH_X86_64
+        c->median_sad[0] = ff_median_sad16_ssse3;
+#endif
     }
 }
-- 
2.52.0


>From e361c811a64d6e9d4dbd844d0ca8bb48cb9f1023 Mon Sep 17 00:00:00 2001
From: marcos ashton <[email protected]>
Date: Tue, 2 Jun 2026 19:50:56 +0100
Subject: [PATCH 2/2] avcodec/x86/me_cmp: add SSE2 and SSSE3 median_sad8

Same approach as median_sad16, processing one 8 pixel row per XMM
register.

median_sad_1_c:     349.4 ( 1.00x)
median_sad_1_sse2:   92.0 ( 3.80x)
median_sad_1_ssse3:  68.4 ( 5.11x)

Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.

Signed-off-by: marcos ashton <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 75 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/me_cmp_init.c |  6 +++
 2 files changed, 81 insertions(+)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 3c2cb416ea..8954c01654 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -840,6 +840,21 @@ VSAD_APPROX 16, u
     psubw     %4, %5
 %endmacro
 
+; Same as LOAD_V16 for one row of 8 pixels: V = pix1 - pix2 as int16 words.
+; %1: V columns 0-7, %2: V columns 1-8, %3, %4: temporaries, %5: zero register
+%macro LOAD_V8 5
+    movq      %1, [pix1q]       ; pix1 columns 0-7 as bytes, upper half zeroed
+    movq      %3, [pix2q]       ; pix2 columns 0-7 as bytes, upper half zeroed
+    psrldq    %2, %1, 1         ; pix1 columns 1-7, then a zero byte for column 8
+    psrldq    %4, %3, 1         ; pix2 columns 1-7, then a zero byte for column 8
+    punpcklbw %1, %5            ; pix1 columns 0-7 widened to words
+    punpcklbw %3, %5            ; pix2 columns 0-7 widened to words
+    psubw     %1, %3            ; V columns 0-7
+    punpcklbw %2, %5            ; shifted pix1 widened to words
+    punpcklbw %4, %5            ; shifted pix2 widened to words
+    psubw     %2, %4            ; V columns 1-8 (column 8 is 0 - 0)
+%endmacro
+
 ; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
 ; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).
 ; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
@@ -929,4 +944,64 @@ MEDIAN_SAD16
 INIT_XMM ssse3
 MEDIAN_SAD16
 
+; median_sad8: median-predicted SAD over an 8-pixel-wide block. Register layout:
+;   m0  accumulator for columns 1-8 (the last word is discarded at the end)
+;   m1  accumulator for column 0 (only the first word is used)
+;   m2  previous row V, columns 0-7 (topleft predictors)
+;   m3  previous row V, columns 1-8 (top predictors)
+;   m4  zero
+;   m5  current row V, columns 0-7  (left predictors)
+;   m6  current row V, columns 1-8  (values being predicted)
+;   m7-m9 temporaries
+%macro MEDIAN_SAD8 0
+cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h
+    pxor      m4, m4
+    LOAD_V8   m2, m3, m7, m8, m4
+
+    ; first row has no row above: abs(V(0)) + sum of abs(V(j) - V(j-1))
+    mova      m1, m2
+    ABS1      m1, m7            ; column 0: abs(V(0)) in word 0
+    psubw     m0, m3, m2
+    ABS1      m0, m7            ; columns 1-8: abs(V(j) - V(j-1))
+
+    add       pix1q, strideq
+    add       pix2q, strideq
+    sub       hd, 1
+    jle       .end              ; h <= 1: only the first row contributes
+.loop:
+    LOAD_V8   m5, m6, m7, m8, m4
+    ; column 0 has no left neighbour: abs(V(0) - V(-stride))
+    psubw     m7, m5, m2
+    ABS1      m7, m8
+    paddw     m1, m7
+    ; columns 1-8: abs(V(j) - mid_pred(top, left, top + left - topleft))
+    MEDIAN_ABS_ACC m0, m3, m5, m2, m6, m7, m8, m9
+    ; the current row becomes the previous row
+    mova      m2, m5
+    mova      m3, m6
+    add       pix1q, strideq
+    add       pix2q, strideq
+    sub       hd, 1
+    jg        .loop
+.end:
+    ; column 8 lies outside of the block (clear the top word of m0) and
+    ; column 0 only contributes its first word (keep only word 0 of m1)
+    pslldq    m0, 2
+    psrldq    m0, 2
+    pslldq    m1, 14
+    psrldq    m1, 14
+    paddw     m0, m1
+    punpckhwd m7, m0, m4        ; zero-extend the upper four words to dwords
+    punpcklwd m0, m4            ; zero-extend the lower four words to dwords
+    paddd     m0, m7            ; the total may exceed 16 bits, so sum as dwords
+    HADDD     m0, m7
+    movd      eax, m0           ; the total is returned in eax
+    RET
+%endmacro
+
+INIT_XMM sse2
+MEDIAN_SAD8
+INIT_XMM ssse3
+MEDIAN_SAD8
+
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d7d30c3235..2320e09bad 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -73,8 +73,12 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t 
*pix1, const uint8_t
                            ptrdiff_t stride, int h);
 int ff_median_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
                          ptrdiff_t stride, int h);
+int ff_median_sad8_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                        ptrdiff_t stride, int h);
 int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
                           ptrdiff_t stride, int h);
+int ff_median_sad8_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                         ptrdiff_t stride, int h);
 
 #define hadamard_func(cpu)                                                     
  \
     int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,       
  \
@@ -168,6 +172,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
 
 #if ARCH_X86_64
         c->median_sad[0] = ff_median_sad16_sse2;
+        c->median_sad[1] = ff_median_sad8_sse2;
 #endif
     }
 
@@ -181,6 +186,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
 
 #if ARCH_X86_64
         c->median_sad[0] = ff_median_sad16_ssse3;
+        c->median_sad[1] = ff_median_sad8_ssse3;
 #endif
     }
 }
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avcodec/x86/me_cmp: add SSE2 and SSSE3 median_sad implementations (PR #23315)

Reply via email to