me_cmp: add SSSE3 median_sad8

marcos ashton via ffmpeg-cvslog Thu, 18 Jun 2026 15:08:47 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 8a863106dee4e5c239d27a248ac9fdc80cf5c140
Author:     marcos ashton <[email protected]>
AuthorDate: Thu Jun 4 22:03:51 2026 +0100
Commit:     michaelni <[email protected]>
CommitDate: Thu Jun 18 22:08:02 2026 +0000

    avcodec/x86/me_cmp: add SSSE3 median_sad8
    
    Same approach as median_sad16, processing one 8 pixel row per XMM
    register.
    
    median_sad_1_c:     141.4 ( 1.00x)
    median_sad_1_ssse3:  29.1 ( 4.86x)
    
    Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.
    
    Signed-off-by: marcos ashton <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 75 ++++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/me_cmp_init.c |  3 ++
 2 files changed, 78 insertions(+)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 5c403b2715..17e8baeba0 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -837,6 +837,17 @@ VSAD_APPROX 16, u
     psrldq    %4, %2, 2         ; V columns 9-16
 %endmacro
 
+; Same as LOAD_V16 for one row of 8 pixels.
+; %1: V columns 0-7, %2: V columns 1-8 (column 8 is zero), %3: scratch register
+%macro LOAD_V8 3
+    movq      %1, [pix1q]
+    movq      %2, [pix2q]
+    punpcklbw %1, %3
+    punpcklbw %2, %3
+    psubw     %1, %2            ; V columns 0-7
+    psrldq    %2, %1, 2         ; V columns 1-8
+%endmacro
+
 ; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
 ; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).  The top predictor
 ; %2 is not needed afterwards and is clobbered.
@@ -927,4 +938,68 @@ cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
 INIT_XMM ssse3
 MEDIAN_SAD16
 
+; Accumulate one row's cost from the previous and current row vectors.
+; %1: previous row V columns 0-7, %2: previous row V columns 1-8
+; %3: current  row V columns 0-7, %4: current  row V columns 1-8 (loaded here)
+; m0/m1 are the accumulators, m7/m8 temporaries, m9 scratch.
+%macro PROCESS_ROW8 4
+    LOAD_V8   %3, %4, m9
+    add       pix1q, strideq
+    add       pix2q, strideq
+    ; column 0: abs(V(0) - V(-stride))
+    psubw     m7, %3, %1
+    pabsw     m7, m7
+    paddw     m1, m7
+    ; columns 1-8
+    MEDIAN_ABS_ACC m0, %2, %3, %1, %4, m7, m8
+%endmacro
+
+; Register layout:
+;   m0  accumulator for columns 1-8 (the last word is discarded at the end)
+;   m1  accumulator for column 0 (only the first word is used)
+;   m2, m3  one row's V (columns 0-7, 1-8)
+;   m5, m6  the other row's V (columns 0-7, 1-8)
+;   m7, m8 temporaries
+;   m9  scratch register for LOAD_V8
+; As in median_sad16 the loop is unrolled by two so the two register sets
+; alternate the roles of previous and current row.
+%macro MEDIAN_SAD8 0
+cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h
+    LOAD_V8   m2, m3, m9
+    add       pix1q, strideq
+    add       pix2q, strideq
+
+    ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
+    pabsw     m1, m2
+    psubw     m0, m3, m2
+    pabsw     m0, m0
+
+    sub       hd, 1
+    jle       .end
+.loop:
+    PROCESS_ROW8 m2, m3, m5, m6
+    sub       hd, 1
+    jle       .end
+    PROCESS_ROW8 m5, m6, m2, m3
+    sub       hd, 1
+    jg        .loop
+.end:
+    ; column 8 lies outside of the block and column 0 only contributes its
+    ; first word; the kept columns may end up in any lane since the final sum
+    ; is horizontal anyway
+    pslldq    m0, 2
+    pslldq    m1, 14
+    paddw     m0, m1
+    pxor      m4, m4
+    punpckhwd m7, m0, m4
+    punpcklwd m0, m4
+    paddd     m0, m7
+    HADDD     m0, m7
+    movd      eax, m0
+    RET
+%endmacro
+
+INIT_XMM ssse3
+MEDIAN_SAD8
+
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 3b3ad6aa33..3d41f56874 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -73,6 +73,8 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t 
*pix1, const uint8_t
                            ptrdiff_t stride, int h);
 int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
                           ptrdiff_t stride, int h);
+int ff_median_sad8_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                         ptrdiff_t stride, int h);
 
 #define hadamard_func(cpu)                                                     
  \
     int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,       
  \
@@ -175,6 +177,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
 
 #if ARCH_X86_64
         c->median_sad[0] = ff_median_sad16_ssse3;
+        c->median_sad[1] = ff_median_sad8_ssse3;
 #endif
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/02: avcodec/x86/me_cmp: add SSSE3 median_sad8

Reply via email to