PR #23232 opened by zuxy
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23232
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23232.patch

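Deprecate MMX: replace the MMXEXT implementation of pred8x8l_dc_8 with an
SSE2 one. Remove the SSSE3 implementation as well, since the new code no
longer uses palignr. The SSE2 implementation uses several more instructions
but is slightly faster.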

pred8x8l_dc_8_mmxext:                                   20.5 ( 1.66x)
pred8x8l_dc_8_ssse3:                                    17.9 ( 1.90x)
pred8x8l_dc_8_sse2:                                     15.9 ( 2.15x)

Signed-off-by: Zuxy Meng <[email protected]>


From 0b3bd4f4a2e812800838a3c711101110403cc91c Mon Sep 17 00:00:00 2001
From: Zuxy Meng <[email protected]>
Date: Sun, 26 Apr 2026 21:12:49 -0700
Subject: [PATCH] avcodec/x86/h264_intrapred: SSE2 impl. of pred8x8l_dc_8

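Deprecate MMX: replace the MMXEXT implementation of pred8x8l_dc_8 with an
SSE2 one. Remove the SSSE3 implementation as well, since the new code no
longer uses palignr. The SSE2 implementation uses several more instructions
but is slightly faster.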

pred8x8l_dc_8_mmxext:                                   20.5 ( 1.66x)
pred8x8l_dc_8_ssse3:                                    17.9 ( 1.90x)
pred8x8l_dc_8_sse2:                                     15.9 ( 2.15x)

Signed-off-by: Zuxy Meng <[email protected]>
---
 libavcodec/x86/h264_intrapred.asm    | 154 +++++++++++++--------------
 libavcodec/x86/h264_intrapred_init.c |   6 +-
 2 files changed, 74 insertions(+), 86 deletions(-)

diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 1074b474f0..f8a4058a9f 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -848,104 +848,94 @@ cglobal pred8x8l_top_dc_8, 4,4,6
 ;                       ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
 
-%macro PRED8x8L_DC 0
-cglobal pred8x8l_dc_8, 4,5
+INIT_XMM sse2
+cglobal pred8x8l_dc_8, 4,5,6
     sub          r0, r3
     lea          r4, [r0+r3*2]
-    movq        mm0, [r0+r3*1-8]
-    punpckhbw   mm0, [r0+r3*0-8]
-    movq        mm1, [r4+r3*1-8]
-    punpckhbw   mm1, [r0+r3*2-8]
+    movd         m0, [r0+r3*1-4]
+    movd         m4, [r0+r3*0-4]
+    punpcklbw    m0, m4
+    movd         m1, [r4+r3*1-4]
+    movd         m4, [r0+r3*2-4]
+    punpcklbw    m1, m4
     mov          r4, r0
-    punpckhwd   mm1, mm0
+    punpcklwd    m1, m0
     lea          r0, [r0+r3*4]
-    movq        mm2, [r0+r3*1-8]
-    punpckhbw   mm2, [r0+r3*0-8]
+    movd         m2, [r0+r3*1-4]
+    movd         m4, [r0+r3*0-4]
+    punpcklbw    m2, m4
     lea          r0, [r0+r3*2]
-    movq        mm3, [r0+r3*1-8]
-    punpckhbw   mm3, [r0+r3*0-8]
-    punpckhwd   mm3, mm2
-    punpckhdq   mm3, mm1
+    movd         m3, [r0+r3*1-4]
+    movd         m4, [r0+r3*0-4]
+    punpcklbw    m3, m4
+    punpcklwd    m3, m2
+    shufps       m3, m1, 0xed
+    pshufd       m3, m3, 0x0d
     lea          r0, [r0+r3*2]
-    movq        mm0, [r0+r3*0-8]
-    movq        mm1, [r4]
+    movq         m0, [r0+r3*0-8]
+    movq         m1, [r4]
     mov          r0, r4
-    movq        mm4, mm3
-    movq        mm2, mm3
-    PALIGNR     mm4, mm0, 7, mm0
-    PALIGNR     mm1, mm2, 1, mm2
+    mova         m4, m3
+    mova         m2, m3
+    punpcklqdq   m0, m4
+    psrldq       m0, 7
+    punpcklqdq   m2, m1
+    psrldq       m2, 1
     test        r1d, r1d
     jnz .do_left
-.fix_lt_1:
-    movq        mm5, mm3
-    pxor        mm5, mm4
-    psrlq       mm5, 56
-    psllq       mm5, 48
-    pxor        mm1, mm5
-    jmp .do_left
-.fix_lt_2:
-    movq        mm5, mm3
-    pxor        mm5, mm2
-    psllq       mm5, 56
-    psrlq       mm5, 56
-    pxor        mm2, mm5
+    pxor         m5, m3, m0
+    psrlq        m5, 56
+    psllq        m5, 48
+    pxor         m2, m5
+.do_left:
+    mova         m4, m0
+    PRED4x4_LOWPASS m1, m2, m4, m3, m5
+    mova         m4, m0
+    PRED4x4_LOWPASS m2, m3, m0, m4, m5
+    psllq        m2, 56
+    punpcklqdq   m2, m1
+    psrldq       m2, 7
+    movu         m0, [r0-8]
+    movu         m3, [r0]
+    mova         m4, m3
+    psrldq       m0, 7
+    psrldq       m4, 1
+    test        r1d, r1d
+    jnz .skip_fix_lt_2
+    pxor         m1, m3, m0
+    psllq        m1, 56
+    psrlq        m1, 56
+    pxor         m0, m1
+.skip_fix_lt_2:
     test        r2d, r2d
     jnz .body
-.fix_tr_1:
-    movq        mm5, mm3
-    pxor        mm5, mm1
-    psrlq       mm5, 56
-    psllq       mm5, 56
-    pxor        mm1, mm5
-    jmp .body
-.do_left:
-    movq        mm0, mm4
-    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
-    movq        mm4, mm0
-    movq        mm7, mm2
-    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
-    psllq       mm1, 56
-    PALIGNR     mm7, mm1, 7, mm3
-    movq        mm0, [r0-8]
-    movq        mm3, [r0]
-    movq        mm1, [r0+8]
-    movq        mm2, mm3
-    movq        mm4, mm3
-    PALIGNR     mm2, mm0, 7, mm0
-    PALIGNR     mm1, mm4, 1, mm4
-    test        r1d, r1d
-    jz .fix_lt_2
-    test        r2d, r2d
-    jz .fix_tr_1
+    pxor         m1, m3, m4
+    psrlq        m1, 56
+    psllq        m1, 56
+    pxor         m4, m1
 .body:
     lea          r1, [r0+r3*2]
-    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
-    pxor        mm0, mm0
-    pxor        mm1, mm1
+    PRED4x4_LOWPASS m3, m0, m4, m3, m1
+    pxor         m1, m1
     lea          r2, [r1+r3*2]
-    psadbw      mm0, mm7
-    psadbw      mm1, mm6
-    paddw       mm0, [pw_8]
-    paddw       mm0, mm1
+    psadbw       m2, m1
+    psadbw       m3, m1
+    paddw        m3, [pw_8]
+    paddw        m3, m2
     lea          r4, [r2+r3*2]
-    psrlw       mm0, 4
-    pshufw      mm0, mm0, 0
-    packuswb    mm0, mm0
-    movq [r0+r3*1], mm0
-    movq [r0+r3*2], mm0
-    movq [r1+r3*1], mm0
-    movq [r1+r3*2], mm0
-    movq [r2+r3*1], mm0
-    movq [r2+r3*2], mm0
-    movq [r4+r3*1], mm0
-    movq [r4+r3*2], mm0
+    psrlw        m3, 4
+    pshuflw      m3, m3, 0
+    punpcklqdq   m3, m3
+    packuswb     m3, m3
+    movq  [r0+r3*1], m3
+    movq  [r0+r3*2], m3
+    movq  [r1+r3*1], m3
+    movq  [r1+r3*2], m3
+    movq  [r2+r3*1], m3
+    movq  [r2+r3*2], m3
+    movq  [r4+r3*1], m3
+    movq  [r4+r3*2], m3
     RET
-%endmacro
-
-INIT_MMX mmxext
-PRED8x8L_DC
-INIT_MMX ssse3
-PRED8x8L_DC
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 5b308f658f..87e047db64 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -125,8 +125,7 @@ PRED8x8(tm_vp8, 8, sse2)
 PRED8x8(tm_vp8, 8, ssse3)
 
 PRED8x8L(top_dc, 8, sse2)
-PRED8x8L(dc, 8, mmxext)
-PRED8x8L(dc, 8, ssse3)
+PRED8x8L(dc, 8, sse2)
 PRED8x8L(horizontal, 8, mmxext)
 PRED8x8L(horizontal, 8, ssse3)
 PRED8x8L(vertical, 8, mmxext)
@@ -163,7 +162,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
 
     if (bit_depth == 8) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_8_mmxext;
             h->pred8x8l [HOR_PRED               ] = ff_pred8x8l_horizontal_8_mmxext;
             h->pred8x8l [VERT_PRED              ] = ff_pred8x8l_vertical_8_mmxext;
             h->pred8x8l [HOR_UP_PRED            ] = ff_pred8x8l_horizontal_up_8_mmxext;
@@ -195,6 +193,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
         if (EXTERNAL_SSE2(cpu_flags)) {
             h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_8_sse2;
             h->pred16x16[DC_PRED8x8           ] = ff_pred16x16_dc_8_sse2;
+            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_sse2;
             h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_sse2;
             h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_sse2;
             h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
@@ -230,7 +229,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
             h->pred16x16[DC_PRED8x8           ] = ff_pred16x16_dc_8_ssse3;
             if (chroma_format_idc <= 1)
                 h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_8_ssse3;
-            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_ssse3;
             h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_ssse3;
             h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_ssse3;
             h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_ssse3;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to