PR #23232 opened by zuxy. URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23232 ; Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23232.patch
Deprecate MMX. Remove the SSSE3 impl. since we no longer use palignr. The SSE2 impl. uses several more instructions but is slightly faster. pred8x8l_dc_8_mmxext: 20.5 ( 1.66x) pred8x8l_dc_8_ssse3: 17.9 ( 1.90x) pred8x8l_dc_8_sse2: 15.9 ( 2.15x) Signed-off-by: Zuxy Meng <[email protected]> >From 0b3bd4f4a2e812800838a3c711101110403cc91c Mon Sep 17 00:00:00 2001 From: Zuxy Meng <[email protected]> Date: Sun, 26 Apr 2026 21:12:49 -0700 Subject: [PATCH] avcodec/x86/h264_intrapred: SSE2 impl. of pred8x8l_dc_8 Deprecate MMX. Remove the SSSE3 impl. since we no longer use palignr. The SSE2 impl. uses several more instructions but is slightly faster. pred8x8l_dc_8_mmxext: 20.5 ( 1.66x) pred8x8l_dc_8_ssse3: 17.9 ( 1.90x) pred8x8l_dc_8_sse2: 15.9 ( 2.15x) Signed-off-by: Zuxy Meng <[email protected]> --- libavcodec/x86/h264_intrapred.asm | 154 +++++++++++++-------------- libavcodec/x86/h264_intrapred_init.c | 6 +- 2 files changed, 74 insertions(+), 86 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 1074b474f0..f8a4058a9f 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -848,104 +848,94 @@ cglobal pred8x8l_top_dc_8, 4,4,6 ; ptrdiff_t stride) ;----------------------------------------------------------------------------- -%macro PRED8x8L_DC 0 -cglobal pred8x8l_dc_8, 4,5 +INIT_XMM sse2 +cglobal pred8x8l_dc_8, 4,5,6 sub r0, r3 lea r4, [r0+r3*2] - movq mm0, [r0+r3*1-8] - punpckhbw mm0, [r0+r3*0-8] - movq mm1, [r4+r3*1-8] - punpckhbw mm1, [r0+r3*2-8] + movd m0, [r0+r3*1-4] + movd m4, [r0+r3*0-4] + punpcklbw m0, m4 + movd m1, [r4+r3*1-4] + movd m4, [r0+r3*2-4] + punpcklbw m1, m4 mov r4, r0 - punpckhwd mm1, mm0 + punpcklwd m1, m0 lea r0, [r0+r3*4] - movq mm2, [r0+r3*1-8] - punpckhbw mm2, [r0+r3*0-8] + movd m2, [r0+r3*1-4] + movd m4, [r0+r3*0-4] + punpcklbw m2, m4 lea r0, [r0+r3*2] - movq mm3, [r0+r3*1-8] - punpckhbw mm3, [r0+r3*0-8] - punpckhwd mm3, mm2 - punpckhdq mm3, mm1 + movd m3, 
[r0+r3*1-4] + movd m4, [r0+r3*0-4] + punpcklbw m3, m4 + punpcklwd m3, m2 + shufps m3, m1, 0xed + pshufd m3, m3, 0x0d lea r0, [r0+r3*2] - movq mm0, [r0+r3*0-8] - movq mm1, [r4] + movq m0, [r0+r3*0-8] + movq m1, [r4] mov r0, r4 - movq mm4, mm3 - movq mm2, mm3 - PALIGNR mm4, mm0, 7, mm0 - PALIGNR mm1, mm2, 1, mm2 + mova m4, m3 + mova m2, m3 + punpcklqdq m0, m4 + psrldq m0, 7 + punpcklqdq m2, m1 + psrldq m2, 1 test r1d, r1d jnz .do_left -.fix_lt_1: - movq mm5, mm3 - pxor mm5, mm4 - psrlq mm5, 56 - psllq mm5, 48 - pxor mm1, mm5 - jmp .do_left -.fix_lt_2: - movq mm5, mm3 - pxor mm5, mm2 - psllq mm5, 56 - psrlq mm5, 56 - pxor mm2, mm5 + pxor m5, m3, m0 + psrlq m5, 56 + psllq m5, 48 + pxor m2, m5 +.do_left: + mova m4, m0 + PRED4x4_LOWPASS m1, m2, m4, m3, m5 + mova m4, m0 + PRED4x4_LOWPASS m2, m3, m0, m4, m5 + psllq m2, 56 + punpcklqdq m2, m1 + psrldq m2, 7 + movu m0, [r0-8] + movu m3, [r0] + mova m4, m3 + psrldq m0, 7 + psrldq m4, 1 + test r1d, r1d + jnz .skip_fix_lt_2 + pxor m1, m3, m0 + psllq m1, 56 + psrlq m1, 56 + pxor m0, m1 +.skip_fix_lt_2: test r2d, r2d jnz .body -.fix_tr_1: - movq mm5, mm3 - pxor mm5, mm1 - psrlq mm5, 56 - psllq mm5, 56 - pxor mm1, mm5 - jmp .body -.do_left: - movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 - movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 - movq mm0, [r0-8] - movq mm3, [r0] - movq mm1, [r0+8] - movq mm2, mm3 - movq mm4, mm3 - PALIGNR mm2, mm0, 7, mm0 - PALIGNR mm1, mm4, 1, mm4 - test r1d, r1d - jz .fix_lt_2 - test r2d, r2d - jz .fix_tr_1 + pxor m1, m3, m4 + psrlq m1, 56 + psllq m1, 56 + pxor m4, m1 .body: lea r1, [r0+r3*2] - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 - pxor mm0, mm0 - pxor mm1, mm1 + PRED4x4_LOWPASS m3, m0, m4, m3, m1 + pxor m1, m1 lea r2, [r1+r3*2] - psadbw mm0, mm7 - psadbw mm1, mm6 - paddw mm0, [pw_8] - paddw mm0, mm1 + psadbw m2, m1 + psadbw m3, m1 + paddw m3, [pw_8] + paddw m3, m2 lea r4, [r2+r3*2] - psrlw mm0, 4 - pshufw mm0, mm0, 0 - 
packuswb mm0, mm0 - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 - movq [r1+r3*1], mm0 - movq [r1+r3*2], mm0 - movq [r2+r3*1], mm0 - movq [r2+r3*2], mm0 - movq [r4+r3*1], mm0 - movq [r4+r3*2], mm0 + psrlw m3, 4 + pshuflw m3, m3, 0 + punpcklqdq m3, m3 + packuswb m3, m3 + movq [r0+r3*1], m3 + movq [r0+r3*2], m3 + movq [r1+r3*1], m3 + movq [r1+r3*2], m3 + movq [r2+r3*1], m3 + movq [r2+r3*2], m3 + movq [r4+r3*1], m3 + movq [r4+r3*2], m3 RET -%endmacro - -INIT_MMX mmxext -PRED8x8L_DC -INIT_MMX ssse3 -PRED8x8L_DC ;----------------------------------------------------------------------------- ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 5b308f658f..87e047db64 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -125,8 +125,7 @@ PRED8x8(tm_vp8, 8, sse2) PRED8x8(tm_vp8, 8, ssse3) PRED8x8L(top_dc, 8, sse2) -PRED8x8L(dc, 8, mmxext) -PRED8x8L(dc, 8, ssse3) +PRED8x8L(dc, 8, sse2) PRED8x8L(horizontal, 8, mmxext) PRED8x8L(horizontal, 8, ssse3) PRED8x8L(vertical, 8, mmxext) @@ -163,7 +162,6 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, if (bit_depth == 8) { if (EXTERNAL_MMXEXT(cpu_flags)) { - h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext; h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext; h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext; h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext; @@ -195,6 +193,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, if (EXTERNAL_SSE2(cpu_flags)) { h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_sse2; h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_sse2; h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_sse2; h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; @@ -230,7 +229,6 @@ 
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; if (chroma_format_idc <= 1) h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3; - h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3; h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3; h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3; h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
