PR #21080 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21080 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21080.patch
Still plenty left. >From b6e358dc25c13205af759f1c5b63e5f5ce288d3b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 30 Nov 2025 18:08:38 +0100 Subject: [PATCH 1/8] avcodec/vp9mc: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMXEXT functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9dsp_init.c | 20 +++++++++----------- libavcodec/x86/vp9dsp_init.h | 14 ++++++++++---- libavcodec/x86/vp9dsp_init_16bpp_template.c | 8 ++++---- libavcodec/x86/vp9mc.asm | 20 ++++++-------------- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index c103751351..25a007008b 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -41,7 +41,6 @@ decl_fpel_func(put, 64, , avx); decl_fpel_func(avg, 32, _8, avx2); decl_fpel_func(avg, 64, _8, avx2); -decl_mc_funcs(4, mmxext, int16_t, 8, 8); decl_mc_funcs(8, sse2, int16_t, 8, 8); decl_mc_funcs(4, ssse3, int8_t, 32, 8); decl_mc_funcs(8, ssse3, int8_t, 32, 8); @@ -70,10 +69,11 @@ mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) extern const int8_t ff_filters_ssse3[3][15][4][32]; extern const int16_t ff_filters_sse2[3][15][8][8]; -filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) -filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) -filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) -filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) +filters_8tap_2d_fn2(put, 16, 8, 1, sse2, sse2) +filters_8tap_2d_fn2(avg, 16, 8, 1, sse2, sse2) +filters_8tap_2d_fn3(put, 16, 8, 1, ssse3, ssse3) +filters_8tap_2d_fn3(avg, 16, 8, 1, ssse3, ssse3) + #if ARCH_X86_64 && 
HAVE_AVX2_EXTERNAL filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) @@ -81,10 +81,10 @@ filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) #endif -filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) -filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) -filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) -filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) +filters_8tap_1d_fn3(put, 8, sse2, sse2) +filters_8tap_1d_fn3(avg, 8, sse2, sse2) +filters_8tap_1d_fn4(put, 8, ssse3, ssse3) +filters_8tap_1d_fn4(avg, 8, ssse3, ssse3) #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) @@ -285,8 +285,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; - init_subpel2(4, 0, 4, put, 8, mmxext); - init_subpel2(4, 1, 4, avg, 8, mmxext); init_fpel_func(4, 1, 4, avg, _8, mmxext); init_fpel_func(3, 1, 8, avg, _8, mmxext); dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index 5690d16970..64747173c8 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -107,12 +107,15 @@ filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, o filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \ filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt) -#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \ +#define filters_8tap_1d_fn3(op, bpp, opt8, f_opt) \ filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \ filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \ filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \ filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \ -filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt) + +#define 
filters_8tap_1d_fn4(op, bpp, opt, f_opt) \ +filters_8tap_1d_fn3(op, bpp, opt, f_opt) \ +filters_8tap_1d_fn2(op, 4, bpp, opt, f_opt) \ #define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \ static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ @@ -133,12 +136,15 @@ filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \ filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt) -#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \ +#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt8, f_opt) \ filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \ filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \ filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \ filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \ -filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) + +#define filters_8tap_2d_fn3(op, align, bpp, bytes, opt, f_opt) \ +filters_8tap_2d_fn2(op, align, bpp, bytes, opt, f_opt) \ +filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt, f_opt) #define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index a6aa03bdc8..54ff8892cf 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -40,8 +40,8 @@ mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC) mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC) #endif -filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp) -filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp) +filters_8tap_2d_fn3(put, 16, BPC, 2, sse2, 16bpp) +filters_8tap_2d_fn3(avg, 16, BPC, 2, sse2, 16bpp) #if HAVE_AVX2_EXTERNAL filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp) filters_8tap_2d_fn(avg, 64, 32, 
BPC, 2, avx2, 16bpp) @@ -51,8 +51,8 @@ filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp) filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp) #endif -filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp) -filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp) +filters_8tap_1d_fn4(put, BPC, sse2, 16bpp) +filters_8tap_1d_fn4(avg, BPC, sse2, 16bpp) #if HAVE_AVX2_EXTERNAL filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp) filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index b9a62e79a8..682c6a6ea0 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -205,7 +205,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h pxor m5, m5 mova m6, [pw_64] mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 mova m8, [filteryq+ 16] mova m9, [filteryq+ 32] mova m10, [filteryq+ 48] @@ -226,7 +226,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h punpcklbw m3, m5 punpcklbw m4, m5 pmullw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 pmullw m1, m8 pmullw m2, m9 pmullw m3, m10 @@ -247,7 +247,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h punpcklbw m1, m5 punpcklbw m3, m5 punpcklbw m4, m5 -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 pmullw m1, m12 pmullw m3, m13 pmullw m4, m14 @@ -276,10 +276,6 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h RET %endmacro -INIT_MMX mmxext -filter_sse2_h_fn put -filter_sse2_h_fn avg - INIT_XMM sse2 filter_sse2_h_fn put filter_sse2_h_fn avg @@ -421,7 +417,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, f lea src4q, [srcq+sstrideq] sub srcq, sstride3q mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 mova m8, [filteryq+ 16] mova m9, [filteryq+ 32] mova m10, [filteryq+ 48] @@ -446,7 +442,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, 
f punpcklbw m3, m5 punpcklbw m4, m5 pmullw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 pmullw m1, m8 pmullw m2, m9 pmullw m3, m10 @@ -467,7 +463,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, f punpcklbw m1, m5 punpcklbw m3, m5 punpcklbw m4, m5 -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 pmullw m1, m12 pmullw m3, m13 pmullw m4, m14 @@ -496,10 +492,6 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, f RET %endmacro -INIT_MMX mmxext -filter_sse2_v_fn put -filter_sse2_v_fn avg - INIT_XMM sse2 filter_sse2_v_fn put filter_sse2_v_fn avg -- 2.49.1 >From c354fc6366df6bef7815c6ad1ce8d6b0c1adb92f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 30 Nov 2025 20:26:44 +0100 Subject: [PATCH 2/8] avcodec/vp9intrapred: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMXEXT functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9dsp_init.c | 12 ++-- libavcodec/x86/vp9intrapred.asm | 122 +++----------------------------- 2 files changed, 13 insertions(+), 121 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 25a007008b..85332da2b9 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -154,6 +154,8 @@ lpf_funcs(88, 16, avx); void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *l, const uint8_t *a) +ipred_func(4, hd, mmxext); +ipred_func(4, vl, mmxext); ipred_func(8, v, mmx); #define ipred_dc_funcs(size, opt) \ @@ -161,9 +163,6 @@ ipred_func(size, dc, opt); \ ipred_func(size, dc_left, opt); \ ipred_func(size, dc_top, opt) -ipred_dc_funcs(4, mmxext); -ipred_dc_funcs(8, mmxext); - #define ipred_dir_tm_funcs(size, opt) \ ipred_func(size, tm, opt); \ ipred_func(size, dl, opt); \ @@ -173,8 +172,6 @@ ipred_func(size, hu, opt); \ ipred_func(size, vl, opt); \ ipred_func(size, vr, opt) -ipred_dir_tm_funcs(4, mmxext); - ipred_func(16, v, sse); ipred_func(32, v, sse); @@ -288,9 +285,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) init_fpel_func(4, 1, 4, avg, _8, mmxext); init_fpel_func(3, 1, 8, avg, _8, mmxext); dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; - init_dc_ipred(4, mmxext); - init_dc_ipred(8, mmxext); - init_dir_tm_ipred(4, mmxext); + dsp->intra_pred[TX_4X4][HOR_DOWN_PRED] = ff_vp9_ipred_hd_4x4_mmxext; + dsp->intra_pred[TX_4X4][VERT_LEFT_PRED] = ff_vp9_ipred_vl_4x4_mmxext; } if (EXTERNAL_SSE(cpu_flags)) { diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm index b67addd7e3..22390ca831 100644 --- a/libavcodec/x86/vp9intrapred.asm +++ b/libavcodec/x86/vp9intrapred.asm @@ -93,21 +93,14 @@ SECTION .text ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) -%macro DC_4to8_FUNCS 0 +INIT_MMX ssse3 cglobal 
vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a movd m0, [lq] punpckldq m0, [aq] pxor m1, m1 psadbw m0, m1 -%if cpuflag(ssse3) pmulhrsw m0, [pw_4096] pshufb m0, m1 -%else - paddw m0, [pw_4] - psraw m0, 3 - punpcklbw m0, m0 - pshufw m0, m0, q0000 -%endif movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] @@ -124,15 +117,8 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a psadbw m0, m2 psadbw m1, m2 paddw m0, m1 -%if cpuflag(ssse3) pmulhrsw m0, [pw_2048] pshufb m0, m2 -%else - paddw m0, [pw_8] - psraw m0, 4 - punpcklbw m0, m0 - pshufw m0, m0, q0000 -%endif movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 @@ -143,12 +129,7 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RET -%endmacro -INIT_MMX mmxext -DC_4to8_FUNCS -INIT_MMX ssse3 -DC_4to8_FUNCS %macro DC_16to32_FUNCS 0 cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a @@ -238,15 +219,8 @@ cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a movd m0, [%2q] pxor m1, m1 psadbw m0, m1 -%if cpuflag(ssse3) pmulhrsw m0, [pw_8192] pshufb m0, m1 -%else - paddw m0, [pw_2] - psraw m0, 2 - punpcklbw m0, m0 - pshufw m0, m0, q0000 -%endif movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] @@ -260,15 +234,8 @@ cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a lea stride3q, [strideq*3] pxor m1, m1 psadbw m0, m1 -%if cpuflag(ssse3) pmulhrsw m0, [pw_4096] pshufb m0, m1 -%else - paddw m0, [pw_4] - psraw m0, 3 - punpcklbw m0, m0 - pshufw m0, m0, q0000 -%endif movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 @@ -281,9 +248,6 @@ cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a RET %endmacro -INIT_MMX mmxext -DC_1D_4to8_FUNCS top, a -DC_1D_4to8_FUNCS left, l INIT_MMX ssse3 DC_1D_4to8_FUNCS top, a DC_1D_4to8_FUNCS left, l @@ -548,33 +512,22 @@ H_XMM_FUNCS 4, 8 INIT_XMM avx H_XMM_FUNCS 4, 8 -%macro TM_MMX_FUNCS 0 +INIT_MMX ssse3 cglobal 
vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a pxor m1, m1 movd m0, [aq] pinsrw m2, [aq-1], 0 punpcklbw m0, m1 DEFINE_ARGS dst, stride, l, cnt -%if cpuflag(ssse3) mova m3, [pw_m256] mova m1, [pw_m255] pshufb m2, m3 -%else - punpcklbw m2, m1 - pshufw m2, m2, q0000 -%endif psubw m0, m2 mov cntq, 1 .loop: pinsrw m2, [lq+cntq*2], 0 -%if cpuflag(ssse3) pshufb m4, m2, m1 pshufb m2, m3 -%else - punpcklbw m2, m1 - pshufw m4, m2, q1111 - pshufw m2, m2, q0000 -%endif paddw m4, m0 paddw m2, m0 packuswb m4, m4 @@ -585,12 +538,6 @@ cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a dec cntq jge .loop RET -%endmacro - -INIT_MMX mmxext -TM_MMX_FUNCS -INIT_MMX ssse3 -TM_MMX_FUNCS %macro TM_XMM_FUNCS 0 cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a @@ -784,20 +731,11 @@ TM_XMM_FUNCS pavgb m%1, m%2 %endmacro -%macro DL_MMX_FUNCS 0 +INIT_MMX ssse3 cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a movq m1, [aq] -%if cpuflag(ssse3) pshufb m0, m1, [pb_0to5_2x7] pshufb m2, m1, [pb_2to6_3x7] -%else - punpckhbw m3, m1, m1 ; 44556677 - pand m0, m1, [pb_6xm1_2x0] ; 012345__ - pand m3, [pb_6x0_2xm1] ; ______77 - psrlq m2, m1, 16 ; 234567__ - por m0, m3 ; 01234577 - por m2, m3 ; 23456777 -%endif psrlq m1, 8 LOWPASS 0, 1, 2, 3 @@ -810,12 +748,6 @@ cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a movd [dstq+strideq*0], m0 movd [dstq+strideq*2], m1 RET -%endmacro - -INIT_MMX mmxext -DL_MMX_FUNCS -INIT_MMX ssse3 -DL_MMX_FUNCS %macro DL_XMM_FUNCS 0 cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a @@ -964,14 +896,14 @@ DL_XMM_FUNCS ; dr -%macro DR_MMX_FUNCS 0 +INIT_MMX ssse3 cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a movd m0, [lq] punpckldq m0, [aq-1] movd m1, [aq+3] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] - PALIGNR m1, m0, 1, m3 + palignr m1, m0, 1 psrlq m2, m1, 8 LOWPASS 0, 1, 2, 3 @@ -983,12 +915,6 @@ cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a psrlq m0, 8 movd [dstq+strideq*0], m0 RET -%endmacro - -INIT_MMX mmxext -DR_MMX_FUNCS 
-INIT_MMX ssse3 -DR_MMX_FUNCS %macro DR_XMM_FUNCS 0 cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a @@ -1266,7 +1192,7 @@ VL_XMM_FUNCS ; vr -%macro VR_MMX_FUNCS 0 +INIT_MMX ssse3 cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a movq m1, [aq-1] punpckldq m2, [lq] @@ -1274,7 +1200,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] pavgb m0, m1 - PALIGNR m1, m2, 5, m3 + palignr m1, m2, 5 psrlq m2, m1, 8 psllq m3, m1, 8 LOWPASS 2, 1, 3, 4 @@ -1284,7 +1210,6 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a ; IABC | m0 contains ABCDxxxx ; JEFG | m2 contains xJIEFGHx -%if cpuflag(ssse3) punpckldq m0, m2 pshufb m2, [pb_13456_3xm1] movd [dstq+strideq*0], m0 @@ -1293,24 +1218,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a psrlq m2, 8 movd [dstq+strideq*2], m0 movd [dstq+strideq*1], m2 -%else - psllq m1, m2, 40 - psrlq m2, 24 - movd [dstq+strideq*0], m0 - movd [dstq+strideq*1], m2 - PALIGNR m0, m1, 7, m3 - psllq m1, 8 - PALIGNR m2, m1, 7, m3 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m2 -%endif RET -%endmacro - -INIT_MMX mmxext -VR_MMX_FUNCS -INIT_MMX ssse3 -VR_MMX_FUNCS %macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16 cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a @@ -1688,16 +1596,10 @@ HD_XMM_FUNCS INIT_XMM avx HD_XMM_FUNCS -%macro HU_MMX_FUNCS 0 +INIT_MMX ssse3 cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l movd m0, [lq] -%if cpuflag(ssse3) pshufb m0, [pb_0to2_5x3] -%else - punpcklbw m1, m0, m0 ; 00112233 - pshufw m1, m1, q3333 ; 33333333 - punpckldq m0, m1 ; 01233333 -%endif psrlq m1, m0, 8 psrlq m2, m1, 8 LOWPASS 2, 1, 0, 3 @@ -1705,7 +1607,7 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] SBUTTERFLY bw, 1, 2, 0 - PALIGNR m2, m1, 2, m0 + palignr m2, m1, 2 movd [dstq+strideq*0], m1 movd [dstq+strideq*1], m2 punpckhdq m1, m1 @@ -1713,12 +1615,6 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l movd 
[dstq+strideq*2], m1 movd [dstq+stride3q ], m2 RET -%endmacro - -INIT_MMX mmxext -HU_MMX_FUNCS -INIT_MMX ssse3 -HU_MMX_FUNCS %macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l -- 2.49.1 >From 4588fe8b908a6561520913c8af53c5f3b29f4fd5 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 30 Nov 2025 20:49:51 +0100 Subject: [PATCH 3/8] avcodec/vp9itxfm{,_16bpp}: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMXEXT functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9dsp_init.c | 2 -- libavcodec/x86/vp9dsp_init_16bpp_template.c | 4 --- libavcodec/x86/vp9itxfm.asm | 30 +-------------------- libavcodec/x86/vp9itxfm_16bpp.asm | 18 +------------ 4 files changed, 2 insertions(+), 52 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 85332da2b9..e479fd25ee 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -101,7 +101,6 @@ itxfm_func(iadst, idct, size, opt); \ itxfm_func(idct, iadst, size, opt); \ itxfm_func(iadst, iadst, size, opt) -itxfm_func(idct, idct, 4, mmxext); itxfm_func(idct, iadst, 4, sse2); itxfm_func(iadst, idct, 4, sse2); itxfm_func(iadst, iadst, 4, sse2); @@ -284,7 +283,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; init_fpel_func(4, 1, 4, avg, _8, mmxext); init_fpel_func(3, 1, 8, avg, _8, mmxext); - dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; dsp->intra_pred[TX_4X4][HOR_DOWN_PRED] = ff_vp9_ipred_hd_4x4_mmxext; dsp->intra_pred[TX_4X4][VERT_LEFT_PRED] = 
ff_vp9_ipred_vl_4x4_mmxext; } diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index 54ff8892cf..969db94d3c 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -123,7 +123,6 @@ decl_ipred_fns(tm, BPC, mmxext, sse2); decl_itxfm_func(iwht, iwht, 4, BPC, mmxext); #if BPC == 10 -decl_itxfm_func(idct, idct, 4, BPC, mmxext); decl_itxfm_funcs(4, BPC, ssse3); decl_itxfm_funcs(16, BPC, avx512icl); decl_itxfm_func(idct, idct, 32, BPC, avx512icl); @@ -184,9 +183,6 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) init_ipred_func(tm, TM_VP8, 4, BPC, mmxext); if (!bitexact) { init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext); -#if BPC == 10 - init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext); -#endif } } diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm index fe650d519c..bd5966646c 100644 --- a/libavcodec/x86/vp9itxfm.asm +++ b/libavcodec/x86/vp9itxfm.asm @@ -223,49 +223,28 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob VP9_STORE_2X 2, 3, 6, 7, 4 %endmacro -%macro IDCT_4x4_FN 1 -INIT_MMX %1 +INIT_MMX ssse3 cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob -%if cpuflag(ssse3) cmp eobd, 4 ; 2x2 or smaller jg .idctfull cmp eobd, 1 ; faster path for when only DC is set jne .idct2x2 -%else - cmp eobd, 1 - jg .idctfull -%endif -%if cpuflag(ssse3) movd m0, [blockq] mova m5, [pw_11585x2] pmulhrsw m0, m5 pmulhrsw m0, m5 -%else - DEFINE_ARGS dst, stride, block, coef - movsx coefd, word [blockq] - imul coefd, 11585 - add coefd, 8192 - sar coefd, 14 - imul coefd, 11585 - add coefd, (8 << 14) + 8192 - sar coefd, 14 + 4 - movd m0, coefd -%endif pshufw m0, m0, 0 pxor m4, m4 movh [blockq], m4 -%if cpuflag(ssse3) pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 -%endif VP9_STORE_2X 0, 0, 6, 7, 4 lea dstq, [dstq+2*strideq] VP9_STORE_2X 0, 0, 6, 7, 4 RET -%if cpuflag(ssse3) ; faster path 
for when only top left 2x2 block is set .idct2x2: movd m0, [blockq+0] @@ -285,16 +264,13 @@ cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob movh [blockq+ 8], m4 VP9_IDCT4_WRITEOUT RET -%endif .idctfull: ; generic full 4x4 idct/idct mova m0, [blockq+ 0] mova m1, [blockq+ 8] mova m2, [blockq+16] mova m3, [blockq+24] -%if cpuflag(ssse3) mova m6, [pw_11585x2] -%endif mova m7, [pd_8192] ; rounding VP9_IDCT4_1D TRANSPOSE4x4W 0, 1, 2, 3, 4 @@ -306,10 +282,6 @@ cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob mova [blockq+24], m4 VP9_IDCT4_WRITEOUT RET -%endmacro - -IDCT_4x4_FN mmxext -IDCT_4x4_FN ssse3 ;------------------------------------------------------------------------------------------- ; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm index ebe6222285..161c73f5a1 100644 --- a/libavcodec/x86/vp9itxfm_16bpp.asm +++ b/libavcodec/x86/vp9itxfm_16bpp.asm @@ -243,29 +243,21 @@ IWHT4_FN 12, 4095 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits ; in 15+1 words without additional effort, since the coefficients are 15bpp. 
-%macro IDCT4_10_FN 0 +INIT_MMX ssse3 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob cmp eobd, 1 jg .idctfull ; dc-only pxor m4, m4 -%if cpuflag(ssse3) movd m0, [blockq] movd [blockq], m4 mova m5, [pw_11585x2] pmulhrsw m0, m5 pmulhrsw m0, m5 -%else - DEFINE_ARGS dst, stride, block, coef - DC_ONLY 4, m4 - movd m0, coefd -%endif pshufw m0, m0, 0 mova m5, [pw_1023] -%if cpuflag(ssse3) pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 -%endif VP9_STORE_2X 0, 0, 6, 7, 4, 5 lea dstq, [dstq+2*strideq] VP9_STORE_2X 0, 0, 6, 7, 4, 5 @@ -281,9 +273,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob packssdw m2, [blockq+2*16+8] packssdw m3, [blockq+3*16+8] -%if cpuflag(ssse3) mova m6, [pw_11585x2] -%endif mova m7, [pd_8192] ; rounding VP9_IDCT4_1D TRANSPOSE4x4W 0, 1, 2, 3, 4 @@ -293,12 +283,6 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob ZERO_BLOCK blockq, 16, 4, m4 VP9_IDCT4_WRITEOUT RET -%endmacro - -INIT_MMX mmxext -IDCT4_10_FN -INIT_MMX ssse3 -IDCT4_10_FN %macro IADST4_FN 4 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob -- 2.49.1 >From f3836ba0e7e4078489ca421f0374f5b704138421 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 30 Nov 2025 23:48:36 +0100 Subject: [PATCH 4/8] tests/checkasm/vp9dsp: Allow to run only a subset of tests Make it possible to run only a subset of the VP9 tests in addition to all of them (via the vp9dsp test). This reduces noise and speeds up testing. FATE continues to use vp9dsp. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/checkasm.c | 6 +++++- tests/checkasm/checkasm.h | 4 ++++ tests/checkasm/vp9dsp.c | 16 ++++++++-------- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 8c64684fa3..a28a85d6f7 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -261,7 +261,11 @@ static const struct { { "vp8dsp", checkasm_check_vp8dsp }, #endif #if CONFIG_VP9_DECODER - { "vp9dsp", checkasm_check_vp9dsp }, + { "vp9dsp", checkasm_check_vp9dsp }, // all of the below + { "vp9_ipred", checkasm_check_vp9_ipred }, + { "vp9_itxfm", checkasm_check_vp9_itxfm }, + { "vp9_loopfilter", checkasm_check_vp9_loopfilter }, + { "vp9_mc", checkasm_check_vp9_mc }, #endif #if CONFIG_VIDEODSP { "videodsp", checkasm_check_videodsp }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 05f74ca16b..5d0c491fae 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -157,6 +157,10 @@ void checkasm_check_vp3dsp(void); void checkasm_check_vp6dsp(void); void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); +void checkasm_check_vp9_ipred(void); +void checkasm_check_vp9_itxfm(void); +void checkasm_check_vp9_loopfilter(void); +void checkasm_check_vp9_mc(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); void checkasm_check_vvc_alf(void); diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c index 2a3374541f..d5ff5aa2cd 100644 --- a/tests/checkasm/vp9dsp.c +++ b/tests/checkasm/vp9dsp.c @@ -47,7 +47,7 @@ static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; } \ } while (0) -static void check_ipred(void) +void checkasm_check_vp9_ipred(void) { LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]); uint8_t *a = &a_buf[32 * 2]; @@ -308,7 +308,7 @@ static int is_zero(const int16_t *c, int sz) #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8)) -static void check_itxfm(void) 
+void checkasm_check_vp9_itxfm(void) { LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]); LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]); @@ -449,7 +449,7 @@ static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, randomize_loopfilter_buffers(bidx, lineoff, str, bit_depth, dir, \ E, F, H, I, buf0, buf1) -static void check_loopfilter(void) +void checkasm_check_vp9_loopfilter(void) { LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]); LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]); @@ -556,7 +556,7 @@ static void check_loopfilter(void) } \ } while (0) -static void check_mc(void) +void checkasm_check_vp9_mc(void) { LOCAL_ALIGNED_64(uint8_t, buf, [72 * 72 * 2]); LOCAL_ALIGNED_64(uint8_t, dst0, [64 * 64 * 2]); @@ -626,8 +626,8 @@ static void check_mc(void) void checkasm_check_vp9dsp(void) { - check_ipred(); - check_itxfm(); - check_loopfilter(); - check_mc(); + checkasm_check_vp9_ipred(); + checkasm_check_vp9_itxfm(); + checkasm_check_vp9_loopfilter(); + checkasm_check_vp9_mc(); } -- 2.49.1 >From 4f4be07e84b47a94cf1285760b33adaa70a7f737 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 1 Dec 2025 00:17:55 +0100 Subject: [PATCH 5/8] avcodec/x86/vp9mc: Avoid reloads, MMX regs in width 4 vert 8tap func Four rows of four bytes fit into one xmm register; therefore one can arrange the rows as follows (A,B,C: first, second, third etc. row) xmm0: ABABABAB BCBCBCBC xmm1: CDCDCDCD DEDEDEDE xmm2: EFEFEFEF FGFGFGFG xmm3: GHGHGHGH HIHIHIHI and use four pmaddubsw to calculate two rows in parallel. The history fits into four registers, making this possible even on 32bit systems. 
Old benchmarks (Unix 64): vp9_avg_8tap_smooth_4v_8bpp_c: 105.5 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 16.4 ( 6.44x) vp9_put_8tap_smooth_4v_8bpp_c: 99.3 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 15.4 ( 6.44x) New benchmarks (Unix 64): vp9_avg_8tap_smooth_4v_8bpp_c: 105.0 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 11.8 ( 8.90x) vp9_put_8tap_smooth_4v_8bpp_c: 99.7 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 10.7 ( 9.30x) Old benchmarks (x86-32): vp9_avg_8tap_smooth_4v_8bpp_c: 138.2 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 28.0 ( 4.93x) vp9_put_8tap_smooth_4v_8bpp_c: 123.6 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 28.0 ( 4.41x) New benchmarks (x86-32): vp9_avg_8tap_smooth_4v_8bpp_c: 139.0 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 20.1 ( 6.92x) vp9_put_8tap_smooth_4v_8bpp_c: 124.5 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 19.9 ( 6.26x) Loading the constants into registers did not turn out to be advantageous here (not to mention Win64, where this would necessitate saving and restoring ever more registers); probably because there are only two loop iterations. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 108 +++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 9 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index 682c6a6ea0..495746ee4e 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -496,12 +496,102 @@ INIT_XMM sse2 filter_sse2_v_fn put filter_sse2_v_fn avg -%macro filter_v_fn 1 -%assign %%px mmsize/2 +%macro filter4_v_fn 1 %if ARCH_X86_64 -cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +cglobal vp9_%1_8tap_1d_v_4_8, 6, 7, 8, dst, dstride, src, sstride, h, filtery, sstride3 %else -cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 +cglobal vp9_%1_8tap_1d_v_4_8, 4, 5, 8, dst, dstride, src, sstride, filtery +%define hd r4mp +%define sstride3q filteryq +%endif + lea sstride3q, [sstrideq*3] +%if 0 + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] + mova m12, [pw_64] +%define M8 m8 +%define M9 m9 +%define M10 m10 +%define M11 m11 +%define PW_64 m12 +%else +%define M8 [filteryq+ 0] +%define M9 [filteryq+32] +%define M10 [filteryq+64] +%define M11 [filteryq+96] +%define PW_64 [pw_64] +%endif + sub srcq, sstride3q + movd m0, [srcq] + movd m1, [srcq+sstrideq] + movd m2, [srcq+sstrideq*2] + movd m3, [srcq+sstride3q] + lea srcq, [srcq+sstrideq*4] + movd m4, [srcq] + movd m5, [srcq+sstrideq] + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + punpcklbw m3, m4 + punpcklqdq m0, m1 + movd m1, [srcq+sstrideq*2] + add srcq, sstride3q +%if ARCH_X86_32 + mov filteryq, r5mp +%endif + punpcklqdq m2, m3 + punpcklbw m4, m5 + punpcklbw m5, m1 + punpcklqdq m4, m5 +.loop: + pmaddubsw m0, M8 + movd m3, [srcq] + movd m5, [srcq+sstrideq] + pmaddubsw m7, m4, M10 + pmaddubsw m6, m2, M9 + punpcklbw m1, m3 + punpcklbw m3, m5 + punpcklqdq m1, m3 + pmaddubsw m3, m1, M11 + 
paddw m0, PW_64 + lea srcq, [srcq+2*sstrideq] + paddw m7, m0 + mova m0, m2 + mova m2, m4 +%ifidn %1, avg + movd m4, [dstq] +%endif + paddw m6, m3 +%ifidn %1, avg + movd m3, [dstq+dstrideq] +%endif + paddsw m6, m7 + psraw m6, 7 + packuswb m6, m6 + pshuflw m7, m6, 0xE +%ifidn %1, avg + pavgb m6, m4 +%endif + movd [dstq], m6 + mova m4, m1 +%ifidn %1, avg + pavgb m7, m3 +%endif + movd [dstq+dstrideq], m7 + lea dstq, [dstq+2*dstrideq] + mova m1, m5 + sub hd, 2 + jg .loop + RET +%endmacro + +%macro filter_v_fn 1 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_8_8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_8_8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 mov filteryq, r5mp %define hd r4mp %endif @@ -510,7 +600,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f lea src4q, [srcq+sstrideq] sub srcq, sstride3q mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 mova m8, [filteryq+32] mova m9, [filteryq+64] mova m10, [filteryq+96] @@ -533,7 +623,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f punpcklbw m4, m5 punpcklbw m1, m3 pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 pmaddubsw m2, m8 pmaddubsw m4, m9 pmaddubsw m1, m10 @@ -560,9 +650,9 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f RET %endmacro -INIT_MMX ssse3 -filter_v_fn put -filter_v_fn avg +INIT_XMM ssse3 +filter4_v_fn put +filter4_v_fn avg INIT_XMM ssse3 filter_v_fn put -- 2.49.1 >From 46bbea312aaee1c5ef61cb51007b3ef75feae23f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 2 Dec 2025 13:47:32 +0100 Subject: [PATCH 6/8] avcodec/x86/vp9mc: Avoid MMX regs in width 4 hor 8tap funcs Using wider registers (and pshufb) allows to halve the number of pmaddubsw used. It is also ABI compliant (no more missing emms). 
Old benchmarks: vp9_avg_8tap_smooth_4h_8bpp_c: 97.6 ( 1.00x) vp9_avg_8tap_smooth_4h_8bpp_ssse3: 15.0 ( 6.52x) vp9_avg_8tap_smooth_4hv_8bpp_c: 342.9 ( 1.00x) vp9_avg_8tap_smooth_4hv_8bpp_ssse3: 54.0 ( 6.35x) vp9_put_8tap_smooth_4h_8bpp_c: 94.9 ( 1.00x) vp9_put_8tap_smooth_4h_8bpp_ssse3: 14.2 ( 6.67x) vp9_put_8tap_smooth_4hv_8bpp_c: 325.9 ( 1.00x) vp9_put_8tap_smooth_4hv_8bpp_ssse3: 52.5 ( 6.20x) New benchmarks: vp9_avg_8tap_smooth_4h_8bpp_c: 97.6 ( 1.00x) vp9_avg_8tap_smooth_4h_8bpp_ssse3: 10.8 ( 9.08x) vp9_avg_8tap_smooth_4hv_8bpp_c: 342.4 ( 1.00x) vp9_avg_8tap_smooth_4hv_8bpp_ssse3: 38.8 ( 8.82x) vp9_put_8tap_smooth_4h_8bpp_c: 94.7 ( 1.00x) vp9_put_8tap_smooth_4h_8bpp_ssse3: 9.7 ( 9.75x) vp9_put_8tap_smooth_4hv_8bpp_c: 321.7 ( 1.00x) vp9_put_8tap_smooth_4hv_8bpp_ssse3: 37.0 ( 8.69x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 50 +++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index 495746ee4e..2020c6d3cd 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -114,6 +114,9 @@ FILTER sse2 ; int16_t ff_filters_16bpp[3][15][4][16] FILTER 16bpp +filter4_h_perm0: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +filter4_h_perm1: db 1, 2, 2, 3, 3, 4, 4, 5, 3, 4, 4, 5, 5, 6, 6, 7 + %if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64 ALIGN 64 spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 @@ -280,12 +283,51 @@ INIT_XMM sse2 filter_sse2_h_fn put filter_sse2_h_fn avg +%macro filter4_h_fn 2 +cglobal vp9_%1_8tap_1d_h_4_8, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m2, [filter4_h_perm0] + mova m3, [filter4_h_perm1] + pcmpeqw m4, m4 + movu m5, [filteryq+24] + movu m6, [filteryq+88] + psllw m4, 6 ; pw_m64 +.loop: + movq m0, [srcq-3] + movq m1, [srcq+0] + pshufb m0, m2 + pshufb m1, m3 + pmaddubsw m0, m5 + pmaddubsw m1, m6 +%ifidn %1, avg + movd m7, [dstq] +%endif + add srcq, sstrideq + paddw m0, 
m1 + movhlps m1, m0 + psubw m0, m4 + paddsw m0, m1 + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m7 +%endif + movd [dstq], m0 + add dstq, dstrideq + sub hd, 1 + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter4_h_fn put, 7 +filter4_h_fn avg, 8 + %macro filter_h_fn 1 %assign %%px mmsize/2 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery mova m6, [pw_256] mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 mova m8, [filteryq+32] mova m9, [filteryq+64] mova m10, [filteryq+96] @@ -305,7 +347,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h punpcklbw m4, m5 punpcklbw m1, m3 pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 pmaddubsw m2, m8 pmaddubsw m4, m9 pmaddubsw m1, m10 @@ -332,10 +374,6 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h RET %endmacro -INIT_MMX ssse3 -filter_h_fn put -filter_h_fn avg - INIT_XMM ssse3 filter_h_fn put filter_h_fn avg -- 2.49.1 >From 198333bd75051c37de85fef2ab0c4316e68df150 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 2 Dec 2025 14:27:23 +0100 Subject: [PATCH 7/8] avcodec/x86/vp9mc: Deduplicate coefficient tables Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 72 ++++++++++++---------------------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index 2020c6d3cd..f716eac446 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -53,8 +53,11 @@ times 8 dw %5, %6 times 8 dw %7, %8 %endmacro -%macro FILTER 1 -const filters_%1 ; smooth +%macro FILTER 0-1 + ; smooth +%if %0 > 0 +%1 %+ _smooth: +%endif F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 @@ -71,6 +74,9 @@ const filters_%1 ; smooth F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 ; 
regular +%if %0 > 0 +%1 %+ _regular: +%endif F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 @@ -87,6 +93,9 @@ const filters_%1 ; smooth F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 ; sharp +%if %0 > 0 +%1 %+ _sharp: +%endif F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 @@ -106,13 +115,16 @@ const filters_%1 ; smooth %define F8_TAPS F8_SSSE3_TAPS ; int8_t ff_filters_ssse3[3][15][4][32] -FILTER ssse3 +const filters_ssse3 +FILTER %define F8_TAPS F8_SSE2_TAPS ; int16_t ff_filters_sse2[3][15][8][8] -FILTER sse2 +const filters_sse2 +FILTER %define F8_TAPS F8_16BPP_TAPS ; int16_t ff_filters_16bpp[3][15][4][16] -FILTER 16bpp +const filters_16bpp +FILTER filter4_h_perm0: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 filter4_h_perm1: db 1, 2, 2, 3, 3, 4, 4, 5, 3, 4, 4, 5, 5, 6, 6, 7 @@ -148,51 +160,11 @@ spel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 1 %define spel_h_shufA (spel_h_perm16+ 0) %define spel_h_shufC (spel_h_perm16+16) -vp9_spel_filter_regular: db 0, 1, -5, 126, 8, -3, 1, 0 - db -1, 3, -10, 122, 18, -6, 2, 0 - db -1, 4, -13, 118, 27, -9, 3, -1 - db -1, 4, -16, 112, 37, -11, 4, -1 - db -1, 5, -18, 105, 48, -14, 4, -1 - db -1, 5, -19, 97, 58, -16, 5, -1 - db -1, 6, -19, 88, 68, -18, 5, -1 - db -1, 6, -19, 78, 78, -19, 6, -1 - db -1, 5, -18, 68, 88, -19, 6, -1 - db -1, 5, -16, 58, 97, -19, 5, -1 - db -1, 4, -14, 48, 105, -18, 5, -1 - db -1, 4, -11, 37, 112, -16, 4, -1 - db -1, 3, -9, 27, 118, -13, 4, -1 - db 0, 2, -6, 18, 122, -10, 3, -1 - db 0, 1, -3, 8, 126, -5, 1, 0 -vp9_spel_filter_sharp: db -1, 3, -7, 127, 8, -3, 1, 0 - db -2, 5, -13, 125, 17, -6, 3, -1 - db -3, 7, -17, 121, 27, -10, 5, -2 - db -4, 9, -20, 115, 37, -13, 6, -2 - db -4, 10, -23, 108, 48, -16, 8, -3 - db -4, 10, -24, 100, 59, -19, 9, -3 - db -4, 11, -24, 90, 70, -21, 10, -4 - db -4, 11, -23, 80, 80, -23, 11, -4 - db -4, 
10, -21, 70, 90, -24, 11, -4 - db -3, 9, -19, 59, 100, -24, 10, -4 - db -3, 8, -16, 48, 108, -23, 10, -4 - db -2, 6, -13, 37, 115, -20, 9, -4 - db -2, 5, -10, 27, 121, -17, 7, -3 - db -1, 3, -6, 17, 125, -13, 5, -2 - db 0, 1, -3, 8, 127, -7, 3, -1 -vp9_spel_filter_smooth: db -3, -1, 32, 64, 38, 1, -3, 0 - db -2, -2, 29, 63, 41, 2, -3, 0 - db -2, -2, 26, 63, 43, 4, -4, 0 - db -2, -3, 24, 62, 46, 5, -4, 0 - db -2, -3, 21, 60, 49, 7, -4, 0 - db -1, -4, 18, 59, 51, 9, -4, 0 - db -1, -4, 16, 57, 53, 12, -4, -1 - db -1, -4, 14, 55, 55, 14, -4, -1 - db -1, -4, 12, 53, 57, 16, -4, -1 - db 0, -4, 9, 51, 59, 18, -4, -1 - db 0, -4, 7, 49, 60, 21, -3, -2 - db 0, -4, 5, 46, 62, 24, -3, -2 - db 0, -4, 4, 43, 63, 26, -2, -2 - db 0, -3, 2, 41, 63, 29, -2, -2 - db 0, -3, 1, 38, 64, 32, -1, -3 +%macro F8_AVX512_TAPS 8 +db %1, %2, %3, %4, %5, %6, %7, %8 +%endmacro +%define F8_TAPS F8_AVX512_TAPS +FILTER vp9_spel_filter pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 pd_64: dd 64 -- 2.49.1 >From 917614bfa75bb28f8dfdcc8490abb0d0107615ed Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 2 Dec 2025 17:13:31 +0100 Subject: [PATCH 8/8] avcodec/x86/vp9mc: Reindent after the previous commit Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 96 ++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index f716eac446..61bce584b7 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -54,63 +54,63 @@ times 8 dw %7, %8 %endmacro %macro FILTER 0-1 - ; smooth %if %0 > 0 %1 %+ _smooth: %endif - F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 - F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 - F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 - F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 - F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 - F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 - F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 - F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 - F8_TAPS -1, -4, 12, 53, 
57, 16, -4, -1 - F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 - F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 - F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 - F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 - F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 - F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 - ; regular + ; smooth + F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 + F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 + F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 + F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 + F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 + F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 + F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 + F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 + F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 + F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 + F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 + F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 + F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 + F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 + F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 %if %0 > 0 %1 %+ _regular: %endif - F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 - F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 - F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 - F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 - F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 - F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 - F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 - F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 - F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 - F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 - F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 - F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 - F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 - F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 - F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 - ; sharp + ; regular + F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 + F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 + F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 + F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 + F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 + F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 + F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 + F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 + F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 + F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 + F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 + F8_TAPS -1, 4, -11, 
37, 112, -16, 4, -1 + F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 + F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 + F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 %if %0 > 0 %1 %+ _sharp: %endif - F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 - F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 - F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 - F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 - F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 - F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 - F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 - F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 - F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 - F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 - F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 - F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 - F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 - F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 - F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 + ; sharp + F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 + F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 + F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 + F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 + F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 + F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 + F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 + F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 + F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 + F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 + F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 + F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 + F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 + F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 + F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 %endmacro %define F8_TAPS F8_SSSE3_TAPS -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
