This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit fd711adcb61e2b2e9c6a8bef2006f9e00e84ab35 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Mon May 4 19:51:29 2026 +0200 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Wed Jul 1 20:00:46 2026 +0200 avcodec/x86/vc1dsp_mc: Add size 8 horizontal SSSE3 mc functions pmaddubsw strikes again. vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_c: 150.2 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_mmxext: 44.5 ( 3.38x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_ssse3: 18.5 ( 8.12x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_c: 288.2 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_mmxext: 37.7 ( 7.64x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_ssse3: 18.1 (15.97x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_c: 155.4 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_mmxext: 46.5 ( 3.34x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_ssse3: 18.1 ( 8.60x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_c: 282.2 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_mmx: 42.7 ( 6.61x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_ssse3: 16.4 (17.16x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_c: 223.4 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_mmx: 36.3 ( 6.15x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_ssse3: 16.4 (13.59x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_c: 255.2 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_mmx: 43.6 ( 5.85x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_ssse3: 16.4 (15.52x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_init.c | 12 ++++++++ libavcodec/x86/vc1dsp_mc.asm | 65 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 3f0eb5746c..29c28fb2b7 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -89,6 +89,14 @@ void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block); +#define MSPEL_FUNC(OP, X, Y, SIZE, XMM) \ + void ff_vc1_ ## OP ## _mspel_mc ## X ## Y ## _ ## SIZE ##_ ## XMM \ + (uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd); \ + dsp->OP ## _vc1_mspel_pixels_tab[SIZE == 8][X + 4 * Y] = \ + ff_vc1_ ## OP ## _mspel_mc ## X ## Y## _ ## SIZE ##_ ## XMM +#define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \ + MSPEL_FUNC(put, X, Y, SIZE, XMM); \ + MSPEL_FUNC(avg, X, Y, SIZE, XMM) av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) { @@ -132,6 +140,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) ASSIGN_LF816(ssse3); dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; + + MSPEL_FUNCS_SIZE(1, 0, 8, ssse3); + MSPEL_FUNCS_SIZE(2, 0, 8, ssse3); + MSPEL_FUNCS_SIZE(3, 0, 8, ssse3); } if (EXTERNAL_SSE4(cpu_flags)) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index 1cb62ac409..f6204afb23 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -21,6 +21,12 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA + +pb_m4_36: times 8 db -4, 36 +pb_m4_53: times 8 db -4, 53 +pb_m3_18: times 8 db -3, 18 + cextern pw_9 cextern pw_128 @@ -192,3 +198,62 @@ HOR_16B_SHIFT2 OP_PUT, put INIT_MMX mmxext HOR_16B_SHIFT2 OP_AVG, avg %endif ; HAVE_MMX_INLINE + +%define MOV8 movq + +INIT_XMM ssse3 +%macro HOR_8B 2 + +cglobal vc1_%1_mspel_mc10_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_53] + mova m2, [pb_m3_18] + sub rndd, 32 + jmp vc1_%1_mspel_mc30_%2_after_prologue + +cglobal vc1_%1_mspel_mc20_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_36] + lea rndd, [4*rndd-32] + mova m2, m1 + jmp vc1_%1_mspel_mc30_%2_after_prologue + +cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd + mova m2, [pb_m4_53] + mova m1, [pb_m3_18] + sub rndd, 32 + +vc1_%1_mspel_mc30_%2_after_prologue: + movd m0, rndd + WIN64_SPILL_XMM 7 +%define hd rndd + mov hd, %2 + SPLATW m0, m0 +.loop: + MOV%2 m3, [srcq-1] + MOV%2 m4, [srcq] + MOV%2 m5, [srcq+1] + MOV%2 m6, [srcq+2] + + punpcklbw m3, m4 + pmaddubsw m3, m1 +%ifidn %1,avg + movq m4, [dstq] +%endif + punpcklbw m6, m5 + pmaddubsw m6, m2 + add srcq, strideq + psubw m3, m0 + paddw m3, m6 + psraw m3, 6 + packuswb m3, m3 +%ifidn %1,avg + pavgb m3, m4 +%endif + movq [dstq], m3 + add dstq, strideq + dec hd + jnz .loop + RET +%endmacro + +HOR_8B put, 8 +HOR_8B avg, 8 _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
