This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 3135bc0d3a5c1f4fd0cd006eb1f5e1986aeb4ab7 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Mon Nov 24 13:29:42 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Dec 4 15:17:37 2025 +0100 avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h4_ssse3 Doubling the register width makes it possible to use only one pshufb and one pmaddubsw. Old benchmarks: vp8_put_epel4_h4_c: 82.8 ( 1.00x) vp8_put_epel4_h4_ssse3: 13.9 ( 5.96x) New benchmarks: vp8_put_epel4_h4_c: 82.7 ( 1.00x) vp8_put_epel4_h4_ssse3: 11.7 ( 7.08x) Reviewed-by: Ronald S. Bultje <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index fd60feaf1f..6c365898ce 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -24,6 +24,15 @@ SECTION_RODATA +fourtap_filter4_b_m: times 4 db -6, 123 + times 4 db 12, -1 + times 4 db -9, 93 + times 4 db 50, -6 + times 4 db -6, 50 + times 4 db 93, -9 + times 4 db -1, 12 + times 4 db 123, -6 + fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -117,6 +126,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %if PIC %define fourtap_filter_hb picregq %define fourtap_filter_b picregq +%define fourtap_filter4_b picregq %define sixtap_filter_hb picregq %define sixtap_filter_b picregq %define fourtap_filter_v picregq @@ -127,6 +137,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %else %define fourtap_filter_hb fourtap_filter_hb_m %define fourtap_filter_b fourtap_filter_b_m +%define fourtap_filter4_b fourtap_filter4_b_m %define sixtap_filter_hb sixtap_filter_hb_m %define sixtap_filter_b sixtap_filter_b_m %define fourtap_filter_v fourtap_filter_v_m @@ -136,6 +147,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define npicregs 0 %endif +filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 filter_h2_shuf: 
db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 @@ -208,9 +220,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h jg .nextrow RET -cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 +INIT_XMM ssse3 +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg mova m2, [pw_256] +%if %1 == 8 + shl mxd, 4 mova m3, [filter_h2_shuf] mova m4, [filter_h4_shuf] %if PIC @@ -218,19 +232,34 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h %endif mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes mova m6, [fourtap_filter_hb+mxq] +%else + shl mxd, 3 + mova m3, [filter4_h4_shuf] +%if PIC + lea picregq, [fourtap_filter4_b_m] +%endif + mova m5, [fourtap_filter4_b+mxq-8] +%endif .nextrow: +%if %1 == 4 + movq m0, [srcq-1] + pshufb m0, m3 + pmaddubsw m0, m5 + movhlps m1, m0 +%else movu m0, [srcq-1] mova m1, m0 pshufb m0, m3 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m6 +%endif add srcq, srcstrideq paddsw m0, m1 pmulhrsw m0, m2 packuswb m0, m0 - movh [dstq], m0 ; store + MOV [dstq], m0 ; store ; go to next line add dstq, dststrideq @@ -238,7 +267,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h jg .nextrow RET -INIT_XMM ssse3 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 %if PIC _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
