This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 7411998757ab0ba32c19326da6eb0d348abe5129 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sun Nov 23 20:25:26 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Dec 4 15:17:37 2025 +0100 avcodec/x86/vp8dsp: Avoid unpacking multiple times Always pair row i with row i+2 for the vertical four-tap filter and row i+3 for the vertical six-tap filter (instead of pairing the first with the sixth, the second with the third, and the fourth with the fifth). This allows unpacking each row only once instead of (at most) three times. Old benchmarks: vp8_put_epel4_v4_c: 98.4 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.6 ( 3.44x) vp8_put_epel4_v6_c: 131.6 ( 1.00x) vp8_put_epel4_v6_ssse3: 38.5 ( 3.42x) vp8_put_epel8_v4_c: 362.5 ( 1.00x) vp8_put_epel8_v4_sse2: 63.8 ( 5.68x) vp8_put_epel8_v4_ssse3: 44.4 ( 8.16x) vp8_put_epel8_v6_c: 538.3 ( 1.00x) vp8_put_epel8_v6_sse2: 86.5 ( 6.22x) vp8_put_epel8_v6_ssse3: 57.0 ( 9.44x) vp8_put_epel16_v6_c: 1044.6 ( 1.00x) vp8_put_epel16_v6_sse2: 158.0 ( 6.61x) vp8_put_epel16_v6_ssse3: 106.7 ( 9.79x) New benchmarks: vp8_put_epel4_v4_c: 100.0 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.4 ( 3.52x) vp8_put_epel4_v6_c: 131.7 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.84x) vp8_put_epel8_v4_c: 364.4 ( 1.00x) vp8_put_epel8_v4_sse2: 63.7 ( 5.72x) vp8_put_epel8_v4_ssse3: 43.3 ( 8.42x) vp8_put_epel8_v6_c: 550.2 ( 1.00x) vp8_put_epel8_v6_sse2: 86.4 ( 6.37x) vp8_put_epel8_v6_ssse3: 52.9 (10.40x) vp8_put_epel16_v6_c: 1052.5 ( 1.00x) vp8_put_epel16_v6_sse2: 158.3 ( 6.65x) vp8_put_epel16_v6_ssse3: 98.9 (10.64x) Reviewed-by: Ronald S. 
Bultje <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 68 ++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index e971da68ac..7cb729a443 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -33,6 +33,15 @@ fourtap_filter_hb_m: times 8 db -6, 123 times 8 db -1, 12 times 8 db 123, -6 +fourtap_filter_b_m: times 8 db -6, 12 + times 8 db 123, -1 + times 8 db -9, 50 + times 8 db 93, -6 + times 8 db -6, 93 + times 8 db 50, -9 + times 8 db -1, 123 + times 8 db 12, -6 + sixtap_filter_hb_m: times 8 db 2, 1 times 8 db -11, 108 times 8 db 36, -8 @@ -43,6 +52,16 @@ sixtap_filter_hb_m: times 8 db 2, 1 times 8 db -8, 36 times 8 db 108, -11 +sixtap_filter_b_m: times 8 db 2, 36 + times 8 db -11, -8 + times 8 db 108, 1 + times 8 db 3, 77 + times 8 db -16, -16 + times 8 db 77, 3 + times 8 db 1, 108 + times 8 db -8, -11 + times 8 db 36, 2 + fourtap_filter_v_m: times 8 dw -6 times 8 dw 123 times 8 dw 12 @@ -97,7 +116,9 @@ bilinear_filter_vb_m: times 8 db 7, 1 %if PIC %define fourtap_filter_hb picregq +%define fourtap_filter_b picregq %define sixtap_filter_hb picregq +%define sixtap_filter_b picregq %define fourtap_filter_v picregq %define sixtap_filter_v picregq %define bilinear_filter_vw picregq @@ -105,7 +126,9 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define npicregs 1 %else %define fourtap_filter_hb fourtap_filter_hb_m +%define fourtap_filter_b fourtap_filter_b_m %define sixtap_filter_hb sixtap_filter_hb_m +%define sixtap_filter_b sixtap_filter_b_m %define fourtap_filter_v fourtap_filter_v_m %define sixtap_filter_v sixtap_filter_v_m %define bilinear_filter_vw bilinear_filter_vw_m @@ -212,10 +235,10 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 %if PIC - lea 
picregq, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_b_m] %endif - mova m5, [fourtap_filter_hb+myq-16] - mova m6, [fourtap_filter_hb+myq] + mova m5, [fourtap_filter_b+myq-16] + mova m6, [fourtap_filter_b+myq] mova m7, [pw_256] ; read 3 lines @@ -224,21 +247,20 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr movh m0, [srcq+picregq] movh m1, [srcq] movh m2, [srcq+srcstrideq] + punpcklbw m0, m2 .nextrow: movh m3, [srcq+2*srcstrideq] ; read new row - mova m4, m0 + pmaddubsw m0, m5 + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + add srcq, srcstrideq + paddsw m4, m0 mova m0, m1 - punpcklbw m4, m1 - mova m1, m2 - punpcklbw m2, m3 - pmaddubsw m4, m5 - pmaddubsw m2, m6 - add srcq, srcstrideq - paddsw m4, m2 - mova m2, m3 pmulhrsw m4, m7 + mova m1, m2 packuswb m4, m4 + mova m2, m3 movh [dstq], m4 ; go to next line @@ -250,9 +272,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my lea myd, [myq*3] %if PIC - lea picregq, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_b_m] %endif - lea myq, [sixtap_filter_hb+myq*8] + lea myq, [sixtap_filter_b+myq*8] ; read 5 lines mov picregq, srcstrideq @@ -263,20 +285,18 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr movh m3, [srcq+srcstrideq] movh m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] + punpcklbw m0, m3 + punpcklbw m1, m4 .nextrow: movh m5, [srcq+srcstrideq] ; read new row - mova m6, m0 - punpcklbw m6, m5 + pmaddubsw m0, [myq-48] + punpcklbw m2, m5 + pmaddubsw m6, m1, [myq-32] + pmaddubsw m7, m2, [myq-16] + add srcq, srcstrideq + paddw m6, m0 mova m0, m1 - punpcklbw m1, m2 - mova m7, m3 - punpcklbw m7, m4 - pmaddubsw m6, [myq-48] - pmaddubsw m1, [myq-32] - pmaddubsw m7, [myq-16] - add srcq, srcstrideq - paddsw m6, m1 paddsw m6, m7 mova m1, m2 mova m2, m3 _______________________________________________ ffmpeg-cvslog mailing 
list -- [email protected] To unsubscribe send an email to [email protected]
