PR #21081 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081.patch
Also remove some unused functions. For the RISCV stuff (the penultimate commit) only compilation was tested. >From e495162f74195c6ef6060a2d8034f0a715425b2d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 11:08:14 +0100 Subject: [PATCH 01/15] avcodec/x86/vp8dsp: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMX(EXT) functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 159 +---------------------------------- libavcodec/x86/vp8dsp_init.c | 37 +------- 2 files changed, 6 insertions(+), 190 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 231c21ea0d..7b836351e4 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* VP8 MMXEXT optimizations +;* VP8 ASM optimizations ;* Copyright (c) 2010 Ronald S. 
Bultje <[email protected]> ;* Copyright (c) 2010 Fiona Glaser <[email protected]> ;* @@ -24,25 +24,6 @@ SECTION_RODATA -fourtap_filter_hw_m: times 4 dw -6, 123 - times 4 dw 12, -1 - times 4 dw -9, 93 - times 4 dw 50, -6 - times 4 dw -6, 50 - times 4 dw 93, -9 - times 4 dw -1, 12 - times 4 dw 123, -6 - -sixtap_filter_hw_m: times 4 dw 2, -11 - times 4 dw 108, 36 - times 4 dw -8, 1 - times 4 dw 3, -16 - times 4 dw 77, 77 - times 4 dw -16, 3 - times 4 dw 1, -8 - times 4 dw 36, 108 - times 4 dw -11, 2 - fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %if PIC -%define fourtap_filter_hw picregq -%define sixtap_filter_hw picregq %define fourtap_filter_hb picregq %define sixtap_filter_hb picregq %define fourtap_filter_v picregq @@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define bilinear_filter_vb picregq %define npicregs 1 %else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m %define fourtap_filter_hb fourtap_filter_hb_m %define sixtap_filter_hb sixtap_filter_hb_m %define fourtap_filter_v fourtap_filter_v_m @@ -322,112 +299,6 @@ FILTER_SSSE3 4 INIT_XMM ssse3 FILTER_SSSE3 8 -; 4x4 block, H-only 4-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%if PIC - lea picregq, [fourtap_filter_hw_m] -%endif - movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+mxq] - movq mm7, [pw_64] - pxor mm6, mm6 - -.nextrow: - movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm6 ; byte->word ABCD - pshufw mm0, mm2, 9 ; byte CDEF.. 
- punpcklbw mm0, mm6 ; byte->word CDEF - pshufw mm3, mm1, 0x94 ; word ABBC - pshufw mm1, mm0, 0x94 ; word CDDE - pmaddwd mm3, mm4 ; multiply 2px with F0/F1 - movq mm0, mm1 ; backup for second set of pixels - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm3, mm1 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - punpckhbw mm2, mm6 ; byte->word EFGH - pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 - pshufw mm1, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm0, mm1 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm3, mm0 ; merge dword->word (4px) - paddsw mm3, mm7 ; rounding - psraw mm3, 7 - packuswb mm3, mm6 ; clip and word->bytes - movd [dstq], mm3 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - -; 4x4 block, H-only 6-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] -%if PIC - lea picregq, [sixtap_filter_hw_m] -%endif - movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+mxq*8-32] - movq mm6, [sixtap_filter_hw+mxq*8-16] - movq mm7, [pw_64] - pxor mm3, mm3 - -.nextrow: - movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm3 ; byte->word ABCD - pshufw mm0, mm2, 0x9 ; byte CDEF.. 
- punpckhbw mm2, mm3 ; byte->word EFGH - punpcklbw mm0, mm3 ; byte->word CDEF - pshufw mm1, mm1, 0x94 ; word ABBC - pshufw mm2, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm4 ; multiply 2px with F0/F1 - pshufw mm3, mm0, 0x94 ; word CDDE - movq mm0, mm3 ; backup for second set of pixels - pmaddwd mm3, mm5 ; multiply 2px with F2/F3 - paddd mm1, mm3 ; add to 1st 2px cache - movq mm3, mm2 ; backup for second set of pixels - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm1, mm2 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - movd mm2, [srcq+3] ; byte FGHI (prevent overreads) - pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 - pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 - paddd mm0, mm3 ; add to 2nd 2px cache - pxor mm3, mm3 - punpcklbw mm2, mm3 ; byte->word FGHI - pshufw mm2, mm2, 0xE9 ; word GHHI - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm0, mm2 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm1, mm0 ; merge dword->word (4px) - paddsw mm1, mm7 ; rounding - psraw mm1, 7 - packuswb mm1, mm3 ; clip and word->bytes - movd [dstq], mm1 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - INIT_XMM sse2 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg shl mxd, 5 @@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h jg .nextrow RET -%macro FILTER_V 1 +INIT_XMM sse2 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my +cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 5 %if PIC lea picregq, [fourtap_filter_v_m] @@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picreg, my +cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 lea myq, [myq*3] %if PIC @@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr dec heightd ; next row jg .nextrow RET -%endmacro - -INIT_MMX mmxext -FILTER_V 4 -INIT_XMM sse2 -FILTER_V 8 %macro FILTER_BILINEAR 1 %if cpuflag(ssse3) @@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride RET %endmacro -INIT_MMX mmxext -FILTER_BILINEAR 4 INIT_XMM sse2 FILTER_BILINEAR 8 INIT_MMX ssse3 diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index e37afab775..00733a2564 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -29,19 +29,6 @@ /* * MC functions */ -void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, 
ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT dst, dststride, tmpptr, SIZE, height, mx, my); \ } -#define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) - -HVTAPMMX(4, 4) -HVTAPMMX(4, 6) -HVTAPMMX(6, 4) -HVTAPMMX(6, 6) - #define HVTAPSSE2(x, y, w) \ HVTAP(sse2, 16, x, y, w, 16) \ HVTAP(ssse3, 16, x, y, w, 16) @@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ dst, dststride, tmp, SIZE, height, mx, my); \ } -HVBILIN(mmxext, 8, 4, 8) HVBILIN(sse2, 8, 8, 16) HVBILIN(sse2, 8, 16, 16) HVBILIN(ssse3, 8, 4, 8) @@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; } - /* note that 4-tap width=16 functions are missing 
because w=16 - * is only used for luma, and luma is always a copy or sixtap. */ - if (EXTERNAL_MMXEXT(cpu_flags)) { - VP8_MC_FUNC(2, 4, mmxext); - VP8_BILINEAR_MC_FUNC(2, 4, mmxext); - } - if (EXTERNAL_SSE(cpu_flags)) { c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; @@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) VP8_BILINEAR_MC_FUNC(1, 8, sse2); } + /* note that 4-tap width=16 functions are missing because w=16 + * is only used for luma, and luma is always a copy or sixtap. */ if (EXTERNAL_SSSE3(cpu_flags)) { VP8_LUMA_MC_FUNC(0, 16, ssse3); VP8_MC_FUNC(1, 8, ssse3); -- 2.49.1 >From 3fd1685e3d4cbde7f8754c91911e70ea780ce52b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 11:25:26 +0100 Subject: [PATCH 02/15] avcodec/x86/vp8dsp: Don't use MMX registers in put_vp8_pixels8 Use GPRs on x64 and xmm registers else (using GPRs reduces codesize). This avoids clobbering the floating point state and therefore no longer breaks the ABI. No change in benchmarks here. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 20 ++++++++++++++------ libavcodec/x86/vp8dsp_init.c | 9 +++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 7b836351e4..7dee979e20 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -676,14 +676,22 @@ FILTER_BILINEAR 4 INIT_XMM ssse3 FILTER_BILINEAR 8 -INIT_MMX mmx -cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height +INIT_XMM sse2 +cglobal put_vp8_pixels8, 5, 5+2*ARCH_X86_64, 2, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [srcq+srcstrideq*0] - movq mm1, [srcq+srcstrideq*1] +%if ARCH_X86_64 + mov r5q, [srcq+srcstrideq*0] + mov r6q, [srcq+srcstrideq*1] lea srcq, [srcq+srcstrideq*2] - movq [dstq+dststrideq*0], mm0 - movq [dstq+dststrideq*1], mm1 + mov [dstq+dststrideq*0], r5q + mov [dstq+dststrideq*1], r6q +%else + movq m0, [srcq+srcstrideq*0] + movq m1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0], m0 + movq [dstq+dststrideq*1], m1 +%endif lea dstq, [dstq+dststrideq*2] sub heightd, 2 jg .nextrow diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index 00733a2564..40aa52c7f0 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -88,7 +88,7 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, int height, int mx, int my); -void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, +void ff_put_vp8_pixels8_sse2(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, @@ -252,17 +252,14 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - c->put_vp8_epel_pixels_tab[1][0][0] = - c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; - } - if 
(EXTERNAL_SSE(cpu_flags)) { c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } if (EXTERNAL_SSE2_SLOW(cpu_flags)) { + c->put_vp8_epel_pixels_tab[1][0][0] = + c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_sse2; VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); -- 2.49.1 >From a08ac2daa09f50bfe9ff84aec746a9b4c7b80a36 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 12:53:12 +0100 Subject: [PATCH 03/15] avcodec/x86/vp8dsp: Directly use negated stride There is a register available. No change in benchmarks here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 44 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 7dee979e20..6b5ca9f309 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -219,11 +219,11 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr mova m7, [pw_256] ; read 3 lines - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+ srcstrideq] - movh m2, [srcq+2*srcstrideq] - add srcq, srcstrideq + mov picregq, srcstrideq + neg picregq + movh m0, [srcq+picregq] + movh m1, [srcq] + movh m2, [srcq+srcstrideq] .nextrow: movh m3, [srcq+2*srcstrideq] ; read new row @@ -255,18 +255,17 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr lea myq, [sixtap_filter_hb+myq*8] ; read 5 lines - sub srcq, srcstrideq - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] + mov picregq, srcstrideq + neg picregq + movh m0, [srcq+2*picregq] + movh m1, [srcq+picregq] + movh m2, [srcq] + movh m3, [srcq+srcstrideq] + movh m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - movh m3, [srcq] - movh m4, [srcq+srcstrideq] 
.nextrow: - movh m5, [srcq+2*srcstrideq] ; read new row + movh m5, [srcq+srcstrideq] ; read new row mova m6, m0 punpcklbw m6, m5 mova m0, m1 @@ -475,15 +474,14 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre pxor m7, m7 ; read 5 lines - sub srcq, srcstrideq - sub srcq, srcstrideq - movh m0, [srcq] - movh m1, [srcq+srcstrideq] - movh m2, [srcq+srcstrideq*2] + mov picregq, srcstrideq + neg picregq + movh m0, [srcq+2*picregq] + movh m1, [srcq+picregq] + movh m2, [srcq] + movh m3, [srcq+srcstrideq] + movh m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] - add srcq, srcstrideq - movh m3, [srcq] - movh m4, [srcq+srcstrideq] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -499,7 +497,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre paddsw m6, m5 ; then calculate positive taps - movh m5, [srcq+2*srcstrideq] ; read new row + movh m5, [srcq+srcstrideq] ; read new row punpcklbw m5, m7 pmullw m0, [myq+0] paddsw m6, m0 -- 2.49.1 >From 456ecec84197e6be99b1811fb0eda5722df47da9 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 13:15:07 +0100 Subject: [PATCH 04/15] avcodec/x86/vp8dsp: Increment src pointer earlier Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 6b5ca9f309..0d37012e9d 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -166,6 +166,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h pmaddubsw m0, m5 pmaddubsw m1, m6 pmaddubsw m2, m7 + add srcq, srcstrideq paddsw m0, m1 paddsw m0, m2 pmulhrsw m0, [pw_256] @@ -174,7 +175,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET 
@@ -197,6 +197,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m6 + add srcq, srcstrideq paddsw m0, m1 pmulhrsw m0, m2 packuswb m0, m0 @@ -204,7 +205,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -234,6 +234,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr punpcklbw m2, m3 pmaddubsw m4, m5 pmaddubsw m2, m6 + add srcq, srcstrideq paddsw m4, m2 mova m2, m3 pmulhrsw m4, m7 @@ -242,7 +243,6 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -275,6 +275,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr pmaddubsw m6, [myq-48] pmaddubsw m1, [myq-32] pmaddubsw m7, [myq-16] + add srcq, srcstrideq paddsw m6, m1 paddsw m6, m7 mova m1, m2 @@ -287,7 +288,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -331,6 +331,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h pmullw m2, [mxq+32] pmullw m3, [mxq+48] %endif + add srcq, srcstrideq paddsw m0, m1 paddsw m2, m3 paddsw m0, m2 @@ -341,7 +342,6 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -392,6 +392,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h pmullw m4, [mxq+64] pmullw m5, [mxq+80] %endif + add srcq, srcstrideq paddsw m1, m4 paddsw m0, m5 paddsw m1, m2 @@ -404,7 +405,6 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, 
srcstride, h ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -446,6 +446,7 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre paddsw m4, m1 mova m1, m2 pmullw m2, [myq+32] + add srcq, srcstrideq paddsw m4, m2 mova m2, m3 @@ -457,7 +458,6 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -507,6 +507,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre paddsw m6, m2 mova m2, m3 pmullw m3, [myq+48] + add srcq, srcstrideq paddsw m6, m3 mova m3, m4 mova m4, m5 @@ -521,7 +522,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre ; go to next line add dstq, dststrideq - add srcq, srcstrideq dec heightd ; next row jg .nextrow RET @@ -543,6 +543,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p punpcklbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 + lea srcq, [srcq+srcstrideq*2] psraw m0, 2 psraw m1, 2 pavgw m0, m4 @@ -579,6 +580,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p pmullw m1, m5 pmullw m2, m4 pmullw m3, m5 + lea srcq, [srcq+srcstrideq*2] paddsw m0, m1 paddsw m2, m3 psraw m0, 2 @@ -591,7 +593,6 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] sub heightd, 2 jg .nextrow RET @@ -612,6 +613,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride pshufb m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 + lea srcq, [srcq+srcstrideq*2] psraw m0, 2 psraw m1, 2 pavgw m0, m4 @@ -649,6 +651,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride pmullw m1, m5 pmullw m2, m4 pmullw m3, m5 + lea srcq, [srcq+srcstrideq*2] paddsw m0, m1 paddsw m2, m3 
psraw m0, 2 @@ -661,7 +664,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] - lea srcq, [srcq+srcstrideq*2] sub heightd, 2 jg .nextrow RET -- 2.49.1 >From 936f8412aff35236d0f2c786aafa40d75331a640 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 13:27:35 +0100 Subject: [PATCH 05/15] avcodec/x86/vp8dsp: Avoid reload Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 0d37012e9d..e971da68ac 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -535,8 +535,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p %endif pxor m4, m4 mova m3, [bilinear_filter_vb+myq-16] -.nextrow: movh m0, [srcq+srcstrideq*0] +.nextrow: movh m1, [srcq+srcstrideq*1] movh m2, [srcq+srcstrideq*2] punpcklbw m0, m1 @@ -558,6 +558,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 %endif + mova m0, m2 %else ; cpuflag(ssse3) cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 -- 2.49.1 >From 15d229859aa0d7804791f70100fd55738925560a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 15:39:48 +0100 Subject: [PATCH 06/15] avcodec/x86/vp8dsp_init: Remove unused macro Forgotten in 6a551f14050674fb685920eb1b0640810cacccf9. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp_init.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index 40aa52c7f0..828b038cdf 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -105,16 +105,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ dst + 8, dststride, src + 8, srcstride, height, mx, my); \ } -#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ -static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ - uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ - ptrdiff_t srcstride, int height, int mx, int my) \ -{ \ - ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ - dst, dststride, src, srcstride, height, mx, my); \ - ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ - dst + 4, dststride, src + 4, srcstride, height, mx, my); \ -} TAP_W16(sse2, epel, h6) TAP_W16(sse2, epel, v6) -- 2.49.1 >From 1b99c21a689f61a8dbac5dfd7ec4dc46b3ffd698 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 20:25:26 +0100 Subject: [PATCH 07/15] avcodec/x86/vp8dsp: Avoid unpacking multiple times Always pair row i with row i+2 for the vertical four-tap filter and row i+3 for the vertical six-tap filter (instead of pairing the first with the sixth, the second with the third and the fourth with the fifth). This makes it possible to unpack each row only once instead of (at most) three times. 
Old benchmarks: vp8_put_epel4_v4_c: 98.4 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.6 ( 3.44x) vp8_put_epel4_v6_c: 131.6 ( 1.00x) vp8_put_epel4_v6_ssse3: 38.5 ( 3.42x) vp8_put_epel8_v4_c: 362.5 ( 1.00x) vp8_put_epel8_v4_sse2: 63.8 ( 5.68x) vp8_put_epel8_v4_ssse3: 44.4 ( 8.16x) vp8_put_epel8_v6_c: 538.3 ( 1.00x) vp8_put_epel8_v6_sse2: 86.5 ( 6.22x) vp8_put_epel8_v6_ssse3: 57.0 ( 9.44x) vp8_put_epel16_v6_c: 1044.6 ( 1.00x) vp8_put_epel16_v6_sse2: 158.0 ( 6.61x) vp8_put_epel16_v6_ssse3: 106.7 ( 9.79x) New benchmarks: vp8_put_epel4_v4_c: 100.0 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.4 ( 3.52x) vp8_put_epel4_v6_c: 131.7 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.84x) vp8_put_epel8_v4_c: 364.4 ( 1.00x) vp8_put_epel8_v4_sse2: 63.7 ( 5.72x) vp8_put_epel8_v4_ssse3: 43.3 ( 8.42x) vp8_put_epel8_v6_c: 550.2 ( 1.00x) vp8_put_epel8_v6_sse2: 86.4 ( 6.37x) vp8_put_epel8_v6_ssse3: 52.9 (10.40x) vp8_put_epel16_v6_c: 1052.5 ( 1.00x) vp8_put_epel16_v6_sse2: 158.3 ( 6.65x) vp8_put_epel16_v6_ssse3: 98.9 (10.64x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 68 +++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index e971da68ac..7cb729a443 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -33,6 +33,15 @@ fourtap_filter_hb_m: times 8 db -6, 123 times 8 db -1, 12 times 8 db 123, -6 +fourtap_filter_b_m: times 8 db -6, 12 + times 8 db 123, -1 + times 8 db -9, 50 + times 8 db 93, -6 + times 8 db -6, 93 + times 8 db 50, -9 + times 8 db -1, 123 + times 8 db 12, -6 + sixtap_filter_hb_m: times 8 db 2, 1 times 8 db -11, 108 times 8 db 36, -8 @@ -43,6 +52,16 @@ sixtap_filter_hb_m: times 8 db 2, 1 times 8 db -8, 36 times 8 db 108, -11 +sixtap_filter_b_m: times 8 db 2, 36 + times 8 db -11, -8 + times 8 db 108, 1 + times 8 db 3, 77 + times 8 db -16, -16 + times 8 db 77, 3 + times 8 db 1, 108 + times 8 db -8, -11 + times 8 db 36, 2 + 
fourtap_filter_v_m: times 8 dw -6 times 8 dw 123 times 8 dw 12 @@ -97,7 +116,9 @@ bilinear_filter_vb_m: times 8 db 7, 1 %if PIC %define fourtap_filter_hb picregq +%define fourtap_filter_b picregq %define sixtap_filter_hb picregq +%define sixtap_filter_b picregq %define fourtap_filter_v picregq %define sixtap_filter_v picregq %define bilinear_filter_vw picregq @@ -105,7 +126,9 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define npicregs 1 %else %define fourtap_filter_hb fourtap_filter_hb_m +%define fourtap_filter_b fourtap_filter_b_m %define sixtap_filter_hb sixtap_filter_hb_m +%define sixtap_filter_b sixtap_filter_b_m %define fourtap_filter_v fourtap_filter_v_m %define sixtap_filter_v sixtap_filter_v_m %define bilinear_filter_vw bilinear_filter_vw_m @@ -212,10 +235,10 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 %if PIC - lea picregq, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_b_m] %endif - mova m5, [fourtap_filter_hb+myq-16] - mova m6, [fourtap_filter_hb+myq] + mova m5, [fourtap_filter_b+myq-16] + mova m6, [fourtap_filter_b+myq] mova m7, [pw_256] ; read 3 lines @@ -224,21 +247,20 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr movh m0, [srcq+picregq] movh m1, [srcq] movh m2, [srcq+srcstrideq] + punpcklbw m0, m2 .nextrow: movh m3, [srcq+2*srcstrideq] ; read new row - mova m4, m0 + pmaddubsw m0, m5 + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + add srcq, srcstrideq + paddsw m4, m0 mova m0, m1 - punpcklbw m4, m1 - mova m1, m2 - punpcklbw m2, m3 - pmaddubsw m4, m5 - pmaddubsw m2, m6 - add srcq, srcstrideq - paddsw m4, m2 - mova m2, m3 pmulhrsw m4, m7 + mova m1, m2 packuswb m4, m4 + mova m2, m3 movh [dstq], m4 ; go to next line @@ -250,9 +272,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, 
height, picreg, my lea myd, [myq*3] %if PIC - lea picregq, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_b_m] %endif - lea myq, [sixtap_filter_hb+myq*8] + lea myq, [sixtap_filter_b+myq*8] ; read 5 lines mov picregq, srcstrideq @@ -263,20 +285,18 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr movh m3, [srcq+srcstrideq] movh m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] + punpcklbw m0, m3 + punpcklbw m1, m4 .nextrow: movh m5, [srcq+srcstrideq] ; read new row - mova m6, m0 - punpcklbw m6, m5 + pmaddubsw m0, [myq-48] + punpcklbw m2, m5 + pmaddubsw m6, m1, [myq-32] + pmaddubsw m7, m2, [myq-16] + add srcq, srcstrideq + paddw m6, m0 mova m0, m1 - punpcklbw m1, m2 - mova m7, m3 - punpcklbw m7, m4 - pmaddubsw m6, [myq-48] - pmaddubsw m1, [myq-32] - pmaddubsw m7, [myq-16] - add srcq, srcstrideq - paddsw m6, m1 paddsw m6, m7 mova m1, m2 mova m2, m3 -- 2.49.1 >From 25836faa4f89001299f9faa75f00a2bc8d55d0ea Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 23 Nov 2025 23:29:24 +0100 Subject: [PATCH 08/15] avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v6_ssse3 Switching to xmm registers allows processing two rows in parallel, leading to speedups. It is also ABI compliant (no more missing emms). 
Old benchmarks: vp8_put_epel4_v6_c: 132.8 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.87x) New benchmarks: vp8_put_epel4_v6_c: 131.5 ( 1.00x) vp8_put_epel4_v6_ssse3: 27.1 ( 4.86x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 48 +++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 7cb729a443..4778944ac7 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -162,6 +162,12 @@ SECTION .text ;------------------------------------------------------------------------------- %macro FILTER_SSSE3 1 +%if %1 == 4 +%define MOV movd +%else +%define MOV movq +%endif + cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] @@ -269,6 +275,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr jg .nextrow RET +INIT_XMM ssse3 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my lea myd, [myq*3] %if PIC @@ -279,14 +286,44 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; read 5 lines mov picregq, srcstrideq neg picregq - movh m0, [srcq+2*picregq] - movh m1, [srcq+picregq] - movh m2, [srcq] - movh m3, [srcq+srcstrideq] - movh m4, [srcq+2*srcstrideq] + MOV m0, [srcq+2*picregq] + MOV m1, [srcq+picregq] + MOV m2, [srcq] + MOV m3, [srcq+srcstrideq] + MOV m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] punpcklbw m0, m3 punpcklbw m1, m4 +%if %1 == 4 + punpcklqdq m0, m1 + +.next2rows: + movd m5, [srcq+srcstrideq] + movd m6, [srcq+2*srcstrideq] + pmaddubsw m0, [myq-48] + punpcklbw m2, m5 + punpcklqdq m1, m2 + pmaddubsw m1, [myq-32] + punpcklbw m3, m6 + punpcklqdq m2, m3 + paddw m0, m1 + pmaddubsw m1, m2, [myq-16] + lea srcq, [srcq+2*srcstrideq] + paddsw m1, m0 + mova m0, m2 + pmulhrsw m1, [pw_256] + mova m2, m4 + packuswb m1, m1 + movd 
[dstq], m1 + mova m4, m6 + psrldq m1, 4 + movd [dstq+dststrideq], m1 + lea dstq, [dstq+2*dststrideq] + mova m1, m3 + mova m3, m5 + sub heightd, 2 + jg .next2rows +%else .nextrow: movh m5, [srcq+srcstrideq] ; read new row @@ -310,6 +347,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr add dstq, dststrideq dec heightd ; next row jg .nextrow +%endif RET %endmacro -- 2.49.1 >From 9755f51400a2668ddb05d92932badf26bb0c9723 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 09:16:26 +0100 Subject: [PATCH 09/15] avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v4_ssse3 Switching to xmm registers allows processing two rows in parallel, leading to speedups. It is also ABI compliant (no more missing emms). Old benchmarks: vp8_put_epel4_v4_c: 96.8 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.2 ( 3.43x) New benchmarks: vp8_put_epel4_v4_c: 95.1 ( 1.00x) vp8_put_epel4_v4_ssse3: 22.8 ( 4.17x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 4778944ac7..fd60feaf1f 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -238,6 +238,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h jg .nextrow RET +INIT_XMM ssse3 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 %if PIC @@ -250,13 +251,38 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; read 3 lines mov picregq, srcstrideq neg picregq - movh m0, [srcq+picregq] - movh m1, [srcq] - movh m2, [srcq+srcstrideq] + MOV m0, [srcq+picregq] + MOV m1, [srcq] + MOV m2, [srcq+srcstrideq] + lea srcq, [srcq+2*srcstrideq] punpcklbw m0, m2 +%if %1 == 4 +.next2rows: + movd m3, [srcq] + movd m4, [srcq+srcstrideq] + punpcklbw m1, m3 +
punpcklqdq m0, m1 + punpcklbw m2, m4 + pmaddubsw m0, m5 + punpcklqdq m1, m2 + pmaddubsw m1, m6 + lea srcq, [srcq+2*srcstrideq] + paddsw m1, m0 + pmulhrsw m1, m7 + mova m0, m2 + packuswb m1, m1 + movd [dstq], m1 + mova m2, m4 + psrldq m1, 4 + movd [dstq+dststrideq], m1 + mova m1, m3 + lea dstq, [dstq+2*dststrideq] + sub heightd, 2 + jg .next2rows +%else .nextrow: - movh m3, [srcq+2*srcstrideq] ; read new row + movh m3, [srcq] ; read new row pmaddubsw m0, m5 punpcklbw m1, m3 pmaddubsw m4, m1, m6 @@ -273,9 +299,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr add dstq, dststrideq dec heightd ; next row jg .nextrow +%endif RET -INIT_XMM ssse3 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my lea myd, [myq*3] %if PIC -- 2.49.1 >From 131c522c30fdcc8259ac120372b83253f1ab6906 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 13:29:42 +0100 Subject: [PATCH 10/15] avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h4_ssse3 Doubling the register width makes it possible to use only one pshufb and one pmaddubsw.
Old benchmarks: vp8_put_epel4_h4_c: 82.8 ( 1.00x) vp8_put_epel4_h4_ssse3: 13.9 ( 5.96x) New benchmarks: vp8_put_epel4_h4_c: 82.7 ( 1.00x) vp8_put_epel4_h4_ssse3: 11.7 ( 7.08x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index fd60feaf1f..6c365898ce 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -24,6 +24,15 @@ SECTION_RODATA +fourtap_filter4_b_m: times 4 db -6, 123 + times 4 db 12, -1 + times 4 db -9, 93 + times 4 db 50, -6 + times 4 db -6, 50 + times 4 db 93, -9 + times 4 db -1, 12 + times 4 db 123, -6 + fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -117,6 +126,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %if PIC %define fourtap_filter_hb picregq %define fourtap_filter_b picregq +%define fourtap_filter4_b picregq %define sixtap_filter_hb picregq %define sixtap_filter_b picregq %define fourtap_filter_v picregq @@ -127,6 +137,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %else %define fourtap_filter_hb fourtap_filter_hb_m %define fourtap_filter_b fourtap_filter_b_m +%define fourtap_filter4_b fourtap_filter4_b_m %define sixtap_filter_hb sixtap_filter_hb_m %define sixtap_filter_b sixtap_filter_b_m %define fourtap_filter_v fourtap_filter_v_m @@ -136,6 +147,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define npicregs 0 %endif +filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 @@ -208,9 +220,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h jg .nextrow RET -cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 +INIT_XMM ssse3 +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), 
dst, dststride, src, srcstride, height, mx, picreg mova m2, [pw_256] +%if %1 == 8 + shl mxd, 4 mova m3, [filter_h2_shuf] mova m4, [filter_h4_shuf] %if PIC @@ -218,19 +232,34 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h %endif mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes mova m6, [fourtap_filter_hb+mxq] +%else + shl mxd, 3 + mova m3, [filter4_h4_shuf] +%if PIC + lea picregq, [fourtap_filter4_b_m] +%endif + mova m5, [fourtap_filter4_b+mxq-8] +%endif .nextrow: +%if %1 == 4 + movq m0, [srcq-1] + pshufb m0, m3 + pmaddubsw m0, m5 + movhlps m1, m0 +%else movu m0, [srcq-1] mova m1, m0 pshufb m0, m3 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m6 +%endif add srcq, srcstrideq paddsw m0, m1 pmulhrsw m0, m2 packuswb m0, m0 - movh [dstq], m0 ; store + MOV [dstq], m0 ; store ; go to next line add dstq, dststrideq @@ -238,7 +267,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h jg .nextrow RET -INIT_XMM ssse3 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 %if PIC -- 2.49.1 >From 31ed005d0407e55469bf13d1344469eb1f1af456 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 16:11:10 +0100 Subject: [PATCH 11/15] avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3 Doubling the register width made it possible to avoid a pshufb and a pmaddubsw.
Old benchmarks: vp8_put_epel4_h6_c: 115.9 ( 1.00x) vp8_put_epel4_h6_ssse3: 20.2 ( 5.74x) vp8_put_epel4_h6v4_c: 276.3 ( 1.00x) vp8_put_epel4_h6v4_ssse3: 58.6 ( 4.71x) vp8_put_epel4_h6v6_c: 363.6 ( 1.00x) vp8_put_epel4_h6v6_ssse3: 62.5 ( 5.82x) New benchmarks: vp8_put_epel4_h6_c: 116.4 ( 1.00x) vp8_put_epel4_h6_ssse3: 16.0 ( 7.29x) vp8_put_epel4_h6v4_c: 280.9 ( 1.00x) vp8_put_epel4_h6v4_ssse3: 44.3 ( 6.33x) vp8_put_epel4_h6v6_c: 365.6 ( 1.00x) vp8_put_epel4_h6v6_ssse3: 53.1 ( 6.89x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 50 +++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 6c365898ce..2a66e51da6 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -33,6 +33,16 @@ fourtap_filter4_b_m: times 4 db -6, 123 times 4 db -1, 12 times 4 db 123, -6 +sixtap_filter4_hb_m: times 8 db 2, -11 + times 4 db 108, -8 + times 4 db 36, 1 + times 8 db 3, -16 + times 4 db 77, -16 + times 4 db 77, 3 + times 8 db 1, -8 + times 4 db 36, -11 + times 4 db 108, 2 + fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -129,6 +139,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define fourtap_filter4_b picregq %define sixtap_filter_hb picregq %define sixtap_filter_b picregq +%define sixtap_filter4_hb picregq %define fourtap_filter_v picregq %define sixtap_filter_v picregq %define bilinear_filter_vw picregq @@ -140,6 +151,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define fourtap_filter4_b fourtap_filter4_b_m %define sixtap_filter_hb sixtap_filter_hb_m %define sixtap_filter_b sixtap_filter_b_m +%define sixtap_filter4_hb sixtap_filter4_hb_m %define fourtap_filter_v fourtap_filter_v_m %define sixtap_filter_v sixtap_filter_v_m %define bilinear_filter_vw bilinear_filter_vw_m @@ -148,6 +160,7 @@ bilinear_filter_vb_m: times 8 db 7, 1 %endif filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 
6 +filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 @@ -180,7 +193,16 @@ SECTION .text %define MOV movq %endif -cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg +%if %1 == 4 + mova m3, [filter4_h6_shuf] +%if PIC + lea picregq, [sixtap_filter4_hb_m] +%endif + shl mxd, 4 + mova m4, [sixtap_filter4_hb+mxq-32] + mova m5, [sixtap_filter4_hb+mxq-16] +%else lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] @@ -190,29 +212,35 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes mova m6, [sixtap_filter_hb+mxq*8-32] mova m7, [sixtap_filter_hb+mxq*8-16] +%endif .nextrow: +%if %1 == 4 + ; we need nine bytes, so two loads + movq m1, [srcq-1] + movq m0, [srcq-2] + punpcklbw m0, m1 + pshufb m1, m3 + pmaddubsw m1, m5 + pmaddubsw m0, m4 + movhlps m2, m1 +%else movu m0, [srcq-2] mova m1, m0 mova m2, m0 -%if mmsize == 8 -; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the -; shuffle with a memory operand - punpcklbw m0, [srcq+3] -%else pshufb m0, [filter_h6_shuf1] -%endif pshufb m1, m3 pshufb m2, m4 pmaddubsw m0, m5 pmaddubsw m1, m6 pmaddubsw m2, m7 +%endif add srcq, srcstrideq - paddsw m0, m1 + paddw m0, m1 paddsw m0, m2 pmulhrsw m0, [pw_256] packuswb m0, m0 - movh [dstq], m0 ; store + MOV [dstq], m0 ; store ; go to next line add dstq, dststrideq @@ -220,7 +248,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h jg .nextrow RET -INIT_XMM ssse3 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg mova m2, [pw_256] %if %1 == 8 @@ -405,9 +432,8 @@ 
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr RET %endmacro -INIT_MMX ssse3 -FILTER_SSSE3 4 INIT_XMM ssse3 +FILTER_SSSE3 4 FILTER_SSSE3 8 INIT_XMM sse2 -- 2.49.1 >From a406799d622e86a73853ab9be9ca77f6367c3d9c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 20:32:58 +0100 Subject: [PATCH 12/15] avcodec/x86/vp8dsp: Reduce number of coefficient tables By changing the permutations used in the epel8_h{4,6} case we can simply reuse the coefficient tables from the vertical epel filters. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 54 ++++++++++++--------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 2a66e51da6..340f6cc818 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -43,15 +43,6 @@ sixtap_filter4_hb_m: times 8 db 2, -11 times 4 db 36, -11 times 4 db 108, 2 -fourtap_filter_hb_m: times 8 db -6, 123 - times 8 db 12, -1 - times 8 db -9, 93 - times 8 db 50, -6 - times 8 db -6, 50 - times 8 db 93, -9 - times 8 db -1, 12 - times 8 db 123, -6 - fourtap_filter_b_m: times 8 db -6, 12 times 8 db 123, -1 times 8 db -9, 50 @@ -61,16 +52,6 @@ fourtap_filter_b_m: times 8 db -6, 12 times 8 db -1, 123 times 8 db 12, -6 -sixtap_filter_hb_m: times 8 db 2, 1 - times 8 db -11, 108 - times 8 db 36, -8 - times 8 db 3, 3 - times 8 db -16, 77 - times 8 db 77, -16 - times 8 db 1, 2 - times 8 db -8, 36 - times 8 db 108, -11 - sixtap_filter_b_m: times 8 db 2, 36 times 8 db -11, -8 times 8 db 108, 1 @@ -134,10 +115,8 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %if PIC -%define fourtap_filter_hb picregq %define fourtap_filter_b picregq %define fourtap_filter4_b picregq -%define sixtap_filter_hb picregq %define sixtap_filter_b picregq %define sixtap_filter4_hb picregq %define fourtap_filter_v picregq @@ -146,10 +125,8 @@ bilinear_filter_vb_m: times 
8 db 7, 1 %define bilinear_filter_vb picregq %define npicregs 1 %else -%define fourtap_filter_hb fourtap_filter_hb_m %define fourtap_filter_b fourtap_filter_b_m %define fourtap_filter4_b fourtap_filter4_b_m -%define sixtap_filter_hb sixtap_filter_hb_m %define sixtap_filter_b sixtap_filter_b_m %define sixtap_filter4_hb sixtap_filter4_hb_m %define fourtap_filter_v fourtap_filter_v_m @@ -161,12 +138,15 @@ bilinear_filter_vb_m: times 8 db 7, 1 filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7 -filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 -filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +filter_h4_shuf1: db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9 +filter_h4_shuf2: db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +filter_h6_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10 +filter_h6_shuf2: db 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11 +filter_h6_shuf3: db 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 9, 12 + +filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 pw_20091: times 4 dw 20091 pw_17734: times 4 dw 17734 @@ -207,11 +187,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, sr mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] %if PIC - lea picregq, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_b_m] %endif - mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes - mova m6, [sixtap_filter_hb+mxq*8-32] - mova m7, [sixtap_filter_hb+mxq*8-16] + mova m5, [sixtap_filter_b+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_b+mxq*8-32] + mova m7, [sixtap_filter_b+mxq*8-16] %endif .nextrow: @@ -252,13 +232,13 @@ cglobal put_vp8_epel%1_h4, 6, 6 
+ npicregs, 6+!!(%1 == 8), dst, dststride, src, mova m2, [pw_256] %if %1 == 8 shl mxd, 4 - mova m3, [filter_h2_shuf] - mova m4, [filter_h4_shuf] + mova m3, [filter_h4_shuf1] + mova m4, [filter_h4_shuf2] %if PIC - lea picregq, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_b_m] %endif - mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes - mova m6, [fourtap_filter_hb+mxq] + mova m5, [fourtap_filter_b+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_b+mxq] %else shl mxd, 3 mova m3, [filter4_h4_shuf] -- 2.49.1 >From 61379497c16bfd4048882f93461ee5d094431e1a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 22:36:45 +0100 Subject: [PATCH 13/15] avcodec/x86/vp8dsp: Don't use saturated addition when unnecessary For the epel functions, there can be no overflow as long as the sum contains only one of the two large central coefficients; for bilinear functions, there can be no overflow whatsoever. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 340f6cc818..22356f687b 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -450,10 +450,10 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h pmullw m3, [mxq+48] %endif add srcq, srcstrideq - paddsw m0, m1 - paddsw m2, m3 + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 paddsw m0, m2 - paddsw m0, m4 psraw m0, 7 packuswb m0, m7 movh [dstq], m0 ; store @@ -511,12 +511,12 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h pmullw m5, [mxq+80] %endif add srcq, srcstrideq - paddsw m1, m4 - paddsw m0, m5 - paddsw m1, m2 - paddsw m0, m3 + paddw m1, m4 + paddw m0, m5 + paddw m1, m2 + paddw m0, m3 + paddw m1, m6 paddsw m0, m1 - paddsw m0, m6 psraw m0, 7 packuswb m0, m7 movh [dstq], m0 ; 
store @@ -556,20 +556,20 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picre mova m3, m4 pmullw m0, [myq+0] pmullw m4, m5 - paddsw m4, m0 + paddw m4, m0 ; then calculate positive taps mova m0, m1 pmullw m1, [myq+16] - paddsw m4, m1 + paddw m4, m1 mova m1, m2 pmullw m2, [myq+32] + paddw m4, m6 add srcq, srcstrideq paddsw m4, m2 mova m2, m3 ; round/clip/store - paddsw m4, m6 psraw m4, 7 packuswb m4, m7 movh [dstq], m4 @@ -612,17 +612,18 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre pmullw m5, [myq+16] mova m6, m4 pmullw m6, [myq+64] - paddsw m6, m5 + paddw m6, m5 ; then calculate positive taps movh m5, [srcq+srcstrideq] ; read new row punpcklbw m5, m7 pmullw m0, [myq+0] - paddsw m6, m0 + paddw m6, [pw_64] + paddw m6, m0 mova m0, m1 mova m1, m2 pmullw m2, [myq+32] - paddsw m6, m2 + paddw m6, m2 mova m2, m3 pmullw m3, [myq+48] add srcq, srcstrideq @@ -633,7 +634,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picre paddsw m6, m5 ; round/clip/store - paddsw m6, [pw_64] psraw m6, 7 packuswb m6, m7 movh [dstq], m6 @@ -700,8 +700,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p pmullw m2, m4 pmullw m3, m5 lea srcq, [srcq+srcstrideq*2] - paddsw m0, m1 - paddsw m2, m3 + paddw m0, m1 + paddw m2, m3 psraw m0, 2 psraw m2, 2 pavgw m0, m6 @@ -771,8 +771,8 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride pmullw m2, m4 pmullw m3, m5 lea srcq, [srcq+srcstrideq*2] - paddsw m0, m1 - paddsw m2, m3 + paddw m0, m1 + paddw m2, m3 psraw m0, 2 psraw m2, 2 pavgw m0, m6 -- 2.49.1 >From 25a28953d4737b9a466d24c47170d2c99f651db8 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 2 Dec 2025 19:49:17 +0100 Subject: [PATCH 14/15] avcodec/riscv/vp8dsp_rvv: Remove unused functions Only the sixtap functions are used for size 16. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/riscv/vp8dsp_init.c | 5 ----- libavcodec/riscv/vp8dsp_rvv.S | 9 ++++++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c index 3e35c72198..fecf6ef9b0 100644 --- a/libavcodec/riscv/vp8dsp_init.c +++ b/libavcodec/riscv/vp8dsp_init.c @@ -90,27 +90,22 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv; c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv; c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv; - c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv; c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv; c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv; c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv; c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv; c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv; - c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; #if __riscv_xlen <= 64 c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv; c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; - c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv; c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; - c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv; c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; - c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv; c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; c->put_vp8_epel_pixels_tab[2][1][2] 
= ff_put_vp8_epel4_h6v4_rvv; #endif diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 2ee7029c60..ed08f72cdc 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -537,7 +537,14 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x, zba endfunc .endm -.irp len,16,8,4 +# Only the sixtaps versions are used for epel16. +epel 16 6 h +epel 16 6 v +#if __riscv_xlen <= 64 +epel_hv 16 6 6 +#endif + +.irp len,8,4 epel \len 6 h epel \len 4 h epel \len 6 v -- 2.49.1 >From 83fee0147bdb91683c4aaeadc883a5e5a7066dd7 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 24 Nov 2025 23:13:16 +0100 Subject: [PATCH 15/15] avcodec/vp8dsp: Don't compile unused functions The width 16 epel functions never use four taps in any direction*, so don't build said functions. Saves 4352B of .text and 89B of .text.unlikely here. *: mx and my in vp8_mc_luma() are always even. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/vp8dsp.c | 11 +++++------ tests/checkasm/vp8dsp.c | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index 5543303adb..eabe3edb27 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -558,26 +558,21 @@ put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst, \ } \ } -VP8_EPEL_H(16, 4) VP8_EPEL_H(8, 4) VP8_EPEL_H(4, 4) VP8_EPEL_H(16, 6) VP8_EPEL_H(8, 6) VP8_EPEL_H(4, 6) -VP8_EPEL_V(16, 4) VP8_EPEL_V(8, 4) VP8_EPEL_V(4, 4) VP8_EPEL_V(16, 6) VP8_EPEL_V(8, 6) VP8_EPEL_V(4, 6) -VP8_EPEL_HV(16, 4, 4) VP8_EPEL_HV(8, 4, 4) VP8_EPEL_HV(4, 4, 4) -VP8_EPEL_HV(16, 4, 6) VP8_EPEL_HV(8, 4, 6) VP8_EPEL_HV(4, 4, 6) -VP8_EPEL_HV(16, 6, 4) VP8_EPEL_HV(8, 6, 4) VP8_EPEL_HV(4, 6, 4) VP8_EPEL_HV(16, 6, 6) @@ -667,7 +662,11 @@ VP8_BILINEAR(4) av_cold void ff_vp78dsp_init(VP8DSPContext *dsp) { - VP78_MC_FUNC(0, 16); + dsp->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_c; + 
dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_c; + dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_c; + dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_c; + VP78_MC_FUNC(1, 8); VP78_MC_FUNC(2, 4); diff --git a/tests/checkasm/vp8dsp.c b/tests/checkasm/vp8dsp.c index a12c295a2a..4d6704d5a9 100644 --- a/tests/checkasm/vp8dsp.c +++ b/tests/checkasm/vp8dsp.c @@ -510,7 +510,8 @@ static void checkasm_check_vp78dsp(VP8DSPContext *d, bool is_vp7) void checkasm_check_vp8dsp(void) { - VP8DSPContext d; + // Needs to be zeroed because not all size 16 epel functions exist. + VP8DSPContext d = { 0 }; ff_vp78dsp_init(&d); check_mc(&d); -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
