[FFmpeg-devel] [PATCH] Avoid MMX in VP8 (PR #21081)

mkver via ffmpeg-devel Tue, 02 Dec 2025 10:55:41 -0800

PR #21081 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21081.patch


Also remove some unused functions. For the RISC-V changes (the penultimate
commit), only compilation was tested.


>From e495162f74195c6ef6060a2d8034f0a715425b2d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 11:08:14 +0100
Subject: [PATCH 01/15] avcodec/x86/vp8dsp: Remove MMXEXT functions overridden
 by SSSE3

SSSE3 is already quite old (introduced in 2006 by Intel and in 2011 by
AMD), so the overwhelming majority of our users (particularly those
who actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions overridden
by them (which don't abide by the ABI) to get closer to removing
emms_c.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm    | 159 +----------------------------------
 libavcodec/x86/vp8dsp_init.c |  37 +-------
 2 files changed, 6 insertions(+), 190 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 231c21ea0d..7b836351e4 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* VP8 MMXEXT optimizations
+;* VP8 ASM optimizations
 ;* Copyright (c) 2010 Ronald S. Bultje <[email protected]>
 ;* Copyright (c) 2010 Fiona Glaser <[email protected]>
 ;*
@@ -24,25 +24,6 @@
 
 SECTION_RODATA
 
-fourtap_filter_hw_m: times 4 dw  -6, 123
-                     times 4 dw  12,  -1
-                     times 4 dw  -9,  93
-                     times 4 dw  50,  -6
-                     times 4 dw  -6,  50
-                     times 4 dw  93,  -9
-                     times 4 dw  -1,  12
-                     times 4 dw 123,  -6
-
-sixtap_filter_hw_m:  times 4 dw   2, -11
-                     times 4 dw 108,  36
-                     times 4 dw  -8,   1
-                     times 4 dw   3, -16
-                     times 4 dw  77,  77
-                     times 4 dw -16,   3
-                     times 4 dw   1,  -8
-                     times 4 dw  36, 108
-                     times 4 dw -11,   2
-
 fourtap_filter_hb_m: times 8 db  -6, 123
                      times 8 db  12,  -1
                      times 8 db  -9,  93
@@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
                       times 8 db 1, 7
 
 %if PIC
-%define fourtap_filter_hw  picregq
-%define sixtap_filter_hw   picregq
 %define fourtap_filter_hb  picregq
 %define sixtap_filter_hb   picregq
 %define fourtap_filter_v   picregq
@@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define bilinear_filter_vb picregq
 %define npicregs 1
 %else
-%define fourtap_filter_hw  fourtap_filter_hw_m
-%define sixtap_filter_hw   sixtap_filter_hw_m
 %define fourtap_filter_hb  fourtap_filter_hb_m
 %define sixtap_filter_hb   sixtap_filter_hb_m
 %define fourtap_filter_v   fourtap_filter_v_m
@@ -322,112 +299,6 @@ FILTER_SSSE3 4
 INIT_XMM ssse3
 FILTER_SSSE3 8
 
-; 4x4 block, H-only 4-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, 
height, mx, picreg
-    shl       mxd, 4
-%if PIC
-    lea   picregq, [fourtap_filter_hw_m]
-%endif
-    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
-    movq      mm5, [fourtap_filter_hw+mxq]
-    movq      mm7, [pw_64]
-    pxor      mm6, mm6
-
-.nextrow:
-    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal 
pixels
-
-    ; first set of 2 pixels
-    movq      mm2, mm1                     ; byte ABCD..
-    punpcklbw mm1, mm6                     ; byte->word ABCD
-    pshufw    mm0, mm2, 9                  ; byte CDEF..
-    punpcklbw mm0, mm6                     ; byte->word CDEF
-    pshufw    mm3, mm1, 0x94               ; word ABBC
-    pshufw    mm1, mm0, 0x94               ; word CDDE
-    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
-    movq      mm0, mm1                     ; backup for second set of pixels
-    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
-    paddd     mm3, mm1                     ; finish 1st 2px
-
-    ; second set of 2 pixels, use backup of above
-    punpckhbw mm2, mm6                     ; byte->word EFGH
-    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
-    pshufw    mm1, mm2, 0x94               ; word EFFG
-    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
-    paddd     mm0, mm1                     ; finish 2nd 2px
-
-    ; merge two sets of 2 pixels into one set of 4, round/clip/store
-    packssdw  mm3, mm0                     ; merge dword->word (4px)
-    paddsw    mm3, mm7                     ; rounding
-    psraw     mm3, 7
-    packuswb  mm3, mm6                     ; clip and word->bytes
-    movd   [dstq], mm3                     ; store
-
-    ; go to next line
-    add      dstq, dststrideq
-    add      srcq, srcstrideq
-    dec   heightd                          ; next row
-    jg .nextrow
-    RET
-
-; 4x4 block, H-only 6-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, 
height, mx, picreg
-    lea       mxd, [mxq*3]
-%if PIC
-    lea   picregq, [sixtap_filter_hw_m]
-%endif
-    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
-    movq      mm5, [sixtap_filter_hw+mxq*8-32]
-    movq      mm6, [sixtap_filter_hw+mxq*8-16]
-    movq      mm7, [pw_64]
-    pxor      mm3, mm3
-
-.nextrow:
-    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal 
pixels
-
-    ; first set of 2 pixels
-    movq      mm2, mm1                     ; byte ABCD..
-    punpcklbw mm1, mm3                     ; byte->word ABCD
-    pshufw    mm0, mm2, 0x9                ; byte CDEF..
-    punpckhbw mm2, mm3                     ; byte->word EFGH
-    punpcklbw mm0, mm3                     ; byte->word CDEF
-    pshufw    mm1, mm1, 0x94               ; word ABBC
-    pshufw    mm2, mm2, 0x94               ; word EFFG
-    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
-    pshufw    mm3, mm0, 0x94               ; word CDDE
-    movq      mm0, mm3                     ; backup for second set of pixels
-    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
-    paddd     mm1, mm3                     ; add to 1st 2px cache
-    movq      mm3, mm2                     ; backup for second set of pixels
-    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
-    paddd     mm1, mm2                     ; finish 1st 2px
-
-    ; second set of 2 pixels, use backup of above
-    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
-    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with 
F0/F1
-    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with 
F2/F3
-    paddd     mm0, mm3                     ; add to 2nd 2px cache
-    pxor      mm3, mm3
-    punpcklbw mm2, mm3                     ; byte->word FGHI
-    pshufw    mm2, mm2, 0xE9               ; word GHHI
-    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
-    paddd     mm0, mm2                     ; finish 2nd 2px
-
-    ; merge two sets of 2 pixels into one set of 4, round/clip/store
-    packssdw  mm1, mm0                     ; merge dword->word (4px)
-    paddsw    mm1, mm7                     ; rounding
-    psraw     mm1, 7
-    packuswb  mm1, mm3                     ; clip and word->bytes
-    movd   [dstq], mm1                     ; store
-
-    ; go to next line
-    add      dstq, dststrideq
-    add      srcq, srcstrideq
-    dec   heightd                          ; next row
-    jg .nextrow
-    RET
-
 INIT_XMM sse2
 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, 
height, mx, picreg
     shl      mxd, 5
@@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, 
dststride, src, srcstride, h
     jg .nextrow
     RET
 
-%macro FILTER_V 1
+INIT_XMM sse2
 ; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
+cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 5
 %if PIC
     lea  picregq, [fourtap_filter_v_m]
@@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
 
 
 ; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
+cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 4
     lea      myq, [myq*3]
 %if PIC
@@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     dec  heightd                           ; next row
     jg .nextrow
     RET
-%endmacro
-
-INIT_MMX mmxext
-FILTER_V 4
-INIT_XMM sse2
-FILTER_V 8
 
 %macro FILTER_BILINEAR 1
 %if cpuflag(ssse3)
@@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, 
src, srcstride, height, p
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%if mmsize == 8
-    packuswb  m0, m0
-    packuswb  m2, m2
-    movh   [dstq+dststrideq*0], m0
-    movh   [dstq+dststrideq*1], m2
-%else
     packuswb  m0, m2
     movh   [dstq+dststrideq*0], m0
     movhps [dstq+dststrideq*1], m0
-%endif
 %endif ; cpuflag(ssse3)
 
     lea     dstq, [dstq+dststrideq*2]
@@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%if mmsize == 8
-    packuswb  m0, m0
-    packuswb  m2, m2
-    movh   [dstq+dststrideq*0], m0
-    movh   [dstq+dststrideq*1], m2
-%else
     packuswb  m0, m2
     movh   [dstq+dststrideq*0], m0
     movhps [dstq+dststrideq*1], m0
-%endif
 %endif ; cpuflag(ssse3)
 
     lea     dstq, [dstq+dststrideq*2]
@@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride
     RET
 %endmacro
 
-INIT_MMX mmxext
-FILTER_BILINEAR 4
 INIT_XMM sse2
 FILTER_BILINEAR 8
 INIT_MMX ssse3
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index e37afab775..00733a2564 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -29,19 +29,6 @@
 /*
  * MC functions
  */
-void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-
 void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                 const uint8_t *src, ptrdiff_t srcstride,
                                 int height, int mx, int my);
@@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t 
dststride,
                                 const uint8_t *src, ptrdiff_t srcstride,
                                 int height, int mx, int my);
 
-void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                   const uint8_t *src, ptrdiff_t srcstride,
-                                   int height, int mx, int my);
 void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                    const uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
@@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t 
dststride,
                                    const uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
 
-void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                   const uint8_t *src, ptrdiff_t srcstride,
-                                   int height, int mx, int my);
 void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                    const uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
@@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v 
## TAPNUMY ## _ ## OPT
         dst, dststride, tmpptr, SIZE,      height,               mx, my); \
 }
 
-#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y,  4,  8)
-
-HVTAPMMX(4, 4)
-HVTAPMMX(4, 6)
-HVTAPMMX(6, 4)
-HVTAPMMX(6, 6)
-
 #define HVTAPSSE2(x, y, w) \
 HVTAP(sse2,  16, x, y, w, 16) \
 HVTAP(ssse3, 16, x, y, w, 16)
@@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
         dst, dststride, tmp, SIZE,      height,     mx, my); \
 }
 
-HVBILIN(mmxext,  8,  4,  8)
 HVBILIN(sse2,  8,  8, 16)
 HVBILIN(sse2,  8, 16, 16)
 HVBILIN(ssse3, 8,  4,  8)
@@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
     }
 
-    /* note that 4-tap width=16 functions are missing because w=16
-     * is only used for luma, and luma is always a copy or sixtap. */
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        VP8_MC_FUNC(2, 4, mmxext);
-        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
-    }
-
     if (EXTERNAL_SSE(cpu_flags)) {
         c->put_vp8_epel_pixels_tab[0][0][0]     =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
@@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         VP8_BILINEAR_MC_FUNC(1, 8, sse2);
     }
 
+    /* note that 4-tap width=16 functions are missing because w=16
+     * is only used for luma, and luma is always a copy or sixtap. */
     if (EXTERNAL_SSSE3(cpu_flags)) {
         VP8_LUMA_MC_FUNC(0, 16, ssse3);
         VP8_MC_FUNC(1, 8, ssse3);
-- 
2.49.1


>From 3fd1685e3d4cbde7f8754c91911e70ea780ce52b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 11:25:26 +0100
Subject: [PATCH 02/15] avcodec/x86/vp8dsp: Don't use MMX registers in
 put_vp8_pixels8

Use GPRs on x64 and xmm registers otherwise (using GPRs reduces code size).
This avoids clobbering the floating-point state, so the function no longer
breaks the ABI.
No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm    | 20 ++++++++++++++------
 libavcodec/x86/vp8dsp_init.c |  9 +++------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7b836351e4..7dee979e20 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -676,14 +676,22 @@ FILTER_BILINEAR 4
 INIT_XMM ssse3
 FILTER_BILINEAR 8
 
-INIT_MMX mmx
-cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
+INIT_XMM sse2
+cglobal put_vp8_pixels8, 5, 5+2*ARCH_X86_64, 2, dst, dststride, src, 
srcstride, height
 .nextrow:
-    movq    mm0, [srcq+srcstrideq*0]
-    movq    mm1, [srcq+srcstrideq*1]
+%if ARCH_X86_64
+    mov     r5q, [srcq+srcstrideq*0]
+    mov     r6q, [srcq+srcstrideq*1]
     lea    srcq, [srcq+srcstrideq*2]
-    movq [dstq+dststrideq*0], mm0
-    movq [dstq+dststrideq*1], mm1
+    mov [dstq+dststrideq*0], r5q
+    mov [dstq+dststrideq*1], r6q
+%else
+    movq     m0, [srcq+srcstrideq*0]
+    movq     m1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0], m0
+    movq [dstq+dststrideq*1], m1
+%endif
     lea    dstq, [dstq+dststrideq*2]
     sub heightd, 2
     jg .nextrow
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 00733a2564..40aa52c7f0 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -88,7 +88,7 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t 
dststride,
                                    int height, int mx, int my);
 
 
-void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
+void ff_put_vp8_pixels8_sse2(uint8_t *dst, ptrdiff_t dststride,
                              const uint8_t *src, ptrdiff_t srcstride,
                              int height, int mx, int my);
 void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
@@ -252,17 +252,14 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
-        c->put_vp8_epel_pixels_tab[1][0][0]     =
-        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
-    }
-
     if (EXTERNAL_SSE(cpu_flags)) {
         c->put_vp8_epel_pixels_tab[0][0][0]     =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
     if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+        c->put_vp8_epel_pixels_tab[1][0][0]     =
+        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_sse2;
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
-- 
2.49.1


>From a08ac2daa09f50bfe9ff84aec746a9b4c7b80a36 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 12:53:12 +0100
Subject: [PATCH 03/15] avcodec/x86/vp8dsp: Directly use negated stride

There is a register available. No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 44 +++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7dee979e20..6b5ca9f309 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -219,11 +219,11 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     mova      m7, [pw_256]
 
     ; read 3 lines
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+  srcstrideq]
-    movh      m2, [srcq+2*srcstrideq]
-    add     srcq, srcstrideq
+    mov  picregq, srcstrideq
+    neg  picregq
+    movh      m0, [srcq+picregq]
+    movh      m1, [srcq]
+    movh      m2, [srcq+srcstrideq]
 
 .nextrow:
     movh      m3, [srcq+2*srcstrideq]      ; read new row
@@ -255,18 +255,17 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     lea      myq, [sixtap_filter_hb+myq*8]
 
     ; read 5 lines
-    sub     srcq, srcstrideq
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+srcstrideq]
-    movh      m2, [srcq+srcstrideq*2]
+    mov  picregq, srcstrideq
+    neg  picregq
+    movh      m0, [srcq+2*picregq]
+    movh      m1, [srcq+picregq]
+    movh      m2, [srcq]
+    movh      m3, [srcq+srcstrideq]
+    movh      m4, [srcq+2*srcstrideq]
     lea     srcq, [srcq+srcstrideq*2]
-    add     srcq, srcstrideq
-    movh      m3, [srcq]
-    movh      m4, [srcq+srcstrideq]
 
 .nextrow:
-    movh      m5, [srcq+2*srcstrideq]      ; read new row
+    movh      m5, [srcq+srcstrideq]      ; read new row
     mova      m6, m0
     punpcklbw m6, m5
     mova      m0, m1
@@ -475,15 +474,14 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     pxor      m7, m7
 
     ; read 5 lines
-    sub     srcq, srcstrideq
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+srcstrideq]
-    movh      m2, [srcq+srcstrideq*2]
+    mov  picregq, srcstrideq
+    neg  picregq
+    movh      m0, [srcq+2*picregq]
+    movh      m1, [srcq+picregq]
+    movh      m2, [srcq]
+    movh      m3, [srcq+srcstrideq]
+    movh      m4, [srcq+2*srcstrideq]
     lea     srcq, [srcq+srcstrideq*2]
-    add     srcq, srcstrideq
-    movh      m3, [srcq]
-    movh      m4, [srcq+srcstrideq]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -499,7 +497,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     paddsw    m6, m5
 
     ; then calculate positive taps
-    movh      m5, [srcq+2*srcstrideq]      ; read new row
+    movh      m5, [srcq+srcstrideq]      ; read new row
     punpcklbw m5, m7
     pmullw    m0, [myq+0]
     paddsw    m6, m0
-- 
2.49.1


>From 456ecec84197e6be99b1811fb0eda5722df47da9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 13:15:07 +0100
Subject: [PATCH 04/15] avcodec/x86/vp8dsp: Increment src pointer earlier

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 6b5ca9f309..0d37012e9d 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -166,6 +166,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, 
dststride, src, srcstride, h
     pmaddubsw m0, m5
     pmaddubsw m1, m6
     pmaddubsw m2, m7
+    add     srcq, srcstrideq
     paddsw    m0, m1
     paddsw    m0, m2
     pmulhrsw  m0, [pw_256]
@@ -174,7 +175,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, 
dststride, src, srcstride, h
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd            ; next row
     jg .nextrow
     RET
@@ -197,6 +197,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
     pshufb    m1, m4
     pmaddubsw m0, m5
     pmaddubsw m1, m6
+    add     srcq, srcstrideq
     paddsw    m0, m1
     pmulhrsw  m0, m2
     packuswb  m0, m0
@@ -204,7 +205,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd            ; next row
     jg .nextrow
     RET
@@ -234,6 +234,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     punpcklbw m2, m3
     pmaddubsw m4, m5
     pmaddubsw m2, m6
+    add      srcq, srcstrideq
     paddsw    m4, m2
     mova      m2, m3
     pmulhrsw  m4, m7
@@ -242,7 +243,6 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
 
     ; go to next line
     add      dstq, dststrideq
-    add      srcq, srcstrideq
     dec   heightd                          ; next row
     jg .nextrow
     RET
@@ -275,6 +275,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     pmaddubsw m6, [myq-48]
     pmaddubsw m1, [myq-32]
     pmaddubsw m7, [myq-16]
+    add      srcq, srcstrideq
     paddsw    m6, m1
     paddsw    m6, m7
     mova      m1, m2
@@ -287,7 +288,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
 
     ; go to next line
     add      dstq, dststrideq
-    add      srcq, srcstrideq
     dec   heightd                          ; next row
     jg .nextrow
     RET
@@ -331,6 +331,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, 
dststride, src, srcstride, h
     pmullw    m2, [mxq+32]
     pmullw    m3, [mxq+48]
 %endif
+    add     srcq, srcstrideq
     paddsw    m0, m1
     paddsw    m2, m3
     paddsw    m0, m2
@@ -341,7 +342,6 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, 
dststride, src, srcstride, h
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd            ; next row
     jg .nextrow
     RET
@@ -392,6 +392,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, 
dststride, src, srcstride, h
     pmullw    m4, [mxq+64]
     pmullw    m5, [mxq+80]
 %endif
+    add     srcq, srcstrideq
     paddsw    m1, m4
     paddsw    m0, m5
     paddsw    m1, m2
@@ -404,7 +405,6 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, 
dststride, src, srcstride, h
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd            ; next row
     jg .nextrow
     RET
@@ -446,6 +446,7 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     paddsw    m4, m1
     mova      m1, m2
     pmullw    m2, [myq+32]
+    add     srcq, srcstrideq
     paddsw    m4, m2
     mova      m2, m3
 
@@ -457,7 +458,6 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd                           ; next row
     jg .nextrow
     RET
@@ -507,6 +507,7 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     paddsw    m6, m2
     mova      m2, m3
     pmullw    m3, [myq+48]
+    add     srcq, srcstrideq
     paddsw    m6, m3
     mova      m3, m4
     mova      m4, m5
@@ -521,7 +522,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
 
     ; go to next line
     add     dstq, dststrideq
-    add     srcq, srcstrideq
     dec  heightd                           ; next row
     jg .nextrow
     RET
@@ -543,6 +543,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, 
srcstride, height, p
     punpcklbw m1, m2
     pmaddubsw m0, m3
     pmaddubsw m1, m3
+    lea     srcq, [srcq+srcstrideq*2]
     psraw     m0, 2
     psraw     m1, 2
     pavgw     m0, m4
@@ -579,6 +580,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, 
srcstride, height, p
     pmullw    m1, m5
     pmullw    m2, m4
     pmullw    m3, m5
+    lea     srcq, [srcq+srcstrideq*2]
     paddsw    m0, m1
     paddsw    m2, m3
     psraw     m0, 2
@@ -591,7 +593,6 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, 
srcstride, height, p
 %endif ; cpuflag(ssse3)
 
     lea     dstq, [dstq+dststrideq*2]
-    lea     srcq, [srcq+srcstrideq*2]
     sub  heightd, 2
     jg .nextrow
     RET
@@ -612,6 +613,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, 
dststride, src, srcstride
     pshufb    m1, m2
     pmaddubsw m0, m3
     pmaddubsw m1, m3
+    lea     srcq, [srcq+srcstrideq*2]
     psraw     m0, 2
     psraw     m1, 2
     pavgw     m0, m4
@@ -649,6 +651,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride
     pmullw    m1, m5
     pmullw    m2, m4
     pmullw    m3, m5
+    lea     srcq, [srcq+srcstrideq*2]
     paddsw    m0, m1
     paddsw    m2, m3
     psraw     m0, 2
@@ -661,7 +664,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride
 %endif ; cpuflag(ssse3)
 
     lea     dstq, [dstq+dststrideq*2]
-    lea     srcq, [srcq+srcstrideq*2]
     sub  heightd, 2
     jg .nextrow
     RET
-- 
2.49.1


>From 936f8412aff35236d0f2c786aafa40d75331a640 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 13:27:35 +0100
Subject: [PATCH 05/15] avcodec/x86/vp8dsp: Avoid reload

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 0d37012e9d..e971da68ac 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -535,8 +535,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, 
srcstride, height, p
 %endif
     pxor      m4, m4
     mova      m3, [bilinear_filter_vb+myq-16]
-.nextrow:
     movh      m0, [srcq+srcstrideq*0]
+.nextrow:
     movh      m1, [srcq+srcstrideq*1]
     movh      m2, [srcq+srcstrideq*2]
     punpcklbw m0, m1
@@ -558,6 +558,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, 
srcstride, height, p
     movh   [dstq+dststrideq*0], m0
     movhps [dstq+dststrideq*1], m0
 %endif
+    mova      m0, m2
 %else ; cpuflag(ssse3)
 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 4
-- 
2.49.1


>From 15d229859aa0d7804791f70100fd55738925560a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 15:39:48 +0100
Subject: [PATCH 06/15] avcodec/x86/vp8dsp_init: Remove unused macro

Forgotten in 6a551f14050674fb685920eb1b0640810cacccf9.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp_init.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 40aa52c7f0..828b038cdf 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -105,16 +105,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## 
_ ## OPT( \
     ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
         dst + 8, dststride, src + 8, srcstride, height, mx, my); \
 }
-#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
-static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
-    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
-    ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
-    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
-        dst,     dststride, src,     srcstride, height, mx, my); \
-    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
-        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
-}
 
 TAP_W16(sse2,  epel, h6)
 TAP_W16(sse2,  epel, v6)
-- 
2.49.1


>From 1b99c21a689f61a8dbac5dfd7ec4dc46b3ffd698 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 20:25:26 +0100
Subject: [PATCH 07/15] avcodec/x86/vp8dsp: Avoid unpacking multiple times

Always pair row i with row i+2 for the vertical four-tap filter
and with row i+3 for the vertical six-tap filter (instead of pairing
the first with the sixth, the second with the third and the fourth
with the fifth). This allows each row to be unpacked only once instead
of (at most) three times.

Old benchmarks:
vp8_put_epel4_v4_c:                                     98.4 ( 1.00x)
vp8_put_epel4_v4_ssse3:                                 28.6 ( 3.44x)
vp8_put_epel4_v6_c:                                    131.6 ( 1.00x)
vp8_put_epel4_v6_ssse3:                                 38.5 ( 3.42x)
vp8_put_epel8_v4_c:                                    362.5 ( 1.00x)
vp8_put_epel8_v4_sse2:                                  63.8 ( 5.68x)
vp8_put_epel8_v4_ssse3:                                 44.4 ( 8.16x)
vp8_put_epel8_v6_c:                                    538.3 ( 1.00x)
vp8_put_epel8_v6_sse2:                                  86.5 ( 6.22x)
vp8_put_epel8_v6_ssse3:                                 57.0 ( 9.44x)
vp8_put_epel16_v6_c:                                  1044.6 ( 1.00x)
vp8_put_epel16_v6_sse2:                                158.0 ( 6.61x)
vp8_put_epel16_v6_ssse3:                               106.7 ( 9.79x)

New benchmarks:
vp8_put_epel4_v4_c:                                    100.0 ( 1.00x)
vp8_put_epel4_v4_ssse3:                                 28.4 ( 3.52x)
vp8_put_epel4_v6_c:                                    131.7 ( 1.00x)
vp8_put_epel4_v6_ssse3:                                 34.3 ( 3.84x)
vp8_put_epel8_v4_c:                                    364.4 ( 1.00x)
vp8_put_epel8_v4_sse2:                                  63.7 ( 5.72x)
vp8_put_epel8_v4_ssse3:                                 43.3 ( 8.42x)
vp8_put_epel8_v6_c:                                    550.2 ( 1.00x)
vp8_put_epel8_v6_sse2:                                  86.4 ( 6.37x)
vp8_put_epel8_v6_ssse3:                                 52.9 (10.40x)
vp8_put_epel16_v6_c:                                  1052.5 ( 1.00x)
vp8_put_epel16_v6_sse2:                                158.3 ( 6.65x)
vp8_put_epel16_v6_ssse3:                                98.9 (10.64x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 68 +++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 24 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index e971da68ac..7cb729a443 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -33,6 +33,15 @@ fourtap_filter_hb_m: times 8 db  -6, 123
                      times 8 db  -1,  12
                      times 8 db 123,  -6
 
+fourtap_filter_b_m:  times 8 db  -6,  12
+                     times 8 db 123,  -1
+                     times 8 db  -9,  50
+                     times 8 db  93,  -6
+                     times 8 db  -6,  93
+                     times 8 db  50,  -9
+                     times 8 db  -1, 123
+                     times 8 db  12,  -6
+
 sixtap_filter_hb_m:  times 8 db   2,   1
                      times 8 db -11, 108
                      times 8 db  36,  -8
@@ -43,6 +52,16 @@ sixtap_filter_hb_m:  times 8 db   2,   1
                      times 8 db  -8,  36
                      times 8 db 108, -11
 
+sixtap_filter_b_m:   times 8 db   2,  36
+                     times 8 db -11,  -8
+                     times 8 db 108,   1
+                     times 8 db   3,  77
+                     times 8 db -16, -16
+                     times 8 db  77,   3
+                     times 8 db   1, 108
+                     times 8 db  -8, -11
+                     times 8 db  36,   2
+
 fourtap_filter_v_m:  times 8 dw  -6
                      times 8 dw 123
                      times 8 dw  12
@@ -97,7 +116,9 @@ bilinear_filter_vb_m: times 8 db 7, 1
 
 %if PIC
 %define fourtap_filter_hb  picregq
+%define fourtap_filter_b   picregq
 %define sixtap_filter_hb   picregq
+%define sixtap_filter_b    picregq
 %define fourtap_filter_v   picregq
 %define sixtap_filter_v    picregq
 %define bilinear_filter_vw picregq
@@ -105,7 +126,9 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define npicregs 1
 %else
 %define fourtap_filter_hb  fourtap_filter_hb_m
+%define fourtap_filter_b   fourtap_filter_b_m
 %define sixtap_filter_hb   sixtap_filter_hb_m
+%define sixtap_filter_b    sixtap_filter_b_m
 %define fourtap_filter_v   fourtap_filter_v_m
 %define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
@@ -212,10 +235,10 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 4
 %if PIC
-    lea  picregq, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_b_m]
 %endif
-    mova      m5, [fourtap_filter_hb+myq-16]
-    mova      m6, [fourtap_filter_hb+myq]
+    mova      m5, [fourtap_filter_b+myq-16]
+    mova      m6, [fourtap_filter_b+myq]
     mova      m7, [pw_256]
 
     ; read 3 lines
@@ -224,21 +247,20 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     movh      m0, [srcq+picregq]
     movh      m1, [srcq]
     movh      m2, [srcq+srcstrideq]
+    punpcklbw m0, m2
 
 .nextrow:
     movh      m3, [srcq+2*srcstrideq]      ; read new row
-    mova      m4, m0
+    pmaddubsw m0, m5
+    punpcklbw m1, m3
+    pmaddubsw m4, m1, m6
+    add     srcq, srcstrideq
+    paddsw    m4, m0
     mova      m0, m1
-    punpcklbw m4, m1
-    mova      m1, m2
-    punpcklbw m2, m3
-    pmaddubsw m4, m5
-    pmaddubsw m2, m6
-    add      srcq, srcstrideq
-    paddsw    m4, m2
-    mova      m2, m3
     pmulhrsw  m4, m7
+    mova      m1, m2
     packuswb  m4, m4
+    mova      m2, m3
     movh  [dstq], m4
 
     ; go to next line
@@ -250,9 +272,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     lea      myd, [myq*3]
 %if PIC
-    lea  picregq, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_b_m]
 %endif
-    lea      myq, [sixtap_filter_hb+myq*8]
+    lea      myq, [sixtap_filter_b+myq*8]
 
     ; read 5 lines
     mov  picregq, srcstrideq
@@ -263,20 +285,18 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     movh      m3, [srcq+srcstrideq]
     movh      m4, [srcq+2*srcstrideq]
     lea     srcq, [srcq+srcstrideq*2]
+    punpcklbw m0, m3
+    punpcklbw m1, m4
 
 .nextrow:
     movh      m5, [srcq+srcstrideq]      ; read new row
-    mova      m6, m0
-    punpcklbw m6, m5
+    pmaddubsw m0, [myq-48]
+    punpcklbw m2, m5
+    pmaddubsw m6, m1, [myq-32]
+    pmaddubsw m7, m2, [myq-16]
+    add     srcq, srcstrideq
+    paddw     m6, m0
     mova      m0, m1
-    punpcklbw m1, m2
-    mova      m7, m3
-    punpcklbw m7, m4
-    pmaddubsw m6, [myq-48]
-    pmaddubsw m1, [myq-32]
-    pmaddubsw m7, [myq-16]
-    add      srcq, srcstrideq
-    paddsw    m6, m1
     paddsw    m6, m7
     mova      m1, m2
     mova      m2, m3
-- 
2.49.1


>From 25836faa4f89001299f9faa75f00a2bc8d55d0ea Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 23 Nov 2025 23:29:24 +0100
Subject: [PATCH 08/15] avcodec/x86/vp8dsp: Don't use MMX registers in
 ff_put_vp8_epel4_v6_ssse3

Switching to xmm registers allows processing two rows in parallel,
leading to speedups. It is also ABI-compliant (no more missing emms).

Old benchmarks:
vp8_put_epel4_v6_c:                                    132.8 ( 1.00x)
vp8_put_epel4_v6_ssse3:                                 34.3 ( 3.87x)

New benchmarks:
vp8_put_epel4_v6_c:                                    131.5 ( 1.00x)
vp8_put_epel4_v6_ssse3:                                 27.1 ( 4.86x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 48 +++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 7cb729a443..4778944ac7 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -162,6 +162,12 @@ SECTION .text
 
;-------------------------------------------------------------------------------
 
 %macro FILTER_SSSE3 1
+%if %1 == 4
+%define MOV movd
+%else
+%define MOV movq
+%endif
+
 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, 
height, mx, picreg
     lea      mxd, [mxq*3]
     mova      m3, [filter_h6_shuf2]
@@ -269,6 +275,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     jg .nextrow
     RET
 
+INIT_XMM ssse3
 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     lea      myd, [myq*3]
 %if PIC
@@ -279,14 +286,44 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     ; read 5 lines
     mov  picregq, srcstrideq
     neg  picregq
-    movh      m0, [srcq+2*picregq]
-    movh      m1, [srcq+picregq]
-    movh      m2, [srcq]
-    movh      m3, [srcq+srcstrideq]
-    movh      m4, [srcq+2*srcstrideq]
+    MOV       m0, [srcq+2*picregq]
+    MOV       m1, [srcq+picregq]
+    MOV       m2, [srcq]
+    MOV       m3, [srcq+srcstrideq]
+    MOV       m4, [srcq+2*srcstrideq]
     lea     srcq, [srcq+srcstrideq*2]
     punpcklbw m0, m3
     punpcklbw m1, m4
+%if %1 == 4
+    punpcklqdq m0, m1
+
+.next2rows:
+    movd       m5, [srcq+srcstrideq]
+    movd       m6, [srcq+2*srcstrideq]
+    pmaddubsw  m0, [myq-48]
+    punpcklbw  m2, m5
+    punpcklqdq m1, m2
+    pmaddubsw  m1, [myq-32]
+    punpcklbw  m3, m6
+    punpcklqdq m2, m3
+    paddw      m0, m1
+    pmaddubsw  m1, m2, [myq-16]
+    lea      srcq, [srcq+2*srcstrideq]
+    paddsw     m1, m0
+    mova       m0, m2
+    pmulhrsw   m1, [pw_256]
+    mova       m2, m4
+    packuswb   m1, m1
+    movd   [dstq], m1
+    mova       m4, m6
+    psrldq     m1, 4
+    movd [dstq+dststrideq], m1
+    lea      dstq, [dstq+2*dststrideq]
+    mova       m1, m3
+    mova       m3, m5
+    sub   heightd, 2
+    jg .next2rows
+%else
 
 .nextrow:
     movh      m5, [srcq+srcstrideq]      ; read new row
@@ -310,6 +347,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     add      dstq, dststrideq
     dec   heightd                          ; next row
     jg .nextrow
+%endif
     RET
 %endmacro
 
-- 
2.49.1


>From 9755f51400a2668ddb05d92932badf26bb0c9723 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 09:16:26 +0100
Subject: [PATCH 09/15] avcodec/x86/vp8dsp: Don't use MMX registers in
 ff_put_vp8_epel4_v4_ssse3

Switching to xmm registers allows processing two rows in parallel,
leading to speedups. It is also ABI-compliant (no more missing emms).

Old benchmarks:
vp8_put_epel4_v4_c:                                     96.8 ( 1.00x)
vp8_put_epel4_v4_ssse3:                                 28.2 ( 3.43x)

New benchmarks:
vp8_put_epel4_v4_c:                                     95.1 ( 1.00x)
vp8_put_epel4_v4_ssse3:                                 22.8 ( 4.17x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4778944ac7..fd60feaf1f 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -238,6 +238,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
     jg .nextrow
     RET
 
+INIT_XMM ssse3
 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 4
 %if PIC
@@ -250,13 +251,38 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     ; read 3 lines
     mov  picregq, srcstrideq
     neg  picregq
-    movh      m0, [srcq+picregq]
-    movh      m1, [srcq]
-    movh      m2, [srcq+srcstrideq]
+    MOV       m0, [srcq+picregq]
+    MOV       m1, [srcq]
+    MOV       m2, [srcq+srcstrideq]
+    lea     srcq, [srcq+2*srcstrideq]
     punpcklbw m0, m2
 
+%if %1 == 4
+.next2rows:
+    movd       m3, [srcq]
+    movd       m4, [srcq+srcstrideq]
+    punpcklbw  m1, m3
+    punpcklqdq m0, m1
+    punpcklbw  m2, m4
+    pmaddubsw  m0, m5
+    punpcklqdq m1, m2
+    pmaddubsw  m1, m6
+    lea     srcq, [srcq+2*srcstrideq]
+    paddsw     m1, m0
+    pmulhrsw   m1, m7
+    mova       m0, m2
+    packuswb   m1, m1
+    movd   [dstq], m1
+    mova       m2, m4
+    psrldq     m1, 4
+    movd [dstq+dststrideq], m1
+    mova       m1, m3
+    lea      dstq, [dstq+2*dststrideq]
+    sub   heightd, 2
+    jg .next2rows
+%else
 .nextrow:
-    movh      m3, [srcq+2*srcstrideq]      ; read new row
+    movh      m3, [srcq]      ; read new row
     pmaddubsw m0, m5
     punpcklbw m1, m3
     pmaddubsw m4, m1, m6
@@ -273,9 +299,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     add      dstq, dststrideq
     dec   heightd                          ; next row
     jg .nextrow
+%endif
     RET
 
-INIT_XMM ssse3
 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     lea      myd, [myq*3]
 %if PIC
-- 
2.49.1


>From 131c522c30fdcc8259ac120372b83253f1ab6906 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 13:29:42 +0100
Subject: [PATCH 10/15] avcodec/x86/vp8dsp: Don't use MMX registers in
 ff_put_vp8_epel4_h4_ssse3

Doubling the register width allows using only one pshufb and one pmaddubsw.

Old benchmarks:
vp8_put_epel4_h4_c:                                     82.8 ( 1.00x)
vp8_put_epel4_h4_ssse3:                                 13.9 ( 5.96x)

New benchmarks:
vp8_put_epel4_h4_c:                                     82.7 ( 1.00x)
vp8_put_epel4_h4_ssse3:                                 11.7 ( 7.08x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index fd60feaf1f..6c365898ce 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -24,6 +24,15 @@
 
 SECTION_RODATA
 
+fourtap_filter4_b_m: times 4 db  -6, 123
+                     times 4 db  12,  -1
+                     times 4 db  -9,  93
+                     times 4 db  50,  -6
+                     times 4 db  -6,  50
+                     times 4 db  93,  -9
+                     times 4 db  -1,  12
+                     times 4 db 123,  -6
+
 fourtap_filter_hb_m: times 8 db  -6, 123
                      times 8 db  12,  -1
                      times 8 db  -9,  93
@@ -117,6 +126,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %if PIC
 %define fourtap_filter_hb  picregq
 %define fourtap_filter_b   picregq
+%define fourtap_filter4_b  picregq
 %define sixtap_filter_hb   picregq
 %define sixtap_filter_b    picregq
 %define fourtap_filter_v   picregq
@@ -127,6 +137,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %else
 %define fourtap_filter_hb  fourtap_filter_hb_m
 %define fourtap_filter_b   fourtap_filter_b_m
+%define fourtap_filter4_b  fourtap_filter4_b_m
 %define sixtap_filter_hb   sixtap_filter_hb_m
 %define sixtap_filter_b    sixtap_filter_b_m
 %define fourtap_filter_v   fourtap_filter_v_m
@@ -136,6 +147,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define npicregs 0
 %endif
 
+filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3,  4, 4,  5, 5,  6
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
 filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
 
@@ -208,9 +220,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, 
dststride, src, srcstride, h
     jg .nextrow
     RET
 
-cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, 
height, mx, picreg
-    shl      mxd, 4
+INIT_XMM ssse3
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, 
src, srcstride, height, mx, picreg
     mova      m2, [pw_256]
+%if %1 == 8
+    shl      mxd, 4
     mova      m3, [filter_h2_shuf]
     mova      m4, [filter_h4_shuf]
 %if PIC
@@ -218,19 +232,34 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
 %endif
     mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
     mova      m6, [fourtap_filter_hb+mxq]
+%else
+    shl      mxd, 3
+    mova      m3, [filter4_h4_shuf]
+%if PIC
+    lea  picregq, [fourtap_filter4_b_m]
+%endif
+    mova      m5, [fourtap_filter4_b+mxq-8]
+%endif
 
 .nextrow:
+%if %1 == 4
+    movq      m0, [srcq-1]
+    pshufb    m0, m3
+    pmaddubsw m0, m5
+    movhlps   m1, m0
+%else
     movu      m0, [srcq-1]
     mova      m1, m0
     pshufb    m0, m3
     pshufb    m1, m4
     pmaddubsw m0, m5
     pmaddubsw m1, m6
+%endif
     add     srcq, srcstrideq
     paddsw    m0, m1
     pmulhrsw  m0, m2
     packuswb  m0, m0
-    movh  [dstq], m0        ; store
+    MOV   [dstq], m0        ; store
 
     ; go to next line
     add     dstq, dststrideq
@@ -238,7 +267,6 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride, h
     jg .nextrow
     RET
 
-INIT_XMM ssse3
 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, 
picreg, my
     shl      myd, 4
 %if PIC
-- 
2.49.1


>From 31ed005d0407e55469bf13d1344469eb1f1af456 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 16:11:10 +0100
Subject: [PATCH 11/15] avcodec/x86/vp8dsp: Don't use MMX registers in
 ff_put_vp8_epel4_h6_ssse3

Doubling the register width makes it possible to avoid a pshufb and a pmaddubsw.

Old benchmarks:
vp8_put_epel4_h6_c:                                    115.9 ( 1.00x)
vp8_put_epel4_h6_ssse3:                                 20.2 ( 5.74x)
vp8_put_epel4_h6v4_c:                                  276.3 ( 1.00x)
vp8_put_epel4_h6v4_ssse3:                               58.6 ( 4.71x)
vp8_put_epel4_h6v6_c:                                  363.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3:                               62.5 ( 5.82x)

New benchmarks:
vp8_put_epel4_h6_c:                                    116.4 ( 1.00x)
vp8_put_epel4_h6_ssse3:                                 16.0 ( 7.29x)
vp8_put_epel4_h6v4_c:                                  280.9 ( 1.00x)
vp8_put_epel4_h6v4_ssse3:                               44.3 ( 6.33x)
vp8_put_epel4_h6v6_c:                                  365.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3:                               53.1 ( 6.89x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 50 +++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 6c365898ce..2a66e51da6 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -33,6 +33,16 @@ fourtap_filter4_b_m: times 4 db  -6, 123
                      times 4 db  -1,  12
                      times 4 db 123,  -6
 
+sixtap_filter4_hb_m: times 8 db   2, -11
+                     times 4 db 108,  -8
+                     times 4 db  36,   1
+                     times 8 db   3, -16
+                     times 4 db  77, -16
+                     times 4 db  77,   3
+                     times 8 db   1,  -8
+                     times 4 db  36, -11
+                     times 4 db 108,   2
+
 fourtap_filter_hb_m: times 8 db  -6, 123
                      times 8 db  12,  -1
                      times 8 db  -9,  93
@@ -129,6 +139,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define fourtap_filter4_b  picregq
 %define sixtap_filter_hb   picregq
 %define sixtap_filter_b    picregq
+%define sixtap_filter4_hb  picregq
 %define fourtap_filter_v   picregq
 %define sixtap_filter_v    picregq
 %define bilinear_filter_vw picregq
@@ -140,6 +151,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define fourtap_filter4_b  fourtap_filter4_b_m
 %define sixtap_filter_hb   sixtap_filter_hb_m
 %define sixtap_filter_b    sixtap_filter_b_m
+%define sixtap_filter4_hb  sixtap_filter4_hb_m
 %define fourtap_filter_v   fourtap_filter_v_m
 %define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
@@ -148,6 +160,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %endif
 
 filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3,  4, 4,  5, 5,  6
+filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3,  5, 4,  6, 5,  7
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
 filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
 
@@ -180,7 +193,16 @@ SECTION .text
 %define MOV movq
 %endif
 
-cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, 
height, mx, picreg
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, 
srcstride, height, mx, picreg
+%if %1 == 4
+    mova      m3, [filter4_h6_shuf]
+%if PIC
+    lea  picregq, [sixtap_filter4_hb_m]
+%endif
+    shl      mxd, 4
+    mova      m4, [sixtap_filter4_hb+mxq-32]
+    mova      m5, [sixtap_filter4_hb+mxq-16]
+%else
     lea      mxd, [mxq*3]
     mova      m3, [filter_h6_shuf2]
     mova      m4, [filter_h6_shuf3]
@@ -190,29 +212,35 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, 
dststride, src, srcstride, h
     mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
     mova      m6, [sixtap_filter_hb+mxq*8-32]
     mova      m7, [sixtap_filter_hb+mxq*8-16]
+%endif
 
 .nextrow:
+%if %1 == 4
+    ; we need nine bytes, so two loads
+    movq      m1, [srcq-1]
+    movq      m0, [srcq-2]
+    punpcklbw m0, m1
+    pshufb    m1, m3
+    pmaddubsw m1, m5
+    pmaddubsw m0, m4
+    movhlps   m2, m1
+%else
     movu      m0, [srcq-2]
     mova      m1, m0
     mova      m2, m0
-%if mmsize == 8
-; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
-; shuffle with a memory operand
-    punpcklbw m0, [srcq+3]
-%else
     pshufb    m0, [filter_h6_shuf1]
-%endif
     pshufb    m1, m3
     pshufb    m2, m4
     pmaddubsw m0, m5
     pmaddubsw m1, m6
     pmaddubsw m2, m7
+%endif
     add     srcq, srcstrideq
-    paddsw    m0, m1
+    paddw     m0, m1
     paddsw    m0, m2
     pmulhrsw  m0, [pw_256]
     packuswb  m0, m0
-    movh  [dstq], m0        ; store
+    MOV   [dstq], m0        ; store
 
     ; go to next line
     add     dstq, dststrideq
@@ -220,7 +248,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, 
dststride, src, srcstride, h
     jg .nextrow
     RET
 
-INIT_XMM ssse3
 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, 
src, srcstride, height, mx, picreg
     mova      m2, [pw_256]
 %if %1 == 8
@@ -405,9 +432,8 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picr
     RET
 %endmacro
 
-INIT_MMX ssse3
-FILTER_SSSE3 4
 INIT_XMM ssse3
+FILTER_SSSE3 4
 FILTER_SSSE3 8
 
 INIT_XMM sse2
-- 
2.49.1


>From a406799d622e86a73853ab9be9ca77f6367c3d9c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 20:32:58 +0100
Subject: [PATCH 12/15] avcodec/x86/vp8dsp: Reduce number of coefficient tables

By changing the permutations used in the epel8_h{4,6} case
we can simply reuse the coefficient tables from the vertical epel
filters.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 54 ++++++++++++---------------------------
 1 file changed, 17 insertions(+), 37 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2a66e51da6..340f6cc818 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -43,15 +43,6 @@ sixtap_filter4_hb_m: times 8 db   2, -11
                      times 4 db  36, -11
                      times 4 db 108,   2
 
-fourtap_filter_hb_m: times 8 db  -6, 123
-                     times 8 db  12,  -1
-                     times 8 db  -9,  93
-                     times 8 db  50,  -6
-                     times 8 db  -6,  50
-                     times 8 db  93,  -9
-                     times 8 db  -1,  12
-                     times 8 db 123,  -6
-
 fourtap_filter_b_m:  times 8 db  -6,  12
                      times 8 db 123,  -1
                      times 8 db  -9,  50
@@ -61,16 +52,6 @@ fourtap_filter_b_m:  times 8 db  -6,  12
                      times 8 db  -1, 123
                      times 8 db  12,  -6
 
-sixtap_filter_hb_m:  times 8 db   2,   1
-                     times 8 db -11, 108
-                     times 8 db  36,  -8
-                     times 8 db   3,   3
-                     times 8 db -16,  77
-                     times 8 db  77, -16
-                     times 8 db   1,   2
-                     times 8 db  -8,  36
-                     times 8 db 108, -11
-
 sixtap_filter_b_m:   times 8 db   2,  36
                      times 8 db -11,  -8
                      times 8 db 108,   1
@@ -134,10 +115,8 @@ bilinear_filter_vb_m: times 8 db 7, 1
                       times 8 db 1, 7
 
 %if PIC
-%define fourtap_filter_hb  picregq
 %define fourtap_filter_b   picregq
 %define fourtap_filter4_b  picregq
-%define sixtap_filter_hb   picregq
 %define sixtap_filter_b    picregq
 %define sixtap_filter4_hb  picregq
 %define fourtap_filter_v   picregq
@@ -146,10 +125,8 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define bilinear_filter_vb picregq
 %define npicregs 1
 %else
-%define fourtap_filter_hb  fourtap_filter_hb_m
 %define fourtap_filter_b   fourtap_filter_b_m
 %define fourtap_filter4_b  fourtap_filter4_b_m
-%define sixtap_filter_hb   sixtap_filter_hb_m
 %define sixtap_filter_b    sixtap_filter_b_m
 %define sixtap_filter4_hb  sixtap_filter4_hb_m
 %define fourtap_filter_v   fourtap_filter_v_m
@@ -161,12 +138,15 @@ bilinear_filter_vb_m: times 8 db 7, 1
 
 filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3,  4, 4,  5, 5,  6
 filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3,  5, 4,  6, 5,  7
-filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
-filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
 
-filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
-filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
-filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
+filter_h4_shuf1: db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5,  7, 6,  8, 7,  9
+filter_h4_shuf2: db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6,  8, 7,  9, 8, 10
+
+filter_h6_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5,  8, 6,  9, 7, 10
+filter_h6_shuf2: db 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6,  9, 7, 10, 8, 11
+filter_h6_shuf3: db 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 9, 12
+
+filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7, 7,  8
 
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
@@ -207,11 +187,11 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), 
dst, dststride, src, sr
     mova      m3, [filter_h6_shuf2]
     mova      m4, [filter_h6_shuf3]
 %if PIC
-    lea  picregq, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_b_m]
 %endif
-    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
-    mova      m6, [sixtap_filter_hb+mxq*8-32]
-    mova      m7, [sixtap_filter_hb+mxq*8-16]
+    mova      m5, [sixtap_filter_b+mxq*8-48] ; set up 6tap filter in bytes
+    mova      m6, [sixtap_filter_b+mxq*8-32]
+    mova      m7, [sixtap_filter_b+mxq*8-16]
 %endif
 
 .nextrow:
@@ -252,13 +232,13 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 
8), dst, dststride, src,
     mova      m2, [pw_256]
 %if %1 == 8
     shl      mxd, 4
-    mova      m3, [filter_h2_shuf]
-    mova      m4, [filter_h4_shuf]
+    mova      m3, [filter_h4_shuf1]
+    mova      m4, [filter_h4_shuf2]
 %if PIC
-    lea  picregq, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_b_m]
 %endif
-    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
-    mova      m6, [fourtap_filter_hb+mxq]
+    mova      m5, [fourtap_filter_b+mxq-16] ; set up 4tap filter in bytes
+    mova      m6, [fourtap_filter_b+mxq]
 %else
     shl      mxd, 3
     mova      m3, [filter4_h4_shuf]
-- 
2.49.1


>From 61379497c16bfd4048882f93461ee5d094431e1a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 22:36:45 +0100
Subject: [PATCH 13/15] avcodec/x86/vp8dsp: Don't use saturated addition when
 unnecessary

For the epel functions, there can be no overflow as long as the sum
contains only one of the two large central coefficients; for bilinear
functions, there can be no overflow whatsoever.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 340f6cc818..22356f687b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -450,10 +450,10 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, 
dststride, src, srcstride, h
     pmullw    m3, [mxq+48]
 %endif
     add     srcq, srcstrideq
-    paddsw    m0, m1
-    paddsw    m2, m3
+    paddw     m0, m1
+    paddw     m2, m3
+    paddw     m0, m4
     paddsw    m0, m2
-    paddsw    m0, m4
     psraw     m0, 7
     packuswb  m0, m7
     movh  [dstq], m0        ; store
@@ -511,12 +511,12 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, 
dststride, src, srcstride, h
     pmullw    m5, [mxq+80]
 %endif
     add     srcq, srcstrideq
-    paddsw    m1, m4
-    paddsw    m0, m5
-    paddsw    m1, m2
-    paddsw    m0, m3
+    paddw     m1, m4
+    paddw     m0, m5
+    paddw     m1, m2
+    paddw     m0, m3
+    paddw     m1, m6
     paddsw    m0, m1
-    paddsw    m0, m6
     psraw     m0, 7
     packuswb  m0, m7
     movh  [dstq], m0        ; store
@@ -556,20 +556,20 @@ cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     mova      m3, m4
     pmullw    m0, [myq+0]
     pmullw    m4, m5
-    paddsw    m4, m0
+    paddw     m4, m0
 
     ; then calculate positive taps
     mova      m0, m1
     pmullw    m1, [myq+16]
-    paddsw    m4, m1
+    paddw     m4, m1
     mova      m1, m2
     pmullw    m2, [myq+32]
+    paddw     m4, m6
     add     srcq, srcstrideq
     paddsw    m4, m2
     mova      m2, m3
 
     ; round/clip/store
-    paddsw    m4, m6
     psraw     m4, 7
     packuswb  m4, m7
     movh  [dstq], m4
@@ -612,17 +612,18 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     pmullw    m5, [myq+16]
     mova      m6, m4
     pmullw    m6, [myq+64]
-    paddsw    m6, m5
+    paddw     m6, m5
 
     ; then calculate positive taps
     movh      m5, [srcq+srcstrideq]      ; read new row
     punpcklbw m5, m7
     pmullw    m0, [myq+0]
-    paddsw    m6, m0
+    paddw     m6, [pw_64]
+    paddw     m6, m0
     mova      m0, m1
     mova      m1, m2
     pmullw    m2, [myq+32]
-    paddsw    m6, m2
+    paddw     m6, m2
     mova      m2, m3
     pmullw    m3, [myq+48]
     add     srcq, srcstrideq
@@ -633,7 +634,6 @@ cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, 
srcstride, height, picre
     paddsw    m6, m5
 
     ; round/clip/store
-    paddsw    m6, [pw_64]
     psraw     m6, 7
     packuswb  m6, m7
     movh  [dstq], m6
@@ -700,8 +700,8 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, 
srcstride, height, p
     pmullw    m2, m4
     pmullw    m3, m5
     lea     srcq, [srcq+srcstrideq*2]
-    paddsw    m0, m1
-    paddsw    m2, m3
+    paddw     m0, m1
+    paddw     m2, m3
     psraw     m0, 2
     psraw     m2, 2
     pavgw     m0, m6
@@ -771,8 +771,8 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, 
dststride, src, srcstride
     pmullw    m2, m4
     pmullw    m3, m5
     lea     srcq, [srcq+srcstrideq*2]
-    paddsw    m0, m1
-    paddsw    m2, m3
+    paddw     m0, m1
+    paddw     m2, m3
     psraw     m0, 2
     psraw     m2, 2
     pavgw     m0, m6
-- 
2.49.1


>From 25a28953d4737b9a466d24c47170d2c99f651db8 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 2 Dec 2025 19:49:17 +0100
Subject: [PATCH 14/15] avcodec/riscv/vp8dsp_rvv: Remove unused functions

Only the sixtap functions are used for size 16.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/riscv/vp8dsp_init.c | 5 -----
 libavcodec/riscv/vp8dsp_rvv.S  | 9 ++++++++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 3e35c72198..fecf6ef9b0 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -90,27 +90,22 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
             c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
             c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
             c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
-            c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
             c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
             c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
 
             c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
             c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
             c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
             c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
             c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
 #if __riscv_xlen <= 64
             c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
             c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
             c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
             c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
             c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
             c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
             c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
             c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
             c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2ee7029c60..ed08f72cdc 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -537,7 +537,14 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x, zba
 endfunc
 .endm
 
-.irp len,16,8,4
+# Only the sixtaps versions are used for epel16.
+epel 16 6 h
+epel 16 6 v
+#if __riscv_xlen <= 64
+epel_hv 16 6 6
+#endif
+
+.irp len,8,4
 epel \len 6 h
 epel \len 4 h
 epel \len 6 v
-- 
2.49.1


>From 83fee0147bdb91683c4aaeadc883a5e5a7066dd7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 24 Nov 2025 23:13:16 +0100
Subject: [PATCH 15/15] avcodec/vp8dsp: Don't compile unused functions

The width 16 epel functions never use four taps in any direction*,
so don't build said functions. Saves 4352B of .text and 89B of
.text.unlikely here.

*: mx and my in vp8_mc_luma() are always even.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/vp8dsp.c     | 11 +++++------
 tests/checkasm/vp8dsp.c |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 5543303adb..eabe3edb27 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -558,26 +558,21 @@ put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst,         \
     }                                                                         \
 }
 
-VP8_EPEL_H(16, 4)
 VP8_EPEL_H(8,  4)
 VP8_EPEL_H(4,  4)
 VP8_EPEL_H(16, 6)
 VP8_EPEL_H(8,  6)
 VP8_EPEL_H(4,  6)
-VP8_EPEL_V(16, 4)
 VP8_EPEL_V(8,  4)
 VP8_EPEL_V(4,  4)
 VP8_EPEL_V(16, 6)
 VP8_EPEL_V(8,  6)
 VP8_EPEL_V(4,  6)
 
-VP8_EPEL_HV(16, 4, 4)
 VP8_EPEL_HV(8,  4, 4)
 VP8_EPEL_HV(4,  4, 4)
-VP8_EPEL_HV(16, 4, 6)
 VP8_EPEL_HV(8,  4, 6)
 VP8_EPEL_HV(4,  4, 6)
-VP8_EPEL_HV(16, 6, 4)
 VP8_EPEL_HV(8,  6, 4)
 VP8_EPEL_HV(4,  6, 4)
 VP8_EPEL_HV(16, 6, 6)
@@ -667,7 +662,11 @@ VP8_BILINEAR(4)
 
 av_cold void ff_vp78dsp_init(VP8DSPContext *dsp)
 {
-    VP78_MC_FUNC(0, 16);
+    dsp->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_c;
+    dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_c;
+    dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_c;
+    dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_c;
+
     VP78_MC_FUNC(1, 8);
     VP78_MC_FUNC(2, 4);
 
diff --git a/tests/checkasm/vp8dsp.c b/tests/checkasm/vp8dsp.c
index a12c295a2a..4d6704d5a9 100644
--- a/tests/checkasm/vp8dsp.c
+++ b/tests/checkasm/vp8dsp.c
@@ -510,7 +510,8 @@ static void checkasm_check_vp78dsp(VP8DSPContext *d, bool is_vp7)
 
 void checkasm_check_vp8dsp(void)
 {
-    VP8DSPContext d;
+    // Needs to be zeroed because not all size 16 epel functions exist.
+    VP8DSPContext d = { 0 };
 
     ff_vp78dsp_init(&d);
     check_mc(&d);
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] Avoid MMX in VP8 (PR #21081)

Reply via email to