This is an automated email from the git hooks/post-receive script.

A commit has been pushed to branch master
in the repository ffmpeg.

commit 81fb70c833ac675ac8e09b38ad845a90de4c3e1c
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Feb 19 00:40:42 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Feb 22 01:01:27 2026 +0100

    avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for w_avg
    
    They only add overhead (in the form of another function call,
    sign-extending some parameters to 64bit (although the upper
    bits are not used at all) and rederiving the actual number
    of bits (from the maximum value (1<<bpp)-1)).
    
    Old benchmarks:
    w_avg_8_2x2_c:                                          16.4 ( 1.00x)
    w_avg_8_2x2_avx2:                                       12.9 ( 1.27x)
    w_avg_8_4x4_c:                                          48.0 ( 1.00x)
    w_avg_8_4x4_avx2:                                       14.9 ( 3.23x)
    w_avg_8_8x8_c:                                         168.2 ( 1.00x)
    w_avg_8_8x8_avx2:                                       22.4 ( 7.49x)
    w_avg_8_16x16_c:                                       396.5 ( 1.00x)
    w_avg_8_16x16_avx2:                                     47.9 ( 8.28x)
    w_avg_8_32x32_c:                                      1466.3 ( 1.00x)
    w_avg_8_32x32_avx2:                                    172.8 ( 8.48x)
    w_avg_8_64x64_c:                                      5629.3 ( 1.00x)
    w_avg_8_64x64_avx2:                                    678.7 ( 8.29x)
    w_avg_8_128x128_c:                                   22122.4 ( 1.00x)
    w_avg_8_128x128_avx2:                                 2743.5 ( 8.06x)
    w_avg_10_2x2_c:                                         18.7 ( 1.00x)
    w_avg_10_2x2_avx2:                                      13.1 ( 1.43x)
    w_avg_10_4x4_c:                                         50.3 ( 1.00x)
    w_avg_10_4x4_avx2:                                      15.9 ( 3.17x)
    w_avg_10_8x8_c:                                        109.3 ( 1.00x)
    w_avg_10_8x8_avx2:                                      20.6 ( 5.30x)
    w_avg_10_16x16_c:                                      395.5 ( 1.00x)
    w_avg_10_16x16_avx2:                                    44.8 ( 8.83x)
    w_avg_10_32x32_c:                                     1534.2 ( 1.00x)
    w_avg_10_32x32_avx2:                                   141.4 (10.85x)
    w_avg_10_64x64_c:                                     6003.6 ( 1.00x)
    w_avg_10_64x64_avx2:                                   557.4 (10.77x)
    w_avg_10_128x128_c:                                  23722.7 ( 1.00x)
    w_avg_10_128x128_avx2:                                2205.0 (10.76x)
    w_avg_12_2x2_c:                                         18.6 ( 1.00x)
    w_avg_12_2x2_avx2:                                      13.1 ( 1.42x)
    w_avg_12_4x4_c:                                         52.2 ( 1.00x)
    w_avg_12_4x4_avx2:                                      16.1 ( 3.24x)
    w_avg_12_8x8_c:                                        109.2 ( 1.00x)
    w_avg_12_8x8_avx2:                                      20.6 ( 5.29x)
    w_avg_12_16x16_c:                                      396.1 ( 1.00x)
    w_avg_12_16x16_avx2:                                    45.0 ( 8.81x)
    w_avg_12_32x32_c:                                     1532.6 ( 1.00x)
    w_avg_12_32x32_avx2:                                   142.1 (10.79x)
    w_avg_12_64x64_c:                                     6002.2 ( 1.00x)
    w_avg_12_64x64_avx2:                                   557.3 (10.77x)
    w_avg_12_128x128_c:                                  23748.7 ( 1.00x)
    w_avg_12_128x128_avx2:                                2206.4 (10.76x)
    
    New benchmarks:
    w_avg_8_2x2_c:                                          16.0 ( 1.00x)
    w_avg_8_2x2_avx2:                                        9.3 ( 1.71x)
    w_avg_8_4x4_c:                                          48.4 ( 1.00x)
    w_avg_8_4x4_avx2:                                       12.4 ( 3.91x)
    w_avg_8_8x8_c:                                         168.7 ( 1.00x)
    w_avg_8_8x8_avx2:                                       21.1 ( 8.00x)
    w_avg_8_16x16_c:                                       394.5 ( 1.00x)
    w_avg_8_16x16_avx2:                                     46.2 ( 8.54x)
    w_avg_8_32x32_c:                                      1456.3 ( 1.00x)
    w_avg_8_32x32_avx2:                                    171.8 ( 8.48x)
    w_avg_8_64x64_c:                                      5636.2 ( 1.00x)
    w_avg_8_64x64_avx2:                                    676.9 ( 8.33x)
    w_avg_8_128x128_c:                                   22129.1 ( 1.00x)
    w_avg_8_128x128_avx2:                                 2734.3 ( 8.09x)
    w_avg_10_2x2_c:                                         18.7 ( 1.00x)
    w_avg_10_2x2_avx2:                                      10.3 ( 1.82x)
    w_avg_10_4x4_c:                                         50.8 ( 1.00x)
    w_avg_10_4x4_avx2:                                      13.4 ( 3.79x)
    w_avg_10_8x8_c:                                        109.7 ( 1.00x)
    w_avg_10_8x8_avx2:                                      20.4 ( 5.38x)
    w_avg_10_16x16_c:                                      395.2 ( 1.00x)
    w_avg_10_16x16_avx2:                                    41.7 ( 9.48x)
    w_avg_10_32x32_c:                                     1535.6 ( 1.00x)
    w_avg_10_32x32_avx2:                                   137.9 (11.13x)
    w_avg_10_64x64_c:                                     6002.1 ( 1.00x)
    w_avg_10_64x64_avx2:                                   548.5 (10.94x)
    w_avg_10_128x128_c:                                  23742.7 ( 1.00x)
    w_avg_10_128x128_avx2:                                2179.8 (10.89x)
    w_avg_12_2x2_c:                                         18.9 ( 1.00x)
    w_avg_12_2x2_avx2:                                      10.3 ( 1.84x)
    w_avg_12_4x4_c:                                         52.4 ( 1.00x)
    w_avg_12_4x4_avx2:                                      13.4 ( 3.91x)
    w_avg_12_8x8_c:                                        109.2 ( 1.00x)
    w_avg_12_8x8_avx2:                                      20.3 ( 5.39x)
    w_avg_12_16x16_c:                                      396.3 ( 1.00x)
    w_avg_12_16x16_avx2:                                    41.7 ( 9.51x)
    w_avg_12_32x32_c:                                     1532.6 ( 1.00x)
    w_avg_12_32x32_avx2:                                   138.6 (11.06x)
    w_avg_12_64x64_c:                                     5996.7 ( 1.00x)
    w_avg_12_64x64_avx2:                                   549.6 (10.91x)
    w_avg_12_128x128_c:                                  23738.0 ( 1.00x)
    w_avg_12_128x128_avx2:                                2177.2 (10.90x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/dsp_init.c | 26 +++-----------
 libavcodec/x86/vvc/mc.asm     | 84 ++++++++++++++++++++++---------------------
 2 files changed, 48 insertions(+), 62 deletions(-)

diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 80df8e46ee..357f4ea8a1 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -35,14 +35,6 @@
 #define bf(fn, bd,  opt) fn##_##bd##_##opt
 #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
 
-#define AVG_BPC_PROTOTYPES(bpc, opt)                                           
                      \
-void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,            
                      \
-    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
                      \
-    intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
-
-AVG_BPC_PROTOTYPES( 8, avx2)
-AVG_BPC_PROTOTYPES(16, avx2)
-
 #define DMVR_PROTOTYPES(bd, opt)                                               
                     \
 void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t 
src_stride,               \
      int height, intptr_t mx, intptr_t my, int width);                         
                     \
@@ -168,19 +160,6 @@ FW_PUT_AVX2(12)
 FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
-#define AVG_FUNCS(bpc, bd, opt)                                                
                     \
-static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,         
                     \
-    const int16_t *src0, const int16_t *src1, int width, int height,           
                     \
-    int denom, int w0, int w1, int o0, int o1)                                 
                     \
-{                                                                              
                     \
-    BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height,     
                     \
-        denom, w0, w1, o0, o1, (1 << bd)  - 1);                                
                     \
-}
-
-AVG_FUNCS(8,  8,  avx2)
-AVG_FUNCS(16, 10, avx2)
-AVG_FUNCS(16, 12, avx2)
-
 #define ALF_FUNCS(bpc, bd, opt)                                                
                                          \
 static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
     int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
@@ -249,8 +228,11 @@ SAO_FILTER_FUNCS(12, avx2)
 #define AVG_INIT(bd, opt) do {                                       \
 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,     \
     const int16_t *src0, const int16_t *src1, int width, int height);\
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   \
+    const int16_t *src0, const int16_t *src1, int width, int height, \
+    int denom, int w0, int w1,  int o0, int o1);                     \
     c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
-    c->inter.w_avg  = bf(vvc_w_avg, bd, opt);                        \
+    c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
 } while (0)
 
 #define DMVR_INIT(bd) do {                                           \
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 7599ee2e6a..8ba493aebd 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -48,8 +48,8 @@ SECTION_RODATA
 
 AVG_JMP_TABLE    avg,  8,  8, avx2,                2, 4, 8, 16, 32, 64, 128
 AVG_JMP_TABLE    avg, 16, 10, avx2,                2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE  w_avg,  8,  8bpc, avx2,             2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE  w_avg, 16, 16bpc, avx2,             2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE  w_avg,  8,  8, avx2,                2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE  w_avg, 16, 10, avx2,                2, 4, 8, 16, 32, 64, 128
 
 SECTION .text
 
@@ -242,64 +242,68 @@ cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
     AVG_FN               %1, AVG, %3
 %endmacro
 
-;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
-;    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
-;    intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
-%macro VVC_W_AVG_AVX2 1
-cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, 
t0, t1
-
-    movifnidn            hd, hm
-
-    movifnidn           t0d, r8m                ; w1
-    shl                 t0d, 16
-    mov                 t0w, r7m                ; w0
-    movd                xm3, t0d
-    vpbroadcastd         m3, xm3                ; w0, w1
-
-%if %1 != 8
-    pxor                m6, m6                  ;pixel min
-    vpbroadcastw        m7, r11m                ;pixel max
+;void ff_vvc_w_avg_%2_avx(uint8_t *dst, ptrdiff_t dst_stride,
+;                         const int16_t *src0, const int16_t *src1, int width, 
int height,
+;                         int denom, intptr_t w0, int w1, int o0, int o1);
+%macro VVC_W_AVG_AVX2 3
+cglobal vvc_w_avg_%2, 4, 7+2*UNIX64, 6+2*(%1 != 8), dst, stride, src0, src1, 
w, h
+%if UNIX64
+    ; r6-r8 are volatile and not used for parameter passing
+    DECLARE_REG_TMP 6, 7, 8
+%else ; Win64
+    ; r4-r6 are volatile and not used for parameter passing
+    DECLARE_REG_TMP 4, 5, 6
 %endif
 
-    mov                 t1q, rcx                ; save ecx
-    mov                 ecx, r11m
-    inc                 ecx                     ; bd
-    tzcnt               ecx, ecx
-    sub                 ecx, 8
+    mov                 t1d, r6m                ; denom
     mov                 t0d, r9m                ; o0
     add                 t0d, r10m               ; o1
-    shl                 t0d, cl
-    inc                 t0d                     ;((o0 + o1) << (BIT_DEPTH - 
8)) + 1
-
-    neg                 ecx
-    add                 ecx, 7
-    add                 ecx, r6m
-    movd                xm2, ecx                ; shift
+    movifnidn           t2d, r8m                ; w1
+    add                 t1d, 15-%2
+%if %2 != 8
+    shl                 t0d, %2 - 8
+%endif
+    movd                xm2, t1d                ; shift
+    inc                 t0d                     ; ((o0 + o1) << (BIT_DEPTH - 
8)) + 1
+    shl                 t2d, 16
+    movd                xm4, t0d
+    mov                 t2w, r7m                ; w0
+    movd                xm3, t2d
+    vpbroadcastd         m3, xm3                ; w0, w1
 
-    dec                ecx
-    shl                t0d, cl
-    movd               xm4, t0d
-    vpbroadcastd        m4, xm4                 ; offset
-    mov                rcx, t1q                 ; restore ecx
+%if %1 != 8
+    pcmpeqw              m7, m7
+    pxor                 m6, m6                 ; pixel min
+    psrlw                m7, 16-%2              ; pixel max
+%endif
 
     lea                 r6, [w_avg_%1 %+ SUFFIX %+ _table]
     tzcnt               wd, wm
     movsxd              wq, dword [r6+wq*4]
+
+    pslld               xm4, xm2
+    psrad               xm4, 1
+    vpbroadcastd         m4, xm4                 ; offset
+
+    movifnidn            hd, hm
+
     add                 wq, r6
-    AVG_FN              %1, W_AVG
+    AVG_FN              %1, W_AVG, %3
 %endmacro
 
 INIT_YMM avx2
 
 VVC_AVG_AVX2 16, 12, 0
 
+VVC_W_AVG_AVX2 16, 12, 0
+
 VVC_AVG_AVX2 16, 10, 1
 
-VVC_AVG_AVX2 8, 8, 1
+VVC_W_AVG_AVX2 16, 10, 1
 
-VVC_W_AVG_AVX2 16
+VVC_AVG_AVX2 8, 8, 1
 
-VVC_W_AVG_AVX2 8
+VVC_W_AVG_AVX2 8, 8, 1
 %endif
 
 %endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to