mc,dsp_init: Avoid pointless wrappers for avg

Andreas Rheinhardt via ffmpeg-cvslog Sat, 21 Feb 2026 17:55:42 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit ea78402e9c78173118fb1e71d4192cb6840388e7
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Feb 17 23:00:30 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Feb 22 00:58:33 2026 +0100

    avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg
    
    Up until now, there have been two averaging assembly functions,
    one for eight bit content and one for <=16 bit content;
    there are also three C wrappers around these functions,
    for 8, 10 and 12 bpp. These wrappers simply forward the
    maximum permissible value (i.e. (1<<bpp)-1) and promote
    some integer values to ptrdiff_t.
    
    Yet these wrappers are absolutely useless: The assembly functions
    rederive the bpp from the maximum and only the integer part
    of the promoted ptrdiff_t values is ever used. Of course,
    these wrappers also entail an additional call (not a tail call,
    because the additional maximum parameter is passed on the stack).
    
    Remove the wrappers and add per-bpp assembly functions instead.
    Given that the only differences between the 10 and 12 bit versions
    are some constants in registers, the main part of these functions
    can be shared (given that this code uses a jump table, it can even
    be done without adding any additional jump).
    
    Old benchmarks:
    avg_8_2x2_c:                                            11.4 ( 1.00x)
    avg_8_2x2_avx2:                                          7.9 ( 1.44x)
    avg_8_4x4_c:                                            30.7 ( 1.00x)
    avg_8_4x4_avx2:                                         10.4 ( 2.95x)
    avg_8_8x8_c:                                           134.5 ( 1.00x)
    avg_8_8x8_avx2:                                         16.6 ( 8.12x)
    avg_8_16x16_c:                                         255.6 ( 1.00x)
    avg_8_16x16_avx2:                                       28.2 ( 9.07x)
    avg_8_32x32_c:                                         897.7 ( 1.00x)
    avg_8_32x32_avx2:                                       83.9 (10.70x)
    avg_8_64x64_c:                                        3320.0 ( 1.00x)
    avg_8_64x64_avx2:                                      321.1 (10.34x)
    avg_8_128x128_c:                                     12981.8 ( 1.00x)
    avg_8_128x128_avx2:                                   1480.1 ( 8.77x)
    avg_10_2x2_c:                                           12.0 ( 1.00x)
    avg_10_2x2_avx2:                                         8.4 ( 1.43x)
    avg_10_4x4_c:                                           34.9 ( 1.00x)
    avg_10_4x4_avx2:                                         9.8 ( 3.56x)
    avg_10_8x8_c:                                           76.8 ( 1.00x)
    avg_10_8x8_avx2:                                        15.1 ( 5.08x)
    avg_10_16x16_c:                                        256.6 ( 1.00x)
    avg_10_16x16_avx2:                                      25.1 (10.20x)
    avg_10_32x32_c:                                        932.9 ( 1.00x)
    avg_10_32x32_avx2:                                      73.4 (12.72x)
    avg_10_64x64_c:                                       3517.9 ( 1.00x)
    avg_10_64x64_avx2:                                     414.8 ( 8.48x)
    avg_10_128x128_c:                                    13695.3 ( 1.00x)
    avg_10_128x128_avx2:                                  1648.1 ( 8.31x)
    avg_12_2x2_c:                                           13.1 ( 1.00x)
    avg_12_2x2_avx2:                                         8.6 ( 1.53x)
    avg_12_4x4_c:                                           35.4 ( 1.00x)
    avg_12_4x4_avx2:                                        10.1 ( 3.49x)
    avg_12_8x8_c:                                           76.6 ( 1.00x)
    avg_12_8x8_avx2:                                        16.7 ( 4.60x)
    avg_12_16x16_c:                                        256.6 ( 1.00x)
    avg_12_16x16_avx2:                                      25.5 (10.07x)
    avg_12_32x32_c:                                        933.2 ( 1.00x)
    avg_12_32x32_avx2:                                      75.7 (12.34x)
    avg_12_64x64_c:                                       3519.1 ( 1.00x)
    avg_12_64x64_avx2:                                     416.8 ( 8.44x)
    avg_12_128x128_c:                                    13695.1 ( 1.00x)
    avg_12_128x128_avx2:                                  1651.6 ( 8.29x)
    
    New benchmarks:
    avg_8_2x2_c:                                            11.5 ( 1.00x)
    avg_8_2x2_avx2:                                          6.0 ( 1.91x)
    avg_8_4x4_c:                                            29.7 ( 1.00x)
    avg_8_4x4_avx2:                                          8.0 ( 3.72x)
    avg_8_8x8_c:                                           131.4 ( 1.00x)
    avg_8_8x8_avx2:                                         12.2 (10.74x)
    avg_8_16x16_c:                                         254.3 ( 1.00x)
    avg_8_16x16_avx2:                                       24.8 (10.25x)
    avg_8_32x32_c:                                         897.7 ( 1.00x)
    avg_8_32x32_avx2:                                       77.8 (11.54x)
    avg_8_64x64_c:                                        3321.3 ( 1.00x)
    avg_8_64x64_avx2:                                      318.7 (10.42x)
    avg_8_128x128_c:                                     12988.4 ( 1.00x)
    avg_8_128x128_avx2:                                   1430.1 ( 9.08x)
    avg_10_2x2_c:                                           12.1 ( 1.00x)
    avg_10_2x2_avx2:                                         5.7 ( 2.13x)
    avg_10_4x4_c:                                           35.0 ( 1.00x)
    avg_10_4x4_avx2:                                         9.0 ( 3.88x)
    avg_10_8x8_c:                                           77.2 ( 1.00x)
    avg_10_8x8_avx2:                                        12.4 ( 6.24x)
    avg_10_16x16_c:                                        256.2 ( 1.00x)
    avg_10_16x16_avx2:                                      24.3 (10.56x)
    avg_10_32x32_c:                                        932.9 ( 1.00x)
    avg_10_32x32_avx2:                                      71.9 (12.97x)
    avg_10_64x64_c:                                       3516.8 ( 1.00x)
    avg_10_64x64_avx2:                                     414.7 ( 8.48x)
    avg_10_128x128_c:                                    13693.7 ( 1.00x)
    avg_10_128x128_avx2:                                  1609.3 ( 8.51x)
    avg_12_2x2_c:                                           14.1 ( 1.00x)
    avg_12_2x2_avx2:                                         5.7 ( 2.48x)
    avg_12_4x4_c:                                           35.8 ( 1.00x)
    avg_12_4x4_avx2:                                         9.0 ( 3.96x)
    avg_12_8x8_c:                                           76.9 ( 1.00x)
    avg_12_8x8_avx2:                                        12.4 ( 6.22x)
    avg_12_16x16_c:                                        256.5 ( 1.00x)
    avg_12_16x16_avx2:                                      24.4 (10.50x)
    avg_12_32x32_c:                                        934.1 ( 1.00x)
    avg_12_32x32_avx2:                                      72.0 (12.97x)
    avg_12_64x64_c:                                       3518.2 ( 1.00x)
    avg_12_64x64_avx2:                                     414.8 ( 8.48x)
    avg_12_128x128_c:                                    13689.5 ( 1.00x)
    avg_12_128x128_avx2:                                  1611.1 ( 8.50x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/dsp_init.c | 11 +++------
 libavcodec/x86/vvc/mc.asm     | 55 ++++++++++++++++++++-----------------------
 2 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index cbcfa40a66..80df8e46ee 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -36,8 +36,6 @@
 #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
 
 #define AVG_BPC_PROTOTYPES(bpc, opt)                                           
                      \
-void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,              
                      \
-    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max);  \
 void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,            
                      \
     const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
                      \
     intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
@@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
 #define AVG_FUNCS(bpc, bd, opt)                                                
                     \
-static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,           
                     \
-    const int16_t *src0, const int16_t *src1, int width, int height)           
                     \
-{                                                                              
                     \
-    BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);           \
-}                                                                              
                     \
 static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,         
                     \
     const int16_t *src0, const int16_t *src1, int width, int height,           
                     \
     int denom, int w0, int w1, int o0, int o1)                                 
                     \
@@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2)
 } while (0)
 
 #define AVG_INIT(bd, opt) do {                                       \
-    c->inter.avg    = bf(vvc_avg, bd, opt);                          \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,     \
+    const int16_t *src0, const int16_t *src1, int width, int height);\
+    c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
     c->inter.w_avg  = bf(vvc_w_avg, bd, opt);                        \
 } while (0)
 
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 3272765b57..7599ee2e6a 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -35,23 +35,21 @@ SECTION_RODATA
 
 %if HAVE_AVX2_EXTERNAL
 
-pw_256  times 2 dw 256
-
-%macro AVG_JMP_TABLE 3-*
-    %xdefine %1_%2_%3_table (%%table - 2*%4)
-    %xdefine %%base %1_%2_%3_table
-    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
+%macro AVG_JMP_TABLE 4-*
+    %xdefine %1_%2_%4_table (%%table - 2*%5)
+    %xdefine %%base %1_%2_%4_table
+    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
     %%table:
-    %rep %0 - 3
-        dd %%prefix %+ .w%4 - %%base
+    %rep %0 - 4
+        dd %%prefix %+ .w%5 - %%base
         %rotate 1
     %endrep
 %endmacro
 
-AVG_JMP_TABLE    avg,  8, avx2,                2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE    avg, 16, avx2,                2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE  w_avg,  8, avx2,                2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE  w_avg, 16, avx2,                2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE    avg,  8,  8, avx2,                2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE    avg, 16, 10, avx2,                2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE  w_avg,  8,  8bpc, avx2,             2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE  w_avg, 16, 16bpc, avx2,             2, 4, 8, 16, 32, 64, 128
 
 SECTION .text
 
@@ -72,9 +70,10 @@ SECTION .text
     %endrep
 %endmacro
 
-%macro AVG_FN 2 ; bpc, op
+%macro AVG_FN 2-3 1; bpc, op, instantiate implementation
    jmp                  wq
 
+%if %3
 INIT_XMM cpuname
 .w2:
     movd                xm0, [src0q]
@@ -128,6 +127,7 @@ INIT_YMM cpuname
 
 .ret:
     RET
+%endif
 %endmacro
 
 %macro AVG   2 ; bpc, width
@@ -222,31 +222,24 @@ INIT_YMM cpuname
 
 %define AVG_SRC_STRIDE MAX_PB_SIZE*2
 
-;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
-;   const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max);
-%macro VVC_AVG_AVX2 1
-cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
+;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t 
*src0,
+;                        const int16_t *src1, int width, int height);
+%macro VVC_AVG_AVX2 3
+cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
     movifnidn            hd, hm
 
+    pcmpeqw              m2, m2
 %if %1 != 8
     pxor                 m3, m3             ; pixel min
-    vpbroadcastw         m4, bdm            ; pixel max
 %endif
 
-    movifnidn           bdd, bdm
-    inc                 bdd
-    tzcnt               bdd, bdd            ; bit depth
-
-    sub                 bdd, 8
-    movd                xm0, bdd
-    vpbroadcastd         m2, [pw_256]
-    psllw                m2, xm0                ; shift
-
     lea                  r6, [avg_%1 %+ SUFFIX %+ _table]
     tzcnt                wd, wm
     movsxd               wq, dword [r6+wq*4]
+    psrlw                m4, m2, 16-%2      ; pixel max
+    psubw                m2, m4, m2         ; 1 << bpp
     add                  wq, r6
-    AVG_FN               %1, AVG
+    AVG_FN               %1, AVG, %3
 %endmacro
 
 ;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
@@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, 
src0, src1, w, h, t0,
 
 INIT_YMM avx2
 
-VVC_AVG_AVX2 16
+VVC_AVG_AVX2 16, 12, 0
+
+VVC_AVG_AVX2 16, 10, 1
 
-VVC_AVG_AVX2 8
+VVC_AVG_AVX2 8, 8, 1
 
 VVC_W_AVG_AVX2 16
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 07/12: avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg

Reply via email to