This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit ea78402e9c78173118fb1e71d4192cb6840388e7 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Feb 17 23:00:30 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Feb 22 00:58:33 2026 +0100 avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for avg Up until now, there were two averaging assembly functions, one for eight-bit content and one for <=16-bit content; there are also three C-wrappers around these functions, for 8, 10 and 12 bpp. These wrappers simply forward the maximum permissible value (i.e. (1<<bpp)-1) and promote some integer values to ptrdiff_t. Yet these wrappers are absolutely useless: The assembly functions rederive the bpp from the maximum and only the integer part of the promoted ptrdiff_t values is ever used. Of course, these wrappers also entail an additional call (not a tail call, because the additional maximum parameter is passed on the stack). Remove the wrappers and add per-bpp assembly functions instead. Given that the only difference between 10 and 12 bits is some constants in registers, the main part of these functions can be shared (given that this code uses a jumptable, it can even be done without adding any additional jump). 
Old benchmarks: avg_8_2x2_c: 11.4 ( 1.00x) avg_8_2x2_avx2: 7.9 ( 1.44x) avg_8_4x4_c: 30.7 ( 1.00x) avg_8_4x4_avx2: 10.4 ( 2.95x) avg_8_8x8_c: 134.5 ( 1.00x) avg_8_8x8_avx2: 16.6 ( 8.12x) avg_8_16x16_c: 255.6 ( 1.00x) avg_8_16x16_avx2: 28.2 ( 9.07x) avg_8_32x32_c: 897.7 ( 1.00x) avg_8_32x32_avx2: 83.9 (10.70x) avg_8_64x64_c: 3320.0 ( 1.00x) avg_8_64x64_avx2: 321.1 (10.34x) avg_8_128x128_c: 12981.8 ( 1.00x) avg_8_128x128_avx2: 1480.1 ( 8.77x) avg_10_2x2_c: 12.0 ( 1.00x) avg_10_2x2_avx2: 8.4 ( 1.43x) avg_10_4x4_c: 34.9 ( 1.00x) avg_10_4x4_avx2: 9.8 ( 3.56x) avg_10_8x8_c: 76.8 ( 1.00x) avg_10_8x8_avx2: 15.1 ( 5.08x) avg_10_16x16_c: 256.6 ( 1.00x) avg_10_16x16_avx2: 25.1 (10.20x) avg_10_32x32_c: 932.9 ( 1.00x) avg_10_32x32_avx2: 73.4 (12.72x) avg_10_64x64_c: 3517.9 ( 1.00x) avg_10_64x64_avx2: 414.8 ( 8.48x) avg_10_128x128_c: 13695.3 ( 1.00x) avg_10_128x128_avx2: 1648.1 ( 8.31x) avg_12_2x2_c: 13.1 ( 1.00x) avg_12_2x2_avx2: 8.6 ( 1.53x) avg_12_4x4_c: 35.4 ( 1.00x) avg_12_4x4_avx2: 10.1 ( 3.49x) avg_12_8x8_c: 76.6 ( 1.00x) avg_12_8x8_avx2: 16.7 ( 4.60x) avg_12_16x16_c: 256.6 ( 1.00x) avg_12_16x16_avx2: 25.5 (10.07x) avg_12_32x32_c: 933.2 ( 1.00x) avg_12_32x32_avx2: 75.7 (12.34x) avg_12_64x64_c: 3519.1 ( 1.00x) avg_12_64x64_avx2: 416.8 ( 8.44x) avg_12_128x128_c: 13695.1 ( 1.00x) avg_12_128x128_avx2: 1651.6 ( 8.29x) New benchmarks: avg_8_2x2_c: 11.5 ( 1.00x) avg_8_2x2_avx2: 6.0 ( 1.91x) avg_8_4x4_c: 29.7 ( 1.00x) avg_8_4x4_avx2: 8.0 ( 3.72x) avg_8_8x8_c: 131.4 ( 1.00x) avg_8_8x8_avx2: 12.2 (10.74x) avg_8_16x16_c: 254.3 ( 1.00x) avg_8_16x16_avx2: 24.8 (10.25x) avg_8_32x32_c: 897.7 ( 1.00x) avg_8_32x32_avx2: 77.8 (11.54x) avg_8_64x64_c: 3321.3 ( 1.00x) avg_8_64x64_avx2: 318.7 (10.42x) avg_8_128x128_c: 12988.4 ( 1.00x) avg_8_128x128_avx2: 1430.1 ( 9.08x) avg_10_2x2_c: 12.1 ( 1.00x) avg_10_2x2_avx2: 5.7 ( 2.13x) avg_10_4x4_c: 35.0 ( 1.00x) avg_10_4x4_avx2: 9.0 ( 3.88x) avg_10_8x8_c: 77.2 ( 1.00x) avg_10_8x8_avx2: 12.4 ( 6.24x) avg_10_16x16_c: 256.2 ( 1.00x) avg_10_16x16_avx2: 
24.3 (10.56x) avg_10_32x32_c: 932.9 ( 1.00x) avg_10_32x32_avx2: 71.9 (12.97x) avg_10_64x64_c: 3516.8 ( 1.00x) avg_10_64x64_avx2: 414.7 ( 8.48x) avg_10_128x128_c: 13693.7 ( 1.00x) avg_10_128x128_avx2: 1609.3 ( 8.51x) avg_12_2x2_c: 14.1 ( 1.00x) avg_12_2x2_avx2: 5.7 ( 2.48x) avg_12_4x4_c: 35.8 ( 1.00x) avg_12_4x4_avx2: 9.0 ( 3.96x) avg_12_8x8_c: 76.9 ( 1.00x) avg_12_8x8_avx2: 12.4 ( 6.22x) avg_12_16x16_c: 256.5 ( 1.00x) avg_12_16x16_avx2: 24.4 (10.50x) avg_12_32x32_c: 934.1 ( 1.00x) avg_12_32x32_avx2: 72.0 (12.97x) avg_12_64x64_c: 3518.2 ( 1.00x) avg_12_64x64_avx2: 414.8 ( 8.48x) avg_12_128x128_c: 13689.5 ( 1.00x) avg_12_128x128_avx2: 1611.1 ( 8.50x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/dsp_init.c | 11 +++------ libavcodec/x86/vvc/mc.asm | 55 ++++++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index cbcfa40a66..80df8e46ee 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -36,8 +36,6 @@ #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt #define AVG_BPC_PROTOTYPES(bpc, opt) \ -void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \ void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); @@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) #define AVG_FUNCS(bpc, bd, opt) \ -static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *src0, const int16_t *src1, int width, int height) \ -{ \ - BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ -} \ static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const 
int16_t *src0, const int16_t *src1, int width, int height, \ int denom, int w0, int w1, int o0, int o1) \ @@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2) } while (0) #define AVG_INIT(bd, opt) do { \ - c->inter.avg = bf(vvc_avg, bd, opt); \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *src0, const int16_t *src1, int width, int height);\ + c->inter.avg = bf(ff_vvc_avg, bd, opt); \ c->inter.w_avg = bf(vvc_w_avg, bd, opt); \ } while (0) diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm index 3272765b57..7599ee2e6a 100644 --- a/libavcodec/x86/vvc/mc.asm +++ b/libavcodec/x86/vvc/mc.asm @@ -35,23 +35,21 @@ SECTION_RODATA %if HAVE_AVX2_EXTERNAL -pw_256 times 2 dw 256 - -%macro AVG_JMP_TABLE 3-* - %xdefine %1_%2_%3_table (%%table - 2*%4) - %xdefine %%base %1_%2_%3_table - %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3) +%macro AVG_JMP_TABLE 4-* + %xdefine %1_%2_%4_table (%%table - 2*%5) + %xdefine %%base %1_%2_%4_table + %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4) %%table: - %rep %0 - 3 - dd %%prefix %+ .w%4 - %%base + %rep %0 - 4 + dd %%prefix %+ .w%5 - %%base %rotate 1 %endrep %endmacro -AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128 -AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128 -AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128 -AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128 SECTION .text @@ -72,9 +70,10 @@ SECTION .text %endrep %endmacro -%macro AVG_FN 2 ; bpc, op +%macro AVG_FN 2-3 1; bpc, op, instantiate implementation jmp wq +%if %3 INIT_XMM cpuname .w2: movd xm0, [src0q] @@ -128,6 +127,7 @@ INIT_YMM cpuname .ret: RET +%endif %endmacro %macro AVG 2 ; bpc, width @@ -222,31 +222,24 @@ INIT_YMM cpuname %define 
AVG_SRC_STRIDE MAX_PB_SIZE*2 -;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, -; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); -%macro VVC_AVG_AVX2 1 -cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd +;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, +; const int16_t *src1, int width, int height); +%macro VVC_AVG_AVX2 3 +cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h movifnidn hd, hm + pcmpeqw m2, m2 %if %1 != 8 pxor m3, m3 ; pixel min - vpbroadcastw m4, bdm ; pixel max %endif - movifnidn bdd, bdm - inc bdd - tzcnt bdd, bdd ; bit depth - - sub bdd, 8 - movd xm0, bdd - vpbroadcastd m2, [pw_256] - psllw m2, xm0 ; shift - lea r6, [avg_%1 %+ SUFFIX %+ _table] tzcnt wd, wm movsxd wq, dword [r6+wq*4] + psrlw m4, m2, 16-%2 ; pixel max + psubw m2, m4, m2 ; 1 << bpp add wq, r6 - AVG_FN %1, AVG + AVG_FN %1, AVG, %3 %endmacro ;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride, @@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, INIT_YMM avx2 -VVC_AVG_AVX2 16 +VVC_AVG_AVX2 16, 12, 0 + +VVC_AVG_AVX2 16, 10, 1 -VVC_AVG_AVX2 8 +VVC_AVG_AVX2 8, 8, 1 VVC_W_AVG_AVX2 16 _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
