This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 7bf9c1e3f6effbe7d2dd53096bf2a7dbbb07d7ff Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Feb 17 15:14:04 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Feb 22 00:57:56 2026 +0100 avcodec/x86/vvc/mc: Avoid redundant clipping for 8bit It is already done by packuswb. Old benchmarks: avg_8_2x2_c: 11.1 ( 1.00x) avg_8_2x2_avx2: 8.6 ( 1.28x) avg_8_4x4_c: 30.0 ( 1.00x) avg_8_4x4_avx2: 10.8 ( 2.78x) avg_8_8x8_c: 132.0 ( 1.00x) avg_8_8x8_avx2: 25.7 ( 5.14x) avg_8_16x16_c: 254.6 ( 1.00x) avg_8_16x16_avx2: 33.2 ( 7.67x) avg_8_32x32_c: 897.5 ( 1.00x) avg_8_32x32_avx2: 115.6 ( 7.76x) avg_8_64x64_c: 3316.9 ( 1.00x) avg_8_64x64_avx2: 626.5 ( 5.29x) avg_8_128x128_c: 12973.6 ( 1.00x) avg_8_128x128_avx2: 1914.0 ( 6.78x) w_avg_8_2x2_c: 16.7 ( 1.00x) w_avg_8_2x2_avx2: 14.4 ( 1.16x) w_avg_8_4x4_c: 48.2 ( 1.00x) w_avg_8_4x4_avx2: 16.5 ( 2.92x) w_avg_8_8x8_c: 168.1 ( 1.00x) w_avg_8_8x8_avx2: 49.7 ( 3.38x) w_avg_8_16x16_c: 392.4 ( 1.00x) w_avg_8_16x16_avx2: 61.1 ( 6.43x) w_avg_8_32x32_c: 1455.3 ( 1.00x) w_avg_8_32x32_avx2: 224.6 ( 6.48x) w_avg_8_64x64_c: 5632.1 ( 1.00x) w_avg_8_64x64_avx2: 896.9 ( 6.28x) w_avg_8_128x128_c: 22136.3 ( 1.00x) w_avg_8_128x128_avx2: 3626.7 ( 6.10x) New benchmarks: avg_8_2x2_c: 12.3 ( 1.00x) avg_8_2x2_avx2: 8.1 ( 1.52x) avg_8_4x4_c: 30.3 ( 1.00x) avg_8_4x4_avx2: 11.3 ( 2.67x) avg_8_8x8_c: 131.8 ( 1.00x) avg_8_8x8_avx2: 21.3 ( 6.20x) avg_8_16x16_c: 255.0 ( 1.00x) avg_8_16x16_avx2: 30.6 ( 8.33x) avg_8_32x32_c: 898.5 ( 1.00x) avg_8_32x32_avx2: 104.9 ( 8.57x) avg_8_64x64_c: 3317.7 ( 1.00x) avg_8_64x64_avx2: 540.9 ( 6.13x) avg_8_128x128_c: 12986.5 ( 1.00x) avg_8_128x128_avx2: 1663.4 ( 7.81x) w_avg_8_2x2_c: 16.8 ( 1.00x) w_avg_8_2x2_avx2: 13.9 ( 1.21x) w_avg_8_4x4_c: 48.2 ( 1.00x) w_avg_8_4x4_avx2: 16.2 ( 2.98x) w_avg_8_8x8_c: 168.6 ( 1.00x) w_avg_8_8x8_avx2: 46.3 ( 3.64x) w_avg_8_16x16_c: 392.4 ( 1.00x) w_avg_8_16x16_avx2: 57.7 ( 6.80x) w_avg_8_32x32_c: 1454.6 ( 1.00x) w_avg_8_32x32_avx2: 214.6 ( 6.78x) 
w_avg_8_64x64_c: 5638.4 ( 1.00x) w_avg_8_64x64_avx2: 875.6 ( 6.44x) w_avg_8_128x128_c: 22133.5 ( 1.00x) w_avg_8_128x128_avx2: 3334.3 ( 6.64x) Also saves 550B of .text here. The improvements will likely be even better on Win64, because it avoids using two nonvolatile registers in the weighted average case. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/mc.asm | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm index 30aa97c65a..a3f858edd8 100644 --- a/libavcodec/x86/vvc/mc.asm +++ b/libavcodec/x86/vvc/mc.asm @@ -64,12 +64,12 @@ SECTION .text %rep %3 %define off %%i AVG_LOAD_W16 0, off - %2 + %2 %1 AVG_SAVE_W16 %1, 0, off AVG_LOAD_W16 1, off - %2 + %2 %1 AVG_SAVE_W16 %1, 1, off %assign %%i %%i+1 @@ -84,7 +84,7 @@ SECTION .text pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1 movd xm1, [src1q] pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1 - %2 + %2 %1 AVG_SAVE_W2 %1 AVG_LOOP_END .w2 @@ -93,7 +93,7 @@ SECTION .text pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1 movq xm1, [src1q] pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1 - %2 + %2 %1 AVG_SAVE_W4 %1 AVG_LOOP_END .w4 @@ -103,7 +103,7 @@ SECTION .text vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1 vinserti128 m1, m1, [src1q], 0 vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1 - %2 + %2 %1 AVG_SAVE_W8 %1 AVG_LOOP_END .w8 @@ -132,13 +132,15 @@ SECTION .text RET %endmacro -%macro AVG 0 +%macro AVG 1 paddsw m0, m1 pmulhrsw m0, m2 +%if %1 != 8 CLIPW m0, m3, m4 +%endif %endmacro -%macro W_AVG 0 +%macro W_AVG 1 punpckhwd m5, m0, m1 pmaddwd m5, m3 paddd m5, m4 @@ -150,7 +152,9 @@ SECTION .text psrad m0, xm2 packssdw m0, m5 +%if %1 != 8 CLIPW m0, m6, m7 +%endif %endmacro %macro AVG_LOAD_W16 2 ; line, offset @@ -217,11 +221,13 @@ SECTION .text ;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, ; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); %macro VVC_AVG_AVX2 1 -cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd +cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd movifnidn hd, hm +%if %1 != 8 pxor m3, m3 ; pixel min vpbroadcastw m4, bdm ; pixel max +%endif movifnidn bdd, bdm inc bdd @@ -245,7 +251,7 @@ cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd ; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, ; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); %macro VVC_W_AVG_AVX2 1 -cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1 +cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1 movifnidn hd, hm @@ -255,8 +261,10 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1 movd xm3, t0d vpbroadcastd m3, xm3 ; w0, w1 +%if %1 != 8 pxor m6, m6 ;pixel min vpbroadcastw m7, r11m ;pixel max +%endif mov t1q, rcx ; save ecx mov ecx, r11m _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
