This is an automated email from the git hooks/post-receive script. A commit was pushed to branch master of the ffmpeg repository.
commit caa0ae0cfb35de0ae3fd5f346caef89d62eeaf7c Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Feb 17 17:34:49 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Feb 22 00:57:56 2026 +0100 avcodec/x86/vvc/mc: Avoid pextr[dq], v{insert,extract}i128 Use mov[dq], movdqu instead if the least significant parts are set (i.e. if the immediate value is 0x0). Old benchmarks: avg_8_2x2_c: 11.3 ( 1.00x) avg_8_2x2_avx2: 7.5 ( 1.50x) avg_8_4x4_c: 31.2 ( 1.00x) avg_8_4x4_avx2: 10.7 ( 2.91x) avg_8_8x8_c: 133.5 ( 1.00x) avg_8_8x8_avx2: 21.2 ( 6.30x) avg_8_16x16_c: 254.7 ( 1.00x) avg_8_16x16_avx2: 30.1 ( 8.46x) avg_8_32x32_c: 896.9 ( 1.00x) avg_8_32x32_avx2: 103.9 ( 8.63x) avg_8_64x64_c: 3320.7 ( 1.00x) avg_8_64x64_avx2: 539.4 ( 6.16x) avg_8_128x128_c: 12991.5 ( 1.00x) avg_8_128x128_avx2: 1661.3 ( 7.82x) avg_10_2x2_c: 21.3 ( 1.00x) avg_10_2x2_avx2: 8.3 ( 2.55x) avg_10_4x4_c: 34.9 ( 1.00x) avg_10_4x4_avx2: 10.6 ( 3.28x) avg_10_8x8_c: 76.3 ( 1.00x) avg_10_8x8_avx2: 20.2 ( 3.77x) avg_10_16x16_c: 255.9 ( 1.00x) avg_10_16x16_avx2: 24.1 (10.60x) avg_10_32x32_c: 932.4 ( 1.00x) avg_10_32x32_avx2: 73.3 (12.72x) avg_10_64x64_c: 3516.4 ( 1.00x) avg_10_64x64_avx2: 601.7 ( 5.84x) avg_10_128x128_c: 13690.6 ( 1.00x) avg_10_128x128_avx2: 1613.2 ( 8.49x) avg_12_2x2_c: 14.0 ( 1.00x) avg_12_2x2_avx2: 8.3 ( 1.67x) avg_12_4x4_c: 35.3 ( 1.00x) avg_12_4x4_avx2: 10.9 ( 3.26x) avg_12_8x8_c: 76.5 ( 1.00x) avg_12_8x8_avx2: 20.3 ( 3.77x) avg_12_16x16_c: 256.7 ( 1.00x) avg_12_16x16_avx2: 24.1 (10.63x) avg_12_32x32_c: 932.5 ( 1.00x) avg_12_32x32_avx2: 73.3 (12.72x) avg_12_64x64_c: 3520.5 ( 1.00x) avg_12_64x64_avx2: 602.6 ( 5.84x) avg_12_128x128_c: 13689.6 ( 1.00x) avg_12_128x128_avx2: 1613.1 ( 8.49x) w_avg_8_2x2_c: 16.7 ( 1.00x) w_avg_8_2x2_avx2: 13.4 ( 1.25x) w_avg_8_4x4_c: 44.5 ( 1.00x) w_avg_8_4x4_avx2: 15.9 ( 2.81x) w_avg_8_8x8_c: 166.1 ( 1.00x) w_avg_8_8x8_avx2: 45.7 ( 3.63x) w_avg_8_16x16_c: 392.9 ( 1.00x) w_avg_8_16x16_avx2: 57.8 ( 6.80x) w_avg_8_32x32_c: 1455.5 ( 
1.00x) w_avg_8_32x32_avx2: 215.0 ( 6.77x) w_avg_8_64x64_c: 5621.8 ( 1.00x) w_avg_8_64x64_avx2: 875.2 ( 6.42x) w_avg_8_128x128_c: 22131.3 ( 1.00x) w_avg_8_128x128_avx2: 3390.1 ( 6.53x) w_avg_10_2x2_c: 18.0 ( 1.00x) w_avg_10_2x2_avx2: 14.0 ( 1.28x) w_avg_10_4x4_c: 53.9 ( 1.00x) w_avg_10_4x4_avx2: 15.9 ( 3.40x) w_avg_10_8x8_c: 109.5 ( 1.00x) w_avg_10_8x8_avx2: 40.4 ( 2.71x) w_avg_10_16x16_c: 395.7 ( 1.00x) w_avg_10_16x16_avx2: 44.7 ( 8.86x) w_avg_10_32x32_c: 1532.7 ( 1.00x) w_avg_10_32x32_avx2: 142.4 (10.77x) w_avg_10_64x64_c: 6007.7 ( 1.00x) w_avg_10_64x64_avx2: 745.5 ( 8.06x) w_avg_10_128x128_c: 23719.7 ( 1.00x) w_avg_10_128x128_avx2: 2217.7 (10.70x) w_avg_12_2x2_c: 18.9 ( 1.00x) w_avg_12_2x2_avx2: 13.6 ( 1.38x) w_avg_12_4x4_c: 47.5 ( 1.00x) w_avg_12_4x4_avx2: 15.9 ( 2.99x) w_avg_12_8x8_c: 109.3 ( 1.00x) w_avg_12_8x8_avx2: 40.9 ( 2.67x) w_avg_12_16x16_c: 395.6 ( 1.00x) w_avg_12_16x16_avx2: 44.8 ( 8.84x) w_avg_12_32x32_c: 1531.0 ( 1.00x) w_avg_12_32x32_avx2: 141.8 (10.80x) w_avg_12_64x64_c: 6016.7 ( 1.00x) w_avg_12_64x64_avx2: 732.8 ( 8.21x) w_avg_12_128x128_c: 23762.2 ( 1.00x) w_avg_12_128x128_avx2: 2223.4 (10.69x) New benchmarks: avg_8_2x2_c: 11.3 ( 1.00x) avg_8_2x2_avx2: 7.6 ( 1.49x) avg_8_4x4_c: 31.2 ( 1.00x) avg_8_4x4_avx2: 10.8 ( 2.89x) avg_8_8x8_c: 131.6 ( 1.00x) avg_8_8x8_avx2: 15.6 ( 8.42x) avg_8_16x16_c: 255.3 ( 1.00x) avg_8_16x16_avx2: 27.9 ( 9.16x) avg_8_32x32_c: 897.9 ( 1.00x) avg_8_32x32_avx2: 81.2 (11.06x) avg_8_64x64_c: 3320.0 ( 1.00x) avg_8_64x64_avx2: 335.1 ( 9.91x) avg_8_128x128_c: 12999.1 ( 1.00x) avg_8_128x128_avx2: 1456.3 ( 8.93x) avg_10_2x2_c: 12.0 ( 1.00x) avg_10_2x2_avx2: 8.6 ( 1.40x) avg_10_4x4_c: 34.9 ( 1.00x) avg_10_4x4_avx2: 9.7 ( 3.61x) avg_10_8x8_c: 76.7 ( 1.00x) avg_10_8x8_avx2: 16.3 ( 4.69x) avg_10_16x16_c: 256.3 ( 1.00x) avg_10_16x16_avx2: 25.2 (10.18x) avg_10_32x32_c: 932.8 ( 1.00x) avg_10_32x32_avx2: 73.3 (12.72x) avg_10_64x64_c: 3518.8 ( 1.00x) avg_10_64x64_avx2: 416.8 ( 8.44x) avg_10_128x128_c: 13691.6 ( 1.00x) 
avg_10_128x128_avx2: 1612.9 ( 8.49x) avg_12_2x2_c: 14.1 ( 1.00x) avg_12_2x2_avx2: 8.7 ( 1.62x) avg_12_4x4_c: 35.7 ( 1.00x) avg_12_4x4_avx2: 9.7 ( 3.68x) avg_12_8x8_c: 77.0 ( 1.00x) avg_12_8x8_avx2: 16.9 ( 4.57x) avg_12_16x16_c: 256.2 ( 1.00x) avg_12_16x16_avx2: 25.7 ( 9.96x) avg_12_32x32_c: 933.5 ( 1.00x) avg_12_32x32_avx2: 74.0 (12.62x) avg_12_64x64_c: 3516.4 ( 1.00x) avg_12_64x64_avx2: 408.7 ( 8.60x) avg_12_128x128_c: 13691.6 ( 1.00x) avg_12_128x128_avx2: 1613.8 ( 8.48x) w_avg_8_2x2_c: 16.7 ( 1.00x) w_avg_8_2x2_avx2: 14.0 ( 1.19x) w_avg_8_4x4_c: 48.2 ( 1.00x) w_avg_8_4x4_avx2: 16.1 ( 3.00x) w_avg_8_8x8_c: 168.0 ( 1.00x) w_avg_8_8x8_avx2: 22.5 ( 7.47x) w_avg_8_16x16_c: 392.5 ( 1.00x) w_avg_8_16x16_avx2: 47.9 ( 8.19x) w_avg_8_32x32_c: 1453.7 ( 1.00x) w_avg_8_32x32_avx2: 176.1 ( 8.26x) w_avg_8_64x64_c: 5631.4 ( 1.00x) w_avg_8_64x64_avx2: 690.8 ( 8.15x) w_avg_8_128x128_c: 22139.5 ( 1.00x) w_avg_8_128x128_avx2: 2742.4 ( 8.07x) w_avg_10_2x2_c: 18.1 ( 1.00x) w_avg_10_2x2_avx2: 13.8 ( 1.31x) w_avg_10_4x4_c: 47.0 ( 1.00x) w_avg_10_4x4_avx2: 16.4 ( 2.87x) w_avg_10_8x8_c: 110.0 ( 1.00x) w_avg_10_8x8_avx2: 21.6 ( 5.09x) w_avg_10_16x16_c: 395.2 ( 1.00x) w_avg_10_16x16_avx2: 45.4 ( 8.71x) w_avg_10_32x32_c: 1533.8 ( 1.00x) w_avg_10_32x32_avx2: 142.6 (10.76x) w_avg_10_64x64_c: 6004.4 ( 1.00x) w_avg_10_64x64_avx2: 672.8 ( 8.92x) w_avg_10_128x128_c: 23748.5 ( 1.00x) w_avg_10_128x128_avx2: 2198.0 (10.80x) w_avg_12_2x2_c: 17.2 ( 1.00x) w_avg_12_2x2_avx2: 13.9 ( 1.24x) w_avg_12_4x4_c: 51.4 ( 1.00x) w_avg_12_4x4_avx2: 16.5 ( 3.11x) w_avg_12_8x8_c: 109.1 ( 1.00x) w_avg_12_8x8_avx2: 22.0 ( 4.96x) w_avg_12_16x16_c: 395.9 ( 1.00x) w_avg_12_16x16_avx2: 44.9 ( 8.81x) w_avg_12_32x32_c: 1533.5 ( 1.00x) w_avg_12_32x32_avx2: 142.3 (10.78x) w_avg_12_64x64_c: 6002.0 ( 1.00x) w_avg_12_64x64_avx2: 557.5 (10.77x) w_avg_12_128x128_c: 23749.5 ( 1.00x) w_avg_12_128x128_avx2: 2202.0 (10.79x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/mc.asm | 16 ++++++++-------- 1 file 
changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm index a3f858edd8..4fb5a19761 100644 --- a/libavcodec/x86/vvc/mc.asm +++ b/libavcodec/x86/vvc/mc.asm @@ -99,9 +99,9 @@ SECTION .text AVG_LOOP_END .w4 .w8: - vinserti128 m0, m0, [src0q], 0 + movu xm0, [src0q] + movu xm1, [src1q] vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1 - vinserti128 m1, m1, [src1q], 0 vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1 %2 %1 AVG_SAVE_W8 %1 @@ -164,7 +164,7 @@ SECTION .text %macro AVG_SAVE_W2 1 ;bpc %if %1 == 16 - pextrd [dstq], xm0, 0 + movd [dstq], xm0 pextrd [dstq + strideq], xm0, 1 %else packuswb m0, m0 @@ -175,23 +175,23 @@ SECTION .text %macro AVG_SAVE_W4 1 ;bpc %if %1 == 16 - pextrq [dstq], xm0, 0 + movq [dstq], xm0 pextrq [dstq + strideq], xm0, 1 %else packuswb m0, m0 - pextrd [dstq], xm0, 0 + movd [dstq], xm0 pextrd [dstq + strideq], xm0, 1 %endif %endmacro %macro AVG_SAVE_W8 1 ;bpc %if %1 == 16 - vextracti128 [dstq], m0, 0 + movu [dstq], xm0 vextracti128 [dstq + strideq], m0, 1 %else packuswb m0, m0 vpermq m0, m0, 1000b - pextrq [dstq], xm0, 0 + movq [dstq], xm0 pextrq [dstq + strideq], xm0, 1 %endif %endmacro @@ -202,7 +202,7 @@ SECTION .text %else packuswb m0, m0 vpermq m0, m0, 1000b - vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0 + movu [dstq + %2 * strideq + %3 * 16], xm0 %endif %endmacro _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
