This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit caa0ae0cfb35de0ae3fd5f346caef89d62eeaf7c
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Feb 17 17:34:49 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Feb 22 00:57:56 2026 +0100

    avcodec/x86/vvc/mc: Avoid pextr[dq], v{insert,extract}i128
    
    Use mov[dq], movdqu instead when only the least significant part
    is read or written (i.e. when the immediate value is 0x0).
    
    Old benchmarks:
    avg_8_2x2_c:                                            11.3 ( 1.00x)
    avg_8_2x2_avx2:                                          7.5 ( 1.50x)
    avg_8_4x4_c:                                            31.2 ( 1.00x)
    avg_8_4x4_avx2:                                         10.7 ( 2.91x)
    avg_8_8x8_c:                                           133.5 ( 1.00x)
    avg_8_8x8_avx2:                                         21.2 ( 6.30x)
    avg_8_16x16_c:                                         254.7 ( 1.00x)
    avg_8_16x16_avx2:                                       30.1 ( 8.46x)
    avg_8_32x32_c:                                         896.9 ( 1.00x)
    avg_8_32x32_avx2:                                      103.9 ( 8.63x)
    avg_8_64x64_c:                                        3320.7 ( 1.00x)
    avg_8_64x64_avx2:                                      539.4 ( 6.16x)
    avg_8_128x128_c:                                     12991.5 ( 1.00x)
    avg_8_128x128_avx2:                                   1661.3 ( 7.82x)
    avg_10_2x2_c:                                           21.3 ( 1.00x)
    avg_10_2x2_avx2:                                         8.3 ( 2.55x)
    avg_10_4x4_c:                                           34.9 ( 1.00x)
    avg_10_4x4_avx2:                                        10.6 ( 3.28x)
    avg_10_8x8_c:                                           76.3 ( 1.00x)
    avg_10_8x8_avx2:                                        20.2 ( 3.77x)
    avg_10_16x16_c:                                        255.9 ( 1.00x)
    avg_10_16x16_avx2:                                      24.1 (10.60x)
    avg_10_32x32_c:                                        932.4 ( 1.00x)
    avg_10_32x32_avx2:                                      73.3 (12.72x)
    avg_10_64x64_c:                                       3516.4 ( 1.00x)
    avg_10_64x64_avx2:                                     601.7 ( 5.84x)
    avg_10_128x128_c:                                    13690.6 ( 1.00x)
    avg_10_128x128_avx2:                                  1613.2 ( 8.49x)
    avg_12_2x2_c:                                           14.0 ( 1.00x)
    avg_12_2x2_avx2:                                         8.3 ( 1.67x)
    avg_12_4x4_c:                                           35.3 ( 1.00x)
    avg_12_4x4_avx2:                                        10.9 ( 3.26x)
    avg_12_8x8_c:                                           76.5 ( 1.00x)
    avg_12_8x8_avx2:                                        20.3 ( 3.77x)
    avg_12_16x16_c:                                        256.7 ( 1.00x)
    avg_12_16x16_avx2:                                      24.1 (10.63x)
    avg_12_32x32_c:                                        932.5 ( 1.00x)
    avg_12_32x32_avx2:                                      73.3 (12.72x)
    avg_12_64x64_c:                                       3520.5 ( 1.00x)
    avg_12_64x64_avx2:                                     602.6 ( 5.84x)
    avg_12_128x128_c:                                    13689.6 ( 1.00x)
    avg_12_128x128_avx2:                                  1613.1 ( 8.49x)
    w_avg_8_2x2_c:                                          16.7 ( 1.00x)
    w_avg_8_2x2_avx2:                                       13.4 ( 1.25x)
    w_avg_8_4x4_c:                                          44.5 ( 1.00x)
    w_avg_8_4x4_avx2:                                       15.9 ( 2.81x)
    w_avg_8_8x8_c:                                         166.1 ( 1.00x)
    w_avg_8_8x8_avx2:                                       45.7 ( 3.63x)
    w_avg_8_16x16_c:                                       392.9 ( 1.00x)
    w_avg_8_16x16_avx2:                                     57.8 ( 6.80x)
    w_avg_8_32x32_c:                                      1455.5 ( 1.00x)
    w_avg_8_32x32_avx2:                                    215.0 ( 6.77x)
    w_avg_8_64x64_c:                                      5621.8 ( 1.00x)
    w_avg_8_64x64_avx2:                                    875.2 ( 6.42x)
    w_avg_8_128x128_c:                                   22131.3 ( 1.00x)
    w_avg_8_128x128_avx2:                                 3390.1 ( 6.53x)
    w_avg_10_2x2_c:                                         18.0 ( 1.00x)
    w_avg_10_2x2_avx2:                                      14.0 ( 1.28x)
    w_avg_10_4x4_c:                                         53.9 ( 1.00x)
    w_avg_10_4x4_avx2:                                      15.9 ( 3.40x)
    w_avg_10_8x8_c:                                        109.5 ( 1.00x)
    w_avg_10_8x8_avx2:                                      40.4 ( 2.71x)
    w_avg_10_16x16_c:                                      395.7 ( 1.00x)
    w_avg_10_16x16_avx2:                                    44.7 ( 8.86x)
    w_avg_10_32x32_c:                                     1532.7 ( 1.00x)
    w_avg_10_32x32_avx2:                                   142.4 (10.77x)
    w_avg_10_64x64_c:                                     6007.7 ( 1.00x)
    w_avg_10_64x64_avx2:                                   745.5 ( 8.06x)
    w_avg_10_128x128_c:                                  23719.7 ( 1.00x)
    w_avg_10_128x128_avx2:                                2217.7 (10.70x)
    w_avg_12_2x2_c:                                         18.9 ( 1.00x)
    w_avg_12_2x2_avx2:                                      13.6 ( 1.38x)
    w_avg_12_4x4_c:                                         47.5 ( 1.00x)
    w_avg_12_4x4_avx2:                                      15.9 ( 2.99x)
    w_avg_12_8x8_c:                                        109.3 ( 1.00x)
    w_avg_12_8x8_avx2:                                      40.9 ( 2.67x)
    w_avg_12_16x16_c:                                      395.6 ( 1.00x)
    w_avg_12_16x16_avx2:                                    44.8 ( 8.84x)
    w_avg_12_32x32_c:                                     1531.0 ( 1.00x)
    w_avg_12_32x32_avx2:                                   141.8 (10.80x)
    w_avg_12_64x64_c:                                     6016.7 ( 1.00x)
    w_avg_12_64x64_avx2:                                   732.8 ( 8.21x)
    w_avg_12_128x128_c:                                  23762.2 ( 1.00x)
    w_avg_12_128x128_avx2:                                2223.4 (10.69x)
    
    New benchmarks:
    avg_8_2x2_c:                                            11.3 ( 1.00x)
    avg_8_2x2_avx2:                                          7.6 ( 1.49x)
    avg_8_4x4_c:                                            31.2 ( 1.00x)
    avg_8_4x4_avx2:                                         10.8 ( 2.89x)
    avg_8_8x8_c:                                           131.6 ( 1.00x)
    avg_8_8x8_avx2:                                         15.6 ( 8.42x)
    avg_8_16x16_c:                                         255.3 ( 1.00x)
    avg_8_16x16_avx2:                                       27.9 ( 9.16x)
    avg_8_32x32_c:                                         897.9 ( 1.00x)
    avg_8_32x32_avx2:                                       81.2 (11.06x)
    avg_8_64x64_c:                                        3320.0 ( 1.00x)
    avg_8_64x64_avx2:                                      335.1 ( 9.91x)
    avg_8_128x128_c:                                     12999.1 ( 1.00x)
    avg_8_128x128_avx2:                                   1456.3 ( 8.93x)
    avg_10_2x2_c:                                           12.0 ( 1.00x)
    avg_10_2x2_avx2:                                         8.6 ( 1.40x)
    avg_10_4x4_c:                                           34.9 ( 1.00x)
    avg_10_4x4_avx2:                                         9.7 ( 3.61x)
    avg_10_8x8_c:                                           76.7 ( 1.00x)
    avg_10_8x8_avx2:                                        16.3 ( 4.69x)
    avg_10_16x16_c:                                        256.3 ( 1.00x)
    avg_10_16x16_avx2:                                      25.2 (10.18x)
    avg_10_32x32_c:                                        932.8 ( 1.00x)
    avg_10_32x32_avx2:                                      73.3 (12.72x)
    avg_10_64x64_c:                                       3518.8 ( 1.00x)
    avg_10_64x64_avx2:                                     416.8 ( 8.44x)
    avg_10_128x128_c:                                    13691.6 ( 1.00x)
    avg_10_128x128_avx2:                                  1612.9 ( 8.49x)
    avg_12_2x2_c:                                           14.1 ( 1.00x)
    avg_12_2x2_avx2:                                         8.7 ( 1.62x)
    avg_12_4x4_c:                                           35.7 ( 1.00x)
    avg_12_4x4_avx2:                                         9.7 ( 3.68x)
    avg_12_8x8_c:                                           77.0 ( 1.00x)
    avg_12_8x8_avx2:                                        16.9 ( 4.57x)
    avg_12_16x16_c:                                        256.2 ( 1.00x)
    avg_12_16x16_avx2:                                      25.7 ( 9.96x)
    avg_12_32x32_c:                                        933.5 ( 1.00x)
    avg_12_32x32_avx2:                                      74.0 (12.62x)
    avg_12_64x64_c:                                       3516.4 ( 1.00x)
    avg_12_64x64_avx2:                                     408.7 ( 8.60x)
    avg_12_128x128_c:                                    13691.6 ( 1.00x)
    avg_12_128x128_avx2:                                  1613.8 ( 8.48x)
    w_avg_8_2x2_c:                                          16.7 ( 1.00x)
    w_avg_8_2x2_avx2:                                       14.0 ( 1.19x)
    w_avg_8_4x4_c:                                          48.2 ( 1.00x)
    w_avg_8_4x4_avx2:                                       16.1 ( 3.00x)
    w_avg_8_8x8_c:                                         168.0 ( 1.00x)
    w_avg_8_8x8_avx2:                                       22.5 ( 7.47x)
    w_avg_8_16x16_c:                                       392.5 ( 1.00x)
    w_avg_8_16x16_avx2:                                     47.9 ( 8.19x)
    w_avg_8_32x32_c:                                      1453.7 ( 1.00x)
    w_avg_8_32x32_avx2:                                    176.1 ( 8.26x)
    w_avg_8_64x64_c:                                      5631.4 ( 1.00x)
    w_avg_8_64x64_avx2:                                    690.8 ( 8.15x)
    w_avg_8_128x128_c:                                   22139.5 ( 1.00x)
    w_avg_8_128x128_avx2:                                 2742.4 ( 8.07x)
    w_avg_10_2x2_c:                                         18.1 ( 1.00x)
    w_avg_10_2x2_avx2:                                      13.8 ( 1.31x)
    w_avg_10_4x4_c:                                         47.0 ( 1.00x)
    w_avg_10_4x4_avx2:                                      16.4 ( 2.87x)
    w_avg_10_8x8_c:                                        110.0 ( 1.00x)
    w_avg_10_8x8_avx2:                                      21.6 ( 5.09x)
    w_avg_10_16x16_c:                                      395.2 ( 1.00x)
    w_avg_10_16x16_avx2:                                    45.4 ( 8.71x)
    w_avg_10_32x32_c:                                     1533.8 ( 1.00x)
    w_avg_10_32x32_avx2:                                   142.6 (10.76x)
    w_avg_10_64x64_c:                                     6004.4 ( 1.00x)
    w_avg_10_64x64_avx2:                                   672.8 ( 8.92x)
    w_avg_10_128x128_c:                                  23748.5 ( 1.00x)
    w_avg_10_128x128_avx2:                                2198.0 (10.80x)
    w_avg_12_2x2_c:                                         17.2 ( 1.00x)
    w_avg_12_2x2_avx2:                                      13.9 ( 1.24x)
    w_avg_12_4x4_c:                                         51.4 ( 1.00x)
    w_avg_12_4x4_avx2:                                      16.5 ( 3.11x)
    w_avg_12_8x8_c:                                        109.1 ( 1.00x)
    w_avg_12_8x8_avx2:                                      22.0 ( 4.96x)
    w_avg_12_16x16_c:                                      395.9 ( 1.00x)
    w_avg_12_16x16_avx2:                                    44.9 ( 8.81x)
    w_avg_12_32x32_c:                                     1533.5 ( 1.00x)
    w_avg_12_32x32_avx2:                                   142.3 (10.78x)
    w_avg_12_64x64_c:                                     6002.0 ( 1.00x)
    w_avg_12_64x64_avx2:                                   557.5 (10.77x)
    w_avg_12_128x128_c:                                  23749.5 ( 1.00x)
    w_avg_12_128x128_avx2:                                2202.0 (10.79x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/mc.asm | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index a3f858edd8..4fb5a19761 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -99,9 +99,9 @@ SECTION .text
     AVG_LOOP_END        .w4
 
 .w8:
-    vinserti128         m0, m0, [src0q], 0
+    movu               xm0, [src0q]
+    movu               xm1, [src1q]
     vinserti128         m0, m0, [src0q + AVG_SRC_STRIDE], 1
-    vinserti128         m1, m1, [src1q], 0
     vinserti128         m1, m1, [src1q + AVG_SRC_STRIDE], 1
     %2                  %1
     AVG_SAVE_W8         %1
@@ -164,7 +164,7 @@ SECTION .text
 
 %macro AVG_SAVE_W2 1 ;bpc
     %if %1 == 16
-        pextrd           [dstq], xm0, 0
+        movd             [dstq], xm0
         pextrd [dstq + strideq], xm0, 1
     %else
         packuswb           m0, m0
@@ -175,23 +175,23 @@ SECTION .text
 
 %macro AVG_SAVE_W4 1 ;bpc
     %if %1 == 16
-        pextrq           [dstq], xm0, 0
+        movq             [dstq], xm0
         pextrq [dstq + strideq], xm0, 1
     %else
         packuswb           m0, m0
-        pextrd           [dstq], xm0, 0
+        movd             [dstq], xm0
         pextrd [dstq + strideq], xm0, 1
     %endif
 %endmacro
 
 %macro AVG_SAVE_W8 1 ;bpc
     %if %1 == 16
-        vextracti128            [dstq], m0, 0
+        movu                    [dstq], xm0
         vextracti128  [dstq + strideq], m0, 1
     %else
         packuswb                    m0, m0
         vpermq                      m0, m0, 1000b
-        pextrq                  [dstq], xm0, 0
+        movq                    [dstq], xm0
         pextrq        [dstq + strideq], xm0, 1
     %endif
 %endmacro
@@ -202,7 +202,7 @@ SECTION .text
     %else
         packuswb                                        m0, m0
         vpermq                                          m0, m0, 1000b
-        vextracti128       [dstq + %2 * strideq + %3 * 16], m0, 0
+        movu               [dstq + %2 * strideq + %3 * 16], xm0
     %endif
 %endmacro
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to