This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 7bf9c1e3f6effbe7d2dd53096bf2a7dbbb07d7ff
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Feb 17 15:14:04 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Feb 22 00:57:56 2026 +0100

    avcodec/x86/vvc/mc: Avoid redundant clipping for 8bit
    
    It is already done by packuswb.
    
    Old benchmarks:
    avg_8_2x2_c:                                            11.1 ( 1.00x)
    avg_8_2x2_avx2:                                          8.6 ( 1.28x)
    avg_8_4x4_c:                                            30.0 ( 1.00x)
    avg_8_4x4_avx2:                                         10.8 ( 2.78x)
    avg_8_8x8_c:                                           132.0 ( 1.00x)
    avg_8_8x8_avx2:                                         25.7 ( 5.14x)
    avg_8_16x16_c:                                         254.6 ( 1.00x)
    avg_8_16x16_avx2:                                       33.2 ( 7.67x)
    avg_8_32x32_c:                                         897.5 ( 1.00x)
    avg_8_32x32_avx2:                                      115.6 ( 7.76x)
    avg_8_64x64_c:                                        3316.9 ( 1.00x)
    avg_8_64x64_avx2:                                      626.5 ( 5.29x)
    avg_8_128x128_c:                                     12973.6 ( 1.00x)
    avg_8_128x128_avx2:                                   1914.0 ( 6.78x)
    w_avg_8_2x2_c:                                          16.7 ( 1.00x)
    w_avg_8_2x2_avx2:                                       14.4 ( 1.16x)
    w_avg_8_4x4_c:                                          48.2 ( 1.00x)
    w_avg_8_4x4_avx2:                                       16.5 ( 2.92x)
    w_avg_8_8x8_c:                                         168.1 ( 1.00x)
    w_avg_8_8x8_avx2:                                       49.7 ( 3.38x)
    w_avg_8_16x16_c:                                       392.4 ( 1.00x)
    w_avg_8_16x16_avx2:                                     61.1 ( 6.43x)
    w_avg_8_32x32_c:                                      1455.3 ( 1.00x)
    w_avg_8_32x32_avx2:                                    224.6 ( 6.48x)
    w_avg_8_64x64_c:                                      5632.1 ( 1.00x)
    w_avg_8_64x64_avx2:                                    896.9 ( 6.28x)
    w_avg_8_128x128_c:                                   22136.3 ( 1.00x)
    w_avg_8_128x128_avx2:                                 3626.7 ( 6.10x)
    
    New benchmarks:
    avg_8_2x2_c:                                            12.3 ( 1.00x)
    avg_8_2x2_avx2:                                          8.1 ( 1.52x)
    avg_8_4x4_c:                                            30.3 ( 1.00x)
    avg_8_4x4_avx2:                                         11.3 ( 2.67x)
    avg_8_8x8_c:                                           131.8 ( 1.00x)
    avg_8_8x8_avx2:                                         21.3 ( 6.20x)
    avg_8_16x16_c:                                         255.0 ( 1.00x)
    avg_8_16x16_avx2:                                       30.6 ( 8.33x)
    avg_8_32x32_c:                                         898.5 ( 1.00x)
    avg_8_32x32_avx2:                                      104.9 ( 8.57x)
    avg_8_64x64_c:                                        3317.7 ( 1.00x)
    avg_8_64x64_avx2:                                      540.9 ( 6.13x)
    avg_8_128x128_c:                                     12986.5 ( 1.00x)
    avg_8_128x128_avx2:                                   1663.4 ( 7.81x)
    w_avg_8_2x2_c:                                          16.8 ( 1.00x)
    w_avg_8_2x2_avx2:                                       13.9 ( 1.21x)
    w_avg_8_4x4_c:                                          48.2 ( 1.00x)
    w_avg_8_4x4_avx2:                                       16.2 ( 2.98x)
    w_avg_8_8x8_c:                                         168.6 ( 1.00x)
    w_avg_8_8x8_avx2:                                       46.3 ( 3.64x)
    w_avg_8_16x16_c:                                       392.4 ( 1.00x)
    w_avg_8_16x16_avx2:                                     57.7 ( 6.80x)
    w_avg_8_32x32_c:                                      1454.6 ( 1.00x)
    w_avg_8_32x32_avx2:                                    214.6 ( 6.78x)
    w_avg_8_64x64_c:                                      5638.4 ( 1.00x)
    w_avg_8_64x64_avx2:                                    875.6 ( 6.44x)
    w_avg_8_128x128_c:                                   22133.5 ( 1.00x)
    w_avg_8_128x128_avx2:                                 3334.3 ( 6.64x)
    
    Also saves 550B of .text here. The improvements will likely
    be even better on Win64, because it avoids using two nonvolatile
    registers in the weighted average case.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/mc.asm | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 30aa97c65a..a3f858edd8 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -64,12 +64,12 @@ SECTION .text
     %rep %3
         %define off %%i
         AVG_LOAD_W16        0, off
-        %2
+        %2                 %1
         AVG_SAVE_W16       %1, 0, off
 
 
         AVG_LOAD_W16        1, off
-        %2
+        %2                 %1
         AVG_SAVE_W16       %1, 1, off
 
         %assign %%i %%i+1
@@ -84,7 +84,7 @@ SECTION .text
     pinsrd              xm0, [src0q + AVG_SRC_STRIDE], 1
     movd                xm1, [src1q]
     pinsrd              xm1, [src1q + AVG_SRC_STRIDE], 1
-    %2
+    %2                   %1
     AVG_SAVE_W2          %1
     AVG_LOOP_END        .w2
 
@@ -93,7 +93,7 @@ SECTION .text
     pinsrq              xm0, [src0q + AVG_SRC_STRIDE], 1
     movq                xm1, [src1q]
     pinsrq              xm1, [src1q + AVG_SRC_STRIDE], 1
-    %2
+    %2                   %1
     AVG_SAVE_W4          %1
 
     AVG_LOOP_END        .w4
@@ -103,7 +103,7 @@ SECTION .text
     vinserti128         m0, m0, [src0q + AVG_SRC_STRIDE], 1
     vinserti128         m1, m1, [src1q], 0
     vinserti128         m1, m1, [src1q + AVG_SRC_STRIDE], 1
-    %2
+    %2                  %1
     AVG_SAVE_W8         %1
 
     AVG_LOOP_END       .w8
@@ -132,13 +132,15 @@ SECTION .text
     RET
 %endmacro
 
-%macro AVG   0
+%macro AVG   1
     paddsw               m0, m1
     pmulhrsw             m0, m2
+%if %1 != 8
     CLIPW                m0, m3, m4
+%endif
 %endmacro
 
-%macro W_AVG 0
+%macro W_AVG 1
     punpckhwd            m5, m0, m1
     pmaddwd              m5, m3
     paddd                m5, m4
@@ -150,7 +152,9 @@ SECTION .text
     psrad                m0, xm2
 
     packssdw             m0, m5
+%if %1 != 8
     CLIPW                m0, m6, m7
+%endif
 %endmacro
 
 %macro AVG_LOAD_W16 2  ; line, offset
@@ -217,11 +221,13 @@ SECTION .text
 ;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
 ;   const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
 %macro VVC_AVG_AVX2 1
-cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
+cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
     movifnidn            hd, hm
 
+%if %1 != 8
     pxor                 m3, m3             ; pixel min
     vpbroadcastw         m4, bdm            ; pixel max
+%endif
 
     movifnidn           bdd, bdm
     inc                 bdd
@@ -245,7 +251,7 @@ cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
 ;    const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
 ;    intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
 %macro VVC_W_AVG_AVX2 1
-cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
+cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1
 
     movifnidn            hd, hm
 
@@ -255,8 +261,10 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
     movd                xm3, t0d
     vpbroadcastd         m3, xm3                ; w0, w1
 
+%if %1 != 8
     pxor                m6, m6                  ;pixel min
     vpbroadcastw        m7, r11m                ;pixel max
+%endif
 
     mov                 t1q, rcx                ; save ecx
     mov                 ecx, r11m

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to