[FFmpeg-cvslog] [ffmpeg] 11/12: avcodec/x86/vvc/of: Break dependency chain

Andreas Rheinhardt via ffmpeg-cvslog Sat, 21 Feb 2026 17:56:00 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit af3f8f5bd2ed0d55cf8614064d722b533eef77e9
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Feb 19 02:08:32 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Feb 22 01:05:12 2026 +0100

    avcodec/x86/vvc/of: Break dependency chain
    
    Don't extract and update one word of one and the same register
    at a time; use separate src and dst registers, so that pextrw
    and bsr can be done in parallel. Also use movd instead of pinsrw
    for the first word.
    
    Old benchmarks:
    apply_bdof_8_8x16_c:                                  3275.2 ( 1.00x)
    apply_bdof_8_8x16_avx2:                                487.6 ( 6.72x)
    apply_bdof_8_16x8_c:                                  3243.1 ( 1.00x)
    apply_bdof_8_16x8_avx2:                                284.4 (11.40x)
    apply_bdof_8_16x16_c:                                 6501.8 ( 1.00x)
    apply_bdof_8_16x16_avx2:                               570.0 (11.41x)
    apply_bdof_10_8x16_c:                                 3286.5 ( 1.00x)
    apply_bdof_10_8x16_avx2:                               461.7 ( 7.12x)
    apply_bdof_10_16x8_c:                                 3274.5 ( 1.00x)
    apply_bdof_10_16x8_avx2:                               271.4 (12.06x)
    apply_bdof_10_16x16_c:                                6590.0 ( 1.00x)
    apply_bdof_10_16x16_avx2:                              543.9 (12.12x)
    apply_bdof_12_8x16_c:                                 3307.6 ( 1.00x)
    apply_bdof_12_8x16_avx2:                               462.2 ( 7.16x)
    apply_bdof_12_16x8_c:                                 3287.4 ( 1.00x)
    apply_bdof_12_16x8_avx2:                               271.8 (12.10x)
    apply_bdof_12_16x16_c:                                6465.7 ( 1.00x)
    apply_bdof_12_16x16_avx2:                              543.8 (11.89x)
    
    New benchmarks:
    apply_bdof_8_8x16_c:                                  3255.7 ( 1.00x)
    apply_bdof_8_8x16_avx2:                                349.3 ( 9.32x)
    apply_bdof_8_16x8_c:                                  3262.5 ( 1.00x)
    apply_bdof_8_16x8_avx2:                                214.8 (15.19x)
    apply_bdof_8_16x16_c:                                 6471.6 ( 1.00x)
    apply_bdof_8_16x16_avx2:                               429.8 (15.06x)
    apply_bdof_10_8x16_c:                                 3227.7 ( 1.00x)
    apply_bdof_10_8x16_avx2:                               321.6 (10.04x)
    apply_bdof_10_16x8_c:                                 3250.2 ( 1.00x)
    apply_bdof_10_16x8_avx2:                               201.2 (16.16x)
    apply_bdof_10_16x16_c:                                6476.5 ( 1.00x)
    apply_bdof_10_16x16_avx2:                              400.9 (16.16x)
    apply_bdof_12_8x16_c:                                 3230.7 ( 1.00x)
    apply_bdof_12_8x16_avx2:                               321.8 (10.04x)
    apply_bdof_12_16x8_c:                                 3210.5 ( 1.00x)
    apply_bdof_12_16x8_avx2:                               200.9 (15.98x)
    apply_bdof_12_16x16_c:                                6474.5 ( 1.00x)
    apply_bdof_12_16x16_avx2:                              400.2 (16.18x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index e11ada8d20..232dc1c2fd 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -251,21 +251,25 @@ INIT_YMM avx2
     psrlw                 %3, %4
 %endmacro
 
-%macro LOG2 2 ; dst/src, offset
-    pextrw              tmp0d, xm%1,  %2
+%macro LOG2 3 ; dst, src, offset
+    pextrw              tmp0d, xm%2,  %3
     bsr                 tmp0d, tmp0d
-    pinsrw               xm%1, tmp0d, %2
+%if %3 != 0
+    pinsrw               xm%1, tmp0d, %3
+%else
+    movd                 xm%1, tmp0d
+%endif
 %endmacro
 
-%macro LOG2 1 ; dst/src
-    LOG2                 %1, 0
-    LOG2                 %1, 1
-    LOG2                 %1, 2
-    LOG2                 %1, 3
-    LOG2                 %1, 4
-    LOG2                 %1, 5
-    LOG2                 %1, 6
-    LOG2                 %1, 7
+%macro LOG2 2 ; dst, src
+    LOG2                 %1, %2, 0
+    LOG2                 %1, %2, 1
+    LOG2                 %1, %2, 2
+    LOG2                 %1, %2, 3
+    LOG2                 %1, %2, 4
+    LOG2                 %1, %2, 5
+    LOG2                 %1, %2, 6
+    LOG2                 %1, %2, 7
 %endmacro
 
 ; %1: 4 (sgx2, sgy2, sgxdi, gydi)
@@ -277,8 +281,7 @@ INIT_YMM avx2
 
     punpcklqdq              m8, m%1, m7             ; 4 (sgx2, sgy2)
     punpckhqdq              m9, m%1, m7             ; 4 (sgxdi, sgydi)
-    mova                   m10, m8
-    LOG2                    10                      ; 4 (log2(sgx2), log2(sgy2))
+    LOG2                    10, 8                   ; 4 (log2(sgx2), log2(sgy2))
 
     ; Promote to dword since vpsrlvw is AVX-512 only
     pmovsxwd                m8, xm8

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 11/12: avcodec/x86/vvc/of: Break dependency chain

Reply via email to