alf: Avoid pointless wrappers for alf_filter

Andreas Rheinhardt via ffmpeg-cvslog Fri, 06 Mar 2026 11:38:57 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 1960320112f97bc00744511cb80b8e2cfff4cc4a
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Mar 3 01:09:26 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Mar 6 20:02:42 2026 +0100

    avcodec/x86/vvc/alf: Avoid pointless wrappers for alf_filter
    
    They are completely unnecessary for the 8bit case (which only
    handles 8bit) and overly complicated for the 10 and 12bit cases:
    All one needs to do is set up the (1<<bpp)-1 vector register
    and jmp from (say) the 12bpp function stub into the 10bpp
    function. The way it is done here even allows sharing the
    prologue between the two functions.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm    | 53 +++++++++++++++++++++++++++----------------
 libavcodec/x86/vvc/dsp_init.c | 38 +++++++++++--------------------
 2 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index b7e9c54b68..dd3652843e 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -403,8 +403,7 @@ SECTION .text
 %macro FILTER_16x4 2
 %if LUMA
     push clipq
-    %define s5q clipq
-    %define s6q pixel_maxq
+    %define s6q clipq
 %endif
 
     xor               xd, xd
@@ -443,23 +442,21 @@ SECTION .text
 %endif
 %endmacro
 
-; FILTER(bpc, luma/chroma)
-%macro ALF_FILTER 2
-%xdefine BPC   %1
+; FILTER(bd, luma/chroma, bd of implementation to use)
+%macro ALF_FILTER 3
 %ifidn %2, luma
     %xdefine LUMA 1
 %else
     %xdefine LUMA 0
 %endif
-%define ps (%1 / 8) ; pixel size
+%assign ps (%1+7) / 8 ; pixel size
 
 ; ******************************
-; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
-;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
-;      const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);
+; void ff_vvc_alf_filter_%2_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+;      const uint8_t *src, ptrdiff_t src_stride, int width, int height,
+;      const int16_t *filter, const int16_t *clip, int vb_pos);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, 
src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \
-    x, s1, s2, s3, s4
+cglobal vvc_alf_filter_%2_%1
 %if !LUMA
 ; chroma does not use registers m5 and m8. Swap them to reduce the amount
 ; of nonvolatile registers on Win64. It also reduces codesize generally
@@ -471,10 +468,24 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
         SWAP 5,12
         SWAP 8,13
     %endif
+%elif WIN64 && (ps != 1)
+; Swap m5 and m15, so that the register for the maximum pixel value
+; ends up in a volatile register
+    SWAP 5,15
 %endif
 %if ps != 1
-    movd            xm15, pixel_maxd
-    vpbroadcastw     m15, xm15
+  ; create pw_pixelmax for clipping
+  pcmpeqw         m15, m15
+  psrlw           m15, 16 - %1
+%endif
+
+%if %1 != %3
+    jmp vvc_alf_filter_%2_%3_prologue
+%else
+vvc_alf_filter_%2_%1_prologue:
+    PROLOGUE 9, 14+LUMA, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, 
src_stride, width, height, filter, clip, vb_pos, \
+    x, s1, s2, s3, s4, s5
+%if ps != 1
     pxor             m14, m14
 %endif
 
@@ -498,7 +509,9 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
     je            .w_end
 
 %if LUMA
+SAVE_MM_PERMUTATION
 INIT_XMM cpuname
+LOAD_MM_PERMUTATION
 %endif
     LOAD_PARAMS
     FILTER_16x4  widthd, 0
@@ -518,12 +531,13 @@ INIT_YMM cpuname
     sub          heightd, 4
     jg             .loop
     RET
+%endif
 %endmacro
 
-; FILTER(bpc)
-%macro ALF_FILTER 1
-    ALF_FILTER  %1, luma
-    ALF_FILTER  %1, chroma
+; FILTER(bd, bd of implementation to use)
+%macro ALF_FILTER 2
+    ALF_FILTER  %1, luma,   %2
+    ALF_FILTER  %1, chroma, %2
 %endmacro
 
 %define ALF_GRADIENT_BORDER 2
@@ -891,9 +905,10 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, 
transpose_idx, gradient_su
 %if ARCH_X86_64
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-ALF_FILTER   16
-ALF_FILTER   8
+ALF_FILTER   12, 10
+ALF_FILTER   10, 10
 ALF_CLASSIFY 16
+ALF_FILTER   8,  8
 ALF_CLASSIFY 8
 %endif
 %endif
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 5194ecfdeb..6802294795 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -58,12 +58,6 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, 
ptrdiff_t dst_stride,
 } while (0)
 
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                           
                                          \
-void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
                                          \
-    const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
-void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,                                          \
-    const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum,                 
                                          \
     const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, 
intptr_t vb_pos);                         \
 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, 
const int *gradient_sum,                      \
@@ -150,18 +144,6 @@ FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
 #define ALF_FUNCS(bpc, bd, opt)                                                
                                          \
-static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
-    int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
-{                                                                              
                                          \
-    BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                                \
-        filter, clip, vb_pos, (1 << bd)  - 1);                                 
                                          \
-}                                                                              
                                          \
-static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
-    int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
-{                                                                              
                                          \
-    BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                              \
-        filter, clip, vb_pos,(1 << bd)  - 1);                                  
                                          \
-}                                                                              
                                          \
 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx,  
                                          \
     const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp)                      \
 {                                                                              
                                          \
@@ -298,10 +280,16 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,   \
 int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, 
int block_w, int block_h);
 #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 
-#define ALF_INIT(bd) do {                                            \
-    c->alf.filter[LUMA]   = vvc_alf_filter_luma_##bd##_avx2;         \
-    c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2;       \
-    c->alf.classify       = vvc_alf_classify_##bd##_avx2;            \
+#define ALF_INIT(bd, opt) do {                                                 
\
+void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
\
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height,           
\
+    const int16_t *filter, const int16_t *clip, int vb_pos);                   
\
+void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
\
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height,           
\
+    const int16_t *filter, const int16_t *clip, int vb_pos);                   
\
+    c->alf.filter[LUMA]   = bf(ff_vvc_alf_filter_luma, bd, opt);               
\
+    c->alf.filter[CHROMA] = bf(ff_vvc_alf_filter_chroma, bd, opt);             
\
+    c->alf.classify       = bf(vvc_alf_classify, bd, opt);                     
\
 } while (0)
 
 #endif
@@ -331,7 +319,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(8);
+            ALF_INIT(8, avx2);
             SAO_INIT(8, avx2);
         }
 #endif
@@ -353,7 +341,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(10);
+            ALF_INIT(10, avx2);
             SAO_INIT(10, avx2);
         }
 #endif
@@ -375,7 +363,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(12);
+            ALF_INIT(12, avx2);
             SAO_INIT(12, avx2);
         }
 #endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 27/28: avcodec/x86/vvc/alf: Avoid pointless wrappers for alf_filter

Reply via email to