This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 38062ebd18cf2f0de3720bdef054e941d69b6336 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Mon Mar 2 17:20:31 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Fri Mar 6 20:02:42 2026 +0100 avcodec/x86/vvc/alf: Remove pointless counter, stride Each luma alf block has 2*12 auxiliary coefficients associated with it that the alf_filter functions consume; the C version simply increments the pointers. The x64 dsp function meanwhile does things differently: The vvc_alf_filter functions have three levels of loops. The middle layer uses two counters, one of which is just the horizontal offset xd in the current line. It is only used for addressing these auxiliary coefficients and yet one needs to perform work to translate from it to the coefficient offset, namely a *3 via lea and a *2 scale. Furthermore, the base pointers of the coefficients are incremented in the outer loop; the stride used for this is calculated in the C wrapper functions. Furthermore, due to GPR pressure xd is reused as loop counter for the innermost loop; the xd from the middle loop is pushed to the stack. Apart from the translation from horizontal offset to coefficient offset all of the above has been done for chroma, too, although the coefficient pointers don't get modified for them at all. This commit changes this to just increment the pointers after reading the relevant coefficients. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 36 +++++++++++------------------------- libavcodec/x86/vvc/dsp_init.c | 9 ++++----- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index e924308cff..df2f782683 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -59,9 +59,12 @@ SECTION .text ;%1-%3 out ;%4 clip or filter %macro LOAD_LUMA_PARAMS 4 - movu m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t) - movu m%2, [%4q + 2 * offsetq + 1 * mmsize] - movu m%3, [%4q + 2 * offsetq + 2 * mmsize] + movu m%1, [%4q + 0 * mmsize] + movu m%2, [%4q + 1 * mmsize] + movu m%3, [%4q + 2 * mmsize] + ; we process mmsize/(2*ALF_BLOCK_SIZE) alf blocks, + ; consuming ALF_NUM_COEFF_LUMA int16_t coeffs per alf block + add %4q, 3 * mmsize %endmacro %macro LOAD_LUMA_PARAMS_W16 6 @@ -113,7 +116,6 @@ SECTION .text %macro LOAD_PARAMS 0 %if LUMA - lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 %else @@ -401,18 +403,10 @@ SECTION .text %macro FILTER_16x4 1 %if LUMA push clipq - push strideq - %define s1q clipq - %define s2q strideq -%else - %define s1q s5q - %define s2q s6q + %define s5q clipq + %define s6q pixel_maxq %endif - %define s3q pixel_maxq - %define s4q offsetq - push xq - xor xd, xd %%filter_16x4_loop: LOAD_PIXELS m2, [srcq] ;p0 @@ -442,10 +436,7 @@ SECTION .text neg xq lea dstq, [dstq + xq * 4] - pop xq - %if LUMA - pop strideq pop clipq %endif %endmacro @@ -463,10 +454,10 @@ SECTION .text ; ****************************** ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, -; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); +; const int16_t *filter, const int16_t *clip, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ - offset, x, s5, s6 +cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ + x, s1, s2, s3, s4 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount ; of nonvolatile registers on Win64. It also reduces codesize generally @@ -489,7 +480,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s push srcq push dstq push widthq - xor xd, xd .loop_w: cmp widthd, 16 @@ -500,7 +490,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s add srcq, 16 * ps add dstq, 16 * ps - add xd, 16 sub widthd, 16 jmp .loop_w @@ -525,9 +514,6 @@ INIT_YMM cpuname lea srcq, [srcq + 4 * src_strideq] lea dstq, [dstq + 4 * dst_strideq] - lea filterq, [filterq + 2 * strideq] - lea clipq, [clipq + 2 * strideq] - sub vb_posd, 4 sub heightd, 4 jg .loop diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 158308fb33..5194ecfdeb 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -60,10 +60,10 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, #define ALF_BPC_PROTOTYPES(bpc, opt) \ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t 
*filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \ const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \ void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \ @@ -153,15 +153,14 @@ FW_PUT_16BPC_AVX2(12) static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ - const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \ BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, param_stride, vb_pos, (1 << bd) - 1); \ + filter, clip, vb_pos, (1 << bd) - 1); \ } \ static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, 0, vb_pos,(1 << bd) - 1); \ + filter, clip, vb_pos,(1 << bd) - 1); \ } \ static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \ const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
