This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 38062ebd18cf2f0de3720bdef054e941d69b6336 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Mon Mar 2 17:20:31 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Fri Mar 6 20:02:42 2026 +0100 avcodec/x86/vvc/alf: Remove pointless counter, stride Each luma alf block has 2*12 auxiliary coefficients associated with it that the alf_filter functions consume; the C version simply increments the pointers. The x64 dsp function meanwhile does things differently: The vvc_alf_filter functions have three levels of loops. The middle layer uses two counters, one of which is just the horizontal offset xd in the current line. It is only used for addressing these auxiliary coefficients and yet one needs to perform work to translate from it to the coefficient offset, namely a *3 via lea and a *2 scale. Furthermore, the base pointers of the coefficients are incremented in the outer loop; the stride used for this is calculated in the C wrapper functions. Furthermore, due to GPR pressure xd is reused as loop counter for the innermost loop; the xd from the middle loop is pushed to the stack. Apart from the translation from horizontal offset to coefficient offset all of the above has been done for chroma, too, although the coefficient pointers don't get modified for them at all. This commit changes this to just increment the pointers after reading the relevant coefficients. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 36 +++++++++++------------------------- libavcodec/x86/vvc/dsp_init.c | 9 ++++----- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index e924308cff..df2f782683 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -59,9 +59,12 @@ SECTION .text ;%1-%3 out ;%4 clip or filter %macro LOAD_LUMA_PARAMS 4 - movu m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t) - movu m%2, [%4q + 2 * offsetq + 1 * mmsize] - movu m%3, [%4q + 2 * offsetq + 2 * mmsize] + movu m%1, [%4q + 0 * mmsize] + movu m%2, [%4q + 1 * mmsize] + movu m%3, [%4q + 2 * mmsize] + ; we process mmsize/(2*ALF_BLOCK_SIZE) alf blocks, + ; consuming ALF_NUM_COEFF_LUMA int16_t coeffs per alf block + add %4q, 3 * mmsize %endmacro %macro LOAD_LUMA_PARAMS_W16 6 @@ -113,7 +116,6 @@ SECTION .text %macro LOAD_PARAMS 0 %if LUMA - lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 %else @@ -401,18 +403,10 @@ SECTION .text %macro FILTER_16x4 1 %if LUMA push clipq - push strideq - %define s1q clipq - %define s2q strideq -%else - %define s1q s5q - %define s2q s6q + %define s5q clipq + %define s6q pixel_maxq %endif - %define s3q pixel_maxq - %define s4q offsetq - push xq - xor xd, xd %%filter_16x4_loop: LOAD_PIXELS m2, [srcq] ;p0 @@ -442,10 +436,7 @@ SECTION .text neg xq lea dstq, [dstq + xq * 4] - pop xq - %if LUMA - pop strideq pop clipq %endif %endmacro @@ -463,10 +454,10 @@ SECTION .text ; ****************************** ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, -; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); +; const int16_t *filter, const int16_t *clip, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ - offset, x, s5, s6 +cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ + x, s1, s2, s3, s4 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount ; of nonvolatile registers on Win64. It also reduces codesize generally @@ -489,7 +480,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s push srcq push dstq push widthq - xor xd, xd .loop_w: cmp widthd, 16 @@ -500,7 +490,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s add srcq, 16 * ps add dstq, 16 * ps - add xd, 16 sub widthd, 16 jmp .loop_w @@ -525,9 +514,6 @@ INIT_YMM cpuname lea srcq, [srcq + 4 * src_strideq] lea dstq, [dstq + 4 * dst_strideq] - lea filterq, [filterq + 2 * strideq] - lea clipq, [clipq + 2 * strideq] - sub vb_posd, 4 sub heightd, 4 jg .loop diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 158308fb33..5194ecfdeb 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -60,10 +60,10 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, #define ALF_BPC_PROTOTYPES(bpc, opt) \ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t 
*filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \ const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \ void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \ @@ -153,15 +153,14 @@ FW_PUT_16BPC_AVX2(12) static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ - const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \ BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, param_stride, vb_pos, (1 << bd) - 1); \ + filter, clip, vb_pos, (1 << bd) - 1); \ } \ static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, 0, vb_pos,(1 << bd) - 1); \ + filter, clip, vb_pos,(1 << bd) - 1); \ } \ static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \ const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
