This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 1960320112f97bc00744511cb80b8e2cfff4cc4a Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Mar 3 01:09:26 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Fri Mar 6 20:02:42 2026 +0100 avcodec/x86/vvc/alf: Avoid pointless wrappers for alf_filter They are completely unnecessary for the 8bit case (which only handles 8bit) and overtly complicated for the 10 and 12bit cases: All one needs to do is set up the (1<<bpp)-1 vector register and jmp from (say) the 12bpp function stub inside the 10bpp function. The way it is done here even allows to share the prologue between the two functions. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 53 +++++++++++++++++++++++++++---------------- libavcodec/x86/vvc/dsp_init.c | 38 +++++++++++-------------------- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index b7e9c54b68..dd3652843e 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -403,8 +403,7 @@ SECTION .text %macro FILTER_16x4 2 %if LUMA push clipq - %define s5q clipq - %define s6q pixel_maxq + %define s6q clipq %endif xor xd, xd @@ -443,23 +442,21 @@ SECTION .text %endif %endmacro -; FILTER(bpc, luma/chroma) -%macro ALF_FILTER 2 -%xdefine BPC %1 +; FILTER(bd, luma/chroma, bd of implementation to use) +%macro ALF_FILTER 3 %ifidn %2, luma %xdefine LUMA 1 %else %xdefine LUMA 0 %endif -%define ps (%1 / 8) ; pixel size +%assign ps (%1+7) / 8 ; pixel size ; ****************************** -; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, -; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, -; const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); +; void ff_vvc_alf_filter_%2_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *src, ptrdiff_t src_stride, int width, int height, +; const int16_t *filter, const int16_t *clip, int vb_pos); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ - x, s1, s2, s3, s4 +cglobal vvc_alf_filter_%2_%1 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount ; of nonvolatile registers on Win64. It also reduces codesize generally @@ -471,10 +468,24 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s SWAP 5,12 SWAP 8,13 %endif +%elif WIN64 && (ps != 1) +; Swap m5 and m15, so that the register for the maximum pixel value +; ends up in a volatile register + SWAP 5,15 %endif %if ps != 1 - movd xm15, pixel_maxd - vpbroadcastw m15, xm15 + ; create pw_pixelmax for clipping + pcmpeqw m15, m15 + psrlw m15, 16 - %1 +%endif + +%if %1 != %3 + jmp vvc_alf_filter_%2_%3_prologue +%else +vvc_alf_filter_%2_%1_prologue: + PROLOGUE 9, 14+LUMA, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, \ + x, s1, s2, s3, s4, s5 +%if ps != 1 pxor m14, m14 %endif @@ -498,7 +509,9 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s je .w_end %if LUMA +SAVE_MM_PERMUTATION INIT_XMM cpuname +LOAD_MM_PERMUTATION %endif LOAD_PARAMS FILTER_16x4 widthd, 0 @@ -518,12 +531,13 @@ INIT_YMM cpuname sub heightd, 4 jg .loop RET +%endif %endmacro -; FILTER(bpc) -%macro ALF_FILTER 1 - ALF_FILTER %1, luma - ALF_FILTER %1, chroma +; FILTER(bd, bd of implementation to use) +%macro ALF_FILTER 2 + ALF_FILTER %1, luma, %2 + ALF_FILTER %1, chroma, %2 %endmacro %define ALF_GRADIENT_BORDER 2 @@ -891,9 +905,10 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_su %if ARCH_X86_64 %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -ALF_FILTER 16 -ALF_FILTER 8 +ALF_FILTER 12, 10 +ALF_FILTER 10, 10 ALF_CLASSIFY 16 +ALF_FILTER 8, 8 ALF_CLASSIFY 8 %endif %endif diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 5194ecfdeb..6802294795 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -58,12 +58,6 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, } while (0) #define ALF_BPC_PROTOTYPES(bpc, opt) \ -void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ -void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \ const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \ void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \ @@ -150,18 +144,6 @@ FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) #define ALF_FUNCS(bpc, bd, opt) \ -static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ - int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ -{ \ - BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, vb_pos, (1 << bd) - 1); \ -} \ -static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ - int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ -{ \ - BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, vb_pos,(1 << bd) - 1); \ -} \ static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \ const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \ { \ @@ -298,10 +280,16 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2 -#define ALF_INIT(bd) do { \ - c->alf.filter[LUMA] = vvc_alf_filter_luma_##bd##_avx2; \ - c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2; \ - c->alf.classify = vvc_alf_classify_##bd##_avx2; \ +#define ALF_INIT(bd, opt) do { \ +void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, int width, int height, \ + const int16_t *filter, const int16_t *clip, int vb_pos); \ +void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, int width, int height, \ + const int16_t *filter, const int16_t *clip, int vb_pos); \ + c->alf.filter[LUMA] = bf(ff_vvc_alf_filter_luma, bd, opt); \ + c->alf.filter[CHROMA] = bf(ff_vvc_alf_filter_chroma, bd, opt); \ + c->alf.classify = bf(vvc_alf_classify, bd, opt); \ } while (0) #endif @@ -331,7 +319,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(8); + ALF_INIT(8, avx2); SAO_INIT(8, avx2); } #endif @@ -353,7 +341,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(10); + ALF_INIT(10, avx2); SAO_INIT(10, avx2); } #endif @@ -375,7 +363,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(12); + ALF_INIT(12, avx2); SAO_INIT(12, avx2); } #endif _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
