PR #23608 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23608 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23608.patch
No change in performance here. (PS: Does someone know the real range of the coefficients used here? e9abef437f0a348c017d4ac8b23a122881c1dc87 and e9abef437f0a348c017d4ac8b23a122881c1dc87 use saturated additions for the 8bit case, yet the 10bit case doesn't.) From 5ba61f7baafbf78551019c1eefdb3b0ef16d8cad Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Fri, 26 Jun 2026 19:42:44 +0200 Subject: [PATCH] avcodec/x86/hevc/add_res: Port ff_hevc_add_residual_4_8_mmxext to SSE2 No change in performance here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/add_res.asm | 17 ++++++++--------- libavcodec/x86/hevc/dsp.h | 2 +- libavcodec/x86/hevc/dsp_init.c | 5 ++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm index 3489e04e2b..70ae9fbc76 100644 --- a/libavcodec/x86/hevc/add_res.asm +++ b/libavcodec/x86/hevc/add_res.asm @@ -27,9 +27,9 @@ cextern pw_1023 %define max_pixels_10 pw_1023 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project -%macro ADD_RES_MMX_4_8 1 - mova m0, [r1+%1] - mova m2, [r1+%1+8] +%macro ADD_RES_4_8 1 + movq m0, [r1+%1] + movq m2, [r1+%1+8] movd m1, [r0] movd m3, [r0+r2] @@ -45,14 +45,13 @@ cextern pw_1023 movd [r0+r2], m2 %endmacro - -INIT_MMX mmxext -; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride) -cglobal hevc_add_residual_4_8, 3, 3, 6 +INIT_XMM sse2 +; void ff_hevc_add_residual_4_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 3, 5 pxor m4, m4 - ADD_RES_MMX_4_8 0 + ADD_RES_4_8 0 lea r0, [r0+r2*2] - ADD_RES_MMX_4_8 16 + ADD_RES_4_8 16 RET %macro ADD_RES_SSE_8_8 1 diff --git a/libavcodec/x86/hevc/dsp.h b/libavcodec/x86/hevc/dsp.h index 0062699ce0..9536fa2508 100644 --- a/libavcodec/x86/hevc/dsp.h +++ b/libavcodec/x86/hevc/dsp.h @@ -167,7 +167,7 @@ void 
ff_hevc_put_qpel_hv8_8_avx512icl(int16_t *dst, const uint8_t *_src, ptrdiff // TRANSFORM_ADD /////////////////////////////////////////////////////////////////////////////// -void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride); +void ff_hevc_add_residual_4_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c index bd967eac67..dbad0ea925 100644 --- a/libavcodec/x86/hevc/dsp_init.c +++ b/libavcodec/x86/hevc/dsp_init.c @@ -817,10 +817,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) int cpu_flags = av_get_cpu_flags(); if (bit_depth == 8) { - if (EXTERNAL_MMXEXT(cpu_flags)) { - c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext; - } if (EXTERNAL_SSE2(cpu_flags)) { + c->add_residual[0] = ff_hevc_add_residual_4_8_sse2; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2; #if ARCH_X86_64 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
