PR #23608 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23608
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23608.patch

No change in performance here.

PS: Does someone know the real range of the coefficients used here?
e9abef437f0a348c017d4ac8b23a122881c1dc87 and
e9abef437f0a348c017d4ac8b23a122881c1dc87 use saturated additions for the 8-bit
case, yet the 10-bit case doesn't.


>From 5ba61f7baafbf78551019c1eefdb3b0ef16d8cad Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 26 Jun 2026 19:42:44 +0200
Subject: [PATCH] avcodec/x86/hevc/add_res: Port
 ff_hevc_add_residual_4_8_mmxext to SSE2

No change in performance here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/add_res.asm | 17 ++++++++---------
 libavcodec/x86/hevc/dsp.h       |  2 +-
 libavcodec/x86/hevc/dsp_init.c  |  5 ++---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 3489e04e2b..70ae9fbc76 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -27,9 +27,9 @@ cextern pw_1023
 %define max_pixels_10 pw_1023
 
 ; the add_res macros and functions were largely inspired by h264_idct.asm from 
the x264 project
-%macro ADD_RES_MMX_4_8 1
-    mova              m0, [r1+%1]
-    mova              m2, [r1+%1+8]
+%macro ADD_RES_4_8 1
+    movq              m0, [r1+%1]
+    movq              m2, [r1+%1+8]
 
     movd              m1, [r0]
     movd              m3, [r0+r2]
@@ -45,14 +45,13 @@ cextern pw_1023
     movd         [r0+r2], m2
 %endmacro
 
-
-INIT_MMX mmxext
-; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride)
-cglobal hevc_add_residual_4_8, 3, 3, 6
+INIT_XMM sse2
+; void ff_hevc_add_residual_4_8_sse2(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 3, 5
     pxor              m4, m4
-    ADD_RES_MMX_4_8    0
+    ADD_RES_4_8        0
     lea               r0, [r0+r2*2]
-    ADD_RES_MMX_4_8   16
+    ADD_RES_4_8       16
     RET
 
 %macro ADD_RES_SSE_8_8 1
diff --git a/libavcodec/x86/hevc/dsp.h b/libavcodec/x86/hevc/dsp.h
index 0062699ce0..9536fa2508 100644
--- a/libavcodec/x86/hevc/dsp.h
+++ b/libavcodec/x86/hevc/dsp.h
@@ -167,7 +167,7 @@ void ff_hevc_put_qpel_hv8_8_avx512icl(int16_t *dst, const 
uint8_t *_src, ptrdiff
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
 
-void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride);
+void ff_hevc_add_residual_4_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t 
stride);
 void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t 
stride);
 void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride);
diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c
index bd967eac67..dbad0ea925 100644
--- a/libavcodec/x86/hevc/dsp_init.c
+++ b/libavcodec/x86/hevc/dsp_init.c
@@ -817,10 +817,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
     int cpu_flags = av_get_cpu_flags();
 
     if (bit_depth == 8) {
-        if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
-        }
         if (EXTERNAL_SSE2(cpu_flags)) {
+            c->add_residual[0] = ff_hevc_add_residual_4_8_sse2;
+
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 #if ARCH_X86_64
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to