PR #21579 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21579
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21579.patch


>From 86e553bdda774c17c30b87192b198eddae9dd2ef Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 23:23:36 +0100
Subject: [PATCH 1/4] avcodec/hevc/dsp_template: Optimize impossible branches
 away

Saves 1856B of .text here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/hevc/dsp_template.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/hevc/dsp_template.c b/libavcodec/hevc/dsp_template.c
index 573cf9ee1e..f703f6d071 100644
--- a/libavcodec/hevc/dsp_template.c
+++ b/libavcodec/hevc/dsp_template.c
@@ -132,7 +132,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t 
log2_size)
     int x, y;
     int size = 1 << log2_size;
 
-    if (shift > 0) {
+    if (BIT_DEPTH <= 9 || shift > 0) {
         int offset = 1 << (shift - 1);
         for (y = 0; y < size; y++) {
             for (x = 0; x < size; x++) {
@@ -140,7 +140,7 @@ static void FUNC(dequant)(int16_t *coeffs, int16_t 
log2_size)
                 coeffs++;
             }
         }
-    } else if (shift < 0) {
+    } else if (BIT_DEPTH > 10 && shift < 0) {
         for (y = 0; y < size; y++) {
             for (x = 0; x < size; x++) {
                 *coeffs = *(uint16_t*)coeffs << -shift;
-- 
2.52.0


>From 2e5ae4f840dea1a8cd3c2907d5a007616e7ed27b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 23:32:14 +0100
Subject: [PATCH 2/4] avcodec/hevc/dsp: Add alignment for dequant

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/hevc/dsp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/hevc/dsp.h b/libavcodec/hevc/dsp.h
index a63586c3a2..b884cd36be 100644
--- a/libavcodec/hevc/dsp.h
+++ b/libavcodec/hevc/dsp.h
@@ -50,7 +50,7 @@ typedef struct HEVCDSPContext {
 
     void (*add_residual[4])(uint8_t *dst, const int16_t *res, ptrdiff_t 
stride);
 
-    void (*dequant)(int16_t *coeffs, int16_t log2_size);
+    void (*dequant)(int16_t *coeffs /* align 32 */, int16_t log2_size);
 
     void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
 
-- 
2.52.0


>From 5edc6a6274f1592c3d2de62f9782f4e3b93d1842 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 26 Jan 2026 02:03:32 +0100
Subject: [PATCH 3/4] avcodec/x86/hevc/dequant: Add SSSE3 dequant ASM function

hevc_dequant_4x4_8_c (GCC):                             20.2 ( 1.00x)
hevc_dequant_4x4_8_c (Clang):                           21.7 ( 1.00x)
hevc_dequant_4x4_8_ssse3:                                5.8 ( 3.51x)
hevc_dequant_8x8_8_c (GCC):                             32.9 ( 1.00x)
hevc_dequant_8x8_8_c (Clang):                           78.7 ( 1.00x)
hevc_dequant_8x8_8_ssse3:                                6.8 ( 4.83x)
hevc_dequant_16x16_8_c (GCC):                          105.1 ( 1.00x)
hevc_dequant_16x16_8_c (Clang):                        151.1 ( 1.00x)
hevc_dequant_16x16_8_ssse3:                             19.3 ( 5.45x)
hevc_dequant_32x32_8_c (GCC):                          415.7 ( 1.00x)
hevc_dequant_32x32_8_c (Clang):                        602.3 ( 1.00x)
hevc_dequant_32x32_8_ssse3:                             78.2 ( 5.32x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/Makefile    |  1 +
 libavcodec/x86/hevc/dequant.asm | 60 +++++++++++++++++++++++++++++++++
 libavcodec/x86/hevc/dsp_init.c  |  3 ++
 3 files changed, 64 insertions(+)
 create mode 100644 libavcodec/x86/hevc/dequant.asm

diff --git a/libavcodec/x86/hevc/Makefile b/libavcodec/x86/hevc/Makefile
index 74418a322c..d09c613a19 100644
--- a/libavcodec/x86/hevc/Makefile
+++ b/libavcodec/x86/hevc/Makefile
@@ -4,6 +4,7 @@ clean::
 X86ASM-OBJS-$(CONFIG_HEVC_DECODER)      += x86/hevc/dsp_init.o      \
                                            x86/hevc/add_res.o       \
                                            x86/hevc/deblock.o       \
+                                           x86/hevc/dequant.o       \
                                            x86/hevc/idct.o          \
                                            x86/hevc/mc.o            \
                                            x86/hevc/sao.o           \
diff --git a/libavcodec/x86/hevc/dequant.asm b/libavcodec/x86/hevc/dequant.asm
new file mode 100644
index 0000000000..f0453c940b
--- /dev/null
+++ b/libavcodec/x86/hevc/dequant.asm
@@ -0,0 +1,60 @@
+;*****************************************************************************
+;* SSSE3-optimized HEVC dequant code
+;*****************************************************************************
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM ssse3
+; void ff_hevc_dequant_8_ssse3(int16_t *coeffs, int16_t log2_size)
+; In-place scaling of the (1 << log2_size)^2 int16_t coefficients at coeffs:
+; every coefficient is replaced by pmulhrsw(coeff, 256 << log2_size).
+cglobal hevc_dequant_8, 2, 3+UNIX64, 3 ; 2 args, 3 GPRs (4 on UNIX64), 3 XMM regs
+
+; t0 = coeffs, t1 = log2_size (kept in ecx so that cl can serve as the
+; variable shift count), t2 = tmp / remaining coefficient count
+%if WIN64
+    DECLARE_REG_TMP 1,0,2
+    ; r0 is the shift register (ecx) on win64
+    xchg          r0, r1           ; r0 (ecx) = log2_size, r1 = coeffs
+%elif ARCH_X86_64
+    DECLARE_REG_TMP 0,3,1
+    ; r3 is ecx
+    mov          t1d, r1d          ; copy log2_size into ecx; r1 is then reused as t2
+%else
+    ; r1 is ecx
+    DECLARE_REG_TMP 0,1,2
+%endif
+
+    mov          t2d, 256
+    shl          t2d, t1b          ; t2d = 256 << log2_size = 1 << (8 + log2_size)
+    movd          m0, t2d
+    add          t1d, t1d          ; t1d = 2 * log2_size
+    SPLATW        m0, m0           ; broadcast the multiplier to every word of m0
+    mov          t2d, 1
+    shl          t2d, t1b          ; t2d = 1 << (2 * log2_size) = number of coefficients
+.loop:
+    mova          m1, [t0]         ; aligned loads: coeffs must be mmsize-aligned
+    mova          m2, [t0+mmsize]
+    pmulhrsw      m1, m0           ; (x * m0 + (1 << 14)) >> 15, i.e. a rounding
+    pmulhrsw      m2, m0           ; right shift of x by 7 - log2_size
+    mova        [t0], m1
+    mova [t0+mmsize], m2
+    add           t0, 2*mmsize     ; advance by the 2*mmsize bytes just processed
+    sub          t2d, mmsize       ; 2*mmsize bytes hold mmsize int16_t coefficients
+    jg         .loop               ; repeat while coefficients remain
+    RET
diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c
index 5b2b10f33a..bd967eac67 100644
--- a/libavcodec/x86/hevc/dsp_init.c
+++ b/libavcodec/x86/hevc/dsp_init.c
@@ -30,6 +30,8 @@
 #include "libavcodec/x86/hevc/dsp.h"
 #include "libavcodec/x86/h26x/h2656dsp.h"
 
+void ff_hevc_dequant_8_ssse3(int16_t *coeffs, int16_t log2_size);
+
 #define LFC_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t 
*pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t 
*no_q);
 
@@ -847,6 +849,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
 #endif
+            c->dequant = ff_hevc_dequant_8_ssse3;
             SAO_EDGE_INIT(8, ssse3);
         }
 #if HAVE_SSE4_EXTERNAL && ARCH_X86_64
-- 
2.52.0


>From 3fbdf06a6d681a86578bca2812fd052c639f35f9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 26 Jan 2026 02:16:47 +0100
Subject: [PATCH 4/4] tests/checkasm/hevc_dequant: Only init buffer when needed

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/hevc_dequant.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/hevc_dequant.c b/tests/checkasm/hevc_dequant.c
index 20e322994a..5036662666 100644
--- a/tests/checkasm/hevc_dequant.c
+++ b/tests/checkasm/hevc_dequant.c
@@ -48,11 +48,11 @@ static void check_dequant(HEVCDSPContext *h, int bit_depth)
         int size = block_size * block_size;
         declare_func(void, int16_t *coeffs, int16_t log2_size);
 
-        randomize_buffers(coeffs0, size);
-        memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
-
         if (check_func(h->dequant, "hevc_dequant_%dx%d_%d",
                        block_size, block_size, bit_depth)) {
+            randomize_buffers(coeffs0, size);
+            memcpy(coeffs1, coeffs0, sizeof(*coeffs0) * size);
+
             call_ref(coeffs0, i);
             call_new(coeffs1, i);
             if (memcmp(coeffs0, coeffs1, sizeof(*coeffs0) * size))
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to