PR #20932 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932.patch
>From a7102ce7ed9e6c0a8c61a92eb8e66b4260057adb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 16:18:16 +0100 Subject: [PATCH 1/9] avcodec/x86/mpegvideoenc: Remove check for MMX Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoenc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index eac9947590..bb1d2cc319 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -123,16 +123,14 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) const int dct_algo = s->c.avctx->dct_algo; if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { -#if HAVE_MMX_INLINE - int cpu_flags = av_get_cpu_flags(); #if HAVE_SSE2_INLINE + int cpu_flags = av_get_cpu_flags(); if (INLINE_SSE2(cpu_flags)) { #if HAVE_6REGS s->dct_quantize = dct_quantize_sse2; #endif s->denoise_dct = denoise_dct_sse2; } -#endif #if HAVE_6REGS && HAVE_SSSE3_INLINE if (INLINE_SSSE3(cpu_flags)) s->dct_quantize = dct_quantize_ssse3; -- 2.49.1 >From feecc0585a8b83eb0d0897c8a842e82f080d6f26 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 16:46:18 +0100 Subject: [PATCH 2/9] avcodec/x86/mpegvideoenc: Reduce number of registers used Avoids a push+pop on x64 Windows. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoenc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index bb1d2cc319..2ca05f69ea 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -68,7 +68,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) s->dct_count[intra]++; __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" + "pxor %%xmm6, %%xmm6 \n\t" "1: \n\t" "pxor %%xmm0, %%xmm0 \n\t" "pxor %%xmm1, %%xmm1 \n\t" @@ -90,18 +90,18 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) "psubw %%xmm1, %%xmm3 \n\t" "movdqa %%xmm2, (%0) \n\t" "movdqa %%xmm3, 16(%0) \n\t" - "movdqa %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm4, %%xmm2 \n\t" "movdqa %%xmm5, %%xmm0 \n\t" - "punpcklwd %%xmm7, %%xmm4 \n\t" - "punpckhwd %%xmm7, %%xmm6 \n\t" - "punpcklwd %%xmm7, %%xmm5 \n\t" - "punpckhwd %%xmm7, %%xmm0 \n\t" + "punpcklwd %%xmm6, %%xmm4 \n\t" + "punpckhwd %%xmm6, %%xmm2 \n\t" + "punpcklwd %%xmm6, %%xmm5 \n\t" + "punpckhwd %%xmm6, %%xmm0 \n\t" "paddd (%1), %%xmm4 \n\t" - "paddd 16(%1), %%xmm6 \n\t" + "paddd 16(%1), %%xmm2 \n\t" "paddd 32(%1), %%xmm5 \n\t" "paddd 48(%1), %%xmm0 \n\t" "movdqa %%xmm4, (%1) \n\t" - "movdqa %%xmm6, 16(%1) \n\t" + "movdqa %%xmm2, 16(%1) \n\t" "movdqa %%xmm5, 32(%1) \n\t" "movdqa %%xmm0, 48(%1) \n\t" "add $32, %0 \n\t" @@ -112,7 +112,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) : "+r" (block), "+r" (sum), "+r" (offset) : "r"(block+64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") + "%xmm4", "%xmm5", "%xmm6") ); } #endif /* HAVE_SSE2_INLINE */ -- 2.49.1 >From 89a1bacded6e635f4773d2ae8b72cbd4f9a12338 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 17:32:29 +0100 Subject: [PATCH 3/9] avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to external assembly 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoenc.c | 59 ++++-------------------------- libavcodec/x86/mpegvideoencdsp.asm | 46 +++++++++++++++++++++++ 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index 2ca05f69ea..e5665ac781 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = { #endif /* HAVE_6REGS */ -#if HAVE_INLINE_ASM -#if HAVE_SSE2_INLINE +#if HAVE_SSE2_EXTERNAL +void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64], + const uint16_t dct_offset[64]); + static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) { const int intra = s->c.mb_intra; @@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) s->dct_count[intra]++; - __asm__ volatile( - "pxor %%xmm6, %%xmm6 \n\t" - "1: \n\t" - "pxor %%xmm0, %%xmm0 \n\t" - "pxor %%xmm1, %%xmm1 \n\t" - "movdqa (%0), %%xmm2 \n\t" - "movdqa 16(%0), %%xmm3 \n\t" - "pcmpgtw %%xmm2, %%xmm0 \n\t" - "pcmpgtw %%xmm3, %%xmm1 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, %%xmm4 \n\t" - "movdqa %%xmm3, %%xmm5 \n\t" - "psubusw (%2), %%xmm2 \n\t" - "psubusw 16(%2), %%xmm3 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm3, 16(%0) \n\t" - "movdqa %%xmm4, %%xmm2 \n\t" - "movdqa %%xmm5, %%xmm0 \n\t" - "punpcklwd %%xmm6, %%xmm4 \n\t" - "punpckhwd %%xmm6, %%xmm2 \n\t" - "punpcklwd %%xmm6, %%xmm5 \n\t" - "punpckhwd %%xmm6, %%xmm0 \n\t" - "paddd (%1), %%xmm4 \n\t" - "paddd 16(%1), %%xmm2 \n\t" - "paddd 32(%1), %%xmm5 \n\t" - "paddd 48(%1), %%xmm0 \n\t" - "movdqa %%xmm4, (%1) \n\t" - "movdqa %%xmm2, 16(%1) \n\t" - "movdqa %%xmm5, 32(%1) \n\t" 
- "movdqa %%xmm0, 48(%1) \n\t" - "add $32, %0 \n\t" - "add $64, %1 \n\t" - "add $32, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6") - ); + ff_mpv_denoise_dct_sse2(block, sum, offset); } -#endif /* HAVE_SSE2_INLINE */ -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_SSE2_EXTERNAL */ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) { @@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) #if HAVE_6REGS s->dct_quantize = dct_quantize_sse2; #endif +#if HAVE_SSE2_EXTERNAL s->denoise_dct = denoise_dct_sse2; +#endif } #if HAVE_6REGS && HAVE_SSSE3_INLINE if (INLINE_SSSE3(cpu_flags)) diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm index d12646ae54..0e86a5304c 100644 --- a/libavcodec/x86/mpegvideoencdsp.asm +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -24,6 +24,52 @@ %include "libavutil/x86/x86util.asm" SECTION .text + +INIT_XMM sse2 +cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset + pxor m6, m6 + lea r3, [sumq+256] +.loop: + mova m2, [blockq] + mova m3, [blockq+16] + mova m0, m6 + mova m1, m6 + pcmpgtw m0, m2 + pcmpgtw m1, m3 + pxor m2, m0 + pxor m3, m1 + psubw m2, m0 + psubw m3, m1 + psubusw m4, m2, [offsetq] + psubusw m5, m3, [offsetq+16] + pxor m4, m0 + pxor m5, m1 + add offsetq, 32 + psubw m4, m0 + psubw m5, m1 + mova [blockq], m4 + mova [blockq+16], m5 + mova m0, m2 + mova m1, m3 + add blockq, 32 + punpcklwd m0, m6 + punpckhwd m2, m6 + punpcklwd m1, m6 + punpckhwd m3, m6 + paddd m0, [sumq] + paddd m2, [sumq+16] + paddd m1, [sumq+32] + paddd m3, [sumq+48] + mova [sumq], m0 + mova [sumq+16], m2 + mova [sumq+32], m1 + mova [sumq+48], m3 + add sumq, 64 + cmp sumq, r3 + jb .loop + RET + + ; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size) ; %1 = number of loops ; %2 = number of GPRs used -- 2.49.1 >From bdc7fcbd439cca0cb1d85f51ca06fce91ac7c150 Mon Sep 
17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 18:24:18 +0100 Subject: [PATCH 4/9] avcodec/mpegvideo_enc: Port denoise_dct to MpegvideoEncDSPContext It is very simple to remove the MPVEncContext from it. Notice that this also fixes a bug in x86/mpegvideoenc.c: It only used the SSE2 version of denoise_dct when dct_algo was auto or mmx (and it was therefore unused during FATE). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/mips/Makefile | 3 +- libavcodec/mips/mpegvideo_mips.h | 3 +- libavcodec/mips/mpegvideoenc_init_mips.c | 33 ---------------- libavcodec/mips/mpegvideoencdsp_init_mips.c | 5 +++ ...egvideoenc_mmi.c => mpegvideoencdsp_mmi.c} | 7 +--- libavcodec/mpegvideo_enc.c | 38 +++++-------------- libavcodec/mpegvideoenc.h | 2 - libavcodec/mpegvideoencdsp.c | 25 ++++++++++++ libavcodec/mpegvideoencdsp.h | 3 ++ libavcodec/x86/mpegvideoenc.c | 19 ---------- libavcodec/x86/mpegvideoenc_template.c | 7 +++- libavcodec/x86/mpegvideoencdsp_init.c | 3 ++ 12 files changed, 53 insertions(+), 95 deletions(-) delete mode 100644 libavcodec/mips/mpegvideoenc_init_mips.c rename libavcodec/mips/{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} (95%) diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 4bbc2f00ea..1d777293d0 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -54,7 +54,6 @@ OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_init_mips.o OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o -OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_init_mips.o OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_init_mips.o OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvididct_init_mips.o @@ -100,7 +99,7 @@ MMI-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o MMI-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o MMI-OBJS-$(CONFIG_H264PRED) += 
mips/h264pred_mmi.o MMI-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_mmi.o -MMI-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_mmi.o +MMI-OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_mmi.o MMI-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_mmi.o \ mips/simple_idct_mmi.o MMI-OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvid_idct_mmi.o diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h index 72ffed6985..2a9ea4006e 100644 --- a/libavcodec/mips/mpegvideo_mips.h +++ b/libavcodec/mips/mpegvideo_mips.h @@ -22,7 +22,6 @@ #define AVCODEC_MIPS_MPEGVIDEO_MIPS_H #include "libavcodec/mpegvideo.h" -#include "libavcodec/mpegvideoenc.h" void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block, int n, int qscale); @@ -34,6 +33,6 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block, int n, int qscale); void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block, int n, int qscale); -void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block); +void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]); #endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */ diff --git a/libavcodec/mips/mpegvideoenc_init_mips.c b/libavcodec/mips/mpegvideoenc_init_mips.c deleted file mode 100644 index 7831973eb8..0000000000 --- a/libavcodec/mips/mpegvideoenc_init_mips.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2015 Manojkumar Bhosale ([email protected]) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details.
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/mips/cpu.h" -#include "libavcodec/mpegvideoenc.h" -#include "mpegvideo_mips.h" - -av_cold void ff_mpvenc_dct_init_mips(MPVEncContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_mmi(cpu_flags)) { - s->denoise_dct = ff_denoise_dct_mmi; - } -} diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c index 24a17b91db..df916282a2 100644 --- a/libavcodec/mips/mpegvideoencdsp_init_mips.c +++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c @@ -23,12 +23,17 @@ #include "libavcodec/bit_depth_template.c" #include "libavcodec/mpegvideoencdsp.h" #include "h263dsp_mips.h" +#include "mpegvideo_mips.h" av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c, AVCodecContext *avctx) { int cpu_flags = av_get_cpu_flags(); + if (have_mmi(cpu_flags)) { + c->denoise_dct = ff_denoise_dct_mmi; + } + if (have_msa(cpu_flags)) { #if BIT_DEPTH == 8 c->pix_sum = ff_pix_sum_msa; diff --git a/libavcodec/mips/mpegvideoenc_mmi.c b/libavcodec/mips/mpegvideoencdsp_mmi.c similarity index 95% rename from libavcodec/mips/mpegvideoenc_mmi.c rename to libavcodec/mips/mpegvideoencdsp_mmi.c index 085be3b0ec..2239a05978 100644 --- a/libavcodec/mips/mpegvideoenc_mmi.c +++ b/libavcodec/mips/mpegvideoencdsp_mmi.c @@ -25,17 +25,12 @@ #include "mpegvideo_mips.h" #include "libavutil/mips/mmiutils.h" -void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block) +void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]) { - const int intra = s->c.mb_intra; - int *sum = s->dct_error_sum[intra]; - uint16_t *offset = s->dct_offset[intra]; double ftmp[8]; mips_reg addr[1]; DECLARE_VAR_ALL64; - s->dct_count[intra]++; - __asm__ volatile( "pxor %[ftmp0], 
%[ftmp0], %[ftmp0] \n\t" "1: \n\t" diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index ce0ee4bb68..9e83026b51 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -86,7 +86,6 @@ static int encode_picture(MPVMainEncContext *const s, const AVPacket *pkt); static int dct_quantize_refine(MPVEncContext *const s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale); static int sse_mb(MPVEncContext *const s); -static void denoise_dct_c(MPVEncContext *const s, int16_t *block); static int dct_quantize_c(MPVEncContext *const s, int16_t *block, int n, int qscale, int *overflow); @@ -300,11 +299,8 @@ static av_cold void mpv_encode_defaults(MPVMainEncContext *const m) av_cold void ff_dct_encode_init(MPVEncContext *const s) { s->dct_quantize = dct_quantize_c; - s->denoise_dct = denoise_dct_c; -#if ARCH_MIPS - ff_mpvenc_dct_init_mips(s); -#elif ARCH_X86 +#if ARCH_X86 ff_dct_encode_init_x86(s); #endif @@ -3955,29 +3951,14 @@ static int encode_picture(MPVMainEncContext *const m, const AVPacket *pkt) return 0; } -static void denoise_dct_c(MPVEncContext *const s, int16_t *block) +static inline void denoise_dct(MPVEncContext *const s, int16_t block[]) { + if (!s->dct_error_sum) + return; + const int intra = s->c.mb_intra; - int i; - s->dct_count[intra]++; - - for(i=0; i<64; i++){ - int level= block[i]; - - if(level){ - if(level>0){ - s->dct_error_sum[intra][i] += level; - level -= s->dct_offset[intra][i]; - if(level<0) level=0; - }else{ - s->dct_error_sum[intra][i] -= level; - level += s->dct_offset[intra][i]; - if(level>0) level=0; - } - block[i]= level; - } - } + s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]); } static int dct_quantize_trellis_c(MPVEncContext *const s, @@ -4009,8 +3990,8 @@ static int dct_quantize_trellis_c(MPVEncContext *const s, s->fdsp.fdct(block); - if(s->dct_error_sum) - s->denoise_dct(s, block); + denoise_dct(s, block); + qmul= qscale*16; qadd= ((qscale-1)|1)*8; @@ 
-4678,8 +4659,7 @@ static int dct_quantize_c(MPVEncContext *const s, s->fdsp.fdct(block); - if(s->dct_error_sum) - s->denoise_dct(s, block); + denoise_dct(s, block); if (s->c.mb_intra) { scantable = s->c.intra_scantable.scantable; diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h index ee115c3611..131908c10a 100644 --- a/libavcodec/mpegvideoenc.h +++ b/libavcodec/mpegvideoenc.h @@ -123,7 +123,6 @@ typedef struct MPVEncContext { uint16_t (*q_inter_matrix16)[2][64]; /* noise reduction */ - void (*denoise_dct)(struct MPVEncContext *s, int16_t *block); int (*dct_error_sum)[64]; int dct_count[2]; uint16_t (*dct_offset)[64]; @@ -397,7 +396,6 @@ int ff_mpv_reallocate_putbitbuffer(MPVEncContext *s, size_t threshold, size_t si void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix); void ff_dct_encode_init(MPVEncContext *s); -void ff_mpvenc_dct_init_mips(MPVEncContext *s); void ff_dct_encode_init_x86(MPVEncContext *s); void ff_convert_matrix(MPVEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64], diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c index b4fd2af915..3b4a57d58a 100644 --- a/libavcodec/mpegvideoencdsp.c +++ b/libavcodec/mpegvideoencdsp.c @@ -28,6 +28,29 @@ #include "mathops.h" #include "mpegvideoencdsp.h" +static void denoise_dct_c(int16_t block[64], int dct_error_sum[64], + const uint16_t dct_offset[64]) +{ + for (int i = 0; i < 64; ++i) { + int level = block[i]; + + if (level) { + if (level > 0) { + dct_error_sum[i] += level; + level -= dct_offset[i]; + if (level < 0) + level = 0; + } else { + dct_error_sum[i] -= level; + level += dct_offset[i]; + if (level > 0) + level = 0; + } + block[i] = level; + } + } +} + static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale) { @@ -253,6 +276,8 @@ static void shrink88(uint8_t *dst, ptrdiff_t dst_wrap, av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, AVCodecContext *avctx) { + c->denoise_dct = 
denoise_dct_c; + c->try_8x8basis = try_8x8basis_c; c->add_8x8basis = add_8x8basis_c; diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h index 6ec665677b..989503f25f 100644 --- a/libavcodec/mpegvideoencdsp.h +++ b/libavcodec/mpegvideoencdsp.h @@ -30,6 +30,9 @@ #define EDGE_BOTTOM 2 typedef struct MpegvideoEncDSPContext { + void (*denoise_dct)(int16_t block[64], int dct_error_sum[64], + const uint16_t dct_offset[64]); + int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale); void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale); diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index e5665ac781..c667dcd2a2 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -57,22 +57,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = { #endif /* HAVE_6REGS */ -#if HAVE_SSE2_EXTERNAL -void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64], - const uint16_t dct_offset[64]); - -static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[]) -{ - const int intra = s->c.mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - ff_mpv_denoise_dct_sse2(block, sum, offset); -} -#endif /* HAVE_SSE2_EXTERNAL */ - av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) { const int dct_algo = s->c.avctx->dct_algo; @@ -83,9 +67,6 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) if (INLINE_SSE2(cpu_flags)) { #if HAVE_6REGS s->dct_quantize = dct_quantize_sse2; -#endif -#if HAVE_SSE2_EXTERNAL - s->denoise_dct = denoise_dct_sse2; #endif } #if HAVE_6REGS && HAVE_SSSE3_INLINE diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index f0b95c1621..14e993de2b 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -76,8 +76,11 @@ static int 
RENAME(dct_quantize)(MPVEncContext *const s, //s->fdct (block); ff_fdct_sse2(block); // cannot be anything else ... - if(s->dct_error_sum) - s->denoise_dct(s, block); + if (s->dct_error_sum) { + const int intra = s->c.mb_intra; + s->dct_count[intra]++; + s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]); + } if (s->c.mb_intra) { int dummy; diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index bf5b722016..f6169b5399 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -27,6 +27,8 @@ #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideoencdsp.h" +void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64], + const uint16_t dct_offset[64]); int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size); int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size); int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size); @@ -209,6 +211,7 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_SSE2(cpu_flags)) { + c->denoise_dct = ff_mpv_denoise_dct_sse2; c->pix_sum = ff_pix_sum16_sse2; c->pix_norm1 = ff_pix_norm1_sse2; } -- 2.49.1 >From 06076cb368f1cf6baaed5f6de8ed2894236c5910 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 19:06:14 +0100 Subject: [PATCH 5/9] tests/checkasm/mpegvideoencdsp: Test denoise_dct Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/mpegvideoencdsp.c | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c index a4a4fa6f5c..955cd9f5b7 100644 --- a/tests/checkasm/mpegvideoencdsp.c +++ b/tests/checkasm/mpegvideoencdsp.c @@ -37,6 +37,37 @@ buf[j] = rnd() % (max - min + 1) + min; \ } while (0) +static void check_denoise_dct(MpegvideoEncDSPContext *c) +{ + declare_func(void, int16_t 
block[64], int dct_error_sum[64], + const uint16_t dct_offset[64]); + + if (check_func(c->denoise_dct, "denoise_dct")) { + DECLARE_ALIGNED(16, int16_t, block_ref)[64]; + DECLARE_ALIGNED(16, int16_t, block_new)[64]; + DECLARE_ALIGNED(16, int, dct_error_sum_ref)[64]; + DECLARE_ALIGNED(16, int, dct_error_sum_new)[64]; + DECLARE_ALIGNED(16, uint16_t, dct_offset)[64]; + + for (size_t i = 0; i < FF_ARRAY_ELEMS(block_ref); ++i) { + unsigned random = rnd(); + block_ref[i] = random & (1 << 16) ? random : 0; + } + randomize_buffers(dct_offset, sizeof(dct_offset)); + randomize_buffer_clipped(dct_error_sum_ref, 0, (1 << 24) - 1); + memcpy(block_new, block_ref, sizeof(block_new)); + memcpy(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_ref)); + + call_ref(block_ref, dct_error_sum_ref, dct_offset); + call_new(block_new, dct_error_sum_new, dct_offset); + if (memcmp(block_ref, block_new, sizeof(block_ref)) || + memcmp(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_new))) + fail(); + + bench_new(block_new, dct_error_sum_new, dct_offset); + } +} + static void check_add_8x8basis(MpegvideoEncDSPContext *c) { declare_func(void, int16_t rem[64], const int16_t basis[64], int scale); @@ -166,6 +197,8 @@ void checkasm_check_mpegvideoencdsp(void) ff_mpegvideoencdsp_init(&c, &avctx); + check_denoise_dct(&c); + report("denoise_dct"); check_pix_sum(&c); report("pix_sum"); check_pix_norm1(&c); -- 2.49.1 >From 1b9714e7bc08908a0f03c4a0a5757485489cec4c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 19:44:02 +0100 Subject: [PATCH 6/9] avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to ASM Both GCC and Clang completely unroll the unlikely loop at -O3, leading to codesize bloat; their code is also suboptimal, as they don't make use of pmulhrsw (even with -mssse3). This commit therefore ports the whole function to external assembly. The new function occupies 176B here vs 1406B for GCC. 
Benchmarks for a testcase with huge qscale (notice that the C version is unrolled just like the unlikely loop in the SSSE3 version): add_8x8basis_c: 43.4 ( 1.00x) add_8x8basis_ssse3 (old): 43.6 ( 1.00x) add_8x8basis_ssse3 (new): 12.6 ( 3.46x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoencdsp.asm | 46 +++++++++++++++++++++++++++ libavcodec/x86/mpegvideoencdsp_init.c | 46 ++++----------------------- 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm index 0e86a5304c..a85de32449 100644 --- a/libavcodec/x86/mpegvideoencdsp.asm +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -25,6 +25,52 @@ SECTION .text +; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale) +INIT_XMM ssse3 +cglobal add_8x8basis, 3, 3, 4, rem, basis, scale + movd m0, scaled + add scaled, 1024 + add basisq, 128 + add remq, 128 + cmp scaled, 2047 + mov r2q, -128 + ja .huge_scale + + punpcklwd m0, m0 + pshufd m0, m0, 0x0 + psllw m0, 5 +.loop1: + mova m1, [basisq+r2q] + mova m2, [basisq+r2q+16] + pmulhrsw m1, m0 + pmulhrsw m2, m0 + paddw m1, [remq+r2q] + paddw m2, [remq+r2q+16] + mova [remq+r2q], m1 + mova [remq+r2q+16], m2 + add r2q, 32 + js .loop1 + RET + +.huge_scale: + pslld m0, 6 + pshuflw m1, m0, 0x55 + psrlw m0, 1 + punpcklwd m0, m0 + punpcklwd m1, m1 + pshufd m0, m0, 0x0 +.loop2: + mova m2, [basisq+r2q] + pmulhrsw m3, m2, m0 + pmullw m2, m1 + paddw m2, m3 + paddw m2, [remq+r2q] + mova [remq+r2q], m2 + add r2q, 16 + js .loop2 + RET + + INIT_XMM sse2 cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset pxor m6, m6 diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index f6169b5399..220c75785a 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64], int ff_pix_sum16_sse2(const uint8_t *pix, 
ptrdiff_t line_size); int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size); int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size); +void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale); #if HAVE_INLINE_ASM #if HAVE_SSSE3_INLINE @@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c ); return i; } - -static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale) -{ - x86_reg i=0; - - if (FFABS(scale) < 1024) { - scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT); - __asm__ volatile( - "movd %3, %%xmm2 \n\t" - "punpcklwd %%xmm2, %%xmm2 \n\t" - "pshufd $0, %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqa (%1, %0), %%xmm0 \n\t" - "movdqa 16(%1, %0), %%xmm1 \n\t" - "pmulhrsw %%xmm2, %%xmm0 \n\t" - "pmulhrsw %%xmm2, %%xmm1 \n\t" - "paddw (%2, %0), %%xmm0 \n\t" - "paddw 16(%2, %0), %%xmm1 \n\t" - "movdqa %%xmm0, (%2, %0) \n\t" - "movdqa %%xmm1, 16(%2, %0) \n\t" - "add $32, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - : "+r" (i) - : "r"(basis), "r"(rem), "g"(scale) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") - ); - } else { - for (i=0; i<8*8; i++) { - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } - } -} - #endif /* HAVE_SSSE3_INLINE */ /* Draw the edges of width 'w' of an image of size width, height */ @@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, c->draw_edges = draw_edges_mmx; } } +#endif /* HAVE_INLINE_ASM */ + if (X86_SSSE3(cpu_flags)) { #if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(cpu_flags)) { if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) { c->try_8x8basis = try_8x8basis_ssse3; } - c->add_8x8basis = add_8x8basis_ssse3; - } #endif /* HAVE_SSSE3_INLINE */ +#if HAVE_SSSE3_EXTERNAL + c->add_8x8basis = ff_add_8x8basis_ssse3; +#endif + } -#endif /* HAVE_INLINE_ASM */ } -- 2.49.1 >From 
0dfe66422eedfd67028b89c46ff7db2f8fef80eb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 15 Nov 2025 19:56:23 +0100 Subject: [PATCH 7/9] avcodec/x86/mpegvideoenc_template: Avoid touching nonvolatile register xmm7 is nonvolatile on x64 Windows. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoenc_template.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index 14e993de2b..b5417f6d32 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -117,7 +117,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, __asm__ volatile( "movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1 SPREADW("%%xmm3") - "pxor %%xmm7, %%xmm7 \n\t" // 0 + "pxor %%xmm2, %%xmm2 \n\t" // 0 "pxor %%xmm4, %%xmm4 \n\t" // 0 "movdqa (%2), %%xmm5 \n\t" // qmat[0] "pxor %%xmm6, %%xmm6 \n\t" @@ -132,9 +132,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "por %%xmm0, %%xmm4 \n\t" RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t" - "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00 + "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 
0xFF : 0x00 "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t" - "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0 + "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0 "pandn %%xmm1, %%xmm0 \n\t" "pmaxsw %%xmm0, %%xmm3 \n\t" "add $16, %%"FF_REG_a" \n\t" @@ -146,13 +146,13 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, : "r" (block+64), "r" (qmat), "r" (bias), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") + "%xmm4", "%xmm5", "%xmm6") ); }else{ // FMT_H263 __asm__ volatile( "movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1 SPREADW("%%xmm3") - "pxor %%xmm7, %%xmm7 \n\t" // 0 + "pxor %%xmm2, %%xmm2 \n\t" // 0 "pxor %%xmm4, %%xmm4 \n\t" // 0 "mov $-128, %%"FF_REG_a" \n\t" ".p2align 4 \n\t" @@ -166,9 +166,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "por %%xmm0, %%xmm4 \n\t" RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t" - "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00 + "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 
0xFF : 0x00 "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t" - "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0 + "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0 "pandn %%xmm1, %%xmm0 \n\t" "pmaxsw %%xmm0, %%xmm3 \n\t" "add $16, %%"FF_REG_a" \n\t" @@ -180,7 +180,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, : "r" (block+64), "r" (qmat+64), "r" (bias+64), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") + "%xmm4", "%xmm5", "%xmm6") ); } __asm__ volatile( -- 2.49.1 >From 1c1109ba320528f01d610da9b25aae8591458526 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 16 Nov 2025 11:10:07 +0100 Subject: [PATCH 8/9] avcodec/x86/mpegvideoenc_template: Reduce number of registers used qmat and bias always have a constant offset, so one can use one register to address both of them. This allows to remove the check for HAVE_6REGS (untested on a system where HAVE_6REGS is false). Also avoid FF_REG_a while at it. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpegvideoenc.c | 8 +------- libavcodec/x86/mpegvideoenc_template.c | 21 +++++++++------------ 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index c667dcd2a2..24dd049200 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -39,8 +39,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = { 36, 37, 49, 50, 58, 59, 63, 64, }; -#if HAVE_6REGS - #if HAVE_SSE2_INLINE #define COMPILE_TEMPLATE_SSSE3 0 #define RENAME(a) a ## _sse2 @@ -55,8 +53,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = { #include "mpegvideoenc_template.c" #endif /* HAVE_SSSE3_INLINE */ -#endif /* HAVE_6REGS */ - av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) { const int dct_algo = s->c.avctx->dct_algo; @@ -65,11 +61,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s) #if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); if (INLINE_SSE2(cpu_flags)) { -#if HAVE_6REGS s->dct_quantize = dct_quantize_sse2; -#endif } -#if HAVE_6REGS && HAVE_SSSE3_INLINE +#if HAVE_SSSE3_INLINE if (INLINE_SSSE3(cpu_flags)) s->dct_quantize = dct_quantize_ssse3; #endif diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index b5417f6d32..e6ce791347 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -70,7 +70,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, { x86_reg last_non_zero_p1; int level=0, q; //=0 is because gcc says uninitialized ... 
- const uint16_t *qmat, *bias; + const uint16_t *qmat; LOCAL_ALIGNED_16(int16_t, temp_block, [64]); //s->fdct (block); @@ -86,11 +86,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, int dummy; if (n < 4){ q = s->c.y_dc_scale; - bias = s->q_intra_matrix16[qscale][1]; qmat = s->q_intra_matrix16[qscale][0]; }else{ q = s->c.c_dc_scale; - bias = s->q_chroma_intra_matrix16[qscale][1]; qmat = s->q_chroma_intra_matrix16[qscale][0]; } /* note: block[0] is assumed to be positive */ @@ -109,7 +107,6 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, last_non_zero_p1 = 1; } else { last_non_zero_p1 = 0; - bias = s->q_inter_matrix16[qscale][1]; qmat = s->q_inter_matrix16[qscale][0]; } @@ -121,7 +118,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "pxor %%xmm4, %%xmm4 \n\t" // 0 "movdqa (%2), %%xmm5 \n\t" // qmat[0] "pxor %%xmm6, %%xmm6 \n\t" - "psubw (%3), %%xmm6 \n\t" // -bias[0] + "psubw 128(%2), %%xmm6 \n\t" // -bias[0] "mov $-128, %%"FF_REG_a" \n\t" ".p2align 4 \n\t" "1: \n\t" @@ -131,9 +128,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 "por %%xmm0, %%xmm4 \n\t" RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t" + "movdqa %%xmm0, (%4, %0) \n\t" "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 
0xFF : 0x00 - "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t" + "movdqa (%3, %0), %%xmm1 \n\t" "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0 "pandn %%xmm1, %%xmm0 \n\t" "pmaxsw %%xmm0, %%xmm3 \n\t" @@ -143,7 +140,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "movd %%xmm3, %%"FF_REG_a" \n\t" "movzbl %%al, %%eax \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat), "r" (bias), + : "r" (block+64), "r" (qmat), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6") @@ -159,15 +156,15 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "1: \n\t" "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" // block[i] SAVE_SIGN("%%xmm1", "%%xmm0") // ABS(block[i]) - "movdqa (%3, %%"FF_REG_a"), %%xmm6 \n\t" // bias[0] + "movdqa 128(%2, %0), %%xmm6 \n\t" // bias[i] "paddusw %%xmm6, %%xmm0 \n\t" // ABS(block[i]) + bias[0] "movdqa (%2, %%"FF_REG_a"), %%xmm5 \n\t" // qmat[i] "pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 "por %%xmm0, %%xmm4 \n\t" RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t" + "movdqa %%xmm0, (%4, %0) \n\t" "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 
0xFF : 0x00 - "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t" + "movdqa (%3, %0), %%xmm1 \n\t" "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0 "pandn %%xmm1, %%xmm0 \n\t" "pmaxsw %%xmm0, %%xmm3 \n\t" @@ -177,7 +174,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s, "movd %%xmm3, %%"FF_REG_a" \n\t" "movzbl %%al, %%eax \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat+64), "r" (bias+64), + : "r" (block+64), "r" (qmat+64), "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64) XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6") -- 2.49.1 >From 8ae2428ebedca7f191846e5fde2442069d15e8b1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 16 Nov 2025 12:10:22 +0100 Subject: [PATCH 9/9] avutil/x86/asm: Remove wrong comment, rename FF_REG_sp Before FFmpeg commit 531b0a316b24f00965cd8a88efdbea2c6d63147f, FFmpeg used REG_SP as macro for the stack pointer, yet this clashed with a REG_SP define in Solaris system headers, so it was changed to REG_sp and a comment was added for this. Libav fixed it by adding an FF_ prefix to the macros in 1e9c5bf4c136fe9e010cc8a7e7270bba0d1bf45e. FFmpeg switched to using these prefixes in 9eb3da2f9942cf1b1148d242bccfc383f666feb6, using FF_REG_sp instead of Libav's FF_REG_SP. In said commit the comment was changed to claim that Solaris system headers define FF_REG_SP, but this is (most likely) wrong. This commit removes the wrong comment and renames the (actually unused) macro to FF_REG_SP to make it consistent with FF_REG_BP. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/x86/asm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h index 9bff42d628..f06ea25035 100644 --- a/libavutil/x86/asm.h +++ b/libavutil/x86/asm.h @@ -38,8 +38,7 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; # define FF_PTR_SIZE "8" typedef int64_t x86_reg; -/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */ -# define FF_REG_sp "rsp" +# define FF_REG_SP "rsp" # define FF_REG_BP "rbp" # define FF_REGBP rbp # define FF_REGa rax @@ -60,7 +59,7 @@ typedef int64_t x86_reg; # define FF_PTR_SIZE "4" typedef int32_t x86_reg; -# define FF_REG_sp "esp" +# define FF_REG_SP "esp" # define FF_REG_BP "ebp" # define FF_REGBP ebp # define FF_REGa eax -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
