PR #21157 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21157 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21157.patch
>From fb58a9bbc01f4d0f70866a43506db245e4789ac3 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 22 Oct 2025 01:11:21 +0200 Subject: [PATCH 1/9] fate/vcodec: Test median prediction in ffvhuff,huffyuv Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/fate/vcodec.mak | 3 ++- tests/ref/vsynth/vsynth1-ffvhuff420p12 | 4 ++-- tests/ref/vsynth/vsynth1-huffyuv | 4 ++-- tests/ref/vsynth/vsynth2-ffvhuff420p12 | 4 ++-- tests/ref/vsynth/vsynth3-ffvhuff420p12 | 4 ++-- tests/ref/vsynth/vsynth_lena-ffvhuff420p12 | 4 ++-- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/fate/vcodec.mak b/tests/fate/vcodec.mak index 008d5ef98e..4a005ec760 100644 --- a/tests/fate/vcodec.mak +++ b/tests/fate/vcodec.mak @@ -185,7 +185,7 @@ fate-vsynth%-ffv1-2pass10: ENCOPTS = -coder range_tab -context 1 -pix_fmt FATE_VCODEC-$(call ENCDEC, FFVHUFF, AVI) += ffvhuff FATE_VCODEC_SCALE-$(call ENCDEC, FFVHUFF, AVI) += ffvhuff444 ffvhuff420p12 ffvhuff422p10left ffvhuff444p16 fate-vsynth%-ffvhuff444: ENCOPTS = -c:v ffvhuff -pix_fmt yuv444p -fate-vsynth%-ffvhuff420p12: ENCOPTS = -c:v ffvhuff -pix_fmt yuv420p12le +fate-vsynth%-ffvhuff420p12: ENCOPTS = -c:v ffvhuff -pix_fmt yuv420p12le -pred median fate-vsynth%-ffvhuff422p10left: ENCOPTS = -c:v ffvhuff -pix_fmt yuv422p10le -pred left fate-vsynth%-ffvhuff444p16: ENCOPTS = -c:v ffvhuff -pix_fmt yuv444p16le -pred plane @@ -216,6 +216,7 @@ fate-vsynth%-h263p: ENCOPTS = -qscale 2 -flags +aic -umv 1 -aiv 1 - FATE_VCODEC_SCALE-$(call ENCDEC, HUFFYUV, AVI) += huffyuv huffyuvbgr24 huffyuvbgra fate-vsynth%-huffyuv: ENCOPTS = -c:v huffyuv -pix_fmt yuv422p -sws_flags neighbor +fate-vsynth1-huffyuv: ENCOPTS = -c:v huffyuv -pix_fmt yuv422p -sws_flags neighbor -pred median fate-vsynth%-huffyuv: DECOPTS = -sws_flags neighbor fate-vsynth%-huffyuvbgr24: ENCOPTS = -c:v huffyuv -pix_fmt rgb24 -sws_flags neighbor fate-vsynth%-huffyuvbgr24: DECOPTS = -sws_flags neighbor diff --git 
a/tests/ref/vsynth/vsynth1-ffvhuff420p12 b/tests/ref/vsynth/vsynth1-ffvhuff420p12 index d4b22f3b4b..72ff82c006 100644 --- a/tests/ref/vsynth/vsynth1-ffvhuff420p12 +++ b/tests/ref/vsynth/vsynth1-ffvhuff420p12 @@ -1,4 +1,4 @@ -866485c954242232878e40f0389790dd *tests/data/fate/vsynth1-ffvhuff420p12.avi -14205356 tests/data/fate/vsynth1-ffvhuff420p12.avi +6210a990bd25c2dcbc72beafe1f806e2 *tests/data/fate/vsynth1-ffvhuff420p12.avi +12961816 tests/data/fate/vsynth1-ffvhuff420p12.avi c5ccac874dbf808e9088bc3107860042 *tests/data/fate/vsynth1-ffvhuff420p12.out.rawvideo stddev: 0.00 PSNR:999.99 MAXDIFF: 0 bytes: 7603200/ 7603200 diff --git a/tests/ref/vsynth/vsynth1-huffyuv b/tests/ref/vsynth/vsynth1-huffyuv index 6bf79442e4..1bc50ff4fb 100644 --- a/tests/ref/vsynth/vsynth1-huffyuv +++ b/tests/ref/vsynth/vsynth1-huffyuv @@ -1,4 +1,4 @@ -9a89f73cb2e305c15dda99c99c39b9d1 *tests/data/fate/vsynth1-huffyuv.avi -8876474 tests/data/fate/vsynth1-huffyuv.avi +0469244ace79ef728c60f87b2554426a *tests/data/fate/vsynth1-huffyuv.avi +6861194 tests/data/fate/vsynth1-huffyuv.avi c5ccac874dbf808e9088bc3107860042 *tests/data/fate/vsynth1-huffyuv.out.rawvideo stddev: 0.00 PSNR:999.99 MAXDIFF: 0 bytes: 7603200/ 7603200 diff --git a/tests/ref/vsynth/vsynth2-ffvhuff420p12 b/tests/ref/vsynth/vsynth2-ffvhuff420p12 index f97edfbf4e..328f05fd38 100644 --- a/tests/ref/vsynth/vsynth2-ffvhuff420p12 +++ b/tests/ref/vsynth/vsynth2-ffvhuff420p12 @@ -1,4 +1,4 @@ -3ab9567895bf1ec31a82aadf16a5da0e *tests/data/fate/vsynth2-ffvhuff420p12.avi -10562808 tests/data/fate/vsynth2-ffvhuff420p12.avi +29460ef3dd44f72e5f4e90316ac798b8 *tests/data/fate/vsynth2-ffvhuff420p12.avi +9977204 tests/data/fate/vsynth2-ffvhuff420p12.avi 36d7ca943916e1743cefa609eba0205c *tests/data/fate/vsynth2-ffvhuff420p12.out.rawvideo stddev: 0.00 PSNR:999.99 MAXDIFF: 0 bytes: 7603200/ 7603200 diff --git a/tests/ref/vsynth/vsynth3-ffvhuff420p12 b/tests/ref/vsynth/vsynth3-ffvhuff420p12 index 201ec7658b..19290b7ffb 100644 --- 
a/tests/ref/vsynth/vsynth3-ffvhuff420p12 +++ b/tests/ref/vsynth/vsynth3-ffvhuff420p12 @@ -1,4 +1,4 @@ -e5a178d75afeda6df1d4eb6f7cdfa3a0 *tests/data/fate/vsynth3-ffvhuff420p12.avi -175260 tests/data/fate/vsynth3-ffvhuff420p12.avi +592f3643ba063499c1c477765c08f630 *tests/data/fate/vsynth3-ffvhuff420p12.avi +161128 tests/data/fate/vsynth3-ffvhuff420p12.avi a038ad7c3c09f776304ef7accdea9c74 *tests/data/fate/vsynth3-ffvhuff420p12.out.rawvideo stddev: 0.00 PSNR:999.99 MAXDIFF: 0 bytes: 86700/ 86700 diff --git a/tests/ref/vsynth/vsynth_lena-ffvhuff420p12 b/tests/ref/vsynth/vsynth_lena-ffvhuff420p12 index e77698ba0c..96ab396c26 100644 --- a/tests/ref/vsynth/vsynth_lena-ffvhuff420p12 +++ b/tests/ref/vsynth/vsynth_lena-ffvhuff420p12 @@ -1,4 +1,4 @@ -b2f3d04ca30c113b79877bb5518dd6ea *tests/data/fate/vsynth_lena-ffvhuff420p12.avi -10925580 tests/data/fate/vsynth_lena-ffvhuff420p12.avi +0930b3d622b78d3c13e80222f95b0be2 *tests/data/fate/vsynth_lena-ffvhuff420p12.avi +9901820 tests/data/fate/vsynth_lena-ffvhuff420p12.avi dde5895817ad9d219f79a52d0bdfb001 *tests/data/fate/vsynth_lena-ffvhuff420p12.out.rawvideo stddev: 0.00 PSNR:999.99 MAXDIFF: 0 bytes: 7603200/ 7603200 -- 2.49.1 >From 3be6696c864813ca3cd540523ebbf391433715a2 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 22 Oct 2025 15:53:24 +0200 Subject: [PATCH 2/9] avcodec/x86/lossless_videoencdsp_init: Don't read from before the buffer sub_median_pred_mmxext() calculates a predictor from the left, top and topleft pixel values. The left value is simply read via ptr[-1], although this is not guaranteed to be inside the buffer in case of negative strides. This happens e.g. with ffmpeg -i fate-suite/mpeg2/dvd_single_frame.vob -vf vflip \ -c:v magicyuv -pred median -f null - Fix this by reading the first value like the topleft value. Also change the documentation of sub_median_pred to reflect this change (and the one from 791b5954bc8fe7c0077d7eb959ebd17e40d0a7c6). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/lossless_videoencdsp.h | 1 - libavcodec/x86/lossless_videoencdsp_init.c | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libavcodec/lossless_videoencdsp.h b/libavcodec/lossless_videoencdsp.h index 7fd0ad32c7..44b33e7edb 100644 --- a/libavcodec/lossless_videoencdsp.h +++ b/libavcodec/lossless_videoencdsp.h @@ -29,7 +29,6 @@ typedef struct LLVidEncDSPContext { intptr_t w); /** * Subtract HuffYUV's variant of median prediction. - * Note, this might read from src1[-1], src2[-1]. */ void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w, diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index 22a4014ef1..c20f3ec04f 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -49,9 +49,13 @@ static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, __asm__ volatile ( "movq (%1, %0), %%mm0 \n\t" // LT "psllq $8, %%mm0 \n\t" + "movq (%2, %0), %%mm2 \n\t" // L + "psllq $8, %%mm2 \n\t" + "jmp 2f \n\t" "1: \n\t" - "movq (%1, %0), %%mm1 \n\t" // T "movq -1(%2, %0), %%mm2 \n\t" // L + "2: \n\t" + "movq (%1, %0), %%mm1 \n\t" // T "movq (%2, %0), %%mm3 \n\t" // X "movq %%mm2, %%mm4 \n\t" // L "psubb %%mm0, %%mm2 \n\t" -- 2.49.1 >From ed11d9a3f56df875a85bced1cf8a575e3b6c733b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 14:27:47 +0100 Subject: [PATCH 3/9] avcodec/x86/lossless_videoencdsp_init: Don't read too often sub_median_pred_mmxext() calculates a predictor from the left, top and topleft pixel values. The topleft values need to be initialized differently for the first loop iteration than for the others in order to avoid reading ptr[-1]. So it has been initialized before the loop and then read again at the end of the loop, so that the last value read was never used. 
Yet this can lead to reads beyond the end of the buffer, e.g. with ffmpeg -cpuflags mmx+mmxext -f lavfi -i "color=size=64x4,format=yuv420p" \ -vf vflip -c:v ffvhuff -pred median -frames 1 -f null - Fix this by not reading the value at the end of the loop. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/lossless_videoencdsp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index c20f3ec04f..d7dfa2e3ae 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -54,6 +54,7 @@ static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, "jmp 2f \n\t" "1: \n\t" "movq -1(%2, %0), %%mm2 \n\t" // L + "movq -1(%1, %0), %%mm0 \n\t" // LT "2: \n\t" "movq (%1, %0), %%mm1 \n\t" // T "movq (%2, %0), %%mm3 \n\t" // X @@ -68,7 +69,6 @@ static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, "psubb %%mm4, %%mm3 \n\t" // dst - pred "movq %%mm3, (%3, %0) \n\t" "add $8, %0 \n\t" - "movq -1(%1, %0), %%mm0 \n\t" // LT "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) -- 2.49.1 >From dca04d18c8364ae4a7944007e93abf9dc64e32d5 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 15:28:06 +0100 Subject: [PATCH 4/9] tests/checkasm/llviddspenc: Add test for sub_median_pred Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/llviddspenc.c | 47 ++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/tests/checkasm/llviddspenc.c b/tests/checkasm/llviddspenc.c index c2eb63519f..8757b22618 100644 --- a/tests/checkasm/llviddspenc.c +++ b/tests/checkasm/llviddspenc.c @@ -28,11 +28,12 @@ #include "checkasm.h" -#define randomize_buffers(buf, size) \ - do { \ - int j; \ - for (j = 0; j < size; j+=4) \ - AV_WN32(buf + j, rnd()); \ +#define randomize_buffers(buf, size) \ + do { \ + for (size_t j = 0; j < 
size & ~3; j += 4) \ + AV_WN32(buf + j, rnd()); \ + for (size_t j = 0; j < size; ++j) \ + buf[j] = rnd(); \ } while (0) static const struct {uint8_t w, h, s;} planes[] = { @@ -73,6 +74,39 @@ static void check_diff_bytes(LLVidEncDSPContext *c) } } +static void check_sub_median_pred(LLVidEncDSPContext *c) +{ + enum { + BUF_SIZE = MAX_STRIDE + 15 /* to test misalignment */ + }; + uint8_t dst_ref[BUF_SIZE], dst_new[BUF_SIZE]; + uint8_t src1[BUF_SIZE], src2[BUF_SIZE]; + + declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, intptr_t w, + int *left, int *left_top); + + if (check_func(c->sub_median_pred, "sub_median_pred")) { + size_t width = 1 + rnd() % MAX_STRIDE; + size_t offset = rnd() & 0xF; + int left_ref = rnd() & 0xFF, top_ref = rnd() & 0xFF; + int left_new = left_ref, top_new = top_ref; + + memset(dst_ref, 0, sizeof(dst_ref)); + memset(dst_new, 0, sizeof(dst_new)); + + randomize_buffers(src1, sizeof(src1)); + randomize_buffers(src2, sizeof(src2)); + + call_ref(dst_ref + offset, src1 + offset, src2 + offset, width, &left_ref, &top_ref); + call_new(dst_new + offset, src1 + offset, src2 + offset, width, &left_new, &top_new); + if (left_new != left_ref || top_ref != top_new || + memcmp(dst_ref, dst_new, width + offset)) + fail(); + bench_new(dst_new, src1, src2, MAX_STRIDE, &left_new, &top_new); + } +} + static void check_sub_left_pred(LLVidEncDSPContext *c) { int i; @@ -109,6 +143,9 @@ void checkasm_check_llviddspenc(void) check_diff_bytes(&c); report("diff_bytes"); + check_sub_median_pred(&c); + report("sub_median_pred"); + check_sub_left_pred(&c); report("sub_left_predict"); } -- 2.49.1 >From ab6c535ed9c06114211af90620d1c21c085fa6d8 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 15:38:39 +0100 Subject: [PATCH 5/9] avcodec/x86/lossless_videoencdsp_init: Port sub_median_pred to SSE2 Old benchmarks: sub_median_pred_c: 405.7 ( 1.00x) sub_median_pred_mmxext: 35.1 (11.57x) New 
benchmarks: sub_median_pred_c: 404.1 ( 1.00x) sub_median_pred_sse2: 20.5 (19.67x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/lossless_videoencdsp_init.c | 68 +++++++++++----------- tests/checkasm/llviddspenc.c | 6 +- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index d7dfa2e3ae..b3ea2da388 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -37,42 +37,44 @@ void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, void ff_sub_left_predict_avx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height); -#if HAVE_INLINE_ASM +#if HAVE_SSE2_INLINE -static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, - const uint8_t *src2, intptr_t w, - int *left, int *left_top) +static void sub_median_pred_sse2(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, intptr_t w, + int *left, int *left_top) { x86_reg i = 0; uint8_t l, lt; __asm__ volatile ( - "movq (%1, %0), %%mm0 \n\t" // LT - "psllq $8, %%mm0 \n\t" - "movq (%2, %0), %%mm2 \n\t" // L - "psllq $8, %%mm2 \n\t" - "jmp 2f \n\t" - "1: \n\t" - "movq -1(%2, %0), %%mm2 \n\t" // L - "movq -1(%1, %0), %%mm0 \n\t" // LT - "2: \n\t" - "movq (%1, %0), %%mm1 \n\t" // T - "movq (%2, %0), %%mm3 \n\t" // X - "movq %%mm2, %%mm4 \n\t" // L - "psubb %%mm0, %%mm2 \n\t" - "paddb %%mm1, %%mm2 \n\t" // L + T - LT - "movq %%mm4, %%mm5 \n\t" // L - "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) - "pminub %%mm5, %%mm1 \n\t" // min(T, L) - "pminub %%mm2, %%mm4 \n\t" - "pmaxub %%mm1, %%mm4 \n\t" - "psubb %%mm4, %%mm3 \n\t" // dst - pred - "movq %%mm3, (%3, %0) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" + "movdqu (%1, %0), %%xmm0 \n\t" // LT + "movdqu (%2, %0), %%xmm2 \n\t" // L + "pslldq $1, %%xmm0 \n\t" + "pslldq $1, %%xmm2 \n\t" + "jmp 2f \n\t" + "1: \n\t" + "movdqu -1(%2, %0), 
%%xmm2 \n\t" // L + "movdqu -1(%1, %0), %%xmm0 \n\t" // LT + "2: \n\t" + "movdqu (%1, %0), %%xmm1 \n\t" // T + "movdqu (%2, %0), %%xmm3 \n\t" // X + "movdqa %%xmm2, %%xmm4 \n\t" // L + "psubb %%xmm0, %%xmm2 \n\t" + "paddb %%xmm1, %%xmm2 \n\t" // L + T - LT + "movdqa %%xmm4, %%xmm5 \n\t" // L + "pmaxub %%xmm1, %%xmm4 \n\t" // max(T, L) + "pminub %%xmm5, %%xmm1 \n\t" // min(T, L) + "pminub %%xmm2, %%xmm4 \n\t" + "pmaxub %%xmm1, %%xmm4 \n\t" + "psubb %%xmm4, %%xmm3 \n\t" // dst - pred + "movdqu %%xmm3, (%3, %0) \n\t" + "add $16, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" : "+r" (i) - : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w)); + : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w) + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory" + ); l = *left; lt = *left_top; @@ -89,11 +91,11 @@ av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); -#if HAVE_INLINE_ASM - if (INLINE_MMXEXT(cpu_flags)) { - c->sub_median_pred = sub_median_pred_mmxext; +#if HAVE_SSE2_INLINE + if (INLINE_SSE2(cpu_flags)) { + c->sub_median_pred = sub_median_pred_sse2; } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_SSE2_INLINE */ if (EXTERNAL_SSE2(cpu_flags)) { c->diff_bytes = ff_diff_bytes_sse2; diff --git a/tests/checkasm/llviddspenc.c b/tests/checkasm/llviddspenc.c index 8757b22618..f974e79165 100644 --- a/tests/checkasm/llviddspenc.c +++ b/tests/checkasm/llviddspenc.c @@ -82,9 +82,9 @@ static void check_sub_median_pred(LLVidEncDSPContext *c) uint8_t dst_ref[BUF_SIZE], dst_new[BUF_SIZE]; uint8_t src1[BUF_SIZE], src2[BUF_SIZE]; - declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src1, - const uint8_t *src2, intptr_t w, - int *left, int *left_top); + declare_func(void, uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, intptr_t w, + int *left, int *left_top); if (check_func(c->sub_median_pred, "sub_median_pred")) { size_t width = 1 + rnd() % MAX_STRIDE; -- 2.49.1 >From 
ce4ad97525e8535ee48139e241dce79e8bbb5076 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 15:56:34 +0100 Subject: [PATCH 6/9] avcodec/x86/lossless_videoencdsp_init: Avoid special-casing first pixel Old benchmarks: sub_median_pred_c: 404.1 ( 1.00x) sub_median_pred_sse2: 20.5 (19.67x) New benchmarks: sub_median_pred_c: 408.5 ( 1.00x) sub_median_pred_sse2: 19.2 (21.27x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/lossless_videoencdsp_init.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index b3ea2da388..d5dd576e5f 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -27,7 +27,6 @@ #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/lossless_videoencdsp.h" -#include "libavcodec/mathops.h" void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); @@ -37,20 +36,23 @@ void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, void ff_sub_left_predict_avx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height); -#if HAVE_SSE2_INLINE +#if HAVE_SSE2_INLINE && HAVE_7REGS static void sub_median_pred_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w, int *left, int *left_top) { x86_reg i = 0; - uint8_t l, lt; __asm__ volatile ( "movdqu (%1, %0), %%xmm0 \n\t" // LT "movdqu (%2, %0), %%xmm2 \n\t" // L + "movd (%6), %%xmm1 \n\t" // LT + "movd (%5), %%xmm3 \n\t" // L "pslldq $1, %%xmm0 \n\t" "pslldq $1, %%xmm2 \n\t" + "por %%xmm1, %%xmm0 \n\t" // LT + "por %%xmm3, %%xmm2 \n\t" // L "jmp 2f \n\t" "1: \n\t" "movdqu -1(%2, %0), %%xmm2 \n\t" // L @@ -72,15 +74,10 @@ static void sub_median_pred_sse2(uint8_t *dst, const uint8_t *src1, "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) - : "r" (src1), "r" (src2), 
"r" (dst), "r" ((x86_reg) w) + : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w), "r" (left), "r" (left_top) : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory" ); - l = *left; - lt = *left_top; - - dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF); - *left_top = src1[w - 1]; *left = src2[w - 1]; } @@ -91,7 +88,7 @@ av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); -#if HAVE_SSE2_INLINE +#if HAVE_SSE2_INLINE && HAVE_7REGS if (INLINE_SSE2(cpu_flags)) { c->sub_median_pred = sub_median_pred_sse2; } -- 2.49.1 >From a8981876a07765e6d27dbbad2a360f1d334b4392 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 17:22:50 +0100 Subject: [PATCH 7/9] avcodec/x86/lossless_videoencdsp: Port sub_median_pred to NASM Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/lossless_videoencdsp.asm | 39 ++++++++++++++ libavcodec/x86/lossless_videoencdsp_init.c | 60 ++-------------------- 2 files changed, 44 insertions(+), 55 deletions(-) diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm index 8ccaea9139..47a10ae135 100644 --- a/libavcodec/x86/lossless_videoencdsp.asm +++ b/libavcodec/x86/lossless_videoencdsp.asm @@ -143,6 +143,45 @@ DIFF_BYTES_PROLOGUE %undef i %endif +;-------------------------------------------------------------------------------------------------- +;void sub_median_pred(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, +; intptr_t w, int *left, int *left_top) +;-------------------------------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal sub_median_pred, 6, 7, 6, dst, src1, src2, w, l, lt + movu m0, [src1q] ; LT + movu m4, [src2q] ; L + movd m1, [ltq] ; LT + movd m3, [lq] ; L + xor r6d, r6d + pslldq m0, 1 + pslldq m4, 1 + por m0, m1 ; LT + por m4, m3 ; L + jmp .first_iteration +.loop: + movu m4, 
[src2q+r6q-1] ; L + movu m0, [src1q+r6q-1] ; LT +.first_iteration: + movu m1, [src1q+r6q] ; T + movu m3, [src2q+r6q] ; X + psubb m2, m4, m0 ; L - LT + paddb m2, m1 ; L + T - LT + pmaxub m5, m4, m1 ; max(T, L) + pminub m1, m4 ; min(T, L) + pminub m5, m2 + pmaxub m5, m1 + psubb m3, m5 ; dst - pred + movu [dstq+r6q], m3 + add r6d, 16 + cmp r6d, wd + jb .loop + movzx src1d, BYTE [src1q+wq-1] + movzx src2d, BYTE [src2q+wq-1] + mov [ltq], src1d + mov [lq], src2d + RET ;-------------------------------------------------------------------------------------------------- ;void sub_left_predict(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height) diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index d5dd576e5f..b47e23fe42 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -24,7 +24,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/lossless_videoencdsp.h" @@ -33,68 +32,19 @@ void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); +void ff_sub_median_pred_sse2(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, intptr_t w, + int *left, int *left_top); + void ff_sub_left_predict_avx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height); -#if HAVE_SSE2_INLINE && HAVE_7REGS - -static void sub_median_pred_sse2(uint8_t *dst, const uint8_t *src1, - const uint8_t *src2, intptr_t w, - int *left, int *left_top) -{ - x86_reg i = 0; - - __asm__ volatile ( - "movdqu (%1, %0), %%xmm0 \n\t" // LT - "movdqu (%2, %0), %%xmm2 \n\t" // L - "movd (%6), %%xmm1 \n\t" // LT - "movd (%5), %%xmm3 \n\t" // L - "pslldq $1, %%xmm0 \n\t" - "pslldq $1, %%xmm2 \n\t" - "por %%xmm1, %%xmm0 \n\t" // LT - "por %%xmm3, %%xmm2 \n\t" 
// L - "jmp 2f \n\t" - "1: \n\t" - "movdqu -1(%2, %0), %%xmm2 \n\t" // L - "movdqu -1(%1, %0), %%xmm0 \n\t" // LT - "2: \n\t" - "movdqu (%1, %0), %%xmm1 \n\t" // T - "movdqu (%2, %0), %%xmm3 \n\t" // X - "movdqa %%xmm2, %%xmm4 \n\t" // L - "psubb %%xmm0, %%xmm2 \n\t" - "paddb %%xmm1, %%xmm2 \n\t" // L + T - LT - "movdqa %%xmm4, %%xmm5 \n\t" // L - "pmaxub %%xmm1, %%xmm4 \n\t" // max(T, L) - "pminub %%xmm5, %%xmm1 \n\t" // min(T, L) - "pminub %%xmm2, %%xmm4 \n\t" - "pmaxub %%xmm1, %%xmm4 \n\t" - "psubb %%xmm4, %%xmm3 \n\t" // dst - pred - "movdqu %%xmm3, (%3, %0) \n\t" - "add $16, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (i) - : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w), "r" (left), "r" (left_top) - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory" - ); - - *left_top = src1[w - 1]; - *left = src2[w - 1]; -} - -#endif /* HAVE_INLINE_ASM */ - av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); -#if HAVE_SSE2_INLINE && HAVE_7REGS - if (INLINE_SSE2(cpu_flags)) { - c->sub_median_pred = sub_median_pred_sse2; - } -#endif /* HAVE_SSE2_INLINE */ - if (EXTERNAL_SSE2(cpu_flags)) { + c->sub_median_pred = ff_sub_median_pred_sse2; c->diff_bytes = ff_diff_bytes_sse2; } -- 2.49.1 >From 62b1521be0b4700486ed47b77cf0206a0616e319 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 17:25:03 +0100 Subject: [PATCH 8/9] avcodec/x86/lossless_videoencdsp_init: Remove pointless av_unused Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/lossless_videoencdsp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/lossless_videoencdsp_init.c b/libavcodec/x86/lossless_videoencdsp_init.c index b47e23fe42..c0cddf5938 100644 --- a/libavcodec/x86/lossless_videoencdsp_init.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -41,7 +41,7 @@ void ff_sub_left_predict_avx(uint8_t *dst, const uint8_t 
*src, av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) { - av_unused int cpu_flags = av_get_cpu_flags(); + int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_SSE2(cpu_flags)) { c->sub_median_pred = ff_sub_median_pred_sse2; -- 2.49.1 >From c92c7eea37360f3c5498553e2b2628f8932ad111 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 10 Dec 2025 18:06:10 +0100 Subject: [PATCH 9/9] tests/checkasm/llviddspenc: Rename to llvidencdsp Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 2 +- tests/checkasm/checkasm.h | 2 +- tests/checkasm/{llviddspenc.c => llvidencdsp.c} | 2 +- tests/fate/checkasm.mak | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/checkasm/{llviddspenc.c => llvidencdsp.c} (99%) diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 1c34619249..9e675ce189 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -16,7 +16,7 @@ AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o -AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o +AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llvidencdsp.o AVCODECOBJS-$(CONFIG_LPC) += lpc.o AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o AVCODECOBJS-$(CONFIG_MPEGVIDEO) += mpegvideo_unquantize.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 54665c2fad..268e600346 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -207,7 +207,7 @@ static const struct { { "llviddsp", checkasm_check_llviddsp }, #endif #if CONFIG_LLVIDENCDSP - { "llviddspenc", checkasm_check_llviddspenc }, + { "llvidencdsp", checkasm_check_llvidencdsp }, #endif #if CONFIG_LPC { "lpc", checkasm_check_lpc }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 910fc417a7..a54231dd0d 100644 --- a/tests/checkasm/checkasm.h +++ 
b/tests/checkasm/checkasm.h @@ -120,7 +120,7 @@ void checkasm_check_jpeg2000dsp(void); void checkasm_check_llauddsp(void); void checkasm_check_lls(void); void checkasm_check_llviddsp(void); -void checkasm_check_llviddspenc(void); +void checkasm_check_llvidencdsp(void); void checkasm_check_lpc(void); void checkasm_check_motion(void); void checkasm_check_mpegvideo_unquantize(void); diff --git a/tests/checkasm/llviddspenc.c b/tests/checkasm/llvidencdsp.c similarity index 99% rename from tests/checkasm/llviddspenc.c rename to tests/checkasm/llvidencdsp.c index f974e79165..28b405eafc 100644 --- a/tests/checkasm/llviddspenc.c +++ b/tests/checkasm/llvidencdsp.c @@ -135,7 +135,7 @@ static void check_sub_left_pred(LLVidEncDSPContext *c) } } -void checkasm_check_llviddspenc(void) +void checkasm_check_llvidencdsp(void) { LLVidEncDSPContext c; ff_llvidencdsp_init(&c); diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index f26e534591..f6447294c4 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -36,7 +36,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-llauddsp \ fate-checkasm-lls \ fate-checkasm-llviddsp \ - fate-checkasm-llviddspenc \ + fate-checkasm-llvidencdsp \ fate-checkasm-lpc \ fate-checkasm-motion \ fate-checkasm-mpegvideo_unquantize \ -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
