PR #21581 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581.patch
Also improve cfhdencdsp a bit. >From 1c7ff999a850a20a8b3de9f90a986d4a89835b3c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 25 Jan 2026 19:19:39 +0100 Subject: [PATCH 1/6] avcodec/x86/cfhdencdsp: Avoid load of -1 It can be easily generated at runtime. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhdencdsp.asm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm index 4aaeb56972..3d4aa90e96 100644 --- a/libavcodec/x86/cfhdencdsp.asm +++ b/libavcodec/x86/cfhdencdsp.asm @@ -31,7 +31,6 @@ pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 pd_4: times 4 dd 4 pw_n4: times 8 dw -4 -cextern pw_m1 cextern pw_1 cextern pw_4 @@ -45,7 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid shl hwidthq, 1 mova m7, [pd_4] mova m8, [pw_1] - mova m9, [pw_m1] + pcmpeqw m9, m9 ; -1 mova m10,[pw_p1_n1] movsxdifnidn yq, yd movsxdifnidn widthq, widthd @@ -207,7 +206,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt mova m7, [pd_4] mova m8, [pw_1] - mova m9, [pw_m1] + pcmpeqw m9, m9 ; -1 mova m10,[pw_p1_n1] mova m11,[pw_n1_p1] mova m12,[pw_4] -- 2.52.0 >From 964a58b29677093fe2e195cb7c6fc43234967f22 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 25 Jan 2026 20:16:57 +0100 Subject: [PATCH 2/6] avcodec/x86/cfhdencdsp: Avoid unnecessary constants Up until now, cfhdencdsp used constants consisting of -1, 1, ...,-1,1 words and 1, -1,...,1,-1 words for use as constants in pmaddwd. But one can use the same constants if one shuffles the words in a dword in the opposite order. Similarly for some other constants. This also made it possible to avoid a register in cfhdenc_vert_filter. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhdencdsp.asm | 38 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm index 3d4aa90e96..73e12f283e 100644 --- a/libavcodec/x86/cfhdencdsp.asm +++ b/libavcodec/x86/cfhdencdsp.asm @@ -24,11 +24,8 @@ SECTION_RODATA pw_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1 -pw_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1 pw_p5_n11: dw 5, -11, 5, -11, 5, -11, 5, -11 -pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 -pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 pd_4: times 4 dd 4 pw_n4: times 8 dw -4 cextern pw_1 @@ -44,7 +41,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid shl hwidthq, 1 mova m7, [pd_4] mova m8, [pw_1] - pcmpeqw m9, m9 ; -1 + pcmpeqw m9, m9 ; -1 mova m10,[pw_p1_n1] movsxdifnidn yq, yd movsxdifnidn widthq, widthd @@ -196,7 +193,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid %if ARCH_X86_64 INIT_XMM sse2 -cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos +cglobal cfhdenc_vert_filter, 8, 11, 13, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos shl istrideq, 1 shl widthd, 1 @@ -208,9 +205,8 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt mova m8, [pw_1] pcmpeqw m9, m9 ; -1 mova m10,[pw_p1_n1] - mova m11,[pw_n1_p1] - mova m12,[pw_4] - mova m13,[pw_n4] + mova m11, [pw_4] + mova m12, [pw_n4] .loopw: mov yq, 2 @@ -237,9 +233,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt add posq, istrideq movu m5, [inputq + posq] - mova m6, m0 - punpcklwd m0, m1 - punpckhwd m1, m6 + SBUTTERFLY wd, 0, 1, 6 mova m6, m2 punpcklwd m2, m3 @@ -250,9 +244,9 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt punpckhwd m5, m6 pmaddwd 
m0, [pw_p5_n11] - pmaddwd m1, [pw_n11_p5] - pmaddwd m2, m12 - pmaddwd m3, m12 + pmaddwd m1, [pw_p5_n11] + pmaddwd m2, m11 + pmaddwd m3, m11 pmaddwd m4, m9 pmaddwd m5, m9 @@ -313,9 +307,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt punpcklwd m0, m1 punpckhwd m1, m6 - mova m6, m2 - punpcklwd m2, m3 - punpckhwd m3, m6 + SBUTTERFLY wd, 2, 3, 6 mova m6, m4 punpcklwd m4, m5 @@ -324,7 +316,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt pmaddwd m0, m9 pmaddwd m1, m9 pmaddwd m2, m10 - pmaddwd m3, m11 + pmaddwd m3, m10 pmaddwd m4, m8 pmaddwd m5, m8 @@ -394,16 +386,14 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidt punpcklwd m2, m3 punpckhwd m3, m6 - mova m6, m4 - punpcklwd m4, m5 - punpckhwd m5, m6 + SBUTTERFLY wd, 4, 5, 6 pmaddwd m0, m8 pmaddwd m1, m8 - pmaddwd m2, m13 - pmaddwd m3, m13 + pmaddwd m2, m12 + pmaddwd m3, m12 pmaddwd m4, [pw_p11_n5] - pmaddwd m5, [pw_n5_p11] + pmaddwd m5, [pw_p11_n5] paddd m4, m2 paddd m5, m3 -- 2.52.0 >From 3752f2d0fac7b3212ee2d1278c501ba5e8433f9d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 25 Jan 2026 21:04:21 +0100 Subject: [PATCH 3/6] avcodec/x86/cfhdencdsp: Don't load twice Sign extend the integer arguments directly from the stack instead of loading qwords, followed by sign-extending the lower half. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhdencdsp.asm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm index 73e12f283e..83676cea81 100644 --- a/libavcodec/x86/cfhdencdsp.asm +++ b/libavcodec/x86/cfhdencdsp.asm @@ -35,7 +35,8 @@ SECTION .text %if ARCH_X86_64 INIT_XMM sse2 -cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp +cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp + movsxdifnidn widthq, widthm shl istrideq, 1 shl lwidthq, 1 shl hwidthq, 1 @@ -43,8 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwid mova m8, [pw_1] pcmpeqw m9, m9 ; -1 mova m10,[pw_p1_n1] - movsxdifnidn yq, yd - movsxdifnidn widthq, widthd + movsxdifnidn yq, ym neg yq .looph: movsx xq, word [inputq] -- 2.52.0 >From 4dc24d619dec7088a5f28267deb771d19bd066b6 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 25 Jan 2026 21:29:33 +0100 Subject: [PATCH 4/6] avcodec/x86/cfhdencdsp: Avoid += x, -= x Avoid incrementing lowq and highq inside the loop by using complex addressing modes, which avoids having to undo said modification at the end of the horizontal loop. For inputq, modify istrideq outside of the loop so that it is only modified once at the end of the horizontal loop. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhdencdsp.asm | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm index 83676cea81..f2da8720b4 100644 --- a/libavcodec/x86/cfhdencdsp.asm +++ b/libavcodec/x86/cfhdencdsp.asm @@ -37,12 +37,13 @@ SECTION .text INIT_XMM sse2 cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp movsxdifnidn widthq, widthm - shl istrideq, 1 shl lwidthq, 1 shl hwidthq, 1 mova m7, [pd_4] mova m8, [pw_1] pcmpeqw m9, m9 ; -1 + sub istrideq, widthq + shl istrideq, 1 mova m10,[pw_p1_n1] movsxdifnidn yq, ym neg yq @@ -136,8 +137,6 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid cmp xq, widthq jl .loopw - add lowq, widthq - add highq, widthq lea inputq, [inputq + widthq * 2] movsx xq, word [inputq - 4] @@ -147,7 +146,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid movd xm0, tempd packssdw m0, m0 movd tempd, m0 - mov word [lowq-2], tempw + mov word [lowq+widthq-2], tempw movsx tempq, word [inputq - 4] imul tempq, 11 @@ -175,12 +174,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, hwid movd xm0, tempd packssdw m0, m0 movd tempd, m0 - mov word [highq-2], tempw - - sub inputq, widthq - sub inputq, widthq - sub highq, widthq - sub lowq, widthq + mov word [highq+widthq-2], tempw add lowq, lwidthq add highq, hwidthq -- 2.52.0 >From 79ef1a5651f3f4937974140717068b68dc6d51bb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 26 Jan 2026 03:00:39 +0100 Subject: [PATCH 5/6] avcodec/x86/cfhddsp: Avoid pmaddwd The result of using pmaddwd with the coefficients 1,-1,...,1,-1 is just the negative of using pmaddwd with the coefficients -1,1,...,-1,1, so avoid one pmaddwd. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhddsp.asm | 43 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm index 87c2df634a..821d511ba2 100644 --- a/libavcodec/x86/cfhddsp.asm +++ b/libavcodec/x86/cfhddsp.asm @@ -24,7 +24,6 @@ SECTION_RODATA factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1, -factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1, factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4, factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4, pd_4: times 4 dd 4 @@ -80,7 +79,6 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height %if ARCH_X86_64 mova m8, [factor_p1_n1] - mova m9, [factor_n1_p1] mova m10, [pw_1] mova m11, [pd_4] %endif @@ -144,29 +142,23 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height punpcklwd m4, m1 punpckhwd m5, m1 - mova m6, m4 - mova m7, m5 - %if ARCH_X86_64 pmaddwd m4, m8 pmaddwd m5, m8 - pmaddwd m6, m9 - pmaddwd m7, m9 + psubd m6, m11, m4 + psubd m7, m11, m5 paddd m4, m11 paddd m5, m11 - paddd m6, m11 - paddd m7, m11 %else + mova m2, [pd_4] pmaddwd m4, [factor_p1_n1] pmaddwd m5, [factor_p1_n1] - pmaddwd m6, [factor_n1_p1] - pmaddwd m7, [factor_n1_p1] - paddd m4, [pd_4] - paddd m5, [pd_4] - paddd m6, [pd_4] - paddd m7, [pd_4] + psubd m6, m2, m4 + psubd m7, m2, m5 + paddd m4, m2 + paddd m5, m2 %endif psrad m4, 3 @@ -313,7 +305,6 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, dec heightd mova m8, [factor_p1_n1] - mova m9, [factor_n1_p1] mova m10, [pw_1] mova m11, [pd_4] mova m12, [factor_p11_n4] @@ -471,29 +462,23 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height punpcklwd m4, m1 punpckhwd m5, m1 - mova m6, m4 - mova m7, m5 - %if ARCH_X86_64 pmaddwd m4, m8 pmaddwd m5, m8 - pmaddwd m6, m9 - pmaddwd m7, m9 + psubd m6, m11, m4 + psubd m7, m11, m5 paddd m4, m11 paddd m5, m11 - paddd m6, m11 - paddd m7, 
m11 %else + mova m2, [pd_4] pmaddwd m4, [factor_p1_n1] pmaddwd m5, [factor_p1_n1] - pmaddwd m6, [factor_n1_p1] - pmaddwd m7, [factor_n1_p1] - paddd m4, [pd_4] - paddd m5, [pd_4] - paddd m6, [pd_4] - paddd m7, [pd_4] + psubd m6, m2, m4 + psubd m7, m2, m5 + paddd m4, m2 + paddd m5, m2 %endif psrad m4, 3 -- 2.52.0 >From 506f57782f9595e56fe5f32de495b884fdd9ffab Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 26 Jan 2026 03:21:48 +0100 Subject: [PATCH 6/6] avcodec/x86/cfhddsp: Reduce number of xmm registers used Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/cfhddsp.asm | 68 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm index 821d511ba2..01ba00f8a6 100644 --- a/libavcodec/x86/cfhddsp.asm +++ b/libavcodec/x86/cfhddsp.asm @@ -36,20 +36,20 @@ SECTION .text %macro CFHD_HORIZ_FILTER 1 %if %1 == 1023 -cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp +cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 3 * ARCH_X86_64, output, low, high, width, x, temp shl widthd, 1 %define ostrideq widthq %define lwidthq widthq %define hwidthq widthq %elif %1 == 4095 -cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp +cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 3 * ARCH_X86_64, output, low, high, width, x, temp shl widthd, 1 %define ostrideq widthq %define lwidthq widthq %define hwidthq widthq %else %if ARCH_X86_64 -cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp +cglobal cfhd_horiz_filter, 8, 11, 11, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp shl ostrided, 1 shl lwidthd, 1 shl hwidthd, 1 @@ -79,8 +79,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height %if ARCH_X86_64 mova m8, [factor_p1_n1] - mova m10, [pw_1] - mova m11, 
[pd_4] + mova m9, [pw_1] + mova m10, [pd_4] %endif %if %1 == 0 @@ -146,10 +146,10 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height pmaddwd m4, m8 pmaddwd m5, m8 - psubd m6, m11, m4 - psubd m7, m11, m5 - paddd m4, m11 - paddd m5, m11 + psubd m6, m10, m4 + psubd m7, m10, m5 + paddd m4, m10 + paddd m5, m10 %else mova m2, [pd_4] pmaddwd m4, [factor_p1_n1] @@ -177,8 +177,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height mova m3, m0 %if ARCH_X86_64 - pmaddwd m2, m10 - pmaddwd m0, m10 + pmaddwd m2, m9 + pmaddwd m0, m9 pmaddwd m1, m8 pmaddwd m3, m8 %else @@ -296,7 +296,7 @@ CFHD_HORIZ_FILTER 4095 INIT_XMM sse2 %if ARCH_X86_64 -cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos +cglobal cfhd_vert_filter, 8, 11, 13, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos shl ostrided, 1 shl lwidthd, 1 shl hwidthd, 1 @@ -305,10 +305,10 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, dec heightd mova m8, [factor_p1_n1] - mova m10, [pw_1] - mova m11, [pd_4] - mova m12, [factor_p11_n4] - mova m13, [factor_p5_p4] + mova m9, [pw_1] + mova m10, [pd_4] + mova m11, [factor_p11_n4] + mova m12, [factor_p5_p4] %else cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height shl xd, 1 @@ -344,8 +344,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height punpckhwd m2, m1 %if ARCH_X86_64 - pmaddwd m0, m12 - pmaddwd m2, m12 + pmaddwd m0, m11 + pmaddwd m2, m11 %else pmaddwd m0, [factor_p11_n4] pmaddwd m2, [factor_p11_n4] @@ -398,8 +398,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height punpckhwd m2, m1 %if ARCH_X86_64 - pmaddwd m0, m13 - pmaddwd m2, m13 + pmaddwd m0, m12 + pmaddwd m2, m12 %else pmaddwd m0, [factor_p5_p4] pmaddwd m2, [factor_p5_p4] @@ -466,10 +466,10 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, 
height pmaddwd m4, m8 pmaddwd m5, m8 - psubd m6, m11, m4 - psubd m7, m11, m5 - paddd m4, m11 - paddd m5, m11 + psubd m6, m10, m4 + psubd m7, m10, m5 + paddd m4, m10 + paddd m5, m10 %else mova m2, [pd_4] pmaddwd m4, [factor_p1_n1] @@ -502,8 +502,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height mova m3, m2 %if ARCH_X86_64 - pmaddwd m0, m10 - pmaddwd m2, m10 + pmaddwd m0, m9 + pmaddwd m2, m9 pmaddwd m1, m8 pmaddwd m3, m8 %else @@ -550,8 +550,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height punpckhwd m2, m1 %if ARCH_X86_64 - pmaddwd m0, m13 - pmaddwd m2, m13 + pmaddwd m0, m12 + pmaddwd m2, m12 %else pmaddwd m0, [factor_p5_p4] pmaddwd m2, [factor_p5_p4] @@ -571,8 +571,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height psubd m2, m3 %if ARCH_X86_64 - paddd m0, m11 - paddd m2, m11 + paddd m0, m10 + paddd m2, m10 %else paddd m0, [pd_4] paddd m2, [pd_4] @@ -618,8 +618,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height punpckhwd m2, m1 %if ARCH_X86_64 - pmaddwd m0, m12 - pmaddwd m2, m12 + pmaddwd m0, m11 + pmaddwd m2, m11 %else pmaddwd m0, [factor_p11_n4] pmaddwd m2, [factor_p11_n4] @@ -639,8 +639,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height paddd m2, m3 %if ARCH_X86_64 - paddd m0, m11 - paddd m2, m11 + paddd m0, m10 + paddd m2, m10 %else paddd m0, [pd_4] paddd m2, [pd_4] -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
