PR #21581 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21581.patch

Also improve cfhdencdsp a bit.


>From 1c7ff999a850a20a8b3de9f90a986d4a89835b3c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 19:19:39 +0100
Subject: [PATCH 1/6] avcodec/x86/cfhdencdsp: Avoid load of -1

It can be easily generated at runtime.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhdencdsp.asm | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 4aaeb56972..3d4aa90e96 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -31,7 +31,6 @@ pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
 pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
 pd_4:  times 4 dd  4
 pw_n4: times 8 dw -4
-cextern pw_m1
 cextern pw_1
 cextern pw_4
 
@@ -45,7 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, 
istride, lwidth, hwid
     shl   hwidthq, 1
     mova       m7, [pd_4]
     mova       m8, [pw_1]
-    mova       m9, [pw_m1]
+    pcmpeqw    m9, m9       ; -1
     mova       m10,[pw_p1_n1]
     movsxdifnidn yq, yd
     movsxdifnidn widthq, widthd
@@ -207,7 +206,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
 
     mova       m7, [pd_4]
     mova       m8, [pw_1]
-    mova       m9, [pw_m1]
+    pcmpeqw    m9, m9      ; -1
     mova       m10,[pw_p1_n1]
     mova       m11,[pw_n1_p1]
     mova       m12,[pw_4]
-- 
2.52.0


>From 964a58b29677093fe2e195cb7c6fc43234967f22 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 20:16:57 +0100
Subject: [PATCH 2/6] avcodec/x86/cfhdencdsp: Avoid unnecessary constants

Up until now, cfhdencdsp used constants consisting
of -1, 1, ...,-1,1 words and 1, -1,...,1,-1 words
for use as constants in pmaddwd. But one can use
the same constants if one shuffles the words in
a dword in the opposite order. Similarly for some other
constants. This also made it possible to save a register in
cfhdenc_vert_filter.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhdencdsp.asm | 38 +++++++++++++----------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 3d4aa90e96..73e12f283e 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -24,11 +24,8 @@
 SECTION_RODATA
 
 pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1
-pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1
 pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11
-pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11
 pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5
-pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5
 pd_4:  times 4 dd  4
 pw_n4: times 8 dw -4
 cextern pw_1
@@ -44,7 +41,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, 
istride, lwidth, hwid
     shl   hwidthq, 1
     mova       m7, [pd_4]
     mova       m8, [pw_1]
-    pcmpeqw    m9, m9       ; -1
+    pcmpeqw        m9, m9       ; -1
     mova       m10,[pw_p1_n1]
     movsxdifnidn yq, yd
     movsxdifnidn widthq, widthd
@@ -196,7 +193,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, 
istride, lwidth, hwid
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, 
hwidth, width, height, x, y, pos
+cglobal cfhdenc_vert_filter, 8, 11, 13, input, low, high, istride, lwidth, 
hwidth, width, height, x, y, pos
     shl  istrideq, 1
 
     shl    widthd, 1
@@ -208,9 +205,8 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     mova       m8, [pw_1]
     pcmpeqw    m9, m9      ; -1
     mova       m10,[pw_p1_n1]
-    mova       m11,[pw_n1_p1]
-    mova       m12,[pw_4]
-    mova       m13,[pw_n4]
+    mova      m11, [pw_4]
+    mova      m12, [pw_n4]
 .loopw:
     mov        yq, 2
 
@@ -237,9 +233,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     add      posq, istrideq
     movu       m5, [inputq + posq]
 
-    mova       m6, m0
-    punpcklwd  m0, m1
-    punpckhwd  m1, m6
+    SBUTTERFLY wd, 0, 1, 6
 
     mova       m6, m2
     punpcklwd  m2, m3
@@ -250,9 +244,9 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     punpckhwd  m5, m6
 
     pmaddwd    m0, [pw_p5_n11]
-    pmaddwd    m1, [pw_n11_p5]
-    pmaddwd    m2, m12
-    pmaddwd    m3, m12
+    pmaddwd    m1, [pw_p5_n11]
+    pmaddwd    m2, m11
+    pmaddwd    m3, m11
     pmaddwd    m4, m9
     pmaddwd    m5, m9
 
@@ -313,9 +307,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     punpcklwd  m0, m1
     punpckhwd  m1, m6
 
-    mova       m6, m2
-    punpcklwd  m2, m3
-    punpckhwd  m3, m6
+    SBUTTERFLY wd, 2, 3, 6
 
     mova       m6, m4
     punpcklwd  m4, m5
@@ -324,7 +316,7 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     pmaddwd    m0, m9
     pmaddwd    m1, m9
     pmaddwd    m2, m10
-    pmaddwd    m3, m11
+    pmaddwd    m3, m10
     pmaddwd    m4, m8
     pmaddwd    m5, m8
 
@@ -394,16 +386,14 @@ cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, 
istride, lwidth, hwidt
     punpcklwd  m2, m3
     punpckhwd  m3, m6
 
-    mova       m6, m4
-    punpcklwd  m4, m5
-    punpckhwd  m5, m6
+    SBUTTERFLY wd, 4, 5, 6
 
     pmaddwd    m0, m8
     pmaddwd    m1, m8
-    pmaddwd    m2, m13
-    pmaddwd    m3, m13
+    pmaddwd    m2, m12
+    pmaddwd    m3, m12
     pmaddwd    m4, [pw_p11_n5]
-    pmaddwd    m5, [pw_n5_p11]
+    pmaddwd    m5, [pw_p11_n5]
 
     paddd      m4, m2
     paddd      m5, m3
-- 
2.52.0


>From 3752f2d0fac7b3212ee2d1278c501ba5e8433f9d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 21:04:21 +0100
Subject: [PATCH 3/6] avcodec/x86/cfhdencdsp: Don't load twice

Sign extend the integer arguments directly from the stack
instead of loading qwords, followed by sign-extending the
lower half.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhdencdsp.asm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 73e12f283e..83676cea81 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -35,7 +35,8 @@ SECTION .text
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, 
hwidth, width, y, x, temp
+cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, 
hwidth, width, y, x, temp
+    movsxdifnidn widthq, widthm
     shl  istrideq, 1
     shl   lwidthq, 1
     shl   hwidthq, 1
@@ -43,8 +44,7 @@ cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, 
istride, lwidth, hwid
     mova       m8, [pw_1]
     pcmpeqw        m9, m9       ; -1
     mova       m10,[pw_p1_n1]
-    movsxdifnidn yq, yd
-    movsxdifnidn widthq, widthd
+    movsxdifnidn   yq, ym
     neg        yq
 .looph:
     movsx          xq, word [inputq]
-- 
2.52.0


>From 4dc24d619dec7088a5f28267deb771d19bd066b6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 21:29:33 +0100
Subject: [PATCH 4/6] avcodec/x86/cfhdencdsp: Avoid += x, -= x

Avoid incrementing lowq and highq inside the loop by using
complex addressing modes; this avoids having to undo said
modifications at the end of the horizontal loop.
For inputq, modify istrideq outside of the loop so that
it is only modified once at the end of the horizontal loop.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhdencdsp.asm | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
index 83676cea81..f2da8720b4 100644
--- a/libavcodec/x86/cfhdencdsp.asm
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -37,12 +37,13 @@ SECTION .text
 INIT_XMM sse2
 cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, istride, lwidth, 
hwidth, width, y, x, temp
     movsxdifnidn widthq, widthm
-    shl  istrideq, 1
     shl   lwidthq, 1
     shl   hwidthq, 1
     mova       m7, [pd_4]
     mova       m8, [pw_1]
     pcmpeqw        m9, m9       ; -1
+    sub      istrideq, widthq
+    shl      istrideq, 1
     mova       m10,[pw_p1_n1]
     movsxdifnidn   yq, ym
     neg        yq
@@ -136,8 +137,6 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, 
istride, lwidth, hwid
     cmp            xq, widthq
     jl .loopw
 
-    add          lowq, widthq
-    add         highq, widthq
     lea        inputq, [inputq + widthq * 2]
 
     movsx          xq, word [inputq - 4]
@@ -147,7 +146,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, 
istride, lwidth, hwid
     movd          xm0, tempd
     packssdw       m0, m0
     movd        tempd, m0
-    mov word [lowq-2], tempw
+    mov word [lowq+widthq-2], tempw
 
     movsx       tempq, word [inputq - 4]
     imul        tempq, 11
@@ -175,12 +174,7 @@ cglobal cfhdenc_horiz_filter, 6, 10, 11, input, low, high, 
istride, lwidth, hwid
     movd          xm0, tempd
     packssdw       m0, m0
     movd        tempd, m0
-    mov word [highq-2], tempw
-
-    sub        inputq, widthq
-    sub        inputq, widthq
-    sub         highq, widthq
-    sub          lowq, widthq
+    mov word [highq+widthq-2], tempw
 
     add          lowq, lwidthq
     add         highq, hwidthq
-- 
2.52.0


>From 79ef1a5651f3f4937974140717068b68dc6d51bb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 26 Jan 2026 03:00:39 +0100
Subject: [PATCH 5/6] avcodec/x86/cfhddsp: Avoid pmaddwd

The result of using pmaddwd with the coefficients 1,-1,...,1,-1
is just the negative of using pmaddwd with the coefficients
-1,1,...,-1,1, so avoid one pmaddwd.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhddsp.asm | 43 +++++++++++++-------------------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
index 87c2df634a..821d511ba2 100644
--- a/libavcodec/x86/cfhddsp.asm
+++ b/libavcodec/x86/cfhddsp.asm
@@ -24,7 +24,6 @@
 SECTION_RODATA
 
 factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
-factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
 factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4,
 factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4,
 pd_4: times 4 dd 4
@@ -80,7 +79,6 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, 
temp, width, height
 
 %if ARCH_X86_64
     mova       m8, [factor_p1_n1]
-    mova       m9, [factor_n1_p1]
     mova      m10, [pw_1]
     mova      m11, [pd_4]
 %endif
@@ -144,29 +142,23 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, 
high, temp, width, height
     punpcklwd      m4, m1
     punpckhwd      m5, m1
 
-    mova           m6, m4
-    mova           m7, m5
-
 %if ARCH_X86_64
     pmaddwd        m4, m8
     pmaddwd        m5, m8
-    pmaddwd        m6, m9
-    pmaddwd        m7, m9
 
+    psubd          m6, m11, m4
+    psubd          m7, m11, m5
     paddd          m4, m11
     paddd          m5, m11
-    paddd          m6, m11
-    paddd          m7, m11
 %else
+    mova           m2, [pd_4]
     pmaddwd        m4, [factor_p1_n1]
     pmaddwd        m5, [factor_p1_n1]
-    pmaddwd        m6, [factor_n1_p1]
-    pmaddwd        m7, [factor_n1_p1]
 
-    paddd          m4, [pd_4]
-    paddd          m5, [pd_4]
-    paddd          m6, [pd_4]
-    paddd          m7, [pd_4]
+    psubd          m6, m2, m4
+    psubd          m7, m2, m5
+    paddd          m4, m2
+    paddd          m5, m2
 %endif
 
     psrad          m4, 3
@@ -313,7 +305,6 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, 
lwidth, high, hwidth,
     dec   heightd
 
     mova       m8, [factor_p1_n1]
-    mova       m9, [factor_n1_p1]
     mova      m10, [pw_1]
     mova      m11, [pd_4]
     mova      m12, [factor_p11_n4]
@@ -471,29 +462,23 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, 
high, pos, width, height
     punpcklwd  m4, m1
     punpckhwd  m5, m1
 
-    mova       m6, m4
-    mova       m7, m5
-
 %if ARCH_X86_64
     pmaddwd    m4, m8
     pmaddwd    m5, m8
-    pmaddwd    m6, m9
-    pmaddwd    m7, m9
 
+    psubd      m6, m11, m4
+    psubd      m7, m11, m5
     paddd      m4, m11
     paddd      m5, m11
-    paddd      m6, m11
-    paddd      m7, m11
 %else
+    mova       m2, [pd_4]
     pmaddwd    m4, [factor_p1_n1]
     pmaddwd    m5, [factor_p1_n1]
-    pmaddwd    m6, [factor_n1_p1]
-    pmaddwd    m7, [factor_n1_p1]
 
-    paddd      m4, [pd_4]
-    paddd      m5, [pd_4]
-    paddd      m6, [pd_4]
-    paddd      m7, [pd_4]
+    psubd      m6, m2, m4
+    psubd      m7, m2, m5
+    paddd      m4, m2
+    paddd      m5, m2
 %endif
 
     psrad      m4, 3
-- 
2.52.0


>From 506f57782f9595e56fe5f32de495b884fdd9ffab Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 26 Jan 2026 03:21:48 +0100
Subject: [PATCH 6/6] avcodec/x86/cfhddsp: Reduce number of xmm registers used

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/cfhddsp.asm | 68 +++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
index 821d511ba2..01ba00f8a6 100644
--- a/libavcodec/x86/cfhddsp.asm
+++ b/libavcodec/x86/cfhddsp.asm
@@ -36,20 +36,20 @@ SECTION .text
 
 %macro CFHD_HORIZ_FILTER 1
 %if %1 == 1023
-cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, 
high, width, x, temp
+cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 3 * ARCH_X86_64, output, low, 
high, width, x, temp
     shl        widthd, 1
 %define ostrideq widthq
 %define lwidthq  widthq
 %define hwidthq  widthq
 %elif %1 == 4095
-cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, 
high, width, x, temp
+cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 3 * ARCH_X86_64, output, low, 
high, width, x, temp
     shl        widthd, 1
 %define ostrideq widthq
 %define lwidthq  widthq
 %define hwidthq  widthq
 %else
 %if ARCH_X86_64
-cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, 
hwidth, width, height, x, y, temp
+cglobal cfhd_horiz_filter, 8, 11, 11, output, ostride, low, lwidth, high, 
hwidth, width, height, x, y, temp
     shl  ostrided, 1
     shl   lwidthd, 1
     shl   hwidthd, 1
@@ -79,8 +79,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, 
temp, width, height
 
 %if ARCH_X86_64
     mova       m8, [factor_p1_n1]
-    mova      m10, [pw_1]
-    mova      m11, [pd_4]
+    mova       m9, [pw_1]
+    mova      m10, [pd_4]
 %endif
 
 %if %1 == 0
@@ -146,10 +146,10 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, 
high, temp, width, height
     pmaddwd        m4, m8
     pmaddwd        m5, m8
 
-    psubd          m6, m11, m4
-    psubd          m7, m11, m5
-    paddd          m4, m11
-    paddd          m5, m11
+    psubd          m6, m10, m4
+    psubd          m7, m10, m5
+    paddd          m4, m10
+    paddd          m5, m10
 %else
     mova           m2, [pd_4]
     pmaddwd        m4, [factor_p1_n1]
@@ -177,8 +177,8 @@ cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, 
high, temp, width, height
     mova           m3, m0
 
 %if ARCH_X86_64
-    pmaddwd        m2, m10
-    pmaddwd        m0, m10
+    pmaddwd        m2, m9
+    pmaddwd        m0, m9
     pmaddwd        m1, m8
     pmaddwd        m3, m8
 %else
@@ -296,7 +296,7 @@ CFHD_HORIZ_FILTER 4095
 
 INIT_XMM sse2
 %if ARCH_X86_64
-cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, 
hwidth, width, height, x, y, pos
+cglobal cfhd_vert_filter, 8, 11, 13, output, ostride, low, lwidth, high, 
hwidth, width, height, x, y, pos
     shl        ostrided, 1
     shl         lwidthd, 1
     shl         hwidthd, 1
@@ -305,10 +305,10 @@ cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, 
low, lwidth, high, hwidth,
     dec   heightd
 
     mova       m8, [factor_p1_n1]
-    mova      m10, [pw_1]
-    mova      m11, [pd_4]
-    mova      m12, [factor_p11_n4]
-    mova      m13, [factor_p5_p4]
+    mova       m9, [pw_1]
+    mova      m10, [pd_4]
+    mova      m11, [factor_p11_n4]
+    mova      m12, [factor_p5_p4]
 %else
 cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
     shl        xd, 1
@@ -344,8 +344,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     punpckhwd  m2, m1
 
 %if ARCH_X86_64
-    pmaddwd    m0, m12
-    pmaddwd    m2, m12
+    pmaddwd    m0, m11
+    pmaddwd    m2, m11
 %else
     pmaddwd    m0, [factor_p11_n4]
     pmaddwd    m2, [factor_p11_n4]
@@ -398,8 +398,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     punpckhwd  m2, m1
 
 %if ARCH_X86_64
-    pmaddwd    m0, m13
-    pmaddwd    m2, m13
+    pmaddwd    m0, m12
+    pmaddwd    m2, m12
 %else
     pmaddwd    m0, [factor_p5_p4]
     pmaddwd    m2, [factor_p5_p4]
@@ -466,10 +466,10 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, 
high, pos, width, height
     pmaddwd    m4, m8
     pmaddwd    m5, m8
 
-    psubd      m6, m11, m4
-    psubd      m7, m11, m5
-    paddd      m4, m11
-    paddd      m5, m11
+    psubd      m6, m10, m4
+    psubd      m7, m10, m5
+    paddd      m4, m10
+    paddd      m5, m10
 %else
     mova       m2, [pd_4]
     pmaddwd    m4, [factor_p1_n1]
@@ -502,8 +502,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     mova       m3, m2
 
 %if ARCH_X86_64
-    pmaddwd    m0, m10
-    pmaddwd    m2, m10
+    pmaddwd    m0, m9
+    pmaddwd    m2, m9
     pmaddwd    m1, m8
     pmaddwd    m3, m8
 %else
@@ -550,8 +550,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     punpckhwd  m2, m1
 
 %if ARCH_X86_64
-    pmaddwd    m0, m13
-    pmaddwd    m2, m13
+    pmaddwd    m0, m12
+    pmaddwd    m2, m12
 %else
     pmaddwd    m0, [factor_p5_p4]
     pmaddwd    m2, [factor_p5_p4]
@@ -571,8 +571,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     psubd      m2, m3
 
 %if ARCH_X86_64
-    paddd      m0, m11
-    paddd      m2, m11
+    paddd      m0, m10
+    paddd      m2, m10
 %else
     paddd      m0, [pd_4]
     paddd      m2, [pd_4]
@@ -618,8 +618,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     punpckhwd  m2, m1
 
 %if ARCH_X86_64
-    pmaddwd    m0, m12
-    pmaddwd    m2, m12
+    pmaddwd    m0, m11
+    pmaddwd    m2, m11
 %else
     pmaddwd    m0, [factor_p11_n4]
     pmaddwd    m2, [factor_p11_n4]
@@ -639,8 +639,8 @@ cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, 
pos, width, height
     paddd      m2, m3
 
 %if ARCH_X86_64
-    paddd      m0, m11
-    paddd      m2, m11
+    paddd      m0, m10
+    paddd      m2, m10
 %else
     paddd      m0, [pd_4]
     paddd      m2, [pd_4]
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to