The branch release/5.0 has been updated
       via  fef37af2e2d226e9cab99084c7ddeb89e6e77e35 (commit)
       via  8c8cb15db4eaf8533ccb689cf64a7a8cd8ee48ae (commit)
       via  7ec86238a3c9dcba60beb2a36b1119bcc7129c95 (commit)
      from  3b04d086b417029697fc3a3a3b3f40096b99eb61 (commit)


- Log -----------------------------------------------------------------
commit fef37af2e2d226e9cab99084c7ddeb89e6e77e35
Author:     Bin Peng <[email protected]>
AuthorDate: Fri Oct 24 15:58:08 2025 +0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Tue Nov 4 14:11:13 2025 +0200

    lavc/aarch64: Fix addp overflow in ff_pred16x16_plane_neon_10
    
    The mismatch between neon and C functions can be reproduced
    using the following bitstream and command line on aarch64 devices.
    
    wget https://streams.videolan.org/ffmpeg/incoming/replay_intra_pred_16x16.h264
     ./ffmpeg -cpuflags 0  -threads 1 -i replay_intra_pred_16x16.h264  -f framemd5 -y md5_ref
     ./ffmpeg              -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_neon
    
    Signed-off-by: Bin Peng <[email protected]>
    (cherry picked from commit 3115c0c0e6c27c689a02a7267dcf8e61fa2ac425)

diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index d0999938ef..795d2ce540 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
         mul             v2.8h,  v2.8h,  v0.8h
         mul             v3.8h,  v3.8h,  v0.8h
         addp            v2.8h,  v2.8h,  v3.8h
-        addp            v2.8h,  v2.8h,  v2.8h
-        addp            v2.4h,  v2.4h,  v2.4h
-        sshll           v3.4s,  v2.4h,  #2
-        saddw           v2.4s,  v3.4s,  v2.4h
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #2
+        add             v2.4s,  v3.4s,  v2.4s
         rshrn           v4.4h,  v2.4s,  #6
         trn2            v5.4h,  v4.4h,  v4.4h
         add             v2.4h,  v4.4h,  v5.4h
@@ -506,14 +506,13 @@ function ff_pred16x16_plane_neon_10, export=1
         sxtl            v6.4s,  v5.4h          // c
 
         mov             v0.h[0],  wzr
-        mul             v0.8h,  v0.8h,  v4.h[0]
         dup             v16.4s, v2.s[0]
         dup             v17.4s, v2.s[0]
         dup             v2.8h,  v4.h[0]        // b
         dup             v3.4s,  v6.s[0]        // c
         sshll           v2.4s,  v2.4h,  #3     // b * 8
-        saddw           v16.4s, v16.4s, v0.4h
-        saddw2          v17.4s, v17.4s, v0.8h
+        smlal           v16.4s, v0.4h, v4.h[0]
+        smlal2          v17.4s, v0.8h, v4.h[0]
         sub             v3.4s,  v3.4s,  v2.4s
 
         mov             w3,      #16

commit 8c8cb15db4eaf8533ccb689cf64a7a8cd8ee48ae
Author:     Bin Peng <[email protected]>
AuthorDate: Fri Dec 13 22:19:47 2024 +0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Tue Nov 4 14:11:09 2025 +0200

    lavc/aarch64: Fix ff_pred16x16_plane_neon_10
    
    Fix test failure on aarch64:
    ./tests/checkasm/checkasm --test=h264pred 367840
    
    Signed-off-by: Peng Bin <[email protected]>
    Signed-off-by: Martin Storsjö <[email protected]>
    (cherry picked from commit 72a3656e8468a394373b6397aacc906d7f7794c2)

diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 168f8191ad..d0999938ef 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -502,28 +502,27 @@ function ff_pred16x16_plane_neon_10, export=1
         add             v7.4h,  v7.4h,  v0.4h
         shl             v2.4h,  v7.4h,  #4
         ssubl           v2.4s,  v2.4h,  v3.4h
-        shl             v3.4h,  v4.4h,  #4
         ext             v0.16b, v0.16b, v0.16b, #14
-        ssubl           v6.4s,  v5.4h,  v3.4h
+        sxtl            v6.4s,  v5.4h          // c
 
         mov             v0.h[0],  wzr
         mul             v0.8h,  v0.8h,  v4.h[0]
         dup             v16.4s, v2.s[0]
         dup             v17.4s, v2.s[0]
-        dup             v2.8h,  v4.h[0]
-        dup             v3.4s,  v6.s[0]
-        shl             v2.8h,  v2.8h,  #3
+        dup             v2.8h,  v4.h[0]        // b
+        dup             v3.4s,  v6.s[0]        // c
+        sshll           v2.4s,  v2.4h,  #3     // b * 8
         saddw           v16.4s, v16.4s, v0.4h
         saddw2          v17.4s, v17.4s, v0.8h
-        saddw           v3.4s,  v3.4s,  v2.4h
+        sub             v3.4s,  v3.4s,  v2.4s
 
         mov             w3,      #16
         mvni            v4.8h,   #0xFC, lsl #8 // 1023 for clipping
 1:
         sqshrun         v0.4h,  v16.4s, #5
         sqshrun2        v0.8h,  v17.4s, #5
-        saddw           v16.4s, v16.4s, v2.4h
-        saddw           v17.4s, v17.4s, v2.4h
+        add             v16.4s, v16.4s, v2.4s
+        add             v17.4s, v17.4s, v2.4s
         sqshrun         v1.4h,  v16.4s, #5
         sqshrun2        v1.8h,  v17.4s, #5
         add             v16.4s, v16.4s, v3.4s

commit 7ec86238a3c9dcba60beb2a36b1119bcc7129c95
Author:     Bin Peng <[email protected]>
AuthorDate: Mon Dec 16 10:31:23 2024 +0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Tue Nov 4 14:11:04 2025 +0200

    lavc/aarch64: Fix ff_pred8x8_plane_neon_10
    
    Fix test failure on aarch64:
    ./tests/checkasm/checkasm --test=h264pred 479612
    
    The mismatch between neon and C functions can also be reproduced using the following bitstream and command line.
    
    wget https://streams.videolan.org/ffmpeg/incoming/intra8x8pred_10bit.264
     ./ffmpeg -cpuflags 0  -threads 1 -i intra8x8pred_10bit.264  -f framemd5 -y md5_ref
     ./ffmpeg              -threads 1 -i intra8x8pred_10bit.264  -f framemd5 -y md5_neon
    
    Signed-off-by: Bin Peng <[email protected]>
    Signed-off-by: Martin Storsjö <[email protected]>
    (cherry picked from commit decc9e643cc3ac5537f42b465e2637fbefbf41cc)

diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index ea37689f34..168f8191ad 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -595,12 +595,11 @@ function ff_pred8x8_plane_neon_10, export=1
         ssubl           v2.4s,  v2.4h,  v3.4h
         ext             v0.16b, v0.16b, v0.16b, #14
         mov             v0.h[0],  wzr
-        mul             v0.8h,  v0.8h,  v5.h[0]
         dup             v1.4s,  v2.s[0]
         dup             v2.4s,  v2.s[0]
         dup             v3.8h,  v5.h[1]
-        saddw           v1.4s,  v1.4s,  v0.4h
-        saddw2          v2.4s,  v2.4s,  v0.8h
+        smlal           v1.4s,  v0.4h,  v5.h[0]
+        smlal2          v2.4s,  v0.8h,  v5.h[0]
         mov             w3,  #8
         mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
 1:

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/aarch64/h264pred_neon.S | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to