PR #23347 opened by Martin Storsjö (mstorsjo)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23347
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23347.patch

For cases when returning early without updating any pixels, we
previously returned to return address in the caller's scope,
bypassing one function entirely. While this may seem like a neat
optimization, it makes the return stack predictor mispredict
the returns - which potentially can cost more performance than
it gains.

Secondly, if the armv9.3 feature GCS (Guarded Control Stack) is
enabled, then returns _must_ match the expected value; this feature
is being enabled across linux distributions, and by fixing the
hevc assembly, we can enable the security feature on ffmpeg as well.

This matches the fix that was done for hevcdsp in
1f7ed8a78de1da743a359913ce05cc258a400b5d.

For unknown reasons, QEMU didn't notice this issue when running
checkasm, only if running other FATE tests. Since the switch
to a new checkasm implementation, QEMU does seem to pick up
on the GCS mismatch in these functions as well.

Since 846746be4b8edd66be8be2c123b0072de2636e9d we do
signal that our code was GCS compliant, which means that builds
of such versions (including version 8.1.1 where the GCS enabling
was backported) will fail in the future when run on actual GCS
supporting hardware, until this fix is applied.



From 90fab21acbdc240c113a218bdb3a982504ecccbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <[email protected]>
Date: Thu, 4 Jun 2026 23:41:11 +0300
Subject: [PATCH] aarch64: vp9lpf: Fix GCS violations

For cases when returning early without updating any pixels, we
previously returned to return address in the caller's scope,
bypassing one function entirely. While this may seem like a neat
optimization, it makes the return stack predictor mispredict
the returns - which potentially can cost more performance than
it gains.

Secondly, if the armv9.3 feature GCS (Guarded Control Stack) is
enabled, then returns _must_ match the expected value; this feature
is being enabled across linux distributions, and by fixing the
hevc assembly, we can enable the security feature on ffmpeg as well.

This matches the fix that was done for hevcdsp in
1f7ed8a78de1da743a359913ce05cc258a400b5d.

For unknown reasons, QEMU didn't notice this issue when running
checkasm, only if running other FATE tests. Since the switch
to a new checkasm implementation, QEMU does seem to pick up
on the GCS mismatch in these functions as well.

Since 846746be4b8edd66be8be2c123b0072de2636e9d we do
signal that our code was GCS compliant, which means that builds
of such versions (including version 8.1.1 where the GCS enabling
was backported) will fail in the future when run on actual GCS
supporting hardware, until this fix is applied.
---
 libavcodec/aarch64/vp9lpf_16bpp_neon.S |  65 ++++++++------
 libavcodec/aarch64/vp9lpf_neon.S       | 112 +++++++++++++++----------
 2 files changed, 106 insertions(+), 71 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S 
b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index e3e70491c6..3defd633e7 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -56,9 +56,7 @@
         mov             x11, v4.d[0]
         mov             x12, v4.d[1]
         adds            x11, x11, x12
-        b.ne            1f
-        ret             x10
-1:
+        b.eq            9f
 
 .if \wd >= 8
         dup             v0.8h,  w5
@@ -189,13 +187,7 @@
         // If no pixels need flat8in, jump to flat8out
         // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-.if \wd == 16
         b.eq            6f
-.else
-        b.ne            1f
-        ret             x13
-1:
-.endif
 
         // flat8in
         add             \tmp1\().8h, v20.8h, v21.8h
@@ -249,20 +241,16 @@
         mov             x11, v2.d[0]
         mov             x12, v2.d[1]
         adds            x11, x11, x12
-        b.ne            1f
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        ret             x14
-1:
+        b.eq            7f
 
         mov             x11, v7.d[0]
         mov             x12, v7.d[1]
         adds            x11, x11, x12
-        b.ne            1f
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        ret             x15
+        b.eq            8f
 
-1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is 
the input
@@ -378,35 +366,60 @@
 // while we need those for inputs/outputs in wd=16 and use v8-v15
 // for temp registers there instead.
 function vp9_loop_filter_4
+        mov             x17, #0
         loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
         ret
+9:
+        mov             x17, #1
+        ret
 endfunc
 
 function vp9_loop_filter_8
+        mov             x17, #0
+        mov             x13, #0
         loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
         ret
+6:
+        mov             x13, #1
+        ret
+9:
+        mov             x17, #1
+        ret
 endfunc
 
 function vp9_loop_filter_16
+        mov             x17, #0
+        mov             x14, #0
+        mov             x15, #0
         loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x14, #1
+        ret
+8:
+        mov             x15, #1
+        ret
+9:
+        mov             x17, #1
+        ret
 endfunc
 
 .macro loop_filter_4
         bl              vp9_loop_filter_4
+        cbnz            x17, 9f
 .endm
 
 .macro loop_filter_8
-        // calculate alternative 'return' targets
-        adr             x13, 6f
         bl              vp9_loop_filter_8
+        cbnz            x17, 9f
+        cbnz            x13, 6f
 .endm
 
 .macro loop_filter_16
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16
+        cbnz            x17, 9f
+        cbnz            x14, 7f
+        cbnz            x15, 8f
 .endm
 
 
@@ -540,7 +553,7 @@ function vp9_loop_filter_v_4_8_16_neon
         st1             {v23.8h}, [x9], x1
         st1             {v25.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #1
-
+9:
         ret             x10
 endfunc
 
@@ -588,7 +601,7 @@ function vp9_loop_filter_h_4_8_16_neon
         st1             {v25.d}[1], [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  #4
-
+9:
         ret             x10
 endfunc
 
@@ -619,7 +632,7 @@ function vp9_loop_filter_v_8_8_16_neon
         st1             {v26.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -670,7 +683,7 @@ function vp9_loop_filter_h_8_8_16_neon
         st1             {v27.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  #8
-
+9:
         ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
@@ -742,7 +755,7 @@ function vp9_loop_filter_v_16_8_16_neon
         st1             {v17.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  x1
-
+9:
         ret             x10
 8:
         add             x9,  x9,  x1, lsl #2
@@ -820,7 +833,7 @@ function vp9_loop_filter_h_16_8_16_neon
         st1             {v9.8h},  [x9], x1
         st1             {v31.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
-
+9:
         ret             x10
 8:
         // The same writeback as in loop_filter_h_8_8
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 9a79f48df3..b9b82e6fff 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -388,32 +388,28 @@
 .endif
 .if \wd == 16
 6:
+        // If no pixels needed flat8in nor flat8out, jump to a
+        // writeout of the inner 4 pixels
         orr             v2\sz,  v6\sz,  v7\sz
         mov             x5,  v2.d[0]
 .ifc \sz, .16b
         mov             x6,  v2.d[1]
         adds            x5,  x5,  x6
-        b.ne            1f
+        b.eq            7f
 .else
-        cbnz            x5,  1f
+        cbz             x5,  7f
 .endif
-        // If no pixels needed flat8in nor flat8out, jump to a
-        // writeout of the inner 4 pixels
-        ret             x14
-1:
 
+        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         adds            x5,  x5,  x6
-        b.ne            1f
+        b.eq            8f
 .else
-        cbnz            x5,  1f
+        cbz             x5,  8f
 .endif
-        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        ret             x15
 
-1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is 
the input
@@ -529,76 +525,100 @@
 // while we need those for inputs/outputs in wd=16 and use v8-v15
 // for temp registers there instead.
 function vp9_loop_filter_4
+        mov             x12, #0
         loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 function vp9_loop_filter_4_16b_mix_44
+        mov             x12, #0
         loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 function vp9_loop_filter_8
+        mov             x12, #0
+        mov             x13, #0
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        ret             x13
+        mov             x13, #1
+        ret
 9:
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 function vp9_loop_filter_8_16b_mix
+        mov             x12, #0
+        mov             x13, #0
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        ret             x13
+        mov             x13, #1
+        ret
 9:
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 function vp9_loop_filter_16
+        mov             x12, #0
+        mov             x14, #0
+        mov             x15, #0
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x14, #1
+        ret
+8:
+        mov             x15, #1
+        ret
 9:
-        ldp             d10, d11, [sp, #0x10]
-        ldp             d12, d13, [sp, #0x20]
-        ldp             d14, d15, [sp, #0x30]
-        ldp             d8,  d9,  [sp], #0x40
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 function vp9_loop_filter_16_16b
+        mov             x12, #0
+        mov             x14, #0
+        mov             x15, #0
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x14, #1
+        ret
+8:
+        mov             x15, #1
+        ret
 9:
-        ldp             d10, d11, [sp, #0x10]
-        ldp             d12, d13, [sp, #0x20]
-        ldp             d14, d15, [sp, #0x30]
-        ldp             d8,  d9,  [sp], #0x40
-        ret             x10
+        mov             x12, #1
+        ret
 endfunc
 
 .macro loop_filter_4
         bl              vp9_loop_filter_4
+        cbnz            x12, 9f
 .endm
 
 .macro loop_filter_4_16b_mix mix
         bl              vp9_loop_filter_4_16b_mix_\mix
+        cbnz            x12, 9f
 .endm
 
 .macro loop_filter_8
-        // calculate alternative 'return' targets
-        adr             x13, 6f
         bl              vp9_loop_filter_8
+        cbnz            x13, 6f
+        cbnz            x12, 9f
 .endm
 
 .macro loop_filter_8_16b_mix mix
-        // calculate alternative 'return' targets
-        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -607,20 +627,22 @@ endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
+        cbnz            x13, 6f
+        cbnz            x12, 9f
 .endm
 
 .macro loop_filter_16
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16
+        cbnz            x14, 7f
+        cbnz            x15, 8f
+        cbnz            x12, 9f
 .endm
 
 .macro loop_filter_16_16b
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
+        cbnz            x14, 7f
+        cbnz            x15, 8f
+        cbnz            x12, 9f
 .endm
 
 
@@ -647,7 +669,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1
         st1             {v24.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v25.8b}, [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -671,7 +693,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1
         st1             {v24.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
         st1             {v25.16b}, [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -713,7 +735,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1
         st1             {v24.s}[1], [x0], x1
         st1             {v25.s}[0], [x9], x1
         st1             {v25.s}[1], [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -765,7 +787,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1
         st1             {v24.s}[3], [x0], x1
         st1             {v25.s}[1], [x9], x1
         st1             {v25.s}[3], [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -792,7 +814,7 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1
         st1             {v25.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v26.8b}, [x0], x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -827,7 +849,7 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
         st1             {v25.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
         st1             {v26.16b}, [x0], x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -875,7 +897,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
         st1             {v26.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v27.8b}, [x0], x1
-
+9:
         ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
@@ -941,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
         st1             {v26.d}[1], [x0], x1
         st1             {v27.8b},   [x9], x1
         st1             {v27.d}[1], [x0], x1
-
+9:
         ret             x10
 6:
         add             x9,  x9,  #2
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to