PR #23347 opened by Martin Storsjö (mstorsjo) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23347 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23347.patch
For cases when returning early without updating any pixels, we previously returned to return address in the caller's scope, bypassing one function entirely. While this may seem like a neat optimization, it makes the return stack predictor mispredict the returns - which potentially can cost more performance than it gains. Secondly, if the armv9.3 feature GCS (Guarded Control Stack) is enabled, then returns _must_ match the expected value; this feature is being enabled across linux distributions, and by fixing the hevc assembly, we can enable the security feature on ffmpeg as well. This matches the fix that was done for hevcdsp in 1f7ed8a78de1da743a359913ce05cc258a400b5d. For unknown reasons, QEMU didn't notice this issue when running checkasm, only if running other FATE tests. Since the switch to a new checkasm implementation, QEMU does seem to pick up on the GCS mismatch in these functions as well. Since 846746be4b8edd66be8be2c123b0072de2636e9d we do signal that our code was GCS compliant, which means that builds of such versions (including version 8.1.1 where the GCS enabling was backported) will fail in the future when run on actual GCS supporting hardware, until this fix is applied. From 90fab21acbdc240c113a218bdb3a982504ecccbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <[email protected]> Date: Thu, 4 Jun 2026 23:41:11 +0300 Subject: [PATCH] aarch64: vp9lpf: Fix GCS violations For cases when returning early without updating any pixels, we previously returned to return address in the caller's scope, bypassing one function entirely. While this may seem like a neat optimization, it makes the return stack predictor mispredict the returns - which potentially can cost more performance than it gains. Secondly, if the armv9.3 feature GCS (Guarded Control Stack) is enabled, then returns _must_ match the expected value; this feature is being enabled across linux distributions, and by fixing the hevc assembly, we can enable the security feature on ffmpeg as well. This matches the fix that was done for hevcdsp in 1f7ed8a78de1da743a359913ce05cc258a400b5d. For unknown reasons, QEMU didn't notice this issue when running checkasm, only if running other FATE tests. Since the switch to a new checkasm implementation, QEMU does seem to pick up on the GCS mismatch in these functions as well. Since 846746be4b8edd66be8be2c123b0072de2636e9d we do signal that our code was GCS compliant, which means that builds of such versions (including version 8.1.1 where the GCS enabling was backported) will fail in the future when run on actual GCS supporting hardware, until this fix is applied. --- libavcodec/aarch64/vp9lpf_16bpp_neon.S | 65 ++++++++------ libavcodec/aarch64/vp9lpf_neon.S | 112 +++++++++++++++---------- 2 files changed, 106 insertions(+), 71 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S index e3e70491c6..3defd633e7 100644 --- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S +++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S @@ -56,9 +56,7 @@ mov x11, v4.d[0] mov x12, v4.d[1] adds x11, x11, x12 - b.ne 1f - ret x10 -1: + b.eq 9f .if \wd >= 8 dup v0.8h, w5 @@ -189,13 +187,7 @@ // If no pixels need flat8in, jump to flat8out // (or to a writeout of the inner 4 pixels, for wd=8) .if \wd >= 8 -.if \wd == 16 b.eq 6f -.else - b.ne 1f - ret x13 -1: -.endif // flat8in add \tmp1\().8h, v20.8h, v21.8h @@ -249,20 +241,16 @@ mov x11, v2.d[0] mov x12, v2.d[1] adds x11, x11, x12 - b.ne 1f // If no pixels needed flat8in nor flat8out, jump to a // writeout of the inner 4 pixels - ret x14 -1: + b.eq 7f mov x11, v7.d[0] mov x12, v7.d[1] adds x11, x11, x12 - b.ne 1f // If no pixels need flat8out, jump to a writeout of the inner 6 pixels - ret x15 + b.eq 8f -1: // flat8out // This writes all outputs into v2-v17 (skipping v6 and v16). // If this part is skipped, the output is read from v21-v26 (which is the input @@ -378,35 +366,60 @@ // while we need those for inputs/outputs in wd=16 and use v8-v15 // for temp registers there instead. function vp9_loop_filter_4 + mov x17, #0 loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31 ret +9: + mov x17, #1 + ret endfunc function vp9_loop_filter_8 + mov x17, #0 + mov x13, #0 loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31 ret +6: + mov x13, #1 + ret +9: + mov x17, #1 + ret endfunc function vp9_loop_filter_16 + mov x17, #0 + mov x14, #0 + mov x15, #0 loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15 ret +7: + mov x14, #1 + ret +8: + mov x15, #1 + ret +9: + mov x17, #1 + ret endfunc .macro loop_filter_4 bl vp9_loop_filter_4 + cbnz x17, 9f .endm .macro loop_filter_8 - // calculate alternative 'return' targets - adr x13, 6f bl vp9_loop_filter_8 + cbnz x17, 9f + cbnz x13, 6f .endm .macro loop_filter_16 - // calculate alternative 'return' targets - adr x14, 7f - adr x15, 8f bl vp9_loop_filter_16 + cbnz x17, 9f + cbnz x14, 7f + cbnz x15, 8f .endm @@ -540,7 +553,7 @@ function vp9_loop_filter_v_4_8_16_neon st1 {v23.8h}, [x9], x1 st1 {v25.8h}, [x0], x1 sub x0, x0, x1, lsl #1 - +9: ret x10 endfunc @@ -588,7 +601,7 @@ function vp9_loop_filter_h_4_8_16_neon st1 {v25.d}[1], [x0], x1 sub x0, x0, x1, lsl #3 add x0, x0, #4 - +9: ret x10 endfunc @@ -619,7 +632,7 @@ function vp9_loop_filter_v_8_8_16_neon st1 {v26.8h}, [x0], x1 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - +9: ret x10 6: sub x9, x0, x1, lsl #1 @@ -670,7 +683,7 @@ function vp9_loop_filter_h_8_8_16_neon st1 {v27.8h}, [x0], x1 sub x0, x0, x1, lsl #3 add x0, x0, #8 - +9: ret x10 6: // If we didn't need to do the flat8in part, we use the same writeback @@ -742,7 +755,7 @@ function vp9_loop_filter_v_16_8_16_neon st1 {v17.8h}, [x0], x1 sub x0, x0, x1, lsl #3 add x0, x0, x1 - +9: ret x10 8: add x9, x9, x1, lsl #2 @@ -820,7 +833,7 @@ function vp9_loop_filter_h_16_8_16_neon st1 {v9.8h}, [x9], x1 st1 {v31.8h}, [x0], x1 sub x0, x0, x1, lsl #3 - +9: ret x10 8: // The same writeback as in loop_filter_h_8_8 diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 9a79f48df3..b9b82e6fff 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -388,32 +388,28 @@ .endif .if \wd == 16 6: + // If no pixels needed flat8in nor flat8out, jump to a + // writeout of the inner 4 pixels orr v2\sz, v6\sz, v7\sz mov x5, v2.d[0] .ifc \sz, .16b mov x6, v2.d[1] adds x5, x5, x6 - b.ne 1f + b.eq 7f .else - cbnz x5, 1f + cbz x5, 7f .endif - // If no pixels needed flat8in nor flat8out, jump to a - // writeout of the inner 4 pixels - ret x14 -1: + // If no pixels need flat8out, jump to a writeout of the inner 6 pixels mov x5, v7.d[0] .ifc \sz, .16b mov x6, v7.d[1] adds x5, x5, x6 - b.ne 1f + b.eq 8f .else - cbnz x5, 1f + cbz x5, 8f .endif - // If no pixels need flat8out, jump to a writeout of the inner 6 pixels - ret x15 -1: // flat8out // This writes all outputs into v2-v17 (skipping v6 and v16). // If this part is skipped, the output is read from v21-v26 (which is the input @@ -529,76 +525,100 @@ // while we need those for inputs/outputs in wd=16 and use v8-v15 // for temp registers there instead. function vp9_loop_filter_4 + mov x12, #0 loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 ret 9: - ret x10 + mov x12, #1 + ret endfunc function vp9_loop_filter_4_16b_mix_44 + mov x12, #0 loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31 ret 9: - ret x10 + mov x12, #1 + ret endfunc function vp9_loop_filter_8 + mov x12, #0 + mov x13, #0 loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 ret 6: - ret x13 + mov x13, #1 + ret 9: - ret x10 + mov x12, #1 + ret endfunc function vp9_loop_filter_8_16b_mix + mov x12, #0 + mov x13, #0 loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31 ret 6: - ret x13 + mov x13, #1 + ret 9: - ret x10 + mov x12, #1 + ret endfunc function vp9_loop_filter_16 + mov x12, #0 + mov x14, #0 + mov x15, #0 loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15 ret +7: + mov x14, #1 + ret +8: + mov x15, #1 + ret 9: - ldp d10, d11, [sp, #0x10] - ldp d12, d13, [sp, #0x20] - ldp d14, d15, [sp, #0x30] - ldp d8, d9, [sp], #0x40 - ret x10 + mov x12, #1 + ret endfunc function vp9_loop_filter_16_16b + mov x12, #0 + mov x14, #0 + mov x15, #0 loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15 ret +7: + mov x14, #1 + ret +8: + mov x15, #1 + ret 9: - ldp d10, d11, [sp, #0x10] - ldp d12, d13, [sp, #0x20] - ldp d14, d15, [sp, #0x30] - ldp d8, d9, [sp], #0x40 - ret x10 + mov x12, #1 + ret endfunc .macro loop_filter_4 bl vp9_loop_filter_4 + cbnz x12, 9f .endm .macro loop_filter_4_16b_mix mix bl vp9_loop_filter_4_16b_mix_\mix + cbnz x12, 9f .endm .macro loop_filter_8 - // calculate alternative 'return' targets - adr x13, 6f bl vp9_loop_filter_8 + cbnz x13, 6f + cbnz x12, 9f .endm .macro loop_filter_8_16b_mix mix - // calculate alternative 'return' targets - adr x13, 6f .if \mix == 48 mov x11, #0xffffffff00000000 .elseif \mix == 84 @@ -607,20 +627,22 @@ endfunc mov x11, #0xffffffffffffffff .endif bl vp9_loop_filter_8_16b_mix + cbnz x13, 6f + cbnz x12, 9f .endm .macro loop_filter_16 - // calculate alternative 'return' targets - adr x14, 7f - adr x15, 8f bl vp9_loop_filter_16 + cbnz x14, 7f + cbnz x15, 8f + cbnz x12, 9f .endm .macro loop_filter_16_16b - // calculate alternative 'return' targets - adr x14, 7f - adr x15, 8f bl vp9_loop_filter_16_16b + cbnz x14, 7f + cbnz x15, 8f + cbnz x12, 9f .endm @@ -647,7 +669,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1 st1 {v24.8b}, [x0], x1 st1 {v23.8b}, [x9], x1 st1 {v25.8b}, [x0], x1 - +9: ret x10 endfunc @@ -671,7 +693,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1 st1 {v24.16b}, [x0], x1 st1 {v23.16b}, [x9], x1 st1 {v25.16b}, [x0], x1 - +9: ret x10 endfunc @@ -713,7 +735,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1 st1 {v24.s}[1], [x0], x1 st1 {v25.s}[0], [x9], x1 st1 {v25.s}[1], [x0], x1 - +9: ret x10 endfunc @@ -765,7 +787,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x9], x1 st1 {v25.s}[3], [x0], x1 - +9: ret x10 endfunc @@ -792,7 +814,7 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1 st1 {v25.8b}, [x0], x1 st1 {v23.8b}, [x9], x1 st1 {v26.8b}, [x0], x1 - +9: ret x10 6: sub x9, x0, x1, lsl #1 @@ -827,7 +849,7 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1 st1 {v25.16b}, [x0], x1 st1 {v23.16b}, [x9], x1 st1 {v26.16b}, [x0], x1 - +9: ret x10 6: sub x9, x0, x1, lsl #1 @@ -875,7 +897,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1 st1 {v26.8b}, [x0], x1 st1 {v23.8b}, [x9], x1 st1 {v27.8b}, [x0], x1 - +9: ret x10 6: // If we didn't need to do the flat8in part, we use the same writeback @@ -941,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1 st1 {v26.d}[1], [x0], x1 st1 {v27.8b}, [x9], x1 st1 {v27.d}[1], [x0], x1 - +9: ret x10 6: add x9, x9, #2 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
