On Fri, Jul 03, 2026 at 01:41:00PM +0530, Mukesh Kumar Chaurasiya (IBM) wrote:
> After enabling GENERIC_ENTRY on PowerPC, syscall_enter_from_user_mode()
> returns -1 as a sentinel to signal that seccomp or ptrace has intercepted
> the syscall and already set a return value via syscall_set_return_value().
> system_call_exception() was not handling this sentinel, and since -1UL
> is >= NR_syscalls, the code fell into the out-of-range path and returned
> -ENOSYS, overwriting the errno already placed in regs->gpr[3].
>
> The naive fix of checking r0 == -1L before the NR_syscalls bounds check
> is ambiguous: a user legitimately calling syscall(-1) also produces r0 ==
> -1L, and a tracer intercepting such a call would have its injected return
> value silently discarded.
>
> Fix this properly by introducing regs->entry_flags, a kernel-internal
> field in struct pt_regs (consuming one slot of the existing __pt_regs_pad
> so the ABI is preserved), with SYSCALL_ENTRY_RET_SET as an out-of-band
> flag. syscall_set_return_value() sets this flag whenever seccomp or ptrace
> injects a return value. system_call_exception() zeros entry_flags before
> calling syscall_enter_from_user_mode(), then checks and clears the flag
> afterwards: if set, it returns regs->gpr[3] directly regardless of what
> syscall number the user originally requested.
>
> This handles all seccomp actions correctly:
>
> - SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE (no tracer), SECCOMP_RET_USER_NOTIF:
> all call syscall_set_return_value(), flag is set, injected value returned.
> - SECCOMP_RET_TRAP, SECCOMP_RET_KILL: call syscall_rollback() and deliver
> a signal; flag is not set, but the process is dying so the return value
> is irrelevant.
>
> The fix covers both ppc32 and ppc64 with no #ifdefs.
>
> Fixes: bee25f97ad24 ("powerpc: Enable GENERIC_ENTRY feature")
> Reported-by: Michal Suchánek <[email protected]>
> Closes: https://lore.kernel.org/all/[email protected]/
> Signed-off-by: Mukesh Kumar Chaurasiya (IBM) <[email protected]>
> ---
> v2 -> v3:
> - The v2 fix did not work for syscall(-1); this version fixes that.
> v2: https://lore.kernel.org/all/[email protected]
>
> v1 -> v2:
> - Fix issues in the previous fix (Michal)
> v1: https://lore.kernel.org/all/[email protected]
>
> arch/powerpc/include/asm/ptrace.h | 22 +++++++++++++++++++++-
> arch/powerpc/include/asm/syscall.h | 6 ++++++
> arch/powerpc/include/uapi/asm/ptrace.h | 6 ++++--
> arch/powerpc/kernel/ptrace/ptrace.c | 2 ++
> arch/powerpc/kernel/syscall.c | 18 ++++++++++++++++++
> 5 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ptrace.h
> b/arch/powerpc/include/asm/ptrace.h
> index fdeb97421785..1a53d5cfa8db 100644
> --- a/arch/powerpc/include/asm/ptrace.h
> +++ b/arch/powerpc/include/asm/ptrace.h
> @@ -54,8 +54,9 @@ struct pt_regs
> };
> unsigned long result;
> unsigned long exit_flags;
> + unsigned long entry_flags;
> /* Maintain 16 byte interrupt stack alignment */
> - unsigned long __pt_regs_pad[3];
> + unsigned long __pt_regs_pad[2];
> };
> };
> #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_KUAP)
> @@ -233,6 +234,25 @@ static inline unsigned long frame_pointer(struct pt_regs
> *regs)
> #define current_pt_regs() \
> ((struct pt_regs *)((unsigned long)task_stack_page(current) +
> THREAD_SIZE) - 1)
>
> +/*
> + * SYSCALL_ENTRY_RET_SET: seccomp or ptrace called syscall_set_return_value()
> + * and wants the syscall skipped; regs->gpr[3] already holds the return
> value.
> + */
> +#define SYSCALL_ENTRY_RET_SET BIT(0)
> +
> +static inline void set_syscall_entry_ret(struct pt_regs *regs)
> +{
> + regs->entry_flags |= SYSCALL_ENTRY_RET_SET;
> +}
> +
> +static inline bool test_and_clear_syscall_entry_ret(struct pt_regs *regs)
> +{
> + bool set = !!(regs->entry_flags & SYSCALL_ENTRY_RET_SET);
> +
> + regs->entry_flags &= ~SYSCALL_ENTRY_RET_SET;
> + return set;
> +}
> +
> /*
> * The 4 low bits (0xf) are available as flags to overload the trap word,
> * because interrupt vectors have minimum alignment of 0x10. TRAP_FLAGS_MASK
> diff --git a/arch/powerpc/include/asm/syscall.h
> b/arch/powerpc/include/asm/syscall.h
> index 834fcc4f7b54..9ae79326abe3 100644
> --- a/arch/powerpc/include/asm/syscall.h
> +++ b/arch/powerpc/include/asm/syscall.h
> @@ -98,6 +98,12 @@ static inline void syscall_set_return_value(struct
> task_struct *task,
> regs->gpr[3] = val;
> }
> }
> + /*
> + * Mark that a return value has been explicitly set by seccomp or
> + * ptrace so that system_call_exception() can skip the syscall
> + * unconditionally, even when the user requested syscall(-1).
> + */
> + set_syscall_entry_ret(regs);
> }
>
> static inline void syscall_get_arguments(struct task_struct *task,
> diff --git a/arch/powerpc/include/uapi/asm/ptrace.h
> b/arch/powerpc/include/uapi/asm/ptrace.h
> index a393b7f2760a..2f2a43414fe6 100644
> --- a/arch/powerpc/include/uapi/asm/ptrace.h
> +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> @@ -56,7 +56,8 @@ struct pt_regs
> unsigned long dsisr; /* on 4xx/Book-E used for ESR */
> unsigned long result; /* Result of a system call */
> unsigned long exit_flags; /* System call exit flags */
> - unsigned long __pt_regs_pad[3]; /* Maintain 16 byte interrupt stack
> alignment */
> + unsigned long entry_flags; /* System call entry flags */
> + unsigned long __pt_regs_pad[2]; /* Maintain 16 byte interrupt stack
> alignment */
> };
>
> #endif /* __ASSEMBLER__ */
> @@ -117,7 +118,8 @@ struct pt_regs
> #define PT_DSISR 42
> #define PT_RESULT 43
> #define PT_EXIT_FLAGS 44
> -#define PT_PAD 47 /* 3 times */
> +#define PT_ENTRY_FLAGS 45
> +#define PT_PAD 46 /* 2 times */
> #define PT_DSCR 48
> #define PT_REGS_COUNT 48
>
> diff --git a/arch/powerpc/kernel/ptrace/ptrace.c
> b/arch/powerpc/kernel/ptrace/ptrace.c
> index 316d4f5ead8e..440d00690cf2 100644
> --- a/arch/powerpc/kernel/ptrace/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> @@ -235,6 +235,8 @@ void __init pt_regs_check(void)
> offsetof(struct user_pt_regs, dsisr));
> BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
> offsetof(struct user_pt_regs, result));
> + BUILD_BUG_ON(offsetof(struct pt_regs, entry_flags) !=
> + offsetof(struct user_pt_regs, entry_flags));
>
> BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs));
>
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index a9da2af6efa8..c7458aae199d 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -19,9 +19,27 @@ notrace long system_call_exception(struct pt_regs *regs,
> unsigned long r0)
> long ret;
> syscall_fn f;
>
> + /*
> + * Zero entry_flags before syscall_enter_from_user_mode() so that
> + * syscall_set_return_value() can set SYSCALL_ENTRY_RET_SET as an
> + * unambiguous out-of-band signal. The field is not initialised by
> + * the entry assembly.
> + */
> + regs->entry_flags = 0;
> add_random_kstack_offset();
> r0 = syscall_enter_from_user_mode(regs, r0);
>
> + /*
> + * Seccomp or ptrace may have set a return value and requested that
> + * the syscall be skipped. syscall_set_return_value() sets
> + * SYSCALL_ENTRY_RET_SET in regs->entry_flags as an
> + * unambiguous out-of-band signal. This avoids the ambiguity of
> + * using r0 == -1 as the skip sentinel when the user themselves
> + * called syscall(-1).
> + */
> + if (unlikely(test_and_clear_syscall_entry_ret(regs)))
> + return regs->gpr[3];
Hello,
Shouldn't this use the getter to correctly decode the value for both the
scv and non-scv cases?
Thanks
Michal
> +
> if (unlikely(r0 >= NR_syscalls)) {
> if (unlikely(trap_is_unsupported_scv(regs))) {
> /* Unsupported scv vector */
> --
> 2.55.0
>