On Fri, Jul 03, 2026 at 10:33:44AM +0200, Michal Suchánek wrote:
> On Fri, Jul 03, 2026 at 01:41:00PM +0530, Mukesh Kumar Chaurasiya (IBM) wrote:
> > After enabling GENERIC_ENTRY on PowerPC, syscall_enter_from_user_mode()
> > returns -1 as a sentinel to signal that seccomp or ptrace has intercepted
> > the syscall and already set a return value via syscall_set_return_value().
> > system_call_exception() was not handling this sentinel, and since -1UL
> > is >= NR_syscalls, the code fell into the out-of-range path and returned
> > -ENOSYS, overwriting the errno already placed in regs->gpr[3].
> >
> > The naive fix of checking r0 == -1L before the NR_syscalls bounds check
> > is ambiguous: a user legitimately calling syscall(-1) also produces r0 ==
> > -1L, and a tracer intercepting such a call would have its injected return
> > value silently discarded.
> >
> > Fix this properly by introducing regs->entry_flags, a kernel-internal
> > field in struct pt_regs (consuming one slot of the existing __pt_regs_pad
> > so the ABI is preserved), with SYSCALL_ENTRY_RET_SET as an out-of-band
> > flag. syscall_set_return_value() sets this flag whenever seccomp or ptrace
> > injects a return value. system_call_exception() zeros entry_flags before
> > calling syscall_enter_from_user_mode(), then checks and clears the flag
> > afterwards: if set, it returns regs->gpr[3] directly regardless of what
> > syscall number the user originally requested.
> >
> > This handles all seccomp actions correctly:
> >
> > - SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE (no tracer),
> > SECCOMP_RET_USER_NOTIF:
> > all call syscall_set_return_value(), flag is set, injected value
> > returned.
> > - SECCOMP_RET_TRAP, SECCOMP_RET_KILL: call syscall_rollback() and deliver
> > a signal; flag is not set, but the process is dying so the return value
> > is irrelevant.
> >
> > The fix covers both ppc32 and ppc64 with no #ifdefs.
> >
> > Fixes: bee25f97ad24 ("powerpc: Enable GENERIC_ENTRY feature")
> > Reported-by: Michal Suchánek <[email protected]>
> > Closes: https://lore.kernel.org/all/[email protected]/
> > Signed-off-by: Mukesh Kumar Chaurasiya (IBM) <[email protected]>
> > ---
> > v2 -> v3:
> > - The v2 fix did not handle a user-issued syscall(-1) correctly; this version fixes that.
> > v2: https://lore.kernel.org/all/[email protected]
> >
> > v1 -> v2:
> > - Fix issues in the previous fix (Michal)
> > v1: https://lore.kernel.org/all/[email protected]
> >
> > arch/powerpc/include/asm/ptrace.h | 22 +++++++++++++++++++++-
> > arch/powerpc/include/asm/syscall.h | 6 ++++++
> > arch/powerpc/include/uapi/asm/ptrace.h | 6 ++++--
> > arch/powerpc/kernel/ptrace/ptrace.c | 2 ++
> > arch/powerpc/kernel/syscall.c | 18 ++++++++++++++++++
> > 5 files changed, 51 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/ptrace.h
> > b/arch/powerpc/include/asm/ptrace.h
> > index fdeb97421785..1a53d5cfa8db 100644
> > --- a/arch/powerpc/include/asm/ptrace.h
> > +++ b/arch/powerpc/include/asm/ptrace.h
> > @@ -54,8 +54,9 @@ struct pt_regs
> > };
> > unsigned long result;
> > unsigned long exit_flags;
> > + unsigned long entry_flags;
> > /* Maintain 16 byte interrupt stack alignment */
> > - unsigned long __pt_regs_pad[3];
> > + unsigned long __pt_regs_pad[2];
> > };
> > };
> > #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_KUAP)
> > @@ -233,6 +234,25 @@ static inline unsigned long frame_pointer(struct
> > pt_regs *regs)
> > #define current_pt_regs() \
> > ((struct pt_regs *)((unsigned long)task_stack_page(current) +
> > THREAD_SIZE) - 1)
> >
> > +/*
> > + * SYSCALL_ENTRY_RET_SET: seccomp or ptrace called
> > syscall_set_return_value()
> > + * and wants the syscall skipped; regs->gpr[3] already holds the return
> > value.
> > + */
> > +#define SYSCALL_ENTRY_RET_SET BIT(0)
> > +
> > +static inline void set_syscall_entry_ret(struct pt_regs *regs)
> > +{
> > + regs->entry_flags |= SYSCALL_ENTRY_RET_SET;
> > +}
> > +
> > +static inline bool test_and_clear_syscall_entry_ret(struct pt_regs *regs)
> > +{
> > + bool set = !!(regs->entry_flags & SYSCALL_ENTRY_RET_SET);
> > +
> > + regs->entry_flags &= ~SYSCALL_ENTRY_RET_SET;
> > + return set;
> > +}
> > +
> > /*
> > * The 4 low bits (0xf) are available as flags to overload the trap word,
> > * because interrupt vectors have minimum alignment of 0x10.
> > TRAP_FLAGS_MASK
> > diff --git a/arch/powerpc/include/asm/syscall.h
> > b/arch/powerpc/include/asm/syscall.h
> > index 834fcc4f7b54..9ae79326abe3 100644
> > --- a/arch/powerpc/include/asm/syscall.h
> > +++ b/arch/powerpc/include/asm/syscall.h
> > @@ -98,6 +98,12 @@ static inline void syscall_set_return_value(struct
> > task_struct *task,
> > regs->gpr[3] = val;
> > }
> > }
> > + /*
> > + * Mark that a return value has been explicitly set by seccomp or
> > + * ptrace so that system_call_exception() can skip the syscall
> > + * unconditionally, even when the user requested syscall(-1).
> > + */
> > + set_syscall_entry_ret(regs);
> > }
> >
> > static inline void syscall_get_arguments(struct task_struct *task,
> > diff --git a/arch/powerpc/include/uapi/asm/ptrace.h
> > b/arch/powerpc/include/uapi/asm/ptrace.h
> > index a393b7f2760a..2f2a43414fe6 100644
> > --- a/arch/powerpc/include/uapi/asm/ptrace.h
> > +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> > @@ -56,7 +56,8 @@ struct pt_regs
> > unsigned long dsisr; /* on 4xx/Book-E used for ESR */
> > unsigned long result; /* Result of a system call */
> > unsigned long exit_flags; /* System call exit flags */
> > - unsigned long __pt_regs_pad[3]; /* Maintain 16 byte interrupt stack
> > alignment */
> > + unsigned long entry_flags; /* System call entry flags */
> > + unsigned long __pt_regs_pad[2]; /* Maintain 16 byte interrupt stack
> > alignment */
> > };
> >
> > #endif /* __ASSEMBLER__ */
> > @@ -117,7 +118,8 @@ struct pt_regs
> > #define PT_DSISR 42
> > #define PT_RESULT 43
> > #define PT_EXIT_FLAGS 44
> > -#define PT_PAD 47 /* 3 times */
> > +#define PT_ENTRY_FLAGS 45
> > +#define PT_PAD 46 /* 2 times */
> > #define PT_DSCR 48
> > #define PT_REGS_COUNT 48
> >
> > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c
> > b/arch/powerpc/kernel/ptrace/ptrace.c
> > index 316d4f5ead8e..440d00690cf2 100644
> > --- a/arch/powerpc/kernel/ptrace/ptrace.c
> > +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> > @@ -235,6 +235,8 @@ void __init pt_regs_check(void)
> > offsetof(struct user_pt_regs, dsisr));
> > BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
> > offsetof(struct user_pt_regs, result));
> > + BUILD_BUG_ON(offsetof(struct pt_regs, entry_flags) !=
> > + offsetof(struct user_pt_regs, entry_flags));
> >
> > BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs));
> >
> > diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> > index a9da2af6efa8..c7458aae199d 100644
> > --- a/arch/powerpc/kernel/syscall.c
> > +++ b/arch/powerpc/kernel/syscall.c
> > @@ -19,9 +19,27 @@ notrace long system_call_exception(struct pt_regs *regs,
> > unsigned long r0)
> > long ret;
> > syscall_fn f;
> >
> > + /*
> > + * Zero entry_flags before syscall_enter_from_user_mode() so that
> > + * syscall_set_return_value() can set SYSCALL_ENTRY_RET_SET as an
> > + * unambiguous out-of-band signal. The field is not initialised by
> > + * the entry assembly.
> > + */
> > + regs->entry_flags = 0;
> > add_random_kstack_offset();
> > r0 = syscall_enter_from_user_mode(regs, r0);
> >
> > + /*
> > + * Seccomp or ptrace may have set a return value and requested that
> > + * the syscall be skipped. syscall_set_return_value() sets
> > + * SYSCALL_ENTRY_RET_SET in regs->entry_flags as an
> > + * unambiguous out-of-band signal. This avoids the ambiguity of
> > + * using r0 == -1 as the skip sentinel when the user themselves
> > + * called syscall(-1).
> > + */
> > + if (unlikely(test_and_clear_syscall_entry_ret(regs)))
> > + return regs->gpr[3];
>
> Hello,
>
> shouldn't this use the getter to correctly decode the value for both scv
> and non-scv case?
>
> Thanks
>
> Michal
>
Oh yeah, correct. Let me fix that.
Regards,
Mukesh
> > +
> > if (unlikely(r0 >= NR_syscalls)) {
> > if (unlikely(trap_is_unsupported_scv(regs))) {
> > /* Unsupported scv vector */
> > --
> > 2.55.0
> >