On Fri, Jul 03, 2026 at 01:41:00PM +0530, Mukesh Kumar Chaurasiya (IBM) wrote:
> After enabling GENERIC_ENTRY on PowerPC, syscall_enter_from_user_mode()
> returns -1 as a sentinel to signal that seccomp or ptrace has intercepted
> the syscall and already set a return value via syscall_set_return_value().
> system_call_exception() was not handling this sentinel, and since -1UL
> is >= NR_syscalls, the code fell into the out-of-range path and returned
> -ENOSYS, overwriting the errno already placed in regs->gpr[3].
> 
> The naive fix of checking r0 == -1L before the NR_syscalls bounds check
> is ambiguous: a user legitimately calling syscall(-1) also produces r0 ==
> -1L, and a tracer intercepting such a call would have its injected return
> value silently discarded.
> 
> Fix this properly by introducing regs->entry_flags, a kernel-internal
> field in struct pt_regs (consuming one slot of the existing __pt_regs_pad
> so the ABI is preserved), with SYSCALL_ENTRY_RET_SET as an out-of-band
> flag. syscall_set_return_value() sets this flag whenever seccomp or ptrace
> injects a return value. system_call_exception() zeros entry_flags before
> calling syscall_enter_from_user_mode(), then checks and clears the flag
> afterwards: if set, it returns regs->gpr[3] directly regardless of what
> syscall number the user originally requested.
> 
> This handles all seccomp actions correctly:
> 
>   - SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE (no tracer), SECCOMP_RET_USER_NOTIF:
>     all call syscall_set_return_value(), flag is set, injected value returned.
>   - SECCOMP_RET_TRAP, SECCOMP_RET_KILL: call syscall_rollback() and deliver
>     a signal; flag is not set, but the process is dying so the return value
>     is irrelevant.
> 
> The fix covers both ppc32 and ppc64 with no #ifdefs.
> 
> Fixes: bee25f97ad24 ("powerpc: Enable GENERIC_ENTRY feature")
> Reported-by: Michal Suchánek <[email protected]>
> Closes: https://lore.kernel.org/all/[email protected]/
> Signed-off-by: Mukesh Kumar Chaurasiya (IBM) <[email protected]>
> ---
> v2 -> v3:
>  - Last fix is not working for -1 syscall. Fixed that with this.
> v2: https://lore.kernel.org/all/[email protected]
> 
> v1 -> v2:
>  - Fix issues in the previous fix (Michal)
> v1: https://lore.kernel.org/all/[email protected]
> 
>  arch/powerpc/include/asm/ptrace.h      | 22 +++++++++++++++++++++-
>  arch/powerpc/include/asm/syscall.h     |  6 ++++++
>  arch/powerpc/include/uapi/asm/ptrace.h |  6 ++++--
>  arch/powerpc/kernel/ptrace/ptrace.c    |  2 ++
>  arch/powerpc/kernel/syscall.c          | 18 ++++++++++++++++++
>  5 files changed, 51 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/ptrace.h 
> b/arch/powerpc/include/asm/ptrace.h
> index fdeb97421785..1a53d5cfa8db 100644
> --- a/arch/powerpc/include/asm/ptrace.h
> +++ b/arch/powerpc/include/asm/ptrace.h
> @@ -54,8 +54,9 @@ struct pt_regs
>                       };
>                       unsigned long result;
>                       unsigned long exit_flags;
> +                     unsigned long entry_flags;
>                       /* Maintain 16 byte interrupt stack alignment */
> -                     unsigned long __pt_regs_pad[3];
> +                     unsigned long __pt_regs_pad[2];
>               };
>       };
>  #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_KUAP)
> @@ -233,6 +234,25 @@ static inline unsigned long frame_pointer(struct pt_regs 
> *regs)
>  #define current_pt_regs() \
>       ((struct pt_regs *)((unsigned long)task_stack_page(current) + 
> THREAD_SIZE) - 1)
>  
> +/*
> + * SYSCALL_ENTRY_RET_SET: seccomp or ptrace called syscall_set_return_value()
> + * and wants the syscall skipped; regs->gpr[3] already holds the return 
> value.
> + */
> +#define SYSCALL_ENTRY_RET_SET        BIT(0)
> +
> +static inline void set_syscall_entry_ret(struct pt_regs *regs)
> +{
> +     regs->entry_flags |= SYSCALL_ENTRY_RET_SET;
> +}
> +
> +static inline bool test_and_clear_syscall_entry_ret(struct pt_regs *regs)
> +{
> +     bool set = !!(regs->entry_flags & SYSCALL_ENTRY_RET_SET);
> +
> +     regs->entry_flags &= ~SYSCALL_ENTRY_RET_SET;
> +     return set;
> +}
> +
>  /*
>   * The 4 low bits (0xf) are available as flags to overload the trap word,
>   * because interrupt vectors have minimum alignment of 0x10. TRAP_FLAGS_MASK
> diff --git a/arch/powerpc/include/asm/syscall.h 
> b/arch/powerpc/include/asm/syscall.h
> index 834fcc4f7b54..9ae79326abe3 100644
> --- a/arch/powerpc/include/asm/syscall.h
> +++ b/arch/powerpc/include/asm/syscall.h
> @@ -98,6 +98,12 @@ static inline void syscall_set_return_value(struct 
> task_struct *task,
>                       regs->gpr[3] = val;
>               }
>       }
> +     /*
> +      * Mark that a return value has been explicitly set by seccomp or
> +      * ptrace so that system_call_exception() can skip the syscall
> +      * unconditionally, even when the user requested syscall(-1).
> +      */
> +     set_syscall_entry_ret(regs);
>  }
>  
>  static inline void syscall_get_arguments(struct task_struct *task,
> diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
> b/arch/powerpc/include/uapi/asm/ptrace.h
> index a393b7f2760a..2f2a43414fe6 100644
> --- a/arch/powerpc/include/uapi/asm/ptrace.h
> +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> @@ -56,7 +56,8 @@ struct pt_regs
>       unsigned long dsisr;            /* on 4xx/Book-E used for ESR */
>       unsigned long result;           /* Result of a system call */
>       unsigned long exit_flags;       /* System call exit flags */
> -     unsigned long __pt_regs_pad[3]; /* Maintain 16 byte interrupt stack 
> alignment */
> +     unsigned long entry_flags;      /* System call entry flags */
> +     unsigned long __pt_regs_pad[2]; /* Maintain 16 byte interrupt stack 
> alignment */
>  };
>  
>  #endif /* __ASSEMBLER__ */
> @@ -117,7 +118,8 @@ struct pt_regs
>  #define PT_DSISR 42
>  #define PT_RESULT 43
>  #define PT_EXIT_FLAGS 44
> -#define PT_PAD 47 /* 3 times */
> +#define PT_ENTRY_FLAGS 45
> +#define PT_PAD 46 /* 2 times */
>  #define PT_DSCR 48
>  #define PT_REGS_COUNT 48
>  
> diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
> b/arch/powerpc/kernel/ptrace/ptrace.c
> index 316d4f5ead8e..440d00690cf2 100644
> --- a/arch/powerpc/kernel/ptrace/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> @@ -235,6 +235,8 @@ void __init pt_regs_check(void)
>                    offsetof(struct user_pt_regs, dsisr));
>       BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
>                    offsetof(struct user_pt_regs, result));
> +     BUILD_BUG_ON(offsetof(struct pt_regs, entry_flags) !=
> +                  offsetof(struct user_pt_regs, entry_flags));
>  
>       BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs));
>  
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index a9da2af6efa8..c7458aae199d 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -19,9 +19,27 @@ notrace long system_call_exception(struct pt_regs *regs, 
> unsigned long r0)
>       long ret;
>       syscall_fn f;
>  
> +     /*
> +      * Zero entry_flags before syscall_enter_from_user_mode() so that
> +      * syscall_set_return_value() can set SYSCALL_ENTRY_RET_SET as an
> +      * unambiguous out-of-band signal.  The field is not initialised by
> +      * the entry assembly.
> +      */
> +     regs->entry_flags = 0;
>       add_random_kstack_offset();
>       r0 = syscall_enter_from_user_mode(regs, r0);
>  
> +     /*
> +      * Seccomp or ptrace may have set a return value and requested that
> +      * the syscall be skipped. syscall_set_return_value() sets
> +      * SYSCALL_ENTRY_RET_SET in regs->entry_flags as an
> +      * unambiguous out-of-band signal. This avoids the ambiguity of
> +      * using r0 == -1 as the skip sentinel when the user themselves
> +      * called syscall(-1).
> +      */
> +     if (unlikely(test_and_clear_syscall_entry_ret(regs)))
> +             return regs->gpr[3];

Hello,

shouldn't this use the getter to correctly decode the value for both scv
and non-scv case?

Thanks

Michal

> +
>       if (unlikely(r0 >= NR_syscalls)) {
>               if (unlikely(trap_is_unsupported_scv(regs))) {
>                       /* Unsupported scv vector */
> -- 
> 2.55.0
> 

Reply via email to