On Sun, 20 Jul 2025 13:21:19 +0200 Jiri Olsa <jo...@kernel.org> wrote:
> Adding a new uprobe syscall that calls uprobe handlers for a given
> 'breakpoint' address.
>
> The idea is that the 'breakpoint' address calls the user space
> trampoline which executes the uprobe syscall.
>
> The syscall handler reads the return address of the initial call
> to retrieve the original 'breakpoint' address. With this address
> we find the related uprobe object and call its consumers.
>
> Adding the arch_uprobe_trampoline_mapping function that provides
> the uprobe trampoline mapping. This mapping is backed by one global
> page initialized at __init time and shared by all the mapping
> instances.
>
> We do not allow executing the uprobe syscall if the caller is not
> from the uprobe trampoline mapping.
>
> The uprobe syscall ensures the consumer (bpf program) sees register
> values in the state before the trampoline was called.

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhira...@kernel.org>

Thanks!

> Acked-by: Andrii Nakryiko <and...@kernel.org>
> Signed-off-by: Jiri Olsa <jo...@kernel.org>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  arch/x86/kernel/uprobes.c              | 139 +++++++++++++++++++++++++
>  include/linux/syscalls.h               |   2 +
>  include/linux/uprobes.h                |   1 +
>  kernel/events/uprobes.c                |  17 +++
>  kernel/sys_ni.c                        |   1 +
>  6 files changed, 161 insertions(+)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index cfb5ca41e30d..9fd1291e7bdf 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -345,6 +345,7 @@
>  333	common	io_pgetevents		sys_io_pgetevents
>  334	common	rseq			sys_rseq
>  335	common	uretprobe		sys_uretprobe
> +336	common	uprobe			sys_uprobe
>  # don't use numbers 387 through 423, add new calls after the last
>  # 'common' entry
>  424	common	pidfd_send_signal	sys_pidfd_send_signal
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index 6c4dcbdd0c3c..d18e1ae59901 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
>  	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
>  		destroy_uprobe_trampoline(tramp);
>  }
> +
> +static bool __in_uprobe_trampoline(unsigned long ip)
> +{
> +	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
> +
> +	return vma && vma_is_special_mapping(vma, &tramp_mapping);
> +}
> +
> +static bool in_uprobe_trampoline(unsigned long ip)
> +{
> +	struct mm_struct *mm = current->mm;
> +	bool found, retry = true;
> +	unsigned int seq;
> +
> +	rcu_read_lock();
> +	if (mmap_lock_speculate_try_begin(mm, &seq)) {
> +		found = __in_uprobe_trampoline(ip);
> +		retry = mmap_lock_speculate_retry(mm, seq);
> +	}
> +	rcu_read_unlock();
> +
> +	if (retry) {
> +		mmap_read_lock(mm);
> +		found = __in_uprobe_trampoline(ip);
> +		mmap_read_unlock(mm);
> +	}
> +	return found;
> +}
> +
> +/*
> + * See uprobe syscall trampoline; the call to the trampoline will push
> + * the return address on the stack, the trampoline itself then pushes
> + * cx, r11 and ax.
> + */
> +struct uprobe_syscall_args {
> +	unsigned long ax;
> +	unsigned long r11;
> +	unsigned long cx;
> +	unsigned long retaddr;
> +};
> +
> +SYSCALL_DEFINE0(uprobe)
> +{
> +	struct pt_regs *regs = task_pt_regs(current);
> +	struct uprobe_syscall_args args;
> +	unsigned long ip, sp;
> +	int err;
> +
> +	/* Allow execution only from uprobe trampolines. */
> +	if (!in_uprobe_trampoline(regs->ip))
> +		goto sigill;
> +
> +	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
> +	if (err)
> +		goto sigill;
> +
> +	ip = regs->ip;
> +
> +	/*
> +	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
> +	 * - adjust ip to the probe address, call saved next instruction address
> +	 * - adjust sp to the probe's stack frame (check trampoline code)
> +	 */
> +	regs->ax = args.ax;
> +	regs->r11 = args.r11;
> +	regs->cx = args.cx;
> +	regs->ip = args.retaddr - 5;
> +	regs->sp += sizeof(args);
> +	regs->orig_ax = -1;
> +
> +	sp = regs->sp;
> +
> +	handle_syscall_uprobe(regs, regs->ip);
> +
> +	/*
> +	 * Some of the uprobe consumers has changed sp, we can do nothing,
> +	 * just return via iret.
> +	 */
> +	if (regs->sp != sp) {
> +		/* skip the trampoline call */
> +		if (args.retaddr - 5 == regs->ip)
> +			regs->ip += 5;
> +		return regs->ax;
> +	}
> +
> +	regs->sp -= sizeof(args);
> +
> +	/* for the case uprobe_consumer has changed ax/r11/cx */
> +	args.ax = regs->ax;
> +	args.r11 = regs->r11;
> +	args.cx = regs->cx;
> +
> +	/* keep return address unless we are instructed otherwise */
> +	if (args.retaddr - 5 != regs->ip)
> +		args.retaddr = regs->ip;
> +
> +	regs->ip = ip;
> +
> +	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
> +	if (err)
> +		goto sigill;
> +
> +	/* ensure sysret, see do_syscall_64() */
> +	regs->r11 = regs->flags;
> +	regs->cx = regs->ip;
> +	return 0;
> +
> +sigill:
> +	force_sig(SIGILL);
> +	return -1;
> +}
> +
> +asm (
> +	".pushsection .rodata\n"
> +	".balign " __stringify(PAGE_SIZE) "\n"
> +	"uprobe_trampoline_entry:\n"
> +	"push %rcx\n"
> +	"push %r11\n"
> +	"push %rax\n"
> +	"movq $" __stringify(__NR_uprobe) ", %rax\n"
> +	"syscall\n"
> +	"pop %rax\n"
> +	"pop %r11\n"
> +	"pop %rcx\n"
> +	"ret\n"
> +	".balign " __stringify(PAGE_SIZE) "\n"
> +	".popsection\n"
> +);
> +
> +extern u8 uprobe_trampoline_entry[];
> +
> +static int __init arch_uprobes_init(void)
> +{
> +	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
> +	return 0;
> +}
> +
> +late_initcall(arch_uprobes_init);
> +
>  #else /* 32-bit: */
>  /*
>   * No RIP-relative addressing on 32-bit
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e5603cc91963..b0cc60f1c458 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -998,6 +998,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
>
>  asmlinkage long sys_uretprobe(void);
>
> +asmlinkage long sys_uprobe(void);
> +
>  /* pciconfig: alpha, arm, arm64, ia64, sparc */
>  asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
>  				unsigned long off, unsigned long len,
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index b40d33aae016..b6b077cc7d0f 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
>  extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
>  extern void arch_uprobe_clear_state(struct mm_struct *mm);
>  extern void arch_uprobe_init_state(struct mm_struct *mm);
> +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
>  #else /* !CONFIG_UPROBES */
>  struct uprobes_state {
>  };
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index acec91a676b7..cbba31c0495f 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -2772,6 +2772,23 @@ static void handle_swbp(struct pt_regs *regs)
>  	rcu_read_unlock_trace();
>  }
>
> +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
> +{
> +	struct uprobe *uprobe;
> +	int is_swbp;
> +
> +	guard(rcu_tasks_trace)();
> +
> +	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
> +	if (!uprobe)
> +		return;
> +	if (!get_utask())
> +		return;
> +	if (arch_uprobe_ignore(&uprobe->arch, regs))
> +		return;
> +	handler_chain(uprobe, regs);
> +}
> +
>  /*
>   * Perform required fix-ups and disable singlestep.
>   * Allow pending signals to take effect.
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index c00a86931f8c..bf5d05c635ff 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
>  COND_SYSCALL(rseq);
>
>  COND_SYSCALL(uretprobe);
> +COND_SYSCALL(uprobe);
> -- 
> 2.50.1
>

-- 
Masami Hiramatsu (Google) <mhira...@kernel.org>
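
A note for readers following the args.retaddr - 5 arithmetic in sys_uprobe: it
assumes the probed instruction was rewritten into a 5-byte call into the
trampoline page. The sketch below is illustrative only, it is not part of the
patch, and probe_addr/next_insn are made-up labels:

	probe_addr:
		call uprobe_trampoline_entry	/* 5-byte call, pushes next_insn */
	next_insn:
		...				/* original code continues here */

	/* Stack seen by sys_uprobe, matching struct uprobe_syscall_args:	*/
	/*   regs->sp + 0x00: saved rax						*/
	/*   regs->sp + 0x08: saved r11						*/
	/*   regs->sp + 0x10: saved rcx						*/
	/*   regs->sp + 0x18: retaddr == next_insn == probe_addr + 5		*/

so regs->ip = args.retaddr - 5 recovers probe_addr, and regs->sp += sizeof(args)
restores the stack pointer the task had before the call into the trampoline.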