this narrows the scope of the KERNEL_LOCK in kbind(2) so the syscall argument checks can be done without the kernel lock.
care is taken to allow the pc/cookie checks to run without any lock by being careful with the order of the checks. all modifications to the pc/cookie state are serialised by the per process mutex. i dont know enough about uvm to say whether it is safe to unlock the actual memory updates too, but even if i was confident i would still prefer to change it as a separate step. ok? Index: kern/init_sysent.c =================================================================== RCS file: /cvs/src/sys/kern/init_sysent.c,v retrieving revision 1.236 diff -u -p -r1.236 init_sysent.c --- kern/init_sysent.c 1 May 2022 23:00:04 -0000 1.236 +++ kern/init_sysent.c 17 May 2022 00:36:03 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: init_sysent.c,v 1.236 2022/05/01 23:00:04 tedu Exp $ */ +/* $OpenBSD$ */ /* * System call switch table. @@ -204,7 +204,7 @@ const struct sysent sysent[] = { sys_utimensat }, /* 84 = utimensat */ { 2, s(struct sys_futimens_args), 0, sys_futimens }, /* 85 = futimens */ - { 3, s(struct sys_kbind_args), 0, + { 3, s(struct sys_kbind_args), SY_NOLOCK | 0, sys_kbind }, /* 86 = kbind */ { 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0, sys_clock_gettime }, /* 87 = clock_gettime */ Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.223 diff -u -p -r1.223 syscalls.master --- kern/syscalls.master 24 Feb 2022 07:41:51 -0000 1.223 +++ kern/syscalls.master 17 May 2022 00:36:03 -0000 @@ -194,7 +194,7 @@ const struct timespec *times, int flag); } 85 STD { int sys_futimens(int fd, \ const struct timespec *times); } -86 STD { int sys_kbind(const struct __kbind *param, \ +86 STD NOLOCK { int sys_kbind(const struct __kbind *param, \ size_t psize, int64_t proc_cookie); } 87 STD NOLOCK { int sys_clock_gettime(clockid_t clock_id, \ struct timespec *tp); } Index: uvm/uvm_mmap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v retrieving revision 1.169 diff -u -p -r1.169 uvm_mmap.c --- uvm/uvm_mmap.c 19 Jan 2022 10:43:48 -0000 1.169 +++ uvm/uvm_mmap.c 17 May 2022 00:36:03 -0000 @@ -70,6 +70,7 @@ #include <sys/pledge.h> #include <sys/unistd.h> /* for KBIND* */ #include <sys/user.h> +#include <sys/atomic.h> #include <machine/exec.h> /* for __LDPGSZ */ @@ -1125,33 +1126,64 @@ sys_kbind(struct proc *p, void *v, regis const char *data; vaddr_t baseva, last_baseva, endva, pageoffset, kva; size_t psize, s; - u_long pc; + u_long pc, kpc; int count, i, extra; + uint64_t cookie; int error; /* * extract syscall args from uap */ paramp = SCARG(uap, param); - psize = SCARG(uap, psize); /* a NULL paramp disables the syscall for the process */ if (paramp == NULL) { + mtx_enter(&pr->ps_mtx); if (pr->ps_kbind_addr != 0) - sigexit(p, SIGILL); + goto leave_sigill; pr->ps_kbind_addr = BOGO_PC; + mtx_leave(&pr->ps_mtx); return 0; } /* security checks */ + + /* + * ps_kbind_addr can only be set to 0 or BOGO_PC by the + * kernel, not by a call from userland. + */ pc = PROC_PC(p); - if (pr->ps_kbind_addr == 0) { - pr->ps_kbind_addr = pc; - pr->ps_kbind_cookie = SCARG(uap, proc_cookie); - } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC) - sigexit(p, SIGILL); - else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) - sigexit(p, SIGILL); + if (pc == 0 || pc == BOGO_PC) + goto sigill; + + cookie = SCARG(uap, proc_cookie); + if (pr->ps_kbind_addr == pc) { + membar_datadep_consumer(); + if (pr->ps_kbind_cookie != cookie) + goto sigill; + } else { + mtx_enter(&pr->ps_mtx); + kpc = pr->ps_kbind_addr; + + /* + * If we're the first thread in (kpc is 0), then + * remember where the call came from for the future. + * + * Maybe another thread raced us and won. pc cannot + * be BOGO_PC, so this also handles the check for + * the disabled case. + */ + + if (kpc == 0) { + pr->ps_kbind_cookie = cookie; + membar_producer(); + pr->ps_kbind_addr = pc; + } else if (kpc != pc || pr->ps_kbind_cookie != cookie) + goto leave_sigill; + mtx_leave(&pr->ps_mtx); + } + + psize = SCARG(uap, psize); if (psize < sizeof(struct __kbind) || psize > sizeof(param)) return EINVAL; if ((error = copyin(paramp, ¶m, psize))) @@ -1187,6 +1219,7 @@ sys_kbind(struct proc *p, void *v, regis data = (const char *)¶mp[count]; /* all looks good, so do the bindings */ + KERNEL_LOCK(); last_baseva = VM_MAXUSER_ADDRESS; kva = 0; TAILQ_INIT(&dead_entries); @@ -1239,6 +1272,14 @@ redo: vm_map_unlock(kernel_map); } uvm_unmap_detach(&dead_entries, AMAP_REFALL); + KERNEL_UNLOCK(); return error; + +leave_sigill: + mtx_leave(&pr->ps_mtx); +sigill: + KERNEL_LOCK(); + sigexit(p, SIGILL); + /* NOTREACHED KERNEL_UNLOCK(); */ } Index: sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v retrieving revision 1.329 diff -u -p -r1.329 proc.h --- sys/proc.h 10 Mar 2022 15:21:08 -0000 1.329 +++ sys/proc.h 17 May 2022 00:36:03 -0000 @@ -234,8 +234,8 @@ struct process { uint64_t ps_pledge; uint64_t ps_execpledge; - int64_t ps_kbind_cookie; - u_long ps_kbind_addr; + int64_t ps_kbind_cookie; /* [m] */ + u_long ps_kbind_addr; /* [m] */ /* End area that is copied on creation. */ #define ps_endcopy ps_refcnt