On 17/05/22(Tue) 10:44, David Gwynne wrote: > this narrows the scope of the KERNEL_LOCK in kbind(2) so the syscall > argument checks can be done without the kernel lock. > > care is taken to allow the pc/cookie checks to run without any lock by > being careful with the order of the checks. all modifications to the > pc/cookie state are serialised by the per process mutex.
I don't understand why it is safe to do the following check without holding a mutex: if (pr->ps_kbind_addr == pc) ... Is there much differences when always grabbing the per-process mutex? > i dont know enough about uvm to say whether it is safe to unlock the > actual memory updates too, but even if i was confident i would still > prefer to change it as a separate step. I agree. > Index: kern/init_sysent.c > =================================================================== > RCS file: /cvs/src/sys/kern/init_sysent.c,v > retrieving revision 1.236 > diff -u -p -r1.236 init_sysent.c > --- kern/init_sysent.c 1 May 2022 23:00:04 -0000 1.236 > +++ kern/init_sysent.c 17 May 2022 00:36:03 -0000 > @@ -1,4 +1,4 @@ > -/* $OpenBSD: init_sysent.c,v 1.236 2022/05/01 23:00:04 tedu Exp $ */ > +/* $OpenBSD$ */ > > /* > * System call switch table. > @@ -204,7 +204,7 @@ const struct sysent sysent[] = { > sys_utimensat }, /* 84 = utimensat */ > { 2, s(struct sys_futimens_args), 0, > sys_futimens }, /* 85 = futimens */ > - { 3, s(struct sys_kbind_args), 0, > + { 3, s(struct sys_kbind_args), SY_NOLOCK | 0, > sys_kbind }, /* 86 = kbind */ > { 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0, > sys_clock_gettime }, /* 87 = clock_gettime */ > Index: kern/syscalls.master > =================================================================== > RCS file: /cvs/src/sys/kern/syscalls.master,v > retrieving revision 1.223 > diff -u -p -r1.223 syscalls.master > --- kern/syscalls.master 24 Feb 2022 07:41:51 -0000 1.223 > +++ kern/syscalls.master 17 May 2022 00:36:03 -0000 > @@ -194,7 +194,7 @@ > const struct timespec *times, int flag); } > 85 STD { int sys_futimens(int fd, \ > const struct timespec *times); } > -86 STD { int sys_kbind(const struct __kbind *param, \ > +86 STD NOLOCK { int sys_kbind(const struct __kbind *param, \ > size_t psize, int64_t proc_cookie); } > 87 STD NOLOCK { int sys_clock_gettime(clockid_t clock_id, \ > struct timespec *tp); } > Index: uvm/uvm_mmap.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v > retrieving revision 1.169 > diff -u -p -r1.169 uvm_mmap.c > --- uvm/uvm_mmap.c 19 Jan 2022 10:43:48 -0000 1.169 > +++ uvm/uvm_mmap.c 17 May 2022 00:36:03 -0000 > @@ -70,6 +70,7 @@ > #include <sys/pledge.h> > #include <sys/unistd.h> /* for KBIND* */ > #include <sys/user.h> > +#include <sys/atomic.h> > > #include <machine/exec.h> /* for __LDPGSZ */ > > @@ -1125,33 +1126,64 @@ sys_kbind(struct proc *p, void *v, regis > const char *data; > vaddr_t baseva, last_baseva, endva, pageoffset, kva; > size_t psize, s; > - u_long pc; > + u_long pc, kpc; > int count, i, extra; > + uint64_t cookie; > int error; > > /* > * extract syscall args from uap > */ > paramp = SCARG(uap, param); > - psize = SCARG(uap, psize); > > /* a NULL paramp disables the syscall for the process */ > if (paramp == NULL) { > + mtx_enter(&pr->ps_mtx); > if (pr->ps_kbind_addr != 0) > - sigexit(p, SIGILL); > + goto leave_sigill; > pr->ps_kbind_addr = BOGO_PC; > + mtx_leave(&pr->ps_mtx); > return 0; > } > > /* security checks */ > + > + /* > + * ps_kbind_addr can only be set to 0 or BOGO_PC by the > + * kernel, not by a call from userland. > + */ > pc = PROC_PC(p); > - if (pr->ps_kbind_addr == 0) { > - pr->ps_kbind_addr = pc; > - pr->ps_kbind_cookie = SCARG(uap, proc_cookie); > - } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC) > - sigexit(p, SIGILL); > - else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) > - sigexit(p, SIGILL); > + if (pc == 0 || pc == BOGO_PC) > + goto sigill; > + > + cookie = SCARG(uap, proc_cookie); > + if (pr->ps_kbind_addr == pc) { > + membar_datadep_consumer(); > + if (pr->ps_kbind_cookie != cookie) > + goto sigill; > + } else { > + mtx_enter(&pr->ps_mtx); > + kpc = pr->ps_kbind_addr; > + > + /* > + * If we're the first thread in (kpc is 0), then > + * remember where the call came from for the future. > + * > + * Maybe another thread raced us and won. pc cannot > + * be BOGO_PC, so this also handles the check for > + * the disabled case. > + */ > + > + if (kpc == 0) { > + pr->ps_kbind_cookie = cookie; > + membar_producer(); > + pr->ps_kbind_addr = pc; > + } else if (kpc != pc || pr->ps_kbind_cookie != cookie) > + goto leave_sigill; > + mtx_leave(&pr->ps_mtx); > + } > + > + psize = SCARG(uap, psize); > if (psize < sizeof(struct __kbind) || psize > sizeof(param)) > return EINVAL; > if ((error = copyin(paramp, ¶m, psize))) > @@ -1187,6 +1219,7 @@ sys_kbind(struct proc *p, void *v, regis > data = (const char *)¶mp[count]; > > /* all looks good, so do the bindings */ > + KERNEL_LOCK(); > last_baseva = VM_MAXUSER_ADDRESS; > kva = 0; > TAILQ_INIT(&dead_entries); > @@ -1239,6 +1272,14 @@ redo: > vm_map_unlock(kernel_map); > } > uvm_unmap_detach(&dead_entries, AMAP_REFALL); > + KERNEL_UNLOCK(); > > return error; > + > +leave_sigill: > + mtx_leave(&pr->ps_mtx); > +sigill: > + KERNEL_LOCK(); > + sigexit(p, SIGILL); > + /* NOTREACHED KERNEL_UNLOCK(); */ > } > Index: sys/proc.h > =================================================================== > RCS file: /cvs/src/sys/sys/proc.h,v > retrieving revision 1.329 > diff -u -p -r1.329 proc.h > --- sys/proc.h 10 Mar 2022 15:21:08 -0000 1.329 > +++ sys/proc.h 17 May 2022 00:36:03 -0000 > @@ -234,8 +234,8 @@ struct process { > uint64_t ps_pledge; > uint64_t ps_execpledge; > > - int64_t ps_kbind_cookie; > - u_long ps_kbind_addr; > + int64_t ps_kbind_cookie; /* [m] */ > + u_long ps_kbind_addr; /* [m] */ > > /* End area that is copied on creation. */ > #define ps_endcopy ps_refcnt >