On Wed, Jun 15, 2022 at 10:40:35PM -0500, Scott Cheloha wrote: > On Wed, Jun 15, 2022 at 06:17:07PM -0600, Theo de Raadt wrote: > > Mark Kettenis <mark.kette...@xs4all.nl> wrote: > > > > > Well, I believe that Scott was trying to fix a race condition that can > > > only happen if code is using kbind(2) incorrectly, i.e. when the > > > threads deliberately pass different cookies to kbind(2) or execute > > > kbind(2) from different "text" addresses. > > > > > > I still think the solution is simply to accept that race condition. > > > > Right. > > > > People are not calling kbind. They are calling syscall(SYS_kbind > > > > The man page says "don't do that". No user serviceable parts inside. > > Do not provide to children. > > > > That said, Scott is about to share a diff he and I did a few cycles > > around, to at least make the call-in transaction be a lock. > > [...] > > This patch reorganizes the logic in sys_kbind() preceding the > copyin(9) so that we only need to take the kernel lock in a single > spot if something goes wrong and we need to raise SIGILL. > > It also puts the per-process mutex, ps_mtx, around that logic. This > guarantees that the first thread to reach sys_kbind() sets > ps_kbind_addr and ps_kbind_cookie, even in oddball situations where > the program isn't using ld.so(1) correctly. > > [...]
10 days, no replies public or private. I trust that means the patch is fine. Here is a tweaked patch with the binding loop wrapped with the kernel lock. We can more carefully determine whether uvm_unmap_remove(), uvm_map_extract(), and uvm_unmap_detach() are MP-safe in a subsequent patch. They *look* safe but I can't be sure and nobody volunteered any thoughts or review for the prior patch so we'll do the conservative thing and push the kernel lock down, just as dlg@ was going to do. If something comes of guenther@'s PT_OPENBSD_KBIND idea we can trivially remove ps_mtx from this code. OK? Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.224 diff -u -p -r1.224 syscalls.master --- kern/syscalls.master 16 May 2022 07:36:04 -0000 1.224 +++ kern/syscalls.master 24 Jun 2022 14:43:26 -0000 @@ -194,7 +194,7 @@ const struct timespec *times, int flag); } 85 STD { int sys_futimens(int fd, \ const struct timespec *times); } -86 STD { int sys_kbind(const struct __kbind *param, \ +86 STD NOLOCK { int sys_kbind(const struct __kbind *param, \ size_t psize, int64_t proc_cookie); } 87 STD NOLOCK { int sys_clock_gettime(clockid_t clock_id, \ struct timespec *tp); } Index: kern/init_sysent.c =================================================================== RCS file: /cvs/src/sys/kern/init_sysent.c,v retrieving revision 1.237 diff -u -p -r1.237 init_sysent.c --- kern/init_sysent.c 16 May 2022 07:38:10 -0000 1.237 +++ kern/init_sysent.c 24 Jun 2022 14:43:26 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: init_sysent.c,v 1.237 2022/05/16 07:38:10 mvs Exp $ */ +/* $OpenBSD$ */ /* * System call switch table. @@ -204,7 +204,7 @@ const struct sysent sysent[] = { sys_utimensat }, /* 84 = utimensat */ { 2, s(struct sys_futimens_args), 0, sys_futimens }, /* 85 = futimens */ - { 3, s(struct sys_kbind_args), 0, + { 3, s(struct sys_kbind_args), SY_NOLOCK | 0, sys_kbind }, /* 86 = kbind */ { 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0, sys_clock_gettime }, /* 87 = clock_gettime */ Index: sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v retrieving revision 1.330 diff -u -p -r1.330 proc.h --- sys/proc.h 13 May 2022 15:32:00 -0000 1.330 +++ sys/proc.h 24 Jun 2022 14:43:27 -0000 @@ -234,8 +234,8 @@ struct process { uint64_t ps_pledge; uint64_t ps_execpledge; - int64_t ps_kbind_cookie; - u_long ps_kbind_addr; + int64_t ps_kbind_cookie; /* [m] */ + u_long ps_kbind_addr; /* [m] */ /* End area that is copied on creation. */ #define ps_endcopy ps_refcnt Index: sys/syscall.h =================================================================== RCS file: /cvs/src/sys/sys/syscall.h,v retrieving revision 1.234 diff -u -p -r1.234 syscall.h --- sys/syscall.h 16 May 2022 07:38:10 -0000 1.234 +++ sys/syscall.h 24 Jun 2022 14:43:27 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscall.h,v 1.234 2022/05/16 07:38:10 mvs Exp $ */ +/* $OpenBSD$ */ /* * System call numbers. Index: sys/syscallargs.h =================================================================== RCS file: /cvs/src/sys/sys/syscallargs.h,v retrieving revision 1.237 diff -u -p -r1.237 syscallargs.h --- sys/syscallargs.h 16 May 2022 07:38:10 -0000 1.237 +++ sys/syscallargs.h 24 Jun 2022 14:43:27 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscallargs.h,v 1.237 2022/05/16 07:38:10 mvs Exp $ */ +/* $OpenBSD$ */ /* * System call argument lists. Index: uvm/uvm_mmap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v retrieving revision 1.169 diff -u -p -r1.169 uvm_mmap.c --- uvm/uvm_mmap.c 19 Jan 2022 10:43:48 -0000 1.169 +++ uvm/uvm_mmap.c 24 Jun 2022 14:43:27 -0000 @@ -1127,7 +1127,7 @@ sys_kbind(struct proc *p, void *v, regis size_t psize, s; u_long pc; int count, i, extra; - int error; + int error, sigill = 0; /* * extract syscall args from uap @@ -1135,23 +1135,42 @@ sys_kbind(struct proc *p, void *v, regis paramp = SCARG(uap, param); psize = SCARG(uap, psize); - /* a NULL paramp disables the syscall for the process */ - if (paramp == NULL) { - if (pr->ps_kbind_addr != 0) - sigexit(p, SIGILL); - pr->ps_kbind_addr = BOGO_PC; - return 0; - } - - /* security checks */ + /* + * If paramp is NULL and we're uninitialized, disable the syscall + * for the process. Raise SIGILL if paramp is NULL and we're + * already initialized. + * + * If paramp is non-NULL and we're uninitialized, do initialization. + * Otherwise, do security checks and raise SIGILL on failure. + */ pc = PROC_PC(p); - if (pr->ps_kbind_addr == 0) { + mtx_enter(&pr->ps_mtx); + if (paramp == NULL) { + if (pr->ps_kbind_addr == 0) + pr->ps_kbind_addr = BOGO_PC; + else + sigill = 1; + } else if (pr->ps_kbind_addr == 0) { pr->ps_kbind_addr = pc; pr->ps_kbind_cookie = SCARG(uap, proc_cookie); - } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC) - sigexit(p, SIGILL); - else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) + } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC || + pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) { + sigill = 1; + } + mtx_leave(&pr->ps_mtx); + + /* Raise SIGILL if something is off. */ + if (sigill) { + KERNEL_LOCK(); sigexit(p, SIGILL); + /* NOTREACHED */ + KERNEL_UNLOCK(); + } + + /* We're done if we were disabling the syscall. */ + if (paramp == NULL) + return 0; + if (psize < sizeof(struct __kbind) || psize > sizeof(param)) return EINVAL; if ((error = copyin(paramp, ¶m, psize))) @@ -1190,6 +1209,7 @@ sys_kbind(struct proc *p, void *v, regis last_baseva = VM_MAXUSER_ADDRESS; kva = 0; TAILQ_INIT(&dead_entries); + KERNEL_LOCK(); for (i = 0; i < count; i++) { baseva = (vaddr_t)paramp[i].kb_addr; s = paramp[i].kb_size; @@ -1239,6 +1259,7 @@ redo: vm_map_unlock(kernel_map); } uvm_unmap_detach(&dead_entries, AMAP_REFALL); + KERNEL_UNLOCK(); return error; }