On Wed, Jun 15, 2022 at 10:40:35PM -0500, Scott Cheloha wrote:
> On Wed, Jun 15, 2022 at 06:17:07PM -0600, Theo de Raadt wrote:
> > Mark Kettenis <[email protected]> wrote:
> >
> > > Well, I believe that Scott was trying to fix a race condition that can
> > > only happen if code is using kbind(2) incorrectly, i.e. when the
> > > threads deliberately pass different cookies to kbind(2) or execute
> > > kbind(2) from different "text" addresses.
> > >
> > > I still think the solution is simply to accept that race condition.
> >
> > Right.
> >
> > People are not calling kbind. They are calling syscall(SYS_kbind
> >
> > The man page says "don't do that". No user serviceable parts inside.
> > Do not provide to children.
> >
> > That said, Scott is about to share a diff he and I did a few cycles
> > around, to at least make the call-in transaction be a lock.
>
> [...]
>
> This patch reorganizes the logic in sys_kbind() preceding the
> copyin(9) so that we only need to take the kernel lock in a single
> spot if something goes wrong and we need to raise SIGILL.
>
> It also puts the per-process mutex, ps_mtx, around that logic. This
> guarantees that the first thread to reach sys_kbind() sets
> ps_kbind_addr and ps_kbind_cookie, even in oddball situations where
> the program isn't using ld.so(1) correctly.
>
> [...]
10 days, no replies public or private. I trust that means the patch
is fine.
Here is a tweaked patch with the binding loop wrapped with the kernel
lock. We can more carefully determine whether uvm_unmap_remove(),
uvm_map_extract(), and uvm_unmap_detach() are MP-safe in a subsequent
patch. They *look* safe but I can't be sure and nobody volunteered
any thoughts or review for the prior patch so we'll do the
conservative thing and push the kernel lock down, just as dlg@ was
going to do.
If something comes of guenther@'s PT_OPENBSD_KBIND idea we can
trivially remove ps_mtx from this code.
OK?
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.224
diff -u -p -r1.224 syscalls.master
--- kern/syscalls.master 16 May 2022 07:36:04 -0000 1.224
+++ kern/syscalls.master 24 Jun 2022 14:43:26 -0000
@@ -194,7 +194,7 @@
const struct timespec *times, int flag); }
85 STD { int sys_futimens(int fd, \
const struct timespec *times); }
-86 STD { int sys_kbind(const struct __kbind *param, \
+86 STD NOLOCK { int sys_kbind(const struct __kbind *param, \
size_t psize, int64_t proc_cookie); }
87 STD NOLOCK { int sys_clock_gettime(clockid_t clock_id, \
struct timespec *tp); }
Index: kern/init_sysent.c
===================================================================
RCS file: /cvs/src/sys/kern/init_sysent.c,v
retrieving revision 1.237
diff -u -p -r1.237 init_sysent.c
--- kern/init_sysent.c 16 May 2022 07:38:10 -0000 1.237
+++ kern/init_sysent.c 24 Jun 2022 14:43:26 -0000
@@ -1,4 +1,4 @@
-/* $OpenBSD: init_sysent.c,v 1.237 2022/05/16 07:38:10 mvs Exp $ */
+/* $OpenBSD$ */
/*
* System call switch table.
@@ -204,7 +204,7 @@ const struct sysent sysent[] = {
sys_utimensat }, /* 84 = utimensat */
{ 2, s(struct sys_futimens_args), 0,
sys_futimens }, /* 85 = futimens */
- { 3, s(struct sys_kbind_args), 0,
+ { 3, s(struct sys_kbind_args), SY_NOLOCK | 0,
sys_kbind }, /* 86 = kbind */
{ 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0,
sys_clock_gettime }, /* 87 = clock_gettime */
Index: sys/proc.h
===================================================================
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.330
diff -u -p -r1.330 proc.h
--- sys/proc.h 13 May 2022 15:32:00 -0000 1.330
+++ sys/proc.h 24 Jun 2022 14:43:27 -0000
@@ -234,8 +234,8 @@ struct process {
uint64_t ps_pledge;
uint64_t ps_execpledge;
- int64_t ps_kbind_cookie;
- u_long ps_kbind_addr;
+ int64_t ps_kbind_cookie; /* [m] */
+ u_long ps_kbind_addr; /* [m] */
/* End area that is copied on creation. */
#define ps_endcopy ps_refcnt
Index: sys/syscall.h
===================================================================
RCS file: /cvs/src/sys/sys/syscall.h,v
retrieving revision 1.234
diff -u -p -r1.234 syscall.h
--- sys/syscall.h 16 May 2022 07:38:10 -0000 1.234
+++ sys/syscall.h 24 Jun 2022 14:43:27 -0000
@@ -1,4 +1,4 @@
-/* $OpenBSD: syscall.h,v 1.234 2022/05/16 07:38:10 mvs Exp $ */
+/* $OpenBSD$ */
/*
* System call numbers.
Index: sys/syscallargs.h
===================================================================
RCS file: /cvs/src/sys/sys/syscallargs.h,v
retrieving revision 1.237
diff -u -p -r1.237 syscallargs.h
--- sys/syscallargs.h 16 May 2022 07:38:10 -0000 1.237
+++ sys/syscallargs.h 24 Jun 2022 14:43:27 -0000
@@ -1,4 +1,4 @@
-/* $OpenBSD: syscallargs.h,v 1.237 2022/05/16 07:38:10 mvs Exp $ */
+/* $OpenBSD$ */
/*
* System call argument lists.
Index: uvm/uvm_mmap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.169
diff -u -p -r1.169 uvm_mmap.c
--- uvm/uvm_mmap.c 19 Jan 2022 10:43:48 -0000 1.169
+++ uvm/uvm_mmap.c 24 Jun 2022 14:43:27 -0000
@@ -1127,7 +1127,7 @@ sys_kbind(struct proc *p, void *v, regis
size_t psize, s;
u_long pc;
int count, i, extra;
- int error;
+ int error, sigill = 0;
/*
* extract syscall args from uap
@@ -1135,23 +1135,42 @@ sys_kbind(struct proc *p, void *v, regis
paramp = SCARG(uap, param);
psize = SCARG(uap, psize);
- /* a NULL paramp disables the syscall for the process */
- if (paramp == NULL) {
- if (pr->ps_kbind_addr != 0)
- sigexit(p, SIGILL);
- pr->ps_kbind_addr = BOGO_PC;
- return 0;
- }
-
- /* security checks */
+ /*
+ * If paramp is NULL and we're uninitialized, disable the syscall
+ * for the process. Raise SIGILL if paramp is NULL and we're
+ * already initialized.
+ *
+ * If paramp is non-NULL and we're uninitialized, do initialization.
+ * Otherwise, do security checks and raise SIGILL on failure.
+ */
pc = PROC_PC(p);
- if (pr->ps_kbind_addr == 0) {
+ mtx_enter(&pr->ps_mtx);
+ if (paramp == NULL) {
+ if (pr->ps_kbind_addr == 0)
+ pr->ps_kbind_addr = BOGO_PC;
+ else
+ sigill = 1;
+ } else if (pr->ps_kbind_addr == 0) {
pr->ps_kbind_addr = pc;
pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
- } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC)
- sigexit(p, SIGILL);
- else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie))
+ } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
+ pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
+ sigill = 1;
+ }
+ mtx_leave(&pr->ps_mtx);
+
+ /* Raise SIGILL if something is off. */
+ if (sigill) {
+ KERNEL_LOCK();
sigexit(p, SIGILL);
+ /* NOTREACHED */
+ KERNEL_UNLOCK();
+ }
+
+ /* We're done if we were disabling the syscall. */
+ if (paramp == NULL)
+ return 0;
+
if (psize < sizeof(struct __kbind) || psize > sizeof(param))
return EINVAL;
if ((error = copyin(paramp, ¶m, psize)))
@@ -1190,6 +1209,7 @@ sys_kbind(struct proc *p, void *v, regis
last_baseva = VM_MAXUSER_ADDRESS;
kva = 0;
TAILQ_INIT(&dead_entries);
+ KERNEL_LOCK();
for (i = 0; i < count; i++) {
baseva = (vaddr_t)paramp[i].kb_addr;
s = paramp[i].kb_size;
@@ -1239,6 +1259,7 @@ redo:
vm_map_unlock(kernel_map);
}
uvm_unmap_detach(&dead_entries, AMAP_REFALL);
+ KERNEL_UNLOCK();
return error;
}