On 17/05/22(Tue) 10:44, David Gwynne wrote:
> this narrows the scope of the KERNEL_LOCK in kbind(2) so the syscall
> argument checks can be done without the kernel lock.
>
> care is taken to allow the pc/cookie checks to run without any lock by
> being careful with the order of the checks. all modifications to the
> pc/cookie state are serialised by the per process mutex.
I don't understand why it is safe to do the following check without
holding a mutex:
if (pr->ps_kbind_addr == pc)
...
Is there much differences when always grabbing the per-process mutex?
> i dont know enough about uvm to say whether it is safe to unlock the
> actual memory updates too, but even if i was confident i would still
> prefer to change it as a separate step.
I agree.
> Index: kern/init_sysent.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_sysent.c,v
> retrieving revision 1.236
> diff -u -p -r1.236 init_sysent.c
> --- kern/init_sysent.c 1 May 2022 23:00:04 -0000 1.236
> +++ kern/init_sysent.c 17 May 2022 00:36:03 -0000
> @@ -1,4 +1,4 @@
> -/* $OpenBSD: init_sysent.c,v 1.236 2022/05/01 23:00:04 tedu Exp $ */
> +/* $OpenBSD$ */
>
> /*
> * System call switch table.
> @@ -204,7 +204,7 @@ const struct sysent sysent[] = {
> sys_utimensat }, /* 84 = utimensat */
> { 2, s(struct sys_futimens_args), 0,
> sys_futimens }, /* 85 = futimens */
> - { 3, s(struct sys_kbind_args), 0,
> + { 3, s(struct sys_kbind_args), SY_NOLOCK | 0,
> sys_kbind }, /* 86 = kbind */
> { 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0,
> sys_clock_gettime }, /* 87 = clock_gettime */
> Index: kern/syscalls.master
> ===================================================================
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.223
> diff -u -p -r1.223 syscalls.master
> --- kern/syscalls.master 24 Feb 2022 07:41:51 -0000 1.223
> +++ kern/syscalls.master 17 May 2022 00:36:03 -0000
> @@ -194,7 +194,7 @@
> const struct timespec *times, int flag); }
> 85 STD { int sys_futimens(int fd, \
> const struct timespec *times); }
> -86 STD { int sys_kbind(const struct __kbind *param, \
> +86 STD NOLOCK { int sys_kbind(const struct __kbind *param, \
> size_t psize, int64_t proc_cookie); }
> 87 STD NOLOCK { int sys_clock_gettime(clockid_t clock_id, \
> struct timespec *tp); }
> Index: uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.169
> diff -u -p -r1.169 uvm_mmap.c
> --- uvm/uvm_mmap.c 19 Jan 2022 10:43:48 -0000 1.169
> +++ uvm/uvm_mmap.c 17 May 2022 00:36:03 -0000
> @@ -70,6 +70,7 @@
> #include <sys/pledge.h>
> #include <sys/unistd.h> /* for KBIND* */
> #include <sys/user.h>
> +#include <sys/atomic.h>
>
> #include <machine/exec.h> /* for __LDPGSZ */
>
> @@ -1125,33 +1126,64 @@ sys_kbind(struct proc *p, void *v, regis
> const char *data;
> vaddr_t baseva, last_baseva, endva, pageoffset, kva;
> size_t psize, s;
> - u_long pc;
> + u_long pc, kpc;
> int count, i, extra;
> + uint64_t cookie;
> int error;
>
> /*
> * extract syscall args from uap
> */
> paramp = SCARG(uap, param);
> - psize = SCARG(uap, psize);
>
> /* a NULL paramp disables the syscall for the process */
> if (paramp == NULL) {
> + mtx_enter(&pr->ps_mtx);
> if (pr->ps_kbind_addr != 0)
> - sigexit(p, SIGILL);
> + goto leave_sigill;
> pr->ps_kbind_addr = BOGO_PC;
> + mtx_leave(&pr->ps_mtx);
> return 0;
> }
>
> /* security checks */
> +
> + /*
> + * ps_kbind_addr can only be set to 0 or BOGO_PC by the
> + * kernel, not by a call from userland.
> + */
> pc = PROC_PC(p);
> - if (pr->ps_kbind_addr == 0) {
> - pr->ps_kbind_addr = pc;
> - pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
> - } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC)
> - sigexit(p, SIGILL);
> - else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie))
> - sigexit(p, SIGILL);
> + if (pc == 0 || pc == BOGO_PC)
> + goto sigill;
> +
> + cookie = SCARG(uap, proc_cookie);
> + if (pr->ps_kbind_addr == pc) {
> + membar_datadep_consumer();
> + if (pr->ps_kbind_cookie != cookie)
> + goto sigill;
> + } else {
> + mtx_enter(&pr->ps_mtx);
> + kpc = pr->ps_kbind_addr;
> +
> + /*
> + * If we're the first thread in (kpc is 0), then
> + * remember where the call came from for the future.
> + *
> + * Maybe another thread raced us and won. pc cannot
> + * be BOGO_PC, so this also handles the check for
> + * the disabled case.
> + */
> +
> + if (kpc == 0) {
> + pr->ps_kbind_cookie = cookie;
> + membar_producer();
> + pr->ps_kbind_addr = pc;
> + } else if (kpc != pc || pr->ps_kbind_cookie != cookie)
> + goto leave_sigill;
> + mtx_leave(&pr->ps_mtx);
> + }
> +
> + psize = SCARG(uap, psize);
> if (psize < sizeof(struct __kbind) || psize > sizeof(param))
> return EINVAL;
> if ((error = copyin(paramp, ¶m, psize)))
> @@ -1187,6 +1219,7 @@ sys_kbind(struct proc *p, void *v, regis
> data = (const char *)¶mp[count];
>
> /* all looks good, so do the bindings */
> + KERNEL_LOCK();
> last_baseva = VM_MAXUSER_ADDRESS;
> kva = 0;
> TAILQ_INIT(&dead_entries);
> @@ -1239,6 +1272,14 @@ redo:
> vm_map_unlock(kernel_map);
> }
> uvm_unmap_detach(&dead_entries, AMAP_REFALL);
> + KERNEL_UNLOCK();
>
> return error;
> +
> +leave_sigill:
> + mtx_leave(&pr->ps_mtx);
> +sigill:
> + KERNEL_LOCK();
> + sigexit(p, SIGILL);
> + /* NOTREACHED KERNEL_UNLOCK(); */
> }
> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.329
> diff -u -p -r1.329 proc.h
> --- sys/proc.h 10 Mar 2022 15:21:08 -0000 1.329
> +++ sys/proc.h 17 May 2022 00:36:03 -0000
> @@ -234,8 +234,8 @@ struct process {
> uint64_t ps_pledge;
> uint64_t ps_execpledge;
>
> - int64_t ps_kbind_cookie;
> - u_long ps_kbind_addr;
> + int64_t ps_kbind_cookie; /* [m] */
> + u_long ps_kbind_addr; /* [m] */
>
> /* End area that is copied on creation. */
> #define ps_endcopy ps_refcnt
>