On 17/05/22(Tue) 10:44, David Gwynne wrote:
> this narrows the scope of the KERNEL_LOCK in kbind(2) so the syscall
> argument checks can be done without the kernel lock.
> 
> care is taken to allow the pc/cookie checks to run without any lock by
> being careful with the order of the checks. all modifications to the
> pc/cookie state are serialised by the per process mutex.

I don't understand why it is safe to do the following check without
holding a mutex:

        if (pr->ps_kbind_addr == pc)
                ...

Is there much differences when always grabbing the per-process mutex?
        
> i dont know enough about uvm to say whether it is safe to unlock the
> actual memory updates too, but even if i was confident i would still
> prefer to change it as a separate step.

I agree.

> Index: kern/init_sysent.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_sysent.c,v
> retrieving revision 1.236
> diff -u -p -r1.236 init_sysent.c
> --- kern/init_sysent.c        1 May 2022 23:00:04 -0000       1.236
> +++ kern/init_sysent.c        17 May 2022 00:36:03 -0000
> @@ -1,4 +1,4 @@
> -/*   $OpenBSD: init_sysent.c,v 1.236 2022/05/01 23:00:04 tedu Exp $  */
> +/*   $OpenBSD$       */
>  
>  /*
>   * System call switch table.
> @@ -204,7 +204,7 @@ const struct sysent sysent[] = {
>           sys_utimensat },                    /* 84 = utimensat */
>       { 2, s(struct sys_futimens_args), 0,
>           sys_futimens },                     /* 85 = futimens */
> -     { 3, s(struct sys_kbind_args), 0,
> +     { 3, s(struct sys_kbind_args), SY_NOLOCK | 0,
>           sys_kbind },                        /* 86 = kbind */
>       { 2, s(struct sys_clock_gettime_args), SY_NOLOCK | 0,
>           sys_clock_gettime },                /* 87 = clock_gettime */
> Index: kern/syscalls.master
> ===================================================================
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.223
> diff -u -p -r1.223 syscalls.master
> --- kern/syscalls.master      24 Feb 2022 07:41:51 -0000      1.223
> +++ kern/syscalls.master      17 May 2022 00:36:03 -0000
> @@ -194,7 +194,7 @@
>                           const struct timespec *times, int flag); }
>  85   STD             { int sys_futimens(int fd, \
>                           const struct timespec *times); }
> -86   STD             { int sys_kbind(const struct __kbind *param, \
> +86   STD NOLOCK      { int sys_kbind(const struct __kbind *param, \
>                           size_t psize, int64_t proc_cookie); }
>  87   STD NOLOCK      { int sys_clock_gettime(clockid_t clock_id, \
>                           struct timespec *tp); }
> Index: uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.169
> diff -u -p -r1.169 uvm_mmap.c
> --- uvm/uvm_mmap.c    19 Jan 2022 10:43:48 -0000      1.169
> +++ uvm/uvm_mmap.c    17 May 2022 00:36:03 -0000
> @@ -70,6 +70,7 @@
>  #include <sys/pledge.h>
>  #include <sys/unistd.h>              /* for KBIND* */
>  #include <sys/user.h>
> +#include <sys/atomic.h>
>  
>  #include <machine/exec.h>    /* for __LDPGSZ */
>  
> @@ -1125,33 +1126,64 @@ sys_kbind(struct proc *p, void *v, regis
>       const char *data;
>       vaddr_t baseva, last_baseva, endva, pageoffset, kva;
>       size_t psize, s;
> -     u_long pc;
> +     u_long pc, kpc;
>       int count, i, extra;
> +     uint64_t cookie;
>       int error;
>  
>       /*
>        * extract syscall args from uap
>        */
>       paramp = SCARG(uap, param);
> -     psize = SCARG(uap, psize);
>  
>       /* a NULL paramp disables the syscall for the process */
>       if (paramp == NULL) {
> +             mtx_enter(&pr->ps_mtx);
>               if (pr->ps_kbind_addr != 0)
> -                     sigexit(p, SIGILL);
> +                     goto leave_sigill;
>               pr->ps_kbind_addr = BOGO_PC;
> +             mtx_leave(&pr->ps_mtx);
>               return 0;
>       }
>  
>       /* security checks */
> +
> +     /*
> +      * ps_kbind_addr can only be set to 0 or BOGO_PC by the
> +      * kernel, not by a call from userland.
> +      */
>       pc = PROC_PC(p);
> -     if (pr->ps_kbind_addr == 0) {
> -             pr->ps_kbind_addr = pc;
> -             pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
> -     } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC)
> -             sigexit(p, SIGILL);
> -     else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie))
> -             sigexit(p, SIGILL);
> +     if (pc == 0 || pc == BOGO_PC)
> +             goto sigill;
> +
> +     cookie = SCARG(uap, proc_cookie);
> +     if (pr->ps_kbind_addr == pc) {
> +             membar_datadep_consumer();
> +             if (pr->ps_kbind_cookie != cookie)
> +                     goto sigill;
> +     } else {
> +             mtx_enter(&pr->ps_mtx);
> +             kpc = pr->ps_kbind_addr;
> +
> +             /*
> +              * If we're the first thread in (kpc is 0), then
> +              * remember where the call came from for the future.
> +              *
> +              * Maybe another thread raced us and won. pc cannot
> +              * be BOGO_PC, so this also handles the check for
> +              * the disabled case.
> +              */
> +
> +             if (kpc == 0) {
> +                     pr->ps_kbind_cookie = cookie;
> +                     membar_producer();
> +                     pr->ps_kbind_addr = pc;
> +             } else if (kpc != pc || pr->ps_kbind_cookie != cookie)
> +                     goto leave_sigill;
> +             mtx_leave(&pr->ps_mtx);
> +     }
> +
> +     psize = SCARG(uap, psize);
>       if (psize < sizeof(struct __kbind) || psize > sizeof(param))
>               return EINVAL;
>       if ((error = copyin(paramp, &param, psize)))
> @@ -1187,6 +1219,7 @@ sys_kbind(struct proc *p, void *v, regis
>       data = (const char *)&paramp[count];
>  
>       /* all looks good, so do the bindings */
> +     KERNEL_LOCK();
>       last_baseva = VM_MAXUSER_ADDRESS;
>       kva = 0;
>       TAILQ_INIT(&dead_entries);
> @@ -1239,6 +1272,14 @@ redo:
>               vm_map_unlock(kernel_map);
>       }
>       uvm_unmap_detach(&dead_entries, AMAP_REFALL);
> +     KERNEL_UNLOCK();
>  
>       return error;
> +
> +leave_sigill:
> +     mtx_leave(&pr->ps_mtx);
> +sigill:
> +     KERNEL_LOCK();
> +     sigexit(p, SIGILL);
> +     /* NOTREACHED KERNEL_UNLOCK(); */
>  }
> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.329
> diff -u -p -r1.329 proc.h
> --- sys/proc.h        10 Mar 2022 15:21:08 -0000      1.329
> +++ sys/proc.h        17 May 2022 00:36:03 -0000
> @@ -234,8 +234,8 @@ struct process {
>       uint64_t ps_pledge;
>       uint64_t ps_execpledge;
>  
> -     int64_t ps_kbind_cookie;
> -     u_long  ps_kbind_addr;
> +     int64_t ps_kbind_cookie;        /* [m] */
> +     u_long  ps_kbind_addr;          /* [m] */
>  
>  /* End area that is copied on creation. */
>  #define ps_endcopy   ps_refcnt
> 

Reply via email to