By assuming that `f_data' is immutable, which AFAIK is true for sockets, we can remove the KERNEL_LOCK() from the following syscalls iff files are refcounted in an MP-safe way.
This diff includes the EBUSY check in dup2(2), which is currently required to avoid races with accept(2) and will later make our lives easier wrt open(2). It also includes the fdinsert() diff I sent earlier. On top of that, I'm introducing a global mutex, `fhdlk', that protects `f_count' and the implicit reference in `filehead'. A socket stays alive as long as its associated file has a positive refcount. When this refcount drops to zero, fdrop() will be called and soclose() will free/clean `f_data'. That's the only place where `f_data' is changed during the life of a socket. That's why it is safe to dereference `f_data' when getsock() has returned a valid & refcounted `fp'. Many ktrace(2) internals now need to grab the KERNEL_LOCK(), just like ptsignal(). Note that for unix, routing and pfkey sockets, solock() still grabs the KERNEL_LOCK(). So even if syscalls are marked as SY_NOLOCK, that doesn't mean they won't grab it. In fact, some network functions like ifa_ifwithaddr() below now need to grab the KERNEL_LOCK(). That's good: it means we're pushing the lock down. Tests? Comments? 
Index: kern/exec_script.c =================================================================== RCS file: /cvs/src/sys/kern/exec_script.c,v retrieving revision 1.44 diff -u -p -r1.44 exec_script.c --- kern/exec_script.c 2 May 2018 02:24:56 -0000 1.44 +++ kern/exec_script.c 22 May 2018 13:21:54 -0000 @@ -170,17 +170,20 @@ check_shell: #endif fdplock(p->p_fd); - error = falloc(p, 0, &fp, &epp->ep_fd); - fdpunlock(p->p_fd); - if (error) + error = falloc(p, &fp, &epp->ep_fd); + if (error) { + fdpunlock(p->p_fd); goto fail; + } epp->ep_flags |= EXEC_HASFD; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t) scriptvp; fp->f_flag = FREAD; - FILE_SET_MATURE(fp, p); + fdinsert(p->p_fd, epp->ep_fd, 0, fp); + fdpunlock(p->p_fd); + FRELE(fp, p); } /* set up the parameters for the recursive check_exec() call */ Index: kern/init_sysent.c =================================================================== RCS file: /cvs/src/sys/kern/init_sysent.c,v retrieving revision 1.191 diff -u -p -r1.191 init_sysent.c --- kern/init_sysent.c 12 Dec 2017 01:13:14 -0000 1.191 +++ kern/init_sysent.c 22 May 2018 13:21:54 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: init_sysent.c,v 1.191 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call switch table. 
@@ -76,17 +76,17 @@ struct sysent sysent[] = { { 0, 0, 0, sys_nosys }, /* 26 = unimplemented ptrace */ #endif - { 3, s(struct sys_recvmsg_args), 0, + { 3, s(struct sys_recvmsg_args), SY_NOLOCK | 0, sys_recvmsg }, /* 27 = recvmsg */ - { 3, s(struct sys_sendmsg_args), 0, + { 3, s(struct sys_sendmsg_args), SY_NOLOCK | 0, sys_sendmsg }, /* 28 = sendmsg */ - { 6, s(struct sys_recvfrom_args), 0, + { 6, s(struct sys_recvfrom_args), SY_NOLOCK | 0, sys_recvfrom }, /* 29 = recvfrom */ - { 3, s(struct sys_accept_args), 0, + { 3, s(struct sys_accept_args), SY_NOLOCK | 0, sys_accept }, /* 30 = accept */ - { 3, s(struct sys_getpeername_args), 0, + { 3, s(struct sys_getpeername_args), SY_NOLOCK | 0, sys_getpeername }, /* 31 = getpeername */ - { 3, s(struct sys_getsockname_args), 0, + { 3, s(struct sys_getsockname_args), SY_NOLOCK | 0, sys_getsockname }, /* 32 = getsockname */ { 2, s(struct sys_access_args), 0, sys_access }, /* 33 = access */ @@ -218,7 +218,7 @@ struct sysent sysent[] = { sys_nanosleep }, /* 91 = nanosleep */ { 3, s(struct sys_fcntl_args), 0, sys_fcntl }, /* 92 = fcntl */ - { 4, s(struct sys_accept4_args), 0, + { 4, s(struct sys_accept4_args), SY_NOLOCK | 0, sys_accept4 }, /* 93 = accept4 */ { 5, s(struct sys___thrsleep_args), 0, sys___thrsleep }, /* 94 = __thrsleep */ @@ -226,9 +226,9 @@ struct sysent sysent[] = { sys_fsync }, /* 95 = fsync */ { 3, s(struct sys_setpriority_args), 0, sys_setpriority }, /* 96 = setpriority */ - { 3, s(struct sys_socket_args), 0, + { 3, s(struct sys_socket_args), SY_NOLOCK | 0, sys_socket }, /* 97 = socket */ - { 3, s(struct sys_connect_args), 0, + { 3, s(struct sys_connect_args), SY_NOLOCK | 0, sys_connect }, /* 98 = connect */ { 3, s(struct sys_getdents_args), 0, sys_getdents }, /* 99 = getdents */ @@ -240,11 +240,11 @@ struct sysent sysent[] = { sys_dup3 }, /* 102 = dup3 */ { 1, s(struct sys_sigreturn_args), 0, sys_sigreturn }, /* 103 = sigreturn */ - { 3, s(struct sys_bind_args), 0, + { 3, s(struct sys_bind_args), SY_NOLOCK | 0, 
sys_bind }, /* 104 = bind */ - { 5, s(struct sys_setsockopt_args), 0, + { 5, s(struct sys_setsockopt_args), SY_NOLOCK | 0, sys_setsockopt }, /* 105 = setsockopt */ - { 2, s(struct sys_listen_args), 0, + { 2, s(struct sys_listen_args), SY_NOLOCK | 0, sys_listen }, /* 106 = listen */ { 4, s(struct sys_chflagsat_args), 0, sys_chflagsat }, /* 107 = chflagsat */ @@ -268,7 +268,7 @@ struct sysent sysent[] = { sys_nosys }, /* 116 = obsolete t32_gettimeofday */ { 0, 0, 0, sys_nosys }, /* 117 = obsolete t32_getrusage */ - { 5, s(struct sys_getsockopt_args), 0, + { 5, s(struct sys_getsockopt_args), SY_NOLOCK | 0, sys_getsockopt }, /* 118 = getsockopt */ { 3, s(struct sys_thrkill_args), 0, sys_thrkill }, /* 119 = thrkill */ @@ -298,11 +298,11 @@ struct sysent sysent[] = { sys_flock }, /* 131 = flock */ { 2, s(struct sys_mkfifo_args), 0, sys_mkfifo }, /* 132 = mkfifo */ - { 6, s(struct sys_sendto_args), 0, + { 6, s(struct sys_sendto_args), SY_NOLOCK | 0, sys_sendto }, /* 133 = sendto */ - { 2, s(struct sys_shutdown_args), 0, + { 2, s(struct sys_shutdown_args), SY_NOLOCK | 0, sys_shutdown }, /* 134 = shutdown */ - { 4, s(struct sys_socketpair_args), 0, + { 4, s(struct sys_socketpair_args), SY_NOLOCK | 0, sys_socketpair }, /* 135 = socketpair */ { 2, s(struct sys_mkdir_args), 0, sys_mkdir }, /* 136 = mkdir */ Index: kern/kern_descrip.c =================================================================== RCS file: /cvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.158 diff -u -p -r1.158 kern_descrip.c --- kern/kern_descrip.c 8 May 2018 09:03:58 -0000 1.158 +++ kern/kern_descrip.c 22 May 2018 13:21:54 -0000 @@ -67,6 +67,7 @@ /* * Descriptor management. 
*/ +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_NONE); struct filelist filehead; /* head of list of open files */ int numfiles; /* actual number of open files */ @@ -144,6 +145,23 @@ find_last_set(struct filedesc *fd, int l return i; } +static __inline int +fd_inuse(struct filedesc *fdp, int fd) +{ + u_int off = fd >> NDENTRYSHIFT; + + if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) + return 1; + + if (fdp->fd_lomap[off] != ~0) + return 0; + + if (fdp->fd_himap[off >> NDENTRYSHIFT] & (1 << (off & NDENTRYMASK))) + return 1; + + return 0; +} + static __inline void fd_used(struct filedesc *fdp, int fd) { @@ -184,16 +202,18 @@ fd_iterfile(struct file *fp, struct proc { struct file *nfp; + mtx_enter(&fhdlk); if (fp == NULL) nfp = LIST_FIRST(&filehead); else nfp = LIST_NEXT(fp, f_list); - /* don't FREF when f_count == 0 to avoid race in fdrop() */ - while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp))) + /* don't refcount when f_count == 0 to avoid race in fdrop() */ + while (nfp != NULL && nfp->f_count == 0) nfp = LIST_NEXT(nfp, f_list); if (nfp != NULL) - FREF(nfp); + nfp->f_count++; + mtx_leave(&fhdlk); if (fp != NULL) FRELE(fp, p); @@ -206,13 +226,17 @@ fd_getfile(struct filedesc *fdp, int fd) { struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) - return (NULL); + vfs_stall_barrier(); - if (!FILE_IS_USABLE(fp)) + if ((u_int)fd >= fdp->fd_nfiles) return (NULL); - FREF(fp); + mtx_enter(&fhdlk); + fp = fdp->fd_ofiles[fd]; + if (fp != NULL) + fp->f_count++; + mtx_leave(&fhdlk); + return (fp); } @@ -634,18 +658,22 @@ finishdup(struct proc *p, struct file *f return (EDEADLK); } - /* - * Don't fd_getfile here. We want to closef LARVAL files and - * closef can deal with that. 
- */ + mtx_enter(&fhdlk); oldfp = fdp->fd_ofiles[new]; if (oldfp != NULL) - FREF(oldfp); + oldfp->f_count++; + mtx_leave(&fhdlk); + + if (dup2 && oldfp == NULL) { + if (fd_inuse(fdp, new)) { + FRELE(fp, p); + return (EBUSY); + } + fd_used(fdp, new); + } fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE; - if (dup2 && oldfp == NULL) - fd_used(fdp, new); *retval = new; if (oldfp != NULL) { @@ -658,6 +686,25 @@ finishdup(struct proc *p, struct file *f } void +fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp) +{ + struct file *fq; + + fdpassertlocked(fdp); + + mtx_enter(&fhdlk); + if ((fq = fdp->fd_ofiles[0]) != NULL) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + fdp->fd_ofiles[fd] = fp; + fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); + fp->f_iflags |= FIF_INSERTED; + mtx_leave(&fhdlk); +} + +void fdremove(struct filedesc *fdp, int fd) { fdpassertlocked(fdp); @@ -670,21 +717,14 @@ int fdrelease(struct proc *p, int fd) { struct filedesc *fdp = p->p_fd; - struct file **fpp, *fp; + struct file *fp; fdpassertlocked(fdp); - /* - * Don't fd_getfile here. We want to closef LARVAL files and closef - * can deal with that. - */ - fpp = &fdp->fd_ofiles[fd]; - fp = *fpp; + fp = fd_getfile(fdp, fd); if (fp == NULL) return (EBADF); - FREF(fp); - *fpp = NULL; - fd_unused(fdp, fd); + fdremove(fdp, fd); if (fd < fdp->fd_knlistsize) knote_fdclose(p, fd); return (closef(fp, p)); @@ -927,9 +967,9 @@ fdexpand(struct proc *p) * a file descriptor for the process that refers to it. 
*/ int -falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd) +falloc(struct proc *p, struct file **resultfp, int *resultfd) { - struct file *fp, *fq; + struct file *fp; int error, i; KASSERT(resultfp != NULL); @@ -958,20 +998,16 @@ restart: numfiles++; fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO); mtx_init(&fp->f_mtx, IPL_NONE); - fp->f_iflags = FIF_LARVAL; - if ((fq = p->p_fd->fd_ofiles[0]) != NULL) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - p->p_fd->fd_ofiles[i] = fp; - p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE); fp->f_count = 1; fp->f_cred = p->p_ucred; crhold(fp->f_cred); *resultfp = fp; *resultfd = i; - FREF(fp); + + mtx_enter(&fhdlk); + fp->f_count++; + mtx_leave(&fhdlk); + return (0); } @@ -1063,6 +1099,7 @@ fdcopy(struct process *pr) newfdp->fd_flags = fdp->fd_flags; newfdp->fd_cmask = fdp->fd_cmask; + mtx_enter(&fhdlk); for (i = 0; i <= fdp->fd_lastfile; i++) { struct file *fp = fdp->fd_ofiles[i]; @@ -1079,12 +1116,13 @@ fdcopy(struct process *pr) fp->f_type == DTYPE_KQUEUE) continue; - FREF(fp); + fp->f_count++; newfdp->fd_ofiles[i] = fp; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fd_used(newfdp, i); } } + mtx_leave(&fhdlk); fdpunlock(fdp); return (newfdp); @@ -1106,8 +1144,9 @@ fdfree(struct proc *p) for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) { fp = *fpp; if (fp != NULL) { - FREF(fp); *fpp = NULL; + /* closef() expects a refcount of 2 */ + FREF(fp); (void) closef(fp, p); } } @@ -1145,11 +1184,11 @@ closef(struct file *fp, struct proc *p) if (fp == NULL) return (0); -#ifdef DIAGNOSTIC - if (fp->f_count < 2) - panic("closef: count (%ld) < 2", fp->f_count); -#endif + KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count); + + mtx_enter(&fhdlk); fp->f_count--; + mtx_leave(&fhdlk); /* * POSIX record locking dictates that any close releases ALL @@ -1181,18 +1220,19 @@ fdrop(struct file *fp, struct proc *p) { int error; -#ifdef DIAGNOSTIC - if (fp->f_count != 0) 
- panic("fdrop: count (%ld) != 0", fp->f_count); -#endif + MUTEX_ASSERT_LOCKED(&fhdlk); + + KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count); + + if (fp->f_iflags & FIF_INSERTED) + LIST_REMOVE(fp, f_list); + mtx_leave(&fhdlk); if (fp->f_ops) error = (*fp->f_ops->fo_close)(fp, p); else error = 0; - /* Free fp */ - LIST_REMOVE(fp, f_list); crfree(fp->f_cred); numfiles--; pool_put(&file_pool, fp); @@ -1307,7 +1347,7 @@ dupfdopen(struct proc *p, int indx, int * of file descriptors, or the fd to be dup'd has already been * closed, reject. Note, there is no need to check for new == old * because fd_getfile will return NULL if the file at indx is - * newly created by falloc (FIF_LARVAL). + * newly created by falloc. */ if ((wfp = fd_getfile(fdp, dupfd)) == NULL) return (EBADF); Index: kern/kern_event.c =================================================================== RCS file: /cvs/src/sys/kern/kern_event.c,v retrieving revision 1.88 diff -u -p -r1.88 kern_event.c --- kern/kern_event.c 27 Apr 2018 10:13:37 -0000 1.88 +++ kern/kern_event.c 22 May 2018 13:21:54 -0000 @@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, regi int fd, error; fdplock(fdp); - error = falloc(p, 0, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) - return (error); + goto out; fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; @@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, regi if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ kq->kq_fdp = fdp; - FILE_SET_MATURE(fp, p); - return (0); + fdinsert(fdp, fd, 0, fp); + FRELE(fp, p); +out: + fdpunlock(fdp); + return (error); } int Index: kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v retrieving revision 1.195 diff -u -p -r1.195 kern_exec.c --- kern/kern_exec.c 28 Apr 2018 03:13:04 -0000 1.195 +++ kern/kern_exec.c 22 May 2018 13:21:54 -0000 @@ -584,7 +584,7 @@ 
sys_execve(struct proc *p, void *v, regi struct vnode *vp; int indx; - if ((error = falloc(p, 0, &fp, &indx)) != 0) + if ((error = falloc(p, &fp, &indx)) != 0) break; #ifdef DIAGNOSTIC if (indx != i) @@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, regi fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; - FILE_SET_MATURE(fp, p); - } else { - FRELE(fp, p); + fdinsert(p->p_fd, indx, 0, fp); } + FRELE(fp, p); } fdpunlock(p->p_fd); if (error) Index: kern/kern_ktrace.c =================================================================== RCS file: /cvs/src/sys/kern/kern_ktrace.c,v retrieving revision 1.96 diff -u -p -r1.96 kern_ktrace.c --- kern/kern_ktrace.c 28 Apr 2018 03:13:04 -0000 1.96 +++ kern/kern_ktrace.c 22 May 2018 13:21:54 -0000 @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui struct ktr_header kth; struct ktr_genio ktp; caddr_t cp; - int count; + int count, error; int buflen; atomic_setbits_int(&p->p_flag, P_INKTR); @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui if (copyin(iov->iov_base, cp, count)) break; - if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0) + KERNEL_LOCK(); + error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count); + KERNEL_UNLOCK(); + if (error != 0) break; iov->iov_len -= count; @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na { struct ktr_header kth; - KERNEL_ASSERT_LOCKED(); atomic_setbits_int(&p->p_flag, P_INKTR); ktrinitheader(&kth, p, KTR_STRUCT); - + if (data == NULL) datalen = 0; + KERNEL_LOCK(); ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin kp.code = code; kp.syscall = syscall; + KERNEL_LOCK(); ktrwrite(p, &kth, &kp, sizeof(kp)); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn struct iovec aiov[3]; struct process *pr; int error; + + 
KERNEL_ASSERT_LOCKED(); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; Index: kern/kern_pledge.c =================================================================== RCS file: /cvs/src/sys/kern/kern_pledge.c,v retrieving revision 1.230 diff -u -p -r1.230 kern_pledge.c --- kern/kern_pledge.c 28 Apr 2018 12:49:21 -0000 1.230 +++ kern/kern_pledge.c 22 May 2018 13:21:54 -0000 @@ -523,6 +523,7 @@ pledge_fail(struct proc *p, int error, u if (p->p_p->ps_pledge & PLEDGE_ERROR) return (ENOSYS); + KERNEL_LOCK(); log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n", p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall); p->p_p->ps_acflag |= APLEDGE; @@ -535,6 +536,7 @@ pledge_fail(struct proc *p, int error, u psignal(p, SIGABRT); p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */ + KERNEL_UNLOCK(); return (error); } Index: kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.337 diff -u -p -r1.337 kern_sysctl.c --- kern/kern_sysctl.c 16 May 2018 14:53:43 -0000 1.337 +++ kern/kern_sysctl.c 22 May 2018 13:21:54 -0000 @@ -1059,7 +1059,9 @@ fill_file(struct kinfo_file *kf, struct kf->f_flag = fp->f_flag; kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; + mtx_enter(&fhdlk); kf->f_count = fp->f_count; + mtx_leave(&fhdlk); if (show_pointers) kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = fp->f_cred->cr_uid; Index: kern/sys_pipe.c =================================================================== RCS file: /cvs/src/sys/kern/sys_pipe.c,v retrieving revision 1.78 diff -u -p -r1.78 sys_pipe.c --- kern/sys_pipe.c 10 Apr 2018 09:17:45 -0000 1.78 +++ kern/sys_pipe.c 22 May 2018 13:21:54 -0000 @@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int fl fdplock(fdp); - error = falloc(p, cloexec, &rf, &fds[0]); + error = falloc(p, &rf, &fds[0]); if (error != 0) goto free2; rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -162,7 +162,7 @@ dopipe(struct proc *p, int *ufds, 
int fl rf->f_data = rpipe; rf->f_ops = &pipeops; - error = falloc(p, cloexec, &wf, &fds[1]); + error = falloc(p, &wf, &fds[1]); if (error != 0) goto free3; wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int fl rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - FILE_SET_MATURE(rf, p); - FILE_SET_MATURE(wf, p); + fdinsert(fdp, fds[0], cloexec, rf); + fdinsert(fdp, fds[1], cloexec, wf); error = copyout(fds, ufds, sizeof(fds)); if (error != 0) { @@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int fl ktrfds(p, fds, 2); #endif fdpunlock(fdp); + + FRELE(rf, p); + FRELE(wf, p); return (error); free3: Index: kern/syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/syscalls.c,v retrieving revision 1.190 diff -u -p -r1.190 syscalls.c --- kern/syscalls.c 12 Dec 2017 01:13:14 -0000 1.190 +++ kern/syscalls.c 22 May 2018 13:21:54 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscalls.c,v 1.190 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call names. 
Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.180 diff -u -p -r1.180 syscalls.master --- kern/syscalls.master 12 Dec 2017 01:12:34 -0000 1.180 +++ kern/syscalls.master 22 May 2018 13:21:54 -0000 @@ -88,18 +88,18 @@ #else 26 UNIMPL ptrace #endif -27 STD { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ +27 STD NOLOCK { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ int flags); } -28 STD { ssize_t sys_sendmsg(int s, \ +28 STD NOLOCK { ssize_t sys_sendmsg(int s, \ const struct msghdr *msg, int flags); } -29 STD { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ +29 STD NOLOCK { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ int flags, struct sockaddr *from, \ socklen_t *fromlenaddr); } -30 STD { int sys_accept(int s, struct sockaddr *name, \ +30 STD NOLOCK { int sys_accept(int s, struct sockaddr *name, \ socklen_t *anamelen); } -31 STD { int sys_getpeername(int fdes, struct sockaddr *asa, \ +31 STD NOLOCK { int sys_getpeername(int fdes, struct sockaddr *asa, \ socklen_t *alen); } -32 STD { int sys_getsockname(int fdes, struct sockaddr *asa, \ +32 STD NOLOCK { int sys_getsockname(int fdes, struct sockaddr *asa, \ socklen_t *alen); } 33 STD { int sys_access(const char *path, int amode); } 34 STD { int sys_chflags(const char *path, u_int flags); } @@ -205,26 +205,26 @@ 91 STD { int sys_nanosleep(const struct timespec *rqtp, \ struct timespec *rmtp); } 92 STD { int sys_fcntl(int fd, int cmd, ... 
void *arg); } -93 STD { int sys_accept4(int s, struct sockaddr *name, \ +93 STD NOLOCK { int sys_accept4(int s, struct sockaddr *name, \ socklen_t *anamelen, int flags); } 94 STD { int sys___thrsleep(const volatile void *ident, \ clockid_t clock_id, const struct timespec *tp, \ void *lock, const int *abort); } 95 STD { int sys_fsync(int fd); } 96 STD { int sys_setpriority(int which, id_t who, int prio); } -97 STD { int sys_socket(int domain, int type, int protocol); } -98 STD { int sys_connect(int s, const struct sockaddr *name, \ +97 STD NOLOCK { int sys_socket(int domain, int type, int protocol); } +98 STD NOLOCK { int sys_connect(int s, const struct sockaddr *name, \ socklen_t namelen); } 99 STD { int sys_getdents(int fd, void *buf, size_t buflen); } 100 STD { int sys_getpriority(int which, id_t who); } 101 STD { int sys_pipe2(int *fdp, int flags); } 102 STD { int sys_dup3(int from, int to, int flags); } 103 STD { int sys_sigreturn(struct sigcontext *sigcntxp); } -104 STD { int sys_bind(int s, const struct sockaddr *name, \ +104 STD NOLOCK { int sys_bind(int s, const struct sockaddr *name, \ socklen_t namelen); } -105 STD { int sys_setsockopt(int s, int level, int name, \ +105 STD NOLOCK { int sys_setsockopt(int s, int level, int name, \ const void *val, socklen_t valsize); } -106 STD { int sys_listen(int s, int backlog); } +106 STD NOLOCK { int sys_listen(int s, int backlog); } 107 STD { int sys_chflagsat(int fd, const char *path, \ u_int flags, int atflags); } 108 STD { int sys_pledge(const char *promises, \ @@ -243,7 +243,7 @@ 115 OBSOL vtrace 116 OBSOL t32_gettimeofday 117 OBSOL t32_getrusage -118 STD { int sys_getsockopt(int s, int level, int name, \ +118 STD NOLOCK { int sys_getsockopt(int s, int level, int name, \ void *val, socklen_t *avalsize); } 119 STD { int sys_thrkill(pid_t tid, int signum, void *tcb); } 120 STD { ssize_t sys_readv(int fd, \ @@ -261,11 +261,11 @@ 130 OBSOL oftruncate 131 STD { int sys_flock(int fd, int how); } 132 STD { int 
sys_mkfifo(const char *path, mode_t mode); } -133 STD { ssize_t sys_sendto(int s, const void *buf, \ +133 STD NOLOCK { ssize_t sys_sendto(int s, const void *buf, \ size_t len, int flags, const struct sockaddr *to, \ socklen_t tolen); } -134 STD { int sys_shutdown(int s, int how); } -135 STD { int sys_socketpair(int domain, int type, \ +134 STD NOLOCK { int sys_shutdown(int s, int how); } +135 STD NOLOCK { int sys_socketpair(int domain, int type, \ int protocol, int *rsv); } 136 STD { int sys_mkdir(const char *path, mode_t mode); } 137 STD { int sys_rmdir(const char *path); } Index: kern/tty_pty.c =================================================================== RCS file: /cvs/src/sys/kern/tty_pty.c,v retrieving revision 1.84 diff -u -p -r1.84 tty_pty.c --- kern/tty_pty.c 28 Apr 2018 03:13:04 -0000 1.84 +++ kern/tty_pty.c 22 May 2018 13:21:54 -0000 @@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t case PTMGET: fdplock(fdp); /* Grab two filedescriptors. */ - if ((error = falloc(p, 0, &cfp, &cindx)) != 0) { + if ((error = falloc(p, &cfp, &cindx)) != 0) { fdpunlock(fdp); break; } - if ((error = falloc(p, 0, &sfp, &sindx)) != 0) { + if ((error = falloc(p, &sfp, &sindx)) != 0) { fdremove(fdp, cindx); closef(cfp, p); fdpunlock(fdp); @@ -1166,11 +1166,12 @@ retry: memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn)); memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn)); - /* mark the files mature now that we've passed all errors */ - FILE_SET_MATURE(cfp, p); - FILE_SET_MATURE(sfp, p); - + /* insert files now that we've passed all errors */ + fdinsert(fdp, cindx, 0, cfp); + fdinsert(fdp, sindx, 0, sfp); fdpunlock(fdp); + FRELE(cfp, p); + FRELE(sfp, p); break; default: error = EINVAL; Index: kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.171 diff -u -p -r1.171 uipc_syscalls.c --- kern/uipc_syscalls.c 22 May 2018 09:51:01 -0000 1.171 +++ kern/uipc_syscalls.c 
22 May 2018 13:21:54 -0000 @@ -101,13 +101,14 @@ sys_socket(struct proc *p, void *v, regi fflag = FREAD | FWRITE | (nonblock ? FNONBLOCK : 0); error = socreate(SCARG(uap, domain), &so, type, SCARG(uap, protocol)); - if (error != 0) - goto out; + if (error) + return (error); + KERNEL_LOCK(); fdplock(fdp); - error = falloc(p, cloexec, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) { + fdpunlock(fdp); soclose(so); } else { fp->f_flag = fflag; @@ -117,10 +118,12 @@ sys_socket(struct proc *p, void *v, regi so->so_state |= SS_NBIO; so->so_state |= ss; fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, fd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = fd; } -out: + KERNEL_UNLOCK(); return (error); } @@ -272,7 +275,9 @@ doaccept(struct proc *p, int sock, struc socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int nflag; + int cloexec, nflag; + + cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; if (name && (error = copyin(anamelen, &namelen, sizeof (namelen)))) return (error); @@ -282,7 +287,7 @@ doaccept(struct proc *p, int sock, struc headfp = fp; fdplock(fdp); - error = falloc(p, (flags & SOCK_CLOEXEC) ? 
UF_EXCLOSE : 0, &fp, &tmpfd); + error = falloc(p, &fp, &tmpfd); fdpunlock(fdp); if (error) { FRELE(headfp, p); @@ -347,8 +352,11 @@ out: else so->so_state &= ~SS_NBIO; sounlock(s); + fdplock(fdp); fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = tmpfd; } else { sounlock(s); @@ -475,14 +483,15 @@ sys_socketpair(struct proc *p, void *v, if (error != 0) goto free2; } + KERNEL_LOCK(); fdplock(fdp); - if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0) + if ((error = falloc(p, &fp1, &sv[0])) != 0) goto free3; fp1->f_flag = fflag; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = so1; - if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0) + if ((error = falloc(p, &fp2, &sv[1])) != 0) goto free4; fp2->f_flag = fflag; fp2->f_type = DTYPE_SOCKET; @@ -500,9 +509,12 @@ sys_socketpair(struct proc *p, void *v, (*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type, p); } - FILE_SET_MATURE(fp1, p); - FILE_SET_MATURE(fp2, p); + fdinsert(fdp, sv[0], cloexec, fp1); + fdinsert(fdp, sv[1], cloexec, fp2); fdpunlock(fdp); + FRELE(fp1, p); + FRELE(fp2, p); + KERNEL_UNLOCK(); return (0); } fdremove(fdp, sv[1]); @@ -514,6 +526,7 @@ free4: so1 = NULL; free3: fdpunlock(fdp); + KERNEL_UNLOCK(); free2: if (so2 != NULL) (void)soclose(so2); @@ -678,13 +691,16 @@ sendit(struct proc *p, int s, struct msg } #endif len = auio.uio_resid; - error = sosend(fp->f_data, to, &auio, NULL, control, flags); + error = sosend(so, to, &auio, NULL, control, flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; - if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) + if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) { + KERNEL_LOCK(); ptsignal(p, SIGPIPE, STHREAD); + KERNEL_UNLOCK(); + } } if (error == 0) { *retsize = len - auio.uio_resid; @@ -925,11 +941,13 @@ sys_shutdown(struct proc *p, void *v, re syscallarg(int) how; } */ *uap = v; struct file *fp; + 
struct socket *so; int error; if ((error = getsock(p, SCARG(uap, s), &fp)) != 0) return (error); - error = soshutdown(fp->f_data, SCARG(uap, how)); + so = fp->f_data; + error = soshutdown(so, SCARG(uap, how)); FRELE(fp, p); return (error); } @@ -1163,7 +1181,8 @@ getsock(struct proc *p, int fdes, struct { struct file *fp; - if ((fp = fd_getfile(p->p_fd, fdes)) == NULL) + fp = fd_getfile(p->p_fd, fdes); + if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_SOCKET) { FRELE(fp, p); Index: kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.126 diff -u -p -r1.126 uipc_usrreq.c --- kern/uipc_usrreq.c 28 Apr 2018 03:13:04 -0000 1.126 +++ kern/uipc_usrreq.c 22 May 2018 13:21:54 -0000 @@ -899,6 +899,7 @@ unp_gc(void *arg __unused) fp = defer->ud_fp[i].fp; if (fp == NULL) continue; + /* closef() expects a refcount of 2 */ FREF(fp); if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; @@ -915,6 +916,8 @@ unp_gc(void *arg __unused) do { nunref = 0; LIST_FOREACH(unp, &unp_head, unp_link) { + mtx_enter(&fhdlk); + fp = unp->unp_file; if (unp->unp_flags & UNP_GCDEFER) { /* * This socket is referenced by another @@ -925,8 +928,9 @@ unp_gc(void *arg __unused) unp_defer--; } else if (unp->unp_flags & UNP_GCMARK) { /* marked as live in previous pass */ + mtx_leave(&fhdlk); continue; - } else if ((fp = unp->unp_file) == NULL) { + } else if (fp == NULL) { /* not being passed, so can't be in loop */ } else if (fp->f_count == 0) { /* @@ -943,9 +947,11 @@ unp_gc(void *arg __unused) if (fp->f_count == unp->unp_msgcount) { nunref++; unp->unp_flags |= UNP_GCDEAD; + mtx_leave(&fhdlk); continue; } } + mtx_leave(&fhdlk); /* * This is the first time we've seen this socket on Index: kern/vfs_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.283 diff -u -p -r1.283 vfs_syscalls.c --- 
kern/vfs_syscalls.c 8 May 2018 08:53:41 -0000 1.283 +++ kern/vfs_syscalls.c 22 May 2018 13:21:54 -0000 @@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const c struct file *fp; struct vnode *vp; struct vattr vattr; - int flags, cmode; + int flags, cloexec, cmode; int type, indx, error, localtrunc = 0; struct flock lf; struct nameidata nd; @@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const c return (error); } - fdplock(fdp); + cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0; - if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) + fdplock(fdp); + if ((error = falloc(p, &fp, &indx)) != 0) goto out; flags = FFLAGS(oflags); if (flags & FREAD) @@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const c } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, indx, cloexec, fp); + FRELE(fp, p); out: fdpunlock(fdp); return (error); @@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, regi struct vnode *vp = NULL; struct mount *mp; struct ucred *cred = p->p_ucred; - int flags; + int flags, cloexec; int type, indx, error=0; struct flock lf; struct vattr va; @@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, regi if ((flags & O_CREAT)) return (EINVAL); + cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0; + fdplock(fdp); - if ((error = falloc(p, (flags & O_CLOEXEC) ? 
UF_EXCLOSE : 0, &fp, - &indx)) != 0) { + if ((error = falloc(p, &fp, &indx)) != 0) { fp = NULL; goto bad; } @@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, regi } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); - + fdinsert(fdp, indx, cloexec, fp); fdpunlock(fdp); + FRELE(fp, p); return (0); bad: Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.552 diff -u -p -r1.552 if.c --- net/if.c 17 May 2018 11:04:14 -0000 1.552 +++ net/if.c 22 May 2018 13:21:54 -0000 @@ -1378,7 +1378,7 @@ ifa_ifwithaddr(struct sockaddr *addr, u_ struct ifaddr *ifa; u_int rdomain; - KERNEL_ASSERT_LOCKED(); + KERNEL_LOCK(); rdomain = rtable_l2(rtableid); TAILQ_FOREACH(ifp, &ifnet, if_list) { if (ifp->if_rdomain != rdomain) @@ -1388,10 +1388,13 @@ ifa_ifwithaddr(struct sockaddr *addr, u_ if (ifa->ifa_addr->sa_family != addr->sa_family) continue; - if (equal(addr, ifa->ifa_addr)) + if (equal(addr, ifa->ifa_addr)) { + KERNEL_UNLOCK(); return (ifa); + } } } + KERNEL_UNLOCK(); return (NULL); } @@ -1404,8 +1407,8 @@ ifa_ifwithdstaddr(struct sockaddr *addr, struct ifnet *ifp; struct ifaddr *ifa; - KERNEL_ASSERT_LOCKED(); rdomain = rtable_l2(rdomain); + KERNEL_LOCK(); TAILQ_FOREACH(ifp, &ifnet, if_list) { if (ifp->if_rdomain != rdomain) continue; @@ -1414,11 +1417,14 @@ ifa_ifwithdstaddr(struct sockaddr *addr, if (ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_dstaddr == NULL) continue; - if (equal(addr, ifa->ifa_dstaddr)) + if (equal(addr, ifa->ifa_dstaddr)) { + KERNEL_UNLOCK(); return (ifa); + } } } } + KERNEL_UNLOCK(); return (NULL); } Index: sys/file.h =================================================================== RCS file: /cvs/src/sys/sys/file.h,v retrieving revision 1.45 diff -u -p -r1.45 file.h --- sys/file.h 9 May 2018 08:42:02 -0000 1.45 +++ sys/file.h 22 May 2018 13:21:54 -0000 @@ -65,6 +65,7 @@ struct fileops { * * Locks used to protect struct members in this file: * 
I immutable after creation + * F global `fhdlk' mutex * f per file `f_mtx' * k kernel lock */ @@ -77,7 +78,7 @@ struct file { #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_KQUEUE 4 /* event queue */ short f_type; /* [I] descriptor type */ - long f_count; /* [k] reference count */ + long f_count; /* [F] reference count */ struct ucred *f_cred; /* [I] credentials associated with descriptor */ struct fileops *f_ops; /* [I] file operation pointers */ off_t f_offset; /* [k] */ @@ -91,26 +92,31 @@ struct file { }; #define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */ -#define FIF_LARVAL 0x02 /* not fully constructed, don't use */ - -#define FILE_IS_USABLE(fp) \ - (((fp)->f_iflags & FIF_LARVAL) == 0) +#define FIF_INSERTED 0x80 /* present in `filehead' */ #define FREF(fp) \ do { \ extern void vfs_stall_barrier(void); \ vfs_stall_barrier(); \ + mtx_enter(&fhdlk); \ (fp)->f_count++; \ + mtx_leave(&fhdlk); \ } while (0) -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) -#define FILE_SET_MATURE(fp,p) do { \ - (fp)->f_iflags &= ~FIF_LARVAL; \ - FRELE(fp, p); \ -} while (0) +#define FRELE(fp,p) \ +({ \ + int rv = 0; \ + mtx_enter(&fhdlk); \ + if (--(fp)->f_count == 0) \ + rv = fdrop(fp, p); \ + else \ + mtx_leave(&fhdlk); \ + rv; \ +}) int fdrop(struct file *, struct proc *); +extern struct mutex fhdlk; /* protects `filehead' and f_count */ LIST_HEAD(filelist, file); extern int maxfiles; /* kernel limit on number of open files */ extern int numfiles; /* actual number of open files */ Index: sys/filedesc.h =================================================================== RCS file: /cvs/src/sys/sys/filedesc.h,v retrieving revision 1.35 diff -u -p -r1.35 filedesc.h --- sys/filedesc.h 25 Apr 2018 10:29:17 -0000 1.35 +++ sys/filedesc.h 22 May 2018 13:21:54 -0000 @@ -125,12 +125,13 @@ void filedesc_init(void); int dupfdopen(struct proc *, int, int); int fdalloc(struct proc *p, int want, int *result); void fdexpand(struct proc *); -int falloc(struct proc *_p, int 
_flags, struct file **_rfp, int *_rfd); +int falloc(struct proc *_p, struct file **_rfp, int *_rfd); struct filedesc *fdinit(void); struct filedesc *fdshare(struct process *); struct filedesc *fdcopy(struct process *); void fdfree(struct proc *p); int fdrelease(struct proc *p, int); +void fdinsert(struct filedesc *, int, int, struct file *); void fdremove(struct filedesc *, int); void fdcloseexec(struct proc *); struct file *fd_iterfile(struct file *, struct proc *); Index: sys/syscall.h =================================================================== RCS file: /cvs/src/sys/sys/syscall.h,v retrieving revision 1.190 diff -u -p -r1.190 syscall.h --- sys/syscall.h 12 Dec 2017 01:13:14 -0000 1.190 +++ sys/syscall.h 22 May 2018 13:21:54 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscall.h,v 1.190 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call numbers. Index: sys/syscallargs.h =================================================================== RCS file: /cvs/src/sys/sys/syscallargs.h,v retrieving revision 1.193 diff -u -p -r1.193 syscallargs.h --- sys/syscallargs.h 12 Dec 2017 01:13:14 -0000 1.193 +++ sys/syscallargs.h 22 May 2018 13:21:54 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscallargs.h,v 1.193 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call argument lists.