On 22/05/18(Tue) 15:39, Martin Pieuchot wrote: > By assuming that `f_data' is immutable, which AFAIK is true for sockets, > we can remove the KERNEL_LOCK() from the following syscalls iff files are > refcounted in an MP-safe way. > > This diff includes the EBUSY check in dup2(2), which is currently required > to avoid races with accept(2) and will later make our life easier wrt > open(2). > > It also includes the fdinsert() diff I sent earlier. > > On top of that I'm introducing a global mutex, `fhdlk', that protects > `f_count' and the implicit reference in `filehead'. > > A socket stays alive as long as its associated file has a positive > refcount. When this refcount drops to zero, fdrop() will be called and soclose() > will free/clean `f_data'. That's the only place where `f_data' is > changed during the life of a socket. That's why it is safe to dereference > `f_data' when getsock() has returned a valid & refcounted `fp'. > > Many ktrace(2) internals now need to grab the KERNEL_LOCK(), just like > ptsignal(). > > Note that for unix, routing and pfkey sockets, solock() still grabs the > KERNEL_LOCK(). So even if syscalls are marked as SY_NOLOCK that doesn't > mean they won't grab it. In fact, some network functions like > ifa_ifwithaddr() below now need to grab the KERNEL_LOCK(). That's good: > it means we're pushing the lock down. > > Tests? Comments?
Updated diff that should prevent reported hangs, as analyzed by tb@ and visa@. Index: kern/exec_script.c =================================================================== RCS file: /cvs/src/sys/kern/exec_script.c,v retrieving revision 1.44 diff -u -p -r1.44 exec_script.c --- kern/exec_script.c 2 May 2018 02:24:56 -0000 1.44 +++ kern/exec_script.c 25 May 2018 08:24:33 -0000 @@ -170,17 +170,20 @@ check_shell: #endif fdplock(p->p_fd); - error = falloc(p, 0, &fp, &epp->ep_fd); - fdpunlock(p->p_fd); - if (error) + error = falloc(p, &fp, &epp->ep_fd); + if (error) { + fdpunlock(p->p_fd); goto fail; + } epp->ep_flags |= EXEC_HASFD; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t) scriptvp; fp->f_flag = FREAD; - FILE_SET_MATURE(fp, p); + fdinsert(p->p_fd, epp->ep_fd, 0, fp); + fdpunlock(p->p_fd); + FRELE(fp, p); } /* set up the parameters for the recursive check_exec() call */ Index: kern/init_sysent.c =================================================================== RCS file: /cvs/src/sys/kern/init_sysent.c,v retrieving revision 1.191 diff -u -p -r1.191 init_sysent.c --- kern/init_sysent.c 12 Dec 2017 01:13:14 -0000 1.191 +++ kern/init_sysent.c 25 May 2018 08:24:33 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: init_sysent.c,v 1.191 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call switch table. 
@@ -76,17 +76,17 @@ struct sysent sysent[] = { { 0, 0, 0, sys_nosys }, /* 26 = unimplemented ptrace */ #endif - { 3, s(struct sys_recvmsg_args), 0, + { 3, s(struct sys_recvmsg_args), SY_NOLOCK | 0, sys_recvmsg }, /* 27 = recvmsg */ - { 3, s(struct sys_sendmsg_args), 0, + { 3, s(struct sys_sendmsg_args), SY_NOLOCK | 0, sys_sendmsg }, /* 28 = sendmsg */ - { 6, s(struct sys_recvfrom_args), 0, + { 6, s(struct sys_recvfrom_args), SY_NOLOCK | 0, sys_recvfrom }, /* 29 = recvfrom */ - { 3, s(struct sys_accept_args), 0, + { 3, s(struct sys_accept_args), SY_NOLOCK | 0, sys_accept }, /* 30 = accept */ - { 3, s(struct sys_getpeername_args), 0, + { 3, s(struct sys_getpeername_args), SY_NOLOCK | 0, sys_getpeername }, /* 31 = getpeername */ - { 3, s(struct sys_getsockname_args), 0, + { 3, s(struct sys_getsockname_args), SY_NOLOCK | 0, sys_getsockname }, /* 32 = getsockname */ { 2, s(struct sys_access_args), 0, sys_access }, /* 33 = access */ @@ -218,7 +218,7 @@ struct sysent sysent[] = { sys_nanosleep }, /* 91 = nanosleep */ { 3, s(struct sys_fcntl_args), 0, sys_fcntl }, /* 92 = fcntl */ - { 4, s(struct sys_accept4_args), 0, + { 4, s(struct sys_accept4_args), SY_NOLOCK | 0, sys_accept4 }, /* 93 = accept4 */ { 5, s(struct sys___thrsleep_args), 0, sys___thrsleep }, /* 94 = __thrsleep */ @@ -226,9 +226,9 @@ struct sysent sysent[] = { sys_fsync }, /* 95 = fsync */ { 3, s(struct sys_setpriority_args), 0, sys_setpriority }, /* 96 = setpriority */ - { 3, s(struct sys_socket_args), 0, + { 3, s(struct sys_socket_args), SY_NOLOCK | 0, sys_socket }, /* 97 = socket */ - { 3, s(struct sys_connect_args), 0, + { 3, s(struct sys_connect_args), SY_NOLOCK | 0, sys_connect }, /* 98 = connect */ { 3, s(struct sys_getdents_args), 0, sys_getdents }, /* 99 = getdents */ @@ -240,11 +240,11 @@ struct sysent sysent[] = { sys_dup3 }, /* 102 = dup3 */ { 1, s(struct sys_sigreturn_args), 0, sys_sigreturn }, /* 103 = sigreturn */ - { 3, s(struct sys_bind_args), 0, + { 3, s(struct sys_bind_args), SY_NOLOCK | 0, 
sys_bind }, /* 104 = bind */ - { 5, s(struct sys_setsockopt_args), 0, + { 5, s(struct sys_setsockopt_args), SY_NOLOCK | 0, sys_setsockopt }, /* 105 = setsockopt */ - { 2, s(struct sys_listen_args), 0, + { 2, s(struct sys_listen_args), SY_NOLOCK | 0, sys_listen }, /* 106 = listen */ { 4, s(struct sys_chflagsat_args), 0, sys_chflagsat }, /* 107 = chflagsat */ @@ -268,7 +268,7 @@ struct sysent sysent[] = { sys_nosys }, /* 116 = obsolete t32_gettimeofday */ { 0, 0, 0, sys_nosys }, /* 117 = obsolete t32_getrusage */ - { 5, s(struct sys_getsockopt_args), 0, + { 5, s(struct sys_getsockopt_args), SY_NOLOCK | 0, sys_getsockopt }, /* 118 = getsockopt */ { 3, s(struct sys_thrkill_args), 0, sys_thrkill }, /* 119 = thrkill */ @@ -298,11 +298,11 @@ struct sysent sysent[] = { sys_flock }, /* 131 = flock */ { 2, s(struct sys_mkfifo_args), 0, sys_mkfifo }, /* 132 = mkfifo */ - { 6, s(struct sys_sendto_args), 0, + { 6, s(struct sys_sendto_args), SY_NOLOCK | 0, sys_sendto }, /* 133 = sendto */ - { 2, s(struct sys_shutdown_args), 0, + { 2, s(struct sys_shutdown_args), SY_NOLOCK | 0, sys_shutdown }, /* 134 = shutdown */ - { 4, s(struct sys_socketpair_args), 0, + { 4, s(struct sys_socketpair_args), SY_NOLOCK | 0, sys_socketpair }, /* 135 = socketpair */ { 2, s(struct sys_mkdir_args), 0, sys_mkdir }, /* 136 = mkdir */ Index: kern/kern_descrip.c =================================================================== RCS file: /cvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.158 diff -u -p -r1.158 kern_descrip.c --- kern/kern_descrip.c 8 May 2018 09:03:58 -0000 1.158 +++ kern/kern_descrip.c 25 May 2018 08:24:58 -0000 @@ -67,6 +67,7 @@ /* * Descriptor management. 
*/ +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_VM); struct filelist filehead; /* head of list of open files */ int numfiles; /* actual number of open files */ @@ -144,6 +145,23 @@ find_last_set(struct filedesc *fd, int l return i; } +static __inline int +fd_inuse(struct filedesc *fdp, int fd) +{ + u_int off = fd >> NDENTRYSHIFT; + + if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) + return 1; + + if (fdp->fd_lomap[off] != ~0) + return 0; + + if (fdp->fd_himap[off >> NDENTRYSHIFT] & (1 << (off & NDENTRYMASK))) + return 1; + + return 0; +} + static __inline void fd_used(struct filedesc *fdp, int fd) { @@ -184,16 +202,18 @@ fd_iterfile(struct file *fp, struct proc { struct file *nfp; + mtx_enter(&fhdlk); if (fp == NULL) nfp = LIST_FIRST(&filehead); else nfp = LIST_NEXT(fp, f_list); - /* don't FREF when f_count == 0 to avoid race in fdrop() */ - while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp))) + /* don't refcount when f_count == 0 to avoid race in fdrop() */ + while (nfp != NULL && nfp->f_count == 0) nfp = LIST_NEXT(nfp, f_list); if (nfp != NULL) - FREF(nfp); + nfp->f_count++; + mtx_leave(&fhdlk); if (fp != NULL) FRELE(fp, p); @@ -206,13 +226,17 @@ fd_getfile(struct filedesc *fdp, int fd) { struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) - return (NULL); + vfs_stall_barrier(); - if (!FILE_IS_USABLE(fp)) + if ((u_int)fd >= fdp->fd_nfiles) return (NULL); - FREF(fp); + mtx_enter(&fhdlk); + fp = fdp->fd_ofiles[fd]; + if (fp != NULL) + fp->f_count++; + mtx_leave(&fhdlk); + return (fp); } @@ -634,18 +658,22 @@ finishdup(struct proc *p, struct file *f return (EDEADLK); } - /* - * Don't fd_getfile here. We want to closef LARVAL files and - * closef can deal with that. 
- */ + mtx_enter(&fhdlk); oldfp = fdp->fd_ofiles[new]; if (oldfp != NULL) - FREF(oldfp); + oldfp->f_count++; + mtx_leave(&fhdlk); + + if (dup2 && oldfp == NULL) { + if (fd_inuse(fdp, new)) { + FRELE(fp, p); + return (EBUSY); + } + fd_used(fdp, new); + } fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE; - if (dup2 && oldfp == NULL) - fd_used(fdp, new); *retval = new; if (oldfp != NULL) { @@ -658,6 +686,25 @@ finishdup(struct proc *p, struct file *f } void +fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp) +{ + struct file *fq; + + fdpassertlocked(fdp); + + mtx_enter(&fhdlk); + if ((fq = fdp->fd_ofiles[0]) != NULL) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + fdp->fd_ofiles[fd] = fp; + fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); + fp->f_iflags |= FIF_INSERTED; + mtx_leave(&fhdlk); +} + +void fdremove(struct filedesc *fdp, int fd) { fdpassertlocked(fdp); @@ -670,21 +717,14 @@ int fdrelease(struct proc *p, int fd) { struct filedesc *fdp = p->p_fd; - struct file **fpp, *fp; + struct file *fp; fdpassertlocked(fdp); - /* - * Don't fd_getfile here. We want to closef LARVAL files and closef - * can deal with that. - */ - fpp = &fdp->fd_ofiles[fd]; - fp = *fpp; + fp = fd_getfile(fdp, fd); if (fp == NULL) return (EBADF); - FREF(fp); - *fpp = NULL; - fd_unused(fdp, fd); + fdremove(fdp, fd); if (fd < fdp->fd_knlistsize) knote_fdclose(p, fd); return (closef(fp, p)); @@ -927,9 +967,9 @@ fdexpand(struct proc *p) * a file descriptor for the process that refers to it. 
*/ int -falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd) +falloc(struct proc *p, struct file **resultfp, int *resultfd) { - struct file *fp, *fq; + struct file *fp; int error, i; KASSERT(resultfp != NULL); @@ -957,21 +997,17 @@ restart: */ numfiles++; fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO); - mtx_init(&fp->f_mtx, IPL_NONE); - fp->f_iflags = FIF_LARVAL; - if ((fq = p->p_fd->fd_ofiles[0]) != NULL) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - p->p_fd->fd_ofiles[i] = fp; - p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE); + mtx_init(&fp->f_mtx, IPL_VM); fp->f_count = 1; fp->f_cred = p->p_ucred; crhold(fp->f_cred); *resultfp = fp; *resultfd = i; - FREF(fp); + + mtx_enter(&fhdlk); + fp->f_count++; + mtx_leave(&fhdlk); + return (0); } @@ -1063,6 +1099,7 @@ fdcopy(struct process *pr) newfdp->fd_flags = fdp->fd_flags; newfdp->fd_cmask = fdp->fd_cmask; + mtx_enter(&fhdlk); for (i = 0; i <= fdp->fd_lastfile; i++) { struct file *fp = fdp->fd_ofiles[i]; @@ -1079,12 +1116,13 @@ fdcopy(struct process *pr) fp->f_type == DTYPE_KQUEUE) continue; - FREF(fp); + fp->f_count++; newfdp->fd_ofiles[i] = fp; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fd_used(newfdp, i); } } + mtx_leave(&fhdlk); fdpunlock(fdp); return (newfdp); @@ -1106,8 +1144,9 @@ fdfree(struct proc *p) for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) { fp = *fpp; if (fp != NULL) { - FREF(fp); *fpp = NULL; + /* closef() expects a refcount of 2 */ + FREF(fp); (void) closef(fp, p); } } @@ -1145,11 +1184,11 @@ closef(struct file *fp, struct proc *p) if (fp == NULL) return (0); -#ifdef DIAGNOSTIC - if (fp->f_count < 2) - panic("closef: count (%ld) < 2", fp->f_count); -#endif + KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count); + + mtx_enter(&fhdlk); fp->f_count--; + mtx_leave(&fhdlk); /* * POSIX record locking dictates that any close releases ALL @@ -1181,18 +1220,19 @@ fdrop(struct file *fp, struct proc *p) { int error; 
-#ifdef DIAGNOSTIC - if (fp->f_count != 0) - panic("fdrop: count (%ld) != 0", fp->f_count); -#endif + MUTEX_ASSERT_LOCKED(&fhdlk); + + KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count); + + if (fp->f_iflags & FIF_INSERTED) + LIST_REMOVE(fp, f_list); + mtx_leave(&fhdlk); if (fp->f_ops) error = (*fp->f_ops->fo_close)(fp, p); else error = 0; - /* Free fp */ - LIST_REMOVE(fp, f_list); crfree(fp->f_cred); numfiles--; pool_put(&file_pool, fp); @@ -1307,7 +1347,7 @@ dupfdopen(struct proc *p, int indx, int * of file descriptors, or the fd to be dup'd has already been * closed, reject. Note, there is no need to check for new == old * because fd_getfile will return NULL if the file at indx is - * newly created by falloc (FIF_LARVAL). + * newly created by falloc. */ if ((wfp = fd_getfile(fdp, dupfd)) == NULL) return (EBADF); Index: kern/kern_event.c =================================================================== RCS file: /cvs/src/sys/kern/kern_event.c,v retrieving revision 1.89 diff -u -p -r1.89 kern_event.c --- kern/kern_event.c 22 May 2018 19:15:22 -0000 1.89 +++ kern/kern_event.c 25 May 2018 08:24:33 -0000 @@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, regi int fd, error; fdplock(fdp); - error = falloc(p, 0, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) - return (error); + goto out; fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; @@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, regi if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ kq->kq_fdp = fdp; - FILE_SET_MATURE(fp, p); - return (0); + fdinsert(fdp, fd, 0, fp); + FRELE(fp, p); +out: + fdpunlock(fdp); + return (error); } int Index: kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v retrieving revision 1.195 diff -u -p -r1.195 kern_exec.c --- kern/kern_exec.c 28 Apr 2018 03:13:04 -0000 1.195 +++ kern/kern_exec.c 25 May 2018 
08:24:33 -0000 @@ -584,7 +584,7 @@ sys_execve(struct proc *p, void *v, regi struct vnode *vp; int indx; - if ((error = falloc(p, 0, &fp, &indx)) != 0) + if ((error = falloc(p, &fp, &indx)) != 0) break; #ifdef DIAGNOSTIC if (indx != i) @@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, regi fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; - FILE_SET_MATURE(fp, p); - } else { - FRELE(fp, p); + fdinsert(p->p_fd, indx, 0, fp); } + FRELE(fp, p); } fdpunlock(p->p_fd); if (error) Index: kern/kern_ktrace.c =================================================================== RCS file: /cvs/src/sys/kern/kern_ktrace.c,v retrieving revision 1.96 diff -u -p -r1.96 kern_ktrace.c --- kern/kern_ktrace.c 28 Apr 2018 03:13:04 -0000 1.96 +++ kern/kern_ktrace.c 25 May 2018 08:24:33 -0000 @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui struct ktr_header kth; struct ktr_genio ktp; caddr_t cp; - int count; + int count, error; int buflen; atomic_setbits_int(&p->p_flag, P_INKTR); @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui if (copyin(iov->iov_base, cp, count)) break; - if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0) + KERNEL_LOCK(); + error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count); + KERNEL_UNLOCK(); + if (error != 0) break; iov->iov_len -= count; @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na { struct ktr_header kth; - KERNEL_ASSERT_LOCKED(); atomic_setbits_int(&p->p_flag, P_INKTR); ktrinitheader(&kth, p, KTR_STRUCT); - + if (data == NULL) datalen = 0; + KERNEL_LOCK(); ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin kp.code = code; kp.syscall = syscall; + KERNEL_LOCK(); ktrwrite(p, &kth, &kp, sizeof(kp)); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn struct iovec aiov[3]; struct 
process *pr; int error; + + KERNEL_ASSERT_LOCKED(); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; Index: kern/kern_pledge.c =================================================================== RCS file: /cvs/src/sys/kern/kern_pledge.c,v retrieving revision 1.230 diff -u -p -r1.230 kern_pledge.c --- kern/kern_pledge.c 28 Apr 2018 12:49:21 -0000 1.230 +++ kern/kern_pledge.c 25 May 2018 08:24:33 -0000 @@ -523,6 +523,7 @@ pledge_fail(struct proc *p, int error, u if (p->p_p->ps_pledge & PLEDGE_ERROR) return (ENOSYS); + KERNEL_LOCK(); log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n", p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall); p->p_p->ps_acflag |= APLEDGE; @@ -535,6 +536,7 @@ pledge_fail(struct proc *p, int error, u psignal(p, SIGABRT); p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */ + KERNEL_UNLOCK(); return (error); } Index: kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.337 diff -u -p -r1.337 kern_sysctl.c --- kern/kern_sysctl.c 16 May 2018 14:53:43 -0000 1.337 +++ kern/kern_sysctl.c 25 May 2018 08:24:33 -0000 @@ -1059,7 +1059,9 @@ fill_file(struct kinfo_file *kf, struct kf->f_flag = fp->f_flag; kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; + mtx_enter(&fhdlk); kf->f_count = fp->f_count; + mtx_leave(&fhdlk); if (show_pointers) kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = fp->f_cred->cr_uid; Index: kern/sys_pipe.c =================================================================== RCS file: /cvs/src/sys/kern/sys_pipe.c,v retrieving revision 1.78 diff -u -p -r1.78 sys_pipe.c --- kern/sys_pipe.c 10 Apr 2018 09:17:45 -0000 1.78 +++ kern/sys_pipe.c 25 May 2018 08:24:33 -0000 @@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int fl fdplock(fdp); - error = falloc(p, cloexec, &rf, &fds[0]); + error = falloc(p, &rf, &fds[0]); if (error != 0) goto free2; rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -162,7 +162,7 @@ 
dopipe(struct proc *p, int *ufds, int fl rf->f_data = rpipe; rf->f_ops = &pipeops; - error = falloc(p, cloexec, &wf, &fds[1]); + error = falloc(p, &wf, &fds[1]); if (error != 0) goto free3; wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int fl rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - FILE_SET_MATURE(rf, p); - FILE_SET_MATURE(wf, p); + fdinsert(fdp, fds[0], cloexec, rf); + fdinsert(fdp, fds[1], cloexec, wf); error = copyout(fds, ufds, sizeof(fds)); if (error != 0) { @@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int fl ktrfds(p, fds, 2); #endif fdpunlock(fdp); + + FRELE(rf, p); + FRELE(wf, p); return (error); free3: Index: kern/syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/syscalls.c,v retrieving revision 1.190 diff -u -p -r1.190 syscalls.c --- kern/syscalls.c 12 Dec 2017 01:13:14 -0000 1.190 +++ kern/syscalls.c 25 May 2018 08:24:33 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscalls.c,v 1.190 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call names. 
Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.180 diff -u -p -r1.180 syscalls.master --- kern/syscalls.master 12 Dec 2017 01:12:34 -0000 1.180 +++ kern/syscalls.master 25 May 2018 08:24:33 -0000 @@ -88,18 +88,18 @@ #else 26 UNIMPL ptrace #endif -27 STD { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ +27 STD NOLOCK { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ int flags); } -28 STD { ssize_t sys_sendmsg(int s, \ +28 STD NOLOCK { ssize_t sys_sendmsg(int s, \ const struct msghdr *msg, int flags); } -29 STD { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ +29 STD NOLOCK { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ int flags, struct sockaddr *from, \ socklen_t *fromlenaddr); } -30 STD { int sys_accept(int s, struct sockaddr *name, \ +30 STD NOLOCK { int sys_accept(int s, struct sockaddr *name, \ socklen_t *anamelen); } -31 STD { int sys_getpeername(int fdes, struct sockaddr *asa, \ +31 STD NOLOCK { int sys_getpeername(int fdes, struct sockaddr *asa, \ socklen_t *alen); } -32 STD { int sys_getsockname(int fdes, struct sockaddr *asa, \ +32 STD NOLOCK { int sys_getsockname(int fdes, struct sockaddr *asa, \ socklen_t *alen); } 33 STD { int sys_access(const char *path, int amode); } 34 STD { int sys_chflags(const char *path, u_int flags); } @@ -205,26 +205,26 @@ 91 STD { int sys_nanosleep(const struct timespec *rqtp, \ struct timespec *rmtp); } 92 STD { int sys_fcntl(int fd, int cmd, ... 
void *arg); } -93 STD { int sys_accept4(int s, struct sockaddr *name, \ +93 STD NOLOCK { int sys_accept4(int s, struct sockaddr *name, \ socklen_t *anamelen, int flags); } 94 STD { int sys___thrsleep(const volatile void *ident, \ clockid_t clock_id, const struct timespec *tp, \ void *lock, const int *abort); } 95 STD { int sys_fsync(int fd); } 96 STD { int sys_setpriority(int which, id_t who, int prio); } -97 STD { int sys_socket(int domain, int type, int protocol); } -98 STD { int sys_connect(int s, const struct sockaddr *name, \ +97 STD NOLOCK { int sys_socket(int domain, int type, int protocol); } +98 STD NOLOCK { int sys_connect(int s, const struct sockaddr *name, \ socklen_t namelen); } 99 STD { int sys_getdents(int fd, void *buf, size_t buflen); } 100 STD { int sys_getpriority(int which, id_t who); } 101 STD { int sys_pipe2(int *fdp, int flags); } 102 STD { int sys_dup3(int from, int to, int flags); } 103 STD { int sys_sigreturn(struct sigcontext *sigcntxp); } -104 STD { int sys_bind(int s, const struct sockaddr *name, \ +104 STD NOLOCK { int sys_bind(int s, const struct sockaddr *name, \ socklen_t namelen); } -105 STD { int sys_setsockopt(int s, int level, int name, \ +105 STD NOLOCK { int sys_setsockopt(int s, int level, int name, \ const void *val, socklen_t valsize); } -106 STD { int sys_listen(int s, int backlog); } +106 STD NOLOCK { int sys_listen(int s, int backlog); } 107 STD { int sys_chflagsat(int fd, const char *path, \ u_int flags, int atflags); } 108 STD { int sys_pledge(const char *promises, \ @@ -243,7 +243,7 @@ 115 OBSOL vtrace 116 OBSOL t32_gettimeofday 117 OBSOL t32_getrusage -118 STD { int sys_getsockopt(int s, int level, int name, \ +118 STD NOLOCK { int sys_getsockopt(int s, int level, int name, \ void *val, socklen_t *avalsize); } 119 STD { int sys_thrkill(pid_t tid, int signum, void *tcb); } 120 STD { ssize_t sys_readv(int fd, \ @@ -261,11 +261,11 @@ 130 OBSOL oftruncate 131 STD { int sys_flock(int fd, int how); } 132 STD { int 
sys_mkfifo(const char *path, mode_t mode); } -133 STD { ssize_t sys_sendto(int s, const void *buf, \ +133 STD NOLOCK { ssize_t sys_sendto(int s, const void *buf, \ size_t len, int flags, const struct sockaddr *to, \ socklen_t tolen); } -134 STD { int sys_shutdown(int s, int how); } -135 STD { int sys_socketpair(int domain, int type, \ +134 STD NOLOCK { int sys_shutdown(int s, int how); } +135 STD NOLOCK { int sys_socketpair(int domain, int type, \ int protocol, int *rsv); } 136 STD { int sys_mkdir(const char *path, mode_t mode); } 137 STD { int sys_rmdir(const char *path); } Index: kern/tty_pty.c =================================================================== RCS file: /cvs/src/sys/kern/tty_pty.c,v retrieving revision 1.84 diff -u -p -r1.84 tty_pty.c --- kern/tty_pty.c 28 Apr 2018 03:13:04 -0000 1.84 +++ kern/tty_pty.c 25 May 2018 08:24:33 -0000 @@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t case PTMGET: fdplock(fdp); /* Grab two filedescriptors. */ - if ((error = falloc(p, 0, &cfp, &cindx)) != 0) { + if ((error = falloc(p, &cfp, &cindx)) != 0) { fdpunlock(fdp); break; } - if ((error = falloc(p, 0, &sfp, &sindx)) != 0) { + if ((error = falloc(p, &sfp, &sindx)) != 0) { fdremove(fdp, cindx); closef(cfp, p); fdpunlock(fdp); @@ -1166,11 +1166,12 @@ retry: memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn)); memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn)); - /* mark the files mature now that we've passed all errors */ - FILE_SET_MATURE(cfp, p); - FILE_SET_MATURE(sfp, p); - + /* insert files now that we've passed all errors */ + fdinsert(fdp, cindx, 0, cfp); + fdinsert(fdp, sindx, 0, sfp); fdpunlock(fdp); + FRELE(cfp, p); + FRELE(sfp, p); break; default: error = EINVAL; Index: kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.171 diff -u -p -r1.171 uipc_syscalls.c --- kern/uipc_syscalls.c 22 May 2018 09:51:01 -0000 1.171 +++ kern/uipc_syscalls.c 
25 May 2018 08:24:33 -0000 @@ -101,13 +101,14 @@ sys_socket(struct proc *p, void *v, regi fflag = FREAD | FWRITE | (nonblock ? FNONBLOCK : 0); error = socreate(SCARG(uap, domain), &so, type, SCARG(uap, protocol)); - if (error != 0) - goto out; + if (error) + return (error); + KERNEL_LOCK(); fdplock(fdp); - error = falloc(p, cloexec, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) { + fdpunlock(fdp); soclose(so); } else { fp->f_flag = fflag; @@ -117,10 +118,12 @@ sys_socket(struct proc *p, void *v, regi so->so_state |= SS_NBIO; so->so_state |= ss; fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, fd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = fd; } -out: + KERNEL_UNLOCK(); return (error); } @@ -272,7 +275,9 @@ doaccept(struct proc *p, int sock, struc socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int nflag; + int cloexec, nflag; + + cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; if (name && (error = copyin(anamelen, &namelen, sizeof (namelen)))) return (error); @@ -282,7 +287,7 @@ doaccept(struct proc *p, int sock, struc headfp = fp; fdplock(fdp); - error = falloc(p, (flags & SOCK_CLOEXEC) ? 
UF_EXCLOSE : 0, &fp, &tmpfd); + error = falloc(p, &fp, &tmpfd); fdpunlock(fdp); if (error) { FRELE(headfp, p); @@ -347,8 +352,11 @@ out: else so->so_state &= ~SS_NBIO; sounlock(s); + fdplock(fdp); fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = tmpfd; } else { sounlock(s); @@ -475,14 +483,15 @@ sys_socketpair(struct proc *p, void *v, if (error != 0) goto free2; } + KERNEL_LOCK(); fdplock(fdp); - if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0) + if ((error = falloc(p, &fp1, &sv[0])) != 0) goto free3; fp1->f_flag = fflag; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = so1; - if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0) + if ((error = falloc(p, &fp2, &sv[1])) != 0) goto free4; fp2->f_flag = fflag; fp2->f_type = DTYPE_SOCKET; @@ -500,9 +509,12 @@ sys_socketpair(struct proc *p, void *v, (*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type, p); } - FILE_SET_MATURE(fp1, p); - FILE_SET_MATURE(fp2, p); + fdinsert(fdp, sv[0], cloexec, fp1); + fdinsert(fdp, sv[1], cloexec, fp2); fdpunlock(fdp); + FRELE(fp1, p); + FRELE(fp2, p); + KERNEL_UNLOCK(); return (0); } fdremove(fdp, sv[1]); @@ -514,6 +526,7 @@ free4: so1 = NULL; free3: fdpunlock(fdp); + KERNEL_UNLOCK(); free2: if (so2 != NULL) (void)soclose(so2); @@ -678,13 +691,16 @@ sendit(struct proc *p, int s, struct msg } #endif len = auio.uio_resid; - error = sosend(fp->f_data, to, &auio, NULL, control, flags); + error = sosend(so, to, &auio, NULL, control, flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; - if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) + if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) { + KERNEL_LOCK(); ptsignal(p, SIGPIPE, STHREAD); + KERNEL_UNLOCK(); + } } if (error == 0) { *retsize = len - auio.uio_resid; @@ -925,11 +941,13 @@ sys_shutdown(struct proc *p, void *v, re syscallarg(int) how; } */ *uap = v; struct file *fp; + 
struct socket *so; int error; if ((error = getsock(p, SCARG(uap, s), &fp)) != 0) return (error); - error = soshutdown(fp->f_data, SCARG(uap, how)); + so = fp->f_data; + error = soshutdown(so, SCARG(uap, how)); FRELE(fp, p); return (error); } @@ -1163,7 +1181,8 @@ getsock(struct proc *p, int fdes, struct { struct file *fp; - if ((fp = fd_getfile(p->p_fd, fdes)) == NULL) + fp = fd_getfile(p->p_fd, fdes); + if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_SOCKET) { FRELE(fp, p); Index: kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.126 diff -u -p -r1.126 uipc_usrreq.c --- kern/uipc_usrreq.c 28 Apr 2018 03:13:04 -0000 1.126 +++ kern/uipc_usrreq.c 25 May 2018 08:24:33 -0000 @@ -899,6 +899,7 @@ unp_gc(void *arg __unused) fp = defer->ud_fp[i].fp; if (fp == NULL) continue; + /* closef() expects a refcount of 2 */ FREF(fp); if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; @@ -915,6 +916,8 @@ unp_gc(void *arg __unused) do { nunref = 0; LIST_FOREACH(unp, &unp_head, unp_link) { + mtx_enter(&fhdlk); + fp = unp->unp_file; if (unp->unp_flags & UNP_GCDEFER) { /* * This socket is referenced by another @@ -925,8 +928,9 @@ unp_gc(void *arg __unused) unp_defer--; } else if (unp->unp_flags & UNP_GCMARK) { /* marked as live in previous pass */ + mtx_leave(&fhdlk); continue; - } else if ((fp = unp->unp_file) == NULL) { + } else if (fp == NULL) { /* not being passed, so can't be in loop */ } else if (fp->f_count == 0) { /* @@ -943,9 +947,11 @@ unp_gc(void *arg __unused) if (fp->f_count == unp->unp_msgcount) { nunref++; unp->unp_flags |= UNP_GCDEAD; + mtx_leave(&fhdlk); continue; } } + mtx_leave(&fhdlk); /* * This is the first time we've seen this socket on Index: kern/vfs_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.283 diff -u -p -r1.283 vfs_syscalls.c --- 
kern/vfs_syscalls.c 8 May 2018 08:53:41 -0000 1.283 +++ kern/vfs_syscalls.c 25 May 2018 08:24:33 -0000 @@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const c struct file *fp; struct vnode *vp; struct vattr vattr; - int flags, cmode; + int flags, cloexec, cmode; int type, indx, error, localtrunc = 0; struct flock lf; struct nameidata nd; @@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const c return (error); } - fdplock(fdp); + cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0; - if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) + fdplock(fdp); + if ((error = falloc(p, &fp, &indx)) != 0) goto out; flags = FFLAGS(oflags); if (flags & FREAD) @@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const c } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, indx, cloexec, fp); + FRELE(fp, p); out: fdpunlock(fdp); return (error); @@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, regi struct vnode *vp = NULL; struct mount *mp; struct ucred *cred = p->p_ucred; - int flags; + int flags, cloexec; int type, indx, error=0; struct flock lf; struct vattr va; @@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, regi if ((flags & O_CREAT)) return (EINVAL); + cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0; + fdplock(fdp); - if ((error = falloc(p, (flags & O_CLOEXEC) ? 
UF_EXCLOSE : 0, &fp, - &indx)) != 0) { + if ((error = falloc(p, &fp, &indx)) != 0) { fp = NULL; goto bad; } @@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, regi } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); - + fdinsert(fdp, indx, cloexec, fp); fdpunlock(fdp); + FRELE(fp, p); return (0); bad: Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.552 diff -u -p -r1.552 if.c --- net/if.c 17 May 2018 11:04:14 -0000 1.552 +++ net/if.c 25 May 2018 08:24:33 -0000 @@ -1378,7 +1378,7 @@ ifa_ifwithaddr(struct sockaddr *addr, u_ struct ifaddr *ifa; u_int rdomain; - KERNEL_ASSERT_LOCKED(); + KERNEL_LOCK(); rdomain = rtable_l2(rtableid); TAILQ_FOREACH(ifp, &ifnet, if_list) { if (ifp->if_rdomain != rdomain) @@ -1388,10 +1388,13 @@ ifa_ifwithaddr(struct sockaddr *addr, u_ if (ifa->ifa_addr->sa_family != addr->sa_family) continue; - if (equal(addr, ifa->ifa_addr)) + if (equal(addr, ifa->ifa_addr)) { + KERNEL_UNLOCK(); return (ifa); + } } } + KERNEL_UNLOCK(); return (NULL); } @@ -1404,8 +1407,8 @@ ifa_ifwithdstaddr(struct sockaddr *addr, struct ifnet *ifp; struct ifaddr *ifa; - KERNEL_ASSERT_LOCKED(); rdomain = rtable_l2(rdomain); + KERNEL_LOCK(); TAILQ_FOREACH(ifp, &ifnet, if_list) { if (ifp->if_rdomain != rdomain) continue; @@ -1414,11 +1417,14 @@ ifa_ifwithdstaddr(struct sockaddr *addr, if (ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_dstaddr == NULL) continue; - if (equal(addr, ifa->ifa_dstaddr)) + if (equal(addr, ifa->ifa_dstaddr)) { + KERNEL_UNLOCK(); return (ifa); + } } } } + KERNEL_UNLOCK(); return (NULL); } Index: sys/file.h =================================================================== RCS file: /cvs/src/sys/sys/file.h,v retrieving revision 1.45 diff -u -p -r1.45 file.h --- sys/file.h 9 May 2018 08:42:02 -0000 1.45 +++ sys/file.h 25 May 2018 08:24:33 -0000 @@ -65,6 +65,7 @@ struct fileops { * * Locks used to protect struct members in this file: * 
I immutable after creation + * F global `fhdlk' mutex * f per file `f_mtx' * k kernel lock */ @@ -77,7 +78,7 @@ struct file { #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_KQUEUE 4 /* event queue */ short f_type; /* [I] descriptor type */ - long f_count; /* [k] reference count */ + long f_count; /* [F] reference count */ struct ucred *f_cred; /* [I] credentials associated with descriptor */ struct fileops *f_ops; /* [I] file operation pointers */ off_t f_offset; /* [k] */ @@ -91,26 +92,31 @@ struct file { }; #define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */ -#define FIF_LARVAL 0x02 /* not fully constructed, don't use */ - -#define FILE_IS_USABLE(fp) \ - (((fp)->f_iflags & FIF_LARVAL) == 0) +#define FIF_INSERTED 0x80 /* present in `filehead' */ #define FREF(fp) \ do { \ extern void vfs_stall_barrier(void); \ vfs_stall_barrier(); \ + mtx_enter(&fhdlk); \ (fp)->f_count++; \ + mtx_leave(&fhdlk); \ } while (0) -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) -#define FILE_SET_MATURE(fp,p) do { \ - (fp)->f_iflags &= ~FIF_LARVAL; \ - FRELE(fp, p); \ -} while (0) +#define FRELE(fp,p) \ +({ \ + int rv = 0; \ + mtx_enter(&fhdlk); \ + if (--(fp)->f_count == 0) \ + rv = fdrop(fp, p); \ + else \ + mtx_leave(&fhdlk); \ + rv; \ +}) int fdrop(struct file *, struct proc *); +extern struct mutex fhdlk; /* protects `filehead' and f_count */ LIST_HEAD(filelist, file); extern int maxfiles; /* kernel limit on number of open files */ extern int numfiles; /* actual number of open files */ Index: sys/filedesc.h =================================================================== RCS file: /cvs/src/sys/sys/filedesc.h,v retrieving revision 1.35 diff -u -p -r1.35 filedesc.h --- sys/filedesc.h 25 Apr 2018 10:29:17 -0000 1.35 +++ sys/filedesc.h 25 May 2018 08:24:33 -0000 @@ -125,12 +125,13 @@ void filedesc_init(void); int dupfdopen(struct proc *, int, int); int fdalloc(struct proc *p, int want, int *result); void fdexpand(struct proc *); -int falloc(struct proc *_p, int 
_flags, struct file **_rfp, int *_rfd); +int falloc(struct proc *_p, struct file **_rfp, int *_rfd); struct filedesc *fdinit(void); struct filedesc *fdshare(struct process *); struct filedesc *fdcopy(struct process *); void fdfree(struct proc *p); int fdrelease(struct proc *p, int); +void fdinsert(struct filedesc *, int, int, struct file *); void fdremove(struct filedesc *, int); void fdcloseexec(struct proc *); struct file *fd_iterfile(struct file *, struct proc *); Index: sys/syscall.h =================================================================== RCS file: /cvs/src/sys/sys/syscall.h,v retrieving revision 1.190 diff -u -p -r1.190 syscall.h --- sys/syscall.h 12 Dec 2017 01:13:14 -0000 1.190 +++ sys/syscall.h 25 May 2018 08:24:33 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscall.h,v 1.190 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call numbers. Index: sys/syscallargs.h =================================================================== RCS file: /cvs/src/sys/sys/syscallargs.h,v retrieving revision 1.193 diff -u -p -r1.193 syscallargs.h --- sys/syscallargs.h 12 Dec 2017 01:13:14 -0000 1.193 +++ sys/syscallargs.h 25 May 2018 08:24:33 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: syscallargs.h,v 1.193 2017/12/12 01:13:14 deraadt Exp $ */ +/* $OpenBSD$ */ /* * System call argument lists.