This version should fixes the previous issue where LARVAL files were incorrectly accounted in find_last_set() resulting in a panic() in fd_unused().
If you haven't read the previous explanations, the idea of this diff is to publish file descriptors only when they are completely setup. This will allow us to serialize access to file descriptors in a mp-safe way, which is the last requirement for unlock most of the socket-related syscalls. The important point of this diff is that we don't store on the descriptor itself the fact that it isn't ready yet (LARVAL). Otherwise we have a chicken & egg problem for reference counting. As pointed by Mathieu Masson other BSDs do that by using a new structure for open files. We might want to do that later when we'll turn these functions mp-safe :) For now, using the existing bitmask is good enough. Here are previous explanations: https://marc.info/?l=openbsd-tech&m=152579630904328&w=2 https://marc.info/?l=openbsd-tech&m=152750590114899&w=2 Please test & report as usual. Ok? Index: sys/kern/exec_script.c =================================================================== RCS file: /cvs/src/sys/kern/exec_script.c,v retrieving revision 1.46 diff -u -p -r1.46 exec_script.c --- sys/kern/exec_script.c 5 Jun 2018 09:29:05 -0000 1.46 +++ sys/kern/exec_script.c 14 Jun 2018 11:54:38 -0000 @@ -170,17 +170,20 @@ check_shell: #endif fdplock(p->p_fd); - error = falloc(p, 0, &fp, &epp->ep_fd); - fdpunlock(p->p_fd); - if (error) + error = falloc(p, &fp, &epp->ep_fd); + if (error) { + fdpunlock(p->p_fd); goto fail; + } epp->ep_flags |= EXEC_HASFD; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t) scriptvp; fp->f_flag = FREAD; - FILE_SET_MATURE(fp, p); + fdinsert(p->p_fd, epp->ep_fd, 0, fp); + fdpunlock(p->p_fd); + FRELE(fp, p); } /* set up the parameters for the recursive check_exec() call */ Index: sys/kern/kern_descrip.c =================================================================== RCS file: /cvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.164 diff -u -p -r1.164 kern_descrip.c --- sys/kern/kern_descrip.c 5 Jun 2018 09:29:05 -0000 1.164 +++ sys/kern/kern_descrip.c 14 Jun 2018 11:54:38 -0000 @@ -73,6 +73,7 @@ int numfiles; /* actual number of open static __inline void fd_used(struct filedesc *, int); static __inline void fd_unused(struct filedesc *, int); static __inline int find_next_zero(u_int *, int, u_int); +static __inline int fd_inuse(struct filedesc *, int); int finishdup(struct proc *, struct file *, int, int, register_t *, int); int find_last_set(struct filedesc *, int); int dodup3(struct proc *, int, int, int, register_t *); @@ -125,7 +126,6 @@ int find_last_set(struct filedesc *fd, int last) { int off, i; - struct file **ofiles = fd->fd_ofiles; u_int *bitmap = fd->fd_lomap; off = (last - 1) >> NDENTRYSHIFT; @@ -139,11 +139,22 @@ find_last_set(struct filedesc *fd, int l if (i >= last) i = last - 1; - while (i > 0 && ofiles[i] == NULL) + while (i > 0 && !fd_inuse(fd, i)) i--; return i; } +static __inline int +fd_inuse(struct filedesc *fdp, int fd) +{ + u_int off = fd >> NDENTRYSHIFT; + + if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) + return 1; + + return 0; +} + static __inline void fd_used(struct filedesc *fdp, int fd) { @@ -190,7 +201,7 @@ fd_iterfile(struct file *fp, struct proc nfp = LIST_NEXT(fp, f_list); /* don't FREF when f_count == 0 to avoid race in fdrop() */ - while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp))) + while (nfp != NULL && nfp->f_count == 0) nfp = LIST_NEXT(nfp, f_list); if (nfp != NULL) FREF(nfp); @@ -209,9 +220,6 @@ fd_getfile(struct filedesc *fdp, int fd) if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (NULL); - if (!FILE_IS_USABLE(fp)) - return (NULL); - FREF(fp); return (fp); } @@ -629,24 +637,24 @@ finishdup(struct proc *p, struct file *f struct filedesc *fdp = p->p_fd; fdpassertlocked(fdp); + KASSERT(fp->f_iflags & FIF_INSERTED); + if (fp->f_count == LONG_MAX-2) { FRELE(fp, p); return (EDEADLK); } - oldfp = fdp->fd_ofiles[new]; - if (oldfp != NULL) { - if (!FILE_IS_USABLE(oldfp)) { - FRELE(fp, p); - return (EBUSY); - } - FREF(oldfp); - } + oldfp = fd_getfile(fdp, new); + if (dup2 && oldfp == NULL) { + if (fd_inuse(fdp, new)) { + FRELE(fp, p); + return (EBUSY); + } + fd_used(fdp, new); + } fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE; - if (dup2 && oldfp == NULL) - fd_used(fdp, new); *retval = new; if (oldfp != NULL) { @@ -659,6 +667,23 @@ finishdup(struct proc *p, struct file *f } void +fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp) +{ + struct file *fq; + + fdpassertlocked(fdp); + if ((fq = fdp->fd_ofiles[0]) != NULL) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + KASSERT(fdp->fd_ofiles[fd] == NULL); + fdp->fd_ofiles[fd] = fp; + fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); + fp->f_iflags |= FIF_INSERTED; +} + +void fdremove(struct filedesc *fdp, int fd) { fdpassertlocked(fdp); @@ -671,21 +696,14 @@ int fdrelease(struct proc *p, int fd) { struct filedesc *fdp = p->p_fd; - struct file **fpp, *fp; + struct file *fp; fdpassertlocked(fdp); - /* - * Don't fd_getfile here. We want to closef LARVAL files and closef - * can deal with that. - */ - fpp = &fdp->fd_ofiles[fd]; - fp = *fpp; + fp = fd_getfile(fdp, fd); if (fp == NULL) return (EBADF); - FREF(fp); - *fpp = NULL; - fd_unused(fdp, fd); + fdremove(fdp, fd); if (fd < fdp->fd_knlistsize) knote_fdclose(p, fd); return (closef(fp, p)); @@ -702,12 +720,6 @@ sys_close(struct proc *p, void *v, regis } */ *uap = v; int fd = SCARG(uap, fd), error; struct filedesc *fdp = p->p_fd; - struct file *fp; - - fp = fd_getfile(fdp, fd); - if (fp == NULL) - return (EBADF); - FRELE(fp, p); fdplock(fdp); error = fdrelease(p, fd); @@ -928,9 +940,9 @@ fdexpand(struct proc *p) * a file descriptor for the process that refers to it. */ int -falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd) +falloc(struct proc *p, struct file **resultfp, int *resultfd) { - struct file *fp, *fq; + struct file *fp; int error, i; KASSERT(resultfp != NULL); @@ -963,14 +975,6 @@ restart: * with and without the KERNEL_LOCK(). */ mtx_init(&fp->f_mtx, IPL_MPFLOOR); - fp->f_iflags = FIF_LARVAL; - if ((fq = p->p_fd->fd_ofiles[0]) != NULL) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - p->p_fd->fd_ofiles[i] = fp; - p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE); fp->f_count = 1; fp->f_cred = p->p_ucred; crhold(fp->f_cred); @@ -1196,8 +1200,8 @@ fdrop(struct file *fp, struct proc *p) else error = 0; - /* Free fp */ - LIST_REMOVE(fp, f_list); + if (fp->f_iflags & FIF_INSERTED) + LIST_REMOVE(fp, f_list); crfree(fp->f_cred); numfiles--; pool_put(&file_pool, fp); @@ -1312,7 +1316,7 @@ dupfdopen(struct proc *p, int indx, int * of file descriptors, or the fd to be dup'd has already been * closed, reject. Note, there is no need to check for new == old * because fd_getfile will return NULL if the file at indx is - * newly created by falloc (FIF_LARVAL). + * newly created by falloc. */ if ((wfp = fd_getfile(fdp, dupfd)) == NULL) return (EBADF); Index: sys/kern/kern_event.c =================================================================== RCS file: /cvs/src/sys/kern/kern_event.c,v retrieving revision 1.91 diff -u -p -r1.91 kern_event.c --- sys/kern/kern_event.c 5 Jun 2018 09:29:05 -0000 1.91 +++ sys/kern/kern_event.c 14 Jun 2018 11:54:38 -0000 @@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, regi int fd, error; fdplock(fdp); - error = falloc(p, 0, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) - return (error); + goto out; fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; @@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, regi if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ kq->kq_fdp = fdp; - FILE_SET_MATURE(fp, p); - return (0); + fdinsert(fdp, fd, 0, fp); + FRELE(fp, p); +out: + fdpunlock(fdp); + return (error); } int Index: sys/kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v retrieving revision 1.197 diff -u -p -r1.197 kern_exec.c --- sys/kern/kern_exec.c 5 Jun 2018 09:29:05 -0000 1.197 +++ sys/kern/kern_exec.c 14 Jun 2018 11:54:38 -0000 @@ -584,7 +584,7 @@ sys_execve(struct proc *p, void *v, regi struct vnode *vp; int indx; - if ((error = falloc(p, 0, &fp, &indx)) != 0) + if ((error = falloc(p, &fp, &indx)) != 0) break; #ifdef DIAGNOSTIC if (indx != i) @@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, regi fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; - FILE_SET_MATURE(fp, p); - } else { - FRELE(fp, p); + fdinsert(p->p_fd, indx, 0, fp); } + FRELE(fp, p); } fdpunlock(p->p_fd); if (error) Index: sys/kern/sys_pipe.c =================================================================== RCS file: /cvs/src/sys/kern/sys_pipe.c,v retrieving revision 1.80 diff -u -p -r1.80 sys_pipe.c --- sys/kern/sys_pipe.c 5 Jun 2018 09:29:05 -0000 1.80 +++ sys/kern/sys_pipe.c 14 Jun 2018 11:54:38 -0000 @@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int fl fdplock(fdp); - error = falloc(p, cloexec, &rf, &fds[0]); + error = falloc(p, &rf, &fds[0]); if (error != 0) goto free2; rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -162,7 +162,7 @@ dopipe(struct proc *p, int *ufds, int fl rf->f_data = rpipe; rf->f_ops = &pipeops; - error = falloc(p, cloexec, &wf, &fds[1]); + error = falloc(p, &wf, &fds[1]); if (error != 0) goto free3; wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int fl rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - FILE_SET_MATURE(rf, p); - FILE_SET_MATURE(wf, p); + fdinsert(fdp, fds[0], cloexec, rf); + fdinsert(fdp, fds[1], cloexec, wf); error = copyout(fds, ufds, sizeof(fds)); if (error != 0) { @@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int fl ktrfds(p, fds, 2); #endif fdpunlock(fdp); + + FRELE(rf, p); + FRELE(wf, p); return (error); free3: Index: sys/kern/tty_pty.c =================================================================== RCS file: /cvs/src/sys/kern/tty_pty.c,v retrieving revision 1.86 diff -u -p -r1.86 tty_pty.c --- sys/kern/tty_pty.c 5 Jun 2018 09:29:05 -0000 1.86 +++ sys/kern/tty_pty.c 14 Jun 2018 11:54:38 -0000 @@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t case PTMGET: fdplock(fdp); /* Grab two filedescriptors. */ - if ((error = falloc(p, 0, &cfp, &cindx)) != 0) { + if ((error = falloc(p, &cfp, &cindx)) != 0) { fdpunlock(fdp); break; } - if ((error = falloc(p, 0, &sfp, &sindx)) != 0) { + if ((error = falloc(p, &sfp, &sindx)) != 0) { fdremove(fdp, cindx); closef(cfp, p); fdpunlock(fdp); @@ -1166,11 +1166,12 @@ retry: memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn)); memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn)); - /* mark the files mature now that we've passed all errors */ - FILE_SET_MATURE(cfp, p); - FILE_SET_MATURE(sfp, p); - + /* insert files now that we've passed all errors */ + fdinsert(fdp, cindx, 0, cfp); + fdinsert(fdp, sindx, 0, sfp); fdpunlock(fdp); + FRELE(cfp, p); + FRELE(sfp, p); break; default: error = EINVAL; Index: sys/kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.175 diff -u -p -r1.175 uipc_syscalls.c --- sys/kern/uipc_syscalls.c 6 Jun 2018 06:55:22 -0000 1.175 +++ sys/kern/uipc_syscalls.c 14 Jun 2018 11:54:38 -0000 @@ -106,9 +106,9 @@ sys_socket(struct proc *p, void *v, regi KERNEL_LOCK(); fdplock(fdp); - error = falloc(p, cloexec, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) { + fdpunlock(fdp); soclose(so); } else { fp->f_flag = fflag; @@ -118,7 +118,9 @@ sys_socket(struct proc *p, void *v, regi so->so_state |= SS_NBIO; so->so_state |= ss; fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, fd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = fd; } KERNEL_UNLOCK(); @@ -273,7 +275,9 @@ doaccept(struct proc *p, int sock, struc socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int nflag; + int cloexec, nflag; + + cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; if (name && (error = copyin(anamelen, &namelen, sizeof (namelen)))) return (error); @@ -283,7 +287,7 @@ doaccept(struct proc *p, int sock, struc headfp = fp; fdplock(fdp); - error = falloc(p, (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0, &fp, &tmpfd); + error = falloc(p, &fp, &tmpfd); fdpunlock(fdp); if (error) { FRELE(headfp, p); @@ -348,8 +352,11 @@ out: else so->so_state &= ~SS_NBIO; sounlock(head, s); + fdplock(fdp); fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = tmpfd; } else { sounlock(head, s); @@ -478,13 +485,13 @@ sys_socketpair(struct proc *p, void *v, } KERNEL_LOCK(); fdplock(fdp); - if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0) + if ((error = falloc(p, &fp1, &sv[0])) != 0) goto free3; fp1->f_flag = fflag; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = so1; - if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0) + if ((error = falloc(p, &fp2, &sv[1])) != 0) goto free4; fp2->f_flag = fflag; fp2->f_type = DTYPE_SOCKET; @@ -502,9 +509,11 @@ sys_socketpair(struct proc *p, void *v, (*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type, p); } - FILE_SET_MATURE(fp1, p); - FILE_SET_MATURE(fp2, p); + fdinsert(fdp, sv[0], cloexec, fp1); + fdinsert(fdp, sv[1], cloexec, fp2); fdpunlock(fdp); + FRELE(fp1, p); + FRELE(fp2, p); KERNEL_UNLOCK(); return (0); } Index: sys/kern/vfs_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.289 diff -u -p -r1.289 vfs_syscalls.c --- sys/kern/vfs_syscalls.c 14 Jun 2018 00:23:36 -0000 1.289 +++ sys/kern/vfs_syscalls.c 14 Jun 2018 11:54:38 -0000 @@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const c struct file *fp; struct vnode *vp; struct vattr vattr; - int flags, cmode; + int flags, cloexec, cmode; int type, indx, error, localtrunc = 0; struct flock lf; struct nameidata nd; @@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const c return (error); } - fdplock(fdp); + cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0; - if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) + fdplock(fdp); + if ((error = falloc(p, &fp, &indx)) != 0) goto out; flags = FFLAGS(oflags); if (flags & FREAD) @@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const c } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, indx, cloexec, fp); + FRELE(fp, p); out: fdpunlock(fdp); return (error); @@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, regi struct vnode *vp = NULL; struct mount *mp; struct ucred *cred = p->p_ucred; - int flags; + int flags, cloexec; int type, indx, error=0; struct flock lf; struct vattr va; @@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, regi if ((flags & O_CREAT)) return (EINVAL); + cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0; + fdplock(fdp); - if ((error = falloc(p, (flags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) { + if ((error = falloc(p, &fp, &indx)) != 0) { fp = NULL; goto bad; } @@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, regi } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); - + fdinsert(fdp, indx, cloexec, fp); fdpunlock(fdp); + FRELE(fp, p); return (0); bad: Index: sys/sys/file.h =================================================================== RCS file: /cvs/src/sys/sys/file.h,v retrieving revision 1.47 diff -u -p -r1.47 file.h --- sys/sys/file.h 5 Jun 2018 09:29:05 -0000 1.47 +++ sys/sys/file.h 14 Jun 2018 11:54:38 -0000 @@ -91,10 +91,7 @@ struct file { }; #define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */ -#define FIF_LARVAL 0x02 /* not fully constructed, don't use */ - -#define FILE_IS_USABLE(fp) \ - (((fp)->f_iflags & FIF_LARVAL) == 0) +#define FIF_INSERTED 0x80 /* present in `filehead' */ #define FREF(fp) \ do { \ @@ -103,11 +100,6 @@ struct file { (fp)->f_count++; \ } while (0) #define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) - -#define FILE_SET_MATURE(fp,p) do { \ - (fp)->f_iflags &= ~FIF_LARVAL; \ - FRELE(fp, p); \ -} while (0) int fdrop(struct file *, struct proc *); Index: sys/sys/filedesc.h =================================================================== RCS file: /cvs/src/sys/sys/filedesc.h,v retrieving revision 1.37 diff -u -p -r1.37 filedesc.h --- sys/sys/filedesc.h 5 Jun 2018 09:29:05 -0000 1.37 +++ sys/sys/filedesc.h 14 Jun 2018 11:54:38 -0000 @@ -125,12 +125,13 @@ void filedesc_init(void); int dupfdopen(struct proc *, int, int); int fdalloc(struct proc *p, int want, int *result); void fdexpand(struct proc *); -int falloc(struct proc *_p, int _flags, struct file **_rfp, int *_rfd); +int falloc(struct proc *_p, struct file **_rfp, int *_rfd); struct filedesc *fdinit(void); struct filedesc *fdshare(struct process *); struct filedesc *fdcopy(struct process *); void fdfree(struct proc *p); int fdrelease(struct proc *p, int); +void fdinsert(struct filedesc *, int, int, struct file *); void fdremove(struct filedesc *, int); void fdcloseexec(struct proc *); struct file *fd_iterfile(struct file *, struct proc *); Index: usr.sbin/pstat/pstat.8 =================================================================== RCS file: /cvs/src/usr.sbin/pstat/pstat.8,v retrieving revision 1.54 diff -u -p -r1.54 pstat.8 --- usr.sbin/pstat/pstat.8 5 Jun 2018 09:29:05 -0000 1.54 +++ usr.sbin/pstat/pstat.8 14 Jun 2018 11:54:38 -0000 @@ -101,8 +101,6 @@ open for appending exclusive or shared lock present .It I signal pgrp when data ready -.It l -file descriptor slot is larval .El .It CNT Number of processes that know this open file. Index: usr.sbin/pstat/pstat.c =================================================================== RCS file: /cvs/src/usr.sbin/pstat/pstat.c,v retrieving revision 1.116 diff -u -p -r1.116 pstat.c --- usr.sbin/pstat/pstat.c 5 Jun 2018 09:29:05 -0000 1.116 +++ usr.sbin/pstat/pstat.c 14 Jun 2018 11:54:38 -0000 @@ -1044,8 +1044,6 @@ filemode(void) if (kf->f_iflags & FIF_HASLOCK) *fbp++ = 'L'; - if (kf->f_iflags & FIF_LARVAL) - *fbp++ = 'l'; *fbp = '\0'; (void)printf("%6s %3ld", flagbuf, (long)kf->f_count);