On 10/05/18(Thu) 22:49, Philip Guenther wrote: > On Wed, May 9, 2018 at 3:31 AM, Martin Pieuchot <m...@openbsd.org> wrote: > > > On 08/05/18(Tue) 14:12, Philip Guenther wrote: > > > On Tue, 8 May 2018, Martin Pieuchot wrote: > > > > The way our kernel allocates and populates new 'struct file *' is > > > > currently a complete mess. One of the problems is that it puts > > > > incomplete descriptors on the global `filehead' list and on the > > > > open file array `fd_ofiles'. But to make sure that other threads > > > > won't use such descriptors before they are ready, they are first > > > > marked as LARVAL then unmarked via the FILE_SET_MATURE() macro. > > > > > > The reason we do that is to meet two goals: > > > 1) we want to allocate the fd (and fail if we can't) before > > > possibly blocking while completing the creation of the struct file. > > > For example, in doaccept(), this thread being blocked waiting for a > > new > > > connection must not block other threads from making fd changes. > > > > But the way I understand it, allocating the fd isn't changed by my diff. > > fdalloc(), called by falloc(), still allocates it and mark the > > corresponding > > bits with fd_used(). > > > > Those bits don't block dup2() *targeting* the fd, or close of the fd. > > > > > > 2) close(half_created) and dup2(old, half_created) must be safe: it must > > > not leak the half_created file in the kernel, and on completion the > > > half_created fd must either be closed or dup'ed to; the half-created > > > file must not suddenly show up there later > > > > I don't understand, are you talking about close(2) and dup2(2)? How > > syscalls are related to half_created files? How is that related to > > my explanation below? > > > > thread 1: socket() -- > returns 3 > thread 1: bind(3, someaddr); listen(3); > thread 1: accept(3) --> expected to return 4 (it's the first free fd), but > blocks waiting for connection > thread 2: socket() --> returns 5 (if not, then explain that and > ignore the rest of this sequences) > thread 2: dup2(0, 4) --> better not block! after this fd 4 must be a > dup of 0! > thread 2: connect(5, someaddr) --> unblocks thread 1 > > So what happens at the end of this? Did accept() return 4 as expected and > if so is fd 4 a duplicate of fd 0 or is it the accepted socket? I argue > that it must be a dup of 0 because dup2 succeeded, and the accept that > completed later cannot close that fd. That's how the current kernel > behaves. > > I haven't tested it, but my reading of the diff is that is fd 4 will be the > accepted fd and the dup'ed copy of fd 0 will be lost, leaving it's > reference count too high by one.
Here's an updated diff on top of the two bug fixes I sent today. It makes use of a new function, fd_inuse() to detect if a file is in the LARVAL state and return an error in the case of dup2(). I'd appreciate review as this is a blocking issue for unlocking all file related syscalls. diff --git sys/kern/exec_script.c sys/kern/exec_script.c index 0bfaea0aee0..1c7aae6f69b 100644 --- sys/kern/exec_script.c +++ sys/kern/exec_script.c @@ -170,17 +170,20 @@ check_shell: #endif fdplock(p->p_fd); - error = falloc(p, 0, &fp, &epp->ep_fd); - fdpunlock(p->p_fd); - if (error) + error = falloc(p, &fp, &epp->ep_fd); + if (error) { + fdpunlock(p->p_fd); goto fail; + } epp->ep_flags |= EXEC_HASFD; fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t) scriptvp; fp->f_flag = FREAD; - FILE_SET_MATURE(fp, p); + fdinsert(p->p_fd, epp->ep_fd, 0, fp); + fdpunlock(p->p_fd); + FRELE(fp, p); } /* set up the parameters for the recursive check_exec() call */ diff --git sys/kern/kern_descrip.c sys/kern/kern_descrip.c index 1734fb5cff0..694ebf1a128 100644 --- sys/kern/kern_descrip.c +++ sys/kern/kern_descrip.c @@ -144,6 +144,23 @@ find_last_set(struct filedesc *fd, int last) return i; } +static __inline int +fd_inuse(struct filedesc *fdp, int fd) +{ + u_int off = fd >> NDENTRYSHIFT; + + if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) + return 1; + + if (fdp->fd_lomap[off] != ~0) + return 0; + + if (fdp->fd_himap[off >> NDENTRYSHIFT] & (1 << (off & NDENTRYMASK))) + return 1; + + return 0; +} + static __inline void fd_used(struct filedesc *fdp, int fd) { @@ -190,7 +207,7 @@ fd_iterfile(struct file *fp, struct proc *p) nfp = LIST_NEXT(fp, f_list); /* don't FREF when f_count == 0 to avoid race in fdrop() */ - while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp))) + while (nfp != NULL && nfp->f_count == 0) nfp = LIST_NEXT(nfp, f_list); if (nfp != NULL) FREF(nfp); @@ -209,9 +226,6 @@ fd_getfile(struct filedesc *fdp, int fd) if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (NULL); - if (!FILE_IS_USABLE(fp)) - return (NULL); - FREF(fp); return (fp); } @@ -635,18 +649,19 @@ finishdup(struct proc *p, struct file *fp, int old, int new, } oldfp = fdp->fd_ofiles[new]; - if (oldfp != NULL) { - if (!FILE_IS_USABLE(oldfp)) { + if (oldfp != NULL) + FREF(oldfp); + + if (dup2 && oldfp == NULL) { + if (fd_inuse(fdp, new)) { FRELE(fp, p); return (EBUSY); } - FREF(oldfp); + fd_used(fdp, new); } fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE; - if (dup2 && oldfp == NULL) - fd_used(fdp, new); *retval = new; if (oldfp != NULL) { @@ -658,6 +673,22 @@ finishdup(struct proc *p, struct file *fp, int old, int new, return (0); } +void +fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp) +{ + struct file *fq; + + fdpassertlocked(fdp); + if ((fq = fdp->fd_ofiles[0]) != NULL) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + fdp->fd_ofiles[fd] = fp; + fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); + fp->f_iflags |= FIF_INSERTED; +} + void fdremove(struct filedesc *fdp, int fd) { @@ -928,9 +959,9 @@ fdexpand(struct proc *p) * a file descriptor for the process that refers to it. */ int -falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd) +falloc(struct proc *p, struct file **resultfp, int *resultfd) { - struct file *fp, *fq; + struct file *fp; int error, i; KASSERT(resultfp != NULL); @@ -959,14 +990,6 @@ restart: numfiles++; fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO); mtx_init(&fp->f_mtx, IPL_NONE); - fp->f_iflags = FIF_LARVAL; - if ((fq = p->p_fd->fd_ofiles[0]) != NULL) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - p->p_fd->fd_ofiles[i] = fp; - p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE); fp->f_count = 1; fp->f_cred = p->p_ucred; crhold(fp->f_cred); @@ -1192,8 +1215,8 @@ fdrop(struct file *fp, struct proc *p) else error = 0; - /* Free fp */ - LIST_REMOVE(fp, f_list); + if (fp->f_iflags & FIF_INSERTED) + LIST_REMOVE(fp, f_list); crfree(fp->f_cred); numfiles--; pool_put(&file_pool, fp); diff --git sys/kern/kern_event.c sys/kern/kern_event.c index dd5cb18d748..ec5b56a0d6d 100644 --- sys/kern/kern_event.c +++ sys/kern/kern_event.c @@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, register_t *retval) int fd, error; fdplock(fdp); - error = falloc(p, 0, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) - return (error); + goto out; fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; @@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, register_t *retval) if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ kq->kq_fdp = fdp; - FILE_SET_MATURE(fp, p); - return (0); + fdinsert(fdp, fd, 0, fp); + FRELE(fp, p); +out: + fdpunlock(fdp); + return (error); } int diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c index 10d2eb1bbbc..f9649a1e353 100644 --- sys/kern/kern_exec.c +++ sys/kern/kern_exec.c @@ -584,7 +584,7 @@ sys_execve(struct proc *p, void *v, register_t *retval) struct vnode *vp; int indx; - if ((error = falloc(p, 0, &fp, &indx)) != 0) + if ((error = falloc(p, &fp, &indx)) != 0) break; #ifdef DIAGNOSTIC if (indx != i) @@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, register_t *retval) fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; - FILE_SET_MATURE(fp, p); - } else { - FRELE(fp, p); + fdinsert(p->p_fd, indx, 0, fp); } + FRELE(fp, p); } fdpunlock(p->p_fd); if (error) diff --git sys/kern/sys_pipe.c sys/kern/sys_pipe.c index c0655cdda67..ebb79c9da77 100644 --- sys/kern/sys_pipe.c +++ sys/kern/sys_pipe.c @@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int flags) fdplock(fdp); - error = falloc(p, cloexec, &rf, &fds[0]); + error = falloc(p, &rf, &fds[0]); if (error != 0) goto free2; rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -162,7 +162,7 @@ dopipe(struct proc *p, int *ufds, int flags) rf->f_data = rpipe; rf->f_ops = &pipeops; - error = falloc(p, cloexec, &wf, &fds[1]); + error = falloc(p, &wf, &fds[1]); if (error != 0) goto free3; wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK); @@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int flags) rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; - FILE_SET_MATURE(rf, p); - FILE_SET_MATURE(wf, p); + fdinsert(fdp, fds[0], cloexec, rf); + fdinsert(fdp, fds[1], cloexec, wf); error = copyout(fds, ufds, sizeof(fds)); if (error != 0) { @@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int flags) ktrfds(p, fds, 2); #endif fdpunlock(fdp); + + FRELE(rf, p); + FRELE(wf, p); return (error); free3: diff --git sys/kern/tty_pty.c sys/kern/tty_pty.c index 4e5f75aa1b9..b6eeded0a62 100644 --- sys/kern/tty_pty.c +++ sys/kern/tty_pty.c @@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) case PTMGET: fdplock(fdp); /* Grab two filedescriptors. */ - if ((error = falloc(p, 0, &cfp, &cindx)) != 0) { + if ((error = falloc(p, &cfp, &cindx)) != 0) { fdpunlock(fdp); break; } - if ((error = falloc(p, 0, &sfp, &sindx)) != 0) { + if ((error = falloc(p, &sfp, &sindx)) != 0) { fdremove(fdp, cindx); closef(cfp, p); fdpunlock(fdp); @@ -1166,11 +1166,12 @@ retry: memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn)); memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn)); - /* mark the files mature now that we've passed all errors */ - FILE_SET_MATURE(cfp, p); - FILE_SET_MATURE(sfp, p); - + /* insert files now that we've passed all errors */ + fdinsert(fdp, cindx, 0, cfp); + fdinsert(fdp, sindx, 0, sfp); fdpunlock(fdp); + FRELE(cfp, p); + FRELE(sfp, p); break; default: error = EINVAL; diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c index 6b3a629a0c4..aafd96c9083 100644 --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -105,9 +105,9 @@ sys_socket(struct proc *p, void *v, register_t *retval) goto out; fdplock(fdp); - error = falloc(p, cloexec, &fp, &fd); - fdpunlock(fdp); + error = falloc(p, &fp, &fd); if (error) { + fdpunlock(fdp); soclose(so); } else { fp->f_flag = fflag; @@ -117,7 +117,9 @@ sys_socket(struct proc *p, void *v, register_t *retval) so->so_state |= SS_NBIO; so->so_state |= ss; fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, fd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = fd; } out: @@ -272,7 +274,9 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen, socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int nflag; + int cloexec, nflag; + + cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; if (name && (error = copyin(anamelen, &namelen, sizeof (namelen)))) return (error); @@ -282,7 +286,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen, headfp = fp; fdplock(fdp); - error = falloc(p, (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0, &fp, &tmpfd); + error = falloc(p, &fp, &tmpfd); fdpunlock(fdp); if (error) { FRELE(headfp, p); @@ -347,8 +351,11 @@ out: else so->so_state &= ~SS_NBIO; sounlock(s); + fdplock(fdp); fp->f_data = so; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); *retval = tmpfd; } else { sounlock(s); @@ -476,13 +483,13 @@ sys_socketpair(struct proc *p, void *v, register_t *retval) goto free2; } fdplock(fdp); - if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0) + if ((error = falloc(p, &fp1, &sv[0])) != 0) goto free3; fp1->f_flag = fflag; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = so1; - if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0) + if ((error = falloc(p, &fp2, &sv[1])) != 0) goto free4; fp2->f_flag = fflag; fp2->f_type = DTYPE_SOCKET; @@ -500,9 +507,11 @@ sys_socketpair(struct proc *p, void *v, register_t *retval) (*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type, p); } - FILE_SET_MATURE(fp1, p); - FILE_SET_MATURE(fp2, p); + fdinsert(fdp, sv[0], cloexec, fp1); + fdinsert(fdp, sv[1], cloexec, fp2); fdpunlock(fdp); + FRELE(fp1, p); + FRELE(fp2, p); return (0); } fdremove(fdp, sv[1]); diff --git sys/kern/vfs_syscalls.c sys/kern/vfs_syscalls.c index a39376f6fe7..52e82963f20 100644 --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode, struct file *fp; struct vnode *vp; struct vattr vattr; - int flags, cmode; + int flags, cloexec, cmode; int type, indx, error, localtrunc = 0; struct flock lf; struct nameidata nd; @@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode, return (error); } - fdplock(fdp); + cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0; - if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) + fdplock(fdp); + if ((error = falloc(p, &fp, &indx)) != 0) goto out; flags = FFLAGS(oflags); if (flags & FREAD) @@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const char *path, int oflags, mode_t mode, } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); + fdinsert(fdp, indx, cloexec, fp); + FRELE(fp, p); out: fdpunlock(fdp); return (error); @@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, register_t *retval) struct vnode *vp = NULL; struct mount *mp; struct ucred *cred = p->p_ucred; - int flags; + int flags, cloexec; int type, indx, error=0; struct flock lf; struct vattr va; @@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, register_t *retval) if ((flags & O_CREAT)) return (EINVAL); + cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0; + fdplock(fdp); - if ((error = falloc(p, (flags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp, - &indx)) != 0) { + if ((error = falloc(p, &fp, &indx)) != 0) { fp = NULL; goto bad; } @@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, register_t *retval) } VOP_UNLOCK(vp); *retval = indx; - FILE_SET_MATURE(fp, p); - + fdinsert(fdp, indx, cloexec, fp); fdpunlock(fdp); + FRELE(fp, p); return (0); bad: diff --git sys/sys/file.h sys/sys/file.h index 042238ebd3f..9e322b4ae12 100644 --- sys/sys/file.h +++ sys/sys/file.h @@ -91,10 +91,7 @@ struct file { }; #define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */ -#define FIF_LARVAL 0x02 /* not fully constructed, don't use */ - -#define FILE_IS_USABLE(fp) \ - (((fp)->f_iflags & FIF_LARVAL) == 0) +#define FIF_INSERTED 0x80 /* present in `filehead' */ #define FREF(fp) \ do { \ @@ -104,11 +101,6 @@ struct file { } while (0) #define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) -#define FILE_SET_MATURE(fp,p) do { \ - (fp)->f_iflags &= ~FIF_LARVAL; \ - FRELE(fp, p); \ -} while (0) - int fdrop(struct file *, struct proc *); LIST_HEAD(filelist, file); diff --git sys/sys/filedesc.h sys/sys/filedesc.h index 39486fafd60..1e1c463642e 100644 --- sys/sys/filedesc.h +++ sys/sys/filedesc.h @@ -125,12 +125,13 @@ void filedesc_init(void); int dupfdopen(struct proc *, int, int); int fdalloc(struct proc *p, int want, int *result); void fdexpand(struct proc *); -int falloc(struct proc *_p, int _flags, struct file **_rfp, int *_rfd); +int falloc(struct proc *_p, struct file **_rfp, int *_rfd); struct filedesc *fdinit(void); struct filedesc *fdshare(struct process *); struct filedesc *fdcopy(struct process *); void fdfree(struct proc *p); int fdrelease(struct proc *p, int); +void fdinsert(struct filedesc *, int, int, struct file *); void fdremove(struct filedesc *, int); void fdcloseexec(struct proc *); struct file *fd_iterfile(struct file *, struct proc *);