On 10/05/18(Thu) 22:49, Philip Guenther wrote:
> On Wed, May 9, 2018 at 3:31 AM, Martin Pieuchot <m...@openbsd.org> wrote:
> 
> > On 08/05/18(Tue) 14:12, Philip Guenther wrote:
> > > On Tue, 8 May 2018, Martin Pieuchot wrote:
> > > > The way our kernel allocates and populates new 'struct file *' is
> > > > currently a complete mess.  One of the problems is that it puts
> > > > incomplete descriptors on the global `filehead' list and on the
> > > > open file array `fd_ofiles'.  But to make sure that other threads
> > > > won't use such descriptors before they are ready, they are first
> > > > marked as LARVAL then unmarked via the FILE_SET_MATURE() macro.
> > >
> > > The reason we do that is to meet two goals:
> > > 1) we want to allocate the fd (and fail if we can't) before
> > >    possibly blocking while completing the creation of the struct file.
> > >    For example, in doaccept(), this thread being blocked waiting for a
> > new
> > >    connection must not block other threads from making fd changes.
> >
> > But the way I understand it, allocating the fd isn't changed by my diff.
> > fdalloc(), called by falloc(), still allocates it and mark the
> > corresponding
> > bits with fd_used().
> >
> 
> Those bits don't block dup2() *targeting* the fd, or close of the fd.
> 
> 
> 
> > > 2) close(half_created) and dup2(old, half_created) must be safe: it must
> > >    not leak the half_created file in the kernel, and on completion the
> > >    half_created fd must either be closed or dup'ed to; the half-created
> > >    file must not suddenly show up there later
> >
> > I don't understand, are you talking about close(2) and dup2(2)?  How
> > syscalls are related to half_created files?  How is that related to
> > my explanation below?
> >
> 
> thread 1: socket() -- > returns 3
> thread 1: bind(3, someaddr); listen(3);
> thread 1: accept(3) --> expected to return 4 (it's the first free fd), but
> blocks waiting for connection
> thread 2:       socket() --> returns 5 (if not, then explain that and
> ignore the rest of this sequences)
> thread 2:       dup2(0, 4) --> better not block!  after this fd 4 must be a
> dup of 0!
> thread 2:       connect(5, someaddr) --> unblocks thread 1
> 
> So what happens at the end of this?  Did accept() return 4 as expected and
> if so is fd 4 a duplicate of fd 0 or is it the accepted socket?  I argue
> that it must be a dup of 0 because dup2 succeeded, and the accept that
> completed later cannot close that fd.  That's how the current kernel
> behaves.
> 
> I haven't tested it, but my reading of the diff is that is fd 4 will be the
> accepted fd and the dup'ed copy of fd 0 will be lost, leaving it's
> reference count too high by one.

Here's an updated diff on top of the two bug fixes I sent today.  It
makes use of a new function, fd_inuse() to detect if a file is in the
LARVAL state and return an error in the case of dup2().

I'd appreciate review as this is a blocking issue for unlocking all
file related syscalls.

diff --git sys/kern/exec_script.c sys/kern/exec_script.c
index 0bfaea0aee0..1c7aae6f69b 100644
--- sys/kern/exec_script.c
+++ sys/kern/exec_script.c
@@ -170,17 +170,20 @@ check_shell:
 #endif
 
                fdplock(p->p_fd);
-               error = falloc(p, 0, &fp, &epp->ep_fd);
-               fdpunlock(p->p_fd);
-               if (error)
+               error = falloc(p, &fp, &epp->ep_fd);
+               if (error) {
+                       fdpunlock(p->p_fd);
                        goto fail;
+               }
 
                epp->ep_flags |= EXEC_HASFD;
                fp->f_type = DTYPE_VNODE;
                fp->f_ops = &vnops;
                fp->f_data = (caddr_t) scriptvp;
                fp->f_flag = FREAD;
-               FILE_SET_MATURE(fp, p);
+               fdinsert(p->p_fd, epp->ep_fd, 0, fp);
+               fdpunlock(p->p_fd);
+               FRELE(fp, p);
        }
 
        /* set up the parameters for the recursive check_exec() call */
diff --git sys/kern/kern_descrip.c sys/kern/kern_descrip.c
index 1734fb5cff0..694ebf1a128 100644
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -144,6 +144,23 @@ find_last_set(struct filedesc *fd, int last)
        return i;
 }
 
+static __inline int
+fd_inuse(struct filedesc *fdp, int fd)
+{
+       u_int off = fd >> NDENTRYSHIFT;
+
+       if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK)))
+               return 1;
+
+       if (fdp->fd_lomap[off] != ~0)
+               return 0;
+
+       if (fdp->fd_himap[off >> NDENTRYSHIFT] & (1 << (off & NDENTRYMASK)))
+               return 1;
+
+       return 0;
+}
+
 static __inline void
 fd_used(struct filedesc *fdp, int fd)
 {
@@ -190,7 +207,7 @@ fd_iterfile(struct file *fp, struct proc *p)
                nfp = LIST_NEXT(fp, f_list);
 
        /* don't FREF when f_count == 0 to avoid race in fdrop() */
-       while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp)))
+       while (nfp != NULL && nfp->f_count == 0)
                nfp = LIST_NEXT(nfp, f_list);
        if (nfp != NULL)
                FREF(nfp);
@@ -209,9 +226,6 @@ fd_getfile(struct filedesc *fdp, int fd)
        if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
                return (NULL);
 
-       if (!FILE_IS_USABLE(fp))
-               return (NULL);
-
        FREF(fp);
        return (fp);
 }
@@ -635,18 +649,19 @@ finishdup(struct proc *p, struct file *fp, int old, int 
new,
        }
 
        oldfp = fdp->fd_ofiles[new];
-       if (oldfp != NULL) {
-               if (!FILE_IS_USABLE(oldfp)) {
+       if (oldfp != NULL)
+               FREF(oldfp);
+
+       if (dup2 && oldfp == NULL) {
+               if (fd_inuse(fdp, new)) {
                        FRELE(fp, p);
                        return (EBUSY);
                }
-               FREF(oldfp);
+               fd_used(fdp, new);
        }
 
        fdp->fd_ofiles[new] = fp;
        fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
-       if (dup2 && oldfp == NULL)
-               fd_used(fdp, new);
        *retval = new;
 
        if (oldfp != NULL) {
@@ -658,6 +673,22 @@ finishdup(struct proc *p, struct file *fp, int old, int 
new,
        return (0);
 }
 
+void
+fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp)
+{
+       struct file *fq;
+
+       fdpassertlocked(fdp);
+       if ((fq = fdp->fd_ofiles[0]) != NULL) {
+               LIST_INSERT_AFTER(fq, fp, f_list);
+       } else {
+               LIST_INSERT_HEAD(&filehead, fp, f_list);
+       }
+       fdp->fd_ofiles[fd] = fp;
+       fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
+       fp->f_iflags |= FIF_INSERTED;
+}
+
 void
 fdremove(struct filedesc *fdp, int fd)
 {
@@ -928,9 +959,9 @@ fdexpand(struct proc *p)
  * a file descriptor for the process that refers to it.
  */
 int
-falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd)
+falloc(struct proc *p, struct file **resultfp, int *resultfd)
 {
-       struct file *fp, *fq;
+       struct file *fp;
        int error, i;
 
        KASSERT(resultfp != NULL);
@@ -959,14 +990,6 @@ restart:
        numfiles++;
        fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
        mtx_init(&fp->f_mtx, IPL_NONE);
-       fp->f_iflags = FIF_LARVAL;
-       if ((fq = p->p_fd->fd_ofiles[0]) != NULL) {
-               LIST_INSERT_AFTER(fq, fp, f_list);
-       } else {
-               LIST_INSERT_HEAD(&filehead, fp, f_list);
-       }
-       p->p_fd->fd_ofiles[i] = fp;
-       p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE);
        fp->f_count = 1;
        fp->f_cred = p->p_ucred;
        crhold(fp->f_cred);
@@ -1192,8 +1215,8 @@ fdrop(struct file *fp, struct proc *p)
        else
                error = 0;
 
-       /* Free fp */
-       LIST_REMOVE(fp, f_list);
+       if (fp->f_iflags & FIF_INSERTED)
+               LIST_REMOVE(fp, f_list);
        crfree(fp->f_cred);
        numfiles--;
        pool_put(&file_pool, fp);
diff --git sys/kern/kern_event.c sys/kern/kern_event.c
index dd5cb18d748..ec5b56a0d6d 100644
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
        int fd, error;
 
        fdplock(fdp);
-       error = falloc(p, 0, &fp, &fd);
-       fdpunlock(fdp);
+       error = falloc(p, &fp, &fd);
        if (error)
-               return (error);
+               goto out;
        fp->f_flag = FREAD | FWRITE;
        fp->f_type = DTYPE_KQUEUE;
        fp->f_ops = &kqueueops;
@@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
        if (fdp->fd_knlistsize < 0)
                fdp->fd_knlistsize = 0;         /* this process has a kq */
        kq->kq_fdp = fdp;
-       FILE_SET_MATURE(fp, p);
-       return (0);
+       fdinsert(fdp, fd, 0, fp);
+       FRELE(fp, p);
+out:
+       fdpunlock(fdp);
+       return (error);
 }
 
 int
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 10d2eb1bbbc..f9649a1e353 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -584,7 +584,7 @@ sys_execve(struct proc *p, void *v, register_t *retval)
                                struct vnode *vp;
                                int indx;
 
-                               if ((error = falloc(p, 0, &fp, &indx)) != 0)
+                               if ((error = falloc(p, &fp, &indx)) != 0)
                                        break;
 #ifdef DIAGNOSTIC
                                if (indx != i)
@@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
                                fp->f_type = DTYPE_VNODE;
                                fp->f_ops = &vnops;
                                fp->f_data = (caddr_t)vp;
-                               FILE_SET_MATURE(fp, p);
-                       } else {
-                               FRELE(fp, p);
+                               fdinsert(p->p_fd, indx, 0, fp);
                        }
+                       FRELE(fp, p);
                }
                fdpunlock(p->p_fd);
                if (error)
diff --git sys/kern/sys_pipe.c sys/kern/sys_pipe.c
index c0655cdda67..ebb79c9da77 100644
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int flags)
 
        fdplock(fdp);
 
-       error = falloc(p, cloexec, &rf, &fds[0]);
+       error = falloc(p, &rf, &fds[0]);
        if (error != 0)
                goto free2;
        rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
@@ -162,7 +162,7 @@ dopipe(struct proc *p, int *ufds, int flags)
        rf->f_data = rpipe;
        rf->f_ops = &pipeops;
 
-       error = falloc(p, cloexec, &wf, &fds[1]);
+       error = falloc(p, &wf, &fds[1]);
        if (error != 0)
                goto free3;
        wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
@@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int flags)
        rpipe->pipe_peer = wpipe;
        wpipe->pipe_peer = rpipe;
 
-       FILE_SET_MATURE(rf, p);
-       FILE_SET_MATURE(wf, p);
+       fdinsert(fdp, fds[0], cloexec, rf);
+       fdinsert(fdp, fds[1], cloexec, wf);
 
        error = copyout(fds, ufds, sizeof(fds));
        if (error != 0) {
@@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int flags)
                ktrfds(p, fds, 2);
 #endif
        fdpunlock(fdp);
+
+       FRELE(rf, p);
+       FRELE(wf, p);
        return (error);
 
 free3:
diff --git sys/kern/tty_pty.c sys/kern/tty_pty.c
index 4e5f75aa1b9..b6eeded0a62 100644
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, 
struct proc *p)
        case PTMGET:
                fdplock(fdp);
                /* Grab two filedescriptors. */
-               if ((error = falloc(p, 0, &cfp, &cindx)) != 0) {
+               if ((error = falloc(p, &cfp, &cindx)) != 0) {
                        fdpunlock(fdp);
                        break;
                }
-               if ((error = falloc(p, 0, &sfp, &sindx)) != 0) {
+               if ((error = falloc(p, &sfp, &sindx)) != 0) {
                        fdremove(fdp, cindx);
                        closef(cfp, p);
                        fdpunlock(fdp);
@@ -1166,11 +1166,12 @@ retry:
                memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn));
                memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn));
 
-               /* mark the files mature now that we've passed all errors */
-               FILE_SET_MATURE(cfp, p);
-               FILE_SET_MATURE(sfp, p);
-
+               /* insert files now that we've passed all errors */
+               fdinsert(fdp, cindx, 0, cfp);
+               fdinsert(fdp, sindx, 0, sfp);
                fdpunlock(fdp);
+               FRELE(cfp, p);
+               FRELE(sfp, p);
                break;
        default:
                error = EINVAL;
diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
index 6b3a629a0c4..aafd96c9083 100644
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -105,9 +105,9 @@ sys_socket(struct proc *p, void *v, register_t *retval)
                goto out;
 
        fdplock(fdp);
-       error = falloc(p, cloexec, &fp, &fd);
-       fdpunlock(fdp);
+       error = falloc(p, &fp, &fd);
        if (error) {
+               fdpunlock(fdp);
                soclose(so);
        } else {
                fp->f_flag = fflag;
@@ -117,7 +117,9 @@ sys_socket(struct proc *p, void *v, register_t *retval)
                        so->so_state |= SS_NBIO;
                so->so_state |= ss;
                fp->f_data = so;
-               FILE_SET_MATURE(fp, p);
+               fdinsert(fdp, fd, cloexec, fp);
+               fdpunlock(fdp);
+               FRELE(fp, p);
                *retval = fd;
        }
 out:
@@ -272,7 +274,9 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, 
socklen_t *anamelen,
        socklen_t namelen;
        int error, s, tmpfd;
        struct socket *head, *so;
-       int nflag;
+       int cloexec, nflag;
+
+       cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
 
        if (name && (error = copyin(anamelen, &namelen, sizeof (namelen))))
                return (error);
@@ -282,7 +286,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, 
socklen_t *anamelen,
        headfp = fp;
 
        fdplock(fdp);
-       error = falloc(p, (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0, &fp, &tmpfd);
+       error = falloc(p, &fp, &tmpfd);
        fdpunlock(fdp);
        if (error) {
                FRELE(headfp, p);
@@ -347,8 +351,11 @@ out:
                else
                        so->so_state &= ~SS_NBIO;
                sounlock(s);
+               fdplock(fdp);
                fp->f_data = so;
-               FILE_SET_MATURE(fp, p);
+               fdinsert(fdp, tmpfd, cloexec, fp);
+               fdpunlock(fdp);
+               FRELE(fp, p);
                *retval = tmpfd;
        } else {
                sounlock(s);
@@ -476,13 +483,13 @@ sys_socketpair(struct proc *p, void *v, register_t 
*retval)
                        goto free2;
        }
        fdplock(fdp);
-       if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0)
+       if ((error = falloc(p, &fp1, &sv[0])) != 0)
                goto free3;
        fp1->f_flag = fflag;
        fp1->f_type = DTYPE_SOCKET;
        fp1->f_ops = &socketops;
        fp1->f_data = so1;
-       if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0)
+       if ((error = falloc(p, &fp2, &sv[1])) != 0)
                goto free4;
        fp2->f_flag = fflag;
        fp2->f_type = DTYPE_SOCKET;
@@ -500,9 +507,11 @@ sys_socketpair(struct proc *p, void *v, register_t *retval)
                        (*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type,
                            p);
                }
-               FILE_SET_MATURE(fp1, p);
-               FILE_SET_MATURE(fp2, p);
+               fdinsert(fdp, sv[0], cloexec, fp1);
+               fdinsert(fdp, sv[1], cloexec, fp2);
                fdpunlock(fdp);
+               FRELE(fp1, p);
+               FRELE(fp2, p);
                return (0);
        }
        fdremove(fdp, sv[1]);
diff --git sys/kern/vfs_syscalls.c sys/kern/vfs_syscalls.c
index a39376f6fe7..52e82963f20 100644
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const char *path, int 
oflags, mode_t mode,
        struct file *fp;
        struct vnode *vp;
        struct vattr vattr;
-       int flags, cmode;
+       int flags, cloexec, cmode;
        int type, indx, error, localtrunc = 0;
        struct flock lf;
        struct nameidata nd;
@@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const char *path, int 
oflags, mode_t mode,
                        return (error);
        }
 
-       fdplock(fdp);
+       cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0;
 
-       if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp,
-           &indx)) != 0)
+       fdplock(fdp);
+       if ((error = falloc(p, &fp, &indx)) != 0)
                goto out;
        flags = FFLAGS(oflags);
        if (flags & FREAD)
@@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const char *path, int 
oflags, mode_t mode,
        }
        VOP_UNLOCK(vp);
        *retval = indx;
-       FILE_SET_MATURE(fp, p);
+       fdinsert(fdp, indx, cloexec, fp);
+       FRELE(fp, p);
 out:
        fdpunlock(fdp);
        return (error);
@@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
        struct vnode *vp = NULL;
        struct mount *mp;
        struct ucred *cred = p->p_ucred;
-       int flags;
+       int flags, cloexec;
        int type, indx, error=0;
        struct flock lf;
        struct vattr va;
@@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
        if ((flags & O_CREAT))
                return (EINVAL);
 
+       cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+
        fdplock(fdp);
-       if ((error = falloc(p, (flags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp,
-           &indx)) != 0) {
+       if ((error = falloc(p, &fp, &indx)) != 0) {
                fp = NULL;
                goto bad;
        }
@@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
        }
        VOP_UNLOCK(vp);
        *retval = indx;
-       FILE_SET_MATURE(fp, p);
-
+       fdinsert(fdp, indx, cloexec, fp);
        fdpunlock(fdp);
+       FRELE(fp, p);
        return (0);
 
 bad:
diff --git sys/sys/file.h sys/sys/file.h
index 042238ebd3f..9e322b4ae12 100644
--- sys/sys/file.h
+++ sys/sys/file.h
@@ -91,10 +91,7 @@ struct file {
 };
 
 #define FIF_HASLOCK            0x01    /* descriptor holds advisory lock */
-#define FIF_LARVAL             0x02    /* not fully constructed, don't use */
-
-#define FILE_IS_USABLE(fp) \
-       (((fp)->f_iflags & FIF_LARVAL) == 0)
+#define FIF_INSERTED           0x80    /* present in `filehead' */
 
 #define FREF(fp) \
        do { \
@@ -104,11 +101,6 @@ struct file {
        } while (0)
 #define FRELE(fp,p)    (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
 
-#define FILE_SET_MATURE(fp,p) do {                             \
-       (fp)->f_iflags &= ~FIF_LARVAL;                          \
-       FRELE(fp, p);                                           \
-} while (0)
-
 int    fdrop(struct file *, struct proc *);
 
 LIST_HEAD(filelist, file);
diff --git sys/sys/filedesc.h sys/sys/filedesc.h
index 39486fafd60..1e1c463642e 100644
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -125,12 +125,13 @@ void      filedesc_init(void);
 int    dupfdopen(struct proc *, int, int);
 int    fdalloc(struct proc *p, int want, int *result);
 void   fdexpand(struct proc *);
-int    falloc(struct proc *_p, int _flags, struct file **_rfp, int *_rfd);
+int    falloc(struct proc *_p, struct file **_rfp, int *_rfd);
 struct filedesc *fdinit(void);
 struct filedesc *fdshare(struct process *);
 struct filedesc *fdcopy(struct process *);
 void   fdfree(struct proc *p);
 int    fdrelease(struct proc *p, int);
+void   fdinsert(struct filedesc *, int, int, struct file *);
 void   fdremove(struct filedesc *, int);
 void   fdcloseexec(struct proc *);
 struct file *fd_iterfile(struct file *, struct proc *);

Reply via email to