On 10/05/18(Thu) 22:49, Philip Guenther wrote:
> On Wed, May 9, 2018 at 3:31 AM, Martin Pieuchot <[email protected]> wrote:
>
> > On 08/05/18(Tue) 14:12, Philip Guenther wrote:
> > > On Tue, 8 May 2018, Martin Pieuchot wrote:
> > > > The way our kernel allocates and populates new 'struct file *' is
> > > > currently a complete mess. One of the problems is that it puts
> > > > incomplete descriptors on the global `filehead' list and on the
> > > > open file array `fd_ofiles'. But to make sure that other threads
> > > > won't use such descriptors before they are ready, they are first
> > > > marked as LARVAL then unmarked via the FILE_SET_MATURE() macro.
> > >
> > > The reason we do that is to meet two goals:
> > > 1) we want to allocate the fd (and fail if we can't) before
> > > possibly blocking while completing the creation of the struct file.
> > > For example, in doaccept(), this thread being blocked waiting for a
> > new
> > > connection must not block other threads from making fd changes.
> >
> > But the way I understand it, allocating the fd isn't changed by my diff.
> > fdalloc(), called by falloc(), still allocates it and mark the
> > corresponding
> > bits with fd_used().
> >
>
> Those bits don't block dup2() *targeting* the fd, or close of the fd.
>
>
>
> > > 2) close(half_created) and dup2(old, half_created) must be safe: it must
> > > not leak the half_created file in the kernel, and on completion the
> > > half_created fd must either be closed or dup'ed to; the half-created
> > > file must not suddenly show up there later
> >
> > I don't understand, are you talking about close(2) and dup2(2)? How
> > syscalls are related to half_created files? How is that related to
> > my explanation below?
> >
>
> thread 1: socket() -- > returns 3
> thread 1: bind(3, someaddr); listen(3);
> thread 1: accept(3) --> expected to return 4 (it's the first free fd), but
> blocks waiting for connection
> thread 2: socket() --> returns 5 (if not, then explain that and
> ignore the rest of this sequences)
> thread 2: dup2(0, 4) --> better not block! after this fd 4 must be a
> dup of 0!
> thread 2: connect(5, someaddr) --> unblocks thread 1
>
> So what happens at the end of this? Did accept() return 4 as expected and
> if so is fd 4 a duplicate of fd 0 or is it the accepted socket? I argue
> that it must be a dup of 0 because dup2 succeeded, and the accept that
> completed later cannot close that fd. That's how the current kernel
> behaves.
>
> I haven't tested it, but my reading of the diff is that is fd 4 will be the
> accepted fd and the dup'ed copy of fd 0 will be lost, leaving it's
> reference count too high by one.
Here's an updated diff on top of the two bug fixes I sent today. It
makes use of a new function, fd_inuse() to detect if a file is in the
LARVAL state and return an error in the case of dup2().
I'd appreciate review as this is a blocking issue for unlocking all
file related syscalls.
diff --git sys/kern/exec_script.c sys/kern/exec_script.c
index 0bfaea0aee0..1c7aae6f69b 100644
--- sys/kern/exec_script.c
+++ sys/kern/exec_script.c
@@ -170,17 +170,20 @@ check_shell:
#endif
fdplock(p->p_fd);
- error = falloc(p, 0, &fp, &epp->ep_fd);
- fdpunlock(p->p_fd);
- if (error)
+ error = falloc(p, &fp, &epp->ep_fd);
+ if (error) {
+ fdpunlock(p->p_fd);
goto fail;
+ }
epp->ep_flags |= EXEC_HASFD;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_data = (caddr_t) scriptvp;
fp->f_flag = FREAD;
- FILE_SET_MATURE(fp, p);
+ fdinsert(p->p_fd, epp->ep_fd, 0, fp);
+ fdpunlock(p->p_fd);
+ FRELE(fp, p);
}
/* set up the parameters for the recursive check_exec() call */
diff --git sys/kern/kern_descrip.c sys/kern/kern_descrip.c
index 1734fb5cff0..694ebf1a128 100644
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -144,6 +144,23 @@ find_last_set(struct filedesc *fd, int last)
return i;
}
+static __inline int
+fd_inuse(struct filedesc *fdp, int fd)
+{
+ u_int off = fd >> NDENTRYSHIFT;
+
+ if (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK)))
+ return 1;
+
+ if (fdp->fd_lomap[off] != ~0)
+ return 0;
+
+ if (fdp->fd_himap[off >> NDENTRYSHIFT] & (1 << (off & NDENTRYMASK)))
+ return 1;
+
+ return 0;
+}
+
static __inline void
fd_used(struct filedesc *fdp, int fd)
{
@@ -190,7 +207,7 @@ fd_iterfile(struct file *fp, struct proc *p)
nfp = LIST_NEXT(fp, f_list);
/* don't FREF when f_count == 0 to avoid race in fdrop() */
- while (nfp != NULL && (nfp->f_count == 0 || !FILE_IS_USABLE(nfp)))
+ while (nfp != NULL && nfp->f_count == 0)
nfp = LIST_NEXT(nfp, f_list);
if (nfp != NULL)
FREF(nfp);
@@ -209,9 +226,6 @@ fd_getfile(struct filedesc *fdp, int fd)
if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
return (NULL);
- if (!FILE_IS_USABLE(fp))
- return (NULL);
-
FREF(fp);
return (fp);
}
@@ -635,18 +649,19 @@ finishdup(struct proc *p, struct file *fp, int old, int
new,
}
oldfp = fdp->fd_ofiles[new];
- if (oldfp != NULL) {
- if (!FILE_IS_USABLE(oldfp)) {
+ if (oldfp != NULL)
+ FREF(oldfp);
+
+ if (dup2 && oldfp == NULL) {
+ if (fd_inuse(fdp, new)) {
FRELE(fp, p);
return (EBUSY);
}
- FREF(oldfp);
+ fd_used(fdp, new);
}
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
- if (dup2 && oldfp == NULL)
- fd_used(fdp, new);
*retval = new;
if (oldfp != NULL) {
@@ -658,6 +673,22 @@ finishdup(struct proc *p, struct file *fp, int old, int
new,
return (0);
}
+void
+fdinsert(struct filedesc *fdp, int fd, int flags, struct file *fp)
+{
+ struct file *fq;
+
+ fdpassertlocked(fdp);
+ if ((fq = fdp->fd_ofiles[0]) != NULL) {
+ LIST_INSERT_AFTER(fq, fp, f_list);
+ } else {
+ LIST_INSERT_HEAD(&filehead, fp, f_list);
+ }
+ fdp->fd_ofiles[fd] = fp;
+ fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
+ fp->f_iflags |= FIF_INSERTED;
+}
+
void
fdremove(struct filedesc *fdp, int fd)
{
@@ -928,9 +959,9 @@ fdexpand(struct proc *p)
* a file descriptor for the process that refers to it.
*/
int
-falloc(struct proc *p, int flags, struct file **resultfp, int *resultfd)
+falloc(struct proc *p, struct file **resultfp, int *resultfd)
{
- struct file *fp, *fq;
+ struct file *fp;
int error, i;
KASSERT(resultfp != NULL);
@@ -959,14 +990,6 @@ restart:
numfiles++;
fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
mtx_init(&fp->f_mtx, IPL_NONE);
- fp->f_iflags = FIF_LARVAL;
- if ((fq = p->p_fd->fd_ofiles[0]) != NULL) {
- LIST_INSERT_AFTER(fq, fp, f_list);
- } else {
- LIST_INSERT_HEAD(&filehead, fp, f_list);
- }
- p->p_fd->fd_ofiles[i] = fp;
- p->p_fd->fd_ofileflags[i] |= (flags & UF_EXCLOSE);
fp->f_count = 1;
fp->f_cred = p->p_ucred;
crhold(fp->f_cred);
@@ -1192,8 +1215,8 @@ fdrop(struct file *fp, struct proc *p)
else
error = 0;
- /* Free fp */
- LIST_REMOVE(fp, f_list);
+ if (fp->f_iflags & FIF_INSERTED)
+ LIST_REMOVE(fp, f_list);
crfree(fp->f_cred);
numfiles--;
pool_put(&file_pool, fp);
diff --git sys/kern/kern_event.c sys/kern/kern_event.c
index dd5cb18d748..ec5b56a0d6d 100644
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -441,10 +441,9 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
int fd, error;
fdplock(fdp);
- error = falloc(p, 0, &fp, &fd);
- fdpunlock(fdp);
+ error = falloc(p, &fp, &fd);
if (error)
- return (error);
+ goto out;
fp->f_flag = FREAD | FWRITE;
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = &kqueueops;
@@ -456,8 +455,11 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
if (fdp->fd_knlistsize < 0)
fdp->fd_knlistsize = 0; /* this process has a kq */
kq->kq_fdp = fdp;
- FILE_SET_MATURE(fp, p);
- return (0);
+ fdinsert(fdp, fd, 0, fp);
+ FRELE(fp, p);
+out:
+ fdpunlock(fdp);
+ return (error);
}
int
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 10d2eb1bbbc..f9649a1e353 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -584,7 +584,7 @@ sys_execve(struct proc *p, void *v, register_t *retval)
struct vnode *vp;
int indx;
- if ((error = falloc(p, 0, &fp, &indx)) != 0)
+ if ((error = falloc(p, &fp, &indx)) != 0)
break;
#ifdef DIAGNOSTIC
if (indx != i)
@@ -607,10 +607,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_data = (caddr_t)vp;
- FILE_SET_MATURE(fp, p);
- } else {
- FRELE(fp, p);
+ fdinsert(p->p_fd, indx, 0, fp);
}
+ FRELE(fp, p);
}
fdpunlock(p->p_fd);
if (error)
diff --git sys/kern/sys_pipe.c sys/kern/sys_pipe.c
index c0655cdda67..ebb79c9da77 100644
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -154,7 +154,7 @@ dopipe(struct proc *p, int *ufds, int flags)
fdplock(fdp);
- error = falloc(p, cloexec, &rf, &fds[0]);
+ error = falloc(p, &rf, &fds[0]);
if (error != 0)
goto free2;
rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
@@ -162,7 +162,7 @@ dopipe(struct proc *p, int *ufds, int flags)
rf->f_data = rpipe;
rf->f_ops = &pipeops;
- error = falloc(p, cloexec, &wf, &fds[1]);
+ error = falloc(p, &wf, &fds[1]);
if (error != 0)
goto free3;
wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
@@ -173,8 +173,8 @@ dopipe(struct proc *p, int *ufds, int flags)
rpipe->pipe_peer = wpipe;
wpipe->pipe_peer = rpipe;
- FILE_SET_MATURE(rf, p);
- FILE_SET_MATURE(wf, p);
+ fdinsert(fdp, fds[0], cloexec, rf);
+ fdinsert(fdp, fds[1], cloexec, wf);
error = copyout(fds, ufds, sizeof(fds));
if (error != 0) {
@@ -186,6 +186,9 @@ dopipe(struct proc *p, int *ufds, int flags)
ktrfds(p, fds, 2);
#endif
fdpunlock(fdp);
+
+ FRELE(rf, p);
+ FRELE(wf, p);
return (error);
free3:
diff --git sys/kern/tty_pty.c sys/kern/tty_pty.c
index 4e5f75aa1b9..b6eeded0a62 100644
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -1070,11 +1070,11 @@ ptmioctl(dev_t dev, u_long cmd, caddr_t data, int flag,
struct proc *p)
case PTMGET:
fdplock(fdp);
/* Grab two filedescriptors. */
- if ((error = falloc(p, 0, &cfp, &cindx)) != 0) {
+ if ((error = falloc(p, &cfp, &cindx)) != 0) {
fdpunlock(fdp);
break;
}
- if ((error = falloc(p, 0, &sfp, &sindx)) != 0) {
+ if ((error = falloc(p, &sfp, &sindx)) != 0) {
fdremove(fdp, cindx);
closef(cfp, p);
fdpunlock(fdp);
@@ -1166,11 +1166,12 @@ retry:
memcpy(ptm->cn, pti->pty_pn, sizeof(pti->pty_pn));
memcpy(ptm->sn, pti->pty_sn, sizeof(pti->pty_sn));
- /* mark the files mature now that we've passed all errors */
- FILE_SET_MATURE(cfp, p);
- FILE_SET_MATURE(sfp, p);
-
+ /* insert files now that we've passed all errors */
+ fdinsert(fdp, cindx, 0, cfp);
+ fdinsert(fdp, sindx, 0, sfp);
fdpunlock(fdp);
+ FRELE(cfp, p);
+ FRELE(sfp, p);
break;
default:
error = EINVAL;
diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
index 6b3a629a0c4..aafd96c9083 100644
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -105,9 +105,9 @@ sys_socket(struct proc *p, void *v, register_t *retval)
goto out;
fdplock(fdp);
- error = falloc(p, cloexec, &fp, &fd);
- fdpunlock(fdp);
+ error = falloc(p, &fp, &fd);
if (error) {
+ fdpunlock(fdp);
soclose(so);
} else {
fp->f_flag = fflag;
@@ -117,7 +117,9 @@ sys_socket(struct proc *p, void *v, register_t *retval)
so->so_state |= SS_NBIO;
so->so_state |= ss;
fp->f_data = so;
- FILE_SET_MATURE(fp, p);
+ fdinsert(fdp, fd, cloexec, fp);
+ fdpunlock(fdp);
+ FRELE(fp, p);
*retval = fd;
}
out:
@@ -272,7 +274,9 @@ doaccept(struct proc *p, int sock, struct sockaddr *name,
socklen_t *anamelen,
socklen_t namelen;
int error, s, tmpfd;
struct socket *head, *so;
- int nflag;
+ int cloexec, nflag;
+
+ cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
if (name && (error = copyin(anamelen, &namelen, sizeof (namelen))))
return (error);
@@ -282,7 +286,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name,
socklen_t *anamelen,
headfp = fp;
fdplock(fdp);
- error = falloc(p, (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0, &fp, &tmpfd);
+ error = falloc(p, &fp, &tmpfd);
fdpunlock(fdp);
if (error) {
FRELE(headfp, p);
@@ -347,8 +351,11 @@ out:
else
so->so_state &= ~SS_NBIO;
sounlock(s);
+ fdplock(fdp);
fp->f_data = so;
- FILE_SET_MATURE(fp, p);
+ fdinsert(fdp, tmpfd, cloexec, fp);
+ fdpunlock(fdp);
+ FRELE(fp, p);
*retval = tmpfd;
} else {
sounlock(s);
@@ -476,13 +483,13 @@ sys_socketpair(struct proc *p, void *v, register_t
*retval)
goto free2;
}
fdplock(fdp);
- if ((error = falloc(p, cloexec, &fp1, &sv[0])) != 0)
+ if ((error = falloc(p, &fp1, &sv[0])) != 0)
goto free3;
fp1->f_flag = fflag;
fp1->f_type = DTYPE_SOCKET;
fp1->f_ops = &socketops;
fp1->f_data = so1;
- if ((error = falloc(p, cloexec, &fp2, &sv[1])) != 0)
+ if ((error = falloc(p, &fp2, &sv[1])) != 0)
goto free4;
fp2->f_flag = fflag;
fp2->f_type = DTYPE_SOCKET;
@@ -500,9 +507,11 @@ sys_socketpair(struct proc *p, void *v, register_t *retval)
(*fp2->f_ops->fo_ioctl)(fp2, FIONBIO, (caddr_t)&type,
p);
}
- FILE_SET_MATURE(fp1, p);
- FILE_SET_MATURE(fp2, p);
+ fdinsert(fdp, sv[0], cloexec, fp1);
+ fdinsert(fdp, sv[1], cloexec, fp2);
fdpunlock(fdp);
+ FRELE(fp1, p);
+ FRELE(fp2, p);
return (0);
}
fdremove(fdp, sv[1]);
diff --git sys/kern/vfs_syscalls.c sys/kern/vfs_syscalls.c
index a39376f6fe7..52e82963f20 100644
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -899,7 +899,7 @@ doopenat(struct proc *p, int fd, const char *path, int
oflags, mode_t mode,
struct file *fp;
struct vnode *vp;
struct vattr vattr;
- int flags, cmode;
+ int flags, cloexec, cmode;
int type, indx, error, localtrunc = 0;
struct flock lf;
struct nameidata nd;
@@ -911,10 +911,10 @@ doopenat(struct proc *p, int fd, const char *path, int
oflags, mode_t mode,
return (error);
}
- fdplock(fdp);
+ cloexec = (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0;
- if ((error = falloc(p, (oflags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp,
- &indx)) != 0)
+ fdplock(fdp);
+ if ((error = falloc(p, &fp, &indx)) != 0)
goto out;
flags = FFLAGS(oflags);
if (flags & FREAD)
@@ -999,7 +999,8 @@ doopenat(struct proc *p, int fd, const char *path, int
oflags, mode_t mode,
}
VOP_UNLOCK(vp);
*retval = indx;
- FILE_SET_MATURE(fp, p);
+ fdinsert(fdp, indx, cloexec, fp);
+ FRELE(fp, p);
out:
fdpunlock(fdp);
return (error);
@@ -1060,7 +1061,7 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
struct vnode *vp = NULL;
struct mount *mp;
struct ucred *cred = p->p_ucred;
- int flags;
+ int flags, cloexec;
int type, indx, error=0;
struct flock lf;
struct vattr va;
@@ -1078,9 +1079,10 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
if ((flags & O_CREAT))
return (EINVAL);
+ cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;
+
fdplock(fdp);
- if ((error = falloc(p, (flags & O_CLOEXEC) ? UF_EXCLOSE : 0, &fp,
- &indx)) != 0) {
+ if ((error = falloc(p, &fp, &indx)) != 0) {
fp = NULL;
goto bad;
}
@@ -1160,9 +1162,9 @@ sys_fhopen(struct proc *p, void *v, register_t *retval)
}
VOP_UNLOCK(vp);
*retval = indx;
- FILE_SET_MATURE(fp, p);
-
+ fdinsert(fdp, indx, cloexec, fp);
fdpunlock(fdp);
+ FRELE(fp, p);
return (0);
bad:
diff --git sys/sys/file.h sys/sys/file.h
index 042238ebd3f..9e322b4ae12 100644
--- sys/sys/file.h
+++ sys/sys/file.h
@@ -91,10 +91,7 @@ struct file {
};
#define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */
-#define FIF_LARVAL 0x02 /* not fully constructed, don't use */
-
-#define FILE_IS_USABLE(fp) \
- (((fp)->f_iflags & FIF_LARVAL) == 0)
+#define FIF_INSERTED 0x80 /* present in `filehead' */
#define FREF(fp) \
do { \
@@ -104,11 +101,6 @@ struct file {
} while (0)
#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
-#define FILE_SET_MATURE(fp,p) do { \
- (fp)->f_iflags &= ~FIF_LARVAL; \
- FRELE(fp, p); \
-} while (0)
-
int fdrop(struct file *, struct proc *);
LIST_HEAD(filelist, file);
diff --git sys/sys/filedesc.h sys/sys/filedesc.h
index 39486fafd60..1e1c463642e 100644
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -125,12 +125,13 @@ void filedesc_init(void);
int dupfdopen(struct proc *, int, int);
int fdalloc(struct proc *p, int want, int *result);
void fdexpand(struct proc *);
-int falloc(struct proc *_p, int _flags, struct file **_rfp, int *_rfd);
+int falloc(struct proc *_p, struct file **_rfp, int *_rfd);
struct filedesc *fdinit(void);
struct filedesc *fdshare(struct process *);
struct filedesc *fdcopy(struct process *);
void fdfree(struct proc *p);
int fdrelease(struct proc *p, int);
+void fdinsert(struct filedesc *, int, int, struct file *);
void fdremove(struct filedesc *, int);
void fdcloseexec(struct proc *);
struct file *fd_iterfile(struct file *, struct proc *);