Here is a new version of the f_count atomic update diff. Now all
updating of the `f_count' field should use atomic operations, and
the `fhdlk' is only used for serializing access to the list of open
files. In addition, the role of the `fd_fplock' should be clearer now:
its only purpose is to let fd_getfile() instantiate a file reference
safely. The `fd_lock' has to be locked when changing a file descriptor
table.

When the unlocking of system calls proceeds further, it will become
necessary to protect the `fd_ofileflags' field with a lock because
of fdexpand() and to maintain coherency with `fd_ofiles'. The patch
includes two small, logically separate, pieces towards that: fdp
locking for fcntl(F_GETFD), and fdp locking for unp_internalize().
The former merely ensures that the `fd_ofileflags' array is valid,
while the latter also makes sure the `fd_ofileflags[fd]' entry matches
with the file instance.

OK?

Index: kern/kern_descrip.c
===================================================================
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.171
diff -u -p -r1.171 kern_descrip.c
--- kern/kern_descrip.c 26 Jun 2018 14:43:01 -0000      1.171
+++ kern/kern_descrip.c 27 Jun 2018 12:52:25 -0000
@@ -198,6 +198,7 @@ struct file *
 fd_iterfile(struct file *fp, struct proc *p)
 {
        struct file *nfp;
+       unsigned int count;
 
        mtx_enter(&fhdlk);
        if (fp == NULL)
@@ -206,10 +207,15 @@ fd_iterfile(struct file *fp, struct proc
                nfp = LIST_NEXT(fp, f_list);
 
        /* don't refcount when f_count == 0 to avoid race in fdrop() */
-       while (nfp != NULL && nfp->f_count == 0)
-               nfp = LIST_NEXT(nfp, f_list);
-       if (nfp != NULL)
-               nfp->f_count++;
+       while (nfp != NULL) {
+               count = nfp->f_count;
+               if (count == 0) {
+                       nfp = LIST_NEXT(nfp, f_list);
+                       continue;
+               }
+               if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+                       break;
+       }
        mtx_leave(&fhdlk);
 
        if (fp != NULL)
@@ -228,11 +234,11 @@ fd_getfile(struct filedesc *fdp, int fd)
        if ((u_int)fd >= fdp->fd_nfiles)
                return (NULL);
 
-       mtx_enter(&fhdlk);
+       mtx_enter(&fdp->fd_fplock);
        fp = fdp->fd_ofiles[fd];
        if (fp != NULL)
-               fp->f_count++;
-       mtx_leave(&fhdlk);
+               atomic_inc_int(&fp->f_count);
+       mtx_leave(&fdp->fd_fplock);
 
        return (fp);
 }
@@ -433,7 +439,9 @@ restart:
                return (error);
 
        case F_GETFD:
+               fdplock(fdp);
                *retval = fdp->fd_ofileflags[fd] & UF_EXCLOSE ? 1 : 0;
+               fdpunlock(fdp);
                break;
 
        case F_SETFD:
@@ -652,7 +660,7 @@ finishdup(struct proc *p, struct file *f
        fdpassertlocked(fdp);
        KASSERT(fp->f_iflags & FIF_INSERTED);
 
-       if (fp->f_count == LONG_MAX-2) {
+       if (fp->f_count >= FDUP_MAX_COUNT) {
                FRELE(fp, p);
                return (EDEADLK);
        }
@@ -666,7 +674,14 @@ finishdup(struct proc *p, struct file *f
                fd_used(fdp, new);
        }
 
+       /*
+        * Use `fd_fplock' to synchronize with fd_getfile() so that
+        * the function no longer creates a new reference to the old file.
+        */
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles[new] = fp;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
        *retval = new;
 
@@ -694,18 +709,31 @@ fdinsert(struct filedesc *fdp, int fd, i
                        LIST_INSERT_HEAD(&filehead, fp, f_list);
                }
        }
+       mtx_leave(&fhdlk);
+
+       mtx_enter(&fdp->fd_fplock);
        KASSERT(fdp->fd_ofiles[fd] == NULL);
        fdp->fd_ofiles[fd] = fp;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
-       mtx_leave(&fhdlk);
 }
 
 void
 fdremove(struct filedesc *fdp, int fd)
 {
        fdpassertlocked(fdp);
+
+       /*
+        * Use `fd_fplock' to synchronize with fd_getfile() so that
+        * the function no longer creates a new reference to the file.
+        */
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles[fd] = NULL;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags[fd] = 0;
+
        fd_unused(fdp, fd);
 }
 
@@ -839,6 +867,8 @@ fdalloc(struct proc *p, int want, int *r
        int lim, last, i;
        u_int new, off;
 
+       fdpassertlocked(fdp);
+
        /*
         * Search for a free descriptor starting at the higher
         * of want or fd_freefile.  If that fails, consider
@@ -886,14 +916,17 @@ void
 fdexpand(struct proc *p)
 {
        struct filedesc *fdp = p->p_fd;
-       int nfiles;
+       int nfiles, oldnfiles;
        size_t copylen;
-       struct file **newofile;
+       struct file **newofile, **oldofile;
        char *newofileflags;
        u_int *newhimap, *newlomap;
 
        fdpassertlocked(fdp);
 
+       oldnfiles = fdp->fd_nfiles;
+       oldofile = fdp->fd_ofiles;
+
        /*
         * No space in current array.
         */
@@ -927,9 +960,6 @@ fdexpand(struct proc *p)
        memcpy(newofileflags, fdp->fd_ofileflags, copylen);
        memset(newofileflags + copylen, 0, nfiles * sizeof(char) - copylen);
 
-       if (fdp->fd_nfiles > NDFILE)
-               free(fdp->fd_ofiles, M_FILEDESC, fdp->fd_nfiles * OFILESIZE);
-
        if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) {
                copylen = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int);
                memcpy(newhimap, fdp->fd_himap, copylen);
@@ -950,9 +980,16 @@ fdexpand(struct proc *p)
                fdp->fd_himap = newhimap;
                fdp->fd_lomap = newlomap;
        }
+
+       mtx_enter(&fdp->fd_fplock);
        fdp->fd_ofiles = newofile;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags = newofileflags;
-       fdp->fd_nfiles = nfiles;        
+       fdp->fd_nfiles = nfiles;
+
+       if (oldnfiles > NDFILE)
+               free(oldofile, M_FILEDESC, oldnfiles * OFILESIZE);
 }
 
 /*
@@ -1019,9 +1056,7 @@ fnew(struct proc *p)
        fp->f_cred = p->p_ucred;
        crhold(fp->f_cred);
 
-       mtx_enter(&fhdlk);
-       fp->f_count++;
-       mtx_leave(&fhdlk);
+       FREF(fp);
 
        return (fp);
 }
@@ -1036,6 +1071,7 @@ fdinit(void)
 
        newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO);
        rw_init(&newfdp->fd_fd.fd_lock, "fdlock");
+       mtx_init(&newfdp->fd_fd.fd_fplock, IPL_MPFLOOR);
 
        /* Create the file descriptor table. */
        newfdp->fd_fd.fd_refcnt = 1;
@@ -1113,7 +1149,6 @@ fdcopy(struct process *pr)
        newfdp->fd_flags = fdp->fd_flags;
        newfdp->fd_cmask = fdp->fd_cmask;
 
-       mtx_enter(&fhdlk);
        for (i = 0; i <= fdp->fd_lastfile; i++) {
                struct file *fp = fdp->fd_ofiles[i];
 
@@ -1126,17 +1161,16 @@ fdcopy(struct process *pr)
                         * tied to the process that opened them to enforce
                         * their internal consistency, so close them here.
                         */
-                       if (fp->f_count == LONG_MAX-2 ||
+                       if (fp->f_count >= FDUP_MAX_COUNT ||
                            fp->f_type == DTYPE_KQUEUE)
                                continue;
 
-                       fp->f_count++;
+                       FREF(fp);
                        newfdp->fd_ofiles[i] = fp;
                        newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
                        fd_used(newfdp, i);
                }
        }
-       mtx_leave(&fhdlk);
        fdpunlock(fdp);
 
        return (newfdp);
@@ -1196,11 +1230,9 @@ closef(struct file *fp, struct proc *p)
        if (fp == NULL)
                return (0);
 
-       KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
+       KASSERTMSG(fp->f_count >= 2, "count (%u) < 2", fp->f_count);
 
-       mtx_enter(&fhdlk);
-       fp->f_count--;
-       mtx_leave(&fhdlk);
+       atomic_dec_int(&fp->f_count);
 
        /*
         * POSIX record locking dictates that any close releases ALL
@@ -1232,10 +1264,9 @@ fdrop(struct file *fp, struct proc *p)
 {
        int error;
 
-       MUTEX_ASSERT_LOCKED(&fhdlk);
-
-       KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
+       KASSERTMSG(fp->f_count == 0, "count (%u) != 0", fp->f_count);
 
+       mtx_enter(&fhdlk);
        if (fp->f_iflags & FIF_INSERTED)
                LIST_REMOVE(fp, f_list);
        mtx_leave(&fhdlk);
@@ -1372,13 +1403,18 @@ dupfdopen(struct proc *p, int indx, int 
                FRELE(wfp, p);
                return (EACCES);
        }
-       if (wfp->f_count == LONG_MAX-2) {
+       if (wfp->f_count >= FDUP_MAX_COUNT) {
                FRELE(wfp, p);
                return (EDEADLK);
        }
 
        KASSERT(wfp->f_iflags & FIF_INSERTED);
+
+       mtx_enter(&fdp->fd_fplock);
+       KASSERT(fdp->fd_ofiles[indx] == NULL);
        fdp->fd_ofiles[indx] = wfp;
+       mtx_leave(&fdp->fd_fplock);
+
        fdp->fd_ofileflags[indx] = (fdp->fd_ofileflags[indx] & UF_EXCLOSE) |
            (fdp->fd_ofileflags[dupfd] & ~UF_EXCLOSE);
 
Index: kern/kern_sysctl.c
===================================================================
RCS file: src/sys/kern/kern_sysctl.c,v
retrieving revision 1.343
diff -u -p -r1.343 kern_sysctl.c
--- kern/kern_sysctl.c  20 Jun 2018 10:52:49 -0000      1.343
+++ kern/kern_sysctl.c  27 Jun 2018 12:52:25 -0000
@@ -1077,9 +1077,7 @@ fill_file(struct kinfo_file *kf, struct 
                kf->f_flag = fp->f_flag;
                kf->f_iflags = fp->f_iflags;
                kf->f_type = fp->f_type;
-               mtx_enter(&fhdlk);
                kf->f_count = fp->f_count;
-               mtx_leave(&fhdlk);
                if (show_pointers)
                        kf->f_ucred = PTRTOINT64(fp->f_cred);
                kf->f_uid = fp->f_cred->cr_uid;
Index: kern/uipc_usrreq.c
===================================================================
RCS file: src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.131
diff -u -p -r1.131 uipc_usrreq.c
--- kern/uipc_usrreq.c  23 Jun 2018 11:33:32 -0000      1.131
+++ kern/uipc_usrreq.c  27 Jun 2018 12:52:25 -0000
@@ -744,12 +744,16 @@ restart:
                 * fdalloc() works properly.. We finalize it all
                 * in the loop below.
                 */
+               mtx_enter(&p->p_fd->fd_fplock);
+               KASSERT(p->p_fd->fd_ofiles[fds[i]] == NULL);
                fdp->fd_ofiles[fds[i]] = rp->fp;
-               fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
-               rp++;
+               mtx_leave(&p->p_fd->fd_fplock);
 
+               fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
                if (flags & MSG_CMSG_CLOEXEC)
                        fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;
+
+               rp++;
        }
 
        /*
@@ -839,6 +843,7 @@ morespace:
 
        ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
        rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
+       fdplock(fdp);
        for (i = 0; i < nfds; i++) {
                memcpy(&fd, ip, sizeof fd);
                ip--;
@@ -846,7 +851,7 @@ morespace:
                        error = EBADF;
                        goto fail;
                }
-               if (fp->f_count == LONG_MAX-2) {
+               if (fp->f_count >= FDUP_MAX_COUNT) {
                        error = EDEADLK;
                        goto fail;
                }
@@ -868,8 +873,10 @@ morespace:
                }
                unp_rights++;
        }
+       fdpunlock(fdp);
        return (0);
 fail:
+       fdpunlock(fdp);
        if (fp != NULL)
                FRELE(fp, p);
        /* Back out what we just did. */
@@ -924,7 +931,6 @@ unp_gc(void *arg __unused)
        do {
                nunref = 0;
                LIST_FOREACH(unp, &unp_head, unp_link) {
-                       mtx_enter(&fhdlk);
                        fp = unp->unp_file;
                        if (unp->unp_flags & UNP_GCDEFER) {
                                /*
@@ -936,7 +942,6 @@ unp_gc(void *arg __unused)
                                unp_defer--;
                        } else if (unp->unp_flags & UNP_GCMARK) {
                                /* marked as live in previous pass */
-                               mtx_leave(&fhdlk);
                                continue;
                        } else if (fp == NULL) {
                                /* not being passed, so can't be in loop */
@@ -955,11 +960,9 @@ unp_gc(void *arg __unused)
                                if (fp->f_count == unp->unp_msgcount) {
                                        nunref++;
                                        unp->unp_flags |= UNP_GCDEAD;
-                                       mtx_leave(&fhdlk);
                                        continue;
                                }
                        }
-                       mtx_leave(&fhdlk);
 
                        /*
                         * This is the first time we've seen this socket on
Index: sys/file.h
===================================================================
RCS file: src/sys/sys/file.h,v
retrieving revision 1.50
diff -u -p -r1.50 file.h
--- sys/file.h  25 Jun 2018 22:29:16 -0000      1.50
+++ sys/file.h  27 Jun 2018 12:52:25 -0000
@@ -66,11 +66,12 @@ struct      fileops {
  *  Locks used to protect struct members in this file:
  *     I       immutable after creation
  *     F       global `fhdlk' mutex
+ *     a       atomic operations
  *     f       per file `f_mtx'
  *     k       kernel lock
  */
 struct file {
-       LIST_ENTRY(file) f_list;/* [k] list of active files */
+       LIST_ENTRY(file) f_list;/* [F] list of active files */
        struct mutex f_mtx;
        short   f_flag;         /* [k] see fcntl.h */
 #define        DTYPE_VNODE     1       /* file */
@@ -79,7 +80,7 @@ struct file {
 #define        DTYPE_KQUEUE    4       /* event queue */
 #define        DTYPE_DMABUF    5       /* DMA buffer (for DRM) */
        short   f_type;         /* [I] descriptor type */
-       long    f_count;        /* [F] reference count */
+       u_int   f_count;        /* [a] reference count */
        struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
        struct  fileops *f_ops; /* [I] file operation pointers */
        off_t   f_offset;       /* [k] */
@@ -99,25 +100,16 @@ struct file {
        do { \
                extern void vfs_stall_barrier(void); \
                vfs_stall_barrier(); \
-               mtx_enter(&fhdlk); \
-               (fp)->f_count++; \
-               mtx_leave(&fhdlk); \
+               atomic_inc_int(&(fp)->f_count); \
        } while (0)
 
 #define FRELE(fp,p) \
-({ \
-       int rv = 0; \
-       mtx_enter(&fhdlk); \
-       if (--(fp)->f_count == 0) \
-               rv = fdrop(fp, p); \
-       else \
-               mtx_leave(&fhdlk); \
-       rv; \
-})
+       (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0)
+
+#define FDUP_MAX_COUNT         (UINT_MAX - 2 * MAXCPUS)
 
 int    fdrop(struct file *, struct proc *);
 
-extern struct mutex fhdlk;             /* protects `filehead' and f_count */
 LIST_HEAD(filelist, file);
 extern int maxfiles;                   /* kernel limit on number of open files 
*/
 extern int numfiles;                   /* actual number of open files */
Index: sys/filedesc.h
===================================================================
RCS file: src/sys/sys/filedesc.h,v
retrieving revision 1.40
diff -u -p -r1.40 filedesc.h
--- sys/filedesc.h      25 Jun 2018 09:36:28 -0000      1.40
+++ sys/filedesc.h      27 Jun 2018 12:52:25 -0000
@@ -32,6 +32,7 @@
  *     @(#)filedesc.h  8.1 (Berkeley) 6/2/93
  */
 
+#include <sys/mutex.h>
 #include <sys/rwlock.h>
 /*
  * This structure is used for the management of descriptors.  It may be
@@ -72,6 +73,8 @@ struct filedesc {
        struct rwlock fd_lock;          /* lock for the file descs; must be */
                                        /* held when writing to fd_ofiles, */
                                        /* fd_ofileflags, or fd_{hi,lo}map */
+       struct mutex fd_fplock;         /* lock for reading fd_ofiles without
+                                        * fd_lock */
 
        int fd_flags;                   /* flags on the file descriptor table */
 };

Reply via email to