Diff below contains the interesting bits to unlock most of the network
related syscalls.  As previously explained [0], we know that `f_data'
is immutable for sockets so we only have to protect `f_count'.  This
is done by using a global mutex: `fhdlk'.

I'm aware that this is not the best solution, but my goal is to remove
the KERNEL_LOCK(), not to improve a not-yet-existing fine grained locking.
That said, if you feel like turning this mutex into SRPs or atomic ops,
please go ahead!  I'll continue to reduce the scope of the KERNEL_LOCK()
and the NET_LOCK() but I'll do my best to review your work.

This diff also adds some more KERNEL_LOCK() for ktrace(1) and psignal().

I'm quite confident as it has been tested by many people as part of the
big unlocking diff.  But more tests/reviews are welcome!

ok?

[0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2 

Index: kern/kern_descrip.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_descrip.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_descrip.c
--- kern/kern_descrip.c 2 Jun 2018 12:42:18 -0000       1.163
+++ kern/kern_descrip.c 4 Jun 2018 08:46:52 -0000
@@ -66,7 +66,11 @@
 
 /*
  * Descriptor management.
+ *
+ * We need to block interrupts as long as `fhdlk' is being taken
+ * with and without the KERNEL_LOCK().
  */
+struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct filelist filehead;      /* head of list of open files */
 int numfiles;                  /* actual number of open files */
 
@@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc
 {
        struct file *nfp;
 
+       mtx_enter(&fhdlk);
        if (fp == NULL)
                nfp = LIST_FIRST(&filehead);
        else
                nfp = LIST_NEXT(fp, f_list);
 
-       /* don't FREF when f_count == 0 to avoid race in fdrop() */
+       /* don't refcount when f_count == 0 to avoid race in fdrop() */
        while (nfp != NULL && nfp->f_count == 0)
                nfp = LIST_NEXT(nfp, f_list);
        if (nfp != NULL)
-               FREF(nfp);
+               nfp->f_count++;
+       mtx_leave(&fhdlk);
 
        if (fp != NULL)
                FRELE(fp, p);
@@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd)
 {
        struct file *fp;
 
-       if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
+       vfs_stall_barrier();
+
+       if ((u_int)fd >= fdp->fd_nfiles)
                return (NULL);
 
-       FREF(fp);
+       mtx_enter(&fhdlk);
+       fp = fdp->fd_ofiles[fd];
+       if (fp != NULL)
+               fp->f_count++;
+       mtx_leave(&fhdlk);
+
        return (fp);
 }
 
@@ -672,6 +685,8 @@ fdinsert(struct filedesc *fdp, int fd, i
        struct file *fq;
 
        fdpassertlocked(fdp);
+
+       mtx_enter(&fhdlk);
        if ((fq = fdp->fd_ofiles[0]) != NULL) {
                LIST_INSERT_AFTER(fq, fp, f_list);
        } else {
@@ -681,6 +696,7 @@ fdinsert(struct filedesc *fdp, int fd, i
        fdp->fd_ofiles[fd] = fp;
        fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
        fp->f_iflags |= FIF_INSERTED;
+       mtx_leave(&fhdlk);
 }
 
 void
@@ -986,7 +1002,11 @@ restart:
        crhold(fp->f_cred);
        *resultfp = fp;
        *resultfd = i;
-       FREF(fp);
+
+       mtx_enter(&fhdlk);
+       fp->f_count++;
+       mtx_leave(&fhdlk);
+
        return (0);
 }
 
@@ -1078,6 +1098,7 @@ fdcopy(struct process *pr)
        newfdp->fd_flags = fdp->fd_flags;
        newfdp->fd_cmask = fdp->fd_cmask;
 
+       mtx_enter(&fhdlk);
        for (i = 0; i <= fdp->fd_lastfile; i++) {
                struct file *fp = fdp->fd_ofiles[i];
 
@@ -1094,12 +1115,13 @@ fdcopy(struct process *pr)
                            fp->f_type == DTYPE_KQUEUE)
                                continue;
 
-                       FREF(fp);
+                       fp->f_count++;
                        newfdp->fd_ofiles[i] = fp;
                        newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
                        fd_used(newfdp, i);
                }
        }
+       mtx_leave(&fhdlk);
        fdpunlock(fdp);
 
        return (newfdp);
@@ -1121,8 +1143,9 @@ fdfree(struct proc *p)
        for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
                fp = *fpp;
                if (fp != NULL) {
-                       FREF(fp);
                        *fpp = NULL;
+                        /* closef() expects a refcount of 2 */
+                       FREF(fp);
                        (void) closef(fp, p);
                }
        }
@@ -1160,11 +1183,11 @@ closef(struct file *fp, struct proc *p)
        if (fp == NULL)
                return (0);
 
-#ifdef DIAGNOSTIC
-       if (fp->f_count < 2)
-               panic("closef: count (%ld) < 2", fp->f_count);
-#endif
+       KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
+
+       mtx_enter(&fhdlk);
        fp->f_count--;
+       mtx_leave(&fhdlk);
 
        /*
         * POSIX record locking dictates that any close releases ALL
@@ -1196,18 +1219,19 @@ fdrop(struct file *fp, struct proc *p)
 {
        int error;
 
-#ifdef DIAGNOSTIC
-       if (fp->f_count != 0)
-               panic("fdrop: count (%ld) != 0", fp->f_count);
-#endif
+       MUTEX_ASSERT_LOCKED(&fhdlk);
+
+       KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
+
+       if (fp->f_iflags & FIF_INSERTED)
+               LIST_REMOVE(fp, f_list);
+       mtx_leave(&fhdlk);
 
        if (fp->f_ops)
                error = (*fp->f_ops->fo_close)(fp, p);
        else
                error = 0;
 
-       if (fp->f_iflags & FIF_INSERTED)
-               LIST_REMOVE(fp, f_list);
        crfree(fp->f_cred);
        numfiles--;
        pool_put(&file_pool, fp);
Index: kern/kern_ktrace.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.97
diff -u -p -r1.97 kern_ktrace.c
--- kern/kern_ktrace.c  27 May 2018 06:02:14 -0000      1.97
+++ kern/kern_ktrace.c  4 Jun 2018 08:46:52 -0000
@@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui
        struct ktr_header kth;
        struct ktr_genio ktp;
        caddr_t cp;
-       int count;
+       int count, error;
        int buflen;
 
        atomic_setbits_int(&p->p_flag, P_INKTR);
@@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui
                if (copyin(iov->iov_base, cp, count))
                        break;
 
-               if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0)
+               KERNEL_LOCK();
+               error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count);
+               KERNEL_UNLOCK();
+               if (error != 0)
                        break;
 
                iov->iov_len -= count;
@@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na
 {
        struct ktr_header kth;
 
-       KERNEL_ASSERT_LOCKED();
        atomic_setbits_int(&p->p_flag, P_INKTR);
        ktrinitheader(&kth, p, KTR_STRUCT);
-       
+
        if (data == NULL)
                datalen = 0;
+       KERNEL_LOCK();
        ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen);
+       KERNEL_UNLOCK();
        atomic_clearbits_int(&p->p_flag, P_INKTR);
 }
 
@@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin
        kp.code = code;
        kp.syscall = syscall;
 
+       KERNEL_LOCK();
        ktrwrite(p, &kth, &kp, sizeof(kp));
+       KERNEL_UNLOCK();
        atomic_clearbits_int(&p->p_flag, P_INKTR);
 }
 
@@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn
        struct iovec aiov[3];
        struct process *pr;
        int error;
+
+       KERNEL_ASSERT_LOCKED();
 
        auio.uio_iov = &aiov[0];
        auio.uio_offset = 0;
Index: kern/kern_pledge.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_pledge.c,v
retrieving revision 1.231
diff -u -p -r1.231 kern_pledge.c
--- kern/kern_pledge.c  3 Jun 2018 18:20:28 -0000       1.231
+++ kern/kern_pledge.c  4 Jun 2018 08:46:52 -0000
@@ -523,6 +523,7 @@ pledge_fail(struct proc *p, int error, u
        if (p->p_p->ps_pledge & PLEDGE_ERROR)
                return (ENOSYS);
 
+       KERNEL_LOCK();
        log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n",
            p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall);
        p->p_p->ps_acflag |= APLEDGE;
@@ -535,6 +536,7 @@ pledge_fail(struct proc *p, int error, u
        psignal(p, SIGABRT);
 
        p->p_p->ps_pledge = 0;          /* Disable all PLEDGE_ flags */
+       KERNEL_UNLOCK();
        return (error);
 }
 
Index: kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.341
diff -u -p -r1.341 kern_sysctl.c
--- kern/kern_sysctl.c  2 Jun 2018 16:38:21 -0000       1.341
+++ kern/kern_sysctl.c  4 Jun 2018 08:46:52 -0000
@@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct 
                kf->f_flag = fp->f_flag;
                kf->f_iflags = fp->f_iflags;
                kf->f_type = fp->f_type;
+               mtx_enter(&fhdlk);
                kf->f_count = fp->f_count;
+               mtx_leave(&fhdlk);
                if (show_pointers)
                        kf->f_ucred = PTRTOINT64(fp->f_cred);
                kf->f_uid = fp->f_cred->cr_uid;
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.181
diff -u -p -r1.181 syscalls.master
--- kern/syscalls.master        28 May 2018 09:17:11 -0000      1.181
+++ kern/syscalls.master        4 Jun 2018 08:46:52 -0000
@@ -90,7 +90,7 @@
 #endif
 27     STD             { ssize_t sys_recvmsg(int s, struct msghdr *msg, \
                            int flags); }
-28     STD             { ssize_t sys_sendmsg(int s, \
+28     STD NOLOCK      { ssize_t sys_sendmsg(int s, \
                            const struct msghdr *msg, int flags); }
 29     STD             { ssize_t sys_recvfrom(int s, void *buf, size_t len, \
                            int flags, struct sockaddr *from, \
@@ -261,7 +261,7 @@
 130    OBSOL           oftruncate
 131    STD             { int sys_flock(int fd, int how); }
 132    STD             { int sys_mkfifo(const char *path, mode_t mode); }
-133    STD             { ssize_t sys_sendto(int s, const void *buf, \
+133    STD NOLOCK      { ssize_t sys_sendto(int s, const void *buf, \
                            size_t len, int flags, const struct sockaddr *to, \
                            socklen_t tolen); }
 134    STD             { int sys_shutdown(int s, int how); }
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.173
diff -u -p -r1.173 uipc_syscalls.c
--- kern/uipc_syscalls.c        2 Jun 2018 10:27:43 -0000       1.173
+++ kern/uipc_syscalls.c        4 Jun 2018 08:46:52 -0000
@@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg
        }
 #endif
        len = auio.uio_resid;
-       error = sosend(fp->f_data, to, &auio, NULL, control, flags);
+       error = sosend(so, to, &auio, NULL, control, flags);
        if (error) {
                if (auio.uio_resid != len && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
-               if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0)
+               if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
+                       KERNEL_LOCK();
                        ptsignal(p, SIGPIPE, STHREAD);
+                       KERNEL_UNLOCK();
+               }
        }
        if (error == 0) {
                *retsize = len - auio.uio_resid;
@@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct
 {
        struct file *fp;
 
-       if ((fp = fd_getfile(p->p_fd, fdes)) == NULL)
+       fp = fd_getfile(p->p_fd, fdes);
+       if (fp == NULL)
                return (EBADF);
        if (fp->f_type != DTYPE_SOCKET) {
                FRELE(fp, p);
Index: kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.126
diff -u -p -r1.126 uipc_usrreq.c
--- kern/uipc_usrreq.c  28 Apr 2018 03:13:04 -0000      1.126
+++ kern/uipc_usrreq.c  4 Jun 2018 08:46:52 -0000
@@ -899,6 +899,7 @@ unp_gc(void *arg __unused)
                        fp = defer->ud_fp[i].fp;
                        if (fp == NULL)
                                continue;
+                        /* closef() expects a refcount of 2 */
                        FREF(fp);
                        if ((unp = fptounp(fp)) != NULL)
                                unp->unp_msgcount--;
@@ -915,6 +916,8 @@ unp_gc(void *arg __unused)
        do {
                nunref = 0;
                LIST_FOREACH(unp, &unp_head, unp_link) {
+                       mtx_enter(&fhdlk);
+                       fp = unp->unp_file;
                        if (unp->unp_flags & UNP_GCDEFER) {
                                /*
                                 * This socket is referenced by another
@@ -925,8 +928,9 @@ unp_gc(void *arg __unused)
                                unp_defer--;
                        } else if (unp->unp_flags & UNP_GCMARK) {
                                /* marked as live in previous pass */
+                               mtx_leave(&fhdlk);
                                continue;
-                       } else if ((fp = unp->unp_file) == NULL) {
+                       } else if (fp == NULL) {
                                /* not being passed, so can't be in loop */
                        } else if (fp->f_count == 0) {
                                /*
@@ -943,9 +947,11 @@ unp_gc(void *arg __unused)
                                if (fp->f_count == unp->unp_msgcount) {
                                        nunref++;
                                        unp->unp_flags |= UNP_GCDEAD;
+                                       mtx_leave(&fhdlk);
                                        continue;
                                }
                        }
+                       mtx_leave(&fhdlk);
 
                        /*
                         * This is the first time we've seen this socket on
Index: sys/file.h
===================================================================
RCS file: /cvs/src/sys/sys/file.h,v
retrieving revision 1.46
diff -u -p -r1.46 file.h
--- sys/file.h  2 Jun 2018 10:27:43 -0000       1.46
+++ sys/file.h  4 Jun 2018 08:46:52 -0000
@@ -65,6 +65,7 @@ struct        fileops {
  *
  *  Locks used to protect struct members in this file:
  *     I       immutable after creation
+ *     F       global `fhdlk' mutex
  *     f       per file `f_mtx'
  *     k       kernel lock
  */
@@ -77,7 +78,7 @@ struct file {
 #define        DTYPE_PIPE      3       /* pipe */
 #define        DTYPE_KQUEUE    4       /* event queue */
        short   f_type;         /* [I] descriptor type */
-       long    f_count;        /* [k] reference count */
+       long    f_count;        /* [F] reference count */
        struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
        struct  fileops *f_ops; /* [I] file operation pointers */
        off_t   f_offset;       /* [k] */
@@ -97,12 +98,25 @@ struct file {
        do { \
                extern void vfs_stall_barrier(void); \
                vfs_stall_barrier(); \
+               mtx_enter(&fhdlk); \
                (fp)->f_count++; \
+               mtx_leave(&fhdlk); \
        } while (0)
-#define FRELE(fp,p)    (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
+
+#define FRELE(fp,p) \
+({ \
+       int rv = 0; \
+       mtx_enter(&fhdlk); \
+       if (--(fp)->f_count == 0) \
+               rv = fdrop(fp, p); \
+       else \
+               mtx_leave(&fhdlk); \
+       rv; \
+})
 
 int    fdrop(struct file *, struct proc *);
 
+extern struct mutex fhdlk;             /* protects `filehead' and f_count */
 LIST_HEAD(filelist, file);
 extern int maxfiles;                   /* kernel limit on number of open files 
*/
 extern int numfiles;                   /* actual number of open files */

Reply via email to