Diff below is _not_ for commit. It's just an experiment I wish to share.

Socket layer is already protected by solock(). Inet sockets are
protected by NET_LOCK() which is grabbed by solock(). All other sockets
are still under KERNEL_LOCK().

My suggestion is to implement another lock, identical to NET_LOCK() and
grab it within solock() for PF_UNIX sockets. I suggest solock() is
already placed everythere it's required :) struct file is already
mp-safe.

Diff below implements UNIXDOMAIN_LOCK(). It's just rwlock
"unixdomainlock" like NET_LOCK()'s "netlock".

solock(), sounlock() and soassertlocked() deal with UNIXDOMAIN_LOCK() in
PF_UNIX case. sosleep_nsec() uses unixdomainlock directly in PF_UNIX case.

Garbage collector unp_gc() placed to systqmp instead of systq. unp_gc()
grabs UNIXDOMAIN_LOCK().

Vnode and process credentials are still protected by KERNEL_LOCK().
These are filesystem sockets "connect", "bind" and "disconnect" cases.

pledge(2) checks are protected by KERNEL_LOCK() too. These are "sendfd"
and "recvfd" cases.

All other socket operations protected by solock() and underelaying
UNIXDOMAIN_LOCK().

I wrote and run simultaneously a lot of test programs doing various
socket related things. System looks stable.

May be after 6.7 release PF_UNIX sockets can be direction to go.

Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- sys/kern/uipc_socket2.c     11 Apr 2020 14:07:06 -0000      1.104
+++ sys/kern/uipc_socket2.c     19 Apr 2020 11:07:10 -0000
@@ -282,6 +282,8 @@ solock(struct socket *so)
                NET_LOCK();
                break;
        case PF_UNIX:
+               UNIXDOMAIN_LOCK();
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -306,6 +308,8 @@ sounlock(struct socket *so, int s)
                NET_UNLOCK();
                break;
        case PF_UNIX:
+               UNIXDOMAIN_UNLOCK();
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -323,6 +327,8 @@ soassertlocked(struct socket *so)
                NET_ASSERT_LOCKED();
                break;
        case PF_UNIX:
+               UNIXDOMAIN_ASSERT_LOCKED();
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -335,12 +341,24 @@ int
 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
     uint64_t nsecs)
 {
-       if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
-           (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
-           (so->so_proto->pr_domain->dom_family != PF_KEY)) {
-               return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
-       } else
-               return tsleep_nsec(ident, prio, wmesg, nsecs);
+       int ret;
+
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+               break;
+       case PF_UNIX:
+               ret = rwsleep_nsec(ident, &unixdomainlock, prio, wmesg, nsecs);
+               break;
+       case PF_ROUTE:
+       case PF_KEY:
+       default:
+               ret = tsleep_nsec(ident, prio, wmesg, nsecs);
+               break;
+       }
+
+       return ret;
 }
 
 /*
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.142
diff -u -p -r1.142 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      16 Jul 2019 21:41:37 -0000      1.142
+++ sys/kern/uipc_usrreq.c      19 Apr 2020 11:07:10 -0000
@@ -51,6 +51,9 @@
 #include <sys/task.h>
 #include <sys/pledge.h>
 #include <sys/pool.h>
+#include <sys/rwlock.h>
+
+struct rwlock unixdomainlock = RWLOCK_INITIALIZER("unixdomainlock");
 
 void   uipc_setaddr(const struct unpcb *, struct mbuf *);
 
@@ -152,13 +155,17 @@ uipc_usrreq(struct socket *so, int req, 
        case PRU_CONNECT2:
                error = unp_connect2(so, (struct socket *)nam);
                if (!error) {
+                       KERNEL_LOCK();
                        unp->unp_connid.uid = p->p_ucred->cr_uid;
                        unp->unp_connid.gid = p->p_ucred->cr_gid;
+                       KERNEL_UNLOCK();
                        unp->unp_connid.pid = p->p_p->ps_pid;
                        unp->unp_flags |= UNP_FEIDS;
                        unp2 = sotounpcb((struct socket *)nam);
+                       KERNEL_LOCK();
                        unp2->unp_connid.uid = p->p_ucred->cr_uid;
                        unp2->unp_connid.gid = p->p_ucred->cr_gid;
+                       KERNEL_UNLOCK();
                        unp2->unp_connid.pid = p->p_p->ps_pid;
                        unp2->unp_flags |= UNP_FEIDS;
                }
@@ -411,10 +418,12 @@ unp_detach(struct unpcb *unp)
 
        LIST_REMOVE(unp, unp_link);
        if (unp->unp_vnode) {
+               KERNEL_LOCK();
                unp->unp_vnode->v_socket = NULL;
                vp = unp->unp_vnode;
                unp->unp_vnode = NULL;
                vrele(vp);
+               KERNEL_UNLOCK();
        }
        if (unp->unp_conn)
                unp_disconnect(unp);
@@ -425,7 +434,7 @@ unp_detach(struct unpcb *unp)
        m_freem(unp->unp_addr);
        pool_put(&unpcb_pool, unp);
        if (unp_rights)
-               task_add(systq, &unp_gc_task);
+               task_add(systqmp, &unp_gc_task);
 }
 
 int
@@ -455,11 +464,13 @@ unp_bind(struct unpcb *unp, struct mbuf 
        /* Fixup sun_len to keep it in sync with m_len. */
        soun->sun_len = nam2->m_len;
 
+       KERNEL_LOCK();
        NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
            soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
        if ((error = namei(&nd)) != 0) {
+               KERNEL_UNLOCK();
                m_freem(nam2);
                return (error);
        }
@@ -471,6 +482,7 @@ unp_bind(struct unpcb *unp, struct mbuf 
                else
                        vput(nd.ni_dvp);
                vrele(vp);
+               KERNEL_UNLOCK();
                m_freem(nam2);
                return (EADDRINUSE);
        }
@@ -480,6 +492,7 @@ unp_bind(struct unpcb *unp, struct mbuf 
        error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
        vput(nd.ni_dvp);
        if (error) {
+               KERNEL_UNLOCK();
                m_freem(nam2);
                return (error);
        }
@@ -492,6 +505,8 @@ unp_bind(struct unpcb *unp, struct mbuf 
        unp->unp_connid.pid = p->p_p->ps_pid;
        unp->unp_flags |= UNP_FEIDSBIND;
        VOP_UNLOCK(vp);
+       KERNEL_UNLOCK();
+
        return (0);
 }
 
@@ -508,18 +523,22 @@ unp_connect(struct socket *so, struct mb
        if ((error = unp_nam2sun(nam, &soun, NULL)))
                return (error);
 
+       KERNEL_LOCK();
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
-       if ((error = namei(&nd)) != 0)
+       if ((error = namei(&nd)) != 0) {
+               KERNEL_UNLOCK();
                return (error);
+       }
        vp = nd.ni_vp;
        if (vp->v_type != VSOCK) {
                error = ENOTSOCK;
-               goto bad;
+               goto bad_locked;
        }
        if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
-               goto bad;
+               goto bad_locked;
        so2 = vp->v_socket;
+       KERNEL_UNLOCK();
        if (so2 == NULL) {
                error = ECONNREFUSED;
                goto bad;
@@ -540,8 +559,10 @@ unp_connect(struct socket *so, struct mb
                if (unp2->unp_addr)
                        unp3->unp_addr =
                            m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
+               KERNEL_LOCK();
                unp3->unp_connid.uid = p->p_ucred->cr_uid;
                unp3->unp_connid.gid = p->p_ucred->cr_gid;
+               KERNEL_UNLOCK();
                unp3->unp_connid.pid = p->p_p->ps_pid;
                unp3->unp_flags |= UNP_FEIDS;
                so2 = so3;
@@ -552,7 +573,11 @@ unp_connect(struct socket *so, struct mb
        }
        error = unp_connect2(so, so2);
 bad:
+       KERNEL_LOCK();
+bad_locked:
        vput(vp);
+       KERNEL_UNLOCK();
+
        return (error);
 }
 
@@ -635,17 +660,11 @@ unp_drop(struct unpcb *unp, int errno)
 {
        struct socket *so = unp->unp_socket;
 
-       KERNEL_ASSERT_LOCKED();
-
        so->so_error = errno;
        unp_disconnect(unp);
        if (so->so_head) {
                so->so_pcb = NULL;
-               /*
-                * As long as the KERNEL_LOCK() is the default lock for Unix
-                * sockets, do not release it.
-                */
-               sofree(so, SL_NOUNLOCK);
+               sofree(so, SL_LOCKED);
                m_freem(unp->unp_addr);
                pool_put(&unpcb_pool, unp);
        }
@@ -708,7 +727,9 @@ unp_externalize(struct mbuf *rights, soc
        for (i = 0; i < nfds; i++) {
                fp = rp->fp;
                rp++;
+               KERNEL_LOCK();
                error = pledge_recvfd(p, fp);
+               KERNEL_UNLOCK();
                if (error)
                        break;
 
@@ -717,14 +738,19 @@ unp_externalize(struct mbuf *rights, soc
                 * make sure that it is underneath the root.
                 */
                if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
-                       struct vnode *vp = (struct vnode *)fp->f_data;
+                       struct vnode *vp;
+                       KERNEL_LOCK();
+                       vp = (struct vnode *)fp->f_data;
 
                        if (vp->v_type == VBLK ||
                            (vp->v_type == VDIR &&
                            !vn_isunder(vp, fdp->fd_rdir, p))) {
+                               KERNEL_UNLOCK();
                                error = EPERM;
                                break;
                        }
+
+                       KERNEL_UNLOCK();
                }
        }
 
@@ -886,7 +912,9 @@ morespace:
                        error = EDEADLK;
                        goto fail;
                }
+               KERNEL_LOCK();
                error = pledge_sendfd(p, fp);
+               KERNEL_UNLOCK();
                if (error)
                        goto fail;
 
@@ -934,8 +962,12 @@ unp_gc(void *arg __unused)
        struct unpcb *unp;
        int nunref, i;
 
-       if (unp_gcing)
+       UNIXDOMAIN_LOCK();
+
+       if (unp_gcing) {
+               UNIXDOMAIN_UNLOCK();
                return;
+       }
        unp_gcing = 1;
 
        /* close any fds on the deferred list */
@@ -950,7 +982,9 @@ unp_gc(void *arg __unused)
                        if ((unp = fptounp(fp)) != NULL)
                                unp->unp_msgcount--;
                        unp_rights--;
+                       UNIXDOMAIN_UNLOCK();
                        (void) closef(fp, NULL);
+                       UNIXDOMAIN_LOCK();
                }
                free(defer, M_TEMP, sizeof(*defer) +
                    sizeof(struct fdpass) * defer->ud_n);
@@ -1021,6 +1055,8 @@ unp_gc(void *arg __unused)
                }
        }
        unp_gcing = 0;
+
+       UNIXDOMAIN_UNLOCK();
 }
 
 void
@@ -1095,7 +1131,7 @@ unp_discard(struct fdpass *rp, int nfds)
        memset(rp, 0, sizeof(*rp) * nfds);
        SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
 
-       task_add(systq, &unp_gc_task);
+       task_add(systqmp, &unp_gc_task);
 }
 
 int
Index: sys/sys/systm.h
===================================================================
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.145
diff -u -p -r1.145 systm.h
--- sys/sys/systm.h     20 Mar 2020 03:37:08 -0000      1.145
+++ sys/sys/systm.h     19 Apr 2020 11:07:10 -0000
@@ -347,6 +347,18 @@ do {                                                       
                \
                splassert_fail(RW_READ, _s, __func__);                  \
 } while (0)
 
+extern struct rwlock unixdomainlock;
+
+#define UNIXDOMAIN_LOCK() do { rw_enter_write(&unixdomainlock); } while (0)
+#define UNIXDOMAIN_UNLOCK() do { rw_exit_write(&unixdomainlock); } while (0)
+
+#define UNIXDOMAIN_ASSERT_LOCKED()                                     \
+do {                                                                   \
+       int __s = rw_status(&unixdomainlock);                           \
+       if((splassert_ctl > 0) && (__s != RW_WRITE))                    \
+               splassert_fail(RW_WRITE, __s, __func__);                \
+} while (0)
+
 __returns_twice int    setjmp(label_t *);
 __dead void    longjmp(label_t *);
 #endif

Reply via email to