The latest version of the PF_UNIX sockets unlocking diff. The new `unp_lock' rwlock(9) was used as solock()'s backend to protect the whole layer.
We release solock() while we perform all vnode(9) related operations. Since fifo subsystem uses unix sockets as backend this is required to enforce `i_lock' -> `unp_lock' lock order. The socket can't be closed by concurrent thread because the file descriptor reference is held. Also the new two flags `UNP_BINDING' and `UNP_CONNECTING' were introduced to prevent connection or binding override while solock() is released. This version was tested by claudio@. The previous version was tested by bluhm@, but the differences are mostly in commentaries and error paths simplification. I hope someone else will try it and gives positive feedback which allow to push it forward. Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.104 diff -u -p -r1.104 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104 +++ sys/kern/uipc_socket2.c 4 Feb 2021 11:19:29 -0000 @@ -53,6 +53,8 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; +extern struct rwlock unp_lock; + /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the @@ -282,6 +284,8 @@ solock(struct socket *so) NET_LOCK(); break; case PF_UNIX: + rw_enter_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -306,6 +310,8 @@ sounlock(struct socket *so, int s) NET_UNLOCK(); break; case PF_UNIX: + rw_exit_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -323,6 +329,8 @@ soassertlocked(struct socket *so) NET_ASSERT_LOCKED(); break; case PF_UNIX: + rw_assert_wrlock(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -335,12 +343,24 @@ int sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, uint64_t nsecs) { - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) && - (so->so_proto->pr_domain->dom_family != PF_ROUTE) && - (so->so_proto->pr_domain->dom_family != PF_KEY)) { - return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - } else - return tsleep_nsec(ident, prio, wmesg, nsecs); + int ret; + + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); + break; + case PF_UNIX: + ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs); + break; + case PF_ROUTE: + case PF_KEY: + default: + ret = tsleep_nsec(ident, prio, wmesg, nsecs); + break; + } + + return ret; } /* Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.142 diff -u -p -r1.142 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142 +++ sys/kern/uipc_usrreq.c 4 Feb 2021 11:19:29 -0000 @@ -51,35 +51,35 @@ #include <sys/task.h> #include <sys/pledge.h> #include <sys/pool.h> +#include <sys/rwlock.h> -void uipc_setaddr(const struct unpcb *, struct mbuf *); - -/* list of all UNIX domain sockets, for unp_gc() */ -LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head); +/* + * Locks used to protect global data and struct members: + * I immutable after creation + * U unp_lock + */ +struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock"); /* * Stack of sets of files that were passed over a socket but were * not received and need to be closed. */ struct unp_deferral { - SLIST_ENTRY(unp_deferral) ud_link; - int ud_n; + SLIST_ENTRY(unp_deferral) ud_link; /* [U] */ + int ud_n; /* [I] */ /* followed by ud_n struct fdpass */ - struct fdpass ud_fp[]; + struct fdpass ud_fp[]; /* [I] */ }; +void uipc_setaddr(const struct unpcb *, struct mbuf *); void unp_discard(struct fdpass *, int); void unp_mark(struct fdpass *, int); void unp_scan(struct mbuf *, void (*)(struct fdpass *, int)); int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *); struct pool unpcb_pool; -/* list of sets of files that were sent over sockets that are now closed */ -SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred); - struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); - /* * Unix communications domain. * @@ -88,14 +88,25 @@ struct task unp_gc_task = TASK_INITIALIZ * rethink name space problems * need a proper out-of-band */ -struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; -ino_t unp_ino; /* prototype for fake inode numbers */ +const struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; + +/* [U] list of all UNIX domain sockets, for unp_gc() */ +LIST_HEAD(unp_head, unpcb) unp_head = + LIST_HEAD_INITIALIZER(unp_head); +/* [U] list of sets of files that were sent over sockets that are now closed */ +SLIST_HEAD(,unp_deferral) unp_deferred = + SLIST_HEAD_INITIALIZER(unp_deferred); + +ino_t unp_ino; /* [U] prototype for fake inode numbers */ +int unp_rights; /* [U] file descriptors in flight */ +int unp_defer; /* [U] number of deferred fp to close by the GC task */ +int unp_gcing; /* [U] GC task currently running */ void unp_init(void) { pool_init(&unpcb_pool, sizeof(struct unpcb), 0, - IPL_NONE, 0, "unpcb", NULL); + IPL_SOFTNET, 0, "unpcb", NULL); } void @@ -214,7 +225,7 @@ uipc_usrreq(struct socket *so, int req, switch (so->so_type) { case SOCK_DGRAM: { - struct sockaddr *from; + const struct sockaddr *from; if (nam) { if (unp->unp_conn) { @@ -351,14 +362,14 @@ u_long unpst_recvspace = PIPSIZ; u_long unpdg_sendspace = 2*1024; /* really max datagram size */ u_long unpdg_recvspace = 4*1024; -int unp_rights; /* file descriptors in flight */ - int uipc_attach(struct socket *so, int proto) { struct unpcb *unp; int error; + rw_assert_wrlock(&unp_lock); + if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -407,25 +418,48 @@ uipc_detach(struct socket *so) void unp_detach(struct unpcb *unp) { - struct vnode *vp; + struct socket *so = unp->unp_socket; + struct vnode *vp = NULL; + + rw_assert_wrlock(&unp_lock); LIST_REMOVE(unp, unp_link); if (unp->unp_vnode) { + /* + * `v_socket' is only read in unp_connect and + * unplock prevents concurrent access. + */ + unp->unp_vnode->v_socket = NULL; vp = unp->unp_vnode; unp->unp_vnode = NULL; - vrele(vp); } + if (unp->unp_conn) unp_disconnect(unp); while (!SLIST_EMPTY(&unp->unp_refs)) unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET); - soisdisconnected(unp->unp_socket); - unp->unp_socket->so_pcb = NULL; + soisdisconnected(so); + so->so_pcb = NULL; m_freem(unp->unp_addr); pool_put(&unpcb_pool, unp); if (unp_rights) task_add(systq, &unp_gc_task); + + if (vp != NULL) { + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is + * still hold. + */ + + sounlock(so, SL_LOCKED); + KERNEL_LOCK(); + vrele(vp); + KERNEL_UNLOCK(); + solock(so); + } } int @@ -439,6 +473,8 @@ unp_bind(struct unpcb *unp, struct mbuf struct nameidata nd; size_t pathlen; + if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING)) + return (EINVAL); if (unp->unp_vnode != NULL) return (EINVAL); if ((error = unp_nam2sun(nam, &soun, &pathlen))) @@ -458,10 +494,24 @@ unp_bind(struct unpcb *unp, struct mbuf NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; + + unp->unp_flags |= UNP_BINDING; + + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is still held. + */ + + sounlock(unp->unp_socket, SL_LOCKED); + + KERNEL_LOCK(); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ - if ((error = namei(&nd)) != 0) { + error = namei(&nd); + if (error != 0) { m_freem(nam2); - return (error); + solock(unp->unp_socket); + goto out; } vp = nd.ni_vp; if (vp != NULL) { @@ -472,7 +522,9 @@ unp_bind(struct unpcb *unp, struct mbuf vput(nd.ni_dvp); vrele(vp); m_freem(nam2); - return (EADDRINUSE); + error = EADDRINUSE; + solock(unp->unp_socket); + goto out; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; @@ -481,8 +533,10 @@ unp_bind(struct unpcb *unp, struct mbuf vput(nd.ni_dvp); if (error) { m_freem(nam2); - return (error); + solock(unp->unp_socket); + goto out; } + solock(unp->unp_socket); unp->unp_addr = nam2; vp = nd.ni_vp; vp->v_socket = unp->unp_socket; @@ -492,7 +546,11 @@ unp_bind(struct unpcb *unp, struct mbuf unp->unp_connid.pid = p->p_p->ps_pid; unp->unp_flags |= UNP_FEIDSBIND; VOP_UNLOCK(vp); - return (0); +out: + KERNEL_UNLOCK(); + unp->unp_flags &= ~UNP_BINDING; + + return (error); } int @@ -505,36 +563,52 @@ unp_connect(struct socket *so, struct mb struct nameidata nd; int error; + unp = sotounpcb(so); + if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING)) + return (EISCONN); if ((error = unp_nam2sun(nam, &soun, NULL))) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; - if ((error = namei(&nd)) != 0) - return (error); + + unp->unp_flags |= UNP_CONNECTING; + + /* + * Enforce `i_lock' -> `unplock' because fifo subsystem + * requires it. The socket can't be closed concurrently + * because the file descriptor reference is still held. + */ + + sounlock(so, SL_LOCKED); + + KERNEL_LOCK(); + error = namei(&nd); + if (error != 0) + goto unlock; vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; - goto bad; + goto put; } if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0) - goto bad; + goto put; + solock(so); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; - goto bad; + goto put_locked; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; - goto bad; + goto put_locked; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == 0) { error = ECONNREFUSED; - goto bad; + goto put_locked; } - unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) @@ -551,8 +625,15 @@ unp_connect(struct socket *so, struct mb } } error = unp_connect2(so, so2); -bad: +put_locked: + sounlock(so, SL_LOCKED); +put: vput(vp); +unlock: + KERNEL_UNLOCK(); + solock(so); + unp->unp_flags &= ~UNP_CONNECTING; + return (error); } @@ -562,6 +643,8 @@ unp_connect2(struct socket *so, struct s struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; + rw_assert_wrlock(&unp_lock); + if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); @@ -635,15 +718,16 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; - KERNEL_ASSERT_LOCKED(); + rw_assert_wrlock(&unp_lock); so->so_error = errno; unp_disconnect(unp); if (so->so_head) { so->so_pcb = NULL; /* - * As long as the KERNEL_LOCK() is the default lock for Unix - * sockets, do not release it. + * As long as `unp_lock' is taken before entering + * uipc_usrreq() releasing it here would lead to a + * double unlock. */ sofree(so, SL_NOUNLOCK); m_freem(unp->unp_addr); @@ -685,6 +769,8 @@ unp_externalize(struct mbuf *rights, soc struct file *fp; int nfds, error = 0; + rw_assert_wrlock(&unp_lock); + /* * This code only works because SCM_RIGHTS is the only supported * control message type on unix sockets. Enforce this here. @@ -705,6 +791,10 @@ unp_externalize(struct mbuf *rights, soc /* Make sure the recipient should be able to see the descriptors.. */ rp = (struct fdpass *)CMSG_DATA(cm); + + /* fdp->fd_rdir requires KERNEL_LOCK() */ + KERNEL_LOCK(); + for (i = 0; i < nfds; i++) { fp = rp->fp; rp++; @@ -728,6 +818,8 @@ unp_externalize(struct mbuf *rights, soc } } + KERNEL_UNLOCK(); + fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK); restart: @@ -825,6 +917,8 @@ unp_internalize(struct mbuf *control, st int i, error; int nfds, *ip, fd, neededspace; + rw_assert_wrlock(&unp_lock); + /* * Check for two potential msg_controllen values because * IETF stuck their nose in a place it does not belong. @@ -923,8 +1017,6 @@ fail: return (error); } -int unp_defer, unp_gcing; - void unp_gc(void *arg __unused) { @@ -934,8 +1026,10 @@ unp_gc(void *arg __unused) struct unpcb *unp; int nunref, i; + rw_enter_write(&unp_lock); + if (unp_gcing) - return; + goto unlock; unp_gcing = 1; /* close any fds on the deferred list */ @@ -950,7 +1044,9 @@ unp_gc(void *arg __unused) if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; unp_rights--; + rw_exit_write(&unp_lock); (void) closef(fp, NULL); + rw_enter_write(&unp_lock); } free(defer, M_TEMP, sizeof(*defer) + sizeof(struct fdpass) * defer->ud_n); @@ -1021,6 +1117,8 @@ unp_gc(void *arg __unused) } } unp_gcing = 0; +unlock: + rw_exit_write(&unp_lock); } void @@ -1066,6 +1164,8 @@ unp_mark(struct fdpass *rp, int nfds) struct unpcb *unp; int i; + rw_assert_wrlock(&unp_lock); + for (i = 0; i < nfds; i++) { if (rp[i].fp == NULL) continue; @@ -1087,6 +1187,8 @@ void unp_discard(struct fdpass *rp, int nfds) { struct unp_deferral *defer; + + rw_assert_wrlock(&unp_lock); /* copy the file pointers to a deferral structure */ defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK); Index: sys/sys/unpcb.h =================================================================== RCS file: /cvs/src/sys/sys/unpcb.h,v retrieving revision 1.17 diff -u -p -r1.17 unpcb.h --- sys/sys/unpcb.h 15 Jul 2019 12:28:06 -0000 1.17 +++ sys/sys/unpcb.h 4 Feb 2021 11:19:30 -0000 @@ -56,22 +56,27 @@ * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. + * + * Locks used to protect struct members: + * I immutable after creation + * U unp_lock */ + struct unpcb { - struct socket *unp_socket; /* pointer back to socket */ - struct vnode *unp_vnode; /* if associated with file */ - struct file *unp_file; /* backpointer for unp_gc() */ - struct unpcb *unp_conn; /* control block of connected socket */ - ino_t unp_ino; /* fake inode number */ - SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */ - SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */ - struct mbuf *unp_addr; /* bound address of socket */ - long unp_msgcount; /* references from socket rcv buf */ - int unp_flags; /* this unpcb contains peer eids */ - struct sockpeercred unp_connid;/* id of peer process */ - struct timespec unp_ctime; /* holds creation time */ - LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */ + struct socket *unp_socket; /* [I] pointer back to socket */ + struct vnode *unp_vnode; /* [U] if associated with file */ + struct file *unp_file; /* [U] backpointer for unp_gc() */ + struct unpcb *unp_conn; /* [U] control block of connected socket */ + ino_t unp_ino; /* [U] fake inode number */ + SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */ + SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */ + struct mbuf *unp_addr; /* [U] bound address of socket */ + long unp_msgcount; /* [U] references from socket rcv buf */ + int unp_flags; /* [U] this unpcb contains peer eids */ + struct sockpeercred unp_connid;/* [U] id of peer process */ + struct timespec unp_ctime; /* [I] holds creation time */ + LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */ }; /* @@ -82,6 +87,8 @@ struct unpcb { #define UNP_GCMARK 0x04 /* mark during unp_gc() */ #define UNP_GCDEFER 0x08 /* ref'd, but not marked in this pass */ #define UNP_GCDEAD 0x10 /* unref'd in this pass */ +#define UNP_BINDING 0x20 /* unp is binding now */ +#define UNP_CONNECTING 0x40 /* unp is connecting now */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb))