Updated diff. Re-lock dances were simplified in the unix(4) sockets layer. Reference counters added to unix(4) sockets layer too. This makes pointer dereference of peer's control block always safe after re-lock.
The `unp_refs' list cleanup done in the unp_detach(). This removes the case where the socket connected to our dying socket could be passed to unp_disconnect() and the check of it's connection state became much easier. Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.269 diff -u -p -r1.269 uipc_socket.c --- sys/kern/uipc_socket.c 11 Nov 2021 16:35:09 -0000 1.269 +++ sys/kern/uipc_socket.c 20 Nov 2021 00:11:10 -0000 @@ -52,6 +52,7 @@ #include <sys/atomic.h> #include <sys/rwlock.h> #include <sys/time.h> +#include <sys/refcnt.h> #ifdef DDB #include <machine/db_machdep.h> @@ -156,7 +157,9 @@ soalloc(int prflags) so = pool_get(&socket_pool, prflags); if (so == NULL) return (NULL); - rw_init(&so->so_lock, "solock"); + rw_init_flags(&so->so_lock, "solock", RWL_DUPOK); + refcnt_init(&so->so_refcnt); + return (so); } @@ -257,6 +260,8 @@ solisten(struct socket *so, int backlog) void sofree(struct socket *so, int s) { + int persocket = solock_persocket(so); + soassertlocked(so); if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { @@ -264,16 +269,57 @@ sofree(struct socket *so, int s) return; } if (so->so_head) { + struct socket *head = so->so_head; + /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) { + if (so->so_onq == &head->so_q) { sounlock(so, s); return; } + + if (persocket) { + if (so < head) + solock(head); + else { + /* + * Concurrent close of `head' could + * abort `so' due to re-lock. + */ + soref(so); + soref(head); + sounlock(so, SL_LOCKED); + solock(head); + solock(so); + + if (so->so_onq != &head->so_q0) { + sounlock(head, SL_LOCKED); + sounlock(so, SL_LOCKED); + sorele(head); + sorele(so); + return; + } + + sorele(head); + sorele(so); + } + } + + soqremque(so, 0); + + if (persocket) + sounlock(head, SL_LOCKED); } + + if (persocket) { + sounlock(so, SL_LOCKED); + refcnt_finalize(&so->so_refcnt, "sofinal"); + solock(so); + } + sigio_free(&so->so_sigio); klist_free(&so->so_rcv.sb_sel.si_note); klist_free(&so->so_snd.sb_sel.si_note); @@ -363,13 +409,68 @@ drop: error = error2; } if (so->so_options & SO_ACCEPTCONN) { + int persocket = solock_persocket(so); + + if (persocket) { + /* Wait concurrent sonewconn() threads. */ + while (so->so_newconn > 0) { + so->so_state |= SS_NEWCONN_WAIT; + sosleep_nsec(so, &so->so_newconn, PSOCK, + "netlck", INFSLP); + } + } + while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { + if (persocket) { + if (so < so2) + solock(so2); + else { + soref(so2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + if (so2->so_onq != &so->so_q0) { + sounlock(so2, SL_LOCKED); + sorele(so2); + continue; + } + + sorele(so2); + } + } (void) soqremque(so2, 0); + if (persocket) + sounlock(so, SL_LOCKED); (void) soabort(so2); + if (persocket) + solock(so); } while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { + if (persocket) { + if (so < so2) + solock(so2); + else { + soref(so2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + if (so2->so_onq != &so->so_q) { + sounlock(so2, SL_LOCKED); + sorele(so2); + continue; + } + + sorele(so2); + } + } (void) soqremque(so2, 1); + if (persocket) + sounlock(so, SL_LOCKED); (void) soabort(so2); + if (persocket) + solock(so); } } discard: @@ -437,11 +538,19 @@ soconnect(struct socket *so, struct mbuf int soconnect2(struct socket *so1, struct socket *so2) { - int s, error; + int persocket, s, error; + + if ((persocket = solock_persocket(so1))) { + solock_pair(so1, so2); + s = SL_LOCKED; + } else + s = solock(so1); - s = solock(so1); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, (struct mbuf *)so2, NULL, curproc); + + if (persocket) + sounlock(so2, s); sounlock(so1, s); return (error); } Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.116 diff -u -p -r1.116 uipc_socket2.c --- sys/kern/uipc_socket2.c 6 Nov 2021 05:26:33 -0000 1.116 +++ sys/kern/uipc_socket2.c 20 Nov 2021 00:11:10 -0000 @@ -53,8 +53,6 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; -extern struct rwlock unp_lock; - /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the @@ -101,10 +99,42 @@ soisconnected(struct socket *so) soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; - if (head && soqremque(so, 0)) { + + if (head != NULL && so->so_onq == &head->so_q0) { + int persocket = solock_persocket(so); + + if (persocket) { + if (so < head) + solock(head); + else { + soref(so); + soref(head); + + sounlock(so, SL_LOCKED); + solock(head); + solock(so); + + if (so->so_onq != &head->so_q0) { + sounlock(head, SL_LOCKED); + sounlock(so, SL_LOCKED); + sorele(head); + sorele(so); + + return; + } + + sorele(head); + sorele(so); + } + } + + soqremque(so, 0); soqinsque(head, so, 1); sorwakeup(head); wakeup_one(&head->so_timeo); + + if (persocket) + sounlock(head, SL_LOCKED); } else { wakeup(&so->so_timeo); sorwakeup(so); @@ -146,7 +176,8 @@ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; - int soqueue = connstatus ? 1 : 0; + int persocket = solock_persocket(head); + int error; /* * XXXSMP as long as `so' and `head' share the same lock, we @@ -174,10 +205,20 @@ sonewconn(struct socket *head, int conns so->so_rgid = head->so_rgid; so->so_cpid = head->so_cpid; + if (persocket) { + /* + * Lock order doesn't matter. We are the only thread + * which simultaneously locks these sockets. + */ + solock(so); + } + /* * Inherit watermarks but those may get clamped in low mem situations. */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (persocket) + sounlock(so, SL_LOCKED); pool_put(&socket_pool, so); return (NULL); } @@ -193,20 +234,56 @@ sonewconn(struct socket *head, int conns sigio_init(&so->so_sigio); sigio_copy(&so->so_sigio, &head->so_sigio); - soqinsque(head, so, soqueue); - if ((*so->so_proto->pr_attach)(so, 0)) { - (void) soqremque(so, soqueue); + soqinsque(head, so, 0); + + /* + * We need to unlock `head' because PCB layer could release + * solock() to enforce desired lock order. + */ + if (persocket) { + head->so_newconn++; + sounlock(head, SL_LOCKED); + } + + error = (*so->so_proto->pr_attach)(so, 0); + + /* + * The lock order matters from here. + */ + if (persocket) { + sounlock(so, SL_LOCKED); + solock_pair(head, so); + + if ((head->so_newconn--) == 0) { + if ((head->so_state & SS_NEWCONN_WAIT) != 0) { + head->so_state &= ~SS_NEWCONN_WAIT; + wakeup(&head->so_newconn); + } + } + } + + if (error) { + soqremque(so, 0); + if (persocket) + sounlock(so, SL_LOCKED); sigio_free(&so->so_sigio); klist_free(&so->so_rcv.sb_sel.si_note); klist_free(&so->so_snd.sb_sel.si_note); pool_put(&socket_pool, so); return (NULL); } + if (connstatus) { + so->so_state |= connstatus; + soqremque(so, 0); + soqinsque(head, so, 1); sorwakeup(head); wakeup(&head->so_timeo); - so->so_state |= connstatus; } + + if (persocket) + sounlock(so, SL_LOCKED); + return (so); } @@ -214,6 +291,7 @@ void soqinsque(struct socket *head, struct socket *so, int q) { soassertlocked(head); + soassertlocked(so); KASSERT(so->so_onq == NULL); @@ -233,6 +311,7 @@ soqremque(struct socket *so, int q) { struct socket *head = so->so_head; + soassertlocked(so); soassertlocked(head); if (q == 0) { @@ -284,9 +363,6 @@ solock(struct socket *so) case PF_INET6: NET_LOCK(); break; - case PF_UNIX: - rw_enter_write(&unp_lock); - break; default: rw_enter_write(&so->so_lock); break; @@ -295,6 +371,34 @@ solock(struct socket *so) return (SL_LOCKED); } +int +solock_persocket(struct socket *so) +{ + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + return 0; + default: + return 1; + } +} + +void +solock_pair(struct socket *so1, struct socket *so2) +{ + KASSERT(so1 != so2); + KASSERT(so1->so_type == so2->so_type); + KASSERT(solock_persocket(so1)); + + if (so1 < so2) { + solock(so1); + solock(so2); + } else { + solock(so2); + solock(so1); + } +} + void sounlock(struct socket *so, int s) { @@ -308,9 +412,6 @@ sounlock(struct socket *so, int s) case PF_INET6: NET_UNLOCK(); break; - case PF_UNIX: - rw_exit_write(&unp_lock); - break; default: rw_exit_write(&so->so_lock); break; @@ -325,9 +426,6 @@ soassertlocked(struct socket *so) case PF_INET6: NET_ASSERT_LOCKED(); break; - case PF_UNIX: - rw_assert_wrlock(&unp_lock); - break; default: rw_assert_wrlock(&so->so_lock); break; @@ -344,9 +442,6 @@ sosleep_nsec(struct socket *so, void *id case PF_INET: case PF_INET6: ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - break; - case PF_UNIX: - ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs); break; default: ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); Index: sys/kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.194 diff -u -p -r1.194 uipc_syscalls.c --- sys/kern/uipc_syscalls.c 24 Oct 2021 00:02:25 -0000 1.194 +++ sys/kern/uipc_syscalls.c 20 Nov 2021 00:11:10 -0000 @@ -246,7 +246,7 @@ doaccept(struct proc *p, int sock, struc socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int cloexec, nflag; + int cloexec, nflag, persocket; cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; @@ -269,16 +269,20 @@ doaccept(struct proc *p, int sock, struc head = headfp->f_data; s = solock(head); + + persocket = solock_persocket(head); + if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; - goto out; + goto out_unlock; } +retry: if ((headfp->f_flag & FNONBLOCK) && head->so_qlen == 0) { if (head->so_state & SS_CANTRCVMORE) error = ECONNABORTED; else error = EWOULDBLOCK; - goto out; + goto out_unlock; } while (head->so_qlen == 0 && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { @@ -288,18 +292,40 @@ doaccept(struct proc *p, int sock, struc error = sosleep_nsec(head, &head->so_timeo, PSOCK | PCATCH, "netcon", INFSLP); if (error) - goto out; + goto out_unlock; } if (head->so_error) { error = head->so_error; head->so_error = 0; - goto out; + goto out_unlock; } /* * Do not sleep after we have taken the socket out of the queue. */ - so = TAILQ_FIRST(&head->so_q); + + if ((so = TAILQ_FIRST(&head->so_q)) == NULL) + panic("accept"); + + if (persocket) { + if (head < so) + solock(so); + else { + soref(so); + sounlock(head, SL_LOCKED); + solock(so); + solock(head); + + if (so->so_head != head) { + sounlock(so, SL_LOCKED); + sorele(so); + goto retry; + } + + sorele(so); + } + } + if (soqremque(so, 1) == 0) panic("accept"); @@ -310,30 +336,53 @@ doaccept(struct proc *p, int sock, struc /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); + if (persocket) + sounlock(head, s); + fp->f_type = DTYPE_SOCKET; fp->f_flag = FREAD | FWRITE | nflag; fp->f_ops = &socketops; fp->f_data = so; + error = soaccept(so, nam); -out: - sounlock(head, s); - if (!error && name != NULL) + + /* + * It doesn't matter which socket to unlock when we + * locked the whole layer. + */ + sounlock(so, s); + + if (error) + goto out; + + if (name != NULL) { error = copyaddrout(p, nam, name, namelen, anamelen); - if (!error) { - fdplock(fdp); - fdinsert(fdp, tmpfd, cloexec, fp); - fdpunlock(fdp); - FRELE(fp, p); - *retval = tmpfd; - } else { - fdplock(fdp); - fdremove(fdp, tmpfd); - fdpunlock(fdp); - closef(fp, p); + if (error) + goto out; } + fdplock(fdp); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); + *retval = tmpfd; + m_freem(nam); FRELE(headfp, p); + + return 0; + +out_unlock: + sounlock(head, s); +out: + fdplock(fdp); + fdremove(fdp, tmpfd); + fdpunlock(fdp); + closef(fp, p); + + m_freem(nam); + FRELE(headfp, p); + return (error); } Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.157 diff -u -p -r1.157 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 16 Nov 2021 08:56:19 -0000 1.157 +++ sys/kern/uipc_usrreq.c 20 Nov 2021 00:11:10 -0000 @@ -52,25 +52,26 @@ #include <sys/pledge.h> #include <sys/pool.h> #include <sys/rwlock.h> -#include <sys/mutex.h> #include <sys/sysctl.h> #include <sys/lock.h> +#include <sys/refcnt.h> /* * Locks used to protect global data and struct members: * I immutable after creation * D unp_df_lock * G unp_gc_lock - * U unp_lock + * M unp_ino_mtx * R unp_rights_mtx * a atomic + * s socket lock */ -struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock"); struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk"); struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk"); struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); +struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); /* * Stack of sets of files that were passed over a socket but were @@ -88,6 +89,9 @@ void unp_discard(struct fdpass *, int); void unp_mark(struct fdpass *, int); void unp_scan(struct mbuf *, void (*)(struct fdpass *, int)); int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *); +static inline void unp_ref(struct unpcb *); +static inline void unp_rele(struct unpcb *); +struct socket *unp_solock_peer(struct socket *); struct pool unpcb_pool; struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); @@ -121,6 +125,53 @@ unp_init(void) IPL_SOFTNET, 0, "unpcb", NULL); } +static inline void +unp_ref(struct unpcb *unp) +{ + refcnt_take(&unp->unp_refcnt); +} + +static inline void +unp_rele(struct unpcb *unp) +{ + refcnt_rele_wake(&unp->unp_refcnt); +} + +struct socket * +unp_solock_peer(struct socket *so) +{ + struct unpcb *unp, *unp2; + struct socket *so2; + + unp = so->so_pcb; + +again: + if ((unp2 = unp->unp_conn) == NULL) + return NULL; + + so2 = unp2->unp_socket; + + if (so < so2) + solock(so2); + else if (so > so2){ + unp_ref(unp2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + /* Datagram socket could be reconnected due to re-lock. */ + if (unp->unp_conn != unp2) { + sounlock(so2, SL_LOCKED); + unp_rele(unp2); + goto again; + } + + unp_rele(unp2); + } + + return so2; +} + void uipc_setaddr(const struct unpcb *unp, struct mbuf *nam) { @@ -195,6 +246,12 @@ uipc_usrreq(struct socket *so, int req, * if it was bound and we are still connected * (our peer may have closed already!). */ + /* + * Don't need to lock `unp_conn'. The `unp_addr' is + * immutable since we set it within unp_connect(). + * Both sockets are locked while we connecting them + * so it's enough to hold lock on `unp'. + */ uipc_setaddr(unp->unp_conn, nam); break; @@ -212,9 +269,8 @@ uipc_usrreq(struct socket *so, int req, case SOCK_STREAM: case SOCK_SEQPACKET: - if (unp->unp_conn == NULL) + if ((so2 = unp_solock_peer(so)) == NULL) break; - so2 = unp->unp_conn->unp_socket; /* * Adjust backpressure on sender * and wakeup any waiting to write. @@ -222,6 +278,7 @@ uipc_usrreq(struct socket *so, int req, so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; so2->so_snd.sb_cc = so->so_rcv.sb_cc; sowwakeup(so2); + sounlock(so2, SL_LOCKED); break; default: @@ -250,13 +307,16 @@ uipc_usrreq(struct socket *so, int req, error = unp_connect(so, nam, p); if (error) break; - } else { - if (unp->unp_conn == NULL) { + } + + if ((so2 = unp_solock_peer(so)) == NULL) { + if (nam != NULL) + error = ECONNREFUSED; + else error = ENOTCONN; - break; - } + break; } - so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) from = mtod(unp->unp_addr, struct sockaddr *); else @@ -267,6 +327,10 @@ uipc_usrreq(struct socket *so, int req, control = NULL; } else error = ENOBUFS; + + if (so2 != so) + sounlock(so2, SL_LOCKED); + if (nam) unp_disconnect(unp); break; @@ -278,11 +342,11 @@ uipc_usrreq(struct socket *so, int req, error = EPIPE; break; } - if (unp->unp_conn == NULL) { + if ((so2 = unp_solock_peer(so)) == NULL) { error = ENOTCONN; break; } - so2 = unp->unp_conn->unp_socket; + /* * Send to paired receive port, and then raise * send buffer counts to maintain backpressure. @@ -304,6 +368,8 @@ uipc_usrreq(struct socket *so, int req, so->so_snd.sb_cc = so2->so_rcv.sb_cc; if (so2->so_rcv.sb_cc > 0) sorwakeup(so2); + + sounlock(so2, SL_LOCKED); m = NULL; break; @@ -317,12 +383,7 @@ uipc_usrreq(struct socket *so, int req, case PRU_ABORT: unp_detach(unp); - /* - * As long as `unp_lock' is taken before entering - * uipc_usrreq() releasing it here would lead to a - * double unlock. - */ - sofree(so, SL_NOUNLOCK); + sofree(so, SL_LOCKED); break; case PRU_SENSE: { @@ -330,8 +391,10 @@ uipc_usrreq(struct socket *so, int req, sb->st_blksize = so->so_snd.sb_hiwat; sb->st_dev = NODEV; + mtx_enter(&unp_ino_mtx); if (unp->unp_ino == 0) unp->unp_ino = unp_ino++; + mtx_leave(&unp_ino_mtx); sb->st_atim.tv_sec = sb->st_mtim.tv_sec = sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec; @@ -352,6 +415,12 @@ uipc_usrreq(struct socket *so, int req, break; case PRU_PEERADDR: + /* + * Don't need to lock `unp_conn'. The `unp_addr' is + * immutable since we set it within unp_connect(). + * Both sockets are locked while we connecting them + * so it's enough to hold lock on `unp'. + */ uipc_setaddr(unp->unp_conn, nam); break; @@ -404,8 +473,6 @@ uipc_attach(struct socket *so, int proto struct unpcb *unp; int error; - rw_assert_wrlock(&unp_lock); - if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -432,6 +499,7 @@ uipc_attach(struct socket *so, int proto unp = pool_get(&unpcb_pool, PR_NOWAIT|PR_ZERO); if (unp == NULL) return (ENOBUFS); + refcnt_init(&unp->unp_refcnt); unp->unp_socket = so; so->so_pcb = unp; getnanotime(&unp->unp_ctime); @@ -439,12 +507,6 @@ uipc_attach(struct socket *so, int proto /* * Enforce `unp_gc_lock' -> `solock()' lock order. */ - /* - * We also release the lock on listening socket and on our peer - * socket when called from unp_connect(). This is safe. The - * listening socket protected by vnode(9) lock. The peer socket - * has 'UNP_CONNECTING' flag set. - */ sounlock(so, SL_LOCKED); rw_enter_write(&unp_gc_lock); LIST_INSERT_HEAD(&unp_head, unp, unp_link); @@ -506,14 +568,13 @@ unp_detach(struct unpcb *unp) { struct socket *so = unp->unp_socket; struct vnode *vp = unp->unp_vnode; - - rw_assert_wrlock(&unp_lock); + struct unpcb *unp2; unp->unp_vnode = NULL; /* * Enforce `unp_gc_lock' -> `solock()' lock order. - * Enforce `i_lock' -> `unp_lock' lock order. + * Enforce `i_lock' -> `solock()' lock order. */ sounlock(so, SL_LOCKED); @@ -532,10 +593,47 @@ unp_detach(struct unpcb *unp) solock(so); - if (unp->unp_conn) + if (unp->unp_conn != NULL) { + /* + * Datagram socket could be connected to itself. + * Such socket will be disconnected here. + */ unp_disconnect(unp); - while (!SLIST_EMPTY(&unp->unp_refs)) - unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET); + } + + while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) { + struct socket *so2 = unp2->unp_socket; + + if (so < so2) + solock(so2); + else { + unp_ref(unp2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + if (unp2->unp_conn != unp) { + /* `unp2' was disconnected due to re-lock. */ + sounlock(so2, SL_LOCKED); + unp_rele(unp2); + continue; + } + + unp_rele(unp2); + } + + unp2->unp_conn = NULL; + SLIST_REMOVE(&unp->unp_refs, unp2, unpcb, unp_nextref); + so2->so_error = ECONNRESET; + so2->so_state &= ~SS_ISCONNECTED; + + sounlock(so2, SL_LOCKED); + } + + sounlock(so, SL_LOCKED); + refcnt_finalize(&unp->unp_refcnt, "unpfinal"); + solock(so); + soisdisconnected(so); so->so_pcb = NULL; m_freem(unp->unp_addr); @@ -675,24 +773,42 @@ unp_connect(struct socket *so, struct mb } if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0) goto put; - solock(so); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; - goto put_locked; + goto put; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; - goto put_locked; + goto put; } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + solock(so2); + if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == NULL) { error = ECONNREFUSED; - goto put_locked; } + + sounlock(so2, SL_LOCKED); + + if (error != 0) + goto put; + + /* + * Since `so2' is protected by vnode(9) lock, `so3' + * can't be PRU_ABORT'ed here. + */ + solock_pair(so, so3); + unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + + /* + * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag + * are immutable since we set them in unp_bind(). + */ if (unp2->unp_addr) unp3->unp_addr = m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT); @@ -700,15 +816,29 @@ unp_connect(struct socket *so, struct mb unp3->unp_connid.gid = p->p_ucred->cr_gid; unp3->unp_connid.pid = p->p_p->ps_pid; unp3->unp_flags |= UNP_FEIDS; - so2 = so3; + if (unp2->unp_flags & UNP_FEIDSBIND) { unp->unp_connid = unp2->unp_connid; unp->unp_flags |= UNP_FEIDS; } + + so2 = so3; + } else { + if (so2 != so) + solock_pair(so, so2); + else + solock(so); } + error = unp_connect2(so, so2); -put_locked: + sounlock(so, SL_LOCKED); + + /* + * `so2' can't be PRU_ABORT'ed concurrently + */ + if (so2 != so) + sounlock(so2, SL_LOCKED); put: vput(vp); unlock: @@ -725,7 +855,8 @@ unp_connect2(struct socket *so, struct s struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; - rw_assert_wrlock(&unp_lock); + soassertlocked(so); + soassertlocked(so2); if (so2->so_type != so->so_type) return (EPROTOTYPE); @@ -740,6 +871,13 @@ unp_connect2(struct socket *so, struct s case SOCK_STREAM: case SOCK_SEQPACKET: + /* + * soisconnected() could re-lock `so2' and will lock + * 'so2->so_head'. `so2' is not yet exposed to userland + * and we can't have concurrent thread which locks `so2' + * and then `so'. Also we can't have concurrent thread + * which simultaneously locks `so' and 'so2->so_head'. + */ unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); @@ -754,11 +892,15 @@ unp_connect2(struct socket *so, struct s void unp_disconnect(struct unpcb *unp) { - struct unpcb *unp2 = unp->unp_conn; + struct socket *so2; + struct unpcb *unp2; - if (unp2 == NULL) + if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL) return; + + unp2 = unp->unp_conn; unp->unp_conn = NULL; + switch (unp->unp_socket->so_type) { case SOCK_DGRAM: @@ -777,33 +919,29 @@ unp_disconnect(struct unpcb *unp) soisdisconnected(unp2->unp_socket); break; } + + if (so2 != unp->unp_socket) + sounlock(so2, SL_LOCKED); } void unp_shutdown(struct unpcb *unp) { - struct socket *so; + struct socket *so2; switch (unp->unp_socket->so_type) { case SOCK_STREAM: case SOCK_SEQPACKET: - if (unp->unp_conn && (so = unp->unp_conn->unp_socket)) - socantrcvmore(so); + if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL) + break; + + socantrcvmore(so2); + sounlock(so2, SL_LOCKED); + break; default: break; } -} - -void -unp_drop(struct unpcb *unp, int errno) -{ - struct socket *so = unp->unp_socket; - - rw_assert_wrlock(&unp_lock); - - so->so_error = errno; - unp_disconnect(unp); } #ifdef notdef Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v retrieving revision 1.85 diff -u -p -r1.85 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 24 Oct 2021 11:23:22 -0000 1.85 +++ sys/miscfs/fifofs/fifo_vnops.c 20 Nov 2021 00:11:10 -0000 @@ -156,7 +156,7 @@ fifo_open(void *v) struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct socket *rso, *wso; - int s, error; + int error; if ((fip = vp->v_fifoinfo) == NULL) { fip = malloc(sizeof(*fip), M_VNODE, M_WAITOK); @@ -182,18 +182,20 @@ fifo_open(void *v) return (error); } fip->fi_readers = fip->fi_writers = 0; - s = solock(wso); + solock(wso); wso->so_state |= SS_CANTSENDMORE; wso->so_snd.sb_lowat = PIPE_BUF; + sounlock(wso, SL_LOCKED); } else { rso = fip->fi_readsock; wso = fip->fi_writesock; - s = solock(wso); } if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { + solock(wso); wso->so_state &= ~SS_CANTSENDMORE; + sounlock(wso, SL_LOCKED); if (fip->fi_writers > 0) wakeup(&fip->fi_writers); } @@ -202,16 +204,16 @@ fifo_open(void *v) fip->fi_writers++; if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { error = ENXIO; - sounlock(wso, s); goto bad; } if (fip->fi_writers == 1) { + solock(rso); rso->so_state &= ~(SS_CANTRCVMORE|SS_ISDISCONNECTED); + sounlock(rso, SL_LOCKED); if (fip->fi_readers > 0) wakeup(&fip->fi_readers); } } - sounlock(wso, s); if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp); @@ -327,17 +329,16 @@ fifo_poll(void *v) struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; int events = 0; int revents = 0; - int s; /* * FIFOs don't support out-of-band or high priority data. */ - s = solock(rso); if (ap->a_fflag & FREAD) events |= ap->a_events & (POLLIN | POLLRDNORM); if (ap->a_fflag & FWRITE) events |= ap->a_events & (POLLOUT | POLLWRNORM); + solock_pair(rso, wso); if (events & (POLLIN | POLLRDNORM)) { if (soreadable(rso)) revents |= events & (POLLIN | POLLRDNORM); @@ -362,7 +363,8 @@ fifo_poll(void *v) wso->so_snd.sb_flags |= SB_SEL; } } - sounlock(rso, s); + sounlock(rso, SL_LOCKED); + sounlock(wso, SL_LOCKED); return (revents); } Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.101 diff -u -p -r1.101 socketvar.h --- sys/sys/socketvar.h 6 Nov 2021 05:26:33 -0000 1.101 +++ sys/sys/socketvar.h 20 Nov 2021 00:11:10 -0000 @@ -38,6 +38,7 @@ #include <sys/task.h> #include <sys/timeout.h> #include <sys/rwlock.h> +#include <sys/refcnt.h> #ifndef _SOCKLEN_T_DEFINED_ #define _SOCKLEN_T_DEFINED_ @@ -55,6 +56,7 @@ TAILQ_HEAD(soqhead, socket); struct socket { const struct protosw *so_proto; /* protocol handle */ struct rwlock so_lock; /* this socket lock */ + struct refcnt so_refcnt; /* references to this socket */ void *so_pcb; /* protocol control block */ u_int so_state; /* internal state flags SS_*, below */ short so_type; /* generic type, see socket.h */ @@ -80,6 +82,7 @@ struct socket { short so_q0len; /* partials on so_q0 */ short so_qlen; /* number of connections on so_q */ short so_qlimit; /* max number queued connections */ + u_long so_newconn; /* # of pending sonewconn() threads */ short so_timeo; /* connection timeout */ u_long so_oobmark; /* chars to oob mark */ u_int so_error; /* error affecting connection */ @@ -150,6 +153,7 @@ struct socket { #define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */ #define SS_ISSENDING 0x2000 /* hint for lower layer */ #define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */ +#define SS_NEWCONN_WAIT 0x8000 /* waiting sonewconn() relock */ #ifdef _KERNEL @@ -163,6 +167,18 @@ struct socket { void soassertlocked(struct socket *); +static inline void +soref(struct socket *so) +{ + refcnt_take(&so->so_refcnt); +} + +static inline void +sorele(struct socket *so) +{ + refcnt_rele_wake(&so->so_refcnt); +} + /* * Macros for sockets and socket buffering. */ @@ -337,6 +353,8 @@ int sockargs(struct mbuf **, const void int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t); int solock(struct socket *); +int solock_persocket(struct socket *); +void solock_pair(struct socket *, struct socket *); void sounlock(struct socket *, int); int sendit(struct proc *, int, struct msghdr *, int, register_t *); Index: sys/sys/unpcb.h =================================================================== RCS file: /cvs/src/sys/sys/unpcb.h,v retrieving revision 1.20 diff -u -p -r1.20 unpcb.h --- sys/sys/unpcb.h 16 Nov 2021 08:56:20 -0000 1.20 +++ sys/sys/unpcb.h 20 Nov 2021 00:11:10 -0000 @@ -60,24 +60,26 @@ * Locks used to protect struct members: * I immutable after creation * G unp_gc_lock - * U unp_lock * a atomic + * s socket lock */ struct unpcb { + struct refcnt unp_refcnt; /* references to this pcb */ struct socket *unp_socket; /* [I] pointer back to socket */ - struct vnode *unp_vnode; /* [U] if associated with file */ + struct vnode *unp_vnode; /* [s] if associated with file */ struct file *unp_file; /* [a] backpointer for unp_gc() */ - struct unpcb *unp_conn; /* [U] control block of connected socket */ - ino_t unp_ino; /* [U] fake inode number */ - SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */ - SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */ - struct mbuf *unp_addr; /* [U] bound address of socket */ + struct unpcb *unp_conn; /* [s] control block of connected + socket */ + ino_t unp_ino; /* [s] fake inode number */ + SLIST_HEAD(,unpcb) unp_refs; /* [s] referencing socket linked list */ + SLIST_ENTRY(unpcb) unp_nextref; /* [s] link in unp_refs list */ + struct mbuf *unp_addr; /* [s] bound address of socket */ long unp_msgcount; /* [a] references from socket rcv buf */ - int unp_flags; /* [U] this unpcb contains peer eids */ - int unp_gcflags; /* [G] garbge collector flags */ - struct sockpeercred unp_connid;/* [U] id of peer process */ + int unp_flags; /* [s] this unpcb contains peer eids */ + int unp_gcflags; /* [G] garbage collector flags */ + struct sockpeercred unp_connid;/* [s] id of peer process */ struct timespec unp_ctime; /* [I] holds creation time */ LIST_ENTRY(unpcb) unp_link; /* [G] link in per-AF list of sockets */ }; @@ -116,7 +118,6 @@ int unp_connect(struct socket *, struct int unp_connect2(struct socket *, struct socket *); void unp_detach(struct unpcb *); void unp_disconnect(struct unpcb *); -void unp_drop(struct unpcb *, int); void unp_gc(void *); void unp_shutdown(struct unpcb *); int unp_externalize(struct mbuf *, socklen_t, int);