The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=64f7e3c9c178ab35cb1f8fdf791aec74ede6f6b2
commit 64f7e3c9c178ab35cb1f8fdf791aec74ede6f6b2
Author:     Gleb Smirnoff <[email protected]>
AuthorDate: 2026-02-03 17:09:49 +0000
Commit:     Gleb Smirnoff <[email protected]>
CommitDate: 2026-02-03 17:09:49 +0000

    sockets: let protocols be responsible for socket buffer mutexes

    Sockets that implement their own socket buffers (marked with PR_SOCKBUF)
    are now also responsible for initializing the socket buffer mutexes in
    pr_attach and for destroying them in pr_detach (or pr_close).

    This removes a large number of reported LORs, since WITNESS can now see
    that a tcp(4) socket buffer mutex and a netlink(4) socket buffer mutex
    are two different things.  Distinct names also improve diagnostics for
    blocked threads.

    This also removes a hack from unix(4), where we used to mtx_destroy()
    and mtx_init() the receive buffer mutex again just to add MTX_DUPOK.
    It also removes a harmless bug from unix(4), where soreserve() was
    called twice for an accept(2)-ed socket.  The bug was harmless because
    the first call to soreserve() asked for 0 bytes of space.

    This slightly increases the amount of duplicated code in TCP's
    syncache_socket().  The problem is that for sockets created with
    socket(2) it is pr_attach that is responsible for the call to
    soreserve() (including for !PR_SOCKBUF protocols), while for sockets
    created with accept(2) it was solisten_clone() doing the soreserve(),
    combined with the fact that for accept(2) TCP completely bypasses
    pr_attach.  This all should improve once TCP has its own socket
    buffers.

    Reviewed by:            markj
    Differential Revision:  https://reviews.freebsd.org/D54984
---
 sys/kern/uipc_socket.c       | 77 +++++++++++++++++++++++---------------
 sys/kern/uipc_usrreq.c       | 17 +++++-----
 sys/netinet/tcp_syncache.c   | 15 +++++++--
 sys/netlink/netlink_domain.c | 10 ++++--
 4 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 4014006ce2e5..73ac2c6efc4e 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -818,8 +818,6 @@ soalloc(struct vnet *vnet)
         * a feature to change class of an existing lock, so we use DUPOK.
         */
        mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
-       mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
-       mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
        so->so_rcv.sb_sel = &so->so_rdsel;
        so->so_snd.sb_sel = &so->so_wrsel;
        sx_init(&so->so_snd_sx, "so_snd_sx");
@@ -893,14 +891,47 @@ sodealloc(struct socket *so)
                    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
                sx_destroy(&so->so_snd_sx);
                sx_destroy(&so->so_rcv_sx);
-               mtx_destroy(&so->so_snd_mtx);
-               mtx_destroy(&so->so_rcv_mtx);
        }
        crfree(so->so_cred);
        mtx_destroy(&so->so_lock);
        uma_zfree(socket_zone, so);
 }
 
+/*
+ * Shim to accommodate protocols that already do their own socket buffer
+ * management (marked with PR_SOCKBUF) with protocols that do not yet.
+ *
+ * Attach via socket(2) is different from attach via accept(2).  In case of
+ * a normal socket(2) syscall it is pr_attach that calls soreserve(), even
+ * for protocols that don't yet do PR_SOCKBUF.  In case of an accepted
+ * connection it is our shim that calls soreserve() and the hiwat values
+ * are taken from the parent socket.
+ */
+static int
+soattach(struct socket *so, int proto, struct thread *td, struct socket *head)
+{
+       int error;
+
+       VNET_ASSERT(curvnet == so->so_vnet,
+           ("%s: %p != %p", __func__, curvnet, so->so_vnet));
+
+       if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
+               mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
+               mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
+               so->so_snd.sb_mtx = &so->so_snd_mtx;
+               so->so_rcv.sb_mtx = &so->so_rcv_mtx;
+       }
+       if (head == NULL || (error = soreserve(so, head->sol_sbsnd_hiwat,
+           head->sol_sbrcv_hiwat)) == 0)
+               error = so->so_proto->pr_attach(so, proto, td);
+       if (error != 0 && (so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
+               mtx_destroy(&so->so_snd_mtx);
+               mtx_destroy(&so->so_rcv_mtx);
+       }
+
+       return (error);
+}
+
 /*
  * socreate returns a socket with a ref count of 1 and a file descriptor
  * reference.  The socket should be closed with soclose().
@@ -956,16 +987,8 @@ socreate(int dom, struct socket **aso, int type, int proto,
            so_rdknl_assert_lock);
        knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
            so_wrknl_assert_lock);
-       if ((prp->pr_flags & PR_SOCKBUF) == 0) {
-               so->so_snd.sb_mtx = &so->so_snd_mtx;
-               so->so_rcv.sb_mtx = &so->so_rcv_mtx;
-       }
-       /*
-        * Auto-sizing of socket buffers is managed by the protocols and
-        * the appropriate flags must be set in the pr_attach() method.
-        */
        CURVNET_SET(so->so_vnet);
-       error = prp->pr_attach(so, proto, td);
+       error = soattach(so, proto, td, NULL);
        CURVNET_RESTORE();
        if (error) {
                sodealloc(so);
@@ -1192,13 +1215,6 @@ solisten_clone(struct socket *head)
            so_rdknl_assert_lock);
        knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
            so_wrknl_assert_lock);
-       VNET_SO_ASSERT(head);
-       if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
-               sodealloc(so);
-               log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
-                   __func__, head->so_pcb);
-               return (NULL);
-       }
        so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
        so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
        so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
@@ -1206,10 +1222,6 @@ solisten_clone(struct socket *head)
        so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
        so->so_snd.sb_flags = head->sol_sbsnd_flags &
            (SB_AUTOSIZE | SB_AUTOLOWAT);
-       if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
-               so->so_snd.sb_mtx = &so->so_snd_mtx;
-               so->so_rcv.sb_mtx = &so->so_rcv_mtx;
-       }
 
        return (so);
 }
@@ -1223,7 +1235,7 @@ sonewconn(struct socket *head, int connstatus)
        if ((so = solisten_clone(head)) == NULL)
                return (NULL);
 
-       if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
+       if (soattach(so, 0, NULL, head) != 0) {
                sodealloc(so);
                log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
                    __func__, head->so_pcb);
@@ -1327,14 +1339,7 @@ sopeeloff(struct socket *head)
            so_rdknl_assert_lock);
        knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
            so_wrknl_assert_lock);
-       VNET_SO_ASSERT(head);
-       if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
-               sodealloc(so);
-               log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
-                   __func__, head->so_pcb);
-               return (NULL);
-       }
-       if (so->so_proto->pr_attach(so, 0, NULL)) {
+       if (soattach(so, 0, NULL, head)) {
                sodealloc(so);
                log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
                    __func__, head->so_pcb);
@@ -1346,10 +1351,6 @@ sopeeloff(struct socket *head)
        so->so_snd.sb_timeo = head->so_snd.sb_timeo;
        so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
        so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
-       if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
-               so->so_snd.sb_mtx = &so->so_snd_mtx;
-               so->so_rcv.sb_mtx = &so->so_rcv_mtx;
-       }
 
        soref(so);
 
@@ -1906,6 +1907,8 @@ sofree(struct socket *so)
                SOCK_SENDBUF_UNLOCK(so);
                SOCK_RECVBUF_UNLOCK(so);
 #endif
+               mtx_destroy(&so->so_snd_mtx);
+               mtx_destroy(&so->so_rcv_mtx);
        }
        seldrain(&so->so_rdsel);
        seldrain(&so->so_wrsel);
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 60736af5adf6..d56aac883d9c 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -479,7 +479,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
 {
        u_long sendspace, recvspace;
        struct unpcb *unp;
-       int error;
+       int error, rcvmtxopts;
        bool locked;
 
        KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
@@ -494,6 +494,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
                 * limits to unpdg_recvspace.
                 */
                sendspace = recvspace = unpdg_recvspace;
+               rcvmtxopts = 0;
                break;
 
        case SOCK_STREAM:
@@ -505,14 +506,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
                sendspace = unpsp_sendspace;
                recvspace = unpsp_recvspace;
 common:
-               /*
-                * XXXGL: we need to initialize the mutex with MTX_DUPOK.
-                * Ideally, protocols that have PR_SOCKBUF should be
-                * responsible for mutex initialization officially, and then
-                * this uglyness with mtx_destroy(); mtx_init(); would go away.
-                */
-               mtx_destroy(&so->so_rcv_mtx);
-               mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF | MTX_DUPOK);
+               rcvmtxopts = MTX_DUPOK;
                knlist_init(&so->so_wrsel.si_note, so, uipc_wrknl_lock,
                    uipc_wrknl_unlock, uipc_wrknl_assert_lock);
                STAILQ_INIT(&so->so_rcv.uxst_mbq);
@@ -520,6 +514,8 @@ common:
        default:
                panic("uipc_attach");
        }
+       mtx_init(&so->so_rcv_mtx, "unix so_rcv", NULL, MTX_DEF | rcvmtxopts);
+       mtx_init(&so->so_snd_mtx, "unix so_snd", NULL, MTX_DEF);
        error = soreserve(so, sendspace, recvspace);
        if (error)
                return (error);
@@ -888,6 +884,9 @@ uipc_detach(struct socket *so)
                MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
                MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
        }
+
+       mtx_destroy(&so->so_snd_mtx);
+       mtx_destroy(&so->so_rcv_mtx);
 }
 
 static int
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 8c58be63cd5a..4b501b221bcb 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -772,12 +772,21 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
        NET_EPOCH_ASSERT();
 
        /*
-        * Ok, create the full blown connection, and set things up
-        * as they would have been set up if we had created the
-        * connection when the SYN arrived.
+        * Creation of a socket via solisten_clone() bypasses call to pr_attach.
+        * That's why there is some pasted code from soattach() and from
+        * tcp_usr_attach() here. This should improve once TCP is PR_SOCKBUF.
         */
        if ((so = solisten_clone(lso)) == NULL)
                goto allocfail;
+       mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
+       mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
+       so->so_snd.sb_mtx = &so->so_snd_mtx;
+       so->so_rcv.sb_mtx = &so->so_rcv_mtx;
+       error = soreserve(so, lso->sol_sbsnd_hiwat, lso->sol_sbrcv_hiwat);
+       if (error) {
+               sodealloc(so);
+               goto allocfail;
+       }
 #ifdef MAC
        mac_socketpeer_set_from_mbuf(m, so);
 #endif
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
index 74b46114716e..e906e0d635af 100644
--- a/sys/netlink/netlink_domain.c
+++ b/sys/netlink/netlink_domain.c
@@ -330,14 +330,17 @@ nl_attach(struct socket *so, int proto, struct thread *td)
"(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); - nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); + mtx_init(&so->so_snd_mtx, "netlink so_snd", NULL, MTX_DEF); + mtx_init(&so->so_rcv_mtx, "netlink so_rcv", NULL, MTX_DEF); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { - free(nlp, M_PCB); + mtx_destroy(&so->so_snd_mtx); + mtx_destroy(&so->so_rcv_mtx); return (error); } TAILQ_INIT(&so->so_rcv.nl_queue); TAILQ_INIT(&so->so_snd.nl_queue); + nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); so->so_pcb = nlp; nlp->nl_socket = so; nlp->nl_proto = proto; @@ -517,6 +520,9 @@ nl_close(struct socket *so) nl_buf_free(nb); } + mtx_destroy(&so->so_snd_mtx); + mtx_destroy(&so->so_rcv_mtx); + NL_LOG(LOG_DEBUG3, "socket %p, detached", so); /* XXX: is delayed free needed? */
