Hi,

In filt_solisten_common() you touch `so_qlen' only. It is not
related to the socket buffers and is not protected by the newly
introduced `sb_mtx', so the solock() replacement in filt_solisten*()
is wrong.

However, filt_solisten_common() only checks the `so_qlen' != 0
condition, and such a check could be performed locklessly. I propose
committing that part as a separate diff.
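
Something like the following untested sketch is what I have in mind
for the *process() path (the *modify() path would be analogous).
This assumes a single word-sized read of `so_qlen' is acceptable
without any lock; whether knote_modify()/knote_submit() themselves
need some other serialization is a separate question:

int
filt_solistenprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	/*
	 * Neither solock() nor `sb_mtx' is taken here:
	 * filt_solisten_common() only reads `so_qlen' once and
	 * compares it with zero.
	 */
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_solisten_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);

	return (rv);
}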

Also, there are two places I want to point out:

> @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>                        * Adjust backpressure on sender
>                        * and wakeup any waiting to write.
>                        */
> +                     mtx_enter(&so2->so_snd.sb_mtx);
>                       so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
>                       so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> +                     mtx_leave(&so2->so_snd.sb_mtx);
>                       sowwakeup(so2);
>                       break;
> 

This is the 'PRU_RCVD' case, so you hold solock() on `so' and its
receive buffer is locked by sblock(). Is the modification of
`so_rcv's `sb_mbcnt' and `sb_cc' assumed to be protected by solock()?
Should both buffers be locked here? I'm asking because you only
remove solock() from the kqueue(9) path and solock() still serializes
the rest of the socket code, but you are going to reduce the scope of
solock() further.
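
To make the question concrete, this is roughly what I would expect if
the mutex is meant to cover these fields. It is only an illustration
of "both buffers locked", not a tested diff, and it obviously raises
a lock ordering question between the two sockbuf mutexes:

	/*
	 * Adjust backpressure on sender and wakeup any waiting
	 * to write.  `so->so_rcv' is read and `so2->so_snd' is
	 * written, so take both sockbuf mutexes.
	 */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so2->so_snd.sb_mtx);
	so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
	so2->so_snd.sb_cc = so->so_rcv.sb_cc;
	mtx_leave(&so2->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	sowwakeup(so2);
	break;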

The same question applies to the 'PRU_SEND' case.

> @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>                               sbappendrecord(so2, &so2->so_rcv, m);
>                       else
>                               sbappend(so2, &so2->so_rcv, m);
> +                     mtx_enter(&so2->so_snd.sb_mtx);
>                       so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
>                       so->so_snd.sb_cc = so2->so_rcv.sb_cc;
> +                     mtx_leave(&so2->so_snd.sb_mtx);
>                       if (so2->so_rcv.sb_cc > 0)
>                               sorwakeup(so2);


Since you touch the content of 'so2->so_rcv' here, you want to lock
it instead of 'so2->so_snd', right?
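
I.e., I would expect the hunk to look more like this (again just a
sketch; whether `so->so_snd' also needs its own mutex here is the
same question as above):

	/* `so2->so_rcv' is what sbappend*() just filled and what is read. */
	mtx_enter(&so2->so_rcv.sb_mtx);
	so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
	so->so_snd.sb_cc = so2->so_rcv.sb_cc;
	mtx_leave(&so2->so_rcv.sb_mtx);
	if (so2->so_rcv.sb_cc > 0)
		sorwakeup(so2);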


> On 10 Jul 2021, at 18:26, Martin Pieuchot <m...@openbsd.org> wrote:
> 
> One of the reasons for the drop of performances in the kqueue-based
> poll/select is the fact that kqueue filters are called up to 3 times
> per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> packets.
> 
> Diff below is a RFC for improving the situation.
> 
> socket kqueue filters mainly check for the amount of available items to
> read/write.  This involves comparing various socket buffer fields (sb_cc,
> sb_lowat, etc).  The diff below introduces a new mutex to serialize
> updates of those fields with reads in the kqueue filters.
> 
> Since these fields are always modified with the socket lock held, either
> the mutex or the solock are enough to have a coherent view of them.
> Note that either of these locks is necessary only if multiple fields
> have to be read (like in sbspace()).
> 
> Other per-socket fields accessed in the kqueue filters are never
> combined (with &&) to determine a condition.  So assuming it is fine to
> read register-sized fields w/o the socket lock we can safely remove it
> there.
> 
> Could such mutex also be used to serialize klist updates?
> 
> Comments?
> 
> diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
> index dce20208828..d1cb9f4fc3b 100644
> --- sys/kern/uipc_socket.c
> +++ sys/kern/uipc_socket.c
> @@ -181,6 +181,8 @@ socreate(int dom, struct socket **aso, int type, int proto)
>       so->so_egid = p->p_ucred->cr_gid;
>       so->so_cpid = p->p_p->ps_pid;
>       so->so_proto = prp;
> +     mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
> +     mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
>       so->so_snd.sb_timeo_nsecs = INFSLP;
>       so->so_rcv.sb_timeo_nsecs = INFSLP;
> 
> @@ -276,7 +278,9 @@ sofree(struct socket *so, int s)
>               }
>       }
> #endif /* SOCKET_SPLICE */
> +     mtx_enter(&so->so_snd.sb_mtx);
>       sbrelease(so, &so->so_snd);
> +     mtx_leave(&so->so_snd.sb_mtx);
>       sorflush(so);
>       sounlock(so, s);
> #ifdef SOCKET_SPLICE
> @@ -1019,8 +1023,10 @@ dontblock:
>                                       *mp = m_copym(m, 0, len, M_WAIT);
>                               m->m_data += len;
>                               m->m_len -= len;
> +                             mtx_enter(&so->so_rcv.sb_mtx);
>                               so->so_rcv.sb_cc -= len;
>                               so->so_rcv.sb_datacc -= len;
> +                             mtx_leave(&so->so_rcv.sb_mtx);
>                       }
>               }
>               if (so->so_oobmark) {
> @@ -1537,8 +1543,10 @@ somove(struct socket *so, int wait)
>                       }
>                       so->so_rcv.sb_mb->m_data += size;
>                       so->so_rcv.sb_mb->m_len -= size;
> +                     mtx_enter(&so->so_rcv.sb_mtx);
>                       so->so_rcv.sb_cc -= size;
>                       so->so_rcv.sb_datacc -= size;
> +                     mtx_leave(&so->so_rcv.sb_mtx);
>               } else {
>                       *mp = so->so_rcv.sb_mb;
>                       sbfree(&so->so_rcv, *mp);
> @@ -1777,30 +1785,40 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
>                       case SO_SNDBUF:
>                               if (so->so_state & SS_CANTSENDMORE)
>                                       return (EINVAL);
> +                             mtx_enter(&so->so_snd.sb_mtx);
>                               if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
>                                   sbreserve(so, &so->so_snd, cnt))
> -                                     return (ENOBUFS);
> -                             so->so_snd.sb_wat = cnt;
> +                                     error = ENOBUFS;
> +                             if (error == 0)
> +                                     so->so_snd.sb_wat = cnt;
> +                             mtx_leave(&so->so_snd.sb_mtx);
>                               break;
> 
>                       case SO_RCVBUF:
>                               if (so->so_state & SS_CANTRCVMORE)
>                                       return (EINVAL);
> +                             mtx_enter(&so->so_rcv.sb_mtx);
>                               if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
>                                   sbreserve(so, &so->so_rcv, cnt))
> -                                     return (ENOBUFS);
> -                             so->so_rcv.sb_wat = cnt;
> +                                     error = ENOBUFS;
> +                             if (error == 0)
> +                                     so->so_rcv.sb_wat = cnt;
> +                             mtx_leave(&so->so_rcv.sb_mtx);
>                               break;
> 
>                       case SO_SNDLOWAT:
> +                             mtx_enter(&so->so_snd.sb_mtx);
>                               so->so_snd.sb_lowat =
>                                   (cnt > so->so_snd.sb_hiwat) ?
>                                   so->so_snd.sb_hiwat : cnt;
> +                             mtx_leave(&so->so_snd.sb_mtx);
>                               break;
>                       case SO_RCVLOWAT:
> +                             mtx_leave(&so->so_rcv.sb_mtx);
>                               so->so_rcv.sb_lowat =
>                                   (cnt > so->so_rcv.sb_hiwat) ?
>                                   so->so_rcv.sb_hiwat : cnt;
> +                             mtx_leave(&so->so_rcv.sb_mtx);
>                               break;
>                       }
>                       break;
> @@ -2077,9 +2095,10 @@ filt_sordetach(struct knote *kn)
> int
> filt_soread_common(struct knote *kn, struct socket *so)
> {
> +     u_int sostate = so->so_state;
>       int rv = 0;
> 
> -     soassertlocked(so);
> +     MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
> 
>       kn->kn_data = so->so_rcv.sb_cc;
> #ifdef SOCKET_SPLICE
> @@ -2088,15 +2107,17 @@ filt_soread_common(struct knote *kn, struct socket *so)
>       } else
> #endif /* SOCKET_SPLICE */
>       if (kn->kn_sfflags & NOTE_OOB) {
> -             if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
> +             u_long oobmark = so->so_oobmark;
> +
> +             if (oobmark || (sostate & SS_RCVATMARK)) {
>                       kn->kn_fflags |= NOTE_OOB;
> -                     kn->kn_data -= so->so_oobmark;
> +                     kn->kn_data -= oobmark;
>                       rv = 1;
>               }
> -     } else if (so->so_state & SS_CANTRCVMORE) {
> +     } else if (sostate & SS_CANTRCVMORE) {
>               kn->kn_flags |= EV_EOF;
>               if (kn->kn_flags & __EV_POLL) {
> -                     if (so->so_state & SS_ISDISCONNECTED)
> +                     if (sostate & SS_ISDISCONNECTED)
>                               kn->kn_flags |= __EV_HUP;
>               }
>               kn->kn_fflags = so->so_error;
> @@ -2116,20 +2137,25 @@ int
> filt_soread(struct knote *kn, long hint)
> {
>       struct socket *so = kn->kn_fp->f_data;
> +     int rv;
> +
> +     mtx_enter(&so->so_rcv.sb_mtx);
> +     rv = filt_soread_common(kn, so);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> 
> -     return (filt_soread_common(kn, so));
> +     return (rv);
> }
> 
> int
> filt_soreadmodify(struct kevent *kev, struct knote *kn)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       knote_modify(kev, kn);
>       rv = filt_soread_common(kn, so);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> 
>       return (rv);
> }
> @@ -2138,16 +2164,16 @@ int
> filt_soreadprocess(struct knote *kn, struct kevent *kev)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
>               rv = 1;
>       else
>               rv = filt_soread_common(kn, so);
>       if (rv != 0)
>               knote_submit(kn, kev);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> 
>       return (rv);
> }
> @@ -2165,22 +2191,23 @@ filt_sowdetach(struct knote *kn)
> int
> filt_sowrite_common(struct knote *kn, struct socket *so)
> {
> +     u_int sostate = so->so_state;
>       int rv;
> 
> -     soassertlocked(so);
> +     MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
> 
> -     kn->kn_data = sbspace(so, &so->so_snd);
> -     if (so->so_state & SS_CANTSENDMORE) {
> +     kn->kn_data = sbspace_locked(so, &so->so_snd);
> +     if (sostate & SS_CANTSENDMORE) {
>               kn->kn_flags |= EV_EOF;
>               if (kn->kn_flags & __EV_POLL) {
> -                     if (so->so_state & SS_ISDISCONNECTED)
> +                     if (sostate & SS_ISDISCONNECTED)
>                               kn->kn_flags |= __EV_HUP;
>               }
>               kn->kn_fflags = so->so_error;
>               rv = 1;
>       } else if (so->so_error) {      /* temporary udp error */
>               rv = 1;
> -     } else if (((so->so_state & SS_ISCONNECTED) == 0) &&
> +     } else if (((sostate & SS_ISCONNECTED) == 0) &&
>           (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
>               rv = 0;
>       } else if (kn->kn_sfflags & NOTE_LOWAT) {
> @@ -2196,20 +2223,25 @@ int
> filt_sowrite(struct knote *kn, long hint)
> {
>       struct socket *so = kn->kn_fp->f_data;
> +     int rv;
> +
> +     mtx_enter(&so->so_snd.sb_mtx);
> +     rv = filt_sowrite_common(kn, so);
> +     mtx_leave(&so->so_snd.sb_mtx);
> 
> -     return (filt_sowrite_common(kn, so));
> +     return (rv);
> }
> 
> int
> filt_sowritemodify(struct kevent *kev, struct knote *kn)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_snd.sb_mtx);
>       knote_modify(kev, kn);
>       rv = filt_sowrite_common(kn, so);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_snd.sb_mtx);
> 
>       return (rv);
> }
> @@ -2218,16 +2250,16 @@ int
> filt_sowriteprocess(struct knote *kn, struct kevent *kev)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_snd.sb_mtx);
>       if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
>               rv = 1;
>       else
>               rv = filt_sowrite_common(kn, so);
>       if (rv != 0)
>               knote_submit(kn, kev);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_snd.sb_mtx);
> 
>       return (rv);
> }
> @@ -2235,8 +2267,6 @@ filt_sowriteprocess(struct knote *kn, struct kevent *kev)
> int
> filt_solisten_common(struct knote *kn, struct socket *so)
> {
> -     soassertlocked(so);
> -
>       kn->kn_data = so->so_qlen;
> 
>       return (kn->kn_data != 0);
> @@ -2254,12 +2284,12 @@ int
> filt_solistenmodify(struct kevent *kev, struct knote *kn)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       knote_modify(kev, kn);
>       rv = filt_solisten_common(kn, so);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> 
>       return (rv);
> }
> @@ -2268,16 +2298,16 @@ int
> filt_solistenprocess(struct knote *kn, struct kevent *kev)
> {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv, s;
> +     int rv;
> 
> -     s = solock(so);
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
>               rv = 1;
>       else
>               rv = filt_solisten_common(kn, so);
>       if (rv != 0)
>               knote_submit(kn, kev);
> -     sounlock(so, s);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> 
>       return (rv);
> }
> diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
> index 69ec14230e6..b2f25ab8d40 100644
> --- sys/kern/uipc_socket2.c
> +++ sys/kern/uipc_socket2.c
> @@ -34,7 +34,6 @@
> 
> #include <sys/param.h>
> #include <sys/systm.h>
> -#include <sys/malloc.h>
> #include <sys/mbuf.h>
> #include <sys/protosw.h>
> #include <sys/domain.h>
> @@ -163,6 +162,8 @@ sonewconn(struct socket *head, int connstatus)
>       if (so == NULL)
>               return (NULL);
>       rw_init(&so->so_lock, "solock");
> +     mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
> +     mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
>       so->so_type = head->so_type;
>       so->so_options = head->so_options &~ SO_ACCEPTCONN;
>       so->so_linger = head->so_linger;
> @@ -463,8 +464,10 @@ soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
> {
>       soassertlocked(so);
> 
> +     mtx_enter(&so->so_snd.sb_mtx);
>       if (sbreserve(so, &so->so_snd, sndcc))
>               goto bad;
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       if (sbreserve(so, &so->so_rcv, rcvcc))
>               goto bad2;
>       so->so_snd.sb_wat = sndcc;
> @@ -475,10 +478,14 @@ soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
>               so->so_snd.sb_lowat = MCLBYTES;
>       if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
>               so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
> +     mtx_leave(&so->so_rcv.sb_mtx);
> +     mtx_leave(&so->so_snd.sb_mtx);
>       return (0);
> bad2:
> +     mtx_leave(&so->so_rcv.sb_mtx);
>       sbrelease(so, &so->so_snd);
> bad:
> +     mtx_leave(&so->so_snd.sb_mtx);
>       return (ENOBUFS);
> }
> 
> @@ -492,6 +499,7 @@ sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
> {
>       KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
>       soassertlocked(so);
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> 
>       if (cc == 0 || cc > sb_max)
>               return (1);
> @@ -533,6 +541,7 @@ sbchecklowmem(void)
> void
> sbrelease(struct socket *so, struct sockbuf *sb)
> {
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> 
>       sbflush(so, sb);
>       sb->sb_hiwat = sb->sb_mbmax = 0;
> @@ -686,6 +695,7 @@ sbcheck(struct sockbuf *sb)
>       struct mbuf *m, *n;
>       u_long len = 0, mbcnt = 0;
> 
> +     mtx_enter(&sb->sb_mtx);
>       for (m = sb->sb_mb; m; m = m->m_nextpkt) {
>               for (n = m; n; n = n->m_next) {
>                       len += n->m_len;
> @@ -701,6 +711,7 @@ sbcheck(struct sockbuf *sb)
>                   mbcnt, sb->sb_mbcnt);
>               panic("sbcheck");
>       }
> +     mtx_leave(&sb->sb_mtx);
> }
> #endif
> 
> @@ -908,9 +919,11 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
>                       memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
>                           m->m_len);
>                       n->m_len += m->m_len;
> +                     mtx_enter(&sb->sb_mtx);
>                       sb->sb_cc += m->m_len;
>                       if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
>                               sb->sb_datacc += m->m_len;
> +                     mtx_leave(&sb->sb_mtx);
>                       m = m_free(m);
>                       continue;
>               }
> @@ -943,6 +956,7 @@ sbflush(struct socket *so, struct sockbuf *sb)
> {
>       KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
>       KASSERT((sb->sb_flags & SB_LOCK) == 0);
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> 
>       while (sb->sb_mbcnt)
>               sbdrop(so, sb, (int)sb->sb_cc);
> @@ -965,6 +979,7 @@ sbdrop(struct socket *so, struct sockbuf *sb, int len)
> 
>       KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
>       soassertlocked(so);
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> 
>       next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
>       while (len > 0) {
> @@ -984,12 +999,12 @@ sbdrop(struct socket *so, struct sockbuf *sb, int len)
>                       break;
>               }
>               len -= m->m_len;
> -             sbfree(sb, m);
> +             sbfree_locked(sb, m);
>               mn = m_free(m);
>               m = mn;
>       }
>       while (m && m->m_len == 0) {
> -             sbfree(sb, m);
> +             sbfree_locked(sb, m);
>               mn = m_free(m);
>               m = mn;
>       }
> diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
> index 83e62bad6c1..73f59e61160 100644
> --- sys/kern/uipc_usrreq.c
> +++ sys/kern/uipc_usrreq.c
> @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>                        * Adjust backpressure on sender
>                        * and wakeup any waiting to write.
>                        */
> +                     mtx_enter(&so2->so_snd.sb_mtx);
>                       so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
>                       so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> +                     mtx_leave(&so2->so_snd.sb_mtx);
>                       sowwakeup(so2);
>                       break;
> 
> @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>                               sbappendrecord(so2, &so2->so_rcv, m);
>                       else
>                               sbappend(so2, &so2->so_rcv, m);
> +                     mtx_enter(&so2->so_snd.sb_mtx);
>                       so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
>                       so->so_snd.sb_cc = so2->so_rcv.sb_cc;
> +                     mtx_leave(&so2->so_snd.sb_mtx);
>                       if (so2->so_rcv.sb_cc > 0)
>                               sorwakeup(so2);
>                       m = NULL;
> @@ -736,12 +740,16 @@ unp_disconnect(struct unpcb *unp)
> 
>       case SOCK_STREAM:
>       case SOCK_SEQPACKET:
> +             mtx_enter(&unp->unp_socket->so_snd.sb_mtx);
>               unp->unp_socket->so_snd.sb_mbcnt = 0;
>               unp->unp_socket->so_snd.sb_cc = 0;
> +             mtx_leave(&unp->unp_socket->so_snd.sb_mtx);
>               soisdisconnected(unp->unp_socket);
>               unp2->unp_conn = NULL;
> +             mtx_enter(&unp2->unp_socket->so_snd.sb_mtx);
>               unp2->unp_socket->so_snd.sb_mbcnt = 0;
>               unp2->unp_socket->so_snd.sb_cc = 0;
> +             mtx_leave(&unp2->unp_socket->so_snd.sb_mtx);
>               soisdisconnected(unp2->unp_socket);
>               break;
>       }
> diff --git sys/netinet/tcp_input.c sys/netinet/tcp_input.c
> index cd0c12dcd3b..c60bddc9f55 100644
> --- sys/netinet/tcp_input.c
> +++ sys/netinet/tcp_input.c
> @@ -946,7 +946,9 @@ findpcb:
>                               tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
>                                   acked);
>                               ND6_HINT(tp);
> +                             mtx_enter(&so->so_snd.sb_mtx);
>                               sbdrop(so, &so->so_snd, acked);
> +                             mtx_leave(&so->so_snd.sb_mtx);
> 
>                               /*
>                                * If we had a pending ICMP message that
> @@ -1714,6 +1716,7 @@ trimthenstep6:
>                           TCP_MAXWIN << tp->snd_scale);
>               }
>               ND6_HINT(tp);
> +             mtx_enter(&so->so_snd.sb_mtx);
>               if (acked > so->so_snd.sb_cc) {
>                       if (tp->snd_wnd > so->so_snd.sb_cc)
>                               tp->snd_wnd -= so->so_snd.sb_cc;
> @@ -1729,6 +1732,7 @@ trimthenstep6:
>                               tp->snd_wnd = 0;
>                       ourfinisacked = 0;
>               }
> +             mtx_leave(&so->so_snd.sb_mtx);
> 
>               tcp_update_sndspace(tp);
>               if (sb_notify(so, &so->so_snd)) {
> @@ -2967,7 +2971,9 @@ tcp_mss_update(struct tcpcb *tp)
>               bufsize = roundup(bufsize, mss);
>               if (bufsize > sb_max)
>                       bufsize = sb_max;
> +             mtx_enter(&so->so_snd.sb_mtx);
>               (void)sbreserve(so, &so->so_snd, bufsize);
> +             mtx_leave(&so->so_snd.sb_mtx);
>       }
> 
>       bufsize = so->so_rcv.sb_hiwat;
> @@ -2975,7 +2981,9 @@ tcp_mss_update(struct tcpcb *tp)
>               bufsize = roundup(bufsize, mss);
>               if (bufsize > sb_max)
>                       bufsize = sb_max;
> +             mtx_enter(&so->so_rcv.sb_mtx);
>               (void)sbreserve(so, &so->so_rcv, bufsize);
> +             mtx_leave(&so->so_rcv.sb_mtx);
>       }
> 
> }
> diff --git sys/netinet/tcp_usrreq.c sys/netinet/tcp_usrreq.c
> index 98d2270d8f4..4652e6a25ce 100644
> --- sys/netinet/tcp_usrreq.c
> +++ sys/netinet/tcp_usrreq.c
> @@ -688,7 +688,9 @@ tcp_disconnect(struct tcpcb *tp)
>               tp = tcp_drop(tp, 0);
>       else {
>               soisdisconnecting(so);
> +             mtx_enter(&so->so_rcv.sb_mtx);
>               sbflush(so, &so->so_rcv);
> +             mtx_leave(&so->so_rcv.sb_mtx);
>               tp = tcp_usrclosed(tp);
>               if (tp)
>                       (void) tcp_output(tp);
> @@ -1115,6 +1117,7 @@ tcp_update_sndspace(struct tcpcb *tp)
>       struct socket *so = tp->t_inpcb->inp_socket;
>       u_long nmax = so->so_snd.sb_hiwat;
> 
> +     mtx_enter(&so->so_snd.sb_mtx);
>       if (sbchecklowmem()) {
>               /* low on memory try to get rid of some */
>               if (tcp_sendspace < nmax)
> @@ -1128,7 +1131,7 @@ tcp_update_sndspace(struct tcpcb *tp)
>                   tp->snd_una);
> 
>       /* a writable socket must be preserved because of poll(2) semantics */
> -     if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
> +     if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
>               if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
>                       nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
>               /* keep in sync with sbreserve() calculation */
> @@ -1141,6 +1144,7 @@ tcp_update_sndspace(struct tcpcb *tp)
> 
>       if (nmax != so->so_snd.sb_hiwat)
>               sbreserve(so, &so->so_snd, nmax);
> +     mtx_leave(&so->so_snd.sb_mtx);
> }
> 
> /*
> @@ -1179,5 +1183,7 @@ tcp_update_rcvspace(struct tcpcb *tp)
> 
>       /* round to MSS boundary */
>       nmax = roundup(nmax, tp->t_maxseg);
> +     mtx_enter(&so->so_rcv.sb_mtx);
>       sbreserve(so, &so->so_rcv, nmax);
> +     mtx_leave(&so->so_rcv.sb_mtx);
> }
> diff --git sys/sys/socketvar.h sys/sys/socketvar.h
> index 72bb18d3e59..32cab70c9bd 100644
> --- sys/sys/socketvar.h
> +++ sys/sys/socketvar.h
> @@ -33,10 +33,12 @@
>  */
> 
> #include <sys/selinfo.h>                      /* for struct selinfo */
> +#include <sys/systm.h>                               /* panicstr for MUTEX_ASSERT */
> #include <sys/queue.h>
> #include <sys/sigio.h>                                /* for struct sigio_ref */
> #include <sys/task.h>
> #include <sys/timeout.h>
> +#include <sys/mutex.h>
> #include <sys/rwlock.h>
> 
> #ifndef       _SOCKLEN_T_DEFINED_
> @@ -51,6 +53,10 @@ TAILQ_HEAD(soqhead, socket);
>  * Contains send and receive buffer queues,
>  * handle on protocol and pointer to protocol
>  * private data and error information.
> + *
> + * Locks used to protect struct members in this file:
> + *   s       this socket solock
> + *   m       per-sockbuf mutex
>  */
> struct socket {
>       const struct protosw *so_proto; /* protocol handle */
> @@ -99,15 +105,16 @@ struct socket {
>  * Variables for socket buffering.
>  */
>       struct  sockbuf {
> +             struct mutex sb_mtx;
> /* The following fields are all zeroed on flush. */
> #define       sb_startzero    sb_cc
> -             u_long  sb_cc;          /* actual chars in buffer */
> -             u_long  sb_datacc;      /* data only chars in buffer */
> -             u_long  sb_hiwat;       /* max actual char count */
> -             u_long  sb_wat;         /* default watermark */
> -             u_long  sb_mbcnt;       /* chars of mbufs used */
> -             u_long  sb_mbmax;       /* max chars of mbufs to use */
> -             long    sb_lowat;       /* low water mark */
> +             u_long  sb_cc;          /* [s|m] actual chars in buffer */
> +             u_long  sb_datacc;      /* [s|m] data only chars in buffer */
> +             u_long  sb_hiwat;       /* [s|m] max actual char count */
> +             u_long  sb_wat;         /* [s|m] default watermark */
> +             u_long  sb_mbcnt;       /* [s|m] chars of mbufs used */
> +             u_long  sb_mbmax;       /* [s|m] max chars of mbufs to use */
> +             long    sb_lowat;       /* [s|m] low water mark */
>               struct mbuf *sb_mb;     /* the mbuf chain */
>               struct mbuf *sb_mbtail; /* the last mbuf in the chain */
>               struct mbuf *sb_lastrecord;/* first mbuf of last record in
> @@ -189,13 +196,27 @@ sb_notify(struct socket *so, struct sockbuf *sb)
>  * overflow and return 0.
>  */
> static inline long
> -sbspace(struct socket *so, struct sockbuf *sb)
> +sbspace_locked(struct socket *so, struct sockbuf *sb)
> {
>       KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -     soassertlocked(so);
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);
> +
>       return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
> }
> 
> +static inline long
> +sbspace(struct socket *so, struct sockbuf *sb)
> +{
> +     long space;
> +
> +     mtx_enter(&sb->sb_mtx);
> +     space = sbspace_locked(so, sb);
> +     mtx_leave(&sb->sb_mtx);
> +
> +     return space;
> +}
> +
> +
> /* do we have to send all at once on a socket? */
> #define       sosendallatonce(so) \
>     ((so)->so_proto->pr_flags & PR_ATOMIC)
> @@ -224,16 +245,19 @@ soreadable(struct socket *so)
> 
> /* adjust counters in sb reflecting allocation of m */
> #define       sballoc(sb, m) do {                                             \
> +     mtx_enter(&(sb)->sb_mtx);                                       \
>       (sb)->sb_cc += (m)->m_len;                                      \
>       if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME)      \
>               (sb)->sb_datacc += (m)->m_len;                          \
>       (sb)->sb_mbcnt += MSIZE;                                        \
>       if ((m)->m_flags & M_EXT)                                       \
>               (sb)->sb_mbcnt += (m)->m_ext.ext_size;                  \
> +     mtx_leave(&(sb)->sb_mtx);                                       \
> } while (/* CONSTCOND */ 0)
> 
> /* adjust counters in sb reflecting freeing of m */
> -#define      sbfree(sb, m) do {                                              \
> +#define      sbfree_locked(sb, m) do {                                       \
> +     MUTEX_ASSERT_LOCKED(&sb->sb_mtx);                               \
>       (sb)->sb_cc -= (m)->m_len;                                      \
>       if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME)      \
>               (sb)->sb_datacc -= (m)->m_len;                          \
> @@ -242,6 +266,12 @@ soreadable(struct socket *so)
>               (sb)->sb_mbcnt -= (m)->m_ext.ext_size;                  \
> } while (/* CONSTCOND */ 0)
> 
> +#define      sbfree(sb, m) do {                                              \
> +     mtx_enter(&(sb)->sb_mtx);                                       \
> +     sbfree_locked((sb), (m));                                       \
> +     mtx_leave(&(sb)->sb_mtx);                                       \
> +} while (/* CONSTCOND */ 0)
> +
> /*
>  * Set lock on sockbuf sb; sleep if lock is already held.
>  * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
> 
