> On 6 Aug 2022, at 00:36, Alexander Bluhm <[email protected]> wrote:
>
> On Sat, Aug 06, 2022 at 12:20:48AM +0300, Vitaliy Makkoveev wrote:
>>> On 5 Aug 2022, at 23:50, Alexander Bluhm <[email protected]> wrote:
>>>
>>> On Thu, Aug 04, 2022 at 06:07:49PM +0300, Vitaliy Makkoveev wrote:
>>>> Yes please. Also I like (*pr_usrreq)() be splitted to multiple handlers.
>>>
>>> With all you feedback diff looks like this now.
>>>
>>
>> Did you missed sbwait() modification? I also pointed it should
>> release `inp_mtx??? in soreceive() path.
>
> You are talkung about this comment:
>
>>> It seems sbwait() should release `inp_mtx' and keep shared netlock
>>> locked.
>> Both of netlock and `inp_mtx??? should be released by sbwait(). This
>> is fully identical to current behaviour.
>
> sbwait() calls sosleep_nsec(). My diff calls pr_unlock and pr_lock
> there. So it already does that, or I missunderstood something.
>
Sorry, I missed sosleep_nsec hunk.
> The condition "rw_status(&netlock) == RW_READ" to unlock the mutex
> is an ugly hack, but it seems to work.
>
> bluhm
I thought you will introduce something like below. This does the
same but it has no heuristic under the hood.
int
sbwait_shared(struct socket *so, struct sockbuf *sb)
{
int error;
if (so->so_proto->pr_unlock != NULL)
(*so->so_proto->pr_unlock)(so);
error = sbwait(so, sb);
if (so->so_proto->pr_lock != NULL)
(*so->so_proto->pr_lock)(so);
return error;
}
The rest of diff looks good to me, but sorry I’ll do proper review
tomorrow with the fresh look.
>
>>> Index: kern/uipc_socket.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
>>> retrieving revision 1.280
>>> diff -u -p -r1.280 uipc_socket.c
>>> --- kern/uipc_socket.c 25 Jul 2022 07:28:22 -0000 1.280
>>> +++ kern/uipc_socket.c 4 Aug 2022 21:26:33 -0000
>>> @@ -836,10 +836,10 @@ bad:
>>> if (mp)
>>> *mp = NULL;
>>>
>>> - solock(so);
>>> + solock_shared(so);
>>> restart:
>>> if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> return (error);
>>> }
>>>
>>> @@ -907,7 +907,7 @@ restart:
>>> sbunlock(so, &so->so_rcv);
>>> error = sbwait(so, &so->so_rcv);
>>> if (error) {
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> return (error);
>>> }
>>> goto restart;
>>> @@ -976,11 +976,11 @@ dontblock:
>>> sbsync(&so->so_rcv, nextrecord);
>>> if (controlp) {
>>> if (pr->pr_domain->dom_externalize) {
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> error =
>>> (*pr->pr_domain->dom_externalize)
>>> (cm, controllen, flags);
>>> - solock(so);
>>> + solock_shared(so);
>>> }
>>> *controlp = cm;
>>> } else {
>>> @@ -1054,9 +1054,9 @@ dontblock:
>>> SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
>>> SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
>>> resid = uio->uio_resid;
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
>>> - solock(so);
>>> + solock_shared(so);
>>> if (uio_error)
>>> uio->uio_resid = resid - len;
>>> } else
>>> @@ -1140,7 +1140,7 @@ dontblock:
>>> error = sbwait(so, &so->so_rcv);
>>> if (error) {
>>> sbunlock(so, &so->so_rcv);
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> return (0);
>>> }
>>> if ((m = so->so_rcv.sb_mb) != NULL)
>>> @@ -1186,7 +1186,7 @@ dontblock:
>>> *flagsp |= flags;
>>> release:
>>> sbunlock(so, &so->so_rcv);
>>> - sounlock(so);
>>> + sounlock_shared(so);
>>> return (error);
>>> }
>>>
>>> Index: kern/uipc_socket2.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
>>> retrieving revision 1.126
>>> diff -u -p -r1.126 uipc_socket2.c
>>> --- kern/uipc_socket2.c 25 Jul 2022 07:28:22 -0000 1.126
>>> +++ kern/uipc_socket2.c 4 Aug 2022 21:45:24 -0000
>>> @@ -360,6 +360,24 @@ solock(struct socket *so)
>>> }
>>> }
>>>
>>> +void
>>> +solock_shared(struct socket *so)
>>> +{
>>> + switch (so->so_proto->pr_domain->dom_family) {
>>> + case PF_INET:
>>> + case PF_INET6:
>>> + if (so->so_proto->pr_lock != NULL) {
>>> + NET_RLOCK_IN_SYSCALL();
>>> + (*so->so_proto->pr_lock)(so);
>>> + } else
>>> + NET_LOCK();
>>> + break;
>>> + default:
>>> + rw_enter_write(&so->so_lock);
>>> + break;
>>> + }
>>> +}
>>> +
>>> int
>>> solock_persocket(struct socket *so)
>>> {
>>> @@ -403,6 +421,24 @@ sounlock(struct socket *so)
>>> }
>>>
>>> void
>>> +sounlock_shared(struct socket *so)
>>> +{
>>> + switch (so->so_proto->pr_domain->dom_family) {
>>> + case PF_INET:
>>> + case PF_INET6:
>>> + if (so->so_proto->pr_unlock != NULL) {
>>> + (*so->so_proto->pr_unlock)(so);
>>> + NET_RUNLOCK_IN_SYSCALL();
>>> + } else
>>> + NET_UNLOCK();
>>> + break;
>>> + default:
>>> + rw_exit_write(&so->so_lock);
>>> + break;
>>> + }
>>> +}
>>> +
>>> +void
>>> soassertlocked(struct socket *so)
>>> {
>>> switch (so->so_proto->pr_domain->dom_family) {
>>> @@ -425,7 +461,15 @@ sosleep_nsec(struct socket *so, void *id
>>> switch (so->so_proto->pr_domain->dom_family) {
>>> case PF_INET:
>>> case PF_INET6:
>>> + if (so->so_proto->pr_unlock != NULL &&
>>> + rw_status(&netlock) == RW_READ) {
>>> + (*so->so_proto->pr_unlock)(so);
>>> + }
>>> ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
>>> + if (so->so_proto->pr_lock != NULL &&
>>> + rw_status(&netlock) == RW_READ) {
>>> + (*so->so_proto->pr_lock)(so);
>>> + }
>>> break;
>>> default:
>>> ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
>>> Index: netinet/in_pcb.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
>>> retrieving revision 1.268
>>> diff -u -p -r1.268 in_pcb.c
>>> --- netinet/in_pcb.c 28 Jun 2022 09:32:27 -0000 1.268
>>> +++ netinet/in_pcb.c 4 Aug 2022 21:26:33 -0000
>>> @@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in
>>> inp->inp_table = table;
>>> inp->inp_socket = so;
>>> refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
>>> + mtx_init(&inp->inp_mtx, IPL_SOFTNET);
>>> inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
>>> inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
>>> inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
>>> Index: netinet/in_pcb.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
>>> retrieving revision 1.129
>>> diff -u -p -r1.129 in_pcb.h
>>> --- netinet/in_pcb.h 15 May 2022 09:12:20 -0000 1.129
>>> +++ netinet/in_pcb.h 4 Aug 2022 21:26:33 -0000
>>> @@ -79,6 +79,7 @@
>>> * I immutable after creation
>>> * N net lock
>>> * t inpt_mtx pcb table mutex
>>> + * p inpcb_mtx pcb mutex
>>> */
>>>
>>> struct pf_state_key;
>>> @@ -121,6 +122,7 @@ struct inpcb {
>>> #define inp_route inp_ru.ru_route
>>> #define inp_route6 inp_ru.ru_route6
>>> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
>>> + struct mutex inp_mtx; /* protect PCB and socket members */
>>> int inp_flags; /* generic IP/datagram flags */
>>> union { /* Header prototype. */
>>> struct ip hu_ip;
>>> Index: netinet/in_proto.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
>>> retrieving revision 1.98
>>> diff -u -p -r1.98 in_proto.c
>>> --- netinet/in_proto.c 25 Feb 2022 23:51:03 -0000 1.98
>>> +++ netinet/in_proto.c 4 Aug 2022 21:44:07 -0000
>>> @@ -387,6 +387,8 @@ const struct protosw inetsw[] = {
>>> .pr_usrreq = divert_usrreq,
>>> .pr_attach = divert_attach,
>>> .pr_detach = divert_detach,
>>> + .pr_lock = divert_lock,
>>> + .pr_unlock = divert_unlock,
>>> .pr_init = divert_init,
>>> .pr_sysctl = divert_sysctl
>>> },
>>> Index: netinet/ip_divert.c
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
>>> retrieving revision 1.68
>>> diff -u -p -r1.68 ip_divert.c
>>> --- netinet/ip_divert.c 9 May 2022 19:33:46 -0000 1.68
>>> +++ netinet/ip_divert.c 4 Aug 2022 21:31:30 -0000
>>> @@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u
>>> if_put(ifp);
>>> }
>>>
>>> + mtx_enter(&inp->inp_mtx);
>>> so = inp->inp_socket;
>>> - /*
>>> - * XXXSMP sbappendaddr() is not MP safe and this function is called
>>> - * from pf with shared netlock. To call only one sbappendaddr() from
>>> - * divert_packet(), protect it with kernel lock. All other places
>>> - * call sbappendaddr() with exclusive net lock. This blocks
>>> - * divert_packet() as we have the shared lock.
>>> - */
>>> - KERNEL_LOCK();
>>> if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
>>> - KERNEL_UNLOCK();
>>> + mtx_leave(&inp->inp_mtx);
>>> divstat_inc(divs_fullsock);
>>> goto bad;
>>> }
>>> - sorwakeup(inp->inp_socket);
>>> - KERNEL_UNLOCK();
>>> + mtx_leave(&inp->inp_mtx);
>>> + sorwakeup(so);
>>>
>>> in_pcbunref(inp);
>>> return;
>>> @@ -356,6 +349,24 @@ divert_detach(struct socket *so)
>>>
>>> in_pcbdetach(inp);
>>> return (0);
>>> +}
>>> +
>>> +void
>>> +divert_lock(struct socket *so)
>>> +{
>>> + struct inpcb *inp = sotoinpcb(so);
>>> +
>>> + NET_ASSERT_LOCKED();
>>> + mtx_enter(&inp->inp_mtx);
>>> +}
>>> +
>>> +void
>>> +divert_unlock(struct socket *so)
>>> +{
>>> + struct inpcb *inp = sotoinpcb(so);
>>> +
>>> + NET_ASSERT_LOCKED();
>>> + mtx_leave(&inp->inp_mtx);
>>> }
>>>
>>> int
>>> Index: netinet/ip_divert.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.h,v
>>> retrieving revision 1.15
>>> diff -u -p -r1.15 ip_divert.h
>>> --- netinet/ip_divert.h 5 May 2022 16:44:22 -0000 1.15
>>> +++ netinet/ip_divert.h 4 Aug 2022 21:43:44 -0000
>>> @@ -72,5 +72,7 @@ int divert_usrreq(struct socket *,
>>> int, struct mbuf *, struct mbuf *, struct mbuf *, struct proc *);
>>> int divert_attach(struct socket *, int);
>>> int divert_detach(struct socket *);
>>> +void divert_lock(struct socket *);
>>> +void divert_unlock(struct socket *);
>>> #endif /* _KERNEL */
>>> #endif /* _IP_DIVERT_H_ */
>>> Index: sys/protosw.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
>>> retrieving revision 1.35
>>> diff -u -p -r1.35 protosw.h
>>> --- sys/protosw.h 25 Feb 2022 23:51:04 -0000 1.35
>>> +++ sys/protosw.h 4 Aug 2022 21:41:19 -0000
>>> @@ -83,6 +83,8 @@ struct protosw {
>>>
>>> int (*pr_attach)(struct socket *, int);
>>> int (*pr_detach)(struct socket *);
>>> + void (*pr_lock)(struct socket *);
>>> + void (*pr_unlock)(struct socket *);
>>>
>>> /* utility hooks */
>>> void (*pr_init)(void); /* initialization hook */
>>> Index: sys/socketvar.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
>>> retrieving revision 1.106
>>> diff -u -p -r1.106 socketvar.h
>>> --- sys/socketvar.h 15 Jul 2022 17:20:24 -0000 1.106
>>> +++ sys/socketvar.h 4 Aug 2022 21:26:33 -0000
>>> @@ -346,9 +346,11 @@ int sockargs(struct mbuf **, const void
>>>
>>> int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
>>> void solock(struct socket *);
>>> +void solock_shared(struct socket *);
>>> int solock_persocket(struct socket *);
>>> void solock_pair(struct socket *, struct socket *);
>>> void sounlock(struct socket *);
>>> +void sounlock_shared(struct socket *);
>>>
>>> int sendit(struct proc *, int, struct msghdr *, int, register_t *);
>>> int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
>>> Index: sys/systm.h
>>> ===================================================================
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v
>>> retrieving revision 1.157
>>> diff -u -p -r1.157 systm.h
>>> --- sys/systm.h 12 Jul 2022 17:12:31 -0000 1.157
>>> +++ sys/systm.h 4 Aug 2022 21:31:39 -0000
>>> @@ -341,6 +341,15 @@ extern struct rwlock netlock;
>>> #define NET_RLOCK_IN_IOCTL() do { rw_enter_read(&netlock); } while
>>> (0)
>>> #define NET_RUNLOCK_IN_IOCTL() do { rw_exit_read(&netlock); } while (0)
>>>
>>> +/*
>>> + * Reader version of NET_LOCK() to be used in send and receive syscall.
>>> + *
>>> + * Can be grabbed instead of the exclusive version when no field
>>> + * protected by the NET_LOCK() is modified by the ioctl/sysctl.
>>> + */
>>> +#define NET_RLOCK_IN_SYSCALL() do { rw_enter_read(&netlock); } while
>>> (0)
>>> +#define NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while
>>> (0)
>>> +
>>> #ifdef DIAGNOSTIC
>>>
>>> #define NET_ASSERT_UNLOCKED()
>>> \
>>
>