Also, I like to have exclusive layer locks like `tcp_lock’, `udp_lock’, etc.. And take them with shared netlock held as the first step of inet sockets unlocking.
> On 4 Aug 2022, at 02:13, Vitaliy Makkoveev <[email protected]> wrote: > >> On 4 Aug 2022, at 01:36, Alexander Bluhm <[email protected]> wrote: >> >> Hi, >> >> Divert packet has a strange design as it calls the protocol layer >> directly from pf. As pf runs in parallel, divert_packet() has a >> XXXSMP comment and kernel lock. This gives the opportunity to make >> experiments. >> >> I added a mutex in inet PCB layer. It can be taken directly in >> protocol input functions or from socket via PRU_LOCK. This makes >> it possible to run soreceive() in parallel. >> >> I see a speed increase and it runs stable. This diff is not ready >> for commit. Next I want to work on UDP input, to see if my idea >> can be more generalized. Just showing what I have now. >> >> bluhm >> > > sblock()/sbunlock() relie on exclusive socket or layer lock, > which protects `sb_flags’ modification. This diff breaks them, > because nothing stops concurrent soreceive() threads to set > SB_LOCK flag and and be sure that socket buffer is locked. > > sblock(struct socket *so, struct sockbuf *sb, int wait) > { > int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; > > soassertlocked(so); > > if ((sb->sb_flags & SB_LOCK) == 0) { > sb->sb_flags |= SB_LOCK; > return (0); > } > if (wait & M_NOWAIT) > return (EWOULDBLOCK); > > while (sb->sb_flags & SB_LOCK) { > sb->sb_flags |= SB_WANT; > error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", > INFSLP); > if (error) > return (error); > } > sb->sb_flags |= SB_LOCK; > return (0); > } > > I started the work to allow socket buffer locking without socket > lock held, but the work is far from the diff expose. > >> Index: kern/uipc_socket.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v >> retrieving revision 1.280 >> diff -u -p -r1.280 uipc_socket.c >> --- kern/uipc_socket.c 25 Jul 2022 07:28:22 -0000 1.280 >> +++ kern/uipc_socket.c 2 Aug 2022 13:54:03 -0000 >> @@ -836,10 +836,10 @@ bad: >> if (mp) >> *mp = NULL; >> >> - solock(so); >> + solock_shared(so); >> restart: >> if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { >> - sounlock(so); >> + sounlock_shared(so); >> return (error); >> } >> >> @@ -907,7 +907,7 @@ restart: >> sbunlock(so, &so->so_rcv); >> error = sbwait(so, &so->so_rcv); >> if (error) { >> - sounlock(so); >> + sounlock_shared(so); >> return (error); >> } >> goto restart; >> @@ -976,11 +976,11 @@ dontblock: >> sbsync(&so->so_rcv, nextrecord); >> if (controlp) { >> if (pr->pr_domain->dom_externalize) { >> - sounlock(so); >> + sounlock_shared(so); >> error = >> (*pr->pr_domain->dom_externalize) >> (cm, controllen, flags); >> - solock(so); >> + solock_shared(so); >> } >> *controlp = cm; >> } else { >> @@ -1054,9 +1054,9 @@ dontblock: >> SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); >> SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); >> resid = uio->uio_resid; >> - sounlock(so); >> + sounlock_shared(so); >> uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); >> - solock(so); >> + solock_shared(so); >> if (uio_error) >> uio->uio_resid = resid - len; >> } else >> @@ -1140,7 +1140,7 @@ dontblock: >> error = sbwait(so, &so->so_rcv); >> if (error) { >> sbunlock(so, &so->so_rcv); >> - sounlock(so); >> + sounlock_shared(so); >> return (0); >> } >> if ((m = so->so_rcv.sb_mb) != NULL) >> @@ -1186,7 +1186,7 @@ dontblock: >> *flagsp |= flags; >> release: >> sbunlock(so, &so->so_rcv); >> - sounlock(so); >> + sounlock_shared(so); >> return (error); >> } >> >> Index: kern/uipc_socket2.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v >> retrieving revision 1.126 >> diff -u -p -r1.126 uipc_socket2.c >> --- kern/uipc_socket2.c 25 Jul 2022 07:28:22 -0000 1.126 >> +++ kern/uipc_socket2.c 2 Aug 2022 19:07:22 -0000 >> @@ -360,6 +360,25 @@ solock(struct socket *so) >> } >> } >> >> +void >> +solock_shared(struct socket *so) >> +{ >> + switch (so->so_proto->pr_domain->dom_family) { >> + case PF_INET: >> + case PF_INET6: >> + if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) { >> + NET_RLOCK_IN_SYSCALL(); >> + (*so->so_proto->pr_usrreq)(so, PRU_LOCK, >> + NULL, NULL, NULL, NULL); >> + } else >> + NET_LOCK(); >> + break; >> + default: >> + rw_enter_write(&so->so_lock); >> + break; >> + } >> +} >> + >> int >> solock_persocket(struct socket *so) >> { >> @@ -403,6 +422,25 @@ sounlock(struct socket *so) >> } >> >> void >> +sounlock_shared(struct socket *so) >> +{ >> + switch (so->so_proto->pr_domain->dom_family) { >> + case PF_INET: >> + case PF_INET6: >> + if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) { >> + (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK, >> + NULL, NULL, NULL, NULL); >> + NET_RUNLOCK_IN_SYSCALL(); >> + } else >> + NET_UNLOCK(); >> + break; >> + default: >> + rw_exit_write(&so->so_lock); >> + break; >> + } >> +} >> + >> +void >> soassertlocked(struct socket *so) >> { >> switch (so->so_proto->pr_domain->dom_family) { >> @@ -425,7 +463,17 @@ sosleep_nsec(struct socket *so, void *id >> switch (so->so_proto->pr_domain->dom_family) { >> case PF_INET: >> case PF_INET6: >> + if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) && >> + rw_status(&netlock) == RW_READ) { >> + (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK, >> + NULL, NULL, NULL, NULL); >> + } >> ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); >> + if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) && >> + rw_status(&netlock) == RW_READ) { >> + (*so->so_proto->pr_usrreq)(so, PRU_LOCK, >> + NULL, NULL, NULL, NULL); >> + } >> break; >> default: >> ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); >> Index: netinet/in_pcb.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v >> retrieving revision 1.268 >> diff -u -p -r1.268 in_pcb.c >> --- netinet/in_pcb.c 28 Jun 2022 09:32:27 -0000 1.268 >> +++ netinet/in_pcb.c 2 Aug 2022 13:32:15 -0000 >> @@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in >> inp->inp_table = table; >> inp->inp_socket = so; >> refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB); >> + mtx_init(&inp->inp_mtx, IPL_SOFTNET); >> inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT; >> inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT; >> inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT; >> Index: netinet/in_pcb.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v >> retrieving revision 1.129 >> diff -u -p -r1.129 in_pcb.h >> --- netinet/in_pcb.h 15 May 2022 09:12:20 -0000 1.129 >> +++ netinet/in_pcb.h 2 Aug 2022 13:32:15 -0000 >> @@ -79,6 +79,7 @@ >> * I immutable after creation >> * N net lock >> * t inpt_mtx pcb table mutex >> + * p inpcb_mtx pcb mutex >> */ >> >> struct pf_state_key; >> @@ -121,6 +122,7 @@ struct inpcb { >> #define inp_route inp_ru.ru_route >> #define inp_route6 inp_ru.ru_route6 >> struct refcnt inp_refcnt; /* refcount PCB, delay memory free */ >> + struct mutex inp_mtx; /* protect PCB and socket members */ >> int inp_flags; /* generic IP/datagram flags */ >> union { /* Header prototype. */ >> struct ip hu_ip; >> Index: netinet/in_proto.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v >> retrieving revision 1.98 >> diff -u -p -r1.98 in_proto.c >> --- netinet/in_proto.c 25 Feb 2022 23:51:03 -0000 1.98 >> +++ netinet/in_proto.c 2 Aug 2022 13:54:03 -0000 >> @@ -382,7 +382,7 @@ const struct protosw inetsw[] = { >> .pr_type = SOCK_RAW, >> .pr_domain = &inetdomain, >> .pr_protocol = IPPROTO_DIVERT, >> - .pr_flags = PR_ATOMIC|PR_ADDR, >> + .pr_flags = PR_ATOMIC|PR_ADDR|PR_MPSAFE, >> .pr_ctloutput = rip_ctloutput, >> .pr_usrreq = divert_usrreq, >> .pr_attach = divert_attach, >> Index: netinet/ip_divert.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v >> retrieving revision 1.68 >> diff -u -p -r1.68 ip_divert.c >> --- netinet/ip_divert.c 9 May 2022 19:33:46 -0000 1.68 >> +++ netinet/ip_divert.c 2 Aug 2022 19:07:22 -0000 >> @@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u >> if_put(ifp); >> } >> >> + mtx_enter(&inp->inp_mtx); >> so = inp->inp_socket; >> - /* >> - * XXXSMP sbappendaddr() is not MP safe and this function is called >> - * from pf with shared netlock. To call only one sbappendaddr() from >> - * divert_packet(), protect it with kernel lock. All other places >> - * call sbappendaddr() with exclusive net lock. This blocks >> - * divert_packet() as we have the shared lock. >> - */ >> - KERNEL_LOCK(); >> if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) { >> - KERNEL_UNLOCK(); >> + mtx_leave(&inp->inp_mtx); >> divstat_inc(divs_fullsock); >> goto bad; >> } >> + mtx_leave(&inp->inp_mtx); >> sorwakeup(inp->inp_socket); >> - KERNEL_UNLOCK(); >> >> in_pcbunref(inp); >> return; >> @@ -293,6 +286,14 @@ divert_usrreq(struct socket *so, int req >> break; >> >> case PRU_SENSE: >> + break; >> + >> + case PRU_LOCK: >> + mtx_enter(&inp->inp_mtx); >> + break; >> + >> + case PRU_UNLOCK: >> + mtx_leave(&inp->inp_mtx); >> break; >> >> case PRU_LISTEN: >> Index: sys/protosw.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v >> retrieving revision 1.35 >> diff -u -p -r1.35 protosw.h >> --- sys/protosw.h 25 Feb 2022 23:51:04 -0000 1.35 >> +++ sys/protosw.h 2 Aug 2022 13:54:03 -0000 >> @@ -108,6 +108,7 @@ struct protosw { >> #define PR_ABRTACPTDIS 0x20 /* abort on accept(2) to >> disconnected >> socket */ >> #define PR_SPLICE 0x40 /* socket splicing is possible >> */ >> +#define PR_MPSAFE 0x80 /* shared net lock for >> send/recv */ >> >> /* >> * The arguments to usrreq are: >> @@ -144,8 +145,10 @@ struct protosw { >> #define PRU_SLOWTIMO 19 /* 500ms timeout */ >> #define PRU_PROTORCV 20 /* receive from below */ >> #define PRU_PROTOSEND 21 /* send to below */ >> +#define PRU_LOCK 22 /* lock protocol layer */ >> +#define PRU_UNLOCK 23 /* unlock protocol layer */ >> >> -#define PRU_NREQ 22 >> +#define PRU_NREQ 24 >> >> #ifdef PRUREQUESTS >> const char *prurequests[] = { >> @@ -154,7 +157,7 @@ const char *prurequests[] = { >> "RCVD", "SEND", "ABORT", "CONTROL", >> "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", >> "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", >> - "PROTORCV", "PROTOSEND", >> + "PROTORCV", "PROTOSEND", "LOCK", "UNLOCK" >> }; >> #endif >> >> Index: sys/socketvar.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v >> retrieving revision 1.106 >> diff -u -p -r1.106 socketvar.h >> --- sys/socketvar.h 15 Jul 2022 17:20:24 -0000 1.106 >> +++ sys/socketvar.h 2 Aug 2022 13:54:03 -0000 >> @@ -346,9 +346,11 @@ int sockargs(struct mbuf **, const void >> >> int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t); >> void solock(struct socket *); >> +void solock_shared(struct socket *); >> int solock_persocket(struct socket *); >> void solock_pair(struct socket *, struct socket *); >> void sounlock(struct socket *); >> +void sounlock_shared(struct socket *); >> >> int sendit(struct proc *, int, struct msghdr *, int, register_t *); >> int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *); >> Index: sys/systm.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v >> retrieving revision 1.157 >> diff -u -p -r1.157 systm.h >> --- sys/systm.h 12 Jul 2022 17:12:31 -0000 1.157 >> +++ sys/systm.h 2 Aug 2022 19:07:22 -0000 >> @@ -341,6 +341,15 @@ extern struct rwlock netlock; >> #define NET_RLOCK_IN_IOCTL() do { rw_enter_read(&netlock); } while >> (0) >> #define NET_RUNLOCK_IN_IOCTL() do { rw_exit_read(&netlock); } while (0) >> >> +/* >> + * Reader version of NET_LOCK() to be used in send and receive syscall. >> + * >> + * Can be grabbed instead of the exclusive version when no field >> + * protected by the NET_LOCK() is modified by the ioctl/sysctl. >> + */ >> +#define NET_RLOCK_IN_SYSCALL() do { rw_enter_read(&netlock); } while >> (0) >> +#define NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while >> (0) >> + >> #ifdef DIAGNOSTIC >> >> #define NET_ASSERT_UNLOCKED() >> \
