Hi,
The divert packet path has a strange design: it calls the protocol
layer directly from pf. As pf runs in parallel, divert_packet() has
an XXXSMP comment and takes the kernel lock. This makes it a good
place to experiment.
I added a mutex to the inet PCB layer. It can be taken directly in
the protocol input functions, or from the socket layer via PRU_LOCK.
This makes it possible to run soreceive() in parallel.
I see a speed increase and it runs stably. This diff is not ready
for commit. Next I want to work on UDP input, to see whether my idea
can be generalized. I am just showing what I have now.
bluhm
Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.280
diff -u -p -r1.280 uipc_socket.c
--- kern/uipc_socket.c 25 Jul 2022 07:28:22 -0000 1.280
+++ kern/uipc_socket.c 2 Aug 2022 13:54:03 -0000
@@ -836,10 +836,10 @@ bad:
if (mp)
*mp = NULL;
- solock(so);
+ solock_shared(so);
restart:
if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
- sounlock(so);
+ sounlock_shared(so);
return (error);
}
@@ -907,7 +907,7 @@ restart:
sbunlock(so, &so->so_rcv);
error = sbwait(so, &so->so_rcv);
if (error) {
- sounlock(so);
+ sounlock_shared(so);
return (error);
}
goto restart;
@@ -976,11 +976,11 @@ dontblock:
sbsync(&so->so_rcv, nextrecord);
if (controlp) {
if (pr->pr_domain->dom_externalize) {
- sounlock(so);
+ sounlock_shared(so);
error =
(*pr->pr_domain->dom_externalize)
(cm, controllen, flags);
- solock(so);
+ solock_shared(so);
}
*controlp = cm;
} else {
@@ -1054,9 +1054,9 @@ dontblock:
SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
resid = uio->uio_resid;
- sounlock(so);
+ sounlock_shared(so);
uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
- solock(so);
+ solock_shared(so);
if (uio_error)
uio->uio_resid = resid - len;
} else
@@ -1140,7 +1140,7 @@ dontblock:
error = sbwait(so, &so->so_rcv);
if (error) {
sbunlock(so, &so->so_rcv);
- sounlock(so);
+ sounlock_shared(so);
return (0);
}
if ((m = so->so_rcv.sb_mb) != NULL)
@@ -1186,7 +1186,7 @@ dontblock:
*flagsp |= flags;
release:
sbunlock(so, &so->so_rcv);
- sounlock(so);
+ sounlock_shared(so);
return (error);
}
Index: kern/uipc_socket2.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.126
diff -u -p -r1.126 uipc_socket2.c
--- kern/uipc_socket2.c 25 Jul 2022 07:28:22 -0000 1.126
+++ kern/uipc_socket2.c 2 Aug 2022 19:07:22 -0000
@@ -360,6 +360,25 @@ solock(struct socket *so)
}
}
+void
+solock_shared(struct socket *so) /* lock so; shared net lock if the protocol is PR_MPSAFE */
+{
+ switch (so->so_proto->pr_domain->dom_family) {
+ case PF_INET:
+ case PF_INET6:
+ if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) { /* protocol opted in to the shared net lock */
+ NET_RLOCK_IN_SYSCALL(); /* read lock on netlock is taken first */
+ (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
+ NULL, NULL, NULL, NULL); /* then the protocol layer lock; divert takes inp_mtx here */
+ } else
+ NET_LOCK(); /* protocol not marked PR_MPSAFE: plain NET_LOCK() */
+ break;
+ default:
+ rw_enter_write(&so->so_lock); /* other domains: write-lock so->so_lock */
+ break;
+ }
+}
+
int
solock_persocket(struct socket *so)
{
@@ -403,6 +422,25 @@ sounlock(struct socket *so)
}
void
+sounlock_shared(struct socket *so) /* undo solock_shared(), releasing in reverse order */
+{
+ switch (so->so_proto->pr_domain->dom_family) {
+ case PF_INET:
+ case PF_INET6:
+ if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) { /* same test as in solock_shared() */
+ (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
+ NULL, NULL, NULL, NULL); /* protocol layer lock is dropped first */
+ NET_RUNLOCK_IN_SYSCALL(); /* then the read lock on netlock */
+ } else
+ NET_UNLOCK(); /* pairs with NET_LOCK() in solock_shared() */
+ break;
+ default:
+ rw_exit_write(&so->so_lock); /* pairs with rw_enter_write() on so->so_lock */
+ break;
+ }
+}
+
+void
soassertlocked(struct socket *so)
{
switch (so->so_proto->pr_domain->dom_family) {
@@ -425,7 +463,17 @@ sosleep_nsec(struct socket *so, void *id
switch (so->so_proto->pr_domain->dom_family) {
case PF_INET:
case PF_INET6:
+ if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
+ rw_status(&netlock) == RW_READ) {
+ (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
+ NULL, NULL, NULL, NULL);
+ }
ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+ if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
+ rw_status(&netlock) == RW_READ) {
+ (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
+ NULL, NULL, NULL, NULL);
+ }
break;
default:
ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.268
diff -u -p -r1.268 in_pcb.c
--- netinet/in_pcb.c 28 Jun 2022 09:32:27 -0000 1.268
+++ netinet/in_pcb.c 2 Aug 2022 13:32:15 -0000
@@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in
inp->inp_table = table;
inp->inp_socket = so;
refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
+ mtx_init(&inp->inp_mtx, IPL_SOFTNET);
inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.129
diff -u -p -r1.129 in_pcb.h
--- netinet/in_pcb.h 15 May 2022 09:12:20 -0000 1.129
+++ netinet/in_pcb.h 2 Aug 2022 13:32:15 -0000
@@ -79,6 +79,7 @@
* I immutable after creation
* N net lock
* t inpt_mtx pcb table mutex
+ * p inp_mtx pcb mutex
*/
struct pf_state_key;
@@ -121,6 +122,7 @@ struct inpcb {
#define inp_route inp_ru.ru_route
#define inp_route6 inp_ru.ru_route6
struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
+ struct mutex inp_mtx; /* protect PCB and socket members */
int inp_flags; /* generic IP/datagram flags */
union { /* Header prototype. */
struct ip hu_ip;
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
retrieving revision 1.98
diff -u -p -r1.98 in_proto.c
--- netinet/in_proto.c 25 Feb 2022 23:51:03 -0000 1.98
+++ netinet/in_proto.c 2 Aug 2022 13:54:03 -0000
@@ -382,7 +382,7 @@ const struct protosw inetsw[] = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_DIVERT,
- .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_MPSAFE,
.pr_ctloutput = rip_ctloutput,
.pr_usrreq = divert_usrreq,
.pr_attach = divert_attach,
Index: netinet/ip_divert.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.68
diff -u -p -r1.68 ip_divert.c
--- netinet/ip_divert.c 9 May 2022 19:33:46 -0000 1.68
+++ netinet/ip_divert.c 2 Aug 2022 19:07:22 -0000
@@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u
if_put(ifp);
}
+ mtx_enter(&inp->inp_mtx);
so = inp->inp_socket;
- /*
- * XXXSMP sbappendaddr() is not MP safe and this function is called
- * from pf with shared netlock. To call only one sbappendaddr() from
- * divert_packet(), protect it with kernel lock. All other places
- * call sbappendaddr() with exclusive net lock. This blocks
- * divert_packet() as we have the shared lock.
- */
- KERNEL_LOCK();
if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
- KERNEL_UNLOCK();
+ mtx_leave(&inp->inp_mtx);
divstat_inc(divs_fullsock);
goto bad;
}
+ mtx_leave(&inp->inp_mtx);
sorwakeup(inp->inp_socket);
- KERNEL_UNLOCK();
in_pcbunref(inp);
return;
@@ -293,6 +286,14 @@ divert_usrreq(struct socket *so, int req
break;
case PRU_SENSE:
+ break;
+
+ case PRU_LOCK:
+ mtx_enter(&inp->inp_mtx);
+ break;
+
+ case PRU_UNLOCK:
+ mtx_leave(&inp->inp_mtx);
break;
case PRU_LISTEN:
Index: sys/protosw.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
retrieving revision 1.35
diff -u -p -r1.35 protosw.h
--- sys/protosw.h 25 Feb 2022 23:51:04 -0000 1.35
+++ sys/protosw.h 2 Aug 2022 13:54:03 -0000
@@ -108,6 +108,7 @@ struct protosw {
#define PR_ABRTACPTDIS 0x20 /* abort on accept(2) to disconnected
socket */
#define PR_SPLICE 0x40 /* socket splicing is possible */
+#define PR_MPSAFE 0x80 /* shared net lock for send/recv */
/*
* The arguments to usrreq are:
@@ -144,8 +145,10 @@ struct protosw {
#define PRU_SLOWTIMO 19 /* 500ms timeout */
#define PRU_PROTORCV 20 /* receive from below */
#define PRU_PROTOSEND 21 /* send to below */
+#define PRU_LOCK 22 /* lock protocol layer */
+#define PRU_UNLOCK 23 /* unlock protocol layer */
-#define PRU_NREQ 22
+#define PRU_NREQ 24
#ifdef PRUREQUESTS
const char *prurequests[] = {
@@ -154,7 +157,7 @@ const char *prurequests[] = {
"RCVD", "SEND", "ABORT", "CONTROL",
"SENSE", "RCVOOB", "SENDOOB", "SOCKADDR",
"PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO",
- "PROTORCV", "PROTOSEND",
+ "PROTORCV", "PROTOSEND", "LOCK", "UNLOCK"
};
#endif
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.106
diff -u -p -r1.106 socketvar.h
--- sys/socketvar.h 15 Jul 2022 17:20:24 -0000 1.106
+++ sys/socketvar.h 2 Aug 2022 13:54:03 -0000
@@ -346,9 +346,11 @@ int sockargs(struct mbuf **, const void
int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
void solock(struct socket *);
+void solock_shared(struct socket *);
int solock_persocket(struct socket *);
void solock_pair(struct socket *, struct socket *);
void sounlock(struct socket *);
+void sounlock_shared(struct socket *);
int sendit(struct proc *, int, struct msghdr *, int, register_t *);
int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
Index: sys/systm.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v
retrieving revision 1.157
diff -u -p -r1.157 systm.h
--- sys/systm.h 12 Jul 2022 17:12:31 -0000 1.157
+++ sys/systm.h 2 Aug 2022 19:07:22 -0000
@@ -341,6 +341,15 @@ extern struct rwlock netlock;
#define NET_RLOCK_IN_IOCTL() do { rw_enter_read(&netlock); } while (0)
#define NET_RUNLOCK_IN_IOCTL() do { rw_exit_read(&netlock); } while (0)
+/*
+ * Reader version of NET_LOCK() to be used in send and receive syscall.
+ *
+ * Can be grabbed instead of the exclusive version when no field
+ * protected by the NET_LOCK() is modified by the syscall.
+ */
+#define NET_RLOCK_IN_SYSCALL() do { rw_enter_read(&netlock); } while (0)
+#define NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while (0)
+
#ifdef DIAGNOSTIC
#define NET_ASSERT_UNLOCKED()
\