Regress tested with witness, rebased to current, anyone?
On Wed, Mar 02, 2022 at 07:13:09PM +0100, Alexander Bluhm wrote:
> Hi,
>
> pf_socket_lookup() calls in_pcbhashlookup() in the PCB layer. So
> to run pf in parallel we must make parts of the stack MP safe.
> Protect the list and hashes in the PCB tables with a mutex.
>
> Note that the protocol notify functions may call pf via tcp_output().
> As the pf lock is a sleeping rw_lock, we must not hold a mutex. To
> solve this for now, I collect these PCBs in inp_notify list and
> protect it with the exclusive netlock.
>
> ok?
>
> bluhm
Index: kern/kern_sysctl.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.399
diff -u -p -r1.399 kern_sysctl.c
--- kern/kern_sysctl.c 25 Jan 2022 04:04:40 -0000 1.399
+++ kern/kern_sysctl.c 11 Mar 2022 15:07:13 -0000
@@ -1366,16 +1366,24 @@ sysctl_file(int *name, u_int namelen, ch
struct inpcb *inp;
NET_LOCK();
+ mtx_enter(&tcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
FILLSO(inp->inp_socket);
+ mtx_leave(&tcbtable.inpt_mtx);
+ mtx_enter(&udbtable.inpt_mtx);
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue)
FILLSO(inp->inp_socket);
+ mtx_leave(&udbtable.inpt_mtx);
+ mtx_enter(&rawcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue)
FILLSO(inp->inp_socket);
+ mtx_leave(&rawcbtable.inpt_mtx);
#ifdef INET6
+ mtx_enter(&rawin6pcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawin6pcbtable.inpt_queue,
inp_queue)
FILLSO(inp->inp_socket);
+ mtx_leave(&rawin6pcbtable.inpt_mtx);
#endif
NET_UNLOCK();
}
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.259
diff -u -p -r1.259 in_pcb.c
--- netinet/in_pcb.c 4 Mar 2022 20:35:10 -0000 1.259
+++ netinet/in_pcb.c 11 Mar 2022 15:07:13 -0000
@@ -120,7 +120,8 @@ struct baddynamicports baddynamicports;
struct baddynamicports rootonlyports;
struct pool inpcb_pool;
-int in_pcbresize (struct inpcbtable *, int);
+void in_pcbrehash_locked(struct inpcb *);
+int in_pcbresize(struct inpcbtable *, int);
#define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4)
@@ -173,7 +174,7 @@ in_pcblhash(struct inpcbtable *table, in
void
in_pcbinit(struct inpcbtable *table, int hashsize)
{
-
+ mtx_init(&table->inpt_mtx, IPL_SOFTNET);
TAILQ_INIT(&table->inpt_queue);
table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
&table->inpt_mask);
@@ -252,6 +253,7 @@ in_pcballoc(struct socket *so, struct in
inp->inp_cksum6 = -1;
#endif /* INET6 */
+ mtx_enter(&table->inpt_mtx);
if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size))
(void)in_pcbresize(table, table->inpt_size * 2);
TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
@@ -268,6 +270,8 @@ in_pcballoc(struct socket *so, struct in
&inp->inp_faddr, inp->inp_fport,
&inp->inp_laddr, inp->inp_lport);
LIST_INSERT_HEAD(head, inp, inp_hash);
+ mtx_leave(&table->inpt_mtx);
+
so->so_pcb = inp;
return (0);
@@ -556,6 +560,7 @@ void
in_pcbdetach(struct inpcb *inp)
{
struct socket *so = inp->inp_socket;
+ struct inpcbtable *table = inp->inp_table;
NET_ASSERT_LOCKED();
@@ -585,10 +590,13 @@ in_pcbdetach(struct inpcb *inp)
pf_inp_unlink(inp);
}
#endif
+ mtx_enter(&table->inpt_mtx);
LIST_REMOVE(inp, inp_lhash);
LIST_REMOVE(inp, inp_hash);
- TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue);
- inp->inp_table->inpt_count--;
+ TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue);
+ table->inpt_count--;
+ mtx_leave(&table->inpt_mtx);
+
in_pcbunref(inp);
}
@@ -661,20 +669,25 @@ void
in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable,
int errno, void (*notify)(struct inpcb *, int))
{
- struct inpcb *inp, *ninp;
+ SIMPLEQ_HEAD(, inpcb) inpcblist;
+ struct inpcb *inp;
struct in_addr faddr;
u_int rdomain;
- NET_ASSERT_LOCKED();
+ NET_ASSERT_WLOCKED();
if (dst->sa_family != AF_INET)
return;
faddr = satosin(dst)->sin_addr;
if (faddr.s_addr == INADDR_ANY)
return;
+ if (notify == NULL)
+ return;
+ SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
- TAILQ_FOREACH_SAFE(inp, &table->inpt_queue, inp_queue, ninp) {
+ mtx_enter(&table->inpt_mtx);
+ TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
#ifdef INET6
if (inp->inp_flags & INP_IPV6)
continue;
@@ -684,8 +697,15 @@ in_pcbnotifyall(struct inpcbtable *table
inp->inp_socket == NULL) {
continue;
}
- if (notify)
- (*notify)(inp, errno);
+ in_pcbref(inp);
+ SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
+ }
+ mtx_leave(&table->inpt_mtx);
+
+ while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
+ SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+ (*notify)(inp, errno);
+ in_pcbunref(inp);
}
}
@@ -759,6 +779,7 @@ in_pcblookup_local(struct inpcbtable *ta
u_int rdomain;
rdomain = rtable_l2(rtable);
+ mtx_enter(&table->inpt_mtx);
head = in_pcblhash(table, rdomain, lport);
LIST_FOREACH(inp, head, inp_lhash) {
if (rtable_l2(inp->inp_rtableid) != rdomain)
@@ -809,6 +830,8 @@ in_pcblookup_local(struct inpcbtable *ta
break;
}
}
+ mtx_leave(&table->inpt_mtx);
+
return (match);
}
@@ -977,9 +1000,20 @@ void
in_pcbrehash(struct inpcb *inp)
{
struct inpcbtable *table = inp->inp_table;
+
+ mtx_enter(&table->inpt_mtx);
+ in_pcbrehash_locked(inp);
+ mtx_leave(&table->inpt_mtx);
+}
+
+void
+in_pcbrehash_locked(struct inpcb *inp)
+{
+ struct inpcbtable *table = inp->inp_table;
struct inpcbhead *head;
NET_ASSERT_LOCKED();
+ MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
LIST_REMOVE(inp, inp_lhash);
head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport);
@@ -1006,6 +1040,8 @@ in_pcbresize(struct inpcbtable *table, i
void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl;
struct inpcb *inp;
+ MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
+
ohashtbl = table->inpt_hashtbl;
olhashtbl = table->inpt_lhashtbl;
osize = table->inpt_size;
@@ -1027,7 +1063,7 @@ in_pcbresize(struct inpcbtable *table, i
arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey));
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
- in_pcbrehash(inp);
+ in_pcbrehash_locked(inp);
}
hashfree(ohashtbl, osize, M_PCB);
hashfree(olhashtbl, osize, M_PCB);
@@ -1058,6 +1094,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
u_int rdomain;
rdomain = rtable_l2(rtable);
+ mtx_enter(&table->inpt_mtx);
head = in_pcbhash(table, rdomain, &faddr, fport, &laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
@@ -1080,6 +1117,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
break;
}
}
+ mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
@@ -1141,6 +1179,7 @@ in_pcblookup_listen(struct inpcbtable *t
#endif
rdomain = rtable_l2(rtable);
+ mtx_enter(&table->inpt_mtx);
head = in_pcbhash(table, rdomain, &zeroin_addr, 0, key1, lport);
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
@@ -1177,6 +1216,7 @@ in_pcblookup_listen(struct inpcbtable *t
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
+ mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: laddr=%08x lport=%d rdom=%u\n",
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.123
diff -u -p -r1.123 in_pcb.h
--- netinet/in_pcb.h 2 Mar 2022 12:53:15 -0000 1.123
+++ netinet/in_pcb.h 11 Mar 2022 15:07:13 -0000
@@ -73,6 +73,13 @@
#include <crypto/siphash.h>
+/*
+ * Locks used to protect struct members in this file:
+ * I immutable after creation
+ * N net lock
+ * t inpt_mtx pcb table mutex
+ */
+
struct pf_state_key;
union inpaddru {
@@ -91,10 +98,11 @@ union inpaddru {
* control block.
*/
struct inpcb {
- LIST_ENTRY(inpcb) inp_hash; /* local and foreign hash */
- LIST_ENTRY(inpcb) inp_lhash; /* local port hash */
- TAILQ_ENTRY(inpcb) inp_queue; /* inet PCB queue */
- struct inpcbtable *inp_table; /* inet queue/hash table */
+ LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */
+ LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */
+ TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */
+ SIMPLEQ_ENTRY(inpcb) inp_notify; /* [N] queue to notify PCB */
+ struct inpcbtable *inp_table; /* [I] inet queue/hash table */
union inpaddru inp_faddru; /* Foreign address. */
union inpaddru inp_laddru; /* Local address. */
#define inp_faddr inp_faddru.iau_a4u.inaddr
@@ -154,12 +162,13 @@ struct inpcb {
LIST_HEAD(inpcbhead, inpcb);
struct inpcbtable {
- TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* inet PCB queue */
- struct inpcbhead *inpt_hashtbl; /* local and foreign hash */
- struct inpcbhead *inpt_lhashtbl; /* local port hash */
- SIPHASH_KEY inpt_key, inpt_lkey; /* secrets for hashes */
- u_long inpt_mask, inpt_lmask; /* hash masks */
- int inpt_count, inpt_size; /* queue count, hash size */
+ struct mutex inpt_mtx; /* protect queue and hash */
+ TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */
+ struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */
+ struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */
+ SIPHASH_KEY inpt_key, inpt_lkey; /* [t] secrets for hashes */
+ u_long inpt_mask, inpt_lmask; /* [t] hash masks */
+ int inpt_count, inpt_size; /* [t] queue count, hash size */
};
/* flags in inp_flags: */
Index: netinet/raw_ip.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
retrieving revision 1.121
diff -u -p -r1.121 raw_ip.c
--- netinet/raw_ip.c 25 Feb 2022 23:51:03 -0000 1.121
+++ netinet/raw_ip.c 11 Mar 2022 15:07:13 -0000
@@ -151,6 +151,7 @@ rip_input(struct mbuf **mp, int *offp, i
}
#endif
NET_ASSERT_LOCKED();
+ mtx_enter(&rawcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
continue;
@@ -190,6 +191,8 @@ rip_input(struct mbuf **mp, int *offp, i
}
last = inp;
}
+ mtx_leave(&rawcbtable.inpt_mtx);
+
if (last) {
if (last->inp_flags & INP_CONTROLOPTS ||
last->inp_socket->so_options & SO_TIMESTAMP)
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.272
diff -u -p -r1.272 udp_usrreq.c
--- netinet/udp_usrreq.c 2 Mar 2022 12:53:15 -0000 1.272
+++ netinet/udp_usrreq.c 11 Mar 2022 15:07:13 -0000
@@ -365,6 +365,7 @@ udp_input(struct mbuf **mp, int *offp, i
*/
last = NULL;
NET_ASSERT_LOCKED();
+ mtx_enter(&udbtable.inpt_mtx);
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
continue;
@@ -440,6 +441,7 @@ udp_input(struct mbuf **mp, int *offp, i
SO_REUSEADDR)) == 0)
break;
}
+ mtx_leave(&udbtable.inpt_mtx);
if (last == NULL) {
/*
Index: netinet6/in6_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
retrieving revision 1.113
diff -u -p -r1.113 in6_pcb.c
--- netinet6/in6_pcb.c 2 Mar 2022 12:53:15 -0000 1.113
+++ netinet6/in6_pcb.c 11 Mar 2022 15:07:13 -0000
@@ -369,14 +369,15 @@ in6_pcbnotify(struct inpcbtable *table,
uint fport_arg, const struct sockaddr_in6 *src, uint lport_arg,
u_int rtable, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int))
{
- struct inpcb *inp, *ninp;
+ SIMPLEQ_HEAD(, inpcb) inpcblist;
+ struct inpcb *inp;
u_short fport = fport_arg, lport = lport_arg;
struct sockaddr_in6 sa6_src;
int errno;
u_int32_t flowinfo;
u_int rdomain;
- NET_ASSERT_LOCKED();
+ NET_ASSERT_WLOCKED();
if ((unsigned)cmd >= PRC_NCMDS)
return;
@@ -414,9 +415,13 @@ in6_pcbnotify(struct inpcbtable *table,
notify = in_rtchange;
}
errno = inet6ctlerrmap[cmd];
+ if (notify == NULL)
+ return;
+ SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
- TAILQ_FOREACH_SAFE(inp, &table->inpt_queue, inp_queue, ninp) {
+ mtx_enter(&table->inpt_mtx);
+ TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if ((inp->inp_flags & INP_IPV6) == 0)
continue;
@@ -488,8 +493,15 @@ in6_pcbnotify(struct inpcbtable *table,
continue;
}
do_notify:
- if (notify)
- (*notify)(inp, errno);
+ in_pcbref(inp);
+ SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
+ }
+ mtx_leave(&table->inpt_mtx);
+
+ while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
+ SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+ (*notify)(inp, errno);
+ in_pcbunref(inp);
}
}
@@ -504,6 +516,7 @@ in6_pcbhashlookup(struct inpcbtable *tab
u_int rdomain;
rdomain = rtable_l2(rtable);
+ mtx_enter(&table->inpt_mtx);
head = in6_pcbhash(table, rdomain, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (!(inp->inp_flags & INP_IPV6))
@@ -524,6 +537,7 @@ in6_pcbhashlookup(struct inpcbtable *tab
break;
}
}
+ mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: faddr= fport=%d laddr= lport=%d rdom=%u\n",
@@ -574,6 +588,7 @@ in6_pcblookup_listen(struct inpcbtable *
#endif
rdomain = rtable_l2(rtable);
+ mtx_enter(&table->inpt_mtx);
head = in6_pcbhash(table, rdomain, &zeroin6_addr, 0, key1, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (!(inp->inp_flags & INP_IPV6))
@@ -606,6 +621,7 @@ in6_pcblookup_listen(struct inpcbtable *
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
+ mtx_leave(&table->inpt_mtx);
#ifdef DIAGNOSTIC
if (inp == NULL && in_pcbnotifymiss) {
printf("%s: laddr= lport=%d rdom=%u\n",
Index: netinet6/raw_ip6.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
retrieving revision 1.142
diff -u -p -r1.142 raw_ip6.c
--- netinet6/raw_ip6.c 2 Mar 2022 12:53:15 -0000 1.142
+++ netinet6/raw_ip6.c 11 Mar 2022 15:07:13 -0000
@@ -157,6 +157,7 @@ rip6_input(struct mbuf **mp, int *offp,
}
#endif
NET_ASSERT_LOCKED();
+ mtx_enter(&rawin6pcbtable.inpt_mtx);
TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
continue;
@@ -180,8 +181,10 @@ rip6_input(struct mbuf **mp, int *offp,
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
sizeof(*icmp6));
- if (icmp6 == NULL)
+ if (icmp6 == NULL) {
+ mtx_leave(&rawin6pcbtable.inpt_mtx);
return IPPROTO_DONE;
+ }
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
in6p->inp_icmp6filt))
continue;
@@ -224,6 +227,8 @@ rip6_input(struct mbuf **mp, int *offp,
}
last = in6p;
}
+ mtx_leave(&rawin6pcbtable.inpt_mtx);
+
if (last) {
if (last->inp_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(last, m, &opts);