Hi,
To run over all PCBs, I have introduced temporary notify lists. They
are necessary as the list of all PCBs is protected by a mutex. Notify
may eventually call ip_output(), which may sleep as the pf lock is an
rwlock. Also, the SRP implementation of the route table might sleep.
The same inp_notify is used for UDP multicast and raw IP delivery.
This could be a mutex, but use the same list entry and rwlock anyway.
Currently the inp_notify list is protected by the exclusive netlock.
Introduce a new rwlock per PCB table to relax this requirement to
the shared netlock.
ok?
bluhm
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.271
diff -u -p -r1.271 in_pcb.c
--- netinet/in_pcb.c 21 Aug 2022 11:44:53 -0000 1.271
+++ netinet/in_pcb.c 22 Aug 2022 07:07:35 -0000
@@ -175,6 +175,7 @@ void
in_pcbinit(struct inpcbtable *table, int hashsize)
{
mtx_init(&table->inpt_mtx, IPL_SOFTNET);
+ rw_init(&table->inpt_notify, "inpnotify");
TAILQ_INIT(&table->inpt_queue);
table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
&table->inpt_mask);
@@ -696,8 +697,6 @@ in_pcbnotifyall(struct inpcbtable *table
struct in_addr faddr;
u_int rdomain;
- NET_ASSERT_LOCKED_EXCLUSIVE();
-
if (dst->sa_family != AF_INET)
return;
faddr = satosin(dst)->sin_addr;
@@ -708,6 +707,7 @@ in_pcbnotifyall(struct inpcbtable *table
SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
+ rw_enter_write(&table->inpt_notify);
mtx_enter(&table->inpt_mtx);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
#ifdef INET6
@@ -729,6 +729,7 @@ in_pcbnotifyall(struct inpcbtable *table
(*notify)(inp, errno);
in_pcbunref(inp);
}
+ rw_exit_write(&table->inpt_notify);
}
/*
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.130
diff -u -p -r1.130 in_pcb.h
--- netinet/in_pcb.h 21 Aug 2022 11:44:53 -0000 1.130
+++ netinet/in_pcb.h 22 Aug 2022 07:07:35 -0000
@@ -66,6 +66,7 @@
#include <sys/queue.h>
#include <sys/mutex.h>
+#include <sys/rwlock.h>
#include <sys/refcnt.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
@@ -79,6 +80,7 @@
* I immutable after creation
* N net lock
* t inpt_mtx pcb table mutex
+ * y inpt_notify pcb table rwlock for notify
* p inpcb_mtx pcb mutex
*/
@@ -103,7 +105,7 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */
LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */
TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */
- SIMPLEQ_ENTRY(inpcb) inp_notify; /* [N] notify or udp append */
+ SIMPLEQ_ENTRY(inpcb) inp_notify; /* [y] notify or udp append */
struct inpcbtable *inp_table; /* [I] inet queue/hash table */
union inpaddru inp_faddru; /* Foreign address. */
union inpaddru inp_laddru; /* Local address. */
@@ -166,6 +168,7 @@ LIST_HEAD(inpcbhead, inpcb);
struct inpcbtable {
struct mutex inpt_mtx; /* protect queue and hash */
+ struct rwlock inpt_notify; /* protect inp_notify list */
TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */
struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */
struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */
Index: netinet/raw_ip.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
retrieving revision 1.133
diff -u -p -r1.133 raw_ip.c
--- netinet/raw_ip.c 21 Aug 2022 22:45:55 -0000 1.133
+++ netinet/raw_ip.c 22 Aug 2022 07:07:35 -0000
@@ -160,8 +160,8 @@ rip_input(struct mbuf **mp, int *offp, i
}
}
#endif
- NET_ASSERT_LOCKED_EXCLUSIVE();
SIMPLEQ_INIT(&inpcblist);
+ rw_enter_write(&rawcbtable.inpt_notify);
mtx_enter(&rawcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
@@ -189,6 +189,8 @@ rip_input(struct mbuf **mp, int *offp, i
mtx_leave(&rawcbtable.inpt_mtx);
if (SIMPLEQ_EMPTY(&inpcblist)) {
+ rw_exit_write(&rawcbtable.inpt_notify);
+
if (ip->ip_p != IPPROTO_ICMP)
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
0, 0);
@@ -199,6 +201,8 @@ rip_input(struct mbuf **mp, int *offp, i
counters[ips_noproto]++;
counters[ips_delivered]--;
counters_leave(&ref, ipcounters);
+
+ return IPPROTO_DONE;
}
while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
@@ -224,6 +228,8 @@ rip_input(struct mbuf **mp, int *offp, i
}
in_pcbunref(inp);
}
+ rw_exit_write(&rawcbtable.inpt_notify);
+
return IPPROTO_DONE;
}
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.285
diff -u -p -r1.285 udp_usrreq.c
--- netinet/udp_usrreq.c 21 Aug 2022 22:45:55 -0000 1.285
+++ netinet/udp_usrreq.c 22 Aug 2022 07:14:17 -0000
@@ -372,8 +372,8 @@ udp_input(struct mbuf **mp, int *offp, i
* Locate pcb(s) for datagram.
* (Algorithm copied from raw_intr().)
*/
- NET_ASSERT_LOCKED_EXCLUSIVE();
SIMPLEQ_INIT(&inpcblist);
+ rw_enter_write(&udbtable.inpt_notify);
mtx_enter(&udbtable.inpt_mtx);
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
@@ -446,6 +446,8 @@ udp_input(struct mbuf **mp, int *offp, i
mtx_leave(&udbtable.inpt_mtx);
if (SIMPLEQ_EMPTY(&inpcblist)) {
+ rw_exit_write(&udbtable.inpt_notify);
+
/*
* No matching pcb found; discard datagram.
* (No need to send an ICMP Port Unreachable
@@ -469,6 +471,8 @@ udp_input(struct mbuf **mp, int *offp, i
}
in_pcbunref(inp);
}
+ rw_exit_write(&udbtable.inpt_notify);
+
return IPPROTO_DONE;
}
/*
Index: netinet6/in6_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
retrieving revision 1.119
diff -u -p -r1.119 in6_pcb.c
--- netinet6/in6_pcb.c 8 Aug 2022 12:06:31 -0000 1.119
+++ netinet6/in6_pcb.c 22 Aug 2022 07:07:35 -0000
@@ -387,8 +387,6 @@ in6_pcbnotify(struct inpcbtable *table,
u_int32_t flowinfo;
u_int rdomain;
- NET_ASSERT_LOCKED_EXCLUSIVE();
-
if ((unsigned)cmd >= PRC_NCMDS)
return;
@@ -430,6 +428,7 @@ in6_pcbnotify(struct inpcbtable *table,
SIMPLEQ_INIT(&inpcblist);
rdomain = rtable_l2(rtable);
+ rw_enter_write(&table->inpt_notify);
mtx_enter(&table->inpt_mtx);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if ((inp->inp_flags & INP_IPV6) == 0)
@@ -513,6 +512,7 @@ in6_pcbnotify(struct inpcbtable *table,
(*notify)(inp, errno);
in_pcbunref(inp);
}
+ rw_exit_write(&table->inpt_notify);
}
struct inpcb *
Index: netinet6/raw_ip6.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
retrieving revision 1.153
diff -u -p -r1.153 raw_ip6.c
--- netinet6/raw_ip6.c 21 Aug 2022 22:45:55 -0000 1.153
+++ netinet6/raw_ip6.c 22 Aug 2022 07:07:35 -0000
@@ -172,8 +172,8 @@ rip6_input(struct mbuf **mp, int *offp,
}
}
#endif
- NET_ASSERT_LOCKED_EXCLUSIVE();
SIMPLEQ_INIT(&inpcblist);
+ rw_enter_write(&rawin6pcbtable.inpt_notify);
mtx_enter(&rawin6pcbtable.inpt_mtx);
TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
@@ -224,6 +224,8 @@ rip6_input(struct mbuf **mp, int *offp,
struct counters_ref ref;
uint64_t *counters;
+ rw_exit_write(&rawin6pcbtable.inpt_notify);
+
if (proto != IPPROTO_ICMPV6) {
rip6stat_inc(rip6s_nosock);
if (m->m_flags & M_MCAST)
@@ -240,6 +242,8 @@ rip6_input(struct mbuf **mp, int *offp,
counters = counters_enter(&ref, ip6counters);
counters[ip6s_delivered]--;
counters_leave(&ref, ip6counters);
+
+ return IPPROTO_DONE;
}
while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
@@ -267,6 +271,8 @@ rip6_input(struct mbuf **mp, int *offp,
}
in_pcbunref(in6p);
}
+ rw_exit_write(&rawin6pcbtable.inpt_notify);
+
return IPPROTO_DONE;
}