On Mon, Aug 22, 2022 at 09:38:15AM +0200, Alexander Bluhm wrote:
> Hi,
>
> To run over all PCBs, I have introduced temporary notify lists. They
> are necessary because the list of all PCBs is protected by a mutex,
> and the notify callback may eventually call ip_output(), which may
> sleep because the pf lock is an rwlock. The SRP implementation of
> the route table might also sleep.
>
> The same inp_notify list entry is also used for UDP multicast and raw
> IP delivery. A mutex would be sufficient for those paths, but use the
> same list entry and rwlock anyway.
>
> Currently the inp_notify list is protected by the exclusive netlock.
> Introduce a new rwlock per PCB table to relax this requirement to
> the shared netlock.
>
> ok?
>
ok mvs@
> bluhm
>
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> retrieving revision 1.271
> diff -u -p -r1.271 in_pcb.c
> --- netinet/in_pcb.c 21 Aug 2022 11:44:53 -0000 1.271
> +++ netinet/in_pcb.c 22 Aug 2022 07:07:35 -0000
> @@ -175,6 +175,7 @@ void
> in_pcbinit(struct inpcbtable *table, int hashsize)
> {
> mtx_init(&table->inpt_mtx, IPL_SOFTNET);
> + rw_init(&table->inpt_notify, "inpnotify");
> TAILQ_INIT(&table->inpt_queue);
> table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
> &table->inpt_mask);
> @@ -696,8 +697,6 @@ in_pcbnotifyall(struct inpcbtable *table
> struct in_addr faddr;
> u_int rdomain;
>
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> -
> if (dst->sa_family != AF_INET)
> return;
> faddr = satosin(dst)->sin_addr;
> @@ -708,6 +707,7 @@ in_pcbnotifyall(struct inpcbtable *table
>
> SIMPLEQ_INIT(&inpcblist);
> rdomain = rtable_l2(rtable);
> + rw_enter_write(&table->inpt_notify);
> mtx_enter(&table->inpt_mtx);
> TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
> #ifdef INET6
> @@ -729,6 +729,7 @@ in_pcbnotifyall(struct inpcbtable *table
> (*notify)(inp, errno);
> in_pcbunref(inp);
> }
> + rw_exit_write(&table->inpt_notify);
> }
>
> /*
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> retrieving revision 1.130
> diff -u -p -r1.130 in_pcb.h
> --- netinet/in_pcb.h 21 Aug 2022 11:44:53 -0000 1.130
> +++ netinet/in_pcb.h 22 Aug 2022 07:07:35 -0000
> @@ -66,6 +66,7 @@
>
> #include <sys/queue.h>
> #include <sys/mutex.h>
> +#include <sys/rwlock.h>
> #include <sys/refcnt.h>
> #include <netinet/ip6.h>
> #include <netinet6/ip6_var.h>
> @@ -79,6 +80,7 @@
> * I immutable after creation
> * N net lock
> * t inpt_mtx pcb table mutex
> + * y inpt_notify pcb table rwlock for notify
> * p inpcb_mtx pcb mutex
> */
>
> @@ -103,7 +105,7 @@ struct inpcb {
> LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */
> LIST_ENTRY(inpcb) inp_lhash; /* [t] local port hash */
> TAILQ_ENTRY(inpcb) inp_queue; /* [t] inet PCB queue */
> - SIMPLEQ_ENTRY(inpcb) inp_notify; /* [N] notify or udp append */
> + SIMPLEQ_ENTRY(inpcb) inp_notify; /* [y] notify or udp append */
> struct inpcbtable *inp_table; /* [I] inet queue/hash table */
> union inpaddru inp_faddru; /* Foreign address. */
> union inpaddru inp_laddru; /* Local address. */
> @@ -166,6 +168,7 @@ LIST_HEAD(inpcbhead, inpcb);
>
> struct inpcbtable {
> struct mutex inpt_mtx; /* protect queue and hash */
> + struct rwlock inpt_notify; /* protect inp_notify list */
> TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */
> struct inpcbhead *inpt_hashtbl; /* [t] local and foreign hash */
> struct inpcbhead *inpt_lhashtbl; /* [t] local port hash */
> Index: netinet/raw_ip.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> retrieving revision 1.133
> diff -u -p -r1.133 raw_ip.c
> --- netinet/raw_ip.c 21 Aug 2022 22:45:55 -0000 1.133
> +++ netinet/raw_ip.c 22 Aug 2022 07:07:35 -0000
> @@ -160,8 +160,8 @@ rip_input(struct mbuf **mp, int *offp, i
> }
> }
> #endif
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> SIMPLEQ_INIT(&inpcblist);
> + rw_enter_write(&rawcbtable.inpt_notify);
> mtx_enter(&rawcbtable.inpt_mtx);
> TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> if (inp->inp_socket->so_state & SS_CANTRCVMORE)
> @@ -189,6 +189,8 @@ rip_input(struct mbuf **mp, int *offp, i
> mtx_leave(&rawcbtable.inpt_mtx);
>
> if (SIMPLEQ_EMPTY(&inpcblist)) {
> + rw_exit_write(&rawcbtable.inpt_notify);
> +
> if (ip->ip_p != IPPROTO_ICMP)
> icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
> 0, 0);
> @@ -199,6 +201,8 @@ rip_input(struct mbuf **mp, int *offp, i
> counters[ips_noproto]++;
> counters[ips_delivered]--;
> counters_leave(&ref, ipcounters);
> +
> + return IPPROTO_DONE;
> }
>
> while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> @@ -224,6 +228,8 @@ rip_input(struct mbuf **mp, int *offp, i
> }
> in_pcbunref(inp);
> }
> + rw_exit_write(&rawcbtable.inpt_notify);
> +
> return IPPROTO_DONE;
> }
>
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> retrieving revision 1.285
> diff -u -p -r1.285 udp_usrreq.c
> --- netinet/udp_usrreq.c 21 Aug 2022 22:45:55 -0000 1.285
> +++ netinet/udp_usrreq.c 22 Aug 2022 07:14:17 -0000
> @@ -372,8 +372,8 @@ udp_input(struct mbuf **mp, int *offp, i
> * Locate pcb(s) for datagram.
> * (Algorithm copied from raw_intr().)
> */
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> SIMPLEQ_INIT(&inpcblist);
> + rw_enter_write(&udbtable.inpt_notify);
> mtx_enter(&udbtable.inpt_mtx);
> TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
> if (inp->inp_socket->so_state & SS_CANTRCVMORE)
> @@ -446,6 +446,8 @@ udp_input(struct mbuf **mp, int *offp, i
> mtx_leave(&udbtable.inpt_mtx);
>
> if (SIMPLEQ_EMPTY(&inpcblist)) {
> + rw_exit_write(&udbtable.inpt_notify);
> +
> /*
> * No matching pcb found; discard datagram.
> * (No need to send an ICMP Port Unreachable
> @@ -469,6 +471,8 @@ udp_input(struct mbuf **mp, int *offp, i
> }
> in_pcbunref(inp);
> }
> + rw_exit_write(&udbtable.inpt_notify);
> +
> return IPPROTO_DONE;
> }
> /*
> Index: netinet6/in6_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
> retrieving revision 1.119
> diff -u -p -r1.119 in6_pcb.c
> --- netinet6/in6_pcb.c 8 Aug 2022 12:06:31 -0000 1.119
> +++ netinet6/in6_pcb.c 22 Aug 2022 07:07:35 -0000
> @@ -387,8 +387,6 @@ in6_pcbnotify(struct inpcbtable *table,
> u_int32_t flowinfo;
> u_int rdomain;
>
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> -
> if ((unsigned)cmd >= PRC_NCMDS)
> return;
>
> @@ -430,6 +428,7 @@ in6_pcbnotify(struct inpcbtable *table,
>
> SIMPLEQ_INIT(&inpcblist);
> rdomain = rtable_l2(rtable);
> + rw_enter_write(&table->inpt_notify);
> mtx_enter(&table->inpt_mtx);
> TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
> if ((inp->inp_flags & INP_IPV6) == 0)
> @@ -513,6 +512,7 @@ in6_pcbnotify(struct inpcbtable *table,
> (*notify)(inp, errno);
> in_pcbunref(inp);
> }
> + rw_exit_write(&table->inpt_notify);
> }
>
> struct inpcb *
> Index: netinet6/raw_ip6.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 raw_ip6.c
> --- netinet6/raw_ip6.c 21 Aug 2022 22:45:55 -0000 1.153
> +++ netinet6/raw_ip6.c 22 Aug 2022 07:07:35 -0000
> @@ -172,8 +172,8 @@ rip6_input(struct mbuf **mp, int *offp,
> }
> }
> #endif
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> SIMPLEQ_INIT(&inpcblist);
> + rw_enter_write(&rawin6pcbtable.inpt_notify);
> mtx_enter(&rawin6pcbtable.inpt_mtx);
> TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
> if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
> @@ -224,6 +224,8 @@ rip6_input(struct mbuf **mp, int *offp,
> struct counters_ref ref;
> uint64_t *counters;
>
> + rw_exit_write(&rawin6pcbtable.inpt_notify);
> +
> if (proto != IPPROTO_ICMPV6) {
> rip6stat_inc(rip6s_nosock);
> if (m->m_flags & M_MCAST)
> @@ -240,6 +242,8 @@ rip6_input(struct mbuf **mp, int *offp,
> counters = counters_enter(&ref, ip6counters);
> counters[ip6s_delivered]--;
> counters_leave(&ref, ip6counters);
> +
> + return IPPROTO_DONE;
> }
>
> while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> @@ -267,6 +271,8 @@ rip6_input(struct mbuf **mp, int *offp,
> }
> in_pcbunref(in6p);
> }
> + rw_exit_write(&rawin6pcbtable.inpt_notify);
> +
> return IPPROTO_DONE;
> }
>
>