On Fri, Dec 03, 2021 at 08:35:45PM +0100, Alexander Bluhm wrote: > Hi, > > After implementing MP safe refcounting for IPsec TDB, I wonder how > stable my diff for forwarding on multiple CPU is. > > Note that IPsec still has the workaround to disable multiple queues. > We do not have a mutex that protects the TDB fields yet. We have > only fixed the memory management. > > My goal is to get real MP pressure on the lower part of the IP > network stack. Only this will show remaining bugs. > > bluhm >
This unlocked `ipsec_in_use' check doesn't work. SADB_X_ADDFLOW and `ipsec_in_use' increment are netlock serialized. > if_input_process(struct ifnet *ifp, struct mbuf_list *ml) > { > struct mbuf *m; > + int exclusive_lock = 0; > > if (ml_empty(ml)) > return; > @@ -834,15 +839,27 @@ if_input_process(struct ifnet *ifp, stru > * lists and the socket layer. > */ > > +#ifdef IPSEC > /* > * XXXSMP IPsec data structures are not ready to be accessed > * by multiple network threads in parallel. In this case > * use an exclusive lock. > */ > - NET_LOCK(); > + if (ipsec_in_use) > + exclusive_lock = 1; > +#endif Concurrent SADB_X_ADDFLOW threads could add some SA between your unlocked `ipsec_in_use' check and the following shared netlock. This time `ipsec_in_use' modification is netlock serialized but you could add the new rwlock(9). Since we are only interesting in non-null value of `ipsec_in_use' it's enough to add it here and around SADB_X_ADDFLOW case within pfkeyv2_send(). I already posted the fix for this diff to your old [1] thread. > + if (exclusive_lock) > + NET_LOCK(); > + else > + NET_RLOCK_IN_SOFTNET(); > + > while ((m = ml_dequeue(ml)) != NULL) > (*ifp->if_input)(ifp, m); 1. https://marc.info/?l=openbsd-tech&m=162698721127298&w=2 > Index: net/if.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v > retrieving revision 1.644 > diff -u -p -r1.644 if.c > --- net/if.c 11 Nov 2021 10:03:10 -0000 1.644 > +++ net/if.c 3 Dec 2021 18:51:29 -0000 > @@ -108,6 +108,10 @@ > #include <netinet6/ip6_var.h> > #endif > > +#ifdef IPSEC > +#include <netinet/ip_ipsp.h> > +#endif > + > #ifdef MPLS > #include <netmpls/mpls.h> > #endif > @@ -237,7 +241,7 @@ int ifq_congestion; > > int netisr; > > -#define NET_TASKQ 1 > +#define NET_TASKQ 4 > struct taskq *nettqmp[NET_TASKQ]; > > struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL); > @@ -814,6 +818,7 @@ void > if_input_process(struct ifnet *ifp, struct mbuf_list *ml) > { > struct mbuf *m; > + int exclusive_lock = 0; > > if (ml_empty(ml)) > return; > @@ -834,15 +839,27 @@ if_input_process(struct ifnet *ifp, stru > * lists and the socket layer. > */ > > +#ifdef IPSEC > /* > * XXXSMP IPsec data structures are not ready to be accessed > * by multiple network threads in parallel. In this case > * use an exclusive lock. > */ > - NET_LOCK(); > + if (ipsec_in_use) > + exclusive_lock = 1; > +#endif > + if (exclusive_lock) > + NET_LOCK(); > + else > + NET_RLOCK_IN_SOFTNET(); > + > while ((m = ml_dequeue(ml)) != NULL) > (*ifp->if_input)(ifp, m); > - NET_UNLOCK(); > + > + if (exclusive_lock) > + NET_UNLOCK(); > + else > + NET_RUNLOCK_IN_SOFTNET(); > } > > void > @@ -899,6 +916,12 @@ if_netisr(void *unused) > arpintr(); > KERNEL_UNLOCK(); > } > +#endif > + if (n & (1 << NETISR_IP)) > + ipintr(); > +#ifdef INET6 > + if (n & (1 << NETISR_IPV6)) > + ip6intr(); > #endif > #if NPPP > 0 > if (n & (1 << NETISR_PPP)) { > Index: net/if_ethersubr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v > retrieving revision 1.276 > diff -u -p -r1.276 if_ethersubr.c > --- net/if_ethersubr.c 19 Aug 2021 10:22:00 -0000 1.276 > +++ net/if_ethersubr.c 3 Dec 2021 18:51:29 -0000 > @@ -222,7 +222,10 @@ ether_resolve(struct ifnet *ifp, struct > > switch (af) { > case AF_INET: > + KERNEL_LOCK(); > + /* XXXSMP there is a MP race in arpresolve() */ > error = arpresolve(ifp, rt, m, dst, eh->ether_dhost); > + KERNEL_UNLOCK(); > if (error) > return (error); > eh->ether_type = htons(ETHERTYPE_IP); > @@ -245,7 +248,10 @@ ether_resolve(struct ifnet *ifp, struct > break; > #ifdef INET6 > case AF_INET6: > + KERNEL_LOCK(); > + /* XXXSMP there is a MP race in nd6_resolve() */ > error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost); > + KERNEL_UNLOCK(); > if (error) > return (error); > eh->ether_type = htons(ETHERTYPE_IPV6); > @@ -271,13 +277,19 @@ ether_resolve(struct ifnet *ifp, struct > break; > #ifdef INET6 > case AF_INET6: > + KERNEL_LOCK(); > + /* XXXSMP there is a MP race in nd6_resolve() */ > error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost); > + KERNEL_UNLOCK(); > if (error) > return (error); > break; > #endif > case AF_INET: > + KERNEL_LOCK(); > + /* XXXSMP there is a MP race in arpresolve() */ > error = arpresolve(ifp, rt, m, dst, eh->ether_dhost); > + KERNEL_UNLOCK(); > if (error) > return (error); > break; > @@ -529,12 +541,14 @@ ether_input(struct ifnet *ifp, struct mb > case ETHERTYPE_PPPOE: > if (m->m_flags & (M_MCAST | M_BCAST)) > goto dropanyway; > + KERNEL_LOCK(); > #ifdef PIPEX > if (pipex_enable) { > struct pipex_session *session; > > if ((session = pipex_pppoe_lookup_session(m)) != NULL) { > pipex_pppoe_input(m, session); > + KERNEL_UNLOCK(); > return; > } > } > @@ -543,6 +557,7 @@ ether_input(struct ifnet *ifp, struct mb > pppoe_disc_input(m); > else > pppoe_data_input(m); > + KERNEL_UNLOCK(); > return; > #endif > #ifdef MPLS > Index: net/ifq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v > retrieving revision 1.44 > diff -u -p -r1.44 ifq.c > --- net/ifq.c 9 Jul 2021 01:22:05 -0000 1.44 > +++ net/ifq.c 3 Dec 2021 18:51:29 -0000 > @@ -243,7 +243,7 @@ void > ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx) > { > ifq->ifq_if = ifp; > - ifq->ifq_softnet = net_tq(ifp->if_index); /* + idx */ > + ifq->ifq_softnet = net_tq(ifp->if_index + idx); > ifq->ifq_softc = NULL; > > mtx_init(&ifq->ifq_mtx, IPL_NET); > @@ -620,7 +620,7 @@ void > ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx) > { > ifiq->ifiq_if = ifp; > - ifiq->ifiq_softnet = net_tq(ifp->if_index); /* + idx */ > + ifiq->ifiq_softnet = net_tq(ifp->if_index + idx); > ifiq->ifiq_softc = NULL; > > mtx_init(&ifiq->ifiq_mtx, IPL_NET); > Index: net/netisr.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/netisr.h,v > retrieving revision 1.55 > diff -u -p -r1.55 netisr.h > --- net/netisr.h 5 Jan 2021 20:43:36 -0000 1.55 > +++ net/netisr.h 3 Dec 2021 18:51:29 -0000 > @@ -41,8 +41,10 @@ > * interrupt used for scheduling the network code to calls > * on the lowest level routine of each protocol. > */ > +#define NETISR_IP 2 /* same as AF_INET */ > #define NETISR_PFSYNC 5 /* for pfsync "immediate" tx */ > #define NETISR_ARP 18 /* same as AF_LINK */ > +#define NETISR_IPV6 24 /* same as AF_INET6 */ > #define NETISR_PPP 28 /* for PPP processing */ > #define NETISR_BRIDGE 29 /* for bridge processing */ > #define NETISR_SWITCH 31 /* for switch dataplane */ > @@ -57,6 +59,8 @@ extern int netisr; /* scheduling bits > extern struct task if_input_task_locked; > > void arpintr(void); > +void ipintr(void); > +void ip6intr(void); > void pppintr(void); > void bridgeintr(void); > void switchintr(void); > Index: netinet/ip_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v > retrieving revision 1.364 > diff -u -p -r1.364 ip_input.c > --- netinet/ip_input.c 22 Nov 2021 13:47:10 -0000 1.364 > +++ netinet/ip_input.c 3 Dec 2021 18:52:11 -0000 > @@ -130,6 +130,8 @@ const struct sysctl_bounded_args ipctl_v > { IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX }, > }; > > +struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP); > + > struct pool ipqent_pool; > struct pool ipq_pool; > > @@ -143,6 +145,7 @@ static struct mbuf_queue ipsendraw_mq; > extern struct niqueue arpinq; > > int ip_ours(struct mbuf **, int *, int, int); > +int ip_local(struct mbuf **, int *, int, int); > int ip_dooptions(struct mbuf *, struct ifnet *); > int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **); > > @@ -230,6 +233,43 @@ ip_init(void) > } > > /* > + * Enqueue packet for local delivery. Queuing is used as a boundary > + * between the network layer (input/forward path) running with shared > + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively. > + */ > +int > +ip_ours(struct mbuf **mp, int *offp, int nxt, int af) > +{ > + /* We are already in a IPv4/IPv6 local deliver loop. */ > + if (af != AF_UNSPEC) > + return ip_local(mp, offp, nxt, af); > + > + niq_enqueue(&ipintrq, *mp); > + *mp = NULL; > + return IPPROTO_DONE; > +} > + > +/* > + * Dequeue and process locally delivered packets. > + */ > +void > +ipintr(void) > +{ > + struct mbuf *m; > + int off, nxt; > + > + while ((m = niq_dequeue(&ipintrq)) != NULL) { > +#ifdef DIAGNOSTIC > + if ((m->m_flags & M_PKTHDR) == 0) > + panic("ipintr no HDR"); > +#endif > + off = 0; > + nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC); > + KASSERT(nxt == IPPROTO_DONE); > + } > +} > + > +/* > * IPv4 input routine. > * > * Checksum and byte swap header. Process options. Forward or deliver. > @@ -514,7 +554,7 @@ ip_input_if(struct mbuf **mp, int *offp, > * If fragmented try to reassemble. Pass to next level. > */ > int > -ip_ours(struct mbuf **mp, int *offp, int nxt, int af) > +ip_local(struct mbuf **mp, int *offp, int nxt, int af) > { > struct mbuf *m = *mp; > struct ip *ip = mtod(m, struct ip *); > @@ -1663,7 +1703,8 @@ ip_sysctl(int *name, u_int namelen, void > newlen)); > #endif > case IPCTL_IFQUEUE: > - return (EOPNOTSUPP); > + return (sysctl_niq(name + 1, namelen - 1, > + oldp, oldlenp, newp, newlen, &ipintrq)); > case IPCTL_ARPQUEUE: > return (sysctl_niq(name + 1, namelen - 1, > oldp, oldlenp, newp, newlen, &arpinq)); > Index: netinet/ip_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v > retrieving revision 1.88 > diff -u -p -r1.88 ip_var.h > --- netinet/ip_var.h 30 Mar 2021 08:37:11 -0000 1.88 > +++ netinet/ip_var.h 3 Dec 2021 18:51:29 -0000 > @@ -248,7 +248,6 @@ void ip_stripoptions(struct mbuf *); > int ip_sysctl(int *, u_int, void *, size_t *, void *, size_t); > void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, > struct mbuf *); > -void ipintr(void); > int ip_input_if(struct mbuf **, int *, int, int, struct ifnet *); > int ip_deliver(struct mbuf **, int *, int, int); > void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int); > Index: netinet6/ip6_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v > retrieving revision 1.237 > diff -u -p -r1.237 ip6_input.c > --- netinet6/ip6_input.c 3 Jun 2021 04:47:54 -0000 1.237 > +++ netinet6/ip6_input.c 3 Dec 2021 18:52:28 -0000 > @@ -115,11 +115,14 @@ > #include <netinet/ip_carp.h> > #endif > > +struct niqueue ip6intrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IPV6); > + > struct cpumem *ip6counters; > > uint8_t ip6_soiikey[IP6_SOIIKEY_LEN]; > > int ip6_ours(struct mbuf **, int *, int, int); > +int ip6_local(struct mbuf **, int *, int, int); > int ip6_check_rh0hdr(struct mbuf *, int *); > int ip6_hbhchcheck(struct mbuf *, int *, int *, int *); > int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); > @@ -162,6 +165,43 @@ ip6_init(void) > ip6counters = counters_alloc(ip6s_ncounters); > } > > +/* > + * Enqueue packet for local delivery. Queuing is used as a boundary > + * between the network layer (input/forward path) running with shared > + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively. > + */ > +int > +ip6_ours(struct mbuf **mp, int *offp, int nxt, int af) > +{ > + /* We are already in a IPv4/IPv6 local deliver loop. */ > + if (af != AF_UNSPEC) > + return ip6_local(mp, offp, nxt, af); > + > + niq_enqueue(&ip6intrq, *mp); > + *mp = NULL; > + return IPPROTO_DONE; > +} > + > +/* > + * Dequeue and process locally delivered packets. > + */ > +void > +ip6intr(void) > +{ > + struct mbuf *m; > + int off, nxt; > + > + while ((m = niq_dequeue(&ip6intrq)) != NULL) { > +#ifdef DIAGNOSTIC > + if ((m->m_flags & M_PKTHDR) == 0) > + panic("ip6intr no HDR"); > +#endif > + off = 0; > + nxt = ip6_local(&m, &off, IPPROTO_IPV6, AF_UNSPEC); > + KASSERT(nxt == IPPROTO_DONE); > + } > +} > + > void > ipv6_input(struct ifnet *ifp, struct mbuf *m) > { > @@ -544,7 +584,7 @@ ip6_input_if(struct mbuf **mp, int *offp > } > > int > -ip6_ours(struct mbuf **mp, int *offp, int nxt, int af) > +ip6_local(struct mbuf **mp, int *offp, int nxt, int af) > { > if (ip6_hbhchcheck(*mp, offp, &nxt, NULL)) > return IPPROTO_DONE; > @@ -1470,7 +1510,8 @@ ip6_sysctl(int *name, u_int namelen, voi > NET_UNLOCK(); > return (error); > case IPV6CTL_IFQUEUE: > - return (EOPNOTSUPP); > + return (sysctl_niq(name + 1, namelen - 1, > + oldp, oldlenp, newp, newlen, &ip6intrq)); > case IPV6CTL_SOIIKEY: > return (ip6_sysctl_soiikey(oldp, oldlenp, newp, newlen)); > default: >