Hi,

Is now the right time to commit the parallel forwarding diff?

Known limitations are:
- Hrvoje has seen a crash with both pfsync and ipsec on his production
  machine.  But he cannot reproduce it in his lab.
- TCP processing gets slower as we have an additional queue between
  the IP and protocol layers (see the sketch after this list).
- The protocol layer may starve as one exclusive lock is fighting
  with four shared locks.  This happens only when forwarding a lot.
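
In concrete terms, that limitation comes from the new queue boundary;
here is a condensed sketch of the ip_input.c hunks in the diff below
(ip6_input.c mirrors it), not additional code:

struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);

/*
 * Input/forward path, running under the shared NET_RLOCK_IN_SOFTNET():
 * local delivery is no longer done inline, the packet is queued.
 */
int
ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
{
	niq_enqueue(&ipintrq, *mp);
	*mp = NULL;
	return IPPROTO_DONE;
}

/*
 * Netisr side, running with the exclusive net lock: dequeue and hand
 * the packet to the protocol layer.  This extra hop is where the
 * additional TCP latency comes from.
 */
void
ipintr(void)
{
	struct mbuf *m;
	int off, nxt;

	while ((m = niq_dequeue(&ipintrq)) != NULL) {
		off = 0;
		nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC);
		KASSERT(nxt == IPPROTO_DONE);
	}
}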

The advantage of committing is that we see how relevant these things
are in the real world.  But the most important thing is that we learn
how all the locks behave under MP pressure.  You can add a lot of
locking, but only when you run in parallel do you see whether it is
correct.

An alternative could be to commit it with NET_TASKQ 1.  With only
one softnet thread I would expect to see fewer bugs, but there is
also less to learn.  NET_TASKQ 1 could be a safe point to which we
could easily switch back.
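
For reference, the number of softnet threads is just the NET_TASKQ
define in if.c, and the ifq.c change spreads the queues over them.
The net_tq() body shown here is a simplified sketch of that mapping;
it is not part of the diff:

/* net/if.c: number of softnet threads. */
#define NET_TASKQ	4		/* 1 = single softnet, easy fallback */
struct taskq *nettqmp[NET_TASKQ];

/*
 * Simplified mapping: pick one of the softnet task queues by index,
 * so different (interface, queue) pairs land on different threads.
 */
struct taskq *
net_tq(unsigned int ifindex)
{
	return (nettqmp[ifindex % NET_TASKQ]);
}

/* net/ifq.c: with "+ idx" each hardware queue gets its own thread. */
ifq->ifq_softnet = net_tq(ifp->if_index + idx);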

bluhm

Index: net/if.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.649
diff -u -p -r1.649 if.c
--- net/if.c    25 Feb 2022 23:51:03 -0000      1.649
+++ net/if.c    29 Mar 2022 12:44:05 -0000
@@ -237,7 +237,7 @@ int ifq_congestion;
 
 int             netisr;
 
-#define        NET_TASKQ       1
+#define        NET_TASKQ       4
 struct taskq   *nettqmp[NET_TASKQ];
 
 struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
@@ -834,15 +834,10 @@ if_input_process(struct ifnet *ifp, stru
         * lists and the socket layer.
         */
 
-       /*
-        * XXXSMP IPsec data structures are not ready to be accessed
-        * by multiple network threads in parallel.  In this case
-        * use an exclusive lock.
-        */
-       NET_LOCK();
+       NET_RLOCK_IN_SOFTNET();
        while ((m = ml_dequeue(ml)) != NULL)
                (*ifp->if_input)(ifp, m);
-       NET_UNLOCK();
+       NET_RUNLOCK_IN_SOFTNET();
 }
 
 void
@@ -899,6 +894,12 @@ if_netisr(void *unused)
                        arpintr();
                        KERNEL_UNLOCK();
                }
+#endif
+               if (n & (1 << NETISR_IP))
+                       ipintr();
+#ifdef INET6
+               if (n & (1 << NETISR_IPV6))
+                       ip6intr();
 #endif
 #if NPPP > 0
                if (n & (1 << NETISR_PPP)) {
Index: net/if_ethersubr.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.278
diff -u -p -r1.278 if_ethersubr.c
--- net/if_ethersubr.c  22 Feb 2022 01:15:02 -0000      1.278
+++ net/if_ethersubr.c  29 Mar 2022 12:44:05 -0000
@@ -221,7 +221,10 @@ ether_resolve(struct ifnet *ifp, struct 
 
        switch (af) {
        case AF_INET:
+               KERNEL_LOCK();
+               /* XXXSMP there is a MP race in arpresolve() */
                error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+               KERNEL_UNLOCK();
                if (error)
                        return (error);
                eh->ether_type = htons(ETHERTYPE_IP);
@@ -244,7 +247,10 @@ ether_resolve(struct ifnet *ifp, struct 
                break;
 #ifdef INET6
        case AF_INET6:
+               KERNEL_LOCK();
+               /* XXXSMP there is a MP race in nd6_resolve() */
                error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+               KERNEL_UNLOCK();
                if (error)
                        return (error);
                eh->ether_type = htons(ETHERTYPE_IPV6);
@@ -270,13 +276,19 @@ ether_resolve(struct ifnet *ifp, struct 
                        break;
 #ifdef INET6
                case AF_INET6:
+                       KERNEL_LOCK();
+                       /* XXXSMP there is a MP race in nd6_resolve() */
                        error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+                       KERNEL_UNLOCK();
                        if (error)
                                return (error);
                        break;
 #endif
                case AF_INET:
+                       KERNEL_LOCK();
+                       /* XXXSMP there is a MP race in arpresolve() */
                        error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+                       KERNEL_UNLOCK();
                        if (error)
                                return (error);
                        break;
@@ -528,12 +540,14 @@ ether_input(struct ifnet *ifp, struct mb
        case ETHERTYPE_PPPOE:
                if (m->m_flags & (M_MCAST | M_BCAST))
                        goto dropanyway;
+               KERNEL_LOCK();
 #ifdef PIPEX
                if (pipex_enable) {
                        struct pipex_session *session;
 
                        if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
                                pipex_pppoe_input(m, session);
+                               KERNEL_UNLOCK();
                                return;
                        }
                }
@@ -542,6 +556,7 @@ ether_input(struct ifnet *ifp, struct mb
                        pppoe_disc_input(m);
                else
                        pppoe_data_input(m);
+               KERNEL_UNLOCK();
                return;
 #endif
 #ifdef MPLS
Index: net/ifq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
retrieving revision 1.45
diff -u -p -r1.45 ifq.c
--- net/ifq.c   18 Jan 2022 10:54:05 -0000      1.45
+++ net/ifq.c   29 Mar 2022 12:44:05 -0000
@@ -243,7 +243,7 @@ void
 ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx)
 {
        ifq->ifq_if = ifp;
-       ifq->ifq_softnet = net_tq(ifp->if_index); /* + idx */
+       ifq->ifq_softnet = net_tq(ifp->if_index + idx);
        ifq->ifq_softc = NULL;
 
        mtx_init(&ifq->ifq_mtx, IPL_NET);
@@ -620,7 +620,7 @@ void
 ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx)
 {
        ifiq->ifiq_if = ifp;
-       ifiq->ifiq_softnet = net_tq(ifp->if_index); /* + idx */
+       ifiq->ifiq_softnet = net_tq(ifp->if_index + idx);
        ifiq->ifiq_softc = NULL;
 
        mtx_init(&ifiq->ifiq_mtx, IPL_NET);
Index: net/netisr.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/netisr.h,v
retrieving revision 1.55
diff -u -p -r1.55 netisr.h
--- net/netisr.h        5 Jan 2021 20:43:36 -0000       1.55
+++ net/netisr.h        29 Mar 2022 12:44:05 -0000
@@ -41,8 +41,10 @@
  * interrupt used for scheduling the network code to calls
  * on the lowest level routine of each protocol.
  */
+#define        NETISR_IP       2               /* same as AF_INET */
 #define        NETISR_PFSYNC   5               /* for pfsync "immediate" tx */
 #define        NETISR_ARP      18              /* same as AF_LINK */
+#define        NETISR_IPV6     24              /* same as AF_INET6 */
 #define        NETISR_PPP      28              /* for PPP processing */
 #define        NETISR_BRIDGE   29              /* for bridge processing */
 #define        NETISR_SWITCH   31              /* for switch dataplane */
@@ -57,6 +59,8 @@ extern int    netisr;                 /* scheduling bits 
 extern struct task if_input_task_locked;
 
 void   arpintr(void);
+void   ipintr(void);
+void   ip6intr(void);
 void   pppintr(void);
 void   bridgeintr(void);
 void   switchintr(void);
Index: netinet/ip_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.366
diff -u -p -r1.366 ip_input.c
--- netinet/ip_input.c  22 Feb 2022 01:35:40 -0000      1.366
+++ netinet/ip_input.c  29 Mar 2022 12:44:05 -0000
@@ -129,6 +129,8 @@ const struct sysctl_bounded_args ipctl_v
        { IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
 };
 
+struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);
+
 struct pool ipqent_pool;
 struct pool ipq_pool;
 
@@ -142,6 +144,7 @@ static struct mbuf_queue    ipsendraw_mq;
 extern struct niqueue          arpinq;
 
 int    ip_ours(struct mbuf **, int *, int, int);
+int    ip_local(struct mbuf **, int *, int, int);
 int    ip_dooptions(struct mbuf *, struct ifnet *);
 int    in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
 
@@ -229,6 +232,43 @@ ip_init(void)
 }
 
 /*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running with shared
+ * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
+ */
+int
+ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
+{
+       /* We are already in an IPv4/IPv6 local delivery loop. */
+       if (af != AF_UNSPEC)
+               return ip_local(mp, offp, nxt, af);
+
+       niq_enqueue(&ipintrq, *mp);
+       *mp = NULL;
+       return IPPROTO_DONE;
+}
+
+/*
+ * Dequeue and process locally delivered packets.
+ */
+void
+ipintr(void)
+{
+       struct mbuf *m;
+       int off, nxt;
+
+       while ((m = niq_dequeue(&ipintrq)) != NULL) {
+#ifdef DIAGNOSTIC
+               if ((m->m_flags & M_PKTHDR) == 0)
+                       panic("ipintr no HDR");
+#endif
+               off = 0;
+               nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC);
+               KASSERT(nxt == IPPROTO_DONE);
+       }
+}
+
+/*
  * IPv4 input routine.
  *
  * Checksum and byte swap header.  Process options. Forward or deliver.
@@ -513,7 +553,7 @@ ip_input_if(struct mbuf **mp, int *offp,
  * If fragmented try to reassemble.  Pass to next level.
  */
 int
-ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
+ip_local(struct mbuf **mp, int *offp, int nxt, int af)
 {
        struct mbuf *m = *mp;
        struct ip *ip = mtod(m, struct ip *);
@@ -521,6 +561,8 @@ ip_ours(struct mbuf **mp, int *offp, int
        struct ipqent *ipqe;
        int mff, hlen;
 
+       NET_ASSERT_WLOCKED();
+
        hlen = ip->ip_hl << 2;
 
        /*
@@ -1657,7 +1699,8 @@ ip_sysctl(int *name, u_int namelen, void
                    newlen));
 #endif
        case IPCTL_IFQUEUE:
-               return (EOPNOTSUPP);
+               return (sysctl_niq(name + 1, namelen - 1,
+                   oldp, oldlenp, newp, newlen, &ipintrq));
        case IPCTL_ARPQUEUE:
                return (sysctl_niq(name + 1, namelen - 1,
                    oldp, oldlenp, newp, newlen, &arpinq));
Index: netinet/ip_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.90
diff -u -p -r1.90 ip_var.h
--- netinet/ip_var.h    25 Feb 2022 23:51:03 -0000      1.90
+++ netinet/ip_var.h    29 Mar 2022 12:44:05 -0000
@@ -248,7 +248,6 @@ void         ip_stripoptions(struct mbuf *);
 int     ip_sysctl(int *, u_int, void *, size_t *, void *, size_t);
 void    ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
            struct mbuf *);
-void    ipintr(void);
 int     ip_input_if(struct mbuf **, int *, int, int, struct ifnet *);
 int     ip_deliver(struct mbuf **, int *, int, int);
 void    ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
Index: netinet6/ip6_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.240
diff -u -p -r1.240 ip6_input.c
--- netinet6/ip6_input.c        22 Feb 2022 01:35:41 -0000      1.240
+++ netinet6/ip6_input.c        29 Mar 2022 12:44:05 -0000
@@ -113,11 +113,14 @@
 #include <netinet/ip_carp.h>
 #endif
 
+struct niqueue ip6intrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IPV6);
+
 struct cpumem *ip6counters;
 
 uint8_t ip6_soiikey[IP6_SOIIKEY_LEN];
 
 int ip6_ours(struct mbuf **, int *, int, int);
+int ip6_local(struct mbuf **, int *, int, int);
 int ip6_check_rh0hdr(struct mbuf *, int *);
 int ip6_hbhchcheck(struct mbuf *, int *, int *, int *);
 int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
@@ -160,6 +163,43 @@ ip6_init(void)
        ip6counters = counters_alloc(ip6s_ncounters);
 }
 
+/*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running with shared
+ * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
+ */
+int
+ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
+{
+       /* We are already in an IPv4/IPv6 local delivery loop. */
+       if (af != AF_UNSPEC)
+               return ip6_local(mp, offp, nxt, af);
+
+       niq_enqueue(&ip6intrq, *mp);
+       *mp = NULL;
+       return IPPROTO_DONE;
+}
+
+/*
+ * Dequeue and process locally delivered packets.
+ */
+void
+ip6intr(void)
+{
+       struct mbuf *m;
+       int off, nxt;
+
+       while ((m = niq_dequeue(&ip6intrq)) != NULL) {
+#ifdef DIAGNOSTIC
+               if ((m->m_flags & M_PKTHDR) == 0)
+                       panic("ip6intr no HDR");
+#endif
+               off = 0;
+               nxt = ip6_local(&m, &off, IPPROTO_IPV6, AF_UNSPEC);
+               KASSERT(nxt == IPPROTO_DONE);
+       }
+}
+
 void
 ipv6_input(struct ifnet *ifp, struct mbuf *m)
 {
@@ -536,8 +576,10 @@ ip6_input_if(struct mbuf **mp, int *offp
 }
 
 int
-ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
+ip6_local(struct mbuf **mp, int *offp, int nxt, int af)
 {
+       NET_ASSERT_WLOCKED();
+
        if (ip6_hbhchcheck(*mp, offp, &nxt, NULL))
                return IPPROTO_DONE;
 
@@ -1462,7 +1504,8 @@ ip6_sysctl(int *name, u_int namelen, voi
                NET_UNLOCK();
                return (error);
        case IPV6CTL_IFQUEUE:
-               return (EOPNOTSUPP);
+               return (sysctl_niq(name + 1, namelen - 1,
+                   oldp, oldlenp, newp, newlen, &ip6intrq));
        case IPV6CTL_SOIIKEY:
                return (ip6_sysctl_soiikey(oldp, oldlenp, newp, newlen));
        default:
