On 30/05/17(Tue) 10:45, Martin Pieuchot wrote:
> Diff below moves IPv4 & IPv6 incoming/forwarding path, PIPEX ppp
> processing and IPv4 & IPv6 dispatch functions outside the KERNEL_LOCK().
> 
> We currently rely on the NET_LOCK() serializing access to most global
> data structures for that.  IP input queues are no longer used in the
> forwarding case.  They still exist as boundary between the network and
> transport layers because TCP/UDP & friends still need the KERNEL_LOCK().
> 
> Since we do not want to grab the NET_LOCK() for every packet, the
> softnet thread will do it once before processing a batch.  That means
> the L2 processing path, which is currently running without lock, will
> now run with the NET_LOCK().
> 
> IPsec is the bridge of this layer.  A bad player.  Since IPsec isn't
> ready to run without KERNEL_LOCK(), the softnet thread will grab the
> KERNEL_LOCK() as soon as ``ipsec_in_use'' is set.
> 
> I tried to document as much as possible the current design in my
> commit messages and in the comment below.  Please ask if something
> isn't clear.

Hrvoje Popovski found that ip{,6}_send_dispatch() also need the IPsec
dance.

Updated diff below.

Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.502
diff -u -p -r1.502 if.c
--- net/if.c    30 May 2017 07:50:37 -0000      1.502
+++ net/if.c    30 May 2017 08:24:30 -0000
@@ -874,7 +874,10 @@ if_input_process(void *xifidx)
        struct ifnet *ifp;
        struct ifih *ifih;
        struct srp_ref sr;
-       int s;
+       int s, s2;
+#ifdef IPSEC
+       int locked = 0;
+#endif /* IPSEC */
 
        ifp = if_get(ifidx);
        if (ifp == NULL)
@@ -887,6 +890,32 @@ if_input_process(void *xifidx)
        if (!ISSET(ifp->if_xflags, IFXF_CLONED))
                add_net_randomness(ml_len(&ml));
 
+#ifdef IPSEC
+       /*
+        * IPsec is not ready to run without KERNEL_LOCK().  So all
+        * the traffic on your machine is punished if you have IPsec
+        * enabled.
+        */
+       extern int ipsec_in_use;
+       if (ipsec_in_use) {
+               KERNEL_LOCK();
+               locked = 1;
+       }
+#endif /* IPSEC */
+
+       /*
+        * We grab the NET_LOCK() before processing any packet to
+        * ensure there's no contention on the routing table lock.
+        *
+        * Without it we could race with a userland thread to insert
+        * a L2 entry in ip{6,}_output().  Such race would result in
+        * one of the threads sleeping *inside* the IP output path.
+        *
+        * Since we have a NET_LOCK() we also use it to serialize access
+        * to PF globals, pipex globals, unicast and multicast addresses
+        * lists.
+        */
+       NET_LOCK(s2);
        s = splnet();
        while ((m = ml_dequeue(&ml)) != NULL) {
                /*
@@ -903,7 +932,12 @@ if_input_process(void *xifidx)
                        m_freem(m);
        }
        splx(s);
+       NET_UNLOCK(s2);
 
+#ifdef IPSEC
+       if (locked)
+               KERNEL_UNLOCK();
+#endif /* IPSEC */
 out:
        if_put(ifp);
 }
Index: net/if_ethersubr.c
===================================================================
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.245
diff -u -p -r1.245 if_ethersubr.c
--- net/if_ethersubr.c  30 May 2017 07:50:37 -0000      1.245
+++ net/if_ethersubr.c  30 May 2017 08:02:13 -0000
@@ -416,15 +416,11 @@ decapsulate:
 #ifdef PIPEX
                if (pipex_enable) {
                        struct pipex_session *session;
-                       int s;
 
-                       NET_LOCK(s);
                        if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
                                pipex_pppoe_input(m, session);
-                               NET_UNLOCK(s);
                                return (1);
                        }
-                       NET_UNLOCK(s);
                }
 #endif
                if (etype == ETHERTYPE_PPPOEDISC)
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.308
diff -u -p -r1.308 ip_input.c
--- netinet/ip_input.c  30 May 2017 07:50:37 -0000      1.308
+++ netinet/ip_input.c  30 May 2017 09:44:53 -0000
@@ -127,6 +127,7 @@ int ip_sysctl_ipstat(void *, size_t *, v
 static struct mbuf_queue       ipsend_mq;
 
 void   ip_ours(struct mbuf *);
+void   ip_local(struct mbuf *);
 int    ip_dooptions(struct mbuf *, struct ifnet *);
 int    in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
 
@@ -207,27 +208,31 @@ ip_init(void)
        mq_init(&ipsend_mq, 64, IPL_SOFTNET);
 }
 
+/*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running without
+ * KERNEL_LOCK() and the transport layer still needing it.
+ */
 void
-ipv4_input(struct ifnet *ifp, struct mbuf *m)
+ip_ours(struct mbuf *m)
 {
        niq_enqueue(&ipintrq, m);
 }
 
+/*
+ * Dequeue and process locally delivered packets.
+ */
 void
 ipintr(void)
 {
        struct mbuf *m;
 
-       /*
-        * Get next datagram off input queue and get IP header
-        * in first mbuf.
-        */
        while ((m = niq_dequeue(&ipintrq)) != NULL) {
-#ifdef DIAGNOSTIC
+#ifdef DIAGNOSTIC
                if ((m->m_flags & M_PKTHDR) == 0)
                        panic("ipintr no HDR");
 #endif
-               ip_input(m);
+               ip_local(m);
        }
 }
 
@@ -237,18 +242,13 @@ ipintr(void)
  * Checksum and byte swap header.  Process options. Forward or deliver.
  */
 void
-ip_input(struct mbuf *m)
+ipv4_input(struct ifnet *ifp, struct mbuf *m)
 {
-       struct ifnet    *ifp;
        struct rtentry  *rt = NULL;
        struct ip       *ip;
        int hlen, len;
        in_addr_t pfrdr = 0;
 
-       ifp = if_get(m->m_pkthdr.ph_ifidx);
-       if (ifp == NULL)
-               goto bad;
-
        ipstat_inc(ips_total);
        if (m->m_len < sizeof (struct ip) &&
            (m = m_pullup(m, sizeof (struct ip))) == NULL) {
@@ -462,13 +462,11 @@ ip_input(struct mbuf *m)
 #endif /* IPSEC */
 
        ip_forward(m, ifp, rt, pfrdr);
-       if_put(ifp);
        return;
 bad:
        m_freem(m);
 out:
        rtfree(rt);
-       if_put(ifp);
 }
 
 /*
@@ -477,13 +475,15 @@ out:
  * If fragmented try to reassemble.  Pass to next level.
  */
 void
-ip_ours(struct mbuf *m)
+ip_local(struct mbuf *m)
 {
        struct ip *ip = mtod(m, struct ip *);
        struct ipq *fp;
        struct ipqent *ipqe;
        int mff, hlen;
 
+       KERNEL_ASSERT_LOCKED();
+
        hlen = ip->ip_hl << 2;
 
        /*
@@ -1680,18 +1680,37 @@ ip_send_dispatch(void *xmq)
        struct mbuf *m;
        struct mbuf_list ml;
        int s;
+#ifdef IPSEC
+       int locked = 0;
+#endif /* IPSEC */
 
        mq_delist(mq, &ml);
        if (ml_empty(&ml))
                return;
 
-       KERNEL_LOCK();
+#ifdef IPSEC
+       /*
+        * IPsec is not ready to run without KERNEL_LOCK().  So all
+        * the traffic on your machine is punished if you have IPsec
+        * enabled.
+        */
+       extern int ipsec_in_use;
+       if (ipsec_in_use) {
+               KERNEL_LOCK();
+               locked = 1;
+       }
+#endif /* IPSEC */
+
        NET_LOCK(s);
        while ((m = ml_dequeue(&ml)) != NULL) {
                ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
        }
        NET_UNLOCK(s);
-       KERNEL_UNLOCK();
+
+#ifdef IPSEC
+       if (locked)
+               KERNEL_UNLOCK();
+#endif /* IPSEC */
 }
 
 void
Index: netinet/ip_var.h
===================================================================
RCS file: /cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.77
diff -u -p -r1.77 ip_var.h
--- netinet/ip_var.h    30 May 2017 07:50:37 -0000      1.77
+++ netinet/ip_var.h    30 May 2017 07:57:31 -0000
@@ -248,7 +248,6 @@ int  ip_sysctl(int *, u_int, void *, siz
 void    ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
            struct mbuf *);
 void    ipintr(void);
-void    ip_input(struct mbuf *);
 void    ip_deliver(struct mbuf **, int *, int, int);
 void    ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
 int     rip_ctloutput(int, struct socket *, int, int, struct mbuf *);
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.192
diff -u -p -r1.192 ip6_input.c
--- netinet6/ip6_input.c        30 May 2017 07:50:37 -0000      1.192
+++ netinet6/ip6_input.c        30 May 2017 09:45:25 -0000
@@ -119,6 +119,7 @@ struct niqueue ip6intrq = NIQUEUE_INITIA
 struct cpumem *ip6counters;
 
 void ip6_ours(struct mbuf *);
+void ip6_local(struct mbuf *);
 int ip6_check_rh0hdr(struct mbuf *, int *);
 int ip6_hbhchcheck(struct mbuf *, int *, int *, int *);
 int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
@@ -160,28 +161,37 @@ ip6_init(void)
        ip6counters = counters_alloc(ip6s_ncounters);
 }
 
+/*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running without
+ * KERNEL_LOCK() and the transport layer still needing it.
+ */
 void
-ipv6_input(struct ifnet *ifp, struct mbuf *m)
+ip6_ours(struct mbuf *m)
 {
        niq_enqueue(&ip6intrq, m);
 }
 
 /*
- * IP6 input interrupt handling. Just pass the packet to ip6_input.
+ * Dequeue and process locally delivered packets.
  */
 void
 ip6intr(void)
 {
        struct mbuf *m;
 
-       while ((m = niq_dequeue(&ip6intrq)) != NULL)
-               ip6_input(m);
+       while ((m = niq_dequeue(&ip6intrq)) != NULL) {
+#ifdef DIAGNOSTIC
+               if ((m->m_flags & M_PKTHDR) == 0)
+                       panic("ipintr no HDR");
+#endif
+               ip6_local(m);
+       }
 }
 
 void
-ip6_input(struct mbuf *m)
+ipv6_input(struct ifnet *ifp, struct mbuf *m)
 {
-       struct ifnet *ifp;
        struct ip6_hdr *ip6;
        struct sockaddr_in6 sin6;
        struct rtentry *rt = NULL;
@@ -192,10 +202,6 @@ ip6_input(struct mbuf *m)
 #endif
        int srcrt = 0;
 
-       ifp = if_get(m->m_pkthdr.ph_ifidx);
-       if (ifp == NULL)
-               goto bad;
-
        ip6stat_inc(ip6s_total);
 
        if (m->m_len < sizeof(struct ip6_hdr)) {
@@ -441,8 +447,8 @@ ip6_input(struct mbuf *m)
                        inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst));
                        /* address is not ready, so discard the packet. */
                        nd6log((LOG_INFO,
-                           "ip6_input: packet to an unready address %s->%s\n",
-                           src, dst));
+                           "%s: packet to an unready address %s->%s\n",
+                           __func__, src, dst));
 
                        goto bad;
                } else {
@@ -500,11 +506,10 @@ ip6_input(struct mbuf *m)
        m_freem(m);
  out:
        rtfree(rt);
-       if_put(ifp);
 }
 
 void
-ip6_ours(struct mbuf *m)
+ip6_local(struct mbuf *m)
 {
        int off, nxt;
 
@@ -1456,18 +1461,37 @@ ip6_send_dispatch(void *xmq)
        struct mbuf *m;
        struct mbuf_list ml;
        int s;
+#ifdef IPSEC
+       int locked = 0;
+#endif /* IPSEC */
 
        mq_delist(mq, &ml);
        if (ml_empty(&ml))
                return;
 
-       KERNEL_LOCK();
+#ifdef IPSEC
+       /*
+        * IPsec is not ready to run without KERNEL_LOCK().  So all
+        * the traffic on your machine is punished if you have IPsec
+        * enabled.
+        */
+       extern int ipsec_in_use;
+       if (ipsec_in_use) {
+               KERNEL_LOCK();
+               locked = 1;
+       }
+#endif /* IPSEC */
+
        NET_LOCK(s);
        while ((m = ml_dequeue(&ml)) != NULL) {
                ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
        }
        NET_UNLOCK(s);
-       KERNEL_UNLOCK();
+
+#ifdef IPSEC
+       if (locked)
+               KERNEL_UNLOCK();
+#endif /* IPSEC */
 }
 
 void
Index: netinet6/ip6_var.h
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_var.h,v
retrieving revision 1.74
diff -u -p -r1.74 ip6_var.h
--- netinet6/ip6_var.h  28 May 2017 09:25:51 -0000      1.74
+++ netinet6/ip6_var.h  30 May 2017 08:06:20 -0000
@@ -303,7 +303,6 @@ int icmp6_ctloutput(int, struct socket *
 
 void   ip6_init(void);
 void   ip6intr(void);
-void   ip6_input(struct mbuf *);
 void   ip6_deliver(struct mbuf **, int *, int, int);
 void   ip6_freepcbopts(struct ip6_pktopts *);
 void   ip6_freemoptions(struct ip6_moptions *);

Reply via email to