On 30/05/17(Tue) 10:45, Martin Pieuchot wrote: > Diff below moves IPv4 & IPv6 incoming/forwarding path, PIPEX ppp > processing and IPv4 & IPv6 dispatch functions outside the KERNEL_LOCK(). > > We currently rely on the NET_LOCK() serializing access to most global > data structures for that. IP input queues are no longer used in the > forwarding case. They still exist as boundary between the network and > transport layers because TCP/UDP & friends still need the KERNEL_LOCK(). > > Since we do not want to grab the NET_LOCK() for every packet, the > softnet thread will do it once before processing a batch. That means > the L2 processing path, which is currently running without lock, will > now run with the NET_LOCK(). > > IPsec is the bridge of this layer. A bad player. Since IPsec isn't > ready to run without KERNEL_LOCK(), the softnet thread will grab the > KERNEL_LOCK() as soon as ``ipsec_in_use'' is set. > > I tried to document as much as possible the current design in my > commit messages and in the comment below. Please ask if something > isn't clear.
Hrvoje Popovski found that ip{,6}_send_dispatch() also need the IPsec dance. Updated diff below. Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.502 diff -u -p -r1.502 if.c --- net/if.c 30 May 2017 07:50:37 -0000 1.502 +++ net/if.c 30 May 2017 08:24:30 -0000 @@ -874,7 +874,10 @@ if_input_process(void *xifidx) struct ifnet *ifp; struct ifih *ifih; struct srp_ref sr; - int s; + int s, s2; +#ifdef IPSEC + int locked = 0; +#endif /* IPSEC */ ifp = if_get(ifidx); if (ifp == NULL) @@ -887,6 +890,32 @@ if_input_process(void *xifidx) if (!ISSET(ifp->if_xflags, IFXF_CLONED)) add_net_randomness(ml_len(&ml)); +#ifdef IPSEC + /* + * IPsec is not ready to run without KERNEL_LOCK(). So all + * the traffic on your machine is punished if you have IPsec + * enabled. + */ + extern int ipsec_in_use; + if (ipsec_in_use) { + KERNEL_LOCK(); + locked = 1; + } +#endif /* IPSEC */ + + /* + * We grab the NET_LOCK() before processing any packet to + * ensure there's no contention on the routing table lock. + * + * Without it we could race with a userland thread to insert + * a L2 entry in ip{6,}_output(). Such race would result in + * one of the threads sleeping *inside* the IP output path. + * + * Since we have a NET_LOCK() we also use it to serialize access + * to PF globals, pipex globals, unicast and multicast addresses + * lists. + */ + NET_LOCK(s2); s = splnet(); while ((m = ml_dequeue(&ml)) != NULL) { /* @@ -903,7 +932,12 @@ if_input_process(void *xifidx) m_freem(m); } splx(s); + NET_UNLOCK(s2); +#ifdef IPSEC + if (locked) + KERNEL_UNLOCK(); +#endif /* IPSEC */ out: if_put(ifp); } Index: net/if_ethersubr.c =================================================================== RCS file: /cvs/src/sys/net/if_ethersubr.c,v retrieving revision 1.245 diff -u -p -r1.245 if_ethersubr.c --- net/if_ethersubr.c 30 May 2017 07:50:37 -0000 1.245 +++ net/if_ethersubr.c 30 May 2017 08:02:13 -0000 @@ -416,15 +416,11 @@ decapsulate: #ifdef PIPEX if (pipex_enable) { struct pipex_session *session; - int s; - NET_LOCK(s); if ((session = pipex_pppoe_lookup_session(m)) != NULL) { pipex_pppoe_input(m, session); - NET_UNLOCK(s); return (1); } - NET_UNLOCK(s); } #endif if (etype == ETHERTYPE_PPPOEDISC) Index: netinet/ip_input.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_input.c,v retrieving revision 1.308 diff -u -p -r1.308 ip_input.c --- netinet/ip_input.c 30 May 2017 07:50:37 -0000 1.308 +++ netinet/ip_input.c 30 May 2017 09:44:53 -0000 @@ -127,6 +127,7 @@ int ip_sysctl_ipstat(void *, size_t *, v static struct mbuf_queue ipsend_mq; void ip_ours(struct mbuf *); +void ip_local(struct mbuf *); int ip_dooptions(struct mbuf *, struct ifnet *); int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **); @@ -207,27 +208,31 @@ ip_init(void) mq_init(&ipsend_mq, 64, IPL_SOFTNET); } +/* + * Enqueue packet for local delivery. Queuing is used as a boundary + * between the network layer (input/forward path) running without + * KERNEL_LOCK() and the transport layer still needing it. + */ void -ipv4_input(struct ifnet *ifp, struct mbuf *m) +ip_ours(struct mbuf *m) { niq_enqueue(&ipintrq, m); } +/* + * Dequeue and process locally delivered packets. + */ void ipintr(void) { struct mbuf *m; - /* - * Get next datagram off input queue and get IP header - * in first mbuf. - */ while ((m = niq_dequeue(&ipintrq)) != NULL) { -#ifdef DIAGNOSTIC +#ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("ipintr no HDR"); #endif - ip_input(m); + ip_local(m); } } @@ -237,18 +242,13 @@ ipintr(void) * Checksum and byte swap header. Process options. Forward or deliver. */ void -ip_input(struct mbuf *m) +ipv4_input(struct ifnet *ifp, struct mbuf *m) { - struct ifnet *ifp; struct rtentry *rt = NULL; struct ip *ip; int hlen, len; in_addr_t pfrdr = 0; - ifp = if_get(m->m_pkthdr.ph_ifidx); - if (ifp == NULL) - goto bad; - ipstat_inc(ips_total); if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { @@ -462,13 +462,11 @@ ip_input(struct mbuf *m) #endif /* IPSEC */ ip_forward(m, ifp, rt, pfrdr); - if_put(ifp); return; bad: m_freem(m); out: rtfree(rt); - if_put(ifp); } /* @@ -477,13 +475,15 @@ out: * If fragmented try to reassemble. Pass to next level. */ void -ip_ours(struct mbuf *m) +ip_local(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); struct ipq *fp; struct ipqent *ipqe; int mff, hlen; + KERNEL_ASSERT_LOCKED(); + hlen = ip->ip_hl << 2; /* @@ -1680,18 +1680,37 @@ ip_send_dispatch(void *xmq) struct mbuf *m; struct mbuf_list ml; int s; +#ifdef IPSEC + int locked = 0; +#endif /* IPSEC */ mq_delist(mq, &ml); if (ml_empty(&ml)) return; - KERNEL_LOCK(); +#ifdef IPSEC + /* + * IPsec is not ready to run without KERNEL_LOCK(). So all + * the traffic on your machine is punished if you have IPsec + * enabled. + */ + extern int ipsec_in_use; + if (ipsec_in_use) { + KERNEL_LOCK(); + locked = 1; + } +#endif /* IPSEC */ + NET_LOCK(s); while ((m = ml_dequeue(&ml)) != NULL) { ip_output(m, NULL, NULL, 0, NULL, NULL, 0); } NET_UNLOCK(s); - KERNEL_UNLOCK(); + +#ifdef IPSEC + if (locked) + KERNEL_UNLOCK(); +#endif /* IPSEC */ } void Index: netinet/ip_var.h =================================================================== RCS file: /cvs/src/sys/netinet/ip_var.h,v retrieving revision 1.77 diff -u -p -r1.77 ip_var.h --- netinet/ip_var.h 30 May 2017 07:50:37 -0000 1.77 +++ netinet/ip_var.h 30 May 2017 07:57:31 -0000 @@ -248,7 +248,6 @@ int ip_sysctl(int *, u_int, void *, siz void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ipintr(void); -void ip_input(struct mbuf *); void ip_deliver(struct mbuf **, int *, int, int); void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int); int rip_ctloutput(int, struct socket *, int, int, struct mbuf *); Index: netinet6/ip6_input.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_input.c,v retrieving revision 1.192 diff -u -p -r1.192 ip6_input.c --- netinet6/ip6_input.c 30 May 2017 07:50:37 -0000 1.192 +++ netinet6/ip6_input.c 30 May 2017 09:45:25 -0000 @@ -119,6 +119,7 @@ struct niqueue ip6intrq = NIQUEUE_INITIA struct cpumem *ip6counters; void ip6_ours(struct mbuf *); +void ip6_local(struct mbuf *); int ip6_check_rh0hdr(struct mbuf *, int *); int ip6_hbhchcheck(struct mbuf *, int *, int *, int *); int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); @@ -160,28 +161,37 @@ ip6_init(void) ip6counters = counters_alloc(ip6s_ncounters); } +/* + * Enqueue packet for local delivery. Queuing is used as a boundary + * between the network layer (input/forward path) running without + * KERNEL_LOCK() and the transport layer still needing it. + */ void -ipv6_input(struct ifnet *ifp, struct mbuf *m) +ip6_ours(struct mbuf *m) { niq_enqueue(&ip6intrq, m); } /* - * IP6 input interrupt handling. Just pass the packet to ip6_input. + * Dequeue and process locally delivered packets. */ void ip6intr(void) { struct mbuf *m; - while ((m = niq_dequeue(&ip6intrq)) != NULL) - ip6_input(m); + while ((m = niq_dequeue(&ip6intrq)) != NULL) { +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ipintr no HDR"); +#endif + ip6_local(m); + } } void -ip6_input(struct mbuf *m) +ipv6_input(struct ifnet *ifp, struct mbuf *m) { - struct ifnet *ifp; struct ip6_hdr *ip6; struct sockaddr_in6 sin6; struct rtentry *rt = NULL; @@ -192,10 +202,6 @@ ip6_input(struct mbuf *m) #endif int srcrt = 0; - ifp = if_get(m->m_pkthdr.ph_ifidx); - if (ifp == NULL) - goto bad; - ip6stat_inc(ip6s_total); if (m->m_len < sizeof(struct ip6_hdr)) { @@ -441,8 +447,8 @@ ip6_input(struct mbuf *m) inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst)); /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, - "ip6_input: packet to an unready address %s->%s\n", - src, dst)); + "%s: packet to an unready address %s->%s\n", + __func__, src, dst)); goto bad; } else { @@ -500,11 +506,10 @@ ip6_input(struct mbuf *m) m_freem(m); out: rtfree(rt); - if_put(ifp); } void -ip6_ours(struct mbuf *m) +ip6_local(struct mbuf *m) { int off, nxt; @@ -1456,18 +1461,37 @@ ip6_send_dispatch(void *xmq) struct mbuf *m; struct mbuf_list ml; int s; +#ifdef IPSEC + int locked = 0; +#endif /* IPSEC */ mq_delist(mq, &ml); if (ml_empty(&ml)) return; - KERNEL_LOCK(); +#ifdef IPSEC + /* + * IPsec is not ready to run without KERNEL_LOCK(). So all + * the traffic on your machine is punished if you have IPsec + * enabled. + */ + extern int ipsec_in_use; + if (ipsec_in_use) { + KERNEL_LOCK(); + locked = 1; + } +#endif /* IPSEC */ + NET_LOCK(s); while ((m = ml_dequeue(&ml)) != NULL) { ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL); } NET_UNLOCK(s); - KERNEL_UNLOCK(); + +#ifdef IPSEC + if (locked) + KERNEL_UNLOCK(); +#endif /* IPSEC */ } void Index: netinet6/ip6_var.h =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_var.h,v retrieving revision 1.74 diff -u -p -r1.74 ip6_var.h --- netinet6/ip6_var.h 28 May 2017 09:25:51 -0000 1.74 +++ netinet6/ip6_var.h 30 May 2017 08:06:20 -0000 @@ -303,7 +303,6 @@ int icmp6_ctloutput(int, struct socket * void ip6_init(void); void ip6intr(void); -void ip6_input(struct mbuf *); void ip6_deliver(struct mbuf **, int *, int, int); void ip6_freepcbopts(struct ip6_pktopts *); void ip6_freemoptions(struct ip6_moptions *);