Module Name:    src
Committed By:   ozaki-r
Date:           Thu Feb  2 02:52:10 UTC 2017

Modified Files:
        src/sys/net: files.net
        src/sys/netinet: ip_carp.c ip_icmp.c
        src/sys/netinet6: icmp6.c
        src/sys/rump/librump/rumpnet: Makefile.rumpnet
Added Files:
        src/sys/netinet: wqinput.c wqinput.h

Log Message:
Defer some pr_input to workqueue

pr_input is currently called in softint. Some pr_input such as ICMP, ICMPv6
and CARP can add/delete/update IP addresses and routing table entries. For
example, icmp6_redirect_input updates a routing table entry and
nd6_ra_input may delete an IP address.

Basically such operations shouldn't be done in softint. That aside, we have
a reason to avoid the situation; psz/psref waits cannot be used in softint,
however they are required to work in such pr_input in the MP-safe world.

The change implements the workqueue pr_input framework called wqinput which
provides a means to defer pr_input of a protocol to workqueue easily.
Currently icmp_input, icmp6_input, carp_proto_input and carp6_proto_input
are deferred to workqueue by the framework.

Proposed and discussed on tech-kern and tech-net


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/sys/net/files.net
cvs rdiff -u -r1.83 -r1.84 src/sys/netinet/ip_carp.c
cvs rdiff -u -r1.155 -r1.156 src/sys/netinet/ip_icmp.c
cvs rdiff -u -r0 -r1.1 src/sys/netinet/wqinput.c src/sys/netinet/wqinput.h
cvs rdiff -u -r1.206 -r1.207 src/sys/netinet6/icmp6.c
cvs rdiff -u -r1.20 -r1.21 src/sys/rump/librump/rumpnet/Makefile.rumpnet

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/net/files.net
diff -u src/sys/net/files.net:1.11 src/sys/net/files.net:1.12
--- src/sys/net/files.net:1.11	Fri Sep 16 03:10:45 2016
+++ src/sys/net/files.net	Thu Feb  2 02:52:10 2017
@@ -1,4 +1,4 @@
-#	$NetBSD: files.net,v 1.11 2016/09/16 03:10:45 pgoyette Exp $
+#	$NetBSD: files.net,v 1.12 2017/02/02 02:52:10 ozaki-r Exp $
 
 # XXX CLEANUP
 define	net
@@ -62,6 +62,7 @@ file	netinet/ip_carp.c		carp & (inet | i
 file	netinet/ip_ecn.c		ipsec | gif | stf
 file	netinet/ip_encap.c		inet | inet6
 file	netinet/ip_etherip.c		etherip & inet
+file	netinet/wqinput.c		inet | inet6
 file	netinet6/ip6_etherip.c		etherip & inet6
 file	netinet6/in6_gif.c		gif & inet6
 

Index: src/sys/netinet/ip_carp.c
diff -u src/sys/netinet/ip_carp.c:1.83 src/sys/netinet/ip_carp.c:1.84
--- src/sys/netinet/ip_carp.c:1.83	Mon Jan 16 15:44:47 2017
+++ src/sys/netinet/ip_carp.c	Thu Feb  2 02:52:10 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: ip_carp.c,v 1.83 2017/01/16 15:44:47 christos Exp $	*/
+/*	$NetBSD: ip_carp.c,v 1.84 2017/02/02 02:52:10 ozaki-r Exp $	*/
 /*	$OpenBSD: ip_carp.c,v 1.113 2005/11/04 08:11:54 mcbride Exp $	*/
 
 /*
@@ -33,7 +33,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ip_carp.c,v 1.83 2017/01/16 15:44:47 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ip_carp.c,v 1.84 2017/02/02 02:52:10 ozaki-r Exp $");
 
 /*
  * TODO:
@@ -70,6 +70,7 @@ __KERNEL_RCSID(0, "$NetBSD: ip_carp.c,v 
 #include <net/netisr.h>
 #include <net/net_stats.h>
 #include <netinet/if_inarp.h>
+#include <netinet/wqinput.h>
 
 #if NFDDI > 0
 #include <net/if_fddi.h>
@@ -234,6 +235,14 @@ static void	carp_ether_purgemulti(struct
 
 static void	sysctl_net_inet_carp_setup(struct sysctllog **);
 
+/* workqueue-based pr_input */
+static struct wqinput *carp_wqinput;
+static void _carp_proto_input(struct mbuf *, int, int);
+#ifdef INET6
+static struct wqinput *carp6_wqinput;
+static void _carp6_proto_input(struct mbuf *, int, int);
+#endif
+
 struct if_clone carp_cloner =
     IF_CLONE_INITIALIZER("carp", carp_clone_create, carp_clone_destroy);
 
@@ -468,19 +477,15 @@ carp_setroute(struct carp_softc *sc, int
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  */
-void
-carp_proto_input(struct mbuf *m, ...)
+static void
+_carp_proto_input(struct mbuf *m, int hlen, int proto)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_softc *sc = NULL;
 	struct carp_header *ch;
 	int iplen, len;
-	va_list ap;
 	struct ifnet *rcvif;
 
-	va_start(ap, m);
-	va_end(ap);
-
 	CARP_STATINC(CARP_STAT_IPACKETS);
 	MCLAIM(m, &carp_proto_mowner_rx);
 
@@ -542,11 +547,17 @@ carp_proto_input(struct mbuf *m, ...)
 	carp_proto_input_c(m, ch, AF_INET);
 }
 
+void
+carp_proto_input(struct mbuf *m, ...)
+{
+
+	wqinput_input(carp_wqinput, m, 0, 0);
+}
+
 #ifdef INET6
-int
-carp6_proto_input(struct mbuf **mp, int *offp, int proto)
+static void
+_carp6_proto_input(struct mbuf *m, int off, int proto)
 {
-	struct mbuf *m = *mp;
 	struct carp_softc *sc = NULL;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct carp_header *ch;
@@ -558,7 +569,7 @@ carp6_proto_input(struct mbuf **mp, int 
 
 	if (!carp_opts[CARPCTL_ALLOW]) {
 		m_freem(m);
-		return (IPPROTO_DONE);
+		return;
 	}
 
 	rcvif = m_get_rcvif_NOMPSAFE(m);
@@ -569,7 +580,7 @@ carp6_proto_input(struct mbuf **mp, int 
 		CARP_LOG(sc, ("packet received on non-carp interface: %s",
 		    rcvif->if_xname));
 		m_freem(m);
-		return (IPPROTO_DONE);
+		return;
 	}
 
 	/* verify that the IP TTL is 255 */
@@ -578,31 +589,40 @@ carp6_proto_input(struct mbuf **mp, int 
 		CARP_LOG(sc, ("received ttl %d != %d on %s", ip6->ip6_hlim,
 		    CARP_DFLTTL, rcvif->if_xname));
 		m_freem(m);
-		return (IPPROTO_DONE);
+		return;
 	}
 
 	/* verify that we have a complete carp packet */
 	len = m->m_len;
-	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
+	IP6_EXTHDR_GET(ch, struct carp_header *, m, off, sizeof(*ch));
 	if (ch == NULL) {
 		CARP_STATINC(CARP_STAT_BADLEN);
 		CARP_LOG(sc, ("packet size %u too small", len));
-		return (IPPROTO_DONE);
+		return;
 	}
 
 
 	/* verify the CARP checksum */
-	m->m_data += *offp;
+	m->m_data += off;
 	if (carp_cksum(m, sizeof(*ch))) {
 		CARP_STATINC(CARP_STAT_BADSUM);
 		CARP_LOG(sc, ("checksum failed, on %s", rcvif->if_xname));
 		m_freem(m);
-		return (IPPROTO_DONE);
+		return;
 	}
-	m->m_data -= *offp;
+	m->m_data -= off;
 
 	carp_proto_input_c(m, ch, AF_INET6);
-	return (IPPROTO_DONE);
+	return;
+}
+
+int
+carp6_proto_input(struct mbuf **mp, int *offp, int proto)
+{
+
+	wqinput_input(carp6_wqinput, *mp, *offp, proto);
+
+	return IPPROTO_DONE;
 }
 #endif /* INET6 */
 
@@ -2342,6 +2362,11 @@ carp_init(void)
 	MOWNER_ATTACH(&carp_proto6_mowner_rx);
 	MOWNER_ATTACH(&carp_proto6_mowner_tx);
 #endif
+
+	carp_wqinput = wqinput_create("carp", _carp_proto_input);
+#ifdef INET6
+	carp6_wqinput = wqinput_create("carp6", _carp6_proto_input);
+#endif
 }
 
 static void

Index: src/sys/netinet/ip_icmp.c
diff -u src/sys/netinet/ip_icmp.c:1.155 src/sys/netinet/ip_icmp.c:1.156
--- src/sys/netinet/ip_icmp.c:1.155	Tue Jan 24 07:09:24 2017
+++ src/sys/netinet/ip_icmp.c	Thu Feb  2 02:52:10 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: ip_icmp.c,v 1.155 2017/01/24 07:09:24 ozaki-r Exp $	*/
+/*	$NetBSD: ip_icmp.c,v 1.156 2017/02/02 02:52:10 ozaki-r Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -94,7 +94,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ip_icmp.c,v 1.155 2017/01/24 07:09:24 ozaki-r Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ip_icmp.c,v 1.156 2017/02/02 02:52:10 ozaki-r Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ipsec.h"
@@ -125,6 +125,7 @@ __KERNEL_RCSID(0, "$NetBSD: ip_icmp.c,v 
 #include <netinet/in_proto.h>
 #include <netinet/icmp_var.h>
 #include <netinet/icmp_private.h>
+#include <netinet/wqinput.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
@@ -175,6 +176,10 @@ static void icmp_redirect_timeout(struct
 
 static void sysctl_netinet_icmp_setup(struct sysctllog **);
 
+/* workqueue-based pr_input */
+static struct wqinput *icmp_wqinput;
+static void _icmp_input(struct mbuf *, int, int);
+
 void
 icmp_init(void)
 {
@@ -191,6 +196,7 @@ icmp_init(void)
 	}
 
 	icmpstat_percpu = percpu_alloc(sizeof(uint64_t) * ICMP_NSTATS);
+	icmp_wqinput = wqinput_create("icmp", _icmp_input);
 }
 
 /*
@@ -384,10 +390,9 @@ struct sockaddr_in icmpmask = { 
 /*
  * Process a received ICMP message.
  */
-void
-icmp_input(struct mbuf *m, ...)
+static void
+_icmp_input(struct mbuf *m, int hlen, int proto)
 {
-	int proto;
 	struct icmp *icp;
 	struct ip *ip = mtod(m, struct ip *);
 	int icmplen;
@@ -395,15 +400,8 @@ icmp_input(struct mbuf *m, ...)
 	struct in_ifaddr *ia;
 	void *(*ctlfunc)(int, const struct sockaddr *, void *);
 	int code;
-	int hlen;
-	va_list ap;
 	struct rtentry *rt;
 
-	va_start(ap, m);
-	hlen = va_arg(ap, int);
-	proto = va_arg(ap, int);
-	va_end(ap);
-
 	/*
 	 * Locate icmp structure in mbuf, and check
 	 * that not corrupted and of at least minimum length.
@@ -685,6 +683,20 @@ freeit:
 	return;
 }
 
+void
+icmp_input(struct mbuf *m, ...)
+{
+	int hlen, proto;
+	va_list ap;
+
+	va_start(ap, m);
+	hlen = va_arg(ap, int);
+	proto = va_arg(ap, int);
+	va_end(ap);
+
+	wqinput_input(icmp_wqinput, m, hlen, proto);
+}
+
 /*
  * Reflect the ip packet back to the source
  */

Index: src/sys/netinet6/icmp6.c
diff -u src/sys/netinet6/icmp6.c:1.206 src/sys/netinet6/icmp6.c:1.207
--- src/sys/netinet6/icmp6.c:1.206	Mon Jan 16 15:44:47 2017
+++ src/sys/netinet6/icmp6.c	Thu Feb  2 02:52:10 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: icmp6.c,v 1.206 2017/01/16 15:44:47 christos Exp $	*/
+/*	$NetBSD: icmp6.c,v 1.207 2017/02/02 02:52:10 ozaki-r Exp $	*/
 /*	$KAME: icmp6.c,v 1.217 2001/06/20 15:03:29 jinmei Exp $	*/
 
 /*
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: icmp6.c,v 1.206 2017/01/16 15:44:47 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: icmp6.c,v 1.207 2017/02/02 02:52:10 ozaki-r Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_inet.h"
@@ -90,6 +90,7 @@ __KERNEL_RCSID(0, "$NetBSD: icmp6.c,v 1.
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
+#include <netinet/wqinput.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/ip6_private.h>
 #include <netinet/icmp6.h>
@@ -169,6 +170,9 @@ static void icmp6_mtudisc_timeout(struct
 static void icmp6_redirect_timeout(struct rtentry *, struct rttimer *);
 static void sysctl_net_inet6_icmp6_setup(struct sysctllog **);
 
+/* workqueue-based pr_input */
+static struct wqinput *icmp6_wqinput;
+static void _icmp6_input(struct mbuf *m, int off, int proto);
 
 void
 icmp6_init(void)
@@ -180,6 +184,8 @@ icmp6_init(void)
 	icmp6_redirect_timeout_q = rt_timer_queue_create(icmp6_redirtimeout);
 
 	icmp6stat_percpu = percpu_alloc(sizeof(uint64_t) * ICMP6_NSTATS);
+
+	icmp6_wqinput = wqinput_create("icmp6", _icmp6_input);
 }
 
 static void
@@ -444,13 +450,12 @@ icmp6_error(struct mbuf *m, int type, in
 /*
  * Process a received ICMP6 message.
  */
-int
-icmp6_input(struct mbuf **mp, int *offp, int proto)
+static void
+_icmp6_input(struct mbuf *m, int off, int proto)
 {
-	struct mbuf *m = *mp, *n;
+	struct mbuf *n;
 	struct ip6_hdr *ip6, *nip6;
 	struct icmp6_hdr *icmp6, *nicmp6;
-	int off = *offp;
 	int icmp6len = m->m_pkthdr.len - off;
 	int code, sum, noff;
 	struct ifnet *rcvif;
@@ -879,7 +884,7 @@ icmp6_input(struct mbuf **mp, int *offp,
 		if (icmp6_notify_error(m, off, icmp6len, code)) {
 			/* In this case, m should've been freed. */
 			m_put_rcvif_psref(rcvif, &psref);
-			return (IPPROTO_DONE);
+			return;
 		}
 		break;
 
@@ -896,11 +901,20 @@ icmp6_input(struct mbuf **mp, int *offp,
 	/* deliver the packet to appropriate sockets */
 	icmp6_rip6_input(&m, off);
 
-	return IPPROTO_DONE;
+	return;
 
  freeit:
 	m_put_rcvif_psref(rcvif, &psref);
 	m_freem(m);
+	return;
+}
+
+int
+icmp6_input(struct mbuf **mp, int *offp, int proto)
+{
+
+	wqinput_input(icmp6_wqinput, *mp, *offp, proto);
+
 	return IPPROTO_DONE;
 }
 

Index: src/sys/rump/librump/rumpnet/Makefile.rumpnet
diff -u src/sys/rump/librump/rumpnet/Makefile.rumpnet:1.20 src/sys/rump/librump/rumpnet/Makefile.rumpnet:1.21
--- src/sys/rump/librump/rumpnet/Makefile.rumpnet:1.20	Tue Jan 17 08:10:37 2017
+++ src/sys/rump/librump/rumpnet/Makefile.rumpnet	Thu Feb  2 02:52:10 2017
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile.rumpnet,v 1.20 2017/01/17 08:10:37 ozaki-r Exp $
+#	$NetBSD: Makefile.rumpnet,v 1.21 2017/02/02 02:52:10 ozaki-r Exp $
 #
 
 LIB=		rumpnet
@@ -38,6 +38,9 @@ SRCS+=	
 # bpf stubs, required for all kernels
 SRCS+=	bpf_stub.c
 
+# workqueue-based pr_input (required by inet and inet6)
+SRCS+=	wqinput.c
+
 CPPFLAGS+=	-I${RUMPTOP}/librump/rumpkern
 
 .include "${RUMPTOP}/Makefile.rump"

Added files:

Index: src/sys/netinet/wqinput.c
diff -u /dev/null src/sys/netinet/wqinput.c:1.1
--- /dev/null	Thu Feb  2 02:52:10 2017
+++ src/sys/netinet/wqinput.c	Thu Feb  2 02:52:10 2017
@@ -0,0 +1,267 @@
+/*	$NetBSD: wqinput.c,v 1.1 2017/02/02 02:52:10 ozaki-r Exp $	*/
+
+/*-
+ * Copyright (c) 2017 Internet Initiative Japan Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+#include <sys/workqueue.h>
+#include <sys/atomic.h>
+#include <sys/queue.h>
+#include <sys/percpu.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <netinet/wqinput.h>
+
+#define WQINPUT_LIST_MAXLEN	IFQ_MAXLEN
+
+struct wqinput_work {
+	struct mbuf	*ww_mbuf;
+	int		ww_off;
+	int		ww_proto;
+	struct wqinput_work *ww_next;
+};
+
+struct wqinput_worklist {
+	/*
+	 * XXX: TAILQ cannot be used because TAILQ_INIT memorizes the address
+	 * of percpu data while percpu(9) may move percpu data during bootup.
+	 */
+	struct wqinput_work *wwl_head;
+	struct wqinput_work *wwl_tail;
+	unsigned int	wwl_len;
+	unsigned long	wwl_dropped;
+	struct work	wwl_work;
+	bool		wwl_wq_is_active;
+};
+
+struct wqinput {
+	struct workqueue *wqi_wq;
+	struct pool	wqi_work_pool;
+	struct percpu	*wqi_worklists; /* struct wqinput_worklist */
+	void    	(*wqi_input)(struct mbuf *, int, int);
+};
+
+static void wqinput_work(struct work *, void *);
+static void wqinput_sysctl_setup(const char *, struct wqinput *);
+
+static void
+wqinput_drops(void *p, void *arg, struct cpu_info *ci __unused)
+{
+	struct wqinput_worklist *const wwl = p;
+	int *sum = arg;
+
+	*sum += wwl->wwl_dropped;
+}
+
+static int
+wqinput_sysctl_drops_handler(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	struct wqinput *wqi;
+	int sum = 0;
+	int error;
+
+	node = *rnode;
+	wqi = node.sysctl_data;
+
+	percpu_foreach(wqi->wqi_worklists, wqinput_drops, &sum);
+
+	node.sysctl_data = &sum;
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error != 0 || newp == NULL)
+		return error;
+
+	return 0;
+}
+
+static void
+wqinput_sysctl_setup(const char *name, struct wqinput *wqi)
+{
+	const struct sysctlnode *cnode, *rnode;
+	int error;
+
+	error = sysctl_createv(NULL, 0, NULL, &rnode,
+	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "wqinput",
+	    SYSCTL_DESCR("workqueue-based pr_input controls"),
+	    NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL);
+	if (error != 0)
+		goto bad;
+
+	error = sysctl_createv(NULL, 0, &rnode, &rnode,
+	    CTLFLAG_PERMANENT, CTLTYPE_NODE, name,
+	    SYSCTL_DESCR("Protocol controls for workqueue-based pr_input"),
+	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
+	if (error != 0)
+		goto bad;
+
+	error = sysctl_createv(NULL, 0, &rnode, &rnode,
+	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "inputq",
+	    SYSCTL_DESCR("wqinput input queue controls"),
+	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
+	if (error != 0)
+		goto bad;
+
+	error = sysctl_createv(NULL, 0, &rnode, &cnode,
+	    CTLFLAG_PERMANENT, CTLTYPE_INT, "drops",
+	    SYSCTL_DESCR("Total packets dropped due to full input queue"),
+	    wqinput_sysctl_drops_handler, 0, (void *)wqi, 0, CTL_CREATE, CTL_EOL);
+	if (error != 0)
+		goto bad;
+
+	return;
+bad:
+	log(LOG_ERR, "%s: could not create a sysctl node for %s\n",
+	    __func__, name);
+	return;
+}
+
+struct wqinput *
+wqinput_create(const char *name, void (*func)(struct mbuf *, int, int))
+{
+	struct wqinput *wqi;
+	int error;
+	char namebuf[32];
+
+	snprintf(namebuf, sizeof(namebuf), "%s_wqinput", name);
+
+	wqi = kmem_alloc(sizeof(*wqi), KM_SLEEP);
+
+	error = workqueue_create(&wqi->wqi_wq, namebuf, wqinput_work, wqi,
+	    PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE|WQ_PERCPU);
+	if (error != 0)
+		panic("%s: workqueue_create failed (%d)\n", __func__, error);
+	pool_init(&wqi->wqi_work_pool, sizeof(struct wqinput_work), 0, 0, 0,
+	    namebuf, NULL, IPL_SOFTNET);
+	wqi->wqi_worklists = percpu_alloc(sizeof(struct wqinput_worklist));
+	wqi->wqi_input = func;
+
+	wqinput_sysctl_setup(name, wqi);
+
+	return wqi;
+}
+
+static struct wqinput_work *
+wqinput_work_get(struct wqinput_worklist *wwl)
+{
+	struct wqinput_work *work;
+
+	/* Must be called at IPL_SOFTNET */
+
+	work = wwl->wwl_head;
+	if (work != NULL) {
+		KASSERTMSG(wwl->wwl_len > 0, "wwl->wwl_len=%d", wwl->wwl_len);
+		wwl->wwl_len--;
+		wwl->wwl_head = work->ww_next;
+		work->ww_next = NULL;
+
+		if (wwl->wwl_head == NULL)
+			wwl->wwl_tail = NULL;
+	} else {
+		KASSERT(wwl->wwl_len == 0);
+	}
+
+	return work;
+}
+
+static void
+wqinput_work(struct work *wk, void *arg)
+{
+	struct wqinput *wqi = arg;
+	struct wqinput_work *work;
+	struct wqinput_worklist *wwl;
+	int s;
+
+	/* Users expect to run at IPL_SOFTNET */
+	s = splsoftnet();
+	/* This also prevents LWP migrations between CPUs */
+	wwl = percpu_getref(wqi->wqi_worklists);
+
+	/* We can allow enqueuing another work at this point */
+	wwl->wwl_wq_is_active = false;
+
+	while ((work = wqinput_work_get(wwl)) != NULL) {
+		mutex_enter(softnet_lock);
+		wqi->wqi_input(work->ww_mbuf, work->ww_off, work->ww_proto);
+		mutex_exit(softnet_lock);
+
+		pool_put(&wqi->wqi_work_pool, work);
+	}
+
+	percpu_putref(wqi->wqi_worklists);
+	splx(s);
+}
+
+static void
+wqinput_work_put(struct wqinput_worklist *wwl, struct wqinput_work *work)
+{
+
+	if (wwl->wwl_tail != NULL) {
+		wwl->wwl_tail->ww_next = work;
+	} else {
+		wwl->wwl_head = work;
+	}
+	wwl->wwl_tail = work;
+	wwl->wwl_len++;
+}
+
+/*
+ * Queue m on this CPU's worklist and kick the workqueue if idle.  m is
+ * freed and counted as dropped if the list is full or allocation fails.
+ */
+void
+wqinput_input(struct wqinput *wqi, struct mbuf *m, int off, int proto)
+{
+	struct wqinput_worklist *wwl = percpu_getref(wqi->wqi_worklists);
+	struct wqinput_work *work = NULL;
+
+	/* Bound the queue; pool_get(PR_NOWAIT) may also fail with NULL */
+	if (wwl->wwl_len < WQINPUT_LIST_MAXLEN)
+		work = pool_get(&wqi->wqi_work_pool, PR_NOWAIT);
+	if (work == NULL) {
+		wwl->wwl_dropped++;
+		m_freem(m);
+		goto out;
+	}
+	work->ww_mbuf = m;
+	work->ww_off = off;
+	work->ww_proto = proto;
+	work->ww_next = NULL;
+	wqinput_work_put(wwl, work);
+
+	/* Avoid enqueuing another work when one is already enqueued */
+	if (!wwl->wwl_wq_is_active) {
+		wwl->wwl_wq_is_active = true;
+		workqueue_enqueue(wqi->wqi_wq, &wwl->wwl_work, NULL);
+	}
+out:
+	percpu_putref(wqi->wqi_worklists);
+}
Index: src/sys/netinet/wqinput.h
diff -u /dev/null src/sys/netinet/wqinput.h:1.1
--- /dev/null	Thu Feb  2 02:52:10 2017
+++ src/sys/netinet/wqinput.h	Thu Feb  2 02:52:10 2017
@@ -0,0 +1,42 @@
+/*	$NetBSD: wqinput.h,v 1.1 2017/02/02 02:52:10 ozaki-r Exp $	*/
+
+/*-
+ * Copyright (c) 2017 Internet Initiative Japan Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_WQINPUT_H_
+#define _NETINET_WQINPUT_H_
+
+#if !defined(_KERNEL)
+#error "not supposed to be exposed to userland."
+#endif
+
+#include <sys/mbuf.h>
+
+struct wqinput;
+struct wqinput *wqinput_create(const char *, void(*)(struct mbuf *, int, int));
+void wqinput_input(struct wqinput *, struct mbuf *, int, int);
+
+#endif /* _NETINET_WQINPUT_H_ */

Reply via email to