From: Anton Ivanov <[email protected]>
Support for multi-packet vector IO - multiple packets
read in one syscall and (optionally) written in one syscall.
Support for (optional) queueing on EAGAIN/ENOBUFS - applies
only to socket transports. Sorry TAP, -EYOULOSE - it will remain
slower than any socket transport for a very long time because
sendmmsg/recvmmsg is supported only for sockets, not for tap fds.
Should work with legacy UML; thoroughly tested only with the epoll
based IRQ controller.
Minimal host kernel version for RX - 2.6.32
Minimal host kernel version for TX - 3.0 - optional, config
option UML_NET_VECTOR_TX
Tested on Debian 7.0/Ubuntu 12.x LTS hosts, which have the relevant
syscalls but do not have the appropriate glibc wrapper for TX
(this is why it is invoked as a direct syscall).
Tested thoroughly with Debian and OpenWRT guests across a range of
kernels (3.2, 3.3, 3.4, 3.8, 3.12).
Signed-off-by: Anton Ivanov <[email protected]>
---
Issues addressed in this version:
1. Incorrect IRQ release in the close procedure; close now works correctly
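
Reviewer note (not part of the patch): the whole scheme hinges on
recvmmsg()/sendmmsg() moving a vector of datagrams per syscall, with
EAGAIN/ENOBUFS treated as "nothing moved, keep the queue". Below is a
minimal, self-contained userspace sketch of that pattern over an
already-connected datagram socket - the names vec_init/vec_rx/vec_tx and
VEC_LEN are illustrative only and do not appear in the patch.

/* Illustrative sketch only - not the driver code from this patch. */
#define _GNU_SOURCE
#include <errno.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define VEC_LEN  32
#define PKT_SIZE 1536

static char bufs[VEC_LEN][PKT_SIZE];
static struct iovec iov[VEC_LEN];
static struct mmsghdr msgs[VEC_LEN];

/* Point every mmsghdr at one fixed-size packet buffer. */
void vec_init(void)
{
	int i;

	for (i = 0; i < VEC_LEN; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = PKT_SIZE;
		msgs[i].msg_hdr.msg_iov = &iov[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
}

/* Read up to VEC_LEN packets in one syscall; 0 means "nothing there, try later". */
int vec_rx(int fd)
{
	int n = recvmmsg(fd, msgs, VEC_LEN, MSG_DONTWAIT, NULL);

	if (n < 0)
		return (errno == EAGAIN) ? 0 : -errno;
	return n;	/* msgs[i].msg_len holds each packet's length */
}

/* Send the first 'depth' queued packets; the caller re-queues the unsent tail. */
int vec_tx(int fd, int depth)
{
	int n = sendmmsg(fd, msgs, depth, 0);

	if (n < 0)
		return (errno == EAGAIN || errno == ENOBUFS) ? 0 : -errno;
	return n;	/* packets actually handed to the kernel */
}

In the patch itself the same idea is driven by the mmsg_queue_info ring
(head/tail plus depth) in net_extra_kern.c, with the partial-send result
used to advance the head.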
arch/um/Kconfig.net | 9 ++
arch/um/drivers/Makefile | 2 +-
arch/um/drivers/net_extra_kern.c | 308 +++++++++++++++++++++++++++++++++++
arch/um/drivers/net_extra_user.c | 317 +++++++++++++++++++++++++++++++++++++
arch/um/drivers/net_kern.c | 61 ++++---
arch/um/include/asm/irq.h | 26 +--
arch/um/include/shared/net_kern.h | 31 ++++
arch/um/include/shared/net_user.h | 24 +++
arch/um/kernel/irq.c | 5 +
9 files changed, 751 insertions(+), 32 deletions(-)
create mode 100644 arch/um/drivers/net_extra_kern.c
create mode 100644 arch/um/drivers/net_extra_user.c
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 820a56f..e4a7cf2 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -21,6 +21,15 @@ config UML_NET
enable at least one of the following transport options to actually
make use of UML networking.
+config UML_NET_VECTOR_TX
+ bool "Vector transmit in network devices"
+ depends on UML_NET
+ help
+ Accelerate network IO by using the sendmmsg() Linux syscall. This option
+ requires the host running UML to run at least Linux 3.0.
+ Presently the acceleration applies only to forwarding (including firewall,
+ NAT, etc.), where it yields a 25%+ improvement in packet rates and throughput.
+
config UML_NET_ETHERTAP
bool "Ethertap transport"
depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index e7582e1..836baaf 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -10,7 +10,7 @@ slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
umcast-objs := umcast_kern.o umcast_user.o
-net-objs := net_kern.o net_user.o
+net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
mconsole-objs := mconsole_kern.o mconsole_user.o
hostaudio-objs := hostaudio_kern.o
ubd-objs := ubd_kern.o ubd_user.o
diff --git a/arch/um/drivers/net_extra_kern.c b/arch/um/drivers/net_extra_kern.c
new file mode 100644
index 0000000..5ee6f9b
--- /dev/null
+++ b/arch/um/drivers/net_extra_kern.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek ([email protected]) and
+ * James Leu ([email protected]).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include "init.h"
+#include "irq_kern.h"
+#include "irq_user.h"
+#include "mconsole_kern.h"
+#include "net_kern.h"
+#include "net_user.h"
+
+#define DRIVER_NAME "uml-netdev"
+
+/*
+ These are wrappers around key kernel side functions so we can
+ invoke them from the user side of our schizophrenic self
+
+*/
+
+extern spinlock_t uml_sigio_lock;
+extern int in_epoll_loop;
+
+static DEFINE_SPINLOCK(net_queue_list);
+
+static struct mmsg_queue_info * pending_queue = NULL;
+
+void uml_net_destroy_skb(void * skb)
+{
+ if (skb) {
+ kfree_skb((struct sk_buff *) skb);
+ }
+}
+
+void * uml_net_build_skb (void * dev)
+{
+ struct uml_net_private *lp = netdev_priv((struct net_device *) dev);
+ struct sk_buff * skb;
+
+ skb = dev_alloc_skb(lp->max_packet + 32);
+ if (skb) {
+ /* add some tunneling space just in case, we usually do not need it as we use vector IO */
+ skb_reserve(skb,32);
+ skb->dev = dev;
+ skb_put(skb, lp->max_packet);
+ skb_reset_mac_header(skb);
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ printk("Failed Atomic SKB Allocation, will drop\n");
+ }
+ return skb;
+}
+
+void * uml_net_skb_data (void * skb) {
+ if (skb) {
+ return ((struct sk_buff *) skb)->data;
+ } else {
+ printk("hole in vector!!!\n");
+ return NULL;
+ }
+}
+
+int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance)
+{
+ int queue_depth;
+ queue_info->head =
+ (queue_info->head + advance)
+ % queue_info->max_depth;
+
+ /* caller is already holding the head_lock */
+
+ spin_lock(&queue_info->tail_lock);
+ queue_info->queue_depth -= advance;
+
+ /* we are at 0, use this to
+ * reset head and tail so we can use max size vectors
+ */
+ if (queue_info->queue_depth == 0) {
+ queue_info->head = 0;
+ queue_info->tail = 0;
+ }
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->tail_lock);
+ return queue_depth;
+}
+
+/*
+* This is called by enqueuers which should hold the
+* head lock already
+*/
+
+int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance)
+{
+ int queue_depth;
+ queue_info->tail =
+ (queue_info->tail + advance)
+ % queue_info->max_depth;
+ spin_lock(&queue_info->head_lock);
+ queue_info->queue_depth += advance;
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->head_lock);
+ return queue_depth;
+}
+
+/*
+* Generic vector enqueue with support for forming headers using transport
+* specific callback. Allows GRE, L2TPv3, RAW (and potentially when ported)
+* daemon to use a common enqueue procedure in vector mode
+*/
+
+int uml_net_enqueue (
+ struct mmsg_queue_info * queue_info,
+ struct sk_buff * skb,
+ struct uml_net_private *lp,
+ void (*form_header)(void * header, struct sk_buff * skb, struct uml_net_private * lp),
+ void * remote_addr,
+ int remote_addr_size)
+{
+
+ int queue_depth;
+ struct sk_buff * mmsg_clone;
+ struct mmsghdr * mmsg_send_vector;
+ void ** skb_send_vector;
+ struct iovec * iov;
+
+ if (!queue_info) {
+ /* someone passed us a NULL queue */
+ return 0;
+ }
+
+ spin_lock(&queue_info->tail_lock);
+ spin_lock(&queue_info->head_lock);
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->head_lock);
+
+ if (queue_depth < queue_info->max_depth) {
+ mmsg_clone = skb_clone(skb, GFP_ATOMIC);
+ if (mmsg_clone) {
+
+ skb_send_vector = queue_info->skb_send_vector;
+ skb_send_vector += queue_info->tail;
+
+ (* skb_send_vector) = mmsg_clone;
+
+ mmsg_send_vector = queue_info->mmsg_send_vector;
+ mmsg_send_vector += queue_info->tail;
+
+ iov = mmsg_send_vector->msg_hdr.msg_iov;
+
+ if (iov) {
+ mmsg_send_vector->msg_hdr.msg_name = remote_addr;
+ mmsg_send_vector->msg_hdr.msg_namelen = remote_addr_size;
+ if (form_header != NULL) {
+ (* form_header)(iov->iov_base, skb, lp);
+ iov++;
+ }
+ iov->iov_base = skb->data;
+ iov->iov_len = skb->len;
+
+ queue_depth = uml_net_advance_tail(queue_info, 1);
+ } else {
+ printk("no iov, cannot enqueue\n");
+ }
+ } else {
+ printk("cloning failed\n");
+ }
+ }
+ spin_unlock(&queue_info->tail_lock);
+ return queue_depth;
+}
+
+static int send_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth)
+{
+ int fd = queue_info->fd;
+ struct mmsghdr * send_from;
+ void ** skb_send_vector;
+ int result = 0, send_len, skb_index, allowed_drop = 0;
+
+ if (! queue_info) {
+ /* someone passed a null queue, should not occur */
+ return 0;
+ }
+
+ if (spin_trylock(&queue_info->head_lock)) {
+ if (spin_trylock(&queue_info->tail_lock)) {
+ /* update queue_depth to current value */
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->tail_lock);
+ if (queue_depth > 0) {
+ send_len = queue_depth;
+ send_from = queue_info->mmsg_send_vector;
+ send_from += queue_info->head;
+ if (send_len + queue_info->head > queue_info->max_depth) {
+ send_len = queue_info->max_depth - queue_info->head;
+ }
+ if (send_len > 0) {
+ result = net_sendmmsg(
+ fd, send_from, send_len, 0
+ );
+ }
+ if (result < 0) {
+ printk("error %i in multisend\n",
result);
+ result = send_len; /* drop the lot */
+ }
+ if (result > 0) {
+ if (result != send_len) {
+ /* we need to drop a few, exponentially increasing
+ * drop bucket in use
+ */
+ result += allowed_drop;
+ allowed_drop += allowed_drop * 2 + 1;
+ if (result > send_len) {
+ /* do not drop beyond requested size */
+ result = send_len;
+ }
+ } else {
+ /* clear drop bucket size */
+ allowed_drop = 0;
+ }
+ skb_send_vector = queue_info->skb_send_vector;
+ skb_send_vector += queue_info->head;
+ for (skb_index = 0; skb_index < send_len; skb_index++) {
+ uml_net_destroy_skb(* skb_send_vector);
+ (* skb_send_vector) = NULL; /* just in case */
+ skb_send_vector ++ ;
+ }
+ queue_depth = uml_net_advance_head(queue_info, result);
+ }
+ }
+ }
+ spin_unlock(&queue_info->head_lock);
+ }
+ return queue_depth;
+}
+
+int uml_net_flush_mmsg_queue(
+ struct mmsg_queue_info * queue_info, int queue_depth)
+{
+ int old_queue_depth;
+
+ if (queue_depth >= (queue_info->max_depth - 1)) {
+ /* queue full, flush some regardless */
+ queue_depth = send_mmsg_queue(queue_info, queue_depth);
+ }
+ if ((queue_depth > 0) && (spin_trylock(&uml_sigio_lock))) {
+ /* unconditional flush, non zero queue - not in epoll loop so not forwarding */
+ if (!(in_epoll_loop)) {
+ while (queue_depth > 0) {
+ queue_depth = send_mmsg_queue(queue_info, queue_depth);
+ }
+ }
+ spin_unlock(&uml_sigio_lock);
+ }
+
+ /* we are forwarding (most likely) - check if there is a pending queue, if there is a
+ * pending queue, flush it, then put the current queue as pending
+ */
+
+ spin_lock(&net_queue_list);
+ if ((pending_queue) && (pending_queue != queue_info)) {
+ old_queue_depth = send_mmsg_queue(pending_queue, 1);
+ while (old_queue_depth > 0) {
+ old_queue_depth =
+ send_mmsg_queue(pending_queue, old_queue_depth);
+ }
+ }
+ if (queue_depth) {
+ pending_queue = queue_info;
+ } else {
+ pending_queue = NULL;
+ }
+ spin_unlock(&net_queue_list);
+
+ return queue_depth;
+}
+
+/*
+* this is invoked out of the IRQ IO event loop to flush pending
+* packets on "current" interface
+*/
+
+void flush_pending_netio(void) {
+ int result;
+ spin_lock(&net_queue_list);
+ if (pending_queue) {
+ do {
+ result = send_mmsg_queue(pending_queue, 1);
+ } while (result > 0);
+ }
+ pending_queue = NULL;
+ spin_unlock(&net_queue_list);
+}
diff --git a/arch/um/drivers/net_extra_user.c b/arch/um/drivers/net_extra_user.c
new file mode 100644
index 0000000..1037899
--- /dev/null
+++ b/arch/um/drivers/net_extra_user.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Licensed under the GPL
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <asm/unistd.h>
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+
+/*
+* Principles of operation:
+*
+* EVERYTHING here is built to tolerate a failed memory allocation.
+* If either a header buffer or a data buffer (taken from skb->data)
+* is NULL the read will fail and the packet will be dropped. This
+* is the normal behaviour of recvmsg and recvmmsg functions - if a
+* particular iov_base == NULL and its corresponding iov_len is
+* 0 we truncate and/or drop the packet altogether.
+*
+* On the negative side this means that we have to do a few more
+* checks for NULL here and there. On the positive side this means
+* that the whole thing is more robust including under low
+* memory conditions.
+*
+* There is one special case which we need to handle as a result of
+* this - any header verification functions should return "broken
+* header" on hitting a NULL. This will in turn invoke the applicable
+* packet drop logic.
+*
+* Any changes should follow this overall design.
+*
+* Side effect - none of these need to use the shared (and mutexed)
+* drop skb. This is surplus to requirements; the normal recvm(m)sg drop
+* mechanics will drop it.
+*/
+
+int net_readv(int fd, void *iov, int iovcnt)
+{
+ int n;
+
+ CATCH_EINTR(n = readv(fd, iov, iovcnt));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen)
+{
+ int n;
+
+ CATCH_EINTR(n = recvfrom(fd, buf, len, 0, src_addr, addrlen));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_writev(int fd, void *iov, int iovcnt)
+{
+ int n;
+
+ CATCH_EINTR(n = writev(fd, iov, iovcnt));
+
+ if ((n < 0) && ((errno == EAGAIN) || (errno == ENOBUFS)))
+ return 0;
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_sendmessage(int fd, void *msg, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = sendmsg(fd, msg, flags));
+ if (n < 0) {
+ if ((errno == EAGAIN) || (errno == ENOBUFS))
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+int net_recvmessage(int fd, void *msg, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = recvmsg(fd, msg, flags));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout)
+{
+ int n;
+
+ CATCH_EINTR(n = recvmmsg(fd, msgvec, vlen, flags, timeout));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags)
+{
+ int n;
+
+#ifdef HAS_SENDMMSG
+
+ /* has proper sendmmsg */
+
+ CATCH_EINTR(n = sendmmsg(fd, msgvec, vlen, flags));
+#else
+
+ /* no glibc wrapper for sendmmsg - Ubuntu LTS 12.04, Debian 7.x */
+
+ CATCH_EINTR(n = syscall(__NR_sendmmsg, fd, msgvec, vlen, flags));
+#endif
+ if (n < 0) {
+ if ((errno == EAGAIN) || (errno == ENOBUFS))
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+void destroy_skb_vector(void ** vector, int size)
+{
+ int i;
+ void ** tofree = vector;
+
+ for (i=0;i<size;i++) {
+ if ( * vector) {
+ uml_net_destroy_skb(* vector);
+ }
+ vector ++;
+ }
+ kfree(tofree);
+}
+
+void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base)
+{
+ struct mmsghdr * vector = (struct mmsghdr *) mmsgvector;
+ struct iovec * iov;
+ int i;
+ if (vector) {
+ for (i = 0; i < size; i++) {
+ iov = vector->msg_hdr.msg_iov;
+ if (iov) {
+ if (free_iov_base) {
+ kfree(iov->iov_base);
+ }
+ kfree(iov);
+ }
+ vector ++;
+ }
+ kfree(mmsgvector);
+ } else {
+ printk("NULL mmsg vector in destroy, should not occur\n");
+ }
+}
+
+void * build_skbuf_vector(int size, void * dev)
+{
+ int i;
+ void **result, **vector;
+ result = uml_kmalloc(size * sizeof(void *), UM_GFP_KERNEL);
+ vector = result;
+ if (vector) {
+ for (i = 0; i < size; i++) {
+ * vector = uml_net_build_skb(dev);
+ vector++;
+ }
+ }
+ return result;
+}
+
+void rebuild_skbuf_vector(void ** skbvec, int size, void * dev)
+{
+ int i;
+ if (skbvec) {
+ for (i = 0; i < size; i++) {
+ * skbvec = uml_net_build_skb(dev);
+ skbvec++;
+ }
+ }
+}
+
+void repair_mmsg (void *vec, int iovsize, int header_size)
+{
+ struct mmsghdr * msgvec = (struct mmsghdr *) vec;
+ struct iovec * iov;
+ if (! msgvec->msg_hdr.msg_iov) {
+ msgvec->msg_hdr.msg_iov = uml_kmalloc(sizeof(struct iovec) * iovsize, UM_GFP_KERNEL);
+ }
+ iov = msgvec->msg_hdr.msg_iov;
+ if (iov) {
+ if (! iov->iov_base) {
+ iov->iov_base=uml_kmalloc(header_size, UM_GFP_KERNEL);
+ }
+ if (iov->iov_base) {
+ /* put correct header size just in case - we may have had a short frame */
+ iov->iov_len = header_size;
+ } else {
+ printk("failed to allocate a header buffer, will cause
a packet drop later\n");
+ iov->iov_len = 0;
+ }
+ }
+}
+
+void * build_mmsg_vector(int size, int iovsize)
+{
+ int i;
+ struct mmsghdr *msgvec, *result;
+ struct iovec * iov;
+
+ result = uml_kmalloc(sizeof(struct mmsghdr) * size, UM_GFP_KERNEL);
+ msgvec = result;
+ if (msgvec) {
+ memset(msgvec, '\0', sizeof(struct mmsghdr) * size);
+ for ( i = 0; i < size; i++) {
+ iov = uml_kmalloc(sizeof(struct iovec) * iovsize, UM_GFP_KERNEL);
+ msgvec->msg_hdr.msg_iov=iov;
+ if (iov) {
+ memset(iov, '\0', sizeof(struct iovec) * iovsize);
+ msgvec->msg_hdr.msg_iovlen=iovsize;
+ } else {
+ printk("failed to allocate iov\n");
+ msgvec->msg_hdr.msg_iovlen=0; /* silent drop on receive, no xmit */
+ }
+ msgvec++;
+ }
+ }
+ return result;
+}
+
+void add_header_buffers(void * msgvec, int size, int header_size)
+{
+ int i;
+ struct iovec * iov;
+ struct mmsghdr * mmsgvec = (struct mmsghdr *) msgvec;
+ for ( i = 0; i < size; i++) {
+ iov = mmsgvec->msg_hdr.msg_iov;
+ if (iov) {
+ iov->iov_base=uml_kmalloc(header_size, UM_GFP_KERNEL);
+ if (iov->iov_base) {
+ iov->iov_len = header_size;
+ } else {
+ printk("failed to allocate a header buffer,
will cause a packet drop later\n");
+ iov->iov_len = 0;
+ }
+ }
+ mmsgvec++;
+ }
+}
+
+/* NOTE - this is only for offset = 0 or 1, other cases are unhandled!!! */
+
+void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset) {
+ int i;
+ struct iovec * iov;
+ struct mmsghdr * mmsgvec = (struct mmsghdr *) msgvec;
+ for ( i = 0; i < size; i++) {
+ /*
+ This heavily relies on all IOVs being present, if the initial allocation
+ fails it must clean up and switch to "normal" per-packet receive instead
+ Later allocations of skbufs can fail - this will result in short reads
+ and skips
+
+ */
+ iov = mmsgvec->msg_hdr.msg_iov;
+ if (iov) {
+ iov += offset;
+ iov->iov_base=uml_net_skb_data(* skbvec);
+ if (iov->iov_base) {
+ iov->iov_len = skb_size;
+ } else {
+ printk("NULL SKB will drop\n");
+ iov->iov_len = 0;
+ }
+ } else {
+ printk("NULL IOV will drop\n");
+ }
+ mmsgvec++;
+ skbvec++;
+ }
+}
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 64d8426..d9d9d93 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Copyright (C) 2001 Lennert Buytenhek ([email protected]) and
* James Leu ([email protected]).
@@ -29,6 +30,7 @@
static DEFINE_SPINLOCK(opened_lock);
static LIST_HEAD(opened);
+static int rr_counter = 0;
/*
* The drop_skb is used when we can't allocate an skb. The
@@ -42,6 +44,7 @@ static DEFINE_SPINLOCK(drop_lock);
static struct sk_buff *drop_skb;
static int drop_max;
+
static int update_drop_skb(int max)
{
struct sk_buff *new;
@@ -77,24 +80,38 @@ static int uml_net_rx(struct net_device *dev)
struct sk_buff *skb;
/* If we can't allocate memory, try again next round. */
- skb = dev_alloc_skb(lp->max_packet);
- if (skb == NULL) {
- drop_skb->dev = dev;
- /* Read a packet into drop_skb and don't do anything with it. */
- (*lp->read)(lp->fd, drop_skb, lp);
- dev->stats.rx_dropped++;
+ if (lp->options & UML_NET_USE_SKB_READ) {
+ /* we expect a fully formed, well behaved skb from zero copy drivers here */
+ skb = (*lp->skb_read)(lp);
+ if (skb == NULL) {
return 0;
- }
-
- skb->dev = dev;
- skb_put(skb, lp->max_packet);
- skb_reset_mac_header(skb);
- pkt_len = (*lp->read)(lp->fd, skb, lp);
-
- if (pkt_len > 0) {
+ }
+ pkt_len = skb->len;
+ } else {
+ skb = dev_alloc_skb(lp->max_packet + 32);
+ if (skb == NULL) {
+ drop_skb->dev = dev;
+ /* Read a packet into drop_skb and don't do anything with it. */
+ (*lp->read)(lp->fd, drop_skb, lp);
+ dev->stats.rx_dropped++;
+ return 0;
+ }
+
+ skb_reserve(skb,32);
+ skb->dev = dev;
+ skb_put(skb, lp->max_packet);
+ skb_reset_mac_header(skb);
+
+ /* Mark that virtual devices cannot provide required checksum. */
+ skb->ip_summed = CHECKSUM_NONE;
+ pkt_len = (*lp->read)(lp->fd, skb, lp);
+ if (pkt_len > 0) {
skb_trim(skb, pkt_len);
skb->protocol = (*lp->protocol)(skb);
+ }
+ }
+ if (pkt_len > 0) {
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
netif_rx(skb);
@@ -192,6 +209,7 @@ static int uml_net_close(struct net_device *dev)
struct uml_net_private *lp = netdev_priv(dev);
netif_stop_queue(dev);
+ deactivate_fd(lp->fd, dev->irq);
um_free_irq(dev->irq, dev);
if (lp->close != NULL)
@@ -216,7 +234,6 @@ static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
spin_lock_irqsave(&lp->lock, flags);
len = (*lp->write)(lp->fd, skb, lp);
- skb_tx_timestamp(skb);
if (len == skb->len) {
dev->stats.tx_packets++;
@@ -273,14 +290,13 @@ static void uml_net_poll_controller(struct net_device *dev)
static void uml_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, "42", sizeof(info->version));
+ strcpy(info->driver, DRIVER_NAME);
+ strcpy(info->version, "42");
}
static const struct ethtool_ops uml_net_ethtool_ops = {
.get_drvinfo = uml_net_get_drvinfo,
.get_link = ethtool_op_get_link,
- .get_ts_info = ethtool_op_get_ts_info,
};
static void uml_net_user_timer_expire(unsigned long _conn)
@@ -447,6 +463,7 @@ static void eth_configure(int n, void *init, char *mac,
* These just fill in a data structure, so there's no failure
* to be worried about.
*/
+ dev->ethtool_ops = &uml_net_ethtool_ops;
(*transport->kern->init)(dev, init);
*lp = ((struct uml_net_private)
@@ -459,7 +476,9 @@ static void eth_configure(int n, void *init, char *mac,
.open = transport->user->open,
.close = transport->user->close,
.remove = transport->user->remove,
+ .options = transport->kern->options,
.read = transport->kern->read,
+ .skb_read = transport->kern->skb_read,
.write = transport->kern->write,
.add_address = transport->user->add_address,
.delete_address = transport->user->delete_address });
@@ -475,9 +494,9 @@ static void eth_configure(int n, void *init, char *mac,
dev->mtu = transport->user->mtu;
dev->netdev_ops = &uml_netdev_ops;
- dev->ethtool_ops = &uml_net_ethtool_ops;
dev->watchdog_timeo = (HZ >> 1);
- dev->irq = UM_ETH_IRQ;
+ dev->irq = UM_ETH_BASE_IRQ + (rr_counter % UM_ETH_IRQ_RR);
+ rr_counter++;
err = update_drop_skb(lp->max_packet);
if (err)
@@ -829,7 +848,7 @@ static void close_devices(void)
spin_lock(&opened_lock);
list_for_each(ele, &opened) {
lp = list_entry(ele, struct uml_net_private, list);
- um_free_irq(lp->dev->irq, lp->dev);
+ free_irq(lp->dev->irq, lp->dev);
if ((lp->close != NULL) && (lp->fd >= 0))
(*lp->close)(lp->fd, &lp->user);
if (lp->remove != NULL)
diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
index 4a2037f..be9128b 100644
--- a/arch/um/include/asm/irq.h
+++ b/arch/um/include/asm/irq.h
@@ -1,21 +1,27 @@
+
#ifndef __UM_IRQ_H
#define __UM_IRQ_H
+#define UM_ETH_IRQ_RR 32
+
#define TIMER_IRQ 0
#define UMN_IRQ 1
#define CONSOLE_IRQ 2
#define CONSOLE_WRITE_IRQ 3
#define UBD_IRQ 4
-#define UM_ETH_IRQ 5
-#define SSL_IRQ 6
-#define SSL_WRITE_IRQ 7
-#define ACCEPT_IRQ 8
-#define MCONSOLE_IRQ 9
-#define WINCH_IRQ 10
-#define SIGIO_WRITE_IRQ 11
-#define TELNETD_IRQ 12
-#define XTERM_IRQ 13
-#define RANDOM_IRQ 14
+#define UM_ETH_BASE_IRQ 5
+
+#define UM_END_ETH_IRQ UM_ETH_BASE_IRQ + UM_ETH_IRQ_RR
+
+#define SSL_IRQ UM_END_ETH_IRQ + 1
+#define SSL_WRITE_IRQ UM_END_ETH_IRQ + 2
+#define ACCEPT_IRQ UM_END_ETH_IRQ + 3
+#define MCONSOLE_IRQ UM_END_ETH_IRQ + 4
+#define WINCH_IRQ UM_END_ETH_IRQ + 5
+#define SIGIO_WRITE_IRQ UM_END_ETH_IRQ + 6
+#define TELNETD_IRQ UM_END_ETH_IRQ + 7
+#define XTERM_IRQ UM_END_ETH_IRQ + 8
+#define RANDOM_IRQ UM_END_ETH_IRQ + 9
#define LAST_IRQ RANDOM_IRQ
#define NR_IRQS (LAST_IRQ + 1)
diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h
index 012ac87..1e64658 100644
--- a/arch/um/include/shared/net_kern.h
+++ b/arch/um/include/shared/net_kern.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -13,6 +14,8 @@
#include <linux/list.h>
#include <linux/workqueue.h>
+#define UML_NET_USE_SKB_READ 1
+
struct uml_net {
struct list_head list;
struct net_device *dev;
@@ -28,6 +31,7 @@ struct uml_net_private {
struct work_struct work;
int fd;
+ unsigned int options;
unsigned char mac[ETH_ALEN];
int max_packet;
unsigned short (*protocol)(struct sk_buff *);
@@ -36,6 +40,7 @@ struct uml_net_private {
void (*remove)(void *);
int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+ struct sk_buff * (*skb_read)(struct uml_net_private *);
void (*add_address)(unsigned char *, unsigned char *, void *);
void (*delete_address)(unsigned char *, unsigned char *, void *);
@@ -47,6 +52,8 @@ struct net_kern_info {
unsigned short (*protocol)(struct sk_buff *);
int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+ struct sk_buff * (*skb_read)(struct uml_net_private *);
+ unsigned int options;
};
struct transport {
@@ -59,11 +66,35 @@ struct transport {
const int setup_size;
};
+struct mmsg_queue_info {
+ int fd;
+ struct mmsghdr * mmsg_send_vector;
+ void ** skb_send_vector;
+ int queue_depth, head, tail, max_depth;
+ spinlock_t head_lock;
+ spinlock_t tail_lock;
+};
+
extern struct net_device *ether_init(int);
extern unsigned short ether_protocol(struct sk_buff *);
extern int tap_setup_common(char *str, char *type, char **dev_name,
char **mac_out, char **gate_addr);
extern void register_transport(struct transport *new);
extern unsigned short eth_protocol(struct sk_buff *skb);
+extern struct sk_buff *my_build_skb(void * head, void *data, unsigned int frag_size);
+
+extern void flush_pending_netio(void);
+
+extern int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance);
+extern int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance);
+extern int uml_net_flush_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth);
+
+extern int uml_net_enqueue (
+ struct mmsg_queue_info * queue_info,
+ struct sk_buff * skb,
+ struct uml_net_private *lp,
+ void (*form_header)(void * header, struct sk_buff * skb, struct uml_net_private * lp),
+ void * remote_addr,
+ int remote_addr_size);
#endif
diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h
index 3dabbe1..4b46f37 100644
--- a/arch/um/include/shared/net_user.h
+++ b/arch/um/include/shared/net_user.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -38,10 +39,15 @@ extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr);
extern void read_output(int fd, char *output_out, int len);
extern int net_read(int fd, void *buf, int len);
+extern int net_readv(int fd, void *iov, int iovcnt);
extern int net_recvfrom(int fd, void *buf, int len);
+extern int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen);
extern int net_write(int fd, void *buf, int len);
+extern int net_writev(int fd, void *iov, int iovcnt);
extern int net_send(int fd, void *buf, int len);
extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
+extern int net_sendmessage(int fd, void *msg, int flags);
+extern int net_recvmessage(int fd, void *msg, int flags);
extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
@@ -50,4 +56,22 @@ extern char *split_if_spec(char *str, ...);
extern int dev_netmask(void *d, void *m);
+
+extern void uml_net_destroy_skb(void * skb);
+extern void * uml_net_build_skb (void * dev);
+extern void * uml_net_skb_data (void * skb);
+
+extern void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset);
+extern void add_header_buffers(void * msgvec, int size, int header_size);
+extern void * build_mmsg_vector(int size, int iovsize);
+extern void rebuild_skbuf_vector(void ** skbvec, int size, void * dev);
+extern void * build_skbuf_vector(int size, void * dev);
+extern int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout);
+extern int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags);
+extern void repair_mmsg (void *msgvec, int iovsize, int header_size);
+extern void destroy_skb_vector(void ** vector, int size);
+extern void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base);
+
#endif
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index eba7f62..12cba21 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -17,6 +17,7 @@
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
+#include <net_kern.h>
/*
* We are on the "kernel side" so we cannot pick up the sys/epoll.h
@@ -136,6 +137,10 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
spin_unlock_irqrestore(&uml_sigio_lock, flags);
}
+#ifdef CONFIG_UML_NET_VECTOR_TX
+ flush_pending_netio();
+#endif
+
/* This needs a better way - it slows down the event loop */
free_irqs();
--
1.7.10.4