This is Anthony's net-tap-zero-copy.patch, which eliminates
a copy on the host->guest data path with virtio_net.
---
qemu/hw/virtio-net.c | 76 ++++++++++++++++++++++++++++++++++++-------------
qemu/net.h | 3 ++
qemu/vl.c | 50 +++++++++++++++++++++++++++++++++
3 files changed, 109 insertions(+), 20 deletions(-)
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index a681a7e..5e71afe 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -70,6 +70,8 @@ typedef struct VirtIONet
VLANClientState *vc;
QEMUTimer *tx_timer;
int tx_timer_active;
+ int last_elem_valid;
+ VirtQueueElement last_elem;
} VirtIONet;
/* TODO
@@ -153,47 +155,80 @@ static int virtio_net_can_receive(void *opaque)
return 1;
}
-static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+static void virtio_net_receive_zc(void *opaque, IOZeroCopyHandler *zc, void *data)
{
VirtIONet *n = opaque;
- VirtQueueElement elem;
+ VirtQueueElement *elem = &n->last_elem;
struct virtio_net_hdr *hdr;
- int offset, i;
- int total;
+ ssize_t err;
+ int idx;
- if (virtqueue_pop(n->rx_vq, &elem) == 0)
+ if (!n->last_elem_valid && virtqueue_pop(n->rx_vq, elem) == 0)
return;
- if (elem.in_num < 1 || elem.in_sg[0].iov_len != sizeof(*hdr)) {
+ if (elem->in_num < 1 || elem->in_sg[0].iov_len != sizeof(*hdr)) {
fprintf(stderr, "virtio-net header not in first element\n");
exit(1);
}
- hdr = (void *)elem.in_sg[0].iov_base;
+ n->last_elem_valid = 1;
+
+ hdr = (void *)elem->in_sg[0].iov_base;
hdr->flags = 0;
hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
- offset = 0;
- total = sizeof(*hdr);
+ idx = tap_has_offload(n->vc->vlan->first_client) ? 0 : 1;
+
+ do {
+ err = zc(data, &elem->in_sg[idx], elem->in_num - idx);
+ } while (err == -1 && errno == EINTR);
+
+ if (err == -1 && errno == EAGAIN)
+ return;
- if (tap_has_offload(n->vc->vlan->first_client)) {
- memcpy(hdr, buf, sizeof(*hdr));
- offset += total;
+ if (err < 0) {
+ fprintf(stderr, "virtio_net: error during IO\n");
+ return;
}
+ /* signal other side */
+ n->last_elem_valid = 0;
+ virtqueue_push(n->rx_vq, elem, sizeof(*hdr) + err);
+ virtio_notify(&n->vdev, n->rx_vq);
+}
+
+struct compat_data
+{
+ const uint8_t *buf;
+ int size;
+};
+
+static ssize_t compat_copy(void *opaque, struct iovec *iov, int iovcnt)
+{
+ struct compat_data *compat = opaque;
+ int offset, i;
+
/* copy in packet. ugh */
- i = 1;
- while (offset < size && i < elem.in_num) {
- int len = MIN(elem.in_sg[i].iov_len, size - offset);
- memcpy(elem.in_sg[i].iov_base, buf + offset, len);
+ offset = 0;
+ i = 0;
+ while (offset < compat->size && i < iovcnt) {
+ int len = MIN(iov[i].iov_len, compat->size - offset);
+ memcpy(iov[i].iov_base, compat->buf + offset, len);
offset += len;
- total += len;
i++;
}
- /* signal other side */
- virtqueue_push(n->rx_vq, &elem, total);
- virtio_notify(&n->vdev, n->rx_vq);
+ return offset;
+}
+
+static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+{
+ struct compat_data compat;
+
+ compat.buf = buf;
+ compat.size = size;
+
+ virtio_net_receive_zc(opaque, compat_copy, &compat);
}
/* TX */
@@ -310,6 +345,7 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
memcpy(n->mac, nd->macaddr, 6);
n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
virtio_net_can_receive, n);
+ n->vc->fd_read_zc = virtio_net_receive_zc;
n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
n->tx_timer_active = 0;
diff --git a/qemu/net.h b/qemu/net.h
index 6cfd8ce..aca50e9 100644
--- a/qemu/net.h
+++ b/qemu/net.h
@@ -6,6 +6,8 @@
/* VLANs support */
typedef ssize_t (IOReadvHandler)(void *, const struct iovec *, int);
+typedef ssize_t (IOZeroCopyHandler)(void *, struct iovec *, int);
+typedef void (IOReadZCHandler)(void *, IOZeroCopyHandler *, void *);
typedef struct VLANClientState VLANClientState;
@@ -14,6 +16,7 @@ typedef void (SetOffload)(VLANClientState *, int, int, int, int);
struct VLANClientState {
IOReadHandler *fd_read;
IOReadvHandler *fd_readv;
+ IOReadZCHandler *fd_read_zc;
/* Packets may still be sent if this returns zero. It's used to
rate-limit the slirp code. */
IOCanRWHandler *fd_can_read;
diff --git a/qemu/vl.c b/qemu/vl.c
index de92848..bc5b151 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -4204,6 +4204,7 @@ typedef struct TAPState {
char buf[TAP_BUFSIZE];
int size;
int offload;
+ int received_eagain;
} TAPState;
static void tap_receive(void *opaque, const uint8_t *buf, int size)
@@ -4232,6 +4233,48 @@ static ssize_t tap_readv(void *opaque, const struct iovec *iov,
return len;
}
+static VLANClientState *tap_can_zero_copy(TAPState *s)
+{
+ VLANClientState *vc, *vc1 = NULL;
+ int vc_count = 0;
+
+ for (vc = s->vc->vlan->first_client; vc; vc = vc->next) {
+ if (vc == s->vc)
+ continue;
+
+ if (!vc->fd_read_zc || vc_count)
+ return NULL;
+
+ vc_count++;
+ vc1 = vc;
+ }
+
+ return vc1;
+}
+
+static ssize_t tap_sendv(void *opaque, struct iovec *iov, int iovcnt)
+{
+ TAPState *s = opaque;
+ ssize_t ret;
+
+ kvm_sleep_begin();
+ ret = readv(s->fd, iov, iovcnt);
+ kvm_sleep_end();
+ if (ret == -1 && errno == EAGAIN)
+ s->received_eagain = 1;
+
+ return ret;
+}
+
+static void tap_send_zero_copy(TAPState *s, VLANClientState *vc)
+{
+ s->received_eagain = 0;
+ while (s->received_eagain == 0 &&
+ (!vc->fd_can_read || vc->fd_can_read(vc->opaque))) {
+ vc->fd_read_zc(vc->opaque, tap_sendv, s);
+ }
+}
+
static int tap_can_send(void *opaque)
{
TAPState *s = opaque;
@@ -4261,6 +4304,13 @@ static int tap_can_send(void *opaque)
static void tap_send(void *opaque)
{
TAPState *s = opaque;
+ VLANClientState *zc;
+
+ zc = tap_can_zero_copy(s);
+ if (zc) {
+ tap_send_zero_copy(s, zc);
+ return;
+ }
/* First try to send any buffered packet */
if (s->size > 0) {
--
1.5.4.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html