I've taken a step back from my previous growing diff [1] to break it
down into something more reviewable, testable, and incremental.
This diff introduces better defined packet size limitations based on the
underlying tap(4) limits (TUNMRU=16384) and fully implements support for
device rx-side descriptor chains.
The longer-term goal, through this and upcoming diffs, is to:
1. enforce an MTU (this patch starts doing that, with one limitation) on
both the rx and tx sides
2. allow guests to receive packets up to said MTU, e.g. 9kb jumbo
frames, which can improve guest-to-guest private networking
3. communicate to drivers the hardware MTU so they can make better
decisions on driver-side buffer sizing
4. enable mergeable RX buffers to better support large packets on the
driver side (this allows vio(4) to take advantage of the above)
5. update vio(4) to support receiving the MTU from the device and
remove reliance on the arbitrary 16000 MTU enforced when mergeable
rx buffers are negotiated.
This diff addresses goal (1) above and sets us up for (2).
The current vmd virtio network device design for the RX side mimics the
incomplete implementation of support for virtio descriptor chains in
vio(4). An OpenBSD guest vm will only allocate a 2-descriptor chain with
the head descriptor being 10 bytes (for the virtio net header) and the
tail being a 2038 byte buffer (MCLBYTES-10). (There's an XXX todo
comment in the vio(4) code pointing out the implementation was never
completed.)
In other hypervisor environments like Xen or KVM, their virtio devices
present support for mergeable receive-side buffers. vio(4) supports this
feature and when negotiated defaults to them, overcoming the 2038 byte
limit.
Linux's virtio_net driver fully implements the virtio descriptor chain
spec and (if the hardware MTU feature is presented) will decide if
there's a chance the driver will be asked to change its MTU to something
larger. It allocates long enough chains (usually each buffer backed by a
4kb page) to support the driver being asked to switch to things like 9kb
jumbo frames.
Couple notes to reviewers:
- This diff won't fully allow the device to receive packets up to the
max size (TUNMRU + Ethernet header), as it still relies on a
stack-allocated 4kb buffer: the read(2) of the tap(4) device will
truncate packets larger than 4kb. A driver should, however, be able
to transmit a packet up to the new TUNMRU-based limit. I want the
buffer change as a standalone diff, as the plan is to introduce
heap-allocated buffers and there's a life cycle around calloc/free
that needs to be incorporated into the device life cycle.
- Some parts of the diff (hdr_sz, chain_desc_idx) exist to build off
of for the subsequent diffs. For an example of what that will look
like, see [1]
OK?
[1] https://marc.info/?l=openbsd-tech&m=162231914610109&w=2
-dv
Index: virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
retrieving revision 1.87
diff -u -p -r1.87 virtio.c
--- virtio.c 18 May 2021 11:06:43 -0000 1.87
+++ virtio.c 3 Jun 2021 16:43:08 -0000
@@ -1110,6 +1110,13 @@ vionet_update_qs(struct vionet_dev *dev)
}
/*
+ * vionet_enq_rx
+ *
+ * Take a given packet from the host-side tap and copy it into the guest's
+ * buffers utilizing the rx virtio ring. If the packet length is invalid
+ * (too small or too large) or if there are not enough buffers available,
+ * the packet is dropped.
+ *
* Must be called with dev->mutex acquired.
*/
int
@@ -1117,24 +1124,25 @@ vionet_enq_rx(struct vionet_dev *dev, ch
{
uint64_t q_gpa;
uint32_t vr_sz;
- uint16_t idx, pkt_desc_idx, hdr_desc_idx;
- ptrdiff_t off;
- int ret;
- char *vr;
- size_t rem;
+ uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
+ int ret = 0;
+ char *vr = NULL;
+ size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
+ size_t chain_len = 0;
struct vring_desc *desc, *pkt_desc, *hdr_desc;
struct vring_avail *avail;
struct vring_used *used;
struct vring_used_elem *ue;
struct virtio_net_hdr hdr;
+ size_t hdr_sz;
- ret = 0;
-
- if (sz < 1 || sz > IP_MAXPACKET + ETHER_HDR_LEN) {
+ if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
log_warn("%s: invalid packet size", __func__);
return (0);
}
+ hdr_sz = sizeof(hdr);
+
if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
return ret;
@@ -1159,90 +1167,91 @@ vionet_enq_rx(struct vionet_dev *dev, ch
used = (struct vring_used *)(vr + dev->vq[RXQ].vq_usedoffset);
idx = dev->vq[RXQ].last_avail & VIONET_QUEUE_MASK;
-
if ((dev->vq[RXQ].notified_avail & VIONET_QUEUE_MASK) == idx) {
- log_debug("vionet queue notify - no space, dropping packet");
+ log_debug("%s: insufficient available buffer capacity, "
+ "dropping packet.", __func__);
goto out;
}
hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
hdr_desc = &desc[hdr_desc_idx];
- pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
- pkt_desc = &desc[pkt_desc_idx];
+ dxx = hdr_desc_idx;
+ chain_hdr_idx = dxx;
+ chain_len = 0;
- /* Set up the virtio header (written first, before the packet data) */
- memset(&hdr, 0, sizeof(struct virtio_net_hdr));
- hdr.hdr_len = sizeof(struct virtio_net_hdr);
-
- /* Check size of header descriptor */
- if (hdr_desc->len < sizeof(struct virtio_net_hdr)) {
- log_warnx("%s: invalid header descriptor (too small)",
- __func__);
- goto out;
- }
-
- /* Write out virtio header */
- if (write_mem(hdr_desc->addr, &hdr, sizeof(struct virtio_net_hdr))) {
- log_warnx("vionet: rx enq header write_mem error @ "
- "0x%llx", hdr_desc->addr);
- goto out;
- }
+ /* Process the descriptor and walk any potential chain. */
+ do {
+ off = 0;
+ pkt_desc = &desc[dxx];
+ if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
+ log_warnx("%s: invalid descriptor, not writable",
+ __func__);
+ goto out;
+ }
- /*
- * Compute remaining space in the first (header) descriptor, and
- * copy the packet data after if space is available. Otherwise,
- * copy to the pkt_desc descriptor.
- */
- rem = hdr_desc->len - sizeof(struct virtio_net_hdr);
+ /* How much data do we get to write? */
+ if (sz - bufsz > pkt_desc->len)
+ chunk_size = pkt_desc->len;
+ else
+ chunk_size = sz - bufsz;
- if (rem >= sz) {
- if (write_mem(hdr_desc->addr + sizeof(struct virtio_net_hdr),
- pkt, sz)) {
- log_warnx("vionet: rx enq packet write_mem error @ "
- "0x%llx", pkt_desc->addr);
- goto out;
+ if (chain_len == 0) {
+ off = hdr_sz;
+ if (chunk_size == pkt_desc->len)
+ chunk_size -= off;
}
- } else {
- /* Fallback to pkt_desc descriptor */
- if (pkt_desc->len >= sz) {
- /* Must be not readable */
- if ((pkt_desc->flags & VRING_DESC_F_WRITE) == 0) {
- log_warnx("unexpected readable rx desc %d",
- pkt_desc_idx);
- goto out;
- }
- /* Write packet to descriptor ring */
- if (write_mem(pkt_desc->addr, pkt, sz)) {
- log_warnx("vionet: rx enq packet write_mem "
- "error @ 0x%llx", pkt_desc->addr);
- goto out;
- }
- } else {
- log_warnx("%s: descriptor too small for packet data",
- __func__);
+ /* Write a chunk of data if we need to */
+ if (chunk_size && write_mem(pkt_desc->addr + off,
+ pkt + pkt_offset, chunk_size)) {
+ log_warnx("%s: failed to write to buffer 0x%llx",
+ __func__, pkt_desc->addr);
goto out;
}
+
+ chain_len += chunk_size + off;
+ bufsz += chunk_size;
+ pkt_offset += chunk_size;
+
+ dxx = pkt_desc->next & VIONET_QUEUE_MASK;
+ } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
+
+ /* Update the list of used buffers. */
+ ue = &used->ring[(used->idx) & VIONET_QUEUE_MASK];
+ ue->id = chain_hdr_idx;
+ ue->len = chain_len;
+ off = ((char *)ue - vr);
+ if (write_mem(q_gpa + off, ue, sizeof(*ue))) {
+ log_warnx("%s: error updating rx used ring", __func__);
+ goto out;
}
- ret = 1;
- dev->cfg.isr_status = 1;
- ue = &used->ring[used->idx & VIONET_QUEUE_MASK];
- ue->id = hdr_desc_idx;
- ue->len = sz + sizeof(struct virtio_net_hdr);
+ /* Move our marker in the ring...*/
used->idx++;
- dev->vq[RXQ].last_avail++;
- *spc = dev->vq[RXQ].notified_avail - dev->vq[RXQ].last_avail;
+ dev->vq[RXQ].last_avail = (dev->vq[RXQ].last_avail + 1) &
+ VIONET_QUEUE_MASK;
- off = (char *)ue - vr;
- if (write_mem(q_gpa + off, ue, sizeof *ue))
- log_warnx("vionet: error writing vio ring");
- else {
- off = (char *)&used->idx - vr;
- if (write_mem(q_gpa + off, &used->idx, sizeof used->idx))
- log_warnx("vionet: error writing vio ring");
+ /* Prepend the virtio net header in the first buffer. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.hdr_len = hdr_sz;
+ if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
+ log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
+ hdr_desc->addr);
+ goto out;
}
+
+ /* Update the index field in the used ring. This must be done last. */
+ dev->cfg.isr_status = 1;
+ off = (char *)&used->idx - vr;
+ *spc = (dev->vq[RXQ].notified_avail - dev->vq[RXQ].last_avail) &
+ VIONET_QUEUE_MASK;
+
+ if (write_mem(q_gpa + off, &used->idx, sizeof(used->idx)))
+ log_warnx("vionet: error writing vio ring");
+
+ ret = 1;
+
out:
free(vr);
return (ret);
@@ -1505,11 +1514,11 @@ vionet_notify_tx(struct vionet_dev *dev)
/* Remove virtio header descriptor len */
pktsz -= hdr_desc->len;
- /* Only allow buffer len < max IP packet + Ethernet header */
- if (pktsz > IP_MAXPACKET + ETHER_HDR_LEN) {
+ /* Drop packets violating device MTU-based limits */
+ if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
log_warnx("%s: invalid packet size %lu", __func__,
pktsz);
- goto out;
+ goto drop_packet;
}
pkt = malloc(pktsz);
if (pkt == NULL) {
@@ -1590,6 +1599,7 @@ vionet_notify_tx(struct vionet_dev *dev)
goto out;
}
+ drop_packet:
ret = 1;
dev->cfg.isr_status = 1;
used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
@@ -1931,6 +1941,8 @@ virtio_init(struct vmd_vm *vm, int child
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
vionet[i].vq[RXQ].last_avail = 0;
+ vionet[i].vq[RXQ].notified_avail = 0;
+
vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE;
vionet[i].vq[TXQ].vq_availoffset =
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
Index: virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
retrieving revision 1.38
diff -u -p -r1.38 virtio.h
--- virtio.h 21 Apr 2021 18:27:36 -0000 1.38
+++ virtio.h 3 Jun 2021 16:43:08 -0000
@@ -17,6 +17,7 @@
*/
#include <dev/pv/virtioreg.h>
+#include <net/if_tun.h>
#define VIRTQUEUE_ALIGN(n) (((n)+(VIRTIO_PAGE_SIZE-1))& \
~(VIRTIO_PAGE_SIZE-1))
@@ -35,6 +36,11 @@
#define VIONET_QUEUE_SIZE 256
#define VIONET_QUEUE_MASK (VIONET_QUEUE_SIZE - 1)
+
+/* Virtio network device is backed by tap(4), so inherit limits */
+#define VIONET_HARD_MTU TUNMRU
+#define VIONET_MIN_TXLEN ETHER_HDR_LEN
+#define VIONET_MAX_TXLEN VIONET_HARD_MTU + ETHER_HDR_LEN
/* VMM Control Interface shutdown timeout (in seconds) */
#define VMMCI_TIMEOUT 3