Hello everyone,

I saw that there is an open task to add netmap support to bhyve in
the latest FreeBSD status report [1]. So I implemented netmap
support for bhyve; the patch follows. I hope this is helpful!

diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index 5ac9ecd..d7c8e84 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -35,6 +35,10 @@ __FBSDID("$FreeBSD$");
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <net/ethernet.h>
+#ifndef NETMAP_WITH_LIBS
+#define NETMAP_WITH_LIBS
+#endif
+#include <net/netmap_user.h>
 
 #include <errno.h>
 #include <fcntl.h>
@@ -132,6 +136,8 @@ struct pci_vtnet_softc {
        struct mevent   *vsc_mevp;
 
        int             vsc_tapfd;
+       struct nm_desc  *vsc_nmd;
+
        int             vsc_rx_ready;
        volatile int    resetting;      /* set and checked outside lock */
 
@@ -148,6 +154,10 @@ struct pci_vtnet_softc {
        pthread_mutex_t tx_mtx;
        pthread_cond_t  tx_cond;
        int             tx_in_progress;
+
+       void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
+       void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
+                            int iovcnt, int len);
 };
 
 static void pci_vtnet_reset(void *);
@@ -369,14 +379,208 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
        vq_endchains(vq, 1);
 }
 
+static int
+pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+{
+       int r, i;
+       int len = 0;
+
+       for (r = nmd->cur_tx_ring; ; ) {
+               struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
+               uint32_t cur, idx;
+               char *buf;
+
+               if (nm_ring_empty(ring)) {
+                       r++;
+                       if (r > nmd->last_tx_ring)
+                               r = nmd->first_tx_ring;
+                       if (r == nmd->cur_tx_ring)
+                               break;
+                       continue;
+               }
+               cur = ring->cur;
+               idx = ring->slot[cur].buf_idx;
+               buf = NETMAP_BUF(ring, idx);
+
+               for (i = 0; i < iovcnt; i++) {
+                       memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
+                       len += iov[i].iov_len;
+               }
+               ring->slot[cur].len = len;
+               ring->head = ring->cur = nm_ring_next(ring, cur);
+               nmd->cur_tx_ring = r;
+               ioctl(nmd->fd, NIOCTXSYNC, NULL);
+               break;
+       }
+
+       return (len);
+}
+
+static inline int
+pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+{
+       int len = 0;
+       int i = 0;
+       int r;
+
+       for (r = nmd->cur_rx_ring; ; ) {
+               struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
+               uint32_t cur, idx;
+               char *buf;
+               size_t left;
+
+               if (nm_ring_empty(ring)) {
+                       r++;
+                       if (r > nmd->last_rx_ring)
+                               r = nmd->first_rx_ring;
+                       if (r == nmd->cur_rx_ring)
+                               break;
+                       continue;
+               }
+               cur = ring->cur;
+               idx = ring->slot[cur].buf_idx;
+               buf = NETMAP_BUF(ring, idx);
+               left = ring->slot[cur].len;
+
+               for (i = 0; i < iovcnt && left > 0; i++) {
+                       if (iov[i].iov_len > left)
+                               iov[i].iov_len = left;
+                       memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
+                       len += iov[i].iov_len;
+                       left -= iov[i].iov_len;
+               }
+               ring->head = ring->cur = nm_ring_next(ring, cur);
+               nmd->cur_rx_ring = r;
+               ioctl(nmd->fd, NIOCRXSYNC, NULL);
+               break;
+       }
+       for (; i < iovcnt; i++)
+               iov[i].iov_len = 0;
+
+       return (len);
+}
+
+/*
+ * Called to send a buffer chain out to the vale port
+ */
 static void
-pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+                   int len)
+{
+       static char pad[60]; /* all zero bytes */
+
+       if (sc->vsc_nmd == NULL)
+               return;
+
+       /*
+        * If the length is < 60, pad out to that and add the
+        * extra zero'd segment to the iov. It is guaranteed that
+        * there is always an extra iov available by the caller.
+        */
+       if (len < 60) {
+               iov[iovcnt].iov_base = pad;
+               iov[iovcnt].iov_len = 60 - len;
+               iovcnt++;
+       }
+       (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
+}
+
+static void
+pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
+{
+       struct iovec iov[VTNET_MAXSEGS], *riov;
+       struct vqueue_info *vq;
+       void *vrx;
+       int len, n;
+
+       /*
+        * Should never be called without a valid netmap descriptor
+        */
+       assert(sc->vsc_nmd != NULL);
+
+       /*
+        * But, will be called when the rx ring hasn't yet
+        * been set up or the guest is resetting the device.
+        */
+       if (!sc->vsc_rx_ready || sc->resetting) {
+               /*
+                * Drop the packet and try later.
+                */
+               (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+               return;
+       }
+
+       /*
+        * Check for available rx buffers
+        */
+       vq = &sc->vsc_queues[VTNET_RXQ];
+       vq_startchains(vq);
+       if (!vq_has_descs(vq)) {
+               /*
+                * Drop the packet and try later.  Interrupt on
+                * empty, if that's negotiated.
+                */
+               (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+               vq_endchains(vq, 1);
+               return;
+       }
+
+       do {
+               /*
+                * Get descriptor chain.
+                */
+               n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL);
+               assert(n >= 1 && n <= VTNET_MAXSEGS);
+
+               /*
+                * Get a pointer to the rx header, and use the
+                * data immediately following it for the packet buffer.
+                */
+               vrx = iov[0].iov_base;
+               riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+               len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
+
+               if (len == 0) {
+                       /*
+                        * No more packets, but still some avail ring
+                        * entries.  Interrupt if needed/appropriate.
+                        */
+                       vq_endchains(vq, 0);
+                       return;
+               }
+
+               /*
+                * The only valid field in the rx packet header is the
+                * number of buffers if merged rx bufs were negotiated.
+                */
+               memset(vrx, 0, sc->rx_vhdrlen);
+
+               if (sc->rx_merge) {
+                       struct virtio_net_rxhdr *vrxh;
+
+                       vrxh = vrx;
+                       vrxh->vrh_bufs = 1;
+               }
+
+               /*
+                * Release this chain and handle more chains.
+                */
+               vq_relchain(vq, len + sc->rx_vhdrlen);
+       } while (vq_has_descs(vq));
+
+       /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+       vq_endchains(vq, 1);
+}
+
+static void
+pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
 {
        struct pci_vtnet_softc *sc = param;
 
        pthread_mutex_lock(&sc->rx_mtx);
        sc->rx_in_progress = 1;
-       pci_vtnet_tap_rx(sc);
+       sc->pci_vtnet_rx(sc);
        sc->rx_in_progress = 0;
        pthread_mutex_unlock(&sc->rx_mtx);
 
@@ -417,7 +621,7 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
        }
 
        DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
-       pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
+       sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
 
        /* chain is processed, release it and set tlen */
        vq_relchain(vq, tlen);
@@ -530,6 +734,67 @@ pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
         return (0);
 }
 
+static void
+pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
+{
+       char tbuf[80];
+
+       strcpy(tbuf, "/dev/");
+       strlcat(tbuf, devname, sizeof(tbuf));
+
+       sc->pci_vtnet_rx = pci_vtnet_tap_rx;
+       sc->pci_vtnet_tx = pci_vtnet_tap_tx;
+
+       sc->vsc_tapfd = open(tbuf, O_RDWR);
+       if (sc->vsc_tapfd == -1) {
+               WPRINTF(("open of tap device %s failed\n", tbuf));
+               return;
+       }
+
+       /*
+        * Set non-blocking and register for read
+        * notifications with the event loop
+        */
+       int opt = 1;
+       if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+               WPRINTF(("tap device O_NONBLOCK failed\n"));
+               close(sc->vsc_tapfd);
+               sc->vsc_tapfd = -1;
+       }
+
+       sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+                                 EVF_READ,
+                                 pci_vtnet_rx_callback,
+                                 sc);
+       if (sc->vsc_mevp == NULL) {
+               WPRINTF(("Could not register event\n"));
+               close(sc->vsc_tapfd);
+               sc->vsc_tapfd = -1;
+       }
+}
+
+static void
+pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
+{
+       sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
+       sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
+
+       sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
+       if (sc->vsc_nmd == NULL) {
+               WPRINTF(("open of netmap device %s failed\n", ifname));
+               return;
+       }
+
+       sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
+                                 EVF_READ,
+                                 pci_vtnet_rx_callback,
+                                 sc);
+       if (sc->vsc_mevp == NULL) {
+               WPRINTF(("Could not register event\n"));
+               nm_close(sc->vsc_nmd);
+               sc->vsc_nmd = NULL;
+       }
+}
 
 static int
 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
@@ -565,8 +830,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
         */
        mac_provided = 0;
        sc->vsc_tapfd = -1;
+       sc->vsc_nmd = NULL;
        if (opts != NULL) {
-               char tbuf[80];
                int err;
 
                devname = vtopts = strdup(opts);
@@ -581,36 +846,12 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
                        mac_provided = 1;
                }
 
-               strcpy(tbuf, "/dev/");
-               strlcat(tbuf, devname, sizeof(tbuf));
+               if (strncmp(devname, "vale", 4) == 0)
+                       pci_vtnet_netmap_setup(sc, devname);
+               if (strncmp(devname, "tap", 3) == 0)
+                       pci_vtnet_tap_setup(sc, devname);
 
                free(devname);
-
-               sc->vsc_tapfd = open(tbuf, O_RDWR);
-               if (sc->vsc_tapfd == -1) {
-                       WPRINTF(("open of tap device %s failed\n", tbuf));
-               } else {
-                       /*
-                        * Set non-blocking and register for read
-                        * notifications with the event loop
-                        */
-                       int opt = 1;
-                       if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
-                               WPRINTF(("tap device O_NONBLOCK failed\n"));
-                               close(sc->vsc_tapfd);
-                               sc->vsc_tapfd = -1;
-                       }
-
-                       sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
-                                                 EVF_READ,
-                                                 pci_vtnet_tap_callback,
-                                                 sc);
-                       if (sc->vsc_mevp == NULL) {
-                               WPRINTF(("Could not register event\n"));
-                               close(sc->vsc_tapfd);
-                               sc->vsc_tapfd = -1;
-                       }
-               }               
        }
 
        /*
-- 
1.9.3 (Apple Git-50)

----------------------------------------------------------------

Following are the simple instructions to test the patch by creating
two virtual machines:

#1. Launch the first vm with a NIC configured as a vale port:

$ bhyve -A -H -P \
        -s 0:0,hostbridge \
        -s 1:0,lpc \
        -s 2,virtio-blk,./disk-vm1 \
        -s 3:0,virtio-net,vale0:vm1 \
        -l com1,stdio \
        -m 1G -c 2 vm1

#2. Launch the second vm with a NIC configured as a vale port:

$ bhyve -A -H -P \
        -s 0:0,hostbridge \
        -s 1:0,lpc \
        -s 2,virtio-blk,./disk-vm2 \
        -s 3:0,virtio-net,vale0:vm2 \
        -l com1,stdio \
        -m 1G -c 2 vm2

Both of vale0:vm1 and vale0:vm2 are the ports of vale0 switch.

#3. Setup the network in each vm:

$ ifconfig vtnet0 192.168.1.1 up     # in the 1st vm
$ ifconfig vtnet0 192.168.1.2 up     # in the 2nd vm

#4. Ping test

# ping -c 5 192.168.1.1              # in the 2nd vm
PING 192.168.1.1 (192.168.1.1): 56 data bytes
64 bytes from 192.168.1.1: icmp_seq=0 ttl=64 time=0.513 ms
64 bytes from 192.168.1.1: icmp_seq=1 ttl=64 time=0.356 ms
64 bytes from 192.168.1.1: icmp_seq=2 ttl=64 time=0.523 ms
64 bytes from 192.168.1.1: icmp_seq=3 ttl=64 time=0.362 ms
64 bytes from 192.168.1.1: icmp_seq=4 ttl=64 time=0.532 ms

--- 192.168.1.1 ping statistics ---
5 packets transmitted, 5 packets received, 0.0% packet loss
round-trip min/avg/max/stddev = 0.356/0.457/0.532/0.080 ms

[1] https://www.freebsd.org/news/status/report-2014-10-2014-12.html#bhyve

Tiwei Bie

_______________________________________________
freebsd-virtualization@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-virtualization
To unsubscribe, send any mail to 
"freebsd-virtualization-unsubscribe@freebsd.org"

Reply via email to