On Tue, Oct 29, 2013 at 3:12 AM, Vincenzo Maffione <v.maffi...@gmail.com> wrote: > This patch adds support for a network backend based on netmap. > netmap is a framework for high-speed packet I/O. You can use it > to build extremely fast traffic generators, monitors, software > switches or network middleboxes. Its companion software switch > VALE lets you interconnect virtual machines. > netmap and VALE are implemented as a non-intrusive kernel module, > support NICs from multiple vendors, are part of standard FreeBSD > distributions and are available in source format for Linux too.
I don't think it's a good idea to support this on Linux hosts. This is an out-of-tree module that most likely will never go upstream. I don't want to live through another kqemu with this if it eventually starts to bit-rot. Regards, Anthony Liguori > > To compile QEMU with netmap support, use the following configure > options: > ./configure [...] --enable-netmap --extra-cflags=-I/path/to/netmap/sys > where "/path/to/netmap" contains the netmap source code, available at > http://info.iet.unipi.it/~luigi/netmap/ > > The same webpage contains more information about the netmap project > (together with papers and presentations). > > Signed-off-by: Vincenzo Maffione <v.maffi...@gmail.com> > --- > This patch follows a previous thread (whose subject was "netmap backend"), > in which a previous version was already reviewed. All the review comments > have been taken into consideration or applied. > > This patch only contains the simplest netmap backend for QEMU. > In particular, this backend implementation is still not > able to make use of batching on the TX side (frontend -> backend), > which is where most of the TX performance gain comes from. > As you can see from the code, there is an ioctl(NIOCTXSYNC) for each > packet, instead of an ioctl(NIOCTXSYNC) for a batch of packets. > In order to make TX batching possible, we would need to make some > modifications to the generic net/net.c code, adding to the > frontend/backend datapath interface a way to send a batch (this can > be done using a QEMU_NET_PACKET_FLAG_MORE, without changing the > existing interface too much). > We will propose these features in future patches. 
> > configure | 31 ++++ > hmp-commands.hx | 4 +- > net/Makefile.objs | 1 + > net/clients.h | 5 + > net/net.c | 6 + > net/netmap.c | 423 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > qapi-schema.json | 19 ++- > qemu-options.hx | 8 ++ > 8 files changed, 494 insertions(+), 3 deletions(-) > create mode 100644 net/netmap.c > > diff --git a/configure b/configure > index 57ee62a..4046fe5 100755 > --- a/configure > +++ b/configure > @@ -155,6 +155,7 @@ curl="" > curses="" > docs="" > fdt="" > +netmap="" > pixman="" > sdl="" > virtfs="" > @@ -777,6 +778,10 @@ for opt do > ;; > --enable-vde) vde="yes" > ;; > + --disable-netmap) netmap="no" > + ;; > + --enable-netmap) netmap="yes" > + ;; > --disable-xen) xen="no" > ;; > --enable-xen) xen="yes" > @@ -1157,6 +1162,8 @@ echo " --disable-uuid disable uuid support" > echo " --enable-uuid enable uuid support" > echo " --disable-vde disable support for vde network" > echo " --enable-vde enable support for vde network" > +echo " --disable-netmap disable support for netmap network" > +echo " --enable-netmap enable support for netmap network" > echo " --disable-linux-aio disable Linux AIO support" > echo " --enable-linux-aio enable Linux AIO support" > echo " --disable-cap-ng disable libcap-ng support" > @@ -2061,6 +2068,26 @@ EOF > fi > > ########################################## > +# netmap headers probe > +if test "$netmap" != "no" ; then > + cat > $TMPC << EOF > +#include <inttypes.h> > +#include <net/if.h> > +#include <net/netmap.h> > +#include <net/netmap_user.h> > +int main(void) { return 0; } > +EOF > + if compile_prog "" "" ; then > + netmap=yes > + else > + if test "$netmap" = "yes" ; then > + feature_not_found "netmap" > + fi > + netmap=no > + fi > +fi > + > +########################################## > # libcap-ng library probe > if test "$cap_ng" != "no" ; then > cap_libs="-lcap-ng" > @@ -3716,6 +3743,7 @@ echo "uname -r $uname_release" > echo "GUEST_BASE $guest_base" > echo "PIE $pie" > echo "vde support 
$vde" > +echo "netmap support $netmap" > echo "Linux AIO support $linux_aio" > echo "ATTR/XATTR support $attr" > echo "Install blobs $blobs" > @@ -3854,6 +3882,9 @@ fi > if test "$vde" = "yes" ; then > echo "CONFIG_VDE=y" >> $config_host_mak > fi > +if test "$netmap" = "yes" ; then > + echo "CONFIG_NETMAP=y" >> $config_host_mak > +fi > if test "$cap_ng" = "yes" ; then > echo "CONFIG_LIBCAP=y" >> $config_host_mak > fi > diff --git a/hmp-commands.hx b/hmp-commands.hx > index caae5ad..ebe8e78 100644 > --- a/hmp-commands.hx > +++ b/hmp-commands.hx > @@ -1190,7 +1190,7 @@ ETEXI > { > .name = "host_net_add", > .args_type = "device:s,opts:s?", > - .params = "tap|user|socket|vde|dump [options]", > + .params = "tap|user|socket|vde|netmap|dump [options]", > .help = "add host VLAN client", > .mhandler.cmd = net_host_device_add, > }, > @@ -1218,7 +1218,7 @@ ETEXI > { > .name = "netdev_add", > .args_type = "netdev:O", > - .params = "[user|tap|socket|hubport],id=str[,prop=value][,...]", > + .params = > "[user|tap|socket|hubport|netmap],id=str[,prop=value][,...]", > .help = "add host network device", > .mhandler.cmd = hmp_netdev_add, > }, > diff --git a/net/Makefile.objs b/net/Makefile.objs > index 4854a14..c25fe69 100644 > --- a/net/Makefile.objs > +++ b/net/Makefile.objs > @@ -11,3 +11,4 @@ common-obj-$(CONFIG_AIX) += tap-aix.o > common-obj-$(CONFIG_HAIKU) += tap-haiku.o > common-obj-$(CONFIG_SLIRP) += slirp.o > common-obj-$(CONFIG_VDE) += vde.o > +common-obj-$(CONFIG_NETMAP) += netmap.o > diff --git a/net/clients.h b/net/clients.h > index 7793294..7322ff5 100644 > --- a/net/clients.h > +++ b/net/clients.h > @@ -52,4 +52,9 @@ int net_init_vde(const NetClientOptions *opts, const char > *name, > NetClientState *peer); > #endif > > +#ifdef CONFIG_NETMAP > +int net_init_netmap(const NetClientOptions *opts, const char *name, > + NetClientState *peer); > +#endif > + > #endif /* QEMU_NET_CLIENTS_H */ > diff --git a/net/net.c b/net/net.c > index c330c9a..2526e4a 100644 > --- 
a/net/net.c > +++ b/net/net.c > @@ -721,6 +721,9 @@ static int (* const > net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])( > #ifdef CONFIG_VDE > [NET_CLIENT_OPTIONS_KIND_VDE] = net_init_vde, > #endif > +#ifdef CONFIG_NETMAP > + [NET_CLIENT_OPTIONS_KIND_NETMAP] = net_init_netmap, > +#endif > [NET_CLIENT_OPTIONS_KIND_DUMP] = net_init_dump, > #ifdef CONFIG_NET_BRIDGE > [NET_CLIENT_OPTIONS_KIND_BRIDGE] = net_init_bridge, > @@ -752,6 +755,9 @@ static int net_client_init1(const void *object, int > is_netdev, Error **errp) > #ifdef CONFIG_VDE > case NET_CLIENT_OPTIONS_KIND_VDE: > #endif > +#ifdef CONFIG_NETMAP > + case NET_CLIENT_OPTIONS_KIND_NETMAP: > +#endif > #ifdef CONFIG_NET_BRIDGE > case NET_CLIENT_OPTIONS_KIND_BRIDGE: > #endif > diff --git a/net/netmap.c b/net/netmap.c > new file mode 100644 > index 0000000..6dbe138 > --- /dev/null > +++ b/net/netmap.c > @@ -0,0 +1,423 @@ > +/* > + * netmap access for qemu > + * > + * Copyright (c) 2012-2013 Luigi Rizzo > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > copy > + * of this software and associated documentation files (the "Software"), to > deal > + * in the Software without restriction, including without limitation the > rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > + > +#include "net/net.h" > +#include "clients.h" > +#include "sysemu/sysemu.h" > +#include "qemu/error-report.h" > + > +#include <sys/ioctl.h> > +#include <net/if.h> > +#include <sys/mman.h> > +#include <net/netmap.h> > +#include <net/netmap_user.h> > +#include <qemu/iov.h> > + > + > +/* Private netmap device info. */ > +typedef struct NetmapPriv { > + int fd; > + size_t memsize; > + void *mem; > + struct netmap_if *nifp; > + struct netmap_ring *rx; > + struct netmap_ring *tx; > + char fdname[PATH_MAX]; /* Normally "/dev/netmap". > */ > + char ifname[IFNAMSIZ]; > +} NetmapPriv; > + > +typedef struct NetmapState { > + NetClientState nc; > + NetmapPriv me; > + bool read_poll; > + bool write_poll; > + struct iovec iov[IOV_MAX]; > +} NetmapState; > + > +#define D(format, ...) \ > + do { \ > + struct timeval __xxts; \ > + gettimeofday(&__xxts, NULL); \ > + printf("%03d.%06d %s [%d] " format "\n", \ > + (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ > + __func__, __LINE__, ##__VA_ARGS__); \ > + } while (0) > + > +/* Rate limited version of "D", lps indicates how many per second */ > +#define RD(lps, format, ...) \ > + do { \ > + static int t0, __cnt; \ > + struct timeval __xxts; \ > + gettimeofday(&__xxts, NULL); \ > + if (t0 != __xxts.tv_sec) { \ > + t0 = __xxts.tv_sec; \ > + __cnt = 0; \ > + } \ > + if (__cnt++ < lps) { \ > + D(format, ##__VA_ARGS__); \ > + } \ > + } while (0) > + > + > +#ifndef __FreeBSD__ > +#define pkt_copy bcopy > +#else > +/* A fast copy routine only for multiples of 64 bytes, non overlapped. 
*/ > +static inline void > +pkt_copy(const void *_src, void *_dst, int l) > +{ > + const uint64_t *src = _src; > + uint64_t *dst = _dst; > + if (unlikely(l >= 1024)) { > + bcopy(src, dst, l); > + return; > + } > + for (; l > 0; l -= 64) { > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + } > +} > +#endif /* __FreeBSD__ */ > + > +/* > + * Open a netmap device. We assume there is only one queue > + * (which is the case for the VALE bridge). > + */ > +static int netmap_open(NetmapPriv *me) > +{ > + int fd; > + int err; > + size_t l; > + struct nmreq req; > + > + me->fd = fd = open(me->fdname, O_RDWR); > + if (fd < 0) { > + error_report("Unable to open netmap device '%s'", me->fdname); > + return -1; > + } > + bzero(&req, sizeof(req)); > + pstrcpy(req.nr_name, sizeof(req.nr_name), me->ifname); > + req.nr_ringid = NETMAP_NO_TX_POLL; > + req.nr_version = NETMAP_API; > + err = ioctl(fd, NIOCREGIF, &req); > + if (err) { > + error_report("Unable to register %s", me->ifname); > + goto error; > + } > + l = me->memsize = req.nr_memsize; > + > + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); > + if (me->mem == MAP_FAILED) { > + error_report("Unable to mmap"); > + me->mem = NULL; > + goto error; > + } > + > + me->nifp = NETMAP_IF(me->mem, req.nr_offset); > + me->tx = NETMAP_TXRING(me->nifp, 0); > + me->rx = NETMAP_RXRING(me->nifp, 0); > + return 0; > + > +error: > + close(me->fd); > + return -1; > +} > + > +/* Tell the event-loop if the netmap backend can send packets > + to the frontend. */ > +static int netmap_can_send(void *opaque) > +{ > + NetmapState *s = opaque; > + > + return qemu_can_send_packet(&s->nc); > +} > + > +static void netmap_send(void *opaque); > +static void netmap_writable(void *opaque); > + > +/* Set the event-loop handlers for the netmap backend. 
*/ > +static void netmap_update_fd_handler(NetmapState *s) > +{ > + qemu_set_fd_handler2(s->me.fd, > + s->read_poll ? netmap_can_send : NULL, > + s->read_poll ? netmap_send : NULL, > + s->write_poll ? netmap_writable : NULL, > + s); > +} > + > +/* Update the read handler. */ > +static void netmap_read_poll(NetmapState *s, bool enable) > +{ > + if (s->read_poll != enable) { /* Do nothing if not changed. */ > + s->read_poll = enable; > + netmap_update_fd_handler(s); > + } > +} > + > +/* Update the write handler. */ > +static void netmap_write_poll(NetmapState *s, bool enable) > +{ > + if (s->write_poll != enable) { > + s->write_poll = enable; > + netmap_update_fd_handler(s); > + } > +} > + > +static void netmap_poll(NetClientState *nc, bool enable) > +{ > + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); > + > + if (s->read_poll != enable || s->write_poll != enable) { > + s->read_poll = enable; > + s->read_poll = enable; > + netmap_update_fd_handler(s); > + } > +} > + > +/* > + * The fd_write() callback, invoked if the fd is marked as > + * writable after a poll. Unregister the handler and flush any > + * buffered packets. > + */ > +static void netmap_writable(void *opaque) > +{ > + NetmapState *s = opaque; > + > + netmap_write_poll(s, false); > + qemu_flush_queued_packets(&s->nc); > +} > + > +static ssize_t netmap_receive(NetClientState *nc, > + const uint8_t *buf, size_t size) > +{ > + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); > + struct netmap_ring *ring = s->me.tx; > + > + if (unlikely(size > ring->nr_buf_size)) { > + RD(5, "[netmap_receive] drop packet of size %d > %d\n", > + (int)size, ring->nr_buf_size); > + return size; > + } > + > + if (ring) { > + if (ring->avail == 0) { > + /* No available slots in the netmap TX ring. 
*/ > + netmap_write_poll(s, true); > + return 0; > + } > + uint32_t i = ring->cur; > + uint32_t idx = ring->slot[i].buf_idx; > + uint8_t *dst = (uint8_t *)NETMAP_BUF(ring, idx); > + > + ring->slot[i].len = size; > + ring->slot[i].flags = 0; > + pkt_copy(buf, dst, size); > + ring->cur = NETMAP_RING_NEXT(ring, i); > + ring->avail--; > + ioctl(s->me.fd, NIOCTXSYNC, NULL); > + } > + > + return size; > +} > + > +static ssize_t netmap_receive_iov(NetClientState *nc, > + const struct iovec *iov, int iovcnt) > +{ > + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); > + struct netmap_ring *ring = s->me.tx; > + > + if (ring) { > + uint32_t i = 0; > + uint32_t idx; > + uint8_t *dst; > + int j; > + uint32_t cur = ring->cur; > + uint32_t avail = ring->avail; > + int iov_frag_size; > + int nm_frag_size; > + int offset; > + > + if (avail < iovcnt) { > + /* Not enough netmap slots. */ > + netmap_write_poll(s, true); > + return 0; > + } > + > + for (j = 0; j < iovcnt; j++) { > + iov_frag_size = iov[j].iov_len; > + offset = 0; > + > + /* Split each iovec fragment over more netmap slots, if > + necessary (without performing data copy). */ > + while (iov_frag_size) { > + nm_frag_size = MIN(iov_frag_size, ring->nr_buf_size); > + > + if (unlikely(avail == 0)) { > + /* We run out of netmap slots while splitting the > + iovec fragments. */ > + return 0; > + } > + > + i = cur; > + idx = ring->slot[i].buf_idx; > + dst = (uint8_t *)NETMAP_BUF(ring, idx); > + > + ring->slot[i].len = nm_frag_size; > + ring->slot[i].flags = NS_MOREFRAG; > + pkt_copy(iov[j].iov_base + offset, dst, nm_frag_size); > + > + cur = NETMAP_RING_NEXT(ring, i); > + avail--; > + > + offset += nm_frag_size; > + iov_frag_size -= nm_frag_size; > + } > + } > + /* The last slot must not have NS_MOREFRAG set. */ > + ring->slot[i].flags &= ~NS_MOREFRAG; > + > + /* Now update ring->cur and ring->avail. 
*/ > + ring->cur = cur; > + ring->avail = avail; > + > + ioctl(s->me.fd, NIOCTXSYNC, NULL); > + } > + > + return iov_size(iov, iovcnt); > +} > + > +/* Complete a previous send (backend --> guest) and enable the > + fd_read callback. */ > +static void netmap_send_completed(NetClientState *nc, ssize_t len) > +{ > + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); > + > + netmap_read_poll(s, true); > +} > + > +static void netmap_send(void *opaque) > +{ > + NetmapState *s = opaque; > + struct netmap_ring *ring = s->me.rx; > + > + /* Keep sending while there are available packets into the netmap > + RX ring and the forwarding path towards the peer is open. */ > + while (ring->avail > 0 && qemu_can_send_packet(&s->nc)) { > + uint32_t i; > + uint32_t idx; > + bool morefrag; > + int iovcnt = 0; > + int iovsize; > + > + do { > + i = ring->cur; > + idx = ring->slot[i].buf_idx; > + morefrag = (ring->slot[i].flags & NS_MOREFRAG); > + s->iov[iovcnt].iov_base = (u_char *)NETMAP_BUF(ring, idx); > + s->iov[iovcnt].iov_len = ring->slot[i].len; > + iovcnt++; > + > + ring->cur = NETMAP_RING_NEXT(ring, i); > + ring->avail--; > + } while (ring->avail && morefrag); > + > + if (unlikely(!ring->avail && morefrag)) { > + RD(5, "[netmap_send] ran out of slots, with a pending" > + "incomplete packet\n"); > + } > + > + iovsize = qemu_sendv_packet_async(&s->nc, s->iov, iovcnt, > + netmap_send_completed); > + > + if (iovsize == 0) { > + /* The peer does not receive anymore. Packet is queued, stop > + * reading from the backend until netmap_send_completed() > + */ > + netmap_read_poll(s, false); > + break; > + } > + } > +} > + > +/* Flush and close. 
*/ > +static void netmap_cleanup(NetClientState *nc) > +{ > + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); > + > + qemu_purge_queued_packets(nc); > + > + netmap_poll(nc, false); > + munmap(s->me.mem, s->me.memsize); > + close(s->me.fd); > + > + s->me.fd = -1; > +} > + > + > +/* NetClientInfo methods */ > +static NetClientInfo net_netmap_info = { > + .type = NET_CLIENT_OPTIONS_KIND_NETMAP, > + .size = sizeof(NetmapState), > + .receive = netmap_receive, > + .receive_iov = netmap_receive_iov, > + .poll = netmap_poll, > + .cleanup = netmap_cleanup, > +}; > + > +/* The exported init function > + * > + * ... -net netmap,ifname="..." > + */ > +int net_init_netmap(const NetClientOptions *opts, > + const char *name, NetClientState *peer) > +{ > + const NetdevNetmapOptions *netmap_opts = opts->netmap; > + NetClientState *nc; > + NetmapPriv me; > + NetmapState *s; > + > + pstrcpy(me.fdname, sizeof(me.fdname), > + netmap_opts->has_devname ? netmap_opts->devname : "/dev/netmap"); > + /* Set default name for the port if not supplied. */ > + pstrcpy(me.ifname, sizeof(me.ifname), > + netmap_opts->has_ifname ? netmap_opts->ifname : "vale0"); > + if (netmap_open(&me)) { > + return -1; > + } > + /* Create the object. */ > + nc = qemu_new_net_client(&net_netmap_info, peer, "netmap", name); > + s = DO_UPCAST(NetmapState, nc, nc); > + s->me = me; > + netmap_read_poll(s, true); /* Initially only poll for reads. 
*/ > + > + return 0; > +} > + > diff --git a/qapi-schema.json b/qapi-schema.json > index 60f3fd1..6bb19ae 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -2984,6 +2984,22 @@ > 'hubid': 'int32' } } > > ## > +# @NetdevNetmapOptions > +# > +# Connect two or more net clients through a VALE switch > +# > +# @ifname: optional name of the VALE port > +# > +# @devname: optional path of the netmap device > +# > +# Since 1.2 > +## > +{ 'type': 'NetdevNetmapOptions', > + 'data': { > + '*ifname': 'str', > + '*devname': 'str' } } > + > +## > # @NetClientOptions > # > # A discriminated record of network device traits. > @@ -3000,7 +3016,8 @@ > 'vde': 'NetdevVdeOptions', > 'dump': 'NetdevDumpOptions', > 'bridge': 'NetdevBridgeOptions', > - 'hubport': 'NetdevHubPortOptions' } } > + 'hubport': 'NetdevHubPortOptions', > + 'netmap': 'NetdevNetmapOptions' } } > > ## > # @NetLegacy > diff --git a/qemu-options.hx b/qemu-options.hx > index 5dc8b75..31a2004 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -1409,6 +1409,11 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " Use group 'groupname' and mode 'octalmode' to change > default\n" > " ownership and permissions for communication port.\n" > #endif > +#ifdef CONFIG_NETMAP > + "-net netmap[,vlan=n][,ifname=name][,devname=name]\n" > + " connect the vlan 'n' to VALE port 'name'\n" > + " ('devname' is name of the netmap device, defaults to > '/dev/netmap')\n" > +#endif > "-net dump[,vlan=n][,file=f][,len=n]\n" > " dump traffic on vlan 'n' to file 'f' (max n bytes per > packet)\n" > "-net none use it alone to have zero network devices. If no -net > option\n" > @@ -1423,6 +1428,9 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, > #ifdef CONFIG_VDE > "vde|" > #endif > +#ifdef CONFIG_NETMAP > + "netmap|" > +#endif > "socket|" > "hubport],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL) > STEXI > -- > 1.8.4.1 > >