> -----Original Message-----
> From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Ciara Loftus
> Sent: Wednesday, May 11, 2016 4:31 PM
> To: dev@openvswitch.org
> Subject: [ovs-dev] [PATCH v3 2/3] netdev-dpdk: Add vHost User PMD
>
> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
> to be controlled by the librte_ether API, like physical 'dpdk' ports.
> The commit integrates this functionality into OVS, and refactors some
> of the existing vhost code such that it is vhost-cuse specific.
> Similarly, there is now some overlap between dpdk and vhost-user port
> code.
>
> Signed-off-by: Ciara Loftus <ciara.lof...@intel.com>
Hi, a few minor comments below. I didn't review the cuse-specific code this
time around.

Kevin.

> ---
>  INSTALL.DPDK.md   |  12 ++
>  NEWS              |   2 +
>  lib/netdev-dpdk.c | 628 +++++++++++++++++++++++++++++++++---------------------
>  3 files changed, 396 insertions(+), 246 deletions(-)
>
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 93f92e4..db7153a 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -990,6 +990,18 @@ Restrictions:
>      increased to the desired number of queues. Both DPDK and OVS must be
>      recompiled for this change to take effect.
>
> +  DPDK 'eth' type ports:
> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context of
> +    DPDK as they are all managed by the rte_ether API. This means that they
> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which by
> +    default is set to 32. This means by default the combined total number of
> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32. This
> +    value can be changed if desired by modifying the configuration file in
> +    DPDK, or by overriding the default value on the command line when building
> +    DPDK. eg.
> +
> +    `make install CONFIG_RTE_MAX_ETHPORTS=64`
> +
>  Bug Reporting:
>  --------------
>
> diff --git a/NEWS b/NEWS
> index 4e81cad..841314b 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -32,6 +32,8 @@ Post-v2.5.0
>       * DB entries have been added for many of the DPDK EAL command line
>         arguments. Additional arguments can be passed via the dpdk-extra
>         entry.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>     - ovs-benchmark: This utility has been removed due to lack of use and
>       bitrot.
>     - ovs-appctl:
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 89d783a..814ef83 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -55,6 +55,7 @@
>  #include "unixctl.h"
>
>  #include "rte_config.h"
> +#include "rte_eth_vhost.h"
>  #include "rte_mbuf.h"
>  #include "rte_meter.h"
>  #include "rte_virtio_net.h"
> @@ -139,6 +140,11 @@ static char *cuse_dev_name = NULL;  /* Character device cuse_dev_name. */
>  #endif
>  static char *vhost_sock_dir = NULL;  /* Location of vhost-user sockets */
>
> +/* Array that tracks the used & unused vHost user driver IDs */
> +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];

I think you can replace this array with a counter. You don't need a unique
id - just that you are < MAX.
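Something like this rough, untested sketch is what I have in mind (the
vhost_user_dev_count name is just for illustration):

    /* Count of vhost-user devices currently attached to the PMD. */
    static unsigned int vhost_user_dev_count;

    /* At construct time, in place of get_vhost_user_drv_id(): */
    if (vhost_user_dev_count >= RTE_MAX_ETHPORTS) {
        err = ENODEV;   /* no free ethdev slots left */
        goto out;
    }
    vhost_user_dev_count++;

    /* ...with the matching decrement in the destruct path. */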
> +/* Maximum string length allowed to provide to rte_eth_attach function */
> +#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)
> +
>  /*
>   * Maximum amount of time in micro seconds to try and enqueue to vhost.
>   */
> @@ -172,7 +178,8 @@ enum { DRAIN_TSC = 200000ULL };
>
>  enum dpdk_dev_type {
>      DPDK_DEV_ETH = 0,
> -    DPDK_DEV_VHOST = 1,
> +    DPDK_DEV_VHOST_USER = 1,
> +    DPDK_DEV_VHOST_CUSE = 2,
>  };
>
>  static int rte_eal_init_ret = ENODEV;
> @@ -358,12 +365,22 @@ struct netdev_dpdk {
>      int real_n_rxq;
>      bool txq_needs_locking;
>
> -    /* virtio-net structure for vhost device */
> +    /* Spinlock for vhost cuse transmission. Other DPDK devices use spinlocks
> +     * in dpdk_tx_queue */
> +    rte_spinlock_t vhost_cuse_tx_lock;
> +
> +    /* virtio-net structure for vhost cuse device */
>      OVSRCU_TYPE(struct virtio_net *) virtio_dev;
>
> +    /* Number of virtqueue pairs reported by the guest */
> +    uint32_t vhost_qp_nb;
> +
>      /* Identifier used to distinguish vhost devices from each other */
>      char vhost_id[PATH_MAX];
>
> +    /* ID of vhost user port given to the PMD driver */
> +    unsigned int vhost_pmd_id;
> +

This could be removed if you just use a counter as per comment above.

>      /* In dpdk_list. */
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>
> @@ -381,16 +398,20 @@ struct netdev_rxq_dpdk {
>  static bool dpdk_thread_is_pmd(void);
>
>  static int netdev_dpdk_construct(struct netdev *);
> +static int netdev_dpdk_vhost_user_construct(struct netdev *);
>
>  struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
>
>  void link_status_changed_callback(uint8_t port_id,
>          enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
>
>  static bool
> -is_dpdk_class(const struct netdev_class *class)
> +is_dpdk_eth_class(const struct netdev_class *class)
>  {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_user_construct));
>  }
>
>  /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
> @@ -592,7 +613,12 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>      }
>
>      dev->up.n_rxq = n_rxq;
> -    dev->real_n_txq = n_txq;
> +    /* Only set real_n_txq for physical devices. vHost User devices will
> +     * set this value correctly during vring_state_changed callbacks.
> +     */
> +    if (dev->type == DPDK_DEV_ETH) {
> +        dev->real_n_txq = n_txq;
> +    }
>
>      return 0;
>  }
> @@ -697,6 +723,118 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
>      }
>  }
>
> +/*
> + * Fixes mapping for vhost-user tx queues. Must be called after each
> + * enabling/disabling of queues and real_n_txq modifications.
> + */
> +static void
> +netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    int *enabled_queues, n_enabled = 0;
> +    int i, k, total_txqs = dev->real_n_txq;
> +
> +    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
> +
> +    for (i = 0; i < total_txqs; i++) {
> +        /* Enabled queues always mapped to themselves. */
> +        if (dev->tx_q[i].map == i) {
> +            enabled_queues[n_enabled++] = i;
> +        }
> +    }
> +
> +    if (n_enabled == 0 && total_txqs != 0) {
> +        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> +        n_enabled = 1;
> +    }
> +
> +    k = 0;
> +    for (i = 0; i < total_txqs; i++) {
> +        if (dev->tx_q[i].map != i) {
> +            dev->tx_q[i].map = enabled_queues[k];
> +            k = (k + 1) % n_enabled;
> +        }
> +    }
> +
> +    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
> +    for (i = 0; i < total_txqs; i++) {
> +        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
> +    }
> +
> +    rte_free(enabled_queues);
> +}
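For other reviewers following along: with e.g. total_txqs = 4 and only
queues 0 and 2 left enabled by the guest, the second loop above round-robins
the unmapped qids over the enabled ones, giving

    0 --> 0,  1 --> 0,  2 --> 2,  3 --> 2

so a send on a disabled qid still lands on a live guest queue.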
> +
> +static void
> +netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    dev->real_n_rxq = dev->vhost_qp_nb;
> +    dev->real_n_txq = dev->vhost_qp_nb;
> +    dev->txq_needs_locking = true;
> +
> +    /* Enable TX queue 0 by default if it wasn't disabled. */
> +    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> +        dev->tx_q[0].map = 0;
> +    }
> +
> +    netdev_dpdk_remap_txqs(dev);
> +
> +    return;
> +}
> +
> +/* Clears mapping for all available queues of vhost interface. */
> +static void
> +netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    int i;
> +
> +    for (i = 0; i < dev->real_n_txq; i++) {
> +        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> +    }
> +}
> +
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || (event.rx == 1)) {

if (err || event.rx) would be clearer.

> +        return;
> +    }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->tx_q[event.queue_id].map = event.queue_id;
> +                dev->vhost_qp_nb++;
> +            } else {
> +                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
> +                dev->vhost_qp_nb--;
> +            }
> +            if (dev->vhost_qp_nb > dev->up.n_rxq) {

There are a few places throughout the code where vhost_qp_nb is assumed to
have a value, though it is initialized as 0. It *should* be ok, but perhaps
the initialization could be changed, or 0 could be incorporated into the
range checks. What do you think?
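For example, something like this (untested, just to illustrate folding 0
into the check as a "no queues reported yet" case):

    if (dev->vhost_qp_nb == 0 || dev->vhost_qp_nb > dev->up.n_rxq) {
        /* Device not ready, or guest queues out-number OVS queues. */
        ...
    }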
> +                VLOG_ERR("Error configuring vhost device %s - guest queues "
> +                         "out-number queues configured in OVS %d > %d",
> +                         dev->vhost_id, dev->vhost_qp_nb, dev->up.n_rxq);
> +            } else {
> +                netdev_dpdk_vhost_set_queues(dev);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
>  void
>  link_status_changed_callback(uint8_t port_id,
>                               enum rte_eth_event_type type OVS_UNUSED,
> @@ -709,6 +847,25 @@ link_status_changed_callback(uint8_t port_id,
>          if (port_id == dev->port_id) {
>              ovs_mutex_lock(&dev->mutex);
>              check_link_status(dev);
> +            if (dev->type == DPDK_DEV_VHOST_USER) {
> +                if (dev->link.link_status == ETH_LINK_UP) {
> +                    /* new device */
> +                    if (dev->vhost_qp_nb > dev->up.n_rxq) {
> +                        ovs_mutex_unlock(&dev->mutex);
> +                        ovs_mutex_unlock(&dpdk_mutex);
> +                        return;
> +                    } else {
> +                        netdev_dpdk_vhost_set_queues(dev);
> +                        VLOG_INFO("vHost Device '%s' has been added",
> +                                  dev->vhost_id);
> +                    }
> +                } else {
> +                    /* destroy device */
> +                    netdev_dpdk_txq_map_clear(dev);
> +                    VLOG_INFO("vHost Device '%s' has been removed",
> +                              dev->vhost_id);
> +                }
> +            }
>              ovs_mutex_unlock(&dev->mutex);
>              break;
>          }
> @@ -727,6 +884,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      int sid;
>      int err = 0;
>      uint32_t buf_size;
> +    unsigned int nr_q = 0;
>
>      ovs_mutex_init(&dev->mutex);
>      ovs_mutex_lock(&dev->mutex);
>
>      /* If the 'sid' is negative, it means that the kernel fails
>       * to obtain the pci numa info. In that situation, always
>       * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>          sid = rte_eth_dev_socket_id(port_no);
>      } else {
>          sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> @@ -764,22 +922,27 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      netdev->n_rxq = NR_QUEUE;
>      netdev->requested_n_rxq = NR_QUEUE;
>      dev->real_n_txq = NR_QUEUE;
> +    dev->vhost_qp_nb = 0;

Similar to the above comment - the number of queues defaults to 1, but
qp_nb to 0. I understand you'll update qp_nb in vring_state_changed, but
you'll need to be sure it's safe to have them in this state in the
meantime.

>
> -    if (type == DPDK_DEV_ETH) {
> -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
> +    if (type != DPDK_DEV_VHOST_CUSE) {
> +        nr_q = (type == DPDK_DEV_ETH ? 1 : RTE_MAX_QUEUES_PER_PORT);
> +        netdev_dpdk_alloc_txq(dev, nr_q);
>          err = dpdk_eth_dev_init(dev);
>          if (err) {
>              goto unlock;
>          }
> -    } else {
> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
>      }
>
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>          rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
>                                        (void*)link_status_changed_callback,
>                                        NULL);
>      }
> +    if (type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                      (void*)vring_state_changed_callback,
> +                                      NULL);
> +    }
>
>      ovs_list_push_back(&dpdk_list, &dev->list_node);
>
> @@ -813,16 +976,6 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
>  }
>
>  static int
> -vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
> -{
> -    if (rte_eal_init_ret) {
> -        return rte_eal_init_ret;
> -    }
> -
> -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> -}
> -
> -static int
>  netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -834,22 +987,48 @@ netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
>
>      ovs_mutex_lock(&dpdk_mutex);
>      strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
> -    err = vhost_construct_helper(netdev);
> +
> +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
> +
> +    err = rte_eal_init_ret ? rte_eal_init_ret :
> +          netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
> +
>      ovs_mutex_unlock(&dpdk_mutex);
>      return err;
>  }
>
>  static int
> +get_vhost_user_drv_id(void)
> +{
> +    int i = 0;
> +
> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
> +        if (vhost_user_drv_ids[i] == 0) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
> +static int
>  netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      const char *name = netdev->name;
>      int err;
> +    uint8_t port_no = 0;
> +    char devargs[DEVARGS_MAX];
> +    int driver_id = 0;
> +
> +    if (rte_eal_init_ret) {
> +        return rte_eal_init_ret;
> +    }
>
>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
>       * the file system. '/' or '\' would traverse directories, so they're not
>       * acceptable in 'name'. */
> -    if (strchr(name, '/') || strchr(name, '\\')) {
> +    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
>          VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
>                   "A valid name must not include '/' or '\\'",
>                   name);
> @@ -866,18 +1045,32 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>       */
>      snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
>               vhost_sock_dir, name);
> +    driver_id = get_vhost_user_drv_id();
> +    if (driver_id == -1) {
> +        VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user"
> +                 "devices registered with PMD", dev->vhost_id);
> +        err = ENODEV;
> +        goto out;
> +
> +    } else {
> +        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",
> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +    }
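Just to spell out what gets attached here, for my own sanity: for a port
named vhost0 this builds a devargs string along the lines of

    eth_vhost0,iface=<vhost_sock_dir>/vhost0,queues=<RTE_MAX_QUEUES_PER_PORT>

with the actual path and queue count depending on the configured socket
directory and the DPDK build, and rte_eth_dev_attach() then instantiates
that as a vhost PMD ethdev.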
> " > "A valid name must not include '/' or '\\'", > name); > @@ -866,18 +1045,32 @@ netdev_dpdk_vhost_user_construct(struct netdev > *netdev) > */ > snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s", > vhost_sock_dir, name); > + driver_id = get_vhost_user_drv_id(); > + if (driver_id == -1) { > + VLOG_ERR("Unable to create vhost-user device %s - too many > vhost-user" > + "devices registered with PMD", dev->vhost_id); > + err = ENODEV; > + goto out; > + > + } else { > + snprintf(devargs, sizeof(devargs), > "eth_vhost%u,iface=%s,queues=%i", > + driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT); > + err = rte_eth_dev_attach(devargs, &port_no); > + } > > - err = rte_vhost_driver_register(dev->vhost_id); > if (err) { > - VLOG_ERR("vhost-user socket device setup failure for socket > %s\n", > + VLOG_ERR("Failed to attach vhost-user device %s to DPDK", > dev->vhost_id); > } else { > fatal_signal_add_file_to_unlink(dev->vhost_id); > VLOG_INFO("Socket %s created for vhost-user port %s\n", > dev->vhost_id, name); > - err = vhost_construct_helper(netdev); > + dev->vhost_pmd_id = driver_id; > + vhost_user_drv_ids[dev->vhost_pmd_id] = 1; If you do keep this, then you have a get_vhost_user_drv_id(), so you should probably match with a set_ > + err = netdev_dpdk_init(netdev, port_no, > DPDK_DEV_VHOST_USER); > } > > +out: > ovs_mutex_unlock(&dpdk_mutex); > return err; > } > @@ -910,6 +1103,12 @@ netdev_dpdk_destruct(struct netdev *netdev) > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > > ovs_mutex_lock(&dev->mutex); > + > + if (dev->type == DPDK_DEV_VHOST_USER) { > + rte_eth_dev_detach(dev->port_id, dev->vhost_id); > + vhost_user_drv_ids[dev->vhost_pmd_id] = 0; > + } > + > rte_eth_dev_stop(dev->port_id); > ovs_mutex_unlock(&dev->mutex); > > @@ -921,7 +1120,7 @@ netdev_dpdk_destruct(struct netdev *netdev) > } > > static void > -netdev_dpdk_vhost_destruct(struct netdev *netdev) > +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev) > { > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > > @@ -929,15 +1128,8 @@ netdev_dpdk_vhost_destruct(struct netdev > *netdev) > if (netdev_dpdk_get_virtio(dev) != NULL) { > VLOG_ERR("Removing port '%s' while vhost device still > attached.", > netdev->name); > - VLOG_ERR("To restore connectivity after re-adding of port, > VM on socket" > - " '%s' must be restarted.", > - dev->vhost_id); > - } > - > - if (rte_vhost_driver_unregister(dev->vhost_id)) { > - VLOG_ERR("Unable to remove vhost-user socket %s", dev- > >vhost_id); > - } else { > - fatal_signal_remove_file_to_unlink(dev->vhost_id); > + VLOG_ERR("To restore connectivity after re-adding of port, > VM with" > + "port '%s' must be restarted.", dev->vhost_id); > } > > ovs_mutex_lock(&dpdk_mutex); > @@ -1037,11 +1229,12 @@ netdev_dpdk_set_multiq(struct netdev *netdev, > unsigned int n_txq, > } > > static int > -netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned > int n_txq, > - unsigned int n_rxq) > +netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned > int n_txq, > + unsigned int n_rxq) > { > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > int err = 0; > + int old_rxq, old_txq; > > if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) { > return err; > @@ -1050,10 +1243,20 @@ netdev_dpdk_vhost_cuse_set_multiq(struct > netdev *netdev, unsigned int n_txq, > ovs_mutex_lock(&dpdk_mutex); > ovs_mutex_lock(&dev->mutex); > > + rte_eth_dev_stop(dev->port_id); > + > + old_txq = netdev->n_txq; > + old_rxq = netdev->n_rxq; > netdev->n_txq = n_txq; > - dev->real_n_txq = 1; > - 
> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
>      }
>
> +out:
>      ovs_mutex_unlock(&dpdk_mutex);
>      return err;
>  }
> @@ -910,6 +1103,12 @@ netdev_dpdk_destruct(struct netdev *netdev)
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>      ovs_mutex_lock(&dev->mutex);
> +
> +    if (dev->type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
> +    }
> +
>      rte_eth_dev_stop(dev->port_id);
>      ovs_mutex_unlock(&dev->mutex);
>
> @@ -921,7 +1120,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>  }
>
>  static void
> -netdev_dpdk_vhost_destruct(struct netdev *netdev)
> +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> @@ -929,15 +1128,8 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>      if (netdev_dpdk_get_virtio(dev) != NULL) {
>          VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                   netdev->name);
> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
> -                 " '%s' must be restarted.",
> -                 dev->vhost_id);
> -    }
> -
> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {
> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
> -    } else {
> -        fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +        VLOG_ERR("To restore connectivity after re-adding of port, VM with"
> +                 "port '%s' must be restarted.", dev->vhost_id);
>      }
>
>      ovs_mutex_lock(&dpdk_mutex);
> @@ -1037,11 +1229,12 @@ netdev_dpdk_set_multiq(struct netdev *netdev, unsigned int n_txq,
>  }
>
>  static int
> -netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
> -                                  unsigned int n_rxq)
> +netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned int n_txq,
> +                                  unsigned int n_rxq)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err = 0;
> +    int old_rxq, old_txq;
>
>      if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
>          return err;
> @@ -1050,10 +1243,20 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
>      ovs_mutex_lock(&dpdk_mutex);
>      ovs_mutex_lock(&dev->mutex);
>
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    old_txq = netdev->n_txq;
> +    old_rxq = netdev->n_rxq;
>      netdev->n_txq = n_txq;
> -    dev->real_n_txq = 1;
> -    netdev->n_rxq = 1;
> -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
> +    netdev->n_rxq = n_rxq;
> +
> +    err = dpdk_eth_dev_init(dev);
> +    if (err) {
> +        /* If there has been an error, it means that the requested queues
> +         * have not been created. Restore the old numbers. */
> +        netdev->n_txq = old_txq;
> +        netdev->n_rxq = old_rxq;
> +    }
>
>      ovs_mutex_unlock(&dev->mutex);
>      ovs_mutex_unlock(&dpdk_mutex);
> @@ -1062,8 +1265,8 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
>  }
>
>  static int
> -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
> -                             unsigned int n_rxq)
> +netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
> +                                  unsigned int n_rxq)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err = 0;
> @@ -1076,7 +1279,8 @@ netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
>      ovs_mutex_lock(&dev->mutex);
>
>      netdev->n_txq = n_txq;
> -    netdev->n_rxq = n_rxq;
> +    dev->real_n_txq = 1;
> +    netdev->n_rxq = 1;
>
>      ovs_mutex_unlock(&dev->mutex);
>      ovs_mutex_unlock(&dpdk_mutex);
> @@ -1171,13 +1375,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>  }
>
>  static bool
> -is_vhost_running(struct virtio_net *virtio_dev)
> +is_vhost_cuse_running(struct virtio_net *virtio_dev)
>  {
>      return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
>  }
>
>  static inline void
> -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_rx_size_counters(struct netdev_stats *stats,
>                                            unsigned int packet_size)
>  {
>      /* Hard-coded search for the size bucket. */
> @@ -1203,7 +1407,7 @@ netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
>  }
>
>  static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
>                                       struct dp_packet **packets, int count)
>  {
>      int i;
> @@ -1224,7 +1428,7 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
>              continue;
>          }
>
> -        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
> +        netdev_dpdk_vhost_cuse_update_rx_size_counters(stats, packet_size);
>
>          struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
>          if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
> @@ -1236,26 +1440,22 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
>  }
>
>  /*
> - * The receive path for the vhost port is the TX path out from guest.
> + * The receive path for the vhost cuse port is the TX path out from guest.
> */ > static int > -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, > +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq, > struct dp_packet **packets, int *c) > { > struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev); > struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev); > - int qid = rxq->queue_id; > + int qid = 1; > uint16_t nb_rx = 0; > > - if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) { > + if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) { > return EAGAIN; > } > > - if (rxq->queue_id >= dev->real_n_rxq) { > - return EOPNOTSUPP; > - } > - > - nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + > VIRTIO_TXQ, > + nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid, > dev->dpdk_mp->mp, > (struct rte_mbuf **)packets, > NETDEV_MAX_BURST); > @@ -1264,7 +1464,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq > *rxq, > } > > rte_spinlock_lock(&dev->stats_lock); > - netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, > nb_rx); > + netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets, > nb_rx); > rte_spinlock_unlock(&dev->stats_lock); > > *c = (int) nb_rx; > @@ -1300,6 +1500,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, > struct dp_packet **packets, > return 0; > } > > +static int > +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq, > + struct dp_packet **packets, int *c) > +{ > + struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev); > + > + if (rxq->queue_id >= dev->real_n_rxq) { > + return EOPNOTSUPP; > + } > + > + return netdev_dpdk_rxq_recv(rxq, packets, c); > +} > static inline int > netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf > **pkts, > int cnt) > @@ -1318,7 +1530,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev, > struct rte_mbuf **pkts, > } > > static inline void > -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats, > +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats > *stats, > struct dp_packet **packets, > int attempted, > int dropped) > @@ -1335,9 +1547,8 @@ netdev_dpdk_vhost_update_tx_counters(struct > netdev_stats *stats, > } > > static void > -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid, > - struct dp_packet **pkts, int cnt, > - bool may_steal) > +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct > dp_packet **pkts, > + int cnt, bool may_steal) > { > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev); > @@ -1346,26 +1557,24 @@ __netdev_dpdk_vhost_send(struct netdev > *netdev, int qid, > unsigned int qos_pkts = cnt; > uint64_t start = 0; > > - qid = dev->tx_q[qid % dev->real_n_txq].map; > - > - if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) { > + if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) { > rte_spinlock_lock(&dev->stats_lock); > dev->stats.tx_dropped+= cnt; > rte_spinlock_unlock(&dev->stats_lock); > goto out; > } > > - rte_spinlock_lock(&dev->tx_q[qid].tx_lock); > + /* There is a single vhost-cuse TX queue, So we need to lock it > for TX. 
> +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);
>
>      /* Check has QoS has been configured for the netdev */
>      cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
>      qos_pkts -= cnt;
>
>      do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>          unsigned int tx_pkts;
>
> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
> +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
>                                            cur_pkts, cnt);
>          if (OVS_LIKELY(tx_pkts)) {
>              /* Packets have been sent.*/
> @@ -1384,7 +1593,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>               * Unable to enqueue packets to vhost interface.
>               * Check available entries before retrying.
>               */
> -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
> +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
>                  if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
>                      expired = 1;
>                      break;
>                  }
> @@ -1396,12 +1605,12 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>              }
>          }
>      } while (cnt);
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);
>
>      rte_spinlock_lock(&dev->stats_lock);
>      cnt += qos_pkts;
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
> +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts, total_pkts,
> +                                              cnt);
>      rte_spinlock_unlock(&dev->stats_lock);
>
>  out:
> @@ -1495,8 +1704,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>          newcnt++;
>      }
>
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
> +    if (dev->type == DPDK_DEV_VHOST_CUSE) {
> +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
> +                                      newcnt, true);
>      } else {
>          unsigned int qos_pkts = newcnt;
>
> @@ -1520,8 +1730,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>  }
>
>  static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
> -                       int cnt, bool may_steal)
> +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
> +                            struct dp_packet **pkts, int cnt, bool may_steal)
>  {
>      if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
>          int i;
> @@ -1533,7 +1743,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
>              }
>          }
>      } else {
> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
> +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
>      }
>      return 0;
>  }
> @@ -1544,11 +1754,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>  {
>      int i;
>
> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> -        qid = qid % dev->real_n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -
>      if (OVS_UNLIKELY(!may_steal ||
>                       pkts[0]->source != DPBUF_DPDK)) {
>          struct netdev *netdev = &dev->up;
> @@ -1607,19 +1812,49 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>              rte_spinlock_unlock(&dev->stats_lock);
>          }
>      }
> +}
> +
> +static int
> +netdev_dpdk_eth_send(struct netdev *netdev, int qid,
> +                     struct dp_packet **pkts, int cnt, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> +        qid = qid % dev->real_n_txq;
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    }
> +
> +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
>
>      if (OVS_UNLIKELY(dev->txq_needs_locking)) {
>          rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>      }
> +
> +    return 0;
>  }
>
>  static int
> -netdev_dpdk_eth_send(struct netdev *netdev, int qid,
> +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
>                       struct dp_packet **pkts, int cnt, bool may_steal)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> -    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +    qid = dev->tx_q[qid % dev->real_n_txq].map;
> +    if (qid == -1) {
> +        if (may_steal) {
> +            int i;
> +
> +            for (i = 0; i < cnt; i++) {
> +                dp_packet_delete(pkts[i]);

Can we capture these packets in stats?
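Something along the lines of what the cuse path already does would do it
(sketch):

    rte_spinlock_lock(&dev->stats_lock);
    dev->stats.tx_dropped += cnt;
    rte_spinlock_unlock(&dev->stats_lock);

so drops on a disabled queue show up in tx_dropped like everywhere else.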
> +            }
> +        }
> +    } else {
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +        netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      return 0;
>  }
>
> @@ -1717,7 +1952,7 @@ static int
>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
>  static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
>                              struct netdev_stats *stats)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1847,14 +2082,26 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
>      stats->tx_packets = rte_stats.opackets;
>      stats->rx_bytes = rte_stats.ibytes;
>      stats->tx_bytes = rte_stats.obytes;
> -    /* DPDK counts imissed as errors, but count them here as dropped instead */
> -    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
> -    stats->tx_errors = rte_stats.oerrors;
> -    stats->multicast = rte_stats.imcasts;
>
> -    rte_spinlock_lock(&dev->stats_lock);
> -    stats->tx_dropped = dev->stats.tx_dropped;
> -    rte_spinlock_unlock(&dev->stats_lock);
> +    if (dev->type == DPDK_DEV_ETH) {
> +        /* DPDK counts imissed as errors, but count them here as dropped
> +         * instead */
> +        stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
> +        stats->tx_errors = rte_stats.oerrors;
> +        stats->multicast = rte_stats.imcasts;
> +
> +        rte_spinlock_lock(&dev->stats_lock);
> +        stats->tx_dropped = dev->stats.tx_dropped;
> +        rte_spinlock_unlock(&dev->stats_lock);
> +    } else {
> +        stats->rx_errors = UINT64_MAX;
> +        stats->tx_errors = UINT64_MAX;
> +        stats->multicast = UINT64_MAX;
> +
> +        rte_spinlock_lock(&dev->stats_lock);
> +        stats->tx_dropped = UINT64_MAX;

We should still be able to count tx dropped.
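i.e. keep the same accounting as the DPDK_DEV_ETH branch rather than
reporting UINT64_MAX (sketch):

    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);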
> +        rte_spinlock_unlock(&dev->stats_lock);
> +    }

I assume there's some rework needed here because of the extended stats that
have been merged now?

>
>      /* These are the available DPDK counters for packets not received due to
>       * local resource constraints in DPDK and NIC respectively. */
> @@ -1940,14 +2187,14 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
>  }
>
>  static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool *carrier)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
>
>      ovs_mutex_lock(&dev->mutex);
>
> -    if (is_vhost_running(virtio_dev)) {
> +    if (is_vhost_cuse_running(virtio_dev)) {
>          *carrier = 1;
>      } else {
>          *carrier = 0;
> @@ -1997,7 +2244,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>          return 0;
>      }
>
> -    if (dev->type == DPDK_DEV_ETH) {
> +    if (dev->type != DPDK_DEV_VHOST_CUSE) {
>          if (dev->flags & NETDEV_UP) {
>              err = rte_eth_dev_start(dev->port_id);
>              if (err)
> @@ -2098,7 +2345,7 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
>
>      if (argc > 2) {
>          struct netdev *netdev = netdev_from_name(argv[1]);
> -        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {
>              struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
>
>              ovs_mutex_lock(&dpdk_dev->mutex);
> @@ -2142,75 +2389,7 @@ set_irq_status(struct virtio_net *virtio_dev)
>  }
>
>  /*
> - * Fixes mapping for vhost-user tx queues. Must be called after each
> - * enabling/disabling of queues and real_n_txq modifications.
> - */
> -static void
> -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int *enabled_queues, n_enabled = 0;
> -    int i, k, total_txqs = dev->real_n_txq;
> -
> -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
> -
> -    for (i = 0; i < total_txqs; i++) {
> -        /* Enabled queues always mapped to themselves. */
> -        if (dev->tx_q[i].map == i) {
> -            enabled_queues[n_enabled++] = i;
> -        }
> -    }
> -
> -    if (n_enabled == 0 && total_txqs != 0) {
> -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> -        n_enabled = 1;
> -    }
> -
> -    k = 0;
> -    for (i = 0; i < total_txqs; i++) {
> -        if (dev->tx_q[i].map != i) {
> -            dev->tx_q[i].map = enabled_queues[k];
> -            k = (k + 1) % n_enabled;
> -        }
> -    }
> -
> -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
> -    for (i = 0; i < total_txqs; i++) {
> -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
> -    }
> -
> -    rte_free(enabled_queues);
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    uint32_t qp_num;
> -
> -    qp_num = virtio_dev->virt_qp_nb;
> -    if (qp_num > dev->up.n_rxq) {
> -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
> -                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev->device_fh,
> -                 qp_num, dev->up.n_rxq);
> -        return -1;
> -    }
> -
> -    dev->real_n_rxq = qp_num;
> -    dev->real_n_txq = qp_num;
> -    dev->txq_needs_locking = true;
> -    /* Enable TX queue 0 by default if it wasn't disabled. */
> -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> -        dev->tx_q[0].map = 0;
> -    }
> -
> -    netdev_dpdk_remap_txqs(dev);
> -
> -    return 0;
> -}
> -
> -/*
> - * A new virtio-net device is added to a vhost port.
> + * A new virtio-net device is added to a vhost cuse port.
> */ > static int > new_device(struct virtio_net *virtio_dev) > @@ -2223,11 +2402,6 @@ new_device(struct virtio_net *virtio_dev) > LIST_FOR_EACH(dev, list_node, &dpdk_list) { > if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) > == 0) { > ovs_mutex_lock(&dev->mutex); > - if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) { > - ovs_mutex_unlock(&dev->mutex); > - ovs_mutex_unlock(&dpdk_mutex); > - return -1; > - } > ovsrcu_set(&dev->virtio_dev, virtio_dev); > exists = true; > virtio_dev->flags |= VIRTIO_DEV_RUNNING; > @@ -2251,23 +2425,11 @@ new_device(struct virtio_net *virtio_dev) > return 0; > } > > -/* Clears mapping for all available queues of vhost interface. */ > -static void > -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev) > - OVS_REQUIRES(dev->mutex) > -{ > - int i; > - > - for (i = 0; i < dev->real_n_txq; i++) { > - dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN; > - } > -} > - > /* > - * Remove a virtio-net device from the specific vhost port. Use > dev->remove > - * flag to stop any more packets from being sent or received to/from > a VM and > - * ensure all currently queued packets have been sent/received > before removing > - * the device. > + * Remove a virtio-net device from the specific vhost cuse port. Use > + * dev->remove flag to stop any more packets from being sent or > received > + * to/from a VM and ensure all currently queued packets have been > sent/received > + * before removing the device. > */ > static void > destroy_device(volatile struct virtio_net *virtio_dev) > @@ -2282,7 +2444,6 @@ destroy_device(volatile struct virtio_net > *virtio_dev) > ovs_mutex_lock(&dev->mutex); > virtio_dev->flags &= ~VIRTIO_DEV_RUNNING; > ovsrcu_set(&dev->virtio_dev, NULL); > - netdev_dpdk_txq_map_clear(dev); > exists = true; > ovs_mutex_unlock(&dev->mutex); > break; > @@ -2310,49 +2471,6 @@ destroy_device(volatile struct virtio_net > *virtio_dev) > } > } > > -static int > -vring_state_changed(struct virtio_net *virtio_dev, uint16_t > queue_id, > - int enable) > -{ > - struct netdev_dpdk *dev; > - bool exists = false; > - int qid = queue_id / VIRTIO_QNUM; > - > - if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) { > - return 0; > - } > - > - ovs_mutex_lock(&dpdk_mutex); > - LIST_FOR_EACH (dev, list_node, &dpdk_list) { > - if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) > == 0) { > - ovs_mutex_lock(&dev->mutex); > - if (enable) { > - dev->tx_q[qid].map = qid; > - } else { > - dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED; > - } > - netdev_dpdk_remap_txqs(dev); > - exists = true; > - ovs_mutex_unlock(&dev->mutex); > - break; > - } > - } > - ovs_mutex_unlock(&dpdk_mutex); > - > - if (exists) { > - VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device > '%s' %" > - PRIu64" changed to \'%s\'", queue_id, qid, > - virtio_dev->ifname, virtio_dev->device_fh, > - (enable == 1) ? "enabled" : "disabled"); > - } else { > - VLOG_INFO("vHost Device '%s' %"PRIu64" not found", > virtio_dev->ifname, > - virtio_dev->device_fh); > - return -1; > - } > - > - return 0; > -} > - > struct virtio_net * > netdev_dpdk_get_virtio(const struct netdev_dpdk *dev) > { > @@ -2360,18 +2478,18 @@ netdev_dpdk_get_virtio(const struct > netdev_dpdk *dev) > } > > /* > - * These callbacks allow virtio-net devices to be added to vhost > ports when > - * configuration has been fully complete. > + * These callbacks allow virtio-net devices to be added to vhost > cuse ports > + * when configuration has been fully complete. 
> */ > static const struct virtio_net_device_ops virtio_net_device_ops = > { > .new_device = new_device, > .destroy_device = destroy_device, > - .vring_state_changed = vring_state_changed > }; > > +#ifdef VHOST_CUSE > static void * > -start_vhost_loop(void *dummy OVS_UNUSED) > +start_vhost_cuse_loop(void *dummy OVS_UNUSED) > { > pthread_detach(pthread_self()); > /* Put the cuse thread into quiescent state. */ > @@ -2379,19 +2497,27 @@ start_vhost_loop(void *dummy OVS_UNUSED) > rte_vhost_driver_session_start(); > return NULL; > } > +#endif > > static int > dpdk_vhost_class_init(void) > { > +#ifdef VHOST_CUSE > rte_vhost_driver_callback_register(&virtio_net_device_ops); > rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4 > | 1ULL << VIRTIO_NET_F_HOST_TSO6 > | 1ULL << VIRTIO_NET_F_CSUM); > > - ovs_thread_create("vhost_thread", start_vhost_loop, NULL); > + ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, > NULL); > +#else > + rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4 > + | 1ULL << VIRTIO_NET_F_HOST_TSO6 > + | 1ULL << VIRTIO_NET_F_CSUM); > +#endif > return 0; > } > > + > static int > dpdk_vhost_cuse_class_init(void) > { > @@ -2511,7 +2637,17 @@ netdev_dpdk_ring_send(struct netdev *netdev, > int qid, > dp_packet_rss_invalidate(pkts[i]); > } > > + if (OVS_UNLIKELY(dev->txq_needs_locking)) { > + qid = qid % dev->real_n_txq; > + rte_spinlock_lock(&dev->tx_q[qid].tx_lock); > + } > + > netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal); > + > + if (OVS_UNLIKELY(dev->txq_needs_locking)) { > + rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); > + } > + > return 0; > } > > @@ -3276,28 +3412,28 @@ static const struct netdev_class OVS_UNUSED > dpdk_vhost_cuse_class = > "dpdkvhostcuse", > dpdk_vhost_cuse_class_init, > netdev_dpdk_vhost_cuse_construct, > - netdev_dpdk_vhost_destruct, > + netdev_dpdk_vhost_cuse_destruct, > netdev_dpdk_vhost_cuse_set_multiq, > - netdev_dpdk_vhost_send, > - netdev_dpdk_vhost_get_carrier, > - netdev_dpdk_vhost_get_stats, > + netdev_dpdk_vhost_cuse_send, > + netdev_dpdk_vhost_cuse_get_carrier, > + netdev_dpdk_vhost_cuse_get_stats, > NULL, > NULL, > - netdev_dpdk_vhost_rxq_recv); > + netdev_dpdk_vhost_cuse_rxq_recv); > > static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class = > NETDEV_DPDK_CLASS( > "dpdkvhostuser", > dpdk_vhost_user_class_init, > netdev_dpdk_vhost_user_construct, > - netdev_dpdk_vhost_destruct, > - netdev_dpdk_vhost_set_multiq, > - netdev_dpdk_vhost_send, > - netdev_dpdk_vhost_get_carrier, > - netdev_dpdk_vhost_get_stats, > + netdev_dpdk_destruct, > + netdev_dpdk_vhost_user_set_multiq, > + netdev_dpdk_vhost_user_send, > + netdev_dpdk_get_carrier, > + netdev_dpdk_get_stats, > NULL, > NULL, > - netdev_dpdk_vhost_rxq_recv); > + netdev_dpdk_vhost_user_rxq_recv); > > void > netdev_dpdk_register(void) > -- > 2.4.3 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev