Hi,
First of all, thanks for the patch.
I found some issues that I would like to discuss while I continue
reviewing the patch. I haven't tested it yet.
See my comments inline.
Thanks!
fbl
On Tue, May 19, 2020 at 05:19:12PM +0530, Sivaprasad Tummala wrote:
> The vHost PMD brings vHost User port types ('dpdkvhostuser' and
> 'dpdkvhostuserclient') under control of DPDK's librte_ether API, like
> other DPDK netdev types ('dpdk'). In doing so, direct
> calls to DPDK's librte_vhost library are removed and replaced with
> librte_ether API calls, for which most of the infrastructure is already
> in place.
>
> To enable TSO, specific changes were required in the vhost PMD. The
> patch which enables these is available on dpdk-master and here:
> https://patches.dpdk.org/patch/66052/
>
> Signed-off-by: Ciara Loftus <[email protected]>
> Signed-off-by: Sivaprasad Tummala <[email protected]>
>
> Tested-by: Sunil Pai G <[email protected]>
> ---
> Documentation/topics/dpdk/vhost-user.rst | 3 +
> NEWS | 3 +
> acinclude.m4 | 4 +
> include/openvswitch/netdev.h | 1 +
> lib/dpdk.c | 11 +
> lib/dpdk.h | 2 +
> lib/netdev-dpdk.c | 1384 ++++++++--------------
> 7 files changed, 535 insertions(+), 873 deletions(-)
>
> diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst
> index c6c6fd8bd..644598f79 100644
> --- a/Documentation/topics/dpdk/vhost-user.rst
> +++ b/Documentation/topics/dpdk/vhost-user.rst
> @@ -34,6 +34,9 @@ User, refer to the `QEMU documentation`_ on same.
> To use any DPDK-backed interface, you must ensure your bridge is configured
> correctly. For more information, refer to :doc:`bridge`.
>
> + Maximum number of vHost ports should be less than RTE_MAX_ETHPORTS as
> + defined in the DPDK configuration.
> +
This needs more clarification, saying that the vhost-user ports are now
counted against the limit and that the maximum is RTE_MAX_ETHPORTS.
This also misses the name restriction change disallowing commas.
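Something along these lines, perhaps (wording is only a suggestion):

    Note that vhost-user ports are now backed by DPDK ethdev ports and
    therefore count towards RTE_MAX_ETHPORTS, as defined in the DPDK
    configuration. Also, a vhost-user port name must not contain commas,
    since the name is embedded in the devargs string passed to the
    vhost PMD.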
> Quick Example
> -------------
>
> diff --git a/NEWS b/NEWS
> index 3dbd8ec0e..0071916fb 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -9,6 +9,9 @@ Post-v2.13.0
> - DPDK:
> * Deprecated DPDK pdump packet capture support removed.
> * Deprecated DPDK ring ports (dpdkr) are no longer supported.
> + * Use DPDK's vHost PMD instead of direct library calls. This means the
> + maximum number of vHost ports is equal to RTE_MAX_ETHPORTS as defined
> + in the DPDK configuration.
Same here.
> - Linux datapath:
> * Support for kernel versions up to 5.5.x.
> - AF_XDP:
> diff --git a/acinclude.m4 b/acinclude.m4
> index dabbffd01..80d09f61c 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -367,6 +367,10 @@ AC_DEFUN([OVS_CHECK_DPDK], [
> AC_DEFINE([VHOST_NUMA], [1], [NUMA Aware vHost support detected in DPDK.])
> ], [], [[#include <rte_config.h>]])
>
> + AC_CHECK_DECL([RTE_LIBRTE_PMD_VHOST], [], [
> + AC_MSG_ERROR([RTE_LIBRTE_PMD_VHOST is not defined in rte_config.h])
> + ], [[#include <rte_config.h>]])
> +
> AC_CHECK_DECL([RTE_LIBRTE_MLX5_PMD], [dnl found
> OVS_FIND_DEPENDENCY([mnl_attr_put], [mnl], [libmnl])
> AC_CHECK_DECL([RTE_IBVERBS_LINK_DLOPEN], [], [dnl not found
> diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h
> index 0c10f7b48..09027e15d 100644
> --- a/include/openvswitch/netdev.h
> +++ b/include/openvswitch/netdev.h
> @@ -84,6 +84,7 @@ struct netdev_stats {
> uint64_t tx_broadcast_packets;
>
> uint64_t rx_undersized_errors;
> + uint64_t rx_undersize_packets;
This doesn't seem to be required for this patchset, correct? I know
it will improve things, but it could be done in a separate patch
to reduce this patch's size and complexity.
> uint64_t rx_oversize_errors;
> uint64_t rx_fragmented_errors;
> uint64_t rx_jabber_errors;
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 31450d470..202965b2a 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -23,12 +23,14 @@
> #include <getopt.h>
>
> #include <rte_errno.h>
> +#include <rte_ethdev.h>
Why is this needed?
> #include <rte_log.h>
> #include <rte_memzone.h>
> #include <rte_version.h>
>
> #include "dirs.h"
> #include "fatal-signal.h"
> +#include "id-pool.h"
> #include "netdev-dpdk.h"
> #include "netdev-offload-provider.h"
> #include "openvswitch/dynamic-string.h"
> @@ -50,6 +52,7 @@ static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY
> static bool dpdk_initialized = false; /* Indicates successful initialization
> * of DPDK. */
> static bool per_port_memory = false; /* Status of per port memory support */
> +static struct id_pool *vhost_driver_ids; /* Pool of IDs for vHost PMDs. */
>
> static int
> process_vhost_flags(char *flag, const char *default_val, int size,
> @@ -428,6 +431,8 @@ dpdk_init__(const struct smap *ovs_other_config)
> /* We are called from the main thread here */
> RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>
> + vhost_driver_ids = id_pool_create(0, RTE_MAX_ETHPORTS);
> +
> /* Finally, register the dpdk classes */
> netdev_dpdk_register();
> netdev_register_flow_api_provider(&netdev_offload_dpdk);
> @@ -499,6 +504,12 @@ dpdk_available(void)
> return dpdk_initialized;
> }
>
> +struct id_pool *
> +dpdk_get_vhost_id_pool(void)
> +{
> + return vhost_driver_ids;
> +}
> +
> void
> dpdk_set_lcore_id(unsigned cpu)
> {
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..2dea08f84 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -44,4 +44,6 @@ bool dpdk_per_port_memory(void);
> bool dpdk_available(void);
> void print_dpdk_version(void);
> void dpdk_status(const struct ovsrec_open_vswitch *);
> +struct id_pool *dpdk_get_vhost_id_pool(void);
> +
> #endif /* dpdk.h */
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 9d2b4104c..e02929174 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -30,6 +30,7 @@
> #include <rte_config.h>
> #include <rte_cycles.h>
> #include <rte_errno.h>
> +#include <rte_eth_vhost.h>
> #include <rte_ethdev.h>
> #include <rte_flow.h>
> #include <rte_malloc.h>
> @@ -47,6 +48,7 @@
> #include "dpif-netdev.h"
> #include "fatal-signal.h"
> #include "if-notifier.h"
> +#include "id-pool.h"
> #include "netdev-provider.h"
> #include "netdev-vport.h"
> #include "odp-util.h"
> @@ -70,6 +72,7 @@
> #include "uuid.h"
>
> enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
> +enum {VHOST_SERVER_MODE, VHOST_CLIENT_MODE};
Why do we need this since we have dev->vhost_driver_flags?
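E.g. the mode could be derived from the existing flags, something like
(untested):

    /* Client mode iff RTE_VHOST_USER_CLIENT is already set in the flags. */
    err = dpdk_attach_vhost_pmd(dev,
              (dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) != 0);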
> VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
> static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -125,7 +128,7 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
> /* Maximum size of Physical NIC Queues */
> #define NIC_PORT_MAX_Q_SIZE 4096
>
> -#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
> +#define OVS_VHOST_MAX_QUEUE_NUM RTE_MAX_QUEUES_PER_PORT /* Max vHost TXQs. */
Since we want to use a PMD with the eth API, I'd suggest removing
that define and using RTE_MAX_QUEUES_PER_PORT directly to reduce the
code differences.
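E.g. in dpdk_eth_dev_init(), simply:

    if (dev->type == DPDK_DEV_VHOST) {
        /* Provision the PMD maximum up front; the virtio frontend
         * decides later how many queues are actually used. */
        n_rxq = RTE_MAX_QUEUES_PER_PORT;
        n_txq = RTE_MAX_QUEUES_PER_PORT;
    }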
> #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
> #define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
> * yet mapped to another queue. */
> @@ -172,27 +175,6 @@ static const struct rte_eth_conf port_conf = {
> },
> };
>
> -/*
> - * These callbacks allow virtio-net devices to be added to vhost ports when
> - * configuration has been fully completed.
> - */
> -static int new_device(int vid);
> -static void destroy_device(int vid);
> -static int vring_state_changed(int vid, uint16_t queue_id, int enable);
> -static void destroy_connection(int vid);
> -static void vhost_guest_notified(int vid);
> -
> -static const struct vhost_device_ops virtio_net_device_ops =
> -{
> - .new_device = new_device,
> - .destroy_device = destroy_device,
> - .vring_state_changed = vring_state_changed,
> - .features_changed = NULL,
> - .new_connection = NULL,
> - .destroy_connection = destroy_connection,
> - .guest_notified = vhost_guest_notified,
> -};
> -
This reverts commit 3d56e4ac445d17e69484a95b319ac578e3580b65 from
Eelco, and it is not noted anywhere.
> /* Custom software stats for dpdk ports */
> struct netdev_dpdk_sw_stats {
> /* No. of retries when unable to transmit. */
> @@ -444,6 +426,8 @@ struct netdev_dpdk {
> };
> struct dpdk_tx_queue *tx_q;
> struct rte_eth_link link;
> + /* ID of vhost user port given to the PMD driver. */
> + int32_t vhost_pmd_id;
> );
>
> PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
> @@ -540,7 +524,12 @@ static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
> struct netdev_custom_stats *);
> static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
>
> -int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
> +static int link_status_changed_callback(dpdk_port_t port_id,
> + enum rte_eth_event_type type, void *param, void *ret_param);
> +static int vring_state_changed_callback(dpdk_port_t port_id,
> + enum rte_eth_event_type type, void *param, void *ret_param);
> +static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);
> +static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);
>
> struct ingress_policer *
> netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
> @@ -1008,20 +997,22 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
> break;
> }
>
> - diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
> - if (diag) {
> - /* A device may not support rte_eth_dev_set_mtu, in this case
> - * flag a warning to the user and include the devices configured
> - * MTU value that will be used instead. */
> - if (-ENOTSUP == diag) {
> - rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
> - VLOG_WARN("Interface %s does not support MTU configuration, "
> - "max packet size supported is %"PRIu16".",
> - dev->up.name, conf_mtu);
> - } else {
> - VLOG_ERR("Interface %s MTU (%d) setup error: %s",
> - dev->up.name, dev->mtu, rte_strerror(-diag));
> - break;
> + if (dev->type == DPDK_DEV_ETH) {
> + diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
> + if (diag) {
> + /* A device may not support rte_eth_dev_set_mtu, in this
> + * case flag a warning to the user and include the devices
> + * configured MTU value that will be used instead. */
> + if (-ENOTSUP == diag) {
> + rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
> +                VLOG_WARN("Interface %s does not support MTU configuration"
> + ", max packet size supported is %"PRIu16".",
> + dev->up.name, conf_mtu);
> + } else {
> + VLOG_ERR("Interface %s MTU (%d) setup error: %s",
> + dev->up.name, dev->mtu, rte_strerror(-diag));
> + break;
> + }
> }
> }
>
> @@ -1058,8 +1049,13 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
> continue;
> }
>
> - dev->up.n_rxq = n_rxq;
> - dev->up.n_txq = n_txq;
> + /* Only set n_*xq for physical devices. vHost User devices will set
> + * this value correctly using info from the virtio frontend.
> + */
> + if (dev->type == DPDK_DEV_ETH) {
> + dev->up.n_rxq = n_rxq;
> + dev->up.n_txq = n_txq;
> + }
>
> return 0;
> }
> @@ -1133,8 +1129,17 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> }
> }
>
> - n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> - n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> + if (dev->type == DPDK_DEV_VHOST) {
> + /* We don't know how many queues the Virtio frontend will use, so we
> + * need to provision for the maximum, as if we configure less up front
> + * than what Virtio configures later, those surplus queues will never
> + * be available to the vHost backend. */
> + n_rxq = OVS_VHOST_MAX_QUEUE_NUM;
> + n_txq = OVS_VHOST_MAX_QUEUE_NUM;
> + } else {
> + n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> + n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> + }
>
> diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
> if (diag) {
> @@ -1229,7 +1234,7 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no,
> dev->requested_mtu = RTE_ETHER_MTU;
> dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
> dev->requested_lsc_interrupt_mode = 0;
> - ovsrcu_index_init(&dev->vid, -1);
> + dev->vhost_pmd_id = -1;
> dev->vhost_reconfigured = false;
> dev->attached = false;
> dev->started = false;
> @@ -1293,27 +1298,83 @@ netdev_dpdk_get_num_ports(struct rte_device *device)
> }
>
> static int
> -vhost_common_construct(struct netdev *netdev)
> - OVS_REQUIRES(dpdk_mutex)
> +dpdk_attach_vhost_pmd(struct netdev_dpdk *dev, int mode)
> {
> - int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
> - struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + char *devargs;
> + int err = 0;
> + dpdk_port_t port_no = 0;
> + uint32_t driver_id = 0;
> + int iommu_enabled = 0;
> + int zc_enabled = 0;
> + int postcopy_enabled = 0;
> + int linear_buf = 1;
> + int ext_buf = 0;
> + int tso = 0;
> + char *driver_name;
>
> - dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
> -                                              sizeof *dev->vhost_rxq_enabled);
> - if (!dev->vhost_rxq_enabled) {
> - return ENOMEM;
> + atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
> +
> + if (dev->vhost_driver_flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY) {
> + zc_enabled = 1;
> + /*
> + * DPDK vHost library doesn't allow zero-copy with linear buffers
> + * currently. Hence disabling the Linear buffer check until the
> + * issue is fixed in DPDK.
> + */
> + linear_buf = 0;
> + VLOG_WARN("Zero copy enabled and hence disabling linear buffer"
> + " check for vHost port %s", dev->up.name);
Please see Maxime's comment in another reply.
> }
> - dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
> - if (!dev->tx_q) {
> - rte_free(dev->vhost_rxq_enabled);
> - return ENOMEM;
> +
> + if (dpdk_vhost_postcopy_enabled()) {
> + postcopy_enabled = 1;
> }
>
> - atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
> + if (dpdk_vhost_iommu_enabled()) {
> + iommu_enabled = 1;
> + }
> +
> + if (userspace_tso_enabled()) {
> + ext_buf = 1;
> + tso = 1;
> + }
> +
> + if (id_pool_alloc_id(dpdk_get_vhost_id_pool(), &driver_id)) {
> + driver_name = xasprintf("net_vhost%u", driver_id);
> + devargs = xasprintf("%s,iface=%s,queues=%i,client=%i,"
> + "dequeue-zero-copy=%i,postcopy-support=%i,"
> + "iommu-support=%i,tso=%i,"
> + "linear-buffer=%i,ext-buffer=%i",
> + driver_name, dev->vhost_id, OVS_VHOST_MAX_QUEUE_NUM, mode,
> + zc_enabled, postcopy_enabled, iommu_enabled, tso,
> + linear_buf, ext_buf);
> + if (!rte_dev_probe(devargs) &&
> + !rte_eth_dev_get_port_by_name(driver_name, &port_no)) {
> + dev->attached = true;
> + dev->port_id = port_no;
> + dev->vhost_pmd_id = driver_id;
> +
> + rte_eth_dev_callback_register(dev->port_id,
> + RTE_ETH_EVENT_QUEUE_STATE,
> + vring_state_changed_callback,
> + NULL);
> + rte_eth_dev_callback_register(dev->port_id,
> + RTE_ETH_EVENT_INTR_LSC,
> + link_status_changed_callback,
> + NULL);
> + } else {
> + id_pool_free_id(dpdk_get_vhost_id_pool(), driver_id);
> + VLOG_ERR("Failed to attach vhost-user device %s to DPDK",
> + dev->vhost_id);
> + err = ENODEV;
> + }
> + } else {
> + VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user "
> + "devices registered with PMD", dev->vhost_id);
> + err = ENODEV;
> + }
>
> - return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
> - DPDK_DEV_VHOST, socket_id);
> + return err;
> }
>
> static int
> @@ -1322,11 +1383,12 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> const char *name = netdev->name;
> int err;
> + struct rte_eth_dev_info dev_info;
>
> /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
> * the file system. '/' or '\' would traverse directories, so they're not
> * acceptable in 'name'. */
> - if (strchr(name, '/') || strchr(name, '\\')) {
> + if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
> VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
> "A valid name must not include '/' or '\\'",
> name);
> @@ -1341,50 +1403,32 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>
> dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
>
> - /* There is no support for multi-segments buffers. */
> - dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
> - err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
> - if (err) {
> - VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> - dev->vhost_id);
> - goto out;
> - } else {
> + err = dpdk_attach_vhost_pmd(dev, VHOST_SERVER_MODE);
> + if (!err) {
> fatal_signal_add_file_to_unlink(dev->vhost_id);
> VLOG_INFO("Socket %s created for vhost-user port %s\n",
> dev->vhost_id, name);
> - }
> -
> - err = rte_vhost_driver_callback_register(dev->vhost_id,
> - &virtio_net_device_ops);
> - if (err) {
> - VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
> - "port: %s\n", name);
> + } else {
> goto out;
> }
>
> - if (!userspace_tso_enabled()) {
> - err = rte_vhost_driver_disable_features(dev->vhost_id,
> - 1ULL << VIRTIO_NET_F_HOST_TSO4
> - | 1ULL << VIRTIO_NET_F_HOST_TSO6
> - | 1ULL << VIRTIO_NET_F_CSUM);
> - if (err) {
> -            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
> - "port: %s\n", name);
> - goto out;
> - }
> - }
> -
> - err = rte_vhost_driver_start(dev->vhost_id);
> - if (err) {
> - VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> - "port: %s\n", name);
> - goto out;
> + dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
> +                                              sizeof *dev->vhost_rxq_enabled);
> + if (!dev->vhost_rxq_enabled) {
> + return ENOMEM;
> }
>
> - err = vhost_common_construct(netdev);
> + err = common_construct(&dev->up, dev->port_id, DPDK_DEV_VHOST,
> + rte_lcore_to_socket_id(rte_get_master_lcore()));
> if (err) {
> - VLOG_ERR("vhost_common_construct failed for vhost user "
> - "port: %s\n", name);
> + VLOG_ERR("common_construct failed for vhost user port: %s\n", name);
> + rte_eth_dev_info_get(dev->port_id, &dev_info);
> + if (!dev_info.device || rte_dev_remove(dev_info.device)) {
> + VLOG_ERR("Error detatching device\n");
> + }
> + if (dev->vhost_pmd_id >= 0) {
> + id_pool_free_id(dpdk_get_vhost_id_pool(), dev->vhost_pmd_id);
> + }
> }
>
> out:
> @@ -1402,12 +1446,14 @@ out:
> static int
> netdev_dpdk_vhost_client_construct(struct netdev *netdev)
> {
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> int err;
>
> ovs_mutex_lock(&dpdk_mutex);
> - err = vhost_common_construct(netdev);
> + err = common_construct(&dev->up, DPDK_ETH_PORT_ID_INVALID, DPDK_DEV_VHOST,
> + rte_lcore_to_socket_id(rte_get_master_lcore()));
> if (err) {
> - VLOG_ERR("vhost_common_construct failed for vhost user client"
> + VLOG_ERR("common_construct failed for vhost user client"
> "port: %s\n", netdev->name);
> }
> ovs_mutex_unlock(&dpdk_mutex);
> @@ -1442,110 +1488,114 @@ common_destruct(struct netdev_dpdk *dev)
> }
>
> static void
> -netdev_dpdk_destruct(struct netdev *netdev)
> +dpdk_dev_remove_helper(struct netdev_dpdk *dev)
> {
> - struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> struct rte_device *rte_dev;
> struct rte_eth_dev *eth_dev;
> bool remove_on_close;
>
> - ovs_mutex_lock(&dpdk_mutex);
> + /* Retrieve eth device data before closing it.
> + * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
> + */
> + eth_dev = &rte_eth_devices[dev->port_id];
>
> - rte_eth_dev_stop(dev->port_id);
> - dev->started = false;
> + remove_on_close =
> + eth_dev->data &&
> + (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
> + rte_dev = eth_dev->device;
>
> - if (dev->attached) {
> - /* Retrieve eth device data before closing it.
> - * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
> - */
> - eth_dev = &rte_eth_devices[dev->port_id];
> - remove_on_close =
> - eth_dev->data &&
> - (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
> - rte_dev = eth_dev->device;
> -
> - /* Remove the eth device. */
> - rte_eth_dev_close(dev->port_id);
> -
> - /* Remove this rte device and all its eth devices if flag
> - * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
> - * are not supported), or if all the eth devices belonging to the rte
> - * device are closed.
> - */
> - if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
> - int ret = rte_dev_remove(rte_dev);
> + /* Remove the eth device. */
> + rte_eth_dev_close(dev->port_id);
>
> - if (ret < 0) {
> - VLOG_ERR("Device '%s' can not be detached: %s.",
> - dev->devargs, rte_strerror(-ret));
> - } else {
> - /* Device was closed and detached. */
> - VLOG_INFO("Device '%s' has been removed and detached",
> - dev->devargs);
> - }
> + /* Remove this rte device and all its eth devices if flag
> + * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
> + * are not supported), or if all the eth devices belonging to the rte
> + * device are closed.
> + */
> + if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
> + int ret = rte_dev_remove(rte_dev);
> +
> + if (ret < 0) {
> + VLOG_ERR("Device '%s' can not be detached: %s.",
> + dev->devargs, rte_strerror(-ret));
> } else {
> - /* Device was only closed. rte_dev_remove() was not called. */
> - VLOG_INFO("Device '%s' has been removed", dev->devargs);
> + /* Device was closed and detached. */
> + VLOG_INFO("Device '%s' has been removed and detached",
> + dev->devargs);
> }
> + } else {
> + /* Device was only closed. rte_dev_remove() was not called. */
> + VLOG_INFO("Device '%s' has been removed", dev->devargs);
> }
>
> - netdev_dpdk_clear_xstats(dev);
> - free(dev->devargs);
> - common_destruct(dev);
> -
> - ovs_mutex_unlock(&dpdk_mutex);
> }
>
> -/* rte_vhost_driver_unregister() can call back destroy_device(), which will
> - * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
> - * deadlock, none of the mutexes must be held while calling this function. */
> -static int
> -dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
> - char *vhost_id)
> - OVS_EXCLUDED(dpdk_mutex)
> - OVS_EXCLUDED(dev->mutex)
> +static void
> +netdev_dpdk_destruct(struct netdev *netdev)
> {
> - return rte_vhost_driver_unregister(vhost_id);
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> + rte_eth_dev_stop(dev->port_id);
> + dev->started = false;
> +
> + if (dev->attached) {
> + dpdk_dev_remove_helper(dev);
> + }
> +
> + netdev_dpdk_clear_xstats(dev);
> + rte_free(dev->tx_q);
> + dpdk_mp_put(dev->dpdk_mp);
> + ovs_list_remove(&dev->list_node);
> + free(ovsrcu_get_protected(struct ingress_policer *,
> + &dev->ingress_policer));
> + ovs_mutex_destroy(&dev->mutex);
> }
>
> static void
> netdev_dpdk_vhost_destruct(struct netdev *netdev)
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> - char *vhost_id;
>
> ovs_mutex_lock(&dpdk_mutex);
>
> /* Guest becomes an orphan if still attached. */
> - if (netdev_dpdk_get_vid(dev) >= 0
> - && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> + check_link_status(dev);
> + if (dev->link.link_status == ETH_LINK_UP) {
> VLOG_ERR("Removing port '%s' while vhost device still attached.",
> netdev->name);
> VLOG_ERR("To restore connectivity after re-adding of port, VM on "
> "socket '%s' must be restarted.", dev->vhost_id);
> }
>
> - vhost_id = dev->vhost_id;
> - dev->vhost_id = NULL;
> - rte_free(dev->vhost_rxq_enabled);
> + rte_eth_dev_stop(dev->port_id);
> + dev->started = false;
>
> - common_destruct(dev);
> + rte_eth_dev_callback_unregister(dev->port_id,
> + RTE_ETH_EVENT_QUEUE_STATE,
> + vring_state_changed_callback, NULL);
> + rte_eth_dev_callback_unregister(dev->port_id,
> + RTE_ETH_EVENT_INTR_LSC,
> + link_status_changed_callback, NULL);
>
> - ovs_mutex_unlock(&dpdk_mutex);
> + if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> + /* OVS server mode - remove this socket from list for deletion. */
> + fatal_signal_remove_file_to_unlink(dev->vhost_id);
> + }
>
> - if (!vhost_id) {
> - goto out;
> + if (dev->attached) {
> + dpdk_dev_remove_helper(dev);
> }
>
> - if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
> - VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
> - netdev->name, vhost_id);
> - } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> - /* OVS server mode - remove this socket from list for deletion */
> - fatal_signal_remove_file_to_unlink(vhost_id);
> + if (dev->vhost_pmd_id >= 0) {
> + id_pool_free_id(dpdk_get_vhost_id_pool(),
> + dev->vhost_pmd_id);
> }
> -out:
> - free(vhost_id);
> +
> + rte_free(dev->vhost_rxq_enabled);
> +
> + common_destruct(dev);
> +
> + ovs_mutex_unlock(&dpdk_mutex);
> }
>
> static void
> @@ -2192,12 +2242,15 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> * Returns the number of packets that weren't transmitted. */
> static inline int
> netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> - struct rte_mbuf **pkts, int cnt)
> + struct rte_mbuf **pkts, int cnt, bool vhost)
> {
> uint32_t nb_tx = 0;
> uint16_t nb_tx_prep = cnt;
> + int retries = 0;
>
> - if (userspace_tso_enabled()) {
> + if (vhost) {
> + atomic_read_relaxed(&dev->vhost_tx_retries_max, &retries);
> + } else if (userspace_tso_enabled()) {
This is ignoring TSO support for vhost-user, which is its main use
case today.
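With TSO enabled I'd expect both branches to apply, something like
(untested):

    if (vhost) {
        atomic_read_relaxed(&dev->vhost_tx_retries_max, &retries);
    }

    if (userspace_tso_enabled()) {
        /* Keep the existing invalid-packet handling from the patch here;
         * the point is only that it must not be skipped for vhost. */
        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
    }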
> nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> if (nb_tx_prep != cnt) {
> VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> @@ -2216,6 +2269,10 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> }
>
> nb_tx += ret;
> +
> + if (OVS_UNLIKELY(vhost && (--retries < 0))) {
> + break;
> + }
> }
>
> if (OVS_UNLIKELY(nb_tx != cnt)) {
> @@ -2287,12 +2344,6 @@ ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
> return cnt;
> }
>
> -static bool
> -is_vhost_running(struct netdev_dpdk *dev)
> -{
> - return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
> -}
> -
> static inline void
> netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
> unsigned int packet_size)
> @@ -2359,72 +2410,9 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
> }
> }
>
> -/*
> - * The receive path for the vhost port is the TX path out from guest.
> - */
> -static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> - struct dp_packet_batch *batch, int *qfill)
> -{
> - struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> - struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
> - uint16_t nb_rx = 0;
> - uint16_t qos_drops = 0;
> - int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
> - int vid = netdev_dpdk_get_vid(dev);
> -
> - if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
> - || !(dev->flags & NETDEV_UP))) {
> - return EAGAIN;
> - }
> -
> - nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
> - (struct rte_mbuf **) batch->packets,
> - NETDEV_MAX_BURST);
> - if (!nb_rx) {
> - return EAGAIN;
> - }
> -
> - if (qfill) {
> - if (nb_rx == NETDEV_MAX_BURST) {
> -        /* The DPDK API returns a uint32_t which often has invalid bits in
> - * the upper 16-bits. Need to restrict the value to uint16_t. */
> - *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
> - } else {
> - *qfill = 0;
> - }
> - }
> -
> - if (policer) {
> - qos_drops = nb_rx;
> - nb_rx = ingress_policer_run(policer,
> - (struct rte_mbuf **) batch->packets,
> - nb_rx, true);
> - qos_drops -= nb_rx;
> - }
> -
> - rte_spinlock_lock(&dev->stats_lock);
> - netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
> - nb_rx, qos_drops);
This removes the only caller of netdev_dpdk_vhost_update_rx_counters
but leaves the function behind.
> - rte_spinlock_unlock(&dev->stats_lock);
> -
> - batch->count = nb_rx;
> - dp_packet_batch_init_packet_fields(batch);
> -
> - return 0;
> -}
> -
> -static bool
> -netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
> -{
> - struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> -
> - return dev->vhost_rxq_enabled[rxq->queue_id];
> -}
> -
> static int
> -netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> - int *qfill)
> +common_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> + int *qfill)
> {
> struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
> struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> @@ -2473,6 +2461,38 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> return 0;
> }
>
> +/*
> + * The receive path for the vhost port is the TX path out from guest.
> + */
> +static int
> +netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> + struct dp_packet_batch *batch,
> + int *qfill)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> + if (dev->vhost_reconfigured) {
> + return common_recv(rxq, batch, qfill);
> + }
> +
> + return EAGAIN;
> +}
> +
> +static bool
> +netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> + return dev->vhost_rxq_enabled[rxq->queue_id];
> +}
> +
> +static int
> +netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> + int *qfill)
> +{
> + return common_recv(rxq, batch, qfill);
> +}
> +
> static inline int
> netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> int cnt, bool should_steal)
> @@ -2517,295 +2537,66 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> return cnt;
> }
>
> -static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
> - struct dp_packet **packets,
> - int attempted,
> -                                     struct netdev_dpdk_sw_stats *sw_stats_add)
> -{
> - int dropped = sw_stats_add->tx_mtu_exceeded_drops +
> - sw_stats_add->tx_qos_drops +
> - sw_stats_add->tx_failure_drops +
> - sw_stats_add->tx_invalid_hwol_drops;
> - struct netdev_stats *stats = &dev->stats;
> - int sent = attempted - dropped;
> - int i;
> -
> - stats->tx_packets += sent;
> - stats->tx_dropped += dropped;
> -
> - for (i = 0; i < sent; i++) {
> - stats->tx_bytes += dp_packet_size(packets[i]);
> - }
> -
> - if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) {
> - struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> -
> - sw_stats->tx_retries += sw_stats_add->tx_retries;
> - sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops;
> -        sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
> -        sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops;
> -        sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops;
> - }
> -}
>
> +/* Tx function. Transmit packets indefinitely. */
> static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> - struct dp_packet **pkts, int cnt)
> +dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> + OVS_NO_THREAD_SAFETY_ANALYSIS
> {
> + const size_t batch_cnt = dp_packet_batch_size(batch);
> +#if !defined(__CHECKER__) && !defined(_WIN32)
> + const size_t PKT_ARRAY_SIZE = batch_cnt;
> +#else
> + /* Sparse or MSVC doesn't like variable length array. */
> + enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
> +#endif
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> - struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
> - struct netdev_dpdk_sw_stats sw_stats_add;
> - unsigned int n_packets_to_free = cnt;
> - unsigned int total_packets = cnt;
> - int i, retries = 0;
> - int max_retries = VHOST_ENQ_RETRY_MIN;
> - int vid = netdev_dpdk_get_vid(dev);
> -
> - qid = dev->tx_q[qid % netdev->n_txq].map;
> -
> - if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
> - || !(dev->flags & NETDEV_UP))) {
> - rte_spinlock_lock(&dev->stats_lock);
> - dev->stats.tx_dropped+= cnt;
> - rte_spinlock_unlock(&dev->stats_lock);
> - goto out;
> - }
> -
> - if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
> - COVERAGE_INC(vhost_tx_contention);
> - rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> - }
> + struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
> + struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> + uint32_t cnt = batch_cnt;
> + uint32_t dropped = 0;
> + uint32_t tx_failure = 0;
> + uint32_t mtu_drops = 0;
> + uint32_t qos_drops = 0;
> + bool vhost = true;
>
> - sw_stats_add.tx_invalid_hwol_drops = cnt;
> - if (userspace_tso_enabled()) {
> - cnt = netdev_dpdk_prep_hwol_batch(dev, cur_pkts, cnt);
> + if (dev->type != DPDK_DEV_VHOST) {
> + vhost = false;
> + /* Check if QoS has been configured for this netdev. */
> + cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
> + batch_cnt, false);
> + qos_drops = batch_cnt - cnt;
> }
>
> - sw_stats_add.tx_invalid_hwol_drops -= cnt;
> - sw_stats_add.tx_mtu_exceeded_drops = cnt;
> - cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
> - sw_stats_add.tx_mtu_exceeded_drops -= cnt;
> + uint32_t txcnt = 0;
>
> - /* Check has QoS has been configured for the netdev */
> - sw_stats_add.tx_qos_drops = cnt;
> - cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
> - sw_stats_add.tx_qos_drops -= cnt;
> + for (uint32_t i = 0; i < cnt; i++) {
> + struct dp_packet *packet = batch->packets[i];
> + uint32_t size = dp_packet_size(packet);
>
> - n_packets_to_free = cnt;
> + if (OVS_UNLIKELY(size > dev->max_packet_len)) {
> + VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
> + size, dev->max_packet_len);
> + mtu_drops++;
I think this will drop all TSO packets.
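The code being removed further down kept TSO packets by checking the
PKT_TX_TCP_SEG flag; something like that needs to stay, e.g.:

    if (OVS_UNLIKELY(size > dev->max_packet_len
                     && !(packet->mbuf.ol_flags & PKT_TX_TCP_SEG))) {
        VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
                     size, dev->max_packet_len);
        mtu_drops++;
        continue;
    }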
> + continue;
> + }
>
> - do {
> - int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> - unsigned int tx_pkts;
> -
> - tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
> - if (OVS_LIKELY(tx_pkts)) {
> - /* Packets have been sent.*/
> - cnt -= tx_pkts;
> - /* Prepare for possible retry.*/
> - cur_pkts = &cur_pkts[tx_pkts];
> - if (OVS_UNLIKELY(cnt && !retries)) {
> - /*
> - * Read max retries as there are packets not sent
> - * and no retries have already occurred.
> - */
> -                atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
> - }
> - } else {
> - /* No packets sent - do not retry.*/
> + pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> + if (OVS_UNLIKELY(!pkts[txcnt])) {
> + dropped = cnt - i;
> break;
> }
> - } while (cnt && (retries++ < max_retries));
> -
> - rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>
> - sw_stats_add.tx_failure_drops = cnt;
> - sw_stats_add.tx_retries = MIN(retries, max_retries);
> + /* We have to do a copy for now. */
> + memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
> + dp_packet_data(packet), size);
> + dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
This also doesn't support TSO, because we need to allocate buffers
using rte_malloc() and attach them as external buffers.
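The removed dpdk_copy_dp_packet_to_mbuf() below handled exactly that
via dpdk_pktmbuf_attach_extbuf(), so I'd keep using it here, roughly:

    /* Falls back to an rte_malloc()'ed external buffer when the
     * packet doesn't fit a mempool mbuf. */
    struct dp_packet *copy = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp,
                                                         packet);
    if (OVS_UNLIKELY(!copy)) {
        dropped = cnt - i;
        break;
    }
    pkts[txcnt] = &copy->mbuf;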
>
> - rte_spinlock_lock(&dev->stats_lock);
> - netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
> - &sw_stats_add);
> - rte_spinlock_unlock(&dev->stats_lock);
> -
> -out:
> - for (i = 0; i < n_packets_to_free; i++) {
> - dp_packet_delete(pkts[i]);
> - }
> -}
> -
> -static void
> -netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
> -{
> - rte_free(opaque);
> -}
> -
> -static struct rte_mbuf *
> -dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
> -{
> - uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len;
> - struct rte_mbuf_ext_shared_info *shinfo = NULL;
> - uint16_t buf_len;
> - void *buf;
> -
> - if (rte_pktmbuf_tailroom(pkt) >= sizeof *shinfo) {
> - shinfo = rte_pktmbuf_mtod(pkt, struct rte_mbuf_ext_shared_info *);
> - } else {
> - total_len += sizeof *shinfo + sizeof(uintptr_t);
> - total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> - }
> -
> - if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
> - VLOG_ERR("Can't copy packet: too big %u", total_len);
> - return NULL;
> - }
> -
> - buf_len = total_len;
> - buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> - if (OVS_UNLIKELY(buf == NULL)) {
> - VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
> - return NULL;
> - }
> -
> - /* Initialize shinfo. */
> - if (shinfo) {
> - shinfo->free_cb = netdev_dpdk_extbuf_free;
> - shinfo->fcb_opaque = buf;
> - rte_mbuf_ext_refcnt_set(shinfo, 1);
> - } else {
> - shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> - netdev_dpdk_extbuf_free,
> - buf);
> - if (OVS_UNLIKELY(shinfo == NULL)) {
> - rte_free(buf);
> - VLOG_ERR("Failed to initialize shared info for mbuf while "
> - "attempting to attach an external buffer.");
> - return NULL;
> - }
> - }
> -
> - rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
> - shinfo);
> - rte_pktmbuf_reset_headroom(pkt);
> -
> - return pkt;
> -}
> -
> -static struct rte_mbuf *
> -dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
> -{
> - struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> -
> - if (OVS_UNLIKELY(!pkt)) {
> - return NULL;
> - }
> -
> - if (rte_pktmbuf_tailroom(pkt) >= data_len) {
> - return pkt;
> - }
> -
> - if (dpdk_pktmbuf_attach_extbuf(pkt, data_len)) {
> - return pkt;
> - }
> -
> - rte_pktmbuf_free(pkt);
> -
> - return NULL;
> -}
> -
> -static struct dp_packet *
> -dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
> -{
> - struct rte_mbuf *mbuf_dest;
> - struct dp_packet *pkt_dest;
> - uint32_t pkt_len;
> -
> - pkt_len = dp_packet_size(pkt_orig);
> - mbuf_dest = dpdk_pktmbuf_alloc(mp, pkt_len);
> - if (OVS_UNLIKELY(mbuf_dest == NULL)) {
> - return NULL;
> - }
> -
> - pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
> - memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
> - dp_packet_set_size(pkt_dest, pkt_len);
> -
> - mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
> - mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> - mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> - ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> -
> - memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
> -           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
> -
> - if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> - mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
> - - (char *)dp_packet_eth(pkt_dest);
> - mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> - - (char *) dp_packet_l3(pkt_dest);
> - }
> -
> - return pkt_dest;
> -}
> -
> -/* Tx function. Transmit packets indefinitely */
> -static void
> -dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> - OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> - const size_t batch_cnt = dp_packet_batch_size(batch);
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> - const size_t PKT_ARRAY_SIZE = batch_cnt;
> -#else
> - /* Sparse or MSVC doesn't like variable length array. */
> - enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
> -#endif
> - struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> - struct dp_packet *pkts[PKT_ARRAY_SIZE];
> - struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> - uint32_t cnt = batch_cnt;
> - uint32_t dropped = 0;
> - uint32_t tx_failure = 0;
> - uint32_t mtu_drops = 0;
> - uint32_t qos_drops = 0;
> -
> - if (dev->type != DPDK_DEV_VHOST) {
> - /* Check if QoS has been configured for this netdev. */
> - cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
> - batch_cnt, false);
> - qos_drops = batch_cnt - cnt;
> - }
> -
> - uint32_t txcnt = 0;
> -
> - for (uint32_t i = 0; i < cnt; i++) {
> - struct dp_packet *packet = batch->packets[i];
> - uint32_t size = dp_packet_size(packet);
> -
> - if (size > dev->max_packet_len
> - && !(packet->mbuf.ol_flags & PKT_TX_TCP_SEG)) {
> - VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", size,
> - dev->max_packet_len);
> - mtu_drops++;
> - continue;
> - }
> -
> - pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
> - if (OVS_UNLIKELY(!pkts[txcnt])) {
> - dropped = cnt - i;
> - break;
> - }
> -
> - txcnt++;
> - }
> + txcnt++;
> + }
>
> if (OVS_LIKELY(txcnt)) {
> - if (dev->type == DPDK_DEV_VHOST) {
> - __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
> - } else {
> - tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
> - (struct rte_mbuf **)pkts,
> - txcnt);
> - }
> + dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt, vhost);
> }
>
> dropped += qos_drops + mtu_drops + tx_failure;
> @@ -2819,26 +2610,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> }
> }
>
> -static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> - struct dp_packet_batch *batch,
> - bool concurrent_txq OVS_UNUSED)
> -{
> -
> - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
> - dpdk_do_tx_copy(netdev, qid, batch);
> - dp_packet_delete_batch(batch, true);
> - } else {
> - __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
> - dp_packet_batch_size(batch));
> - }
> - return 0;
> -}
> -
> static inline void
> netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> struct dp_packet_batch *batch,
> - bool concurrent_txq)
> + bool concurrent_txq, bool vhost)
> {
> if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
> dp_packet_delete_batch(batch, true);
> @@ -2846,7 +2621,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> }
>
> if (OVS_UNLIKELY(concurrent_txq)) {
> - qid = qid % dev->up.n_txq;
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> }
>
> @@ -2874,7 +2648,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> batch_cnt = netdev_dpdk_qos_run(dev, pkts, batch_cnt, true);
> qos_drops -= batch_cnt;
>
> - tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt);
> + tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt,
> + vhost);
>
> dropped = tx_failure + mtu_drops + qos_drops + hwol_drops;
> if (OVS_UNLIKELY(dropped)) {
> @@ -2899,7 +2674,31 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> - netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
> + if (concurrent_txq) {
> + qid = qid % dev->up.n_txq;
> + }
> +
> + netdev_dpdk_send__(dev, qid, batch, concurrent_txq, false);
> + return 0;
> +}
> +
> +static int
> +netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> + struct dp_packet_batch *batch,
> + bool concurrent_txq OVS_UNUSED)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> + qid = dev->tx_q[qid % netdev->n_txq].map;
> + if (qid == -1 || !dev->vhost_reconfigured) {
> + rte_spinlock_lock(&dev->stats_lock);
> + dev->stats.tx_dropped+= batch->count;
> + rte_spinlock_unlock(&dev->stats_lock);
> + dp_packet_delete_batch(batch, true);
> + } else {
> + netdev_dpdk_send__(dev, qid, batch, false, true);
Why is concurrent_txq disabled? It won't lock the TX queue, which it
did before the patch, and I don't see any other protection.
Also, this change obsoletes the vhost_tx_contention coverage counter,
reverting 9ff24b9c93 from David without a note.
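I'd expect the trylock pattern from the removed __netdev_dpdk_vhost_send()
to survive here, i.e.:

    if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
        COVERAGE_INC(vhost_tx_contention);
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }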
> + }
> +
> return 0;
> }
>
> @@ -2977,41 +2776,6 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
> static int
> netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
> -static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> - struct netdev_stats *stats)
> -{
> - struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> - ovs_mutex_lock(&dev->mutex);
> -
> - rte_spinlock_lock(&dev->stats_lock);
> - /* Supported Stats */
> - stats->rx_packets = dev->stats.rx_packets;
> - stats->tx_packets = dev->stats.tx_packets;
> - stats->rx_dropped = dev->stats.rx_dropped;
> - stats->tx_dropped = dev->stats.tx_dropped;
> - stats->multicast = dev->stats.multicast;
> - stats->rx_bytes = dev->stats.rx_bytes;
> - stats->tx_bytes = dev->stats.tx_bytes;
> - stats->rx_errors = dev->stats.rx_errors;
> - stats->rx_length_errors = dev->stats.rx_length_errors;
> -
> - stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
> - stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
> - stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
> - stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
> - stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
> - stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
> - stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
> -
> - rte_spinlock_unlock(&dev->stats_lock);
> -
> - ovs_mutex_unlock(&dev->mutex);
> -
> - return 0;
> -}
> -
> static void
> netdev_dpdk_convert_xstats(struct netdev_stats *stats,
> const struct rte_eth_xstat *xstats,
> @@ -3025,6 +2789,7 @@ netdev_dpdk_convert_xstats(struct netdev_stats *stats,
> DPDK_XSTAT(rx_broadcast_packets, "rx_broadcast_packets" ) \
> DPDK_XSTAT(tx_broadcast_packets, "tx_broadcast_packets" ) \
> DPDK_XSTAT(rx_undersized_errors, "rx_undersized_errors" ) \
> + DPDK_XSTAT(rx_undersize_packets, "rx_undersize_packets" ) \
> DPDK_XSTAT(rx_oversize_errors, "rx_oversize_errors" ) \
> DPDK_XSTAT(rx_fragmented_errors, "rx_fragmented_errors" ) \
> DPDK_XSTAT(rx_jabber_errors, "rx_jabber_errors" ) \
> @@ -3069,6 +2834,11 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
> struct rte_eth_xstat_name *rte_xstats_names = NULL;
> int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
>
> + if (!rte_eth_dev_is_valid_port(dev->port_id)) {
> + ovs_mutex_unlock(&dev->mutex);
> + return EPROTO;
> + }
> +
> if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
> VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
> dev->port_id);
> @@ -3147,6 +2917,10 @@ netdev_dpdk_get_custom_stats(const struct netdev *netdev,
>
> ovs_mutex_lock(&dev->mutex);
>
> + if (rte_eth_dev_is_valid_port(dev->port_id)) {
> + goto out;
> + }
> +
> if (netdev_dpdk_configure_xstats(dev)) {
> uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
> sizeof(uint64_t));
> @@ -3182,6 +2956,7 @@ netdev_dpdk_get_custom_stats(const struct netdev *netdev,
> free(values);
> }
>
> +out:
> ovs_mutex_unlock(&dev->mutex);
>
> return 0;
> @@ -3409,24 +3184,6 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
> return 0;
> }
>
> -static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> -{
> - struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> - ovs_mutex_lock(&dev->mutex);
> -
> - if (is_vhost_running(dev)) {
> - *carrier = 1;
> - } else {
> - *carrier = 0;
> - }
> -
> - ovs_mutex_unlock(&dev->mutex);
> -
> - return 0;
> -}
> -
> static long long int
> netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
> {
> @@ -3496,8 +3253,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
> * running then change netdev's change_seq to trigger link state
> * update. */
>
> - if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
> - && is_vhost_running(dev)) {
> + if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))) {
> netdev_change_seq_changed(&dev->up);
>
> /* Clear statistics if device is getting up. */
> @@ -3527,18 +3283,41 @@ netdev_dpdk_update_flags(struct netdev *netdev,
> return error;
> }
>
> +static void
> +common_get_status(struct smap *args, struct netdev_dpdk *dev,
> + struct rte_eth_dev_info *dev_info)
> +{
> + smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
> + smap_add_format(args, "numa_id", "%d",
> + rte_eth_dev_socket_id(dev->port_id));
> + smap_add_format(args, "driver_name", "%s", dev_info->driver_name);
> + smap_add_format(args, "min_rx_bufsize", "%u", dev_info->min_rx_bufsize);
> + smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
> + smap_add_format(args, "max_rx_queues", "%u", dev_info->max_rx_queues);
> + smap_add_format(args, "max_tx_queues", "%u", dev_info->max_tx_queues);
> + smap_add_format(args, "max_mac_addrs", "%u", dev_info->max_mac_addrs);
> +}
> +
> static int
> netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
> struct smap *args)
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + struct rte_eth_dev_info dev_info;
> +
> + if (!rte_eth_dev_is_valid_port(dev->port_id)) {
> + return ENODEV;
> + }
>
> ovs_mutex_lock(&dev->mutex);
> + rte_eth_dev_info_get(dev->port_id, &dev_info);
> +
> + common_get_status(args, dev, &dev_info);
>
> bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
> smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
>
> - int vid = netdev_dpdk_get_vid(dev);
> + int vid = rte_eth_vhost_get_vid_from_port_id(dev->port_id);;
> if (vid < 0) {
> smap_add_format(args, "status", "disconnected");
> ovs_mutex_unlock(&dev->mutex);
> @@ -3638,15 +3417,8 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
> }
> ovs_mutex_unlock(&dpdk_mutex);
>
> - smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
> - smap_add_format(args, "numa_id", "%d",
> - rte_eth_dev_socket_id(dev->port_id));
> - smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
> - smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
> - smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
> - smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
> - smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
> - smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
> + common_get_status(args, dev, &dev_info);
> +
> smap_add_format(args, "max_hash_mac_addrs", "%u",
> dev_info.max_hash_mac_addrs);
> smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
> @@ -3843,19 +3615,6 @@ out:
> netdev_close(netdev);
> }
>
> -/*
> - * Set virtqueue flags so that we do not receive interrupts.
> - */
> -static void
> -set_irq_status(int vid)
> -{
> - uint32_t i;
> -
> - for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
> - rte_vhost_enable_guest_notification(vid, i, 0);
> - }
> -}
> -
> /*
> * Fixes mapping for vhost-user tx queues. Must be called after each
> * enabling/disabling of queues and n_txq modifications.
> @@ -3905,53 +3664,62 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> free(enabled_queues);
> }
>
> -/*
> - * A new virtio-net device is added to a vhost port.
> - */
> static int
> -new_device(int vid)
> +link_status_changed_callback(dpdk_port_t port_id,
> + enum rte_eth_event_type type OVS_UNUSED,
> + void *param OVS_UNUSED,
> + void *ret_param OVS_UNUSED)
> {
> struct netdev_dpdk *dev;
> bool exists = false;
> int newnode = 0;
> - char ifname[IF_NAME_SZ];
> -
> - rte_vhost_get_ifname(vid, ifname, sizeof ifname);
>
> ovs_mutex_lock(&dpdk_mutex);
> /* Add device to the vhost port with the same name as that passed down. */
> LIST_FOR_EACH(dev, list_node, &dpdk_list) {
> ovs_mutex_lock(&dev->mutex);
> - if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> - uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
> -
> - /* Get NUMA information */
> - newnode = rte_vhost_get_numa_node(vid);
> - if (newnode == -1) {
> + if (port_id == dev->port_id) {
> + check_link_status(dev);
> + if (dev->link.link_status == ETH_LINK_UP) {
> + /* Device brought up. */
> + /* Get queue information. */
> + int vid = rte_eth_vhost_get_vid_from_port_id(dev->port_id);
> + uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
> + if (qp_num <= 0) {
> + qp_num = dev->requested_n_rxq;
> + }
> + /* Get NUMA information. */
> + newnode = rte_eth_dev_socket_id(dev->port_id);
> + if (newnode == -1) {
> #ifdef VHOST_NUMA
> - VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> - ifname);
> +                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> + dev->vhost_id);
> #endif
> - newnode = dev->socket_id;
> - }
> + newnode = dev->socket_id;
> + }
> + if (dev->requested_n_txq != qp_num
> + || dev->requested_n_rxq != qp_num
> + || dev->requested_socket_id != newnode) {
> + dev->requested_socket_id = newnode;
> + dev->requested_n_rxq = qp_num;
> + dev->requested_n_txq = qp_num;
> + netdev_request_reconfigure(&dev->up);
> + } else {
> + /* Reconfiguration not required. */
> + dev->vhost_reconfigured = true;
> + }
>
> - if (dev->requested_n_txq < qp_num
> - || dev->requested_n_rxq < qp_num
> - || dev->requested_socket_id != newnode) {
> - dev->requested_socket_id = newnode;
> - dev->requested_n_rxq = qp_num;
> - dev->requested_n_txq = qp_num;
> - netdev_request_reconfigure(&dev->up);
> + VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> + dev->vhost_id, newnode);
> } else {
> - /* Reconfiguration not required. */
> - dev->vhost_reconfigured = true;
> + /* Device brought down. */
> + dev->vhost_reconfigured = false;
> + memset(dev->vhost_rxq_enabled, 0,
> + dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
> + netdev_dpdk_txq_map_clear(dev);
> +                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
> }
> -
> - ovsrcu_index_set(&dev->vid, vid);
> exists = true;
> -
> - /* Disable notifications. */
> - set_irq_status(vid);
> netdev_change_seq_changed(&dev->up);
> ovs_mutex_unlock(&dev->mutex);
> break;
> @@ -3961,14 +3729,11 @@ new_device(int vid)
> ovs_mutex_unlock(&dpdk_mutex);
>
> if (!exists) {
> -        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
> + VLOG_INFO("vHost Device with port id %i not found", port_id);
>
> return -1;
> }
>
> - VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> - ifname, newnode);
> -
> return 0;
> }
>
> @@ -3984,175 +3749,63 @@ netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> }
> }
>
> -/*
> - * Remove a virtio-net device from the specific vhost port. Use dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM and
> - * ensure all currently queued packets have been sent/received before removing
> - * the device.
> - */
> -static void
> -destroy_device(int vid)
> +static int
> +vring_state_changed_callback(dpdk_port_t port_id,
> + enum rte_eth_event_type type OVS_UNUSED,
> + void *param OVS_UNUSED,
> + void *ret_param OVS_UNUSED)
> {
> struct netdev_dpdk *dev;
> bool exists = false;
> - char ifname[IF_NAME_SZ];
> -
> - rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> - ovs_mutex_lock(&dpdk_mutex);
> - LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> - if (netdev_dpdk_get_vid(dev) == vid) {
> -
> - ovs_mutex_lock(&dev->mutex);
> - dev->vhost_reconfigured = false;
> - ovsrcu_index_set(&dev->vid, -1);
> - memset(dev->vhost_rxq_enabled, 0,
> - dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
> - netdev_dpdk_txq_map_clear(dev);
> -
> - netdev_change_seq_changed(&dev->up);
> - ovs_mutex_unlock(&dev->mutex);
> - exists = true;
> - break;
> - }
> - }
> -
> - ovs_mutex_unlock(&dpdk_mutex);
> + bool is_rx;
> + uint16_t qid;
> + struct rte_eth_vhost_queue_event event;
>
> - if (exists) {
> - /*
> - * Wait for other threads to quiesce after setting the 'virtio_dev'
> - * to NULL, before returning.
> - */
> - ovsrcu_synchronize();
> - /*
> - * As call to ovsrcu_synchronize() will end the quiescent state,
> - * put thread back into quiescent state before returning.
> - */
> - ovsrcu_quiesce_start();
> - VLOG_INFO("vHost Device '%s' has been removed", ifname);
> - } else {
> - VLOG_INFO("vHost Device '%s' not found", ifname);
> + if (rte_eth_vhost_get_queue_event(port_id, &event)) {
> + return 0;
> }
> -}
>
> -static int
> -vring_state_changed(int vid, uint16_t queue_id, int enable)
> -{
> - struct netdev_dpdk *dev;
> - bool exists = false;
> - int qid = queue_id / VIRTIO_QNUM;
> - bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
> - char ifname[IF_NAME_SZ];
> -
> - rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> + is_rx = event.rx;
> + qid = event.queue_id;
>
> ovs_mutex_lock(&dpdk_mutex);
> LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> ovs_mutex_lock(&dev->mutex);
> - if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> + if (port_id == dev->port_id) {
> if (is_rx) {
> bool old_state = dev->vhost_rxq_enabled[qid];
>
> - dev->vhost_rxq_enabled[qid] = enable != 0;
> + dev->vhost_rxq_enabled[qid] = event.enable != 0;
> if (old_state != dev->vhost_rxq_enabled[qid]) {
> netdev_change_seq_changed(&dev->up);
> }
> } else {
> - if (enable) {
> - dev->tx_q[qid].map = qid;
> + if (event.enable) {
> + dev->tx_q[event.queue_id].map = event.queue_id;
> } else {
> - dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> + dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
> }
> - netdev_dpdk_remap_txqs(dev);
> - }
> - exists = true;
> - ovs_mutex_unlock(&dev->mutex);
> - break;
> + exists = true;
> + ovs_mutex_unlock(&dev->mutex);
> + break;
> + }
> }
> ovs_mutex_unlock(&dev->mutex);
> }
> ovs_mutex_unlock(&dpdk_mutex);
>
> if (exists) {
> - VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
> - "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
> - qid, ifname, (enable == 1) ? "enabled" : "disabled");
> + VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device "
> + "changed to \'%s\'", qid, is_rx == true ? "rx" : "tx",
> + qid, (event.enable == 1) ? "enabled" : "disabled");
> } else {
> - VLOG_INFO("vHost Device '%s' not found", ifname);
> + VLOG_INFO("vHost Device not found");
> return -1;
> }
>
> return 0;
> }
>
> -static void
> -destroy_connection(int vid)
> -{
> - struct netdev_dpdk *dev;
> - char ifname[IF_NAME_SZ];
> - bool exists = false;
> -
> - rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> - ovs_mutex_lock(&dpdk_mutex);
> - LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> - ovs_mutex_lock(&dev->mutex);
> - if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> - uint32_t qp_num = NR_QUEUE;
> -
> - if (netdev_dpdk_get_vid(dev) >= 0) {
> - VLOG_ERR("Connection on socket '%s' destroyed while vhost "
> - "device still attached.", dev->vhost_id);
> - }
> -
> - /* Restore the number of queue pairs to default. */
> - if (dev->requested_n_txq != qp_num
> - || dev->requested_n_rxq != qp_num) {
> - dev->requested_n_rxq = qp_num;
> - dev->requested_n_txq = qp_num;
> - netdev_request_reconfigure(&dev->up);
> - }
> - ovs_mutex_unlock(&dev->mutex);
> - exists = true;
> - break;
> - }
> - ovs_mutex_unlock(&dev->mutex);
> - }
> - ovs_mutex_unlock(&dpdk_mutex);
> -
> - if (exists) {
> - VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
> - } else {
> - VLOG_INFO("vHost Device '%s' not found", ifname);
> - }
> -}
> -
> -static
> -void vhost_guest_notified(int vid OVS_UNUSED)
> -{
> - COVERAGE_INC(vhost_notification);
> -}
> -
> -/*
> - * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
> - * or vhostuserclient netdev.
> - *
> - * Returns a value greater or equal to zero for a valid vid or '-1' if
> - * there is no valid vid associated. A vid of '-1' must not be used in
> - * rte_vhost_ APi calls.
> - *
> - * Once obtained and validated, a vid can be used by a PMD for multiple
> - * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
> - * not fetch the vid again for each of a series of API calls.
> - */
> -
> -int
> -netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
> -{
> - return ovsrcu_index_get(&dev->vid);
> -}
> -
> struct ingress_policer *
> netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
> {
> @@ -4893,13 +4546,12 @@ static const struct dpdk_qos_ops trtcm_policer_ops = {
> };
>
> static int
> -netdev_dpdk_reconfigure(struct netdev *netdev)
> +common_reconfigure(struct netdev *netdev)
> + OVS_REQUIRES(dev->mutex)
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> int err = 0;
>
> - ovs_mutex_lock(&dev->mutex);
> -
> if (netdev->n_txq == dev->requested_n_txq
> && netdev->n_rxq == dev->requested_n_rxq
> && dev->mtu == dev->requested_mtu
> @@ -4956,17 +4608,36 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
> netdev_change_seq_changed(netdev);
>
> out:
> + return err;
> +}
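Does the OVS_REQUIRES(dev->mutex) annotation here build with clang's
thread safety analysis? 'dev' is a local variable, not a parameter,
so it is not in scope at the prototype. The same applies to
dpdk_vhost_reconfigure_helper() below. One option is to keep passing
the netdev_dpdk pointer, e.g. (just a sketch):

    static int
    common_reconfigure(struct netdev_dpdk *dev)
        OVS_REQUIRES(dev->mutex)
    {
        struct netdev *netdev = &dev->up;
        ...
    }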
> +
> +static int
> +netdev_dpdk_reconfigure(struct netdev *netdev)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + int err = 0;
> +
> + ovs_mutex_lock(&dev->mutex);
> + err = common_reconfigure(netdev);
> ovs_mutex_unlock(&dev->mutex);
> +
> return err;
> }
>
> static int
> -dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
> +dpdk_vhost_reconfigure_helper(struct netdev *netdev)
> OVS_REQUIRES(dev->mutex)
> {
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + int err;
> +
> dev->up.n_txq = dev->requested_n_txq;
> dev->up.n_rxq = dev->requested_n_rxq;
> - int err;
> +
> + err = common_reconfigure(netdev);
> + if (err) {
> + return err;
> + }
>
> /* Always keep RX queue 0 enabled for implementations that won't
> * report vring states. */
> @@ -4984,14 +4655,7 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
>
> netdev_dpdk_remap_txqs(dev);
>
> - err = netdev_dpdk_mempool_configure(dev);
> - if (!err) {
> - /* A new mempool was created or re-used. */
> - netdev_change_seq_changed(&dev->up);
> - } else if (err != EEXIST) {
> - return err;
> - }
> - if (netdev_dpdk_get_vid(dev) >= 0) {
> + if (rte_eth_vhost_get_vid_from_port_id(dev->port_id) >= 0) {
> if (dev->vhost_reconfigured == false) {
> dev->vhost_reconfigured = true;
> /* Carrier status may need updating. */
> @@ -5009,7 +4673,7 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
> int err;
>
> ovs_mutex_lock(&dev->mutex);
> - err = dpdk_vhost_reconfigure_helper(dev);
> + err = dpdk_vhost_reconfigure_helper(netdev);
> ovs_mutex_unlock(&dev->mutex);
>
> return err;
> @@ -5020,9 +4684,8 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> int err;
> - uint64_t vhost_flags = 0;
> - uint64_t vhost_unsup_flags;
> - bool zc_enabled;
> + int sid = -1;
> + struct rte_eth_dev_info dev_info;
>
> ovs_mutex_lock(&dev->mutex);
>
> @@ -5032,90 +4695,65 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
> * 2. A path has been specified.
> */
> if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
> - /* Register client-mode device. */
> - vhost_flags |= RTE_VHOST_USER_CLIENT;
> -
> - /* There is no support for multi-segments buffers. */
> - vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
> + /* First time once-only configuration. */
> + err = dpdk_attach_vhost_pmd(dev, VHOST_CLIENT_MODE);
> +
> + if (!err) {
> + sid = rte_eth_dev_socket_id(dev->port_id);
> + dev->socket_id = sid < 0 ? SOCKET0 : sid;
> + dev->vhost_driver_flags |= RTE_VHOST_USER_CLIENT;
> +
> + if (dev->requested_socket_id != dev->socket_id
> + || dev->requested_mtu != dev->mtu) {
> + err = netdev_dpdk_mempool_configure(dev);
> + if (err && err != EEXIST) {
> + goto unlock;
> + }
> + }
>
> - /* Enable IOMMU support, if explicitly requested. */
> - if (dpdk_vhost_iommu_enabled()) {
> - vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
> - }
> + netdev->n_txq = dev->requested_n_txq;
> + netdev->n_rxq = dev->requested_n_rxq;
>
> - /* Enable POSTCOPY support, if explicitly requested. */
> - if (dpdk_vhost_postcopy_enabled()) {
> - vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
> - }
> + rte_free(dev->vhost_rxq_enabled);
> + dev->vhost_rxq_enabled = dpdk_rte_mzalloc(
> + OVS_VHOST_MAX_QUEUE_NUM *
> + sizeof *dev->vhost_rxq_enabled);
> + if (!dev->vhost_rxq_enabled) {
> + err = ENOMEM;
> + goto err_unlock;
> + }
>
> - zc_enabled = dev->vhost_driver_flags
> - & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> - /* Enable zero copy flag, if requested */
> - if (zc_enabled) {
> - vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> - }
> + rte_free(dev->tx_q);
> + err = dpdk_eth_dev_init(dev);
> + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
> + if (!dev->tx_q) {
> + err = ENOMEM;
> + goto err_unlock;
> + }
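The error code from dpdk_eth_dev_init() lands in 'err' but is never
checked and can be overwritten just below, so a failed init with a
successful tx_q allocation continues as if nothing happened. I would
expect something like (sketch):

        rte_free(dev->tx_q);
        err = dpdk_eth_dev_init(dev);
        if (err) {
            goto err_unlock;
        }

        dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);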
>
> - /* Enable External Buffers if TCP Segmentation Offload is enabled. */
> - if (userspace_tso_enabled()) {
> - vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
> - }
> + netdev_change_seq_changed(netdev);
>
> - err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
> - if (err) {
> - VLOG_ERR("vhost-user device setup failure for device %s\n",
> - dev->vhost_id);
> - goto unlock;
> - } else {
> - /* Configuration successful */
> - dev->vhost_driver_flags |= vhost_flags;
> VLOG_INFO("vHost User device '%s' created in 'client' mode, "
> "using client socket '%s'",
> dev->up.name, dev->vhost_id);
> - if (zc_enabled) {
> - VLOG_INFO("Zero copy enabled for vHost port %s",
> dev->up.name);
> - }
> - }
> -
> - err = rte_vhost_driver_callback_register(dev->vhost_id,
> - &virtio_net_device_ops);
> - if (err) {
> - VLOG_ERR("rte_vhost_driver_callback_register failed for "
> - "vhost user client port: %s\n", dev->up.name);
> - goto unlock;
> - }
> -
> - if (userspace_tso_enabled()) {
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN
> - | 1ULL << VIRTIO_NET_F_HOST_UFO;
> - } else {
> - /* This disables checksum offloading and all the features
> - * that depends on it (TSO, UFO, ECN) according to virtio
> - * specification. */
> - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM;
> - }
> -
> - err = rte_vhost_driver_disable_features(dev->vhost_id,
> - vhost_unsup_flags);
> - if (err) {
> - VLOG_ERR("rte_vhost_driver_disable_features failed for "
> - "vhost user client port: %s\n", dev->up.name);
> - goto unlock;
> }
> + goto unlock;
> + }
>
> - err = rte_vhost_driver_start(dev->vhost_id);
> - if (err) {
> - VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> - "client port: %s\n", dev->up.name);
> - goto unlock;
> - }
> + if (rte_eth_dev_is_valid_port(dev->port_id)) {
> + err = dpdk_vhost_reconfigure_helper(netdev);
> + goto unlock;
> }
>
> - err = dpdk_vhost_reconfigure_helper(dev);
> +err_unlock:
> + rte_eth_dev_info_get(dev->port_id, &dev_info);
> + if (!dev_info.device || rte_dev_remove(dev_info.device)) {
> + VLOG_ERR("Error detatching device\n");
> + }
> + if (dev->vhost_pmd_id >= 0) {
> + id_pool_free_id(dpdk_get_vhost_id_pool(),
> + dev->vhost_pmd_id);
> + }
>
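Also note that if the first-time branch above is skipped and
rte_eth_dev_is_valid_port() returns false, we fall through into
err_unlock with 'err' never assigned, so the function returns an
uninitialized value. Initializing it at the top would be the simplest
fix, e.g.:

    int err = ENODEV;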
> unlock:
> ovs_mutex_unlock(&dev->mutex);
> @@ -5279,9 +4917,9 @@ static const struct netdev_class dpdk_vhost_class = {
> .construct = netdev_dpdk_vhost_construct,
> .destruct = netdev_dpdk_vhost_destruct,
> .send = netdev_dpdk_vhost_send,
> - .get_carrier = netdev_dpdk_vhost_get_carrier,
> - .get_stats = netdev_dpdk_vhost_get_stats,
> - .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
> + .get_carrier = netdev_dpdk_get_carrier,
> + .get_stats = netdev_dpdk_get_stats,
> + .get_custom_stats = netdev_dpdk_get_custom_stats,
> .get_status = netdev_dpdk_vhost_user_get_status,
> .reconfigure = netdev_dpdk_vhost_reconfigure,
> .rxq_recv = netdev_dpdk_vhost_rxq_recv,
> @@ -5295,9 +4933,9 @@ static const struct netdev_class dpdk_vhost_client_class = {
> .destruct = netdev_dpdk_vhost_destruct,
> .set_config = netdev_dpdk_vhost_client_set_config,
> .send = netdev_dpdk_vhost_send,
> - .get_carrier = netdev_dpdk_vhost_get_carrier,
> - .get_stats = netdev_dpdk_vhost_get_stats,
> - .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
> + .get_carrier = netdev_dpdk_get_carrier,
> + .get_stats = netdev_dpdk_get_stats,
> + .get_custom_stats = netdev_dpdk_get_custom_stats,
> .get_status = netdev_dpdk_vhost_user_get_status,
> .reconfigure = netdev_dpdk_vhost_client_reconfigure,
> .rxq_recv = netdev_dpdk_vhost_rxq_recv,
I would suggest moving get_stats, get_carrier and get_custom_stats
into the common dpdk class definition, since all classes now use the
same functions. That would make the benefits of this patch more
visible. :-)
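For example, something along these lines (untested, and assuming I am
remembering the NETDEV_DPDK_CLASS_COMMON macro in netdev-dpdk.c
correctly):

    #define NETDEV_DPDK_CLASS_COMMON                          \
        ...                                                   \
        .get_carrier = netdev_dpdk_get_carrier,               \
        .get_stats = netdev_dpdk_get_stats,                   \
        .get_custom_stats = netdev_dpdk_get_custom_stats

Then the three entries can be dropped from each class definition.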
Thanks,
--
fbl