This patch adds initial support for the DPDK vHost async APIs, offloading the memory copy operations of the vhost data path to DMA hardware.
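The feature is off by default and is gated by the new 'vhost-async-support' other_config key introduced in lib/dpdk.c. A minimal sketch of enabling it (assuming the usual other_config workflow; only the option name comes from this patch) would be:

    $ ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true
    $ ovs-vsctl set Open_vSwitch . other_config:vhost-async-support=true

As the dpdk.c hunk below shows, enabling async copy disables vhost post-copy and IOMMU support for vhost-user-client and requires the EAL to run in IOVA-as-VA mode; otherwise async support is turned back off at init time.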
Signed-off-by: Sunil Pai G <[email protected]> --- lib/dpdk-stub.c | 6 + lib/dpdk.c | 39 +++ lib/dpdk.h | 1 + lib/netdev-dpdk.c | 823 ++++++++++++++++++++++++++++++++++++++++++++-- lib/netdev-dpdk.h | 20 ++ 5 files changed, 865 insertions(+), 24 deletions(-) diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index b7d577870..5404e2056 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -62,6 +62,12 @@ dpdk_vhost_postcopy_enabled(void) return false; } +bool +dpdk_vhost_async_enabled(void) +{ + return false; +} + bool dpdk_per_port_memory(void) { diff --git a/lib/dpdk.c b/lib/dpdk.c index b2ef31cd2..9736283a2 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -54,6 +54,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization * of DPDK. */ static bool per_port_memory = false; /* Status of per port memory support */ +static bool vhost_async_copy_enabled = false; /* Status of vhost async + support. */ + static int process_vhost_flags(char *flag, const char *default_val, int size, const struct smap *ovs_other_config, @@ -397,6 +400,22 @@ dpdk_init__(const struct smap *ovs_other_config) VLOG_INFO("POSTCOPY support for vhost-user-client %s.", vhost_postcopy_enabled ? "enabled" : "disabled"); + vhost_async_copy_enabled = smap_get_bool(ovs_other_config, + "vhost-async-support", false); + if (vhost_async_copy_enabled) { + if (vhost_postcopy_enabled) { + VLOG_WARN("Async-copy and post-copy are not compatible " + "for vhost-user-client. Disabling POSTCOPY support."); + vhost_postcopy_enabled = false; + } + + if (vhost_iommu_enabled) { + vhost_iommu_enabled = false; + VLOG_WARN("Async copy is not compatible with IOMMU support for" + " vhost-user-client. IOMMU support disabled."); + } + } + per_port_memory = smap_get_bool(ovs_other_config, "per-port-memory", false); VLOG_INFO("Per port memory for DPDK devices %s.", @@ -456,6 +475,20 @@ dpdk_init__(const struct smap *ovs_other_config) /* Make sure things are initialized ... */ result = rte_eal_init(args.n, argv); + if (vhost_async_copy_enabled) { + /* TODO: Remove this check ? if PA , disable SW fallback ? */ + if (rte_eal_iova_mode() != RTE_IOVA_VA) { + VLOG_WARN("Async-copy for vhost-user-client requires IOVA as VA to" + " be enabled. 
Async support disabled."); + vhost_async_copy_enabled = false; + } else { + VLOG_INFO("Async support enabled for vhost-user-client."); + } + } else { + VLOG_INFO("Async support disabled for vhost-user-client."); + } + + free(argv); svec_destroy(&args); @@ -559,6 +592,12 @@ dpdk_vhost_postcopy_enabled(void) return vhost_postcopy_enabled; } +bool +dpdk_vhost_async_enabled(void) +{ + return vhost_async_copy_enabled; +} + bool dpdk_per_port_memory(void) { diff --git a/lib/dpdk.h b/lib/dpdk.h index 445a51d06..626d0f368 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -40,6 +40,7 @@ void dpdk_set_lcore_id(unsigned cpu); const char *dpdk_get_vhost_sock_dir(void); bool dpdk_vhost_iommu_enabled(void); bool dpdk_vhost_postcopy_enabled(void); +bool dpdk_vhost_async_enabled(void); bool dpdk_per_port_memory(void); bool dpdk_available(void); void print_dpdk_version(void); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1e6c5a92c..9fc53e5e2 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -29,6 +29,7 @@ #include <rte_bus_pci.h> #include <rte_config.h> #include <rte_cycles.h> +#include <rte_dmadev.h> #include <rte_errno.h> #include <rte_ethdev.h> #include <rte_flow.h> @@ -38,6 +39,7 @@ #include <rte_pci.h> #include <rte_version.h> #include <rte_vhost.h> +#include <rte_vhost_async.h> #include "cmap.h" #include "coverage.h" @@ -76,6 +78,23 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); COVERAGE_DEFINE(vhost_tx_contention); COVERAGE_DEFINE(vhost_notification); +COVERAGE_DEFINE(vhost_async_tx_poll); +COVERAGE_DEFINE(vhost_async_tx_poll_empty); +COVERAGE_DEFINE(vhost_async_rx_poll); +COVERAGE_DEFINE(vhost_async_rx_poll_empty); + +COVERAGE_DEFINE(vhost_async_tx_dma_ring_full); +COVERAGE_DEFINE(vhost_async_tx_compl_ring_full_drops); +COVERAGE_DEFINE(vhost_async_tx_compl_ring_empty); +COVERAGE_DEFINE(vhost_async_rx_dma_ring_full); +COVERAGE_DEFINE(vhost_async_rx_compl_ring_full_drops); +COVERAGE_DEFINE(vhost_async_rx_compl_ring_empty); +COVERAGE_DEFINE(vhost_async_rx_enqueue); +COVERAGE_DEFINE(vhost_async_tx_enqueue); +COVERAGE_DEFINE(vhost_async_tx_enqueue_sw_fallback); +COVERAGE_DEFINE(vhost_async_rx_enqueue_sw_fallback); + + #define DPDK_PORT_WATCHDOG_INTERVAL 5 @@ -145,6 +164,9 @@ typedef uint16_t dpdk_port_t; #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) +/* vHost async DMA ring size. */ +#define VHOST_ASYNC_DMA_RING_SIZE 4096 + /* List of required flags advertised by the hardware that will be used * if TSO is enabled. Ideally this should include DEV_TX_OFFLOAD_SCTP_CKSUM. * However, very few drivers supports that the moment and SCTP is not a @@ -192,6 +214,166 @@ static const struct vhost_device_ops virtio_net_device_ops = .guest_notified = vhost_guest_notified, }; +/* + * vHost async callbacks to transfer packets via DMA + * and query the transfer status. + */ +static int32_t +vhost_async_dmadev_transfer_data_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_desc *descs, + struct rte_vhost_async_status *opaque_data, + uint16_t count); +static int32_t +vhost_async_dmadev_check_completed_copies_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_status + *opaque_data, + uint16_t max_packets); + +/* Async channel operations for vhost ports. */ +static struct rte_vhost_async_channel_ops vhost_async_chnl_ops = { + .transfer_data = vhost_async_dmadev_transfer_data_cb, + .check_completed_copies = vhost_async_dmadev_check_completed_copies_cb +}; + +/* Set the poll factor for large packets based on max number of segments + * the packet could have. 
Compute this based on the MBUF size and + * the max MTU supported in OVS. */ +static const uint8_t vhost_dma_poll_factor = + (NETDEV_DPDK_MAX_PKT_LEN/ RTE_MBUF_DEFAULT_BUF_SIZE) + + ((NETDEV_DPDK_MAX_PKT_LEN % RTE_MBUF_DEFAULT_BUF_SIZE) != 0); + +DEFINE_EXTERN_PER_THREAD_DATA(dmadev_id, DMADEV_ID_UNASSIGNED); +DEFINE_EXTERN_PER_THREAD_DATA(dma_compl_ptr, 0); + +static struct ovs_mutex dmadev_mutex = OVS_MUTEX_INITIALIZER; + +static bool dmadev_devices_used[RTE_DMADEV_MAX_DEVS]; + +static inline uint16_t +dmadev_find_free_dev(int pmd_numa_id, struct rte_dmadev_info *dev_info) +{ + uint16_t dmadev_id = 0; + int other_numa_dmadev_id = DMADEV_ID_INVALID; + uint64_t capab = RTE_DMADEV_CAPA_MEM_TO_MEM | RTE_DMADEV_CAPA_OPS_COPY; + + for (dmadev_id = 0; dmadev_id < rte_dmadev_count(); dmadev_id++) { + if (!dmadev_devices_used[dmadev_id] && + !rte_dmadev_info_get(dmadev_id, dev_info)) { + /* DMA device must be capable of : + * MEM to MEM transfer + * Support COPY operation + * have atleast 1 virtual channel. */ + if (!((dev_info->dev_capa & capab) && dev_info->max_vchans >= 1)) { + continue; + } + + if (dev_info->device->numa_node == pmd_numa_id) { + dmadev_devices_used[dmadev_id] = true; + return dmadev_id; + } else if (other_numa_dmadev_id == DMADEV_ID_INVALID) { + other_numa_dmadev_id = dmadev_id; + } + } + } + + if (other_numa_dmadev_id != DMADEV_ID_INVALID) { + /* No DMA device found on same NUMA, hence + * allocating an available DMA from other NUMA. */ + dmadev_devices_used[other_numa_dmadev_id] = true; + rte_dmadev_info_get(other_numa_dmadev_id, dev_info); + return other_numa_dmadev_id; + } + + return DMADEV_ID_INVALID; +} + +static void * dpdk_rte_mzalloc(size_t sz); + +/* DMADEV enqueue tracking ring. */ +static void **dmadev_enq_track[RTE_DMADEV_MAX_DEVS]; + +static uint16_t +dmadev_get_free_dev(int pmd_numa_id) +{ + uint16_t dmadev_id; + struct rte_dmadev_info dev_info = {0}; + struct rte_dmadev_conf dev_conf = {0}; + struct rte_dmadev_vchan_conf vchan_conf = {0}; + size_t ring_size = VHOST_ASYNC_DMA_RING_SIZE; + size_t dma_track_ring_size = sizeof(void *) * ring_size; + + dmadev_id = dmadev_find_free_dev(pmd_numa_id, &dev_info); + if (dmadev_id == DMADEV_ID_INVALID) { + VLOG_INFO("No available DMA device found for vhost async copy " + "offload for this pmd."); + return dmadev_id; + } + + /* Configure the device. */ + dev_conf.nb_vchans = 1; + dev_conf.enable_silent = false; + int ret = rte_dmadev_configure(dmadev_id, &dev_conf); + if (OVS_UNLIKELY(ret)) { + VLOG_ERR("Configure failed for DMA device %s with dev id: %u" + " while assigning to pmd for vhost async copy offload.", + dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + } else { + vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM; + vchan_conf.nb_desc = VHOST_ASYNC_DMA_RING_SIZE; + ret = rte_dmadev_vchan_setup(dmadev_id, 0, &vchan_conf); + if (ret < 0) { + VLOG_ERR("Virtual channel setup failed with err %d for" + " DMA device %s with dev id: %d", + ret, dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + goto out; + } + /* TODO: Free this when thread exits ? 
but how ?*/ + dmadev_enq_track[dmadev_id] = dpdk_rte_mzalloc(dma_track_ring_size); + if (!dmadev_enq_track[dmadev_id]) { + VLOG_ERR("Failed to allocate memory for enqueue tracking ring of" + " DMA device %s with dev id: %d", + dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + goto out; + } + + rte_dmadev_start(dmadev_id); + if (dev_info.device->numa_node != pmd_numa_id) { + VLOG_WARN("No available DMA device found on numa node %d," + " assigning %s with dev id: %d on numa %d to pmd for" + " vhost async copy offload.", + pmd_numa_id, dev_info.device->name, dmadev_id, + dev_info.device->numa_node); + return dmadev_id; + } + VLOG_INFO("DMA device %s with dev id: %d assigned to pmd for vhost" + " async copy offload.", dev_info.device->name, dmadev_id); + } + +out: + return dmadev_id; +} + +uint16_t +dmadev_id_init(void) +{ + uint16_t new_id; + unsigned int pmd_core_id = RTE_PER_LCORE(_lcore_id); + + new_id = *dmadev_id_get(); + + ovs_assert(new_id == DMADEV_ID_UNASSIGNED); + ovs_mutex_lock(&dmadev_mutex); + new_id = dmadev_get_free_dev(ovs_numa_get_numa_id(pmd_core_id)); + ovs_mutex_unlock(&dmadev_mutex); + + return *dmadev_id_get() = new_id; +} + /* Custom software stats for dpdk ports */ struct netdev_dpdk_sw_stats { /* No. of retries when unable to transmit. */ @@ -206,6 +388,8 @@ struct netdev_dpdk_sw_stats { uint64_t rx_qos_drops; /* Packet drops in HWOL processing. */ uint64_t tx_invalid_hwol_drops; + /* No. of packets pending to be tx'ed by async device. */ + uint64_t tx_async_inflight; }; enum dpdk_dev_type { @@ -371,6 +555,27 @@ struct dpdk_mp { struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex); }; +/* Tracking information for DMA in vhost async usecase. */ +struct enq_info_t{ + atomic_uint8_t pkt_rcvd; + atomic_uint8_t end_of_burst; +}; + +#define DMA_COMPLETION_RING_SIZE VHOST_ASYNC_DMA_RING_SIZE + +/* DMA completion tracking ring to report the packets + * back to the vhost library in_order. + * Note: The write's to the enq_info array should be atomic + * to guarantee correct behaviour. */ +struct dma_completions_t { + struct enq_info_t enq_info[DMA_COMPLETION_RING_SIZE]; + uint16_t count; + uint16_t read_idx; + uint16_t write_idx; + /* Bursts completed but not reported to above layer for completion. */ + uint16_t bursts_completed; +}; + /* There should be one 'struct dpdk_tx_queue' created for * each netdev tx queue. */ struct dpdk_tx_queue { @@ -380,6 +585,13 @@ struct dpdk_tx_queue { * It is used only if the queue is shared among different pmd threads * (see 'concurrent_txq'). */ rte_spinlock_t tx_lock; + + /* vHost asynchronous channel registration status. */ + bool is_async_reg; + + /* DMA enqueue tracker to maintain in_order reporting of packets. */ + struct dma_completions_t *dma_completions; + /* Mapping of configured vhost-user queue to enabled by guest. */ int map; ); @@ -474,6 +686,8 @@ struct netdev_dpdk { /* Array of vhost rxq states, see vring_state_changed. */ bool *vhost_rxq_enabled; + /* Array of vhost rxq async registration status. 
*/ + bool *vhost_rxq_async_reg; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -533,6 +747,7 @@ struct netdev_dpdk { struct netdev_rxq_dpdk { struct netdev_rxq up; dpdk_port_t port_id; + struct dma_completions_t *dma_compl; }; static void netdev_dpdk_destruct(struct netdev *netdev); @@ -1200,20 +1415,37 @@ netdev_dpdk_alloc(void) } static struct dpdk_tx_queue * -netdev_dpdk_alloc_txq(unsigned int n_txqs) +netdev_dpdk_alloc_txq(unsigned int n_txqs, bool is_vhost) { struct dpdk_tx_queue *txqs; - unsigned i; + unsigned i = 0; + const size_t dma_compl_size = sizeof(struct dma_completions_t); + const bool is_vhost_async = is_vhost && dpdk_vhost_async_enabled(); + bool alloc_failed = false; txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs); if (txqs) { for (i = 0; i < n_txqs; i++) { + if (is_vhost_async) { + txqs[i].dma_completions = dpdk_rte_mzalloc(dma_compl_size); + if (!txqs[i].dma_completions) { + alloc_failed = true; + break; + } + } /* Initialize map for vhost devices. */ txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN; rte_spinlock_init(&txqs[i].tx_lock); } } + if (alloc_failed) { + for (int j = 0; j < i; j++) { + rte_free(txqs[j].dma_completions); + } + rte_free(txqs); + } + return txqs; } @@ -1314,9 +1546,18 @@ vhost_common_construct(struct netdev *netdev) if (!dev->vhost_rxq_enabled) { return ENOMEM; } - dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM); + + dev->vhost_rxq_async_reg = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM * + sizeof(bool)); + if (!dev->vhost_rxq_async_reg) { + rte_free(dev->vhost_rxq_enabled); + return ENOMEM; + } + + dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM, true); if (!dev->tx_q) { rte_free(dev->vhost_rxq_enabled); + rte_free(dev->vhost_rxq_async_reg); return ENOMEM; } @@ -1353,6 +1594,11 @@ netdev_dpdk_vhost_construct(struct netdev *netdev) /* There is no support for multi-segments buffers. */ dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT; + + /* Enable async copy flag, if explicitly requested. */ + if (dpdk_vhost_async_enabled()) { + dev->vhost_driver_flags |= RTE_VHOST_USER_ASYNC_COPY; + } err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags); if (err) { VLOG_ERR("vhost-user socket device setup failure for socket %s\n", @@ -1436,6 +1682,48 @@ netdev_dpdk_construct(struct netdev *netdev) return err; } +/* Register the vHost async device for a queue. */ +static inline int +netdev_dpdk_vhost_async_reg(const int vid, const int qid, + const int virtq_id, const bool is_rx) +{ + int ret = -1; + + if (OVS_UNLIKELY(vid < 0)) { + return ret; + } + + struct rte_vhost_async_config config = { + .features = RTE_VHOST_ASYNC_INORDER + }; + + ret = rte_vhost_async_channel_register_thread_unsafe(vid, virtq_id, config, + &vhost_async_chnl_ops); + if (ret) { + VLOG_ERR("Async channel register failed for vid: %d, queue: %s%d " + "with status: %d", vid, is_rx ? "rxq" : "txq", qid, ret); + return ret; + } + VLOG_INFO("Async channel register success for vid: %d, queue: %s%d", + vid, is_rx ? "rxq" : "txq", qid); + return ret; +} + +/* Unregister the vHost async channel for a queue. */ +static inline void +netdev_dpdk_vhost_async_unreg(const int vid, const int qid, + const int virtq_id, const bool is_rx) +{ + int ret = rte_vhost_async_channel_unregister_thread_unsafe(vid, virtq_id); + if (ret) { + VLOG_ERR("Async channel unregister failed for vid: %d, queue: %s%d " + "with status: %d", vid, is_rx ? "rxq" : "txq", qid, ret); + return; + } + VLOG_INFO("Async channel unregister success for vid: %d, queue: %s%d", + vid, is_rx ? 
"rxq" : "txq", qid); +} + static void common_destruct(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex) @@ -1532,7 +1820,13 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev) vhost_id = dev->vhost_id; dev->vhost_id = NULL; rte_free(dev->vhost_rxq_enabled); + rte_free(dev->vhost_rxq_async_reg); + if (dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) { + for (int i = 0; i < OVS_VHOST_MAX_QUEUE_NUM; i++) { + rte_free(dev->tx_q[i].dma_completions); + } + } common_destruct(dev); ovs_mutex_unlock(&dpdk_mutex); @@ -2129,6 +2423,26 @@ netdev_dpdk_rxq_alloc(void) return NULL; } +static struct netdev_rxq * +netdev_dpdk_vhost_rxq_alloc(void) +{ + struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx); + if (rx) { + if (dpdk_vhost_async_enabled()) { + rx->dma_compl = dpdk_rte_mzalloc(sizeof(struct dma_completions_t)); + if (rx->dma_compl) { + return &rx->up; + } else { + rte_free(rx); + return NULL; + } + } + return &rx->up; + } + + return NULL; +} + static struct netdev_rxq_dpdk * netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq) { @@ -2161,6 +2475,14 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq) rte_free(rx); } +static void +netdev_dpdk_vhost_rxq_dealloc(struct netdev_rxq *rxq) +{ + struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq); + rte_free(rx->dma_compl); + rte_free(rx); +} + /* Prepare the packet for HWOL. * Return True if the packet is OK to continue. */ static bool @@ -2408,15 +2730,29 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, uint16_t qos_drops = 0; int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ; int vid = netdev_dpdk_get_vid(dev); + int async_inflight = 0; if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || !(dev->flags & NETDEV_UP))) { return EAGAIN; } - nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp, - (struct rte_mbuf **) batch->packets, - NETDEV_MAX_BURST); + if (dev->vhost_rxq_async_reg[rxq->queue_id] + && dmadev_get_device() != DMADEV_ID_INVALID) { + struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq); + *dma_compl_ptr_get() = (uintptr_t) rx->dma_compl; + + nb_rx = rte_vhost_async_try_dequeue_burst(vid, qid, dev->dpdk_mp->mp, + (struct rte_mbuf **) + batch->packets, + NETDEV_MAX_BURST, + &async_inflight); + } else { + nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp, + (struct rte_mbuf **) batch->packets, + NETDEV_MAX_BURST); + } + if (!nb_rx) { return EAGAIN; } @@ -2557,14 +2893,15 @@ static inline void netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, struct dp_packet **packets, int attempted, - struct netdev_dpdk_sw_stats *sw_stats_add) + struct netdev_dpdk_sw_stats *sw_stats_add, + bool is_sent) { int dropped = sw_stats_add->tx_mtu_exceeded_drops + sw_stats_add->tx_qos_drops + sw_stats_add->tx_failure_drops + sw_stats_add->tx_invalid_hwol_drops; struct netdev_stats *stats = &dev->stats; - int sent = attempted - dropped; + int sent = is_sent ? attempted - dropped : 0; int i; stats->tx_packets += sent; @@ -2585,9 +2922,108 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, } } +/* Checks if the dma_completion ring is full. */ +static inline bool +is_compl_ring_full(struct dma_completions_t *dma_compl) +{ + return dma_compl->count == DMA_COMPLETION_RING_SIZE; +} + +/* Checks if the dma_completion ring is empty. 
*/ +static inline bool +is_compl_ring_empty(struct dma_completions_t *dma_compl) +{ + return dma_compl->count == 0; +} + +static inline bool +is_burst_complete(struct dma_completions_t *dma_compl) +{ + if (dma_compl->bursts_completed) { + dma_compl->bursts_completed--; + return true; + } + return false; +} + +/* Free the packets sent via the async data path and + * return -EINPROGRESS if there are more packets to be freed. */ +static int +netdev_dpdk_vhost_async_free(struct netdev *netdev, int qid, bool force) +{ + int ret = 0; + int max_attempt = 100; + uint16_t nr_xfrd_pkts = 0; + struct dp_packet *cmpl_pkts[NETDEV_MAX_BURST]; + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + int vid = netdev_dpdk_get_vid(dev); + struct netdev_dpdk_sw_stats sw_stats_add = {0}; + qid = dev->tx_q[qid].map; + + if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0 + || !(dev->flags & NETDEV_UP))) { + return 0; + } + + do { + if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) { + COVERAGE_INC(vhost_tx_contention); + rte_spinlock_lock(&dev->tx_q[qid].tx_lock); + } + + if (is_compl_ring_empty(dev->tx_q[qid].dma_completions)) { + /* Reset burst counter to 0. */ + dev->tx_q[qid].dma_completions->bursts_completed = 0; + /* No more packets to free, so return. */ + rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); + return 0; + } + + max_attempt--; + COVERAGE_INC(vhost_async_tx_poll); + const uint16_t vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; + *dma_compl_ptr_get() = (uintptr_t) dev->tx_q[qid].dma_completions; + /* Get the completion status of async transfer. */ + nr_xfrd_pkts = rte_vhost_poll_enqueue_completed(vid, vhost_qid, + (struct rte_mbuf **) + cmpl_pkts, + NETDEV_MAX_BURST); + if (!is_burst_complete(dev->tx_q[qid].dma_completions)) { + ret = -EINPROGRESS; + } + + rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); + + if (!nr_xfrd_pkts) { + COVERAGE_INC(vhost_async_tx_poll_empty); + continue; + } + + rte_spinlock_lock(&dev->stats_lock); + dev->sw_stats->tx_async_inflight -= nr_xfrd_pkts; + netdev_dpdk_vhost_update_tx_counters(dev, cmpl_pkts, nr_xfrd_pkts, + &sw_stats_add, true); + rte_spinlock_unlock(&dev->stats_lock); + + for (int i = 0; i < nr_xfrd_pkts; i++) { + dp_packet_delete(cmpl_pkts[i]); + } + } while (force && max_attempt); + return ret; +} + +static inline void +vhost_async_set_end_of_burst(struct dma_completions_t *dma_compl) +{ + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + int prev_slot_idx = (dma_compl->write_idx - 1) & ring_mask; + struct enq_info_t *slot_addr = &(dma_compl->enq_info[prev_slot_idx]); + atomic_store_relaxed(&(slot_addr->end_of_burst), 1); +} + static int __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, - struct dp_packet **pkts, int cnt) + struct dp_packet **pkts, int cnt, bool dpdk_buf) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts; @@ -2597,6 +3033,9 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, int i, retries = 0; int max_retries = VHOST_ENQ_RETRY_MIN; int vid = netdev_dpdk_get_vid(dev); + int free_start_idx = 0; + bool is_async = false; + int ret = 0; qid = dev->tx_q[qid % netdev->n_txq].map; @@ -2628,13 +3067,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true); sw_stats_add.tx_qos_drops -= cnt; + sw_stats_add.tx_async_inflight = 0; n_packets_to_free = cnt; + is_async = dev->tx_q[qid].is_async_reg && dpdk_buf + && (dmadev_get_device() != DMADEV_ID_INVALID); do { int vhost_qid = qid * 
VIRTIO_QNUM + VIRTIO_RXQ; unsigned int tx_pkts; + if (is_async) { + *dma_compl_ptr_get() = (uintptr_t) dev->tx_q[qid].dma_completions; + /* Call the transfer data callback for async transfer.*/ + tx_pkts = rte_vhost_submit_enqueue_burst(vid, vhost_qid, + cur_pkts, cnt); + sw_stats_add.tx_async_inflight += tx_pkts; + } else { + tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); + } - tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); if (OVS_LIKELY(tx_pkts)) { /* Packets have been sent.*/ cnt -= tx_pkts; @@ -2652,23 +3102,31 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, break; } } while (cnt && (retries++ < max_retries)); - + if (sw_stats_add.tx_async_inflight) { + vhost_async_set_end_of_burst(dev->tx_q[qid].dma_completions); + /* Set return to call free for asynchronously sent packets. */ + ret = -EINPROGRESS; + } rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); sw_stats_add.tx_failure_drops = cnt; sw_stats_add.tx_retries = MIN(retries, max_retries); rte_spinlock_lock(&dev->stats_lock); + dev->sw_stats->tx_async_inflight += sw_stats_add.tx_async_inflight; netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets, - &sw_stats_add); + &sw_stats_add, !is_async); rte_spinlock_unlock(&dev->stats_lock); + /* Since dropped packets are at the end of the burst, + * update index to delete the packets dropped in current burst. */ + free_start_idx = sw_stats_add.tx_async_inflight; out: - for (i = 0; i < n_packets_to_free; i++) { + for (i = free_start_idx; i < n_packets_to_free; i++) { dp_packet_delete(pkts[i]); } - return 0; + return ret; } static void @@ -2829,7 +3287,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) if (OVS_LIKELY(txcnt)) { if (dev->type == DPDK_DEV_VHOST) { - ret = __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt); + ret = __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt, false); } else { tx_failure += netdev_dpdk_eth_tx_burst(dev, qid, (struct rte_mbuf **)pkts, @@ -2862,7 +3320,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, dp_packet_delete_batch(batch, true); } else { ret = __netdev_dpdk_vhost_send(netdev, qid, batch->packets, - dp_packet_batch_size(batch)); + dp_packet_batch_size(batch), true); } return ret; } @@ -3258,7 +3716,8 @@ netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev, SW_CSTAT(tx_mtu_exceeded_drops) \ SW_CSTAT(tx_qos_drops) \ SW_CSTAT(rx_qos_drops) \ - SW_CSTAT(tx_invalid_hwol_drops) + SW_CSTAT(tx_invalid_hwol_drops) \ + SW_CSTAT(tx_async_inflight) #define SW_CSTAT(NAME) + 1 custom_stats->size = SW_CSTATS; @@ -3972,6 +4431,282 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev) free(enabled_queues); } +/* Enqueue a packet via DMA. */ +static inline void +vhost_async_dmadev_enqueue_packet(const uint16_t dev_id, + const struct rte_vhost_iov_iter *src_ptr, + const struct rte_vhost_iov_iter *dst_ptr, + const uint16_t nr_segs, + struct enq_info_t *slot_addr, + bool is_rx) +{ + uint16_t seg_idx = 0; + struct enq_info_t *addr = NULL; + uint64_t dma_flags = RTE_DMA_OP_FLAG_LLC; + const uint16_t dmadev_ring_mask = VHOST_ASYNC_DMA_RING_SIZE - 1; + + while (OVS_LIKELY(seg_idx < nr_segs)) { + /* Fetch DMA source start addr. */ + const rte_iova_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base); + const rte_iova_t dma_src_start_addr = src_ptr->offset + s_base; + /* Fetch DMA destination start addr. */ + const rte_iova_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base); + const rte_iova_t dma_dst_start_addr = dst_ptr->offset + d_base; + /* Fetch packet segment length. 
*/ + const uint32_t dma_src_len = src_ptr->iov[seg_idx].iov_len; + /* Check if this segment is the last. */ + if (seg_idx == nr_segs - 1) { + addr = slot_addr; + } + + int enq_index = rte_dmadev_copy(dev_id, + 0, + dma_src_start_addr, + dma_dst_start_addr, + dma_src_len, + dma_flags); + if (OVS_UNLIKELY(enq_index < 0)) { + break; + } + dmadev_enq_track[dev_id][enq_index & dmadev_ring_mask] = (void *) addr; + is_rx ? COVERAGE_INC(vhost_async_rx_enqueue) : + COVERAGE_INC(vhost_async_tx_enqueue); + seg_idx++; + } +} + +/* Enqueue a packet through SW copy. */ +static inline void +sw_enqueue_packet(const struct rte_vhost_iov_iter *src_ptr, + const struct rte_vhost_iov_iter *dst_ptr, + const uint16_t nr_segs, + bool is_rx) +{ + uint16_t seg_idx = 0; + + while (OVS_LIKELY(seg_idx < nr_segs)) { + /* Fetch source start address. */ + const uintptr_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base); + const uintptr_t src_start_addr = src_ptr->offset + s_base; + /* Fetch destination start address. */ + const uintptr_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base); + const uintptr_t dst_start_addr = dst_ptr->offset + d_base; + /* Fetch segment length. */ + const size_t src_len = src_ptr->iov[seg_idx].iov_len; + + rte_memcpy((void *) dst_start_addr, + (void *) src_start_addr, + src_len); + is_rx ? COVERAGE_INC(vhost_async_rx_enqueue_sw_fallback) : + COVERAGE_INC(vhost_async_tx_enqueue_sw_fallback); + seg_idx++; + } +} + +/* Fetch the slot address for a packet. */ +static inline struct enq_info_t * +compl_slot_get_and_inc(struct dma_completions_t *dma_compl) +{ + struct enq_info_t *slot_addr + = &(dma_compl->enq_info[dma_compl->write_idx]); + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + + dma_compl->write_idx++; + dma_compl->write_idx &= ring_mask; + dma_compl->count++; + return slot_addr; +} + +/* Calculate packets sent for a txq by parsing dma_completion ring. */ +static inline uint32_t +count_completed_packets(struct dma_completions_t *dma_compl, + const bool is_rx, + const int max_pkts) +{ + uint32_t pkts; + int count = dma_compl->count; + int read_idx = dma_compl->read_idx; + uint8_t pkt_rcvd = 0, end_of_burst = 0; + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + + for (pkts = 0; (pkts < max_pkts) && (count > 0); pkts++) { + read_idx &= ring_mask; + atomic_read_relaxed(&(dma_compl->enq_info[read_idx].pkt_rcvd), + &pkt_rcvd); + if (!pkt_rcvd) { + break; + } + if (!is_rx) { + atomic_read_relaxed(&dma_compl->enq_info[read_idx].end_of_burst, + &end_of_burst); + dma_compl->bursts_completed += end_of_burst; + atomic_store_relaxed(&(dma_compl->enq_info[read_idx].end_of_burst), + 0); + } + + atomic_store_relaxed(&(dma_compl->enq_info[read_idx].pkt_rcvd), 0); + + count--; + read_idx++; + } + dma_compl->count = count; + dma_compl->read_idx = read_idx; + return pkts; +} + +/* vHost async callback to offload enqueue via DMA. 
*/ +static int32_t +vhost_async_dmadev_transfer_data_cb(int vid OVS_UNUSED, + uint16_t virtq_qid, + struct rte_vhost_async_desc *descs, + struct rte_vhost_async_status *opaque_data, + uint16_t count) +{ + uint16_t desc_idx = 0; + struct enq_info_t *slot_addr = NULL; + struct dma_completions_t *compl = NULL; + bool is_rx = (virtq_qid % VIRTIO_QNUM) == VIRTIO_TXQ; + + ovs_assert(opaque_data == NULL); + + compl = (struct dma_completions_t *)*dma_compl_ptr_get(); + if (is_compl_ring_full(compl)) { + if (is_rx) { + COVERAGE_ADD(vhost_async_rx_compl_ring_full_drops, count); + } else { + COVERAGE_ADD(vhost_async_tx_compl_ring_full_drops, count); + } + goto out; + } + + /* Fetch the dmadev id assigned to the current thread. */ + uint16_t dev_id = dmadev_get_device(); + /* Cache space left in DMA ring to avoid driver call for every packet. */ + /* Similar to following API will be added in the next revisions of DMADEV. + Comment out for now.*/ + /* uint16_t dmadev_space_left = rte_dmadev_burst_capacity(dev_id, 0); */ + const int compl_space_left = DMA_COMPLETION_RING_SIZE - compl->count; + if (count > compl_space_left) { + if (is_rx) { + COVERAGE_ADD(vhost_async_rx_compl_ring_full_drops, + count - compl_space_left); + } else { + COVERAGE_ADD(vhost_async_tx_compl_ring_full_drops, + count - compl_space_left); + } + count = compl_space_left; + } + + while (desc_idx < count) { + const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src; + const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst; + const uint16_t nr_segs = src_ptr->nr_segs; + /* + if (dmadev_space_left < nr_segs) { + if (is_rx) { + COVERAGE_INC(vhost_async_rx_dma_ring_full); + } else { + COVERAGE_INC(vhost_async_tx_dma_ring_full); + } + goto ring_doorbell; + } + */ + slot_addr = compl_slot_get_and_inc(compl); + vhost_async_dmadev_enqueue_packet(dev_id, src_ptr, dst_ptr, + nr_segs, slot_addr, is_rx); + /* dmadev_space_left -= nr_segs;*/ + desc_idx++; + } +/* +ring_doorbell: +*/ + if (desc_idx != 0) { + /* Ring the doorbell. */ + rte_dmadev_submit(dev_id, 0); + } + + /* Do software copy for packets that do no fit in the DMA ring. */ + while (desc_idx < count) { + const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src; + const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst; + slot_addr = compl_slot_get_and_inc(compl); + sw_enqueue_packet(src_ptr, dst_ptr, src_ptr->nr_segs, is_rx); + atomic_store_relaxed(&(slot_addr->pkt_rcvd), 1); + desc_idx++; + } + +out: + return desc_idx; +} + +/* vHost async callback to query transfer status of DMA. */ +static int32_t +vhost_async_dmadev_check_completed_copies_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_status + *opaque_data, + uint16_t max_pkts) +{ + bool error; + uint16_t last_idx; + uint32_t nr_pkts = 0; + struct enq_info_t *slots; + struct dma_completions_t *compl = NULL; + bool is_rx = (virtq_qid % VIRTIO_QNUM) == VIRTIO_TXQ; + const uint16_t dmadev_ring_mask = VHOST_ASYNC_DMA_RING_SIZE - 1; + const uint8_t max_copies = NETDEV_MAX_BURST * vhost_dma_poll_factor; + + ovs_assert(opaque_data == NULL); + + compl = (struct dma_completions_t *)*dma_compl_ptr_get(); + if (OVS_UNLIKELY(is_compl_ring_empty(compl))) { + if (is_rx) { + COVERAGE_INC(vhost_async_rx_compl_ring_empty); + } else { + COVERAGE_INC(vhost_async_tx_compl_ring_empty); + } + goto out; + } + + /* Fetch the dmadev id assigned to the current thread. */ + uint16_t dev_id = dmadev_get_device(); + + /* Check the completion status of DMA. 
*/ + const uint16_t ret_segs = rte_dmadev_completed(dev_id, + 0, + max_copies, + &last_idx, + &error); + if (OVS_UNLIKELY(error)) { + VLOG_WARN_RL(&rl,"rte_dmadev_completed returned error for dev id: %u" + "with vid: %d, qid %u", dev_id, vid, + virtq_qid/ VIRTIO_QNUM); + return -1; + } + /* Compute the start index. */ + uint16_t pkt_idx = (last_idx - ret_segs + 1) & dmadev_ring_mask; + for (int i = 0; i < ret_segs; i++) { + slots = (struct enq_info_t *) dmadev_enq_track[dev_id][pkt_idx]; + if (slots) { + /* Mark the packet slot as received. + * The slot could belong to another queue but writes are atomic. */ + atomic_store_relaxed(&(slots->pkt_rcvd), 1); + } + pkt_idx = (pkt_idx + 1) & dmadev_ring_mask; + } + /* Calculate packets successfully DMA'ed from this virtqueue. */ + nr_pkts = count_completed_packets(compl, is_rx, max_pkts); + if (is_rx) { + COVERAGE_INC(vhost_async_rx_poll); + if (!nr_pkts) { + COVERAGE_INC(vhost_async_rx_poll_empty); + } + } + +out: + return nr_pkts; +} + /* * A new virtio-net device is added to a vhost port. */ @@ -4075,6 +4810,8 @@ destroy_device(int vid) ovsrcu_index_set(&dev->vid, -1); memset(dev->vhost_rxq_enabled, 0, dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled); + memset(dev->vhost_rxq_enabled, 0, + dev->up.n_rxq * sizeof *dev->vhost_rxq_async_reg); netdev_dpdk_txq_map_clear(dev); netdev_change_seq_changed(&dev->up); @@ -4122,13 +4859,42 @@ vring_state_changed(int vid, uint16_t queue_id, int enable) bool old_state = dev->vhost_rxq_enabled[qid]; dev->vhost_rxq_enabled[qid] = enable != 0; + if (enable) { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && !dev->vhost_rxq_async_reg[qid]) { + if (!netdev_dpdk_vhost_async_reg(vid, qid, queue_id, + is_rx)) { + dev->vhost_rxq_async_reg[qid] = true; + } + } + } else { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && dev->vhost_rxq_async_reg[qid]) { + netdev_dpdk_vhost_async_unreg(vid, qid, queue_id, + is_rx); + dev->vhost_rxq_async_reg[qid] = false; + } + } if (old_state != dev->vhost_rxq_enabled[qid]) { netdev_change_seq_changed(&dev->up); } } else { if (enable) { dev->tx_q[qid].map = qid; + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && !dev->tx_q[qid].is_async_reg) { + if (!netdev_dpdk_vhost_async_reg(vid, qid, queue_id, + is_rx)) { + dev->tx_q[qid].is_async_reg = true; + } + } } else { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && dev->tx_q[qid].is_async_reg) { + netdev_dpdk_vhost_async_unreg(vid, qid, queue_id, + is_rx); + dev->tx_q[qid].is_async_reg = false; + } dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED; } netdev_dpdk_remap_txqs(dev); @@ -5036,7 +5802,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) */ dev->requested_hwaddr = dev->hwaddr; - dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq); + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq, false); if (!dev->tx_q) { err = ENOMEM; } @@ -5135,6 +5901,11 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT; } + /* Enable async copy flag, if explicitly requested. */ + if (dpdk_vhost_async_enabled()) { + vhost_flags |= RTE_VHOST_USER_ASYNC_COPY; + } + /* Enable External Buffers if TCP Segmentation Offload is enabled. 
*/ if (userspace_tso_enabled()) { vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT; @@ -5442,10 +6213,8 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, .queue_dump_next = netdev_dpdk_queue_dump_next, \ .queue_dump_done = netdev_dpdk_queue_dump_done, \ .update_flags = netdev_dpdk_update_flags, \ - .rxq_alloc = netdev_dpdk_rxq_alloc, \ .rxq_construct = netdev_dpdk_rxq_construct, \ - .rxq_destruct = netdev_dpdk_rxq_destruct, \ - .rxq_dealloc = netdev_dpdk_rxq_dealloc + .rxq_destruct = netdev_dpdk_rxq_destruct #define NETDEV_DPDK_CLASS_BASE \ NETDEV_DPDK_CLASS_COMMON, \ @@ -5458,7 +6227,9 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, .get_features = netdev_dpdk_get_features, \ .get_status = netdev_dpdk_get_status, \ .reconfigure = netdev_dpdk_reconfigure, \ - .rxq_recv = netdev_dpdk_rxq_recv + .rxq_recv = netdev_dpdk_rxq_recv, \ + .rxq_alloc = netdev_dpdk_rxq_alloc, \ + .rxq_dealloc = netdev_dpdk_rxq_dealloc static const struct netdev_class dpdk_class = { .type = "dpdk", @@ -5474,7 +6245,7 @@ static const struct netdev_class dpdk_vhost_class = { .construct = netdev_dpdk_vhost_construct, .destruct = netdev_dpdk_vhost_destruct, .send = netdev_dpdk_vhost_send, - .process_async = NULL, + .process_async = netdev_dpdk_vhost_async_free, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, .get_custom_stats = netdev_dpdk_get_sw_custom_stats, @@ -5482,6 +6253,8 @@ static const struct netdev_class dpdk_vhost_class = { .reconfigure = netdev_dpdk_vhost_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, .rxq_enabled = netdev_dpdk_vhost_rxq_enabled, + .rxq_alloc = netdev_dpdk_vhost_rxq_alloc, + .rxq_dealloc = netdev_dpdk_vhost_rxq_dealloc, }; static const struct netdev_class dpdk_vhost_client_class = { @@ -5491,7 +6264,7 @@ static const struct netdev_class dpdk_vhost_client_class = { .destruct = netdev_dpdk_vhost_destruct, .set_config = netdev_dpdk_vhost_client_set_config, .send = netdev_dpdk_vhost_send, - .process_async = NULL, + .process_async = netdev_dpdk_vhost_async_free, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, .get_custom_stats = netdev_dpdk_get_sw_custom_stats, @@ -5499,6 +6272,8 @@ static const struct netdev_class dpdk_vhost_client_class = { .reconfigure = netdev_dpdk_vhost_client_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, .rxq_enabled = netdev_dpdk_vhost_rxq_enabled, + .rxq_alloc = netdev_dpdk_vhost_rxq_alloc, + .rxq_dealloc = netdev_dpdk_vhost_rxq_dealloc, }; void diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 699be3fb4..690b0b830 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -20,6 +20,7 @@ #include <config.h> #include "openvswitch/compiler.h" +#include "ovs-thread.h" struct dp_packet; struct netdev; @@ -27,6 +28,25 @@ struct netdev; #ifdef DPDK_NETDEV #include <rte_flow.h> +/* For vHost async datapath, dmadev id alloation is per dataplane thread. 
*/
+DECLARE_EXTERN_PER_THREAD_DATA(uint16_t, dmadev_id);
+DECLARE_EXTERN_PER_THREAD_DATA(uintptr_t, dma_compl_ptr);
+
+#define DMADEV_ID_UNASSIGNED UINT16_MAX
+#define DMADEV_ID_INVALID (UINT16_MAX - 1)
+
+uint16_t dmadev_id_init(void);
+
+static inline uint16_t
+dmadev_get_device(void)
+{
+    uint16_t id = *dmadev_id_get();
+
+    if (id == DMADEV_ID_UNASSIGNED) {
+        id = dmadev_id_init();
+    }
+    return id;
+}
 void netdev_dpdk_register(void);
 void free_dpdk_buf(struct dp_packet *);
-- 
2.25.1
