This patch adds initial support for the DPDK vHost async APIs, offloading the memory copy operations of the vhost data path to DMA hardware.
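The feature is off by default and is gated by the new 'vhost-async-support' other_config key introduced in lib/dpdk.c. A minimal sketch of enabling it (assuming the usual other_config workflow; only the option name comes from this patch) would be:

    $ ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true
    $ ovs-vsctl set Open_vSwitch . other_config:vhost-async-support=true

As the dpdk.c hunk below shows, enabling async copy disables vhost post-copy and IOMMU support for vhost-user-client and requires the EAL to run in IOVA-as-VA mode; otherwise async support is turned back off at init time.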
Signed-off-by: Sunil Pai G <[email protected]> --- lib/dpdk-stub.c | 6 + lib/dpdk.c | 39 +++ lib/dpdk.h | 1 + lib/netdev-dpdk.c | 823 ++++++++++++++++++++++++++++++++++++++++++++-- lib/netdev-dpdk.h | 20 ++ 5 files changed, 865 insertions(+), 24 deletions(-) diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index b7d577870..5404e2056 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -62,6 +62,12 @@ dpdk_vhost_postcopy_enabled(void) return false; } +bool +dpdk_vhost_async_enabled(void) +{ + return false; +} + bool dpdk_per_port_memory(void) { diff --git a/lib/dpdk.c b/lib/dpdk.c index b2ef31cd2..9736283a2 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -54,6 +54,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization * of DPDK. */ static bool per_port_memory = false; /* Status of per port memory support */ +static bool vhost_async_copy_enabled = false; /* Status of vhost async + support. */ + static int process_vhost_flags(char *flag, const char *default_val, int size, const struct smap *ovs_other_config, @@ -397,6 +400,22 @@ dpdk_init__(const struct smap *ovs_other_config) VLOG_INFO("POSTCOPY support for vhost-user-client %s.", vhost_postcopy_enabled ? "enabled" : "disabled"); + vhost_async_copy_enabled = smap_get_bool(ovs_other_config, + "vhost-async-support", false); + if (vhost_async_copy_enabled) { + if (vhost_postcopy_enabled) { + VLOG_WARN("Async-copy and post-copy are not compatible " + "for vhost-user-client. Disabling POSTCOPY support."); + vhost_postcopy_enabled = false; + } + + if (vhost_iommu_enabled) { + vhost_iommu_enabled = false; + VLOG_WARN("Async copy is not compatible with IOMMU support for" + " vhost-user-client. IOMMU support disabled."); + } + } + per_port_memory = smap_get_bool(ovs_other_config, "per-port-memory", false); VLOG_INFO("Per port memory for DPDK devices %s.", @@ -456,6 +475,20 @@ dpdk_init__(const struct smap *ovs_other_config) /* Make sure things are initialized ... */ result = rte_eal_init(args.n, argv); + if (vhost_async_copy_enabled) { + /* TODO: Remove this check ? if PA , disable SW fallback ? */ + if (rte_eal_iova_mode() != RTE_IOVA_VA) { + VLOG_WARN("Async-copy for vhost-user-client requires IOVA as VA to" + " be enabled. 
Async support disabled."); + vhost_async_copy_enabled = false; + } else { + VLOG_INFO("Async support enabled for vhost-user-client."); + } + } else { + VLOG_INFO("Async support disabled for vhost-user-client."); + } + + free(argv); svec_destroy(&args); @@ -559,6 +592,12 @@ dpdk_vhost_postcopy_enabled(void) return vhost_postcopy_enabled; } +bool +dpdk_vhost_async_enabled(void) +{ + return vhost_async_copy_enabled; +} + bool dpdk_per_port_memory(void) { diff --git a/lib/dpdk.h b/lib/dpdk.h index 445a51d06..626d0f368 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -40,6 +40,7 @@ void dpdk_set_lcore_id(unsigned cpu); const char *dpdk_get_vhost_sock_dir(void); bool dpdk_vhost_iommu_enabled(void); bool dpdk_vhost_postcopy_enabled(void); +bool dpdk_vhost_async_enabled(void); bool dpdk_per_port_memory(void); bool dpdk_available(void); void print_dpdk_version(void); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1e6c5a92c..9fc53e5e2 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -29,6 +29,7 @@ #include <rte_bus_pci.h> #include <rte_config.h> #include <rte_cycles.h> +#include <rte_dmadev.h> #include <rte_errno.h> #include <rte_ethdev.h> #include <rte_flow.h> @@ -38,6 +39,7 @@ #include <rte_pci.h> #include <rte_version.h> #include <rte_vhost.h> +#include <rte_vhost_async.h> #include "cmap.h" #include "coverage.h" @@ -76,6 +78,23 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); COVERAGE_DEFINE(vhost_tx_contention); COVERAGE_DEFINE(vhost_notification); +COVERAGE_DEFINE(vhost_async_tx_poll); +COVERAGE_DEFINE(vhost_async_tx_poll_empty); +COVERAGE_DEFINE(vhost_async_rx_poll); +COVERAGE_DEFINE(vhost_async_rx_poll_empty); + +COVERAGE_DEFINE(vhost_async_tx_dma_ring_full); +COVERAGE_DEFINE(vhost_async_tx_compl_ring_full_drops); +COVERAGE_DEFINE(vhost_async_tx_compl_ring_empty); +COVERAGE_DEFINE(vhost_async_rx_dma_ring_full); +COVERAGE_DEFINE(vhost_async_rx_compl_ring_full_drops); +COVERAGE_DEFINE(vhost_async_rx_compl_ring_empty); +COVERAGE_DEFINE(vhost_async_rx_enqueue); +COVERAGE_DEFINE(vhost_async_tx_enqueue); +COVERAGE_DEFINE(vhost_async_tx_enqueue_sw_fallback); +COVERAGE_DEFINE(vhost_async_rx_enqueue_sw_fallback); + + #define DPDK_PORT_WATCHDOG_INTERVAL 5 @@ -145,6 +164,9 @@ typedef uint16_t dpdk_port_t; #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) +/* vHost async DMA ring size. */ +#define VHOST_ASYNC_DMA_RING_SIZE 4096 + /* List of required flags advertised by the hardware that will be used * if TSO is enabled. Ideally this should include DEV_TX_OFFLOAD_SCTP_CKSUM. * However, very few drivers supports that the moment and SCTP is not a @@ -192,6 +214,166 @@ static const struct vhost_device_ops virtio_net_device_ops = .guest_notified = vhost_guest_notified, }; +/* + * vHost async callbacks to transfer packets via DMA + * and query the transfer status. + */ +static int32_t +vhost_async_dmadev_transfer_data_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_desc *descs, + struct rte_vhost_async_status *opaque_data, + uint16_t count); +static int32_t +vhost_async_dmadev_check_completed_copies_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_status + *opaque_data, + uint16_t max_packets); + +/* Async channel operations for vhost ports. */ +static struct rte_vhost_async_channel_ops vhost_async_chnl_ops = { + .transfer_data = vhost_async_dmadev_transfer_data_cb, + .check_completed_copies = vhost_async_dmadev_check_completed_copies_cb +}; + +/* Set the poll factor for large packets based on max number of segments + * the packet could have. 
Compute this based on the MBUF size and + * the max MTU supported in OVS. */ +static const uint8_t vhost_dma_poll_factor = + (NETDEV_DPDK_MAX_PKT_LEN/ RTE_MBUF_DEFAULT_BUF_SIZE) + + ((NETDEV_DPDK_MAX_PKT_LEN % RTE_MBUF_DEFAULT_BUF_SIZE) != 0); + +DEFINE_EXTERN_PER_THREAD_DATA(dmadev_id, DMADEV_ID_UNASSIGNED); +DEFINE_EXTERN_PER_THREAD_DATA(dma_compl_ptr, 0); + +static struct ovs_mutex dmadev_mutex = OVS_MUTEX_INITIALIZER; + +static bool dmadev_devices_used[RTE_DMADEV_MAX_DEVS]; + +static inline uint16_t +dmadev_find_free_dev(int pmd_numa_id, struct rte_dmadev_info *dev_info) +{ + uint16_t dmadev_id = 0; + int other_numa_dmadev_id = DMADEV_ID_INVALID; + uint64_t capab = RTE_DMADEV_CAPA_MEM_TO_MEM | RTE_DMADEV_CAPA_OPS_COPY; + + for (dmadev_id = 0; dmadev_id < rte_dmadev_count(); dmadev_id++) { + if (!dmadev_devices_used[dmadev_id] && + !rte_dmadev_info_get(dmadev_id, dev_info)) { + /* DMA device must be capable of : + * MEM to MEM transfer + * Support COPY operation + * have atleast 1 virtual channel. */ + if (!((dev_info->dev_capa & capab) && dev_info->max_vchans >= 1)) { + continue; + } + + if (dev_info->device->numa_node == pmd_numa_id) { + dmadev_devices_used[dmadev_id] = true; + return dmadev_id; + } else if (other_numa_dmadev_id == DMADEV_ID_INVALID) { + other_numa_dmadev_id = dmadev_id; + } + } + } + + if (other_numa_dmadev_id != DMADEV_ID_INVALID) { + /* No DMA device found on same NUMA, hence + * allocating an available DMA from other NUMA. */ + dmadev_devices_used[other_numa_dmadev_id] = true; + rte_dmadev_info_get(other_numa_dmadev_id, dev_info); + return other_numa_dmadev_id; + } + + return DMADEV_ID_INVALID; +} + +static void * dpdk_rte_mzalloc(size_t sz); + +/* DMADEV enqueue tracking ring. */ +static void **dmadev_enq_track[RTE_DMADEV_MAX_DEVS]; + +static uint16_t +dmadev_get_free_dev(int pmd_numa_id) +{ + uint16_t dmadev_id; + struct rte_dmadev_info dev_info = {0}; + struct rte_dmadev_conf dev_conf = {0}; + struct rte_dmadev_vchan_conf vchan_conf = {0}; + size_t ring_size = VHOST_ASYNC_DMA_RING_SIZE; + size_t dma_track_ring_size = sizeof(void *) * ring_size; + + dmadev_id = dmadev_find_free_dev(pmd_numa_id, &dev_info); + if (dmadev_id == DMADEV_ID_INVALID) { + VLOG_INFO("No available DMA device found for vhost async copy " + "offload for this pmd."); + return dmadev_id; + } + + /* Configure the device. */ + dev_conf.nb_vchans = 1; + dev_conf.enable_silent = false; + int ret = rte_dmadev_configure(dmadev_id, &dev_conf); + if (OVS_UNLIKELY(ret)) { + VLOG_ERR("Configure failed for DMA device %s with dev id: %u" + " while assigning to pmd for vhost async copy offload.", + dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + } else { + vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM; + vchan_conf.nb_desc = VHOST_ASYNC_DMA_RING_SIZE; + ret = rte_dmadev_vchan_setup(dmadev_id, 0, &vchan_conf); + if (ret < 0) { + VLOG_ERR("Virtual channel setup failed with err %d for" + " DMA device %s with dev id: %d", + ret, dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + goto out; + } + /* TODO: Free this when thread exits ? 
but how ?*/ + dmadev_enq_track[dmadev_id] = dpdk_rte_mzalloc(dma_track_ring_size); + if (!dmadev_enq_track[dmadev_id]) { + VLOG_ERR("Failed to allocate memory for enqueue tracking ring of" + " DMA device %s with dev id: %d", + dev_info.device->name, dmadev_id); + dmadev_id = DMADEV_ID_INVALID; + goto out; + } + + rte_dmadev_start(dmadev_id); + if (dev_info.device->numa_node != pmd_numa_id) { + VLOG_WARN("No available DMA device found on numa node %d," + " assigning %s with dev id: %d on numa %d to pmd for" + " vhost async copy offload.", + pmd_numa_id, dev_info.device->name, dmadev_id, + dev_info.device->numa_node); + return dmadev_id; + } + VLOG_INFO("DMA device %s with dev id: %d assigned to pmd for vhost" + " async copy offload.", dev_info.device->name, dmadev_id); + } + +out: + return dmadev_id; +} + +uint16_t +dmadev_id_init(void) +{ + uint16_t new_id; + unsigned int pmd_core_id = RTE_PER_LCORE(_lcore_id); + + new_id = *dmadev_id_get(); + + ovs_assert(new_id == DMADEV_ID_UNASSIGNED); + ovs_mutex_lock(&dmadev_mutex); + new_id = dmadev_get_free_dev(ovs_numa_get_numa_id(pmd_core_id)); + ovs_mutex_unlock(&dmadev_mutex); + + return *dmadev_id_get() = new_id; +} + /* Custom software stats for dpdk ports */ struct netdev_dpdk_sw_stats { /* No. of retries when unable to transmit. */ @@ -206,6 +388,8 @@ struct netdev_dpdk_sw_stats { uint64_t rx_qos_drops; /* Packet drops in HWOL processing. */ uint64_t tx_invalid_hwol_drops; + /* No. of packets pending to be tx'ed by async device. */ + uint64_t tx_async_inflight; }; enum dpdk_dev_type { @@ -371,6 +555,27 @@ struct dpdk_mp { struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex); }; +/* Tracking information for DMA in vhost async usecase. */ +struct enq_info_t{ + atomic_uint8_t pkt_rcvd; + atomic_uint8_t end_of_burst; +}; + +#define DMA_COMPLETION_RING_SIZE VHOST_ASYNC_DMA_RING_SIZE + +/* DMA completion tracking ring to report the packets + * back to the vhost library in_order. + * Note: The write's to the enq_info array should be atomic + * to guarantee correct behaviour. */ +struct dma_completions_t { + struct enq_info_t enq_info[DMA_COMPLETION_RING_SIZE]; + uint16_t count; + uint16_t read_idx; + uint16_t write_idx; + /* Bursts completed but not reported to above layer for completion. */ + uint16_t bursts_completed; +}; + /* There should be one 'struct dpdk_tx_queue' created for * each netdev tx queue. */ struct dpdk_tx_queue { @@ -380,6 +585,13 @@ struct dpdk_tx_queue { * It is used only if the queue is shared among different pmd threads * (see 'concurrent_txq'). */ rte_spinlock_t tx_lock; + + /* vHost asynchronous channel registration status. */ + bool is_async_reg; + + /* DMA enqueue tracker to maintain in_order reporting of packets. */ + struct dma_completions_t *dma_completions; + /* Mapping of configured vhost-user queue to enabled by guest. */ int map; ); @@ -474,6 +686,8 @@ struct netdev_dpdk { /* Array of vhost rxq states, see vring_state_changed. */ bool *vhost_rxq_enabled; + /* Array of vhost rxq async registration status. 
*/ + bool *vhost_rxq_async_reg; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -533,6 +747,7 @@ struct netdev_dpdk { struct netdev_rxq_dpdk { struct netdev_rxq up; dpdk_port_t port_id; + struct dma_completions_t *dma_compl; }; static void netdev_dpdk_destruct(struct netdev *netdev); @@ -1200,20 +1415,37 @@ netdev_dpdk_alloc(void) } static struct dpdk_tx_queue * -netdev_dpdk_alloc_txq(unsigned int n_txqs) +netdev_dpdk_alloc_txq(unsigned int n_txqs, bool is_vhost) { struct dpdk_tx_queue *txqs; - unsigned i; + unsigned i = 0; + const size_t dma_compl_size = sizeof(struct dma_completions_t); + const bool is_vhost_async = is_vhost && dpdk_vhost_async_enabled(); + bool alloc_failed = false; txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs); if (txqs) { for (i = 0; i < n_txqs; i++) { + if (is_vhost_async) { + txqs[i].dma_completions = dpdk_rte_mzalloc(dma_compl_size); + if (!txqs[i].dma_completions) { + alloc_failed = true; + break; + } + } /* Initialize map for vhost devices. */ txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN; rte_spinlock_init(&txqs[i].tx_lock); } } + if (alloc_failed) { + for (int j = 0; j < i; j++) { + rte_free(txqs[j].dma_completions); + } + rte_free(txqs); + } + return txqs; } @@ -1314,9 +1546,18 @@ vhost_common_construct(struct netdev *netdev) if (!dev->vhost_rxq_enabled) { return ENOMEM; } - dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM); + + dev->vhost_rxq_async_reg = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM * + sizeof(bool)); + if (!dev->vhost_rxq_async_reg) { + rte_free(dev->vhost_rxq_enabled); + return ENOMEM; + } + + dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM, true); if (!dev->tx_q) { rte_free(dev->vhost_rxq_enabled); + rte_free(dev->vhost_rxq_async_reg); return ENOMEM; } @@ -1353,6 +1594,11 @@ netdev_dpdk_vhost_construct(struct netdev *netdev) /* There is no support for multi-segments buffers. */ dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT; + + /* Enable async copy flag, if explicitly requested. */ + if (dpdk_vhost_async_enabled()) { + dev->vhost_driver_flags |= RTE_VHOST_USER_ASYNC_COPY; + } err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags); if (err) { VLOG_ERR("vhost-user socket device setup failure for socket %s\n", @@ -1436,6 +1682,48 @@ netdev_dpdk_construct(struct netdev *netdev) return err; } +/* Register the vHost async device for a queue. */ +static inline int +netdev_dpdk_vhost_async_reg(const int vid, const int qid, + const int virtq_id, const bool is_rx) +{ + int ret = -1; + + if (OVS_UNLIKELY(vid < 0)) { + return ret; + } + + struct rte_vhost_async_config config = { + .features = RTE_VHOST_ASYNC_INORDER + }; + + ret = rte_vhost_async_channel_register_thread_unsafe(vid, virtq_id, config, + &vhost_async_chnl_ops); + if (ret) { + VLOG_ERR("Async channel register failed for vid: %d, queue: %s%d " + "with status: %d", vid, is_rx ? "rxq" : "txq", qid, ret); + return ret; + } + VLOG_INFO("Async channel register success for vid: %d, queue: %s%d", + vid, is_rx ? "rxq" : "txq", qid); + return ret; +} + +/* Unregister the vHost async channel for a queue. */ +static inline void +netdev_dpdk_vhost_async_unreg(const int vid, const int qid, + const int virtq_id, const bool is_rx) +{ + int ret = rte_vhost_async_channel_unregister_thread_unsafe(vid, virtq_id); + if (ret) { + VLOG_ERR("Async channel unregister failed for vid: %d, queue: %s%d " + "with status: %d", vid, is_rx ? "rxq" : "txq", qid, ret); + return; + } + VLOG_INFO("Async channel unregister success for vid: %d, queue: %s%d", + vid, is_rx ? 
"rxq" : "txq", qid); +} + static void common_destruct(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex) @@ -1532,7 +1820,13 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev) vhost_id = dev->vhost_id; dev->vhost_id = NULL; rte_free(dev->vhost_rxq_enabled); + rte_free(dev->vhost_rxq_async_reg); + if (dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) { + for (int i = 0; i < OVS_VHOST_MAX_QUEUE_NUM; i++) { + rte_free(dev->tx_q[i].dma_completions); + } + } common_destruct(dev); ovs_mutex_unlock(&dpdk_mutex); @@ -2129,6 +2423,26 @@ netdev_dpdk_rxq_alloc(void) return NULL; } +static struct netdev_rxq * +netdev_dpdk_vhost_rxq_alloc(void) +{ + struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx); + if (rx) { + if (dpdk_vhost_async_enabled()) { + rx->dma_compl = dpdk_rte_mzalloc(sizeof(struct dma_completions_t)); + if (rx->dma_compl) { + return &rx->up; + } else { + rte_free(rx); + return NULL; + } + } + return &rx->up; + } + + return NULL; +} + static struct netdev_rxq_dpdk * netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq) { @@ -2161,6 +2475,14 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq) rte_free(rx); } +static void +netdev_dpdk_vhost_rxq_dealloc(struct netdev_rxq *rxq) +{ + struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq); + rte_free(rx->dma_compl); + rte_free(rx); +} + /* Prepare the packet for HWOL. * Return True if the packet is OK to continue. */ static bool @@ -2408,15 +2730,29 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, uint16_t qos_drops = 0; int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ; int vid = netdev_dpdk_get_vid(dev); + int async_inflight = 0; if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || !(dev->flags & NETDEV_UP))) { return EAGAIN; } - nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp, - (struct rte_mbuf **) batch->packets, - NETDEV_MAX_BURST); + if (dev->vhost_rxq_async_reg[rxq->queue_id] + && dmadev_get_device() != DMADEV_ID_INVALID) { + struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq); + *dma_compl_ptr_get() = (uintptr_t) rx->dma_compl; + + nb_rx = rte_vhost_async_try_dequeue_burst(vid, qid, dev->dpdk_mp->mp, + (struct rte_mbuf **) + batch->packets, + NETDEV_MAX_BURST, + &async_inflight); + } else { + nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp, + (struct rte_mbuf **) batch->packets, + NETDEV_MAX_BURST); + } + if (!nb_rx) { return EAGAIN; } @@ -2557,14 +2893,15 @@ static inline void netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, struct dp_packet **packets, int attempted, - struct netdev_dpdk_sw_stats *sw_stats_add) + struct netdev_dpdk_sw_stats *sw_stats_add, + bool is_sent) { int dropped = sw_stats_add->tx_mtu_exceeded_drops + sw_stats_add->tx_qos_drops + sw_stats_add->tx_failure_drops + sw_stats_add->tx_invalid_hwol_drops; struct netdev_stats *stats = &dev->stats; - int sent = attempted - dropped; + int sent = is_sent ? attempted - dropped : 0; int i; stats->tx_packets += sent; @@ -2585,9 +2922,108 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, } } +/* Checks if the dma_completion ring is full. */ +static inline bool +is_compl_ring_full(struct dma_completions_t *dma_compl) +{ + return dma_compl->count == DMA_COMPLETION_RING_SIZE; +} + +/* Checks if the dma_completion ring is empty. 
*/ +static inline bool +is_compl_ring_empty(struct dma_completions_t *dma_compl) +{ + return dma_compl->count == 0; +} + +static inline bool +is_burst_complete(struct dma_completions_t *dma_compl) +{ + if (dma_compl->bursts_completed) { + dma_compl->bursts_completed--; + return true; + } + return false; +} + +/* Free the packets sent via the async data path and + * return -EINPROGRESS if there are more packets to be freed. */ +static int +netdev_dpdk_vhost_async_free(struct netdev *netdev, int qid, bool force) +{ + int ret = 0; + int max_attempt = 100; + uint16_t nr_xfrd_pkts = 0; + struct dp_packet *cmpl_pkts[NETDEV_MAX_BURST]; + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + int vid = netdev_dpdk_get_vid(dev); + struct netdev_dpdk_sw_stats sw_stats_add = {0}; + qid = dev->tx_q[qid].map; + + if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0 + || !(dev->flags & NETDEV_UP))) { + return 0; + } + + do { + if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) { + COVERAGE_INC(vhost_tx_contention); + rte_spinlock_lock(&dev->tx_q[qid].tx_lock); + } + + if (is_compl_ring_empty(dev->tx_q[qid].dma_completions)) { + /* Reset burst counter to 0. */ + dev->tx_q[qid].dma_completions->bursts_completed = 0; + /* No more packets to free, so return. */ + rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); + return 0; + } + + max_attempt--; + COVERAGE_INC(vhost_async_tx_poll); + const uint16_t vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; + *dma_compl_ptr_get() = (uintptr_t) dev->tx_q[qid].dma_completions; + /* Get the completion status of async transfer. */ + nr_xfrd_pkts = rte_vhost_poll_enqueue_completed(vid, vhost_qid, + (struct rte_mbuf **) + cmpl_pkts, + NETDEV_MAX_BURST); + if (!is_burst_complete(dev->tx_q[qid].dma_completions)) { + ret = -EINPROGRESS; + } + + rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); + + if (!nr_xfrd_pkts) { + COVERAGE_INC(vhost_async_tx_poll_empty); + continue; + } + + rte_spinlock_lock(&dev->stats_lock); + dev->sw_stats->tx_async_inflight -= nr_xfrd_pkts; + netdev_dpdk_vhost_update_tx_counters(dev, cmpl_pkts, nr_xfrd_pkts, + &sw_stats_add, true); + rte_spinlock_unlock(&dev->stats_lock); + + for (int i = 0; i < nr_xfrd_pkts; i++) { + dp_packet_delete(cmpl_pkts[i]); + } + } while (force && max_attempt); + return ret; +} + +static inline void +vhost_async_set_end_of_burst(struct dma_completions_t *dma_compl) +{ + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + int prev_slot_idx = (dma_compl->write_idx - 1) & ring_mask; + struct enq_info_t *slot_addr = &(dma_compl->enq_info[prev_slot_idx]); + atomic_store_relaxed(&(slot_addr->end_of_burst), 1); +} + static int __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, - struct dp_packet **pkts, int cnt) + struct dp_packet **pkts, int cnt, bool dpdk_buf) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts; @@ -2597,6 +3033,9 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, int i, retries = 0; int max_retries = VHOST_ENQ_RETRY_MIN; int vid = netdev_dpdk_get_vid(dev); + int free_start_idx = 0; + bool is_async = false; + int ret = 0; qid = dev->tx_q[qid % netdev->n_txq].map; @@ -2628,13 +3067,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true); sw_stats_add.tx_qos_drops -= cnt; + sw_stats_add.tx_async_inflight = 0; n_packets_to_free = cnt; + is_async = dev->tx_q[qid].is_async_reg && dpdk_buf + && (dmadev_get_device() != DMADEV_ID_INVALID); do { int vhost_qid = qid * 
VIRTIO_QNUM + VIRTIO_RXQ; unsigned int tx_pkts; + if (is_async) { + *dma_compl_ptr_get() = (uintptr_t) dev->tx_q[qid].dma_completions; + /* Call the transfer data callback for async transfer.*/ + tx_pkts = rte_vhost_submit_enqueue_burst(vid, vhost_qid, + cur_pkts, cnt); + sw_stats_add.tx_async_inflight += tx_pkts; + } else { + tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); + } - tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt); if (OVS_LIKELY(tx_pkts)) { /* Packets have been sent.*/ cnt -= tx_pkts; @@ -2652,23 +3102,31 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, break; } } while (cnt && (retries++ < max_retries)); - + if (sw_stats_add.tx_async_inflight) { + vhost_async_set_end_of_burst(dev->tx_q[qid].dma_completions); + /* Set return to call free for asynchronously sent packets. */ + ret = -EINPROGRESS; + } rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); sw_stats_add.tx_failure_drops = cnt; sw_stats_add.tx_retries = MIN(retries, max_retries); rte_spinlock_lock(&dev->stats_lock); + dev->sw_stats->tx_async_inflight += sw_stats_add.tx_async_inflight; netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets, - &sw_stats_add); + &sw_stats_add, !is_async); rte_spinlock_unlock(&dev->stats_lock); + /* Since dropped packets are at the end of the burst, + * update index to delete the packets dropped in current burst. */ + free_start_idx = sw_stats_add.tx_async_inflight; out: - for (i = 0; i < n_packets_to_free; i++) { + for (i = free_start_idx; i < n_packets_to_free; i++) { dp_packet_delete(pkts[i]); } - return 0; + return ret; } static void @@ -2829,7 +3287,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) if (OVS_LIKELY(txcnt)) { if (dev->type == DPDK_DEV_VHOST) { - ret = __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt); + ret = __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt, false); } else { tx_failure += netdev_dpdk_eth_tx_burst(dev, qid, (struct rte_mbuf **)pkts, @@ -2862,7 +3320,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, dp_packet_delete_batch(batch, true); } else { ret = __netdev_dpdk_vhost_send(netdev, qid, batch->packets, - dp_packet_batch_size(batch)); + dp_packet_batch_size(batch), true); } return ret; } @@ -3258,7 +3716,8 @@ netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev, SW_CSTAT(tx_mtu_exceeded_drops) \ SW_CSTAT(tx_qos_drops) \ SW_CSTAT(rx_qos_drops) \ - SW_CSTAT(tx_invalid_hwol_drops) + SW_CSTAT(tx_invalid_hwol_drops) \ + SW_CSTAT(tx_async_inflight) #define SW_CSTAT(NAME) + 1 custom_stats->size = SW_CSTATS; @@ -3972,6 +4431,282 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev) free(enabled_queues); } +/* Enqueue a packet via DMA. */ +static inline void +vhost_async_dmadev_enqueue_packet(const uint16_t dev_id, + const struct rte_vhost_iov_iter *src_ptr, + const struct rte_vhost_iov_iter *dst_ptr, + const uint16_t nr_segs, + struct enq_info_t *slot_addr, + bool is_rx) +{ + uint16_t seg_idx = 0; + struct enq_info_t *addr = NULL; + uint64_t dma_flags = RTE_DMA_OP_FLAG_LLC; + const uint16_t dmadev_ring_mask = VHOST_ASYNC_DMA_RING_SIZE - 1; + + while (OVS_LIKELY(seg_idx < nr_segs)) { + /* Fetch DMA source start addr. */ + const rte_iova_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base); + const rte_iova_t dma_src_start_addr = src_ptr->offset + s_base; + /* Fetch DMA destination start addr. */ + const rte_iova_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base); + const rte_iova_t dma_dst_start_addr = dst_ptr->offset + d_base; + /* Fetch packet segment length. 
*/ + const uint32_t dma_src_len = src_ptr->iov[seg_idx].iov_len; + /* Check if this segment is the last. */ + if (seg_idx == nr_segs - 1) { + addr = slot_addr; + } + + int enq_index = rte_dmadev_copy(dev_id, + 0, + dma_src_start_addr, + dma_dst_start_addr, + dma_src_len, + dma_flags); + if (OVS_UNLIKELY(enq_index < 0)) { + break; + } + dmadev_enq_track[dev_id][enq_index & dmadev_ring_mask] = (void *) addr; + is_rx ? COVERAGE_INC(vhost_async_rx_enqueue) : + COVERAGE_INC(vhost_async_tx_enqueue); + seg_idx++; + } +} + +/* Enqueue a packet through SW copy. */ +static inline void +sw_enqueue_packet(const struct rte_vhost_iov_iter *src_ptr, + const struct rte_vhost_iov_iter *dst_ptr, + const uint16_t nr_segs, + bool is_rx) +{ + uint16_t seg_idx = 0; + + while (OVS_LIKELY(seg_idx < nr_segs)) { + /* Fetch source start address. */ + const uintptr_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base); + const uintptr_t src_start_addr = src_ptr->offset + s_base; + /* Fetch destination start address. */ + const uintptr_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base); + const uintptr_t dst_start_addr = dst_ptr->offset + d_base; + /* Fetch segment length. */ + const size_t src_len = src_ptr->iov[seg_idx].iov_len; + + rte_memcpy((void *) dst_start_addr, + (void *) src_start_addr, + src_len); + is_rx ? COVERAGE_INC(vhost_async_rx_enqueue_sw_fallback) : + COVERAGE_INC(vhost_async_tx_enqueue_sw_fallback); + seg_idx++; + } +} + +/* Fetch the slot address for a packet. */ +static inline struct enq_info_t * +compl_slot_get_and_inc(struct dma_completions_t *dma_compl) +{ + struct enq_info_t *slot_addr + = &(dma_compl->enq_info[dma_compl->write_idx]); + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + + dma_compl->write_idx++; + dma_compl->write_idx &= ring_mask; + dma_compl->count++; + return slot_addr; +} + +/* Calculate packets sent for a txq by parsing dma_completion ring. */ +static inline uint32_t +count_completed_packets(struct dma_completions_t *dma_compl, + const bool is_rx, + const int max_pkts) +{ + uint32_t pkts; + int count = dma_compl->count; + int read_idx = dma_compl->read_idx; + uint8_t pkt_rcvd = 0, end_of_burst = 0; + const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1; + + for (pkts = 0; (pkts < max_pkts) && (count > 0); pkts++) { + read_idx &= ring_mask; + atomic_read_relaxed(&(dma_compl->enq_info[read_idx].pkt_rcvd), + &pkt_rcvd); + if (!pkt_rcvd) { + break; + } + if (!is_rx) { + atomic_read_relaxed(&dma_compl->enq_info[read_idx].end_of_burst, + &end_of_burst); + dma_compl->bursts_completed += end_of_burst; + atomic_store_relaxed(&(dma_compl->enq_info[read_idx].end_of_burst), + 0); + } + + atomic_store_relaxed(&(dma_compl->enq_info[read_idx].pkt_rcvd), 0); + + count--; + read_idx++; + } + dma_compl->count = count; + dma_compl->read_idx = read_idx; + return pkts; +} + +/* vHost async callback to offload enqueue via DMA. 
*/ +static int32_t +vhost_async_dmadev_transfer_data_cb(int vid OVS_UNUSED, + uint16_t virtq_qid, + struct rte_vhost_async_desc *descs, + struct rte_vhost_async_status *opaque_data, + uint16_t count) +{ + uint16_t desc_idx = 0; + struct enq_info_t *slot_addr = NULL; + struct dma_completions_t *compl = NULL; + bool is_rx = (virtq_qid % VIRTIO_QNUM) == VIRTIO_TXQ; + + ovs_assert(opaque_data == NULL); + + compl = (struct dma_completions_t *)*dma_compl_ptr_get(); + if (is_compl_ring_full(compl)) { + if (is_rx) { + COVERAGE_ADD(vhost_async_rx_compl_ring_full_drops, count); + } else { + COVERAGE_ADD(vhost_async_tx_compl_ring_full_drops, count); + } + goto out; + } + + /* Fetch the dmadev id assigned to the current thread. */ + uint16_t dev_id = dmadev_get_device(); + /* Cache space left in DMA ring to avoid driver call for every packet. */ + /* Similar to following API will be added in the next revisions of DMADEV. + Comment out for now.*/ + /* uint16_t dmadev_space_left = rte_dmadev_burst_capacity(dev_id, 0); */ + const int compl_space_left = DMA_COMPLETION_RING_SIZE - compl->count; + if (count > compl_space_left) { + if (is_rx) { + COVERAGE_ADD(vhost_async_rx_compl_ring_full_drops, + count - compl_space_left); + } else { + COVERAGE_ADD(vhost_async_tx_compl_ring_full_drops, + count - compl_space_left); + } + count = compl_space_left; + } + + while (desc_idx < count) { + const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src; + const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst; + const uint16_t nr_segs = src_ptr->nr_segs; + /* + if (dmadev_space_left < nr_segs) { + if (is_rx) { + COVERAGE_INC(vhost_async_rx_dma_ring_full); + } else { + COVERAGE_INC(vhost_async_tx_dma_ring_full); + } + goto ring_doorbell; + } + */ + slot_addr = compl_slot_get_and_inc(compl); + vhost_async_dmadev_enqueue_packet(dev_id, src_ptr, dst_ptr, + nr_segs, slot_addr, is_rx); + /* dmadev_space_left -= nr_segs;*/ + desc_idx++; + } +/* +ring_doorbell: +*/ + if (desc_idx != 0) { + /* Ring the doorbell. */ + rte_dmadev_submit(dev_id, 0); + } + + /* Do software copy for packets that do no fit in the DMA ring. */ + while (desc_idx < count) { + const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src; + const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst; + slot_addr = compl_slot_get_and_inc(compl); + sw_enqueue_packet(src_ptr, dst_ptr, src_ptr->nr_segs, is_rx); + atomic_store_relaxed(&(slot_addr->pkt_rcvd), 1); + desc_idx++; + } + +out: + return desc_idx; +} + +/* vHost async callback to query transfer status of DMA. */ +static int32_t +vhost_async_dmadev_check_completed_copies_cb(int vid, + uint16_t virtq_qid, + struct rte_vhost_async_status + *opaque_data, + uint16_t max_pkts) +{ + bool error; + uint16_t last_idx; + uint32_t nr_pkts = 0; + struct enq_info_t *slots; + struct dma_completions_t *compl = NULL; + bool is_rx = (virtq_qid % VIRTIO_QNUM) == VIRTIO_TXQ; + const uint16_t dmadev_ring_mask = VHOST_ASYNC_DMA_RING_SIZE - 1; + const uint8_t max_copies = NETDEV_MAX_BURST * vhost_dma_poll_factor; + + ovs_assert(opaque_data == NULL); + + compl = (struct dma_completions_t *)*dma_compl_ptr_get(); + if (OVS_UNLIKELY(is_compl_ring_empty(compl))) { + if (is_rx) { + COVERAGE_INC(vhost_async_rx_compl_ring_empty); + } else { + COVERAGE_INC(vhost_async_tx_compl_ring_empty); + } + goto out; + } + + /* Fetch the dmadev id assigned to the current thread. */ + uint16_t dev_id = dmadev_get_device(); + + /* Check the completion status of DMA. 
*/ + const uint16_t ret_segs = rte_dmadev_completed(dev_id, + 0, + max_copies, + &last_idx, + &error); + if (OVS_UNLIKELY(error)) { + VLOG_WARN_RL(&rl,"rte_dmadev_completed returned error for dev id: %u" + "with vid: %d, qid %u", dev_id, vid, + virtq_qid/ VIRTIO_QNUM); + return -1; + } + /* Compute the start index. */ + uint16_t pkt_idx = (last_idx - ret_segs + 1) & dmadev_ring_mask; + for (int i = 0; i < ret_segs; i++) { + slots = (struct enq_info_t *) dmadev_enq_track[dev_id][pkt_idx]; + if (slots) { + /* Mark the packet slot as received. + * The slot could belong to another queue but writes are atomic. */ + atomic_store_relaxed(&(slots->pkt_rcvd), 1); + } + pkt_idx = (pkt_idx + 1) & dmadev_ring_mask; + } + /* Calculate packets successfully DMA'ed from this virtqueue. */ + nr_pkts = count_completed_packets(compl, is_rx, max_pkts); + if (is_rx) { + COVERAGE_INC(vhost_async_rx_poll); + if (!nr_pkts) { + COVERAGE_INC(vhost_async_rx_poll_empty); + } + } + +out: + return nr_pkts; +} + /* * A new virtio-net device is added to a vhost port. */ @@ -4075,6 +4810,8 @@ destroy_device(int vid) ovsrcu_index_set(&dev->vid, -1); memset(dev->vhost_rxq_enabled, 0, dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled); + memset(dev->vhost_rxq_enabled, 0, + dev->up.n_rxq * sizeof *dev->vhost_rxq_async_reg); netdev_dpdk_txq_map_clear(dev); netdev_change_seq_changed(&dev->up); @@ -4122,13 +4859,42 @@ vring_state_changed(int vid, uint16_t queue_id, int enable) bool old_state = dev->vhost_rxq_enabled[qid]; dev->vhost_rxq_enabled[qid] = enable != 0; + if (enable) { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && !dev->vhost_rxq_async_reg[qid]) { + if (!netdev_dpdk_vhost_async_reg(vid, qid, queue_id, + is_rx)) { + dev->vhost_rxq_async_reg[qid] = true; + } + } + } else { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && dev->vhost_rxq_async_reg[qid]) { + netdev_dpdk_vhost_async_unreg(vid, qid, queue_id, + is_rx); + dev->vhost_rxq_async_reg[qid] = false; + } + } if (old_state != dev->vhost_rxq_enabled[qid]) { netdev_change_seq_changed(&dev->up); } } else { if (enable) { dev->tx_q[qid].map = qid; + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && !dev->tx_q[qid].is_async_reg) { + if (!netdev_dpdk_vhost_async_reg(vid, qid, queue_id, + is_rx)) { + dev->tx_q[qid].is_async_reg = true; + } + } } else { + if ((dev->vhost_driver_flags & RTE_VHOST_USER_ASYNC_COPY) + && dev->tx_q[qid].is_async_reg) { + netdev_dpdk_vhost_async_unreg(vid, qid, queue_id, + is_rx); + dev->tx_q[qid].is_async_reg = false; + } dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED; } netdev_dpdk_remap_txqs(dev); @@ -5036,7 +5802,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) */ dev->requested_hwaddr = dev->hwaddr; - dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq); + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq, false); if (!dev->tx_q) { err = ENOMEM; } @@ -5135,6 +5901,11 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT; } + /* Enable async copy flag, if explicitly requested. */ + if (dpdk_vhost_async_enabled()) { + vhost_flags |= RTE_VHOST_USER_ASYNC_COPY; + } + /* Enable External Buffers if TCP Segmentation Offload is enabled. 
*/ if (userspace_tso_enabled()) { vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT; @@ -5442,10 +6213,8 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, .queue_dump_next = netdev_dpdk_queue_dump_next, \ .queue_dump_done = netdev_dpdk_queue_dump_done, \ .update_flags = netdev_dpdk_update_flags, \ - .rxq_alloc = netdev_dpdk_rxq_alloc, \ .rxq_construct = netdev_dpdk_rxq_construct, \ - .rxq_destruct = netdev_dpdk_rxq_destruct, \ - .rxq_dealloc = netdev_dpdk_rxq_dealloc + .rxq_destruct = netdev_dpdk_rxq_destruct #define NETDEV_DPDK_CLASS_BASE \ NETDEV_DPDK_CLASS_COMMON, \ @@ -5458,7 +6227,9 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, .get_features = netdev_dpdk_get_features, \ .get_status = netdev_dpdk_get_status, \ .reconfigure = netdev_dpdk_reconfigure, \ - .rxq_recv = netdev_dpdk_rxq_recv + .rxq_recv = netdev_dpdk_rxq_recv, \ + .rxq_alloc = netdev_dpdk_rxq_alloc, \ + .rxq_dealloc = netdev_dpdk_rxq_dealloc static const struct netdev_class dpdk_class = { .type = "dpdk", @@ -5474,7 +6245,7 @@ static const struct netdev_class dpdk_vhost_class = { .construct = netdev_dpdk_vhost_construct, .destruct = netdev_dpdk_vhost_destruct, .send = netdev_dpdk_vhost_send, - .process_async = NULL, + .process_async = netdev_dpdk_vhost_async_free, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, .get_custom_stats = netdev_dpdk_get_sw_custom_stats, @@ -5482,6 +6253,8 @@ static const struct netdev_class dpdk_vhost_class = { .reconfigure = netdev_dpdk_vhost_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, .rxq_enabled = netdev_dpdk_vhost_rxq_enabled, + .rxq_alloc = netdev_dpdk_vhost_rxq_alloc, + .rxq_dealloc = netdev_dpdk_vhost_rxq_dealloc, }; static const struct netdev_class dpdk_vhost_client_class = { @@ -5491,7 +6264,7 @@ static const struct netdev_class dpdk_vhost_client_class = { .destruct = netdev_dpdk_vhost_destruct, .set_config = netdev_dpdk_vhost_client_set_config, .send = netdev_dpdk_vhost_send, - .process_async = NULL, + .process_async = netdev_dpdk_vhost_async_free, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, .get_custom_stats = netdev_dpdk_get_sw_custom_stats, @@ -5499,6 +6272,8 @@ static const struct netdev_class dpdk_vhost_client_class = { .reconfigure = netdev_dpdk_vhost_client_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, .rxq_enabled = netdev_dpdk_vhost_rxq_enabled, + .rxq_alloc = netdev_dpdk_vhost_rxq_alloc, + .rxq_dealloc = netdev_dpdk_vhost_rxq_dealloc, }; void diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 699be3fb4..690b0b830 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -20,6 +20,7 @@ #include <config.h> #include "openvswitch/compiler.h" +#include "ovs-thread.h" struct dp_packet; struct netdev; @@ -27,6 +28,25 @@ struct netdev; #ifdef DPDK_NETDEV #include <rte_flow.h> +/* For vHost async datapath, dmadev id alloation is per dataplane thread. 
*/
+DECLARE_EXTERN_PER_THREAD_DATA(uint16_t, dmadev_id);
+DECLARE_EXTERN_PER_THREAD_DATA(uintptr_t, dma_compl_ptr);
+
+#define DMADEV_ID_UNASSIGNED UINT16_MAX
+#define DMADEV_ID_INVALID (UINT16_MAX - 1)
+
+uint16_t dmadev_id_init(void);
+
+static inline uint16_t
+dmadev_get_device(void)
+{
+    uint16_t id = *dmadev_id_get();
+
+    if (id == DMADEV_ID_UNASSIGNED) {
+        id = dmadev_id_init();
+    }
+    return id;
+}
 void netdev_dpdk_register(void);
 void free_dpdk_buf(struct dp_packet *);
-- 
2.25.1
