Bypass Verbs to improve TX performance.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
Signed-off-by: Yaacov Hazan <yaacovh at mellanox.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
---
 drivers/net/mlx5/Makefile      |   5 -
 drivers/net/mlx5/mlx5_ethdev.c |  10 +-
 drivers/net/mlx5/mlx5_mr.c     |   4 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 359 ++++++++++++++++++++++-------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  53 +++---
 drivers/net/mlx5/mlx5_txq.c    | 210 ++++++++++++------------
 6 files changed, 334 insertions(+), 307 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index a63d6b3..9b4455b 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -102,11 +102,6 @@ endif
 mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh
        $Q $(RM) -f -- '$@'
        $Q sh -- '$<' '$@' \
-               HAVE_VERBS_VLAN_INSERTION \
-               infiniband/verbs.h \
-               enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
-               $(AUTOCONF_OUTPUT)
-       $Q sh -- '$<' '$@' \
                HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
                infiniband/verbs_exp.h \
                enum IBV_EXP_CQ_COMPRESSED_CQE \
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 4cfcbd5..aaa6c16 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1243,11 +1243,11 @@ mlx5_secondary_data_setup(struct priv *priv)
                txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
                                             primary_txq_ctrl->socket);
                if (txq_ctrl != NULL) {
-                       if (txq_setup(priv->dev,
-                                     primary_txq_ctrl,
-                                     primary_txq->elts_n,
-                                     primary_txq_ctrl->socket,
-                                     NULL) == 0) {
+                       if (txq_ctrl_setup(priv->dev,
+                                          primary_txq_ctrl,
+                                          primary_txq->elts_n,
+                                          primary_txq_ctrl->socket,
+                                          NULL) == 0) {
                                txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
                                tx_queues[i] = &txq_ctrl->txq;
                                continue;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 79d5568..e5e8a04 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -189,7 +189,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
        /* Add a new entry, register MR first. */
        DEBUG("%p: discovered new memory pool \"%s\" (%p)",
              (void *)txq_ctrl, mp->name, (void *)mp);
-       mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
+       mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
        if (unlikely(mr == NULL)) {
                DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
                      (void *)txq_ctrl);
@@ -208,7 +208,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
        /* Store the new entry. */
        txq_ctrl->txq.mp2mr[idx].mp = mp;
        txq_ctrl->txq.mp2mr[idx].mr = mr;
-       txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
+       txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
        DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
              (void *)txq_ctrl, mp->name, (void *)mp,
              txq_ctrl->txq.mp2mr[idx].lkey);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7d74074..cee6067 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe64 cqes[],
  *
  * @param txq
  *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
  */
-static int
+static void
 txq_complete(struct txq *txq)
 {
-       unsigned int elts_comp = txq->elts_comp;
-       unsigned int elts_tail = txq->elts_tail;
-       unsigned int elts_free = txq->elts_tail;
        const unsigned int elts_n = txq->elts_n;
-       int wcs_n;
-
-       if (unlikely(elts_comp == 0))
-               return 0;
-#ifdef DEBUG_SEND
-       DEBUG("%p: processing %u work requests completions",
-             (void *)txq, elts_comp);
-#endif
-       wcs_n = txq->poll_cnt(txq->cq, elts_comp);
-       if (unlikely(wcs_n == 0))
-               return 0;
-       if (unlikely(wcs_n < 0)) {
-               DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-                     (void *)txq, wcs_n);
-               return -1;
+       const unsigned int cqe_n = txq->cqe_n;
+       uint16_t elts_free = txq->elts_tail;
+       uint16_t elts_tail;
+       uint16_t cq_ci = txq->cq_ci;
+       unsigned int wqe_ci = (unsigned int)-1;
+       int ret = 0;
+
+       while (ret == 0) {
+               volatile struct mlx5_cqe64 *cqe;
+
+               cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
+               if (cqe == NULL)
+                       break;
+               wqe_ci = ntohs(cqe->wqe_counter);
        }
-       elts_comp -= wcs_n;
-       assert(elts_comp <= txq->elts_comp);
-       /*
-        * Assume WC status is successful as nothing can be done about it
-        * anyway.
-        */
-       elts_tail += wcs_n * txq->elts_comp_cd_init;
-       if (elts_tail >= elts_n)
-               elts_tail -= elts_n;
-
-       while (elts_free != elts_tail) {
-               struct txq_elt *elt = &(*txq->elts)[elts_free];
+       if (unlikely(wqe_ci == (unsigned int)-1))
+               return;
+       /* Free buffers. */
+       elts_tail = (wqe_ci + 1) & (elts_n - 1);
+       do {
+               struct rte_mbuf *elt = (*txq->elts)[elts_free];
                unsigned int elts_free_next =
-                       (((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
-               struct rte_mbuf *tmp = elt->buf;
-               struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+                       (elts_free + 1) & (elts_n - 1);
+               struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

 #ifndef NDEBUG
                /* Poisoning. */
-               memset(elt, 0x66, sizeof(*elt));
+               memset(&(*txq->elts)[elts_free],
+                      0x66,
+                      sizeof((*txq->elts)[elts_free]));
 #endif
-               RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-               /* Faster than rte_pktmbuf_free(). */
-               do {
-                       struct rte_mbuf *next = NEXT(tmp);
-
-                       rte_pktmbuf_free_seg(tmp);
-                       tmp = next;
-               } while (tmp != NULL);
+               RTE_MBUF_PREFETCH_TO_FREE(elt_next);
+               /* Only one segment needs to be freed. */
+               rte_pktmbuf_free_seg(elt);
                elts_free = elts_free_next;
-       }
-
+       } while (elts_free != elts_tail);
+       txq->cq_ci = cq_ci;
        txq->elts_tail = elts_tail;
-       txq->elts_comp = elts_comp;
-       return 0;
+       /* Update the consumer index. */
+       rte_wmb();
+       *txq->cq_db = htonl(cq_ci);
 }

 /**
@@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
                }
                if (txq->mp2mr[i].mp == mp) {
                        assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-                       assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+                       assert(htonl(txq->mp2mr[i].mr->lkey) ==
+                              txq->mp2mr[i].lkey);
                        lkey = txq->mp2mr[i].lkey;
                        break;
                }
@@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 }

 /**
- * Insert VLAN using mbuf headroom space.
- *
- * @param buf
- *   Buffer for VLAN insertion.
+ * Write a regular WQE.
  *
- * @return
- *   0 on success, errno value on failure.
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ * @param lkey
+ *   Memory region lkey.
  */
-static inline int
-insert_vlan_sw(struct rte_mbuf *buf)
+static inline void
+mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+              uintptr_t addr, uint32_t length, uint32_t lkey)
 {
-       uintptr_t addr;
-       uint32_t vlan;
-       uint16_t head_room_len = rte_pktmbuf_headroom(buf);
+       wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+       wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+       wqe->wqe.ctrl.data[3] = 0;
+       wqe->inl.eseg.rsvd0 = 0;
+       wqe->inl.eseg.rsvd1 = 0;
+       wqe->inl.eseg.mss = 0;
+       wqe->inl.eseg.rsvd2 = 0;
+       wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+       /* Copy the first 16 bytes into inline header. */
+       rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+                  (uint8_t *)(uintptr_t)addr,
+                  MLX5_ETH_INLINE_HEADER_SIZE);
+       addr += MLX5_ETH_INLINE_HEADER_SIZE;
+       length -= MLX5_ETH_INLINE_HEADER_SIZE;
+       /* Store remaining data in data segment. */
+       wqe->wqe.dseg.byte_count = htonl(length);
+       wqe->wqe.dseg.lkey = lkey;
+       wqe->wqe.dseg.addr = htonll(addr);
+       /* Increment consumer index. */
+       ++txq->wqe_ci;
+}

-       if (head_room_len < 4)
-               return EINVAL;
+/**
+ * Write a regular WQE with VLAN.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ * @param lkey
+ *   Memory region lkey.
+ * @param vlan_tci
+ *   VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+                   uintptr_t addr, uint32_t length, uint32_t lkey,
+                   uint16_t vlan_tci)
+{
+       uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+       wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+       wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+       wqe->wqe.ctrl.data[3] = 0;
+       wqe->inl.eseg.rsvd0 = 0;
+       wqe->inl.eseg.rsvd1 = 0;
+       wqe->inl.eseg.mss = 0;
+       wqe->inl.eseg.rsvd2 = 0;
+       wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+       /*
+        * Copy 12 bytes of source & destination MAC address.
+        * Copy 4 bytes of VLAN.
+        * Copy 2 bytes of Ether type.
+        */
+       rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+                  (uint8_t *)(uintptr_t)addr, 12);
+       rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
+                  &vlan, sizeof(vlan));
+       rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
+                  (uint8_t *)((uintptr_t)addr + 12), 2);
+       addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+       length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+       /* Store remaining data in data segment. */
+       wqe->wqe.dseg.byte_count = htonl(length);
+       wqe->wqe.dseg.lkey = lkey;
+       wqe->wqe.dseg.addr = htonll(addr);
+       /* Increment consumer index. */
+       ++txq->wqe_ci;
+}

-       addr = rte_pktmbuf_mtod(buf, uintptr_t);
-       vlan = htonl(0x81000000 | buf->vlan_tci);
-       memmove((void *)(addr - 4), (void *)addr, 12);
-       memcpy((void *)(addr + 8), &vlan, sizeof(vlan));
+/**
+ * Ring TX queue doorbell.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ */
+static inline void
+mlx5_tx_dbrec(struct txq *txq)
+{
+       uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
+       uint32_t data[4] = {
+               htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+               htonl(txq->qp_num_8s),
+               0,
+               0,
+       };
+       rte_wmb();
+       *txq->qp_db = htonl(txq->wqe_ci);
+       /* Ensure ordering between DB record and BF copy. */
+       rte_wmb();
+       rte_mov16(dst, (uint8_t *)data);
+       txq->bf_offset ^= txq->bf_buf_size;
+}

-       SET_DATA_OFF(buf, head_room_len - 4);
-       DATA_LEN(buf) += 4;
+/**
+ * Prefetch a CQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param ci
+ *   CQE consumer index.
+ */
+static inline void
+tx_prefetch_cqe(struct txq *txq, uint16_t ci)
+{
+       volatile struct mlx5_cqe64 *cqe;

-       return 0;
+       cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)];
+       rte_prefetch0(cqe);
 }

 /**
@@ -288,18 +376,21 @@ uint16_t
 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
        struct txq *txq = (struct txq *)dpdk_txq;
-       unsigned int elts_head = txq->elts_head;
+       uint16_t elts_head = txq->elts_head;
        const unsigned int elts_n = txq->elts_n;
-       unsigned int elts_comp_cd = txq->elts_comp_cd;
-       unsigned int elts_comp = 0;
        unsigned int i;
        unsigned int max;
-       int err;
-       struct rte_mbuf *buf = pkts[0];
+       volatile union mlx5_wqe *wqe;
+       struct rte_mbuf *buf;

-       assert(elts_comp_cd != 0);
+       if (unlikely(!pkts_n))
+               return 0;
+       buf = pkts[0];
        /* Prefetch first packet cacheline. */
+       tx_prefetch_cqe(txq, txq->cq_ci);
+       tx_prefetch_cqe(txq, txq->cq_ci + 1);
        rte_prefetch0(buf);
+       /* Start processing. */
        txq_complete(txq);
        max = (elts_n - (elts_head - txq->elts_tail));
        if (max > elts_n)
@@ -313,101 +404,51 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        if (max > pkts_n)
                max = pkts_n;
        for (i = 0; (i != max); ++i) {
-               struct rte_mbuf *buf_next = pkts[i + 1];
-               unsigned int elts_head_next =
-                       (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-               struct txq_elt *elt = &(*txq->elts)[elts_head];
-               uint32_t send_flags = 0;
-#ifdef HAVE_VERBS_VLAN_INSERTION
-               int insert_vlan = 0;
-#endif /* HAVE_VERBS_VLAN_INSERTION */
+               unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
                uintptr_t addr;
                uint32_t length;
                uint32_t lkey;
-               uintptr_t buf_next_addr;

+               wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+               rte_prefetch0(wqe);
                if (i + 1 < max)
-                       rte_prefetch0(buf_next);
-               /* Request TX completion. */
-               if (unlikely(--elts_comp_cd == 0)) {
-                       elts_comp_cd = txq->elts_comp_cd_init;
-                       ++elts_comp;
-                       send_flags |= IBV_EXP_QP_BURST_SIGNALED;
-               }
-               /* Should we enable HW CKSUM offload */
-               if (buf->ol_flags &
-                   (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-                       send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
-                       /* HW does not support checksum offloads at arbitrary
-                        * offsets but automatically recognizes the packet
-                        * type. For inner L3/L4 checksums, only VXLAN (UDP)
-                        * tunnels are currently supported. */
-                       if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
-                               send_flags |= IBV_EXP_QP_BURST_TUNNEL;
-               }
-               if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
-                       if (!txq->priv->mps)
-                               insert_vlan = 1;
-                       else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
-                       {
-                               err = insert_vlan_sw(buf);
-                               if (unlikely(err))
-                                       goto stop;
-                       }
-               }
+                       rte_prefetch0(pkts[i + 1]);
                /* Retrieve buffer information. */
                addr = rte_pktmbuf_mtod(buf, uintptr_t);
                length = DATA_LEN(buf);
                /* Update element. */
-               elt->buf = buf;
-               if (txq->priv->sriov)
-                       rte_prefetch0((volatile void *)
-                                     (uintptr_t)addr);
+               (*txq->elts)[elts_head] = buf;
                /* Prefetch next buffer data. */
-               if (i + 1 < max) {
-                       buf_next_addr =
-                               rte_pktmbuf_mtod(buf_next, uintptr_t);
-                       rte_prefetch0((volatile void *)
-                                     (uintptr_t)buf_next_addr);
-               }
+               if (i + 1 < max)
+                       rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+                                                      volatile void *));
                /* Retrieve Memory Region key for this memory pool. */
                lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-               if (unlikely(lkey == (uint32_t)-1)) {
-                       /* MR does not exist. */
-                       DEBUG("%p: unable to get MP <-> MR"
-                             " association", (void *)txq);
-                       /* Clean up TX element. */
-                       elt->buf = NULL;
-                       goto stop;
-               }
-#ifdef HAVE_VERBS_VLAN_INSERTION
-               if (insert_vlan)
-                       err = txq->send_pending_vlan
-                               (txq->qp,
-                                addr,
-                                length,
-                                lkey,
-                                send_flags,
-                                &buf->vlan_tci);
+               if (buf->ol_flags & PKT_TX_VLAN_PKT)
+                       mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
+                                           buf->vlan_tci);
                else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
-                       err = txq->send_pending
-                               (txq->qp,
-                                addr,
-                                length,
-                                lkey,
-                                send_flags);
-               if (unlikely(err))
-                       goto stop;
+                       mlx5_wqe_write(txq, wqe, addr, length, lkey);
+               /* Request completion if needed. */
+               if (unlikely(--txq->elts_comp == 0)) {
+                       wqe->wqe.ctrl.data[2] = htonl(8);
+                       txq->elts_comp = txq->elts_comp_cd_init;
+               } else
+                       wqe->wqe.ctrl.data[2] = 0;
+               /* Should we enable HW CKSUM offload */
+               if (buf->ol_flags &
+                   (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+                       wqe->wqe.eseg.cs_flags =
+                               MLX5_ETH_WQE_L3_CSUM |
+                               MLX5_ETH_WQE_L4_CSUM;
+               } else
+                       wqe->wqe.eseg.cs_flags = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
                /* Increment sent bytes counter. */
                txq->stats.obytes += length;
 #endif
-stop:
                elts_head = elts_head_next;
-               buf = buf_next;
+               buf = pkts[i + 1];
        }
        /* Take a shortcut if nothing must be sent. */
        if (unlikely(i == 0))
@@ -417,16 +458,8 @@ stop:
        txq->stats.opackets += i;
 #endif
        /* Ring QP doorbell. */
-       err = txq->send_flush(txq->qp);
-       if (unlikely(err)) {
-               /* A nonzero value is not supposed to be returned.
-                * Nothing can be done about it. */
-               DEBUG("%p: send_flush() failed with error %d",
-                     (void *)txq, err);
-       }
+       mlx5_tx_dbrec(txq);
        txq->elts_head = elts_head;
-       txq->elts_comp += elts_comp;
-       txq->elts_comp_cd = elts_comp_cd;
        return i;
 }

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 7e4e7cf..3c1c5a5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -62,6 +62,7 @@
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
+#include "mlx5_prm.h"

 struct mlx5_rxq_stats {
        unsigned int idx; /**< Mapping index. */
@@ -222,44 +223,40 @@ struct hash_rxq {
                [MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
 };

-/* TX element. */
-struct txq_elt {
-       struct rte_mbuf *buf;
-};
-
 /* TX queue descriptor. */
 struct txq {
-       struct priv *priv; /* Back pointer to private data. */
-       int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
-       int (*send_pending)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
-       int (*send_pending_vlan)();
-#endif
-       int (*send_flush)(struct ibv_qp *qp);
-       struct ibv_cq *cq; /* Completion Queue. */
-       struct ibv_qp *qp; /* Queue Pair. */
-       struct txq_elt (*elts)[]; /* TX elements. */
-       unsigned int elts_n; /* (*elts)[] length. */
-       unsigned int elts_head; /* Current index in (*elts)[]. */
-       unsigned int elts_tail; /* First element awaiting completion. */
-       unsigned int elts_comp; /* Number of completion requests. */
-       unsigned int elts_comp_cd; /* Countdown for next completion request. */
-       unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+       uint16_t elts_head; /* Current index in (*elts)[]. */
+       uint16_t elts_tail; /* First element awaiting completion. */
+       uint16_t elts_comp_cd_init; /* Initial value for countdown. */
+       uint16_t elts_comp; /* Elements before asking a completion. */
+       uint16_t elts_n; /* (*elts)[] length. */
+       uint16_t cq_ci; /* Consumer index for completion queue. */
+       uint16_t cqe_n; /* Number of CQ elements. */
+       uint16_t wqe_ci; /* Consumer index for work queue. */
+       uint16_t wqe_n; /* Number of WQ elements. */
+       uint16_t bf_offset; /* Blueflame offset. */
+       uint16_t bf_buf_size; /* Blueflame size. */
+       volatile struct mlx5_cqe64 (*cqes)[]; /* Completion queue. */
+       volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+       volatile uint32_t *qp_db; /* Work queue doorbell. */
+       volatile uint32_t *cq_db; /* Completion queue doorbell. */
+       volatile void *bf_reg; /* Blueflame register. */
        struct {
                const struct rte_mempool *mp; /* Cached Memory Pool. */
                struct ibv_mr *mr; /* Memory Region (for mp). */
-               uint32_t lkey; /* mr->lkey */
+               uint32_t lkey; /* htonl(mr->lkey) */
        } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+       struct rte_mbuf *(*elts)[]; /* TX elements. */
        struct mlx5_txq_stats stats; /* TX queue counters. */
+       uint32_t qp_num_8s; /* QP number shifted by 8. */
 } __rte_cache_aligned;

 /* TX queue control descriptor. */
 struct txq_ctrl {
-#ifdef HAVE_VERBS_VLAN_INSERTION
-       struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
-#else
+       struct priv *priv; /* Back pointer to private data. */
+       struct ibv_cq *cq; /* Completion Queue. */
+       struct ibv_qp *qp; /* Queue Pair. */
        struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-#endif
        struct ibv_exp_cq_family *if_cq; /* CQ interface. */
        struct ibv_exp_res_domain *rd; /* Resource Domain. */
        unsigned int socket; /* CPU socket ID for allocations. */
@@ -293,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 /* mlx5_txq.c */

 void txq_cleanup(struct txq_ctrl *);
-int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
-             const struct rte_eth_txconf *);
+int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
+                  unsigned int, const struct rte_eth_txconf *);
 int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
                        const struct rte_eth_txconf *);
 void mlx5_tx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d7cc39d..95c6f2b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -60,6 +60,7 @@
 #endif

 #include "mlx5_utils.h"
+#include "mlx5_defs.h"
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
@@ -72,48 +73,22 @@
  *   Pointer to TX queue structure.
  * @param elts_n
  *   Number of elements to allocate.
- *
- * @return
- *   0 on success, errno value on failure.
  */
-static int
+static void
 txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 {
        unsigned int i;
-       struct txq_elt (*elts)[elts_n] =
-               rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
-       int ret = 0;

-       if (elts == NULL) {
-               ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
-               ret = ENOMEM;
-               goto error;
-       }
-       for (i = 0; (i != elts_n); ++i) {
-               struct txq_elt *elt = &(*elts)[i];
+       for (i = 0; (i != elts_n); ++i)
+               (*txq_ctrl->txq.elts)[i] = NULL;
+       for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+               volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];

-               elt->buf = NULL;
+               memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
        }
        DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
-       txq_ctrl->txq.elts_n = elts_n;
-       txq_ctrl->txq.elts = elts;
        txq_ctrl->txq.elts_head = 0;
        txq_ctrl->txq.elts_tail = 0;
-       txq_ctrl->txq.elts_comp = 0;
-       /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
-        * at least 4 times per ring. */
-       txq_ctrl->txq.elts_comp_cd_init =
-               ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
-                MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
-       txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
-       assert(ret == 0);
-       return 0;
-error:
-       rte_free(elts);
-
-       DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
-       assert(ret > 0);
-       return ret;
 }

 /**
@@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
        unsigned int elts_n = txq_ctrl->txq.elts_n;
        unsigned int elts_head = txq_ctrl->txq.elts_head;
        unsigned int elts_tail = txq_ctrl->txq.elts_tail;
-       struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;
+       struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

        DEBUG("%p: freeing WRs", (void *)txq_ctrl);
-       txq_ctrl->txq.elts_n = 0;
        txq_ctrl->txq.elts_head = 0;
        txq_ctrl->txq.elts_tail = 0;
-       txq_ctrl->txq.elts_comp = 0;
-       txq_ctrl->txq.elts_comp_cd = 0;
-       txq_ctrl->txq.elts_comp_cd_init = 0;
-       txq_ctrl->txq.elts = NULL;

-       if (elts == NULL)
-               return;
        while (elts_tail != elts_head) {
-               struct txq_elt *elt = &(*elts)[elts_tail];
+               struct rte_mbuf *elt = (*elts)[elts_tail];

-               assert(elt->buf != NULL);
-               rte_pktmbuf_free(elt->buf);
+               assert(elt != NULL);
+               rte_pktmbuf_free(elt);
 #ifndef NDEBUG
                /* Poisoning. */
-               memset(elt, 0x77, sizeof(*elt));
+               memset(&(*elts)[elts_tail],
+                      0x77,
+                      sizeof((*elts)[elts_tail]));
 #endif
                if (++elts_tail == elts_n)
                        elts_tail = 0;
        }
-       rte_free(elts);
 }

 /**
@@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)

        DEBUG("cleaning up %p", (void *)txq_ctrl);
        txq_free_elts(txq_ctrl);
-       txq_ctrl->txq.poll_cnt = NULL;
-       txq_ctrl->txq.send_flush = NULL;
        if (txq_ctrl->if_qp != NULL) {
-               assert(txq_ctrl->txq.priv != NULL);
-               assert(txq_ctrl->txq.priv->ctx != NULL);
-               assert(txq_ctrl->txq.qp != NULL);
+               assert(txq_ctrl->priv != NULL);
+               assert(txq_ctrl->priv->ctx != NULL);
+               assert(txq_ctrl->qp != NULL);
                params = (struct ibv_exp_release_intf_params){
                        .comp_mask = 0,
                };
-               claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+               claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
                                                txq_ctrl->if_qp,
                                                &params));
        }
        if (txq_ctrl->if_cq != NULL) {
-               assert(txq_ctrl->txq.priv != NULL);
-               assert(txq_ctrl->txq.priv->ctx != NULL);
-               assert(txq_ctrl->txq.cq != NULL);
+               assert(txq_ctrl->priv != NULL);
+               assert(txq_ctrl->priv->ctx != NULL);
+               assert(txq_ctrl->cq != NULL);
                params = (struct ibv_exp_release_intf_params){
                        .comp_mask = 0,
                };
-               claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+               claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
                                                txq_ctrl->if_cq,
                                                &params));
        }
-       if (txq_ctrl->txq.qp != NULL)
-               claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
-       if (txq_ctrl->txq.cq != NULL)
-               claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+       if (txq_ctrl->qp != NULL)
+               claim_zero(ibv_destroy_qp(txq_ctrl->qp));
+       if (txq_ctrl->cq != NULL)
+               claim_zero(ibv_destroy_cq(txq_ctrl->cq));
        if (txq_ctrl->rd != NULL) {
                struct ibv_exp_destroy_res_domain_attr attr = {
                        .comp_mask = 0,
                };

-               assert(txq_ctrl->txq.priv != NULL);
-               assert(txq_ctrl->txq.priv->ctx != NULL);
-               claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+               assert(txq_ctrl->priv != NULL);
+               assert(txq_ctrl->priv->ctx != NULL);
+               claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx,
                                                      txq_ctrl->rd,
                                                      &attr));
        }
@@ -221,6 +188,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
 }

 /**
+ * Initialize TX queue.
+ *
+ * @param tmpl
+ *   Pointer to TX queue control template.
+ * @param txq_ctrl
+ *   Pointer to TX queue control.
+ */
+static inline void
+txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+{
+       struct mlx5_qp *qp = to_mqp(tmpl->qp);
+       struct ibv_cq *ibcq = tmpl->cq;
+       struct mlx5_cq *cq = to_mxxx(cq, cq);
+
+       tmpl->txq.cqe_n = ibcq->cqe + 1;
+       tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
+       tmpl->txq.wqes =
+               (volatile union mlx5_wqe (*)[])
+               (uintptr_t)qp->gen_data.sqstart;
+       tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+       tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
+       tmpl->txq.bf_reg = qp->gen_data.bf->reg;
+       tmpl->txq.bf_offset = qp->gen_data.bf->offset;
+       tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+       tmpl->txq.cq_db = cq->dbrec;
+       tmpl->txq.cqes =
+               (volatile struct mlx5_cqe64 (*)[])
+               (uintptr_t)cq->active_buf->buf;
+       tmpl->txq.elts =
+               (struct rte_mbuf *(*)[tmpl->txq.elts_n])
+               ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+}
+
+/**
  * Configure a TX queue.
  *
  * @param dev
@@ -238,15 +239,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
  *   0 on success, errno value on failure.
  */
 int
-txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
-         unsigned int socket, const struct rte_eth_txconf *conf)
+txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
+              uint16_t desc, unsigned int socket,
+              const struct rte_eth_txconf *conf)
 {
        struct priv *priv = mlx5_get_priv(dev);
        struct txq_ctrl tmpl = {
+               .priv = priv,
                .socket = socket,
-               .txq = {
-                       .priv = priv,
-               },
        };
        union {
                struct ibv_exp_query_intf_params params;
@@ -254,15 +254,19 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
                struct ibv_exp_res_domain_init_attr rd;
                struct ibv_exp_cq_init_attr cq;
                struct ibv_exp_qp_attr mod;
+               struct ibv_exp_cq_attr cq_attr;
        } attr;
        enum ibv_exp_query_intf_status status;
        int ret = 0;

        (void)conf; /* Thresholds configuration (ignored). */
-       if (desc == 0) {
-               ERROR("%p: invalid number of TX descriptors", (void *)dev);
-               return EINVAL;
-       }
+       tmpl.txq.elts_n = desc;
+       /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+        * at least 4 times per ring. */
+       tmpl.txq.elts_comp_cd_init =
+               ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
+                MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
+       tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
        /* MRs will be registered in mp2mr[] later. */
        attr.rd = (struct ibv_exp_res_domain_init_attr){
                .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -281,8 +285,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
                .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
                .res_domain = tmpl.rd,
        };
-       tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
-       if (tmpl.txq.cq == NULL) {
+       tmpl.cq = ibv_exp_create_cq(priv->ctx,
+                                   (desc / tmpl.txq.elts_comp_cd_init) - 1,
+                                   NULL, NULL, 0, &attr.cq);
+       if (tmpl.cq == NULL) {
                ret = ENOMEM;
                ERROR("%p: CQ creation failure: %s",
                      (void *)dev, strerror(ret));
@@ -294,9 +300,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
              priv->device_attr.max_sge);
        attr.init = (struct ibv_exp_qp_init_attr){
                /* CQ to be associated with the send queue. */
-               .send_cq = tmpl.txq.cq,
+               .send_cq = tmpl.cq,
                /* CQ to be associated with the receive queue. */
-               .recv_cq = tmpl.txq.cq,
+               .recv_cq = tmpl.cq,
                .cap = {
                        /* Max number of outstanding WRs. */
                        .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -314,8 +320,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
                .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
                              IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
        };
-       tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
-       if (tmpl.txq.qp == NULL) {
+       tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+       if (tmpl.qp == NULL) {
                ret = (errno ? errno : EINVAL);
                ERROR("%p: QP creation failure: %s",
                      (void *)dev, strerror(ret));
@@ -327,30 +333,26 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
                /* Primary port number. */
                .port_num = priv->port
        };
-       ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
+       ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
                                (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
        if (ret) {
                ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
-       ret = txq_alloc_elts(&tmpl, desc);
-       if (ret) {
-               ERROR("%p: TXQ allocation failed: %s",
-                     (void *)dev, strerror(ret));
-               goto error;
-       }
+       txq_setup(&tmpl, txq_ctrl);
+       txq_alloc_elts(&tmpl, desc);
        attr.mod = (struct ibv_exp_qp_attr){
                .qp_state = IBV_QPS_RTR
        };
-       ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+       ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
        if (ret) {
                ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        attr.mod.qp_state = IBV_QPS_RTS;
-       ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+       ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
        if (ret) {
                ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
                      (void *)dev, strerror(ret));
@@ -359,7 +361,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
        attr.params = (struct ibv_exp_query_intf_params){
                .intf_scope = IBV_EXP_INTF_GLOBAL,
                .intf = IBV_EXP_INTF_CQ,
-               .obj = tmpl.txq.cq,
+               .obj = tmpl.cq,
        };
        tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
        if (tmpl.if_cq == NULL) {
@@ -371,10 +373,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
        attr.params = (struct ibv_exp_query_intf_params){
                .intf_scope = IBV_EXP_INTF_GLOBAL,
                .intf = IBV_EXP_INTF_QP_BURST,
-               .obj = tmpl.txq.qp,
-#ifdef HAVE_VERBS_VLAN_INSERTION
                .intf_version = 1,
-#endif
+               .obj = tmpl.qp,
                /* Enable multi-packet send if supported. */
                .family_flags =
                        (priv->mps ?
@@ -392,12 +392,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
        DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
        txq_cleanup(txq_ctrl);
        *txq_ctrl = tmpl;
-       txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
-       txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
-#ifdef HAVE_VERBS_VLAN_INSERTION
-       txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
-#endif
-       txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
        DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
        /* Pre-register known mempools. */
        rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
@@ -432,15 +426,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 {
        struct priv *priv = dev->data->dev_private;
        struct txq *txq = (*priv->txqs)[idx];
-       struct txq_ctrl *txq_ctrl;
+       struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
        int ret;

        if (mlx5_is_secondary())
                return -E_RTE_SECONDARY;

        priv_lock(priv);
-       if (txq)
-               txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+       if (!rte_is_power_of_2(desc)) {
+               desc = 1 << log2above(desc);
+               WARN("%p: increased number of descriptors in TX queue %u"
+                    " to the next power of two (%d)",
+                    (void *)dev, idx, desc);
+       }
        DEBUG("%p: configuring queue %u for %u descriptors",
              (void *)dev, idx, desc);
        if (idx >= priv->txqs_n) {
@@ -459,8 +457,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                (*priv->txqs)[idx] = NULL;
                txq_cleanup(txq_ctrl);
        } else {
-               txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
-                                            0, socket);
+               txq_ctrl =
+                       rte_calloc_socket("TXQ", 1,
+                                         sizeof(*txq_ctrl) +
+                                         desc * sizeof(struct rte_mbuf *),
+                                         0, socket);
                if (txq_ctrl == NULL) {
                        ERROR("%p: unable to allocate queue index %u",
                              (void *)dev, idx);
@@ -468,7 +469,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                        return -ENOMEM;
                }
        }
-       ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
+       ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
        if (ret)
                rte_free(txq_ctrl);
        else {
@@ -503,7 +504,7 @@ mlx5_tx_queue_release(void *dpdk_txq)
        if (txq == NULL)
                return;
        txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-       priv = txq->priv;
+       priv = txq_ctrl->priv;
        priv_lock(priv);
        for (i = 0; (i != priv->txqs_n); ++i)
                if ((*priv->txqs)[i] == txq) {
@@ -538,7 +539,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
                              uint16_t pkts_n)
 {
        struct txq *txq = dpdk_txq;
-       struct priv *priv = mlx5_secondary_data_setup(txq->priv);
+       struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+       struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
        struct priv *primary_priv;
        unsigned int index;

-- 
2.1.4

Reply via email to