Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
---
v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-mo...@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-mo...@mellanox.com
---

 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/mlx4.c           |  16 ++
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  12 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 8 files changed, 415 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@ Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@ Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..61b7844 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -519,6 +519,8 @@ struct mlx4_conf {
                .ports.present = 0,
        };
        unsigned int vf;
+       struct rte_mbuf mbuf;
+       uint64_t size_test = UINT_MAX;
        int i;
 
        (void)pci_drv;
@@ -677,6 +679,20 @@ struct mlx4_conf {
                                        IBV_RAW_PACKET_CAP_SCATTER_FCS);
                DEBUG("FCS stripping toggling is %ssupported",
                      priv->hw_fcs_strip ? "" : "not ");
+               /*
+                * No TSO SIZE is defined in DPDK, need to figure it out
+                * in order to see if we can support it.
+                */
+               mbuf.tso_segsz = size_test;
+               priv->tso =
+                       ((device_attr_ex.tso_caps.max_tso >= mbuf.tso_segsz) &&
+                        (device_attr_ex.tso_caps.supported_qpts &
+                         (1 << IBV_QPT_RAW_PACKET)));
+               if (priv->tso)
+                       priv->tso_max_payload_sz =
+                                       device_attr_ex.tso_caps.max_tso;
+               DEBUG("TSO is %ssupported",
+                     priv->tso ? "" : "not ");
                /* Configure the first MAC address by default. */
                err = mlx4_get_mac(priv, &mac.addr_bytes);
                if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..742d741 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/** Maximum packet headers size (L2+L3+L4) accepted for TSO, in bytes. */
+#define MLX4_MAX_TSO_HEADER 192  /* TODO: find the real value */
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@ struct priv {
        uint32_t hw_csum:1; /**< Checksum offload is supported. */
        uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
        uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+       uint32_t tso:1; /**< Transmit segmentation offload is supported */
+       uint32_t tso_max_payload_sz; /* Max TSO payload size being supported */
        uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
        struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
        struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index e15a3c1..915796b 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -40,6 +40,7 @@
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -97,6 +98,17 @@ struct mlx4_cq {
        int arm_sn; /**< Rx event counter. */
 };
 
+/*
+ * WQE LSO segment structure.
+ * Defined here for backward compatibility with rdma-core v17 and below.
+ * A similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg_ {
+       __be32 mss_hdr_size; /* Big-endian (MSS << 16) | header size. */
+       __be32 header[]; /* Packet headers, copied in by the Tx path. */
+};
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index a92da66..992d193 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,25 @@
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-       volatile struct mlx4_wqe_data_seg *dseg;
+       union {
+               volatile struct mlx4_wqe_data_seg *dseg;
+               volatile uint32_t *dst;
+       };
        uint32_t val;
 };
 
+/** Per-packet values computed once and reused while building a TSO WQE. */
+struct tso_info {
+       /* Total WQE size in bytes, rounded up to a multiple of the TXBB size. */
+       uint32_t wqe_size;
+       /* Size of the packet headers replicated in front of each segment. */
+       uint16_t tso_header_sz;
+       /* Size of the LSO segment (descriptor + headers) including padding. */
+       uint16_t wqe_tso_seg_size;
+       /* Raw WQE size in units of 16 bytes, without the TXBB padding. */
+       uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
        /*
@@ -377,6 +392,349 @@ struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+                            struct txq *txq,
+                            struct tso_info *tinfo)
+{
+       struct mlx4_sq *sq = &txq->msq;
+       const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+                                (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+       tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
+       if (tunneled)
+               tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+       if (unlikely(buf->tso_segsz == 0 || tinfo->tso_header_sz == 0)) {
+               DEBUG("%p: Invalid TSO parameters", (void *)txq);
+               return -EINVAL;
+       }
+       /* First segment must contain all TSO headers. */
+       if (unlikely(tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
+                    tinfo->tso_header_sz > buf->data_len)) {
+               DEBUG("%p: Invalid TSO header length", (void *)txq);
+               return -EINVAL;
+       }
+       /*
+        * Calculate the WQE TSO segment size.
+        * Notes:
+        * 1. An LSO segment must be padded such that the subsequent data
+        *    segment is 16-byte aligned.
+        * 2. The start address of the TSO segment is always 16-byte aligned.
+        */
+       tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg_) +
+                                           tinfo->tso_header_sz,
+                                           sizeof(struct mlx4_wqe_data_seg));
+       tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+                            tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+                            buf->nb_segs;
+       tinfo->wqe_size =
+               RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+                         MLX4_TXBB_SIZE);
+       /* Reject WQEs that exceed the HW limit or the free SQ space. */
+       if (sq->remain_size < tinfo->wqe_size ||
+           tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+               return -ENOMEM;
+       return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit .
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_segs(struct rte_mbuf *buf,
+                           struct txq *txq,
+                           const struct tso_info *tinfo,
+                           volatile struct mlx4_wqe_data_seg *dseg,
+                           struct pv *pv, int *pv_counter)
+{
+       uint32_t lkey;
+       int nb_segs = buf->nb_segs;
+       int nb_segs_txbb;
+       struct mlx4_sq *sq = &txq->msq;
+       struct rte_mbuf *sbuf = buf;
+       uint16_t sb_of = tinfo->tso_header_sz;
+       uint16_t data_len;
+
+       while (nb_segs > 0) {
+               /* Wrap dseg (by sq->size bytes) if it reached the queue end. */
+               if ((volatile uint8_t *)dseg >= sq->eob)
+                       dseg = (volatile struct mlx4_wqe_data_seg *)
+                               ((volatile uint8_t *)dseg - sq->size);
+               /* How many dseg entries are left in the current TXBB? */
+               nb_segs_txbb =
+                       (MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
+                       ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
+                       sizeof(struct mlx4_wqe_data_seg);
+               switch (nb_segs_txbb) {
+               case 4:
+                       /* Memory region key for this memory pool. */
+                       lkey = mlx4_tx_mb2mr(txq, sbuf);
+                       if (unlikely(lkey == (uint32_t)-1))
+                               goto lkey_err;
+                       dseg->addr =
+                           rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+                                                                    uintptr_t,
+                                                                    sb_of));
+                       dseg->lkey = lkey;
+                       /*
+                        * This data segment starts at the beginning of a new
+                        * TXBB, so its byte_count write must be postponed:
+                        * stash it in pv instead of writing it now.
+                        */
+                       pv[*pv_counter].dseg = dseg;
+                       /*
+                        * Zero length segment is treated as inline segment
+                        * with zero data.
+                        */
+                       data_len = sbuf->data_len - sb_of;
+                       pv[(*pv_counter)++].val =
+                               rte_cpu_to_be_32(data_len ?
+                                                data_len :
+                                                0x80000000);
+                       sb_of = 0;
+                       sbuf = sbuf->next;
+                       dseg++;
+                       if (--nb_segs == 0)
+                               break;
+                       /* fallthrough */
+               case 3:
+                       lkey = mlx4_tx_mb2mr(txq, sbuf);
+                       if (unlikely(lkey == (uint32_t)-1))
+                               goto lkey_err;
+                       data_len = sbuf->data_len - sb_of;
+                       mlx4_fill_tx_data_seg(dseg,
+                                       lkey,
+                                       rte_pktmbuf_mtod_offset(sbuf,
+                                                               uintptr_t,
+                                                               sb_of),
+                                       rte_cpu_to_be_32(data_len ?
+                                                        data_len :
+                                                        0x80000000));
+                       sb_of = 0;
+                       sbuf = sbuf->next;
+                       dseg++;
+                       if (--nb_segs == 0)
+                               break;
+                       /* fallthrough */
+               case 2:
+                       lkey = mlx4_tx_mb2mr(txq, sbuf);
+                       if (unlikely(lkey == (uint32_t)-1))
+                               goto lkey_err;
+                       data_len = sbuf->data_len - sb_of;
+                       mlx4_fill_tx_data_seg(dseg,
+                                       lkey,
+                                       rte_pktmbuf_mtod_offset(sbuf,
+                                                               uintptr_t,
+                                                               sb_of),
+                                       rte_cpu_to_be_32(data_len ?
+                                                        data_len :
+                                                        0x80000000));
+                       sb_of = 0;
+                       sbuf = sbuf->next;
+                       dseg++;
+                       if (--nb_segs == 0)
+                               break;
+                       /* fallthrough */
+               case 1:
+                       lkey = mlx4_tx_mb2mr(txq, sbuf);
+                       if (unlikely(lkey == (uint32_t)-1))
+                               goto lkey_err;
+                       data_len = sbuf->data_len - sb_of;
+                       mlx4_fill_tx_data_seg(dseg,
+                                       lkey,
+                                       rte_pktmbuf_mtod_offset(sbuf,
+                                                               uintptr_t,
+                                                               sb_of),
+                                       rte_cpu_to_be_32(data_len ?
+                                                        data_len :
+                                                        0x80000000));
+                       sb_of = 0;
+                       sbuf = sbuf->next;
+                       dseg++;
+                       --nb_segs;
+                       break;
+               default:
+                       /* Should never happen */
+                       ERROR("%p: invalid number of txbb data segments %d",
+                             (void *)txq, nb_segs_txbb);
+                       return -EINVAL;
+               }
+       }
+       return 0;
+lkey_err:
+       DEBUG("%p: unable to get MP <-> MR association",
+             (void *)txq);
+       return -EFAULT;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *  This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param tseg
+ *   Pointer to the TSO header field in the TSO WQE.
+ * @param pv
+ *   Pointer to a stash area for saving the first 32bit word of each TXBB
+ *   used for the TSO WQE.
+ * @param pv_counter
+ *   Current location in the stash.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+                          struct txq *txq,
+                          const struct tso_info *tinfo,
+                          volatile struct mlx4_wqe_lso_seg_ *tseg,
+                           struct pv *pv, int *pv_counter)
+{
+       struct mlx4_sq *sq = &txq->msq;
+       int remain_sz = tinfo->tso_header_sz;
+       char *from = rte_pktmbuf_mtod(buf, char *);
+       uint16_t txbb_avail_space;
+       int copy_sz;
+       /* Union to drop the volatile qualifier when copying the TSO header. */
+       union {
+               volatile uint8_t *vto;
+               uint8_t *to;
+       } thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+       /*
+        * TSO data always starts at offset 20 from the beginning of the TXBB
+        * (16 byte ctrl + 4 byte TSO desc). Since each TXBB is 64 byte aligned
+        * we can write the first 44 TSO header bytes without worrying about
+        * TxQ wrapping or overwriting the first 32 bit word of a TXBB.
+        */
+       txbb_avail_space = MLX4_TXBB_SIZE -
+                          (sizeof(struct mlx4_wqe_ctrl_seg) +
+                           sizeof(struct mlx4_wqe_lso_seg_));
+       copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+       rte_memcpy(thdr.to, from, copy_sz);
+       remain_sz -= copy_sz;
+       while (remain_sz > 0) {
+               from += copy_sz;
+               thdr.to += copy_sz;
+               /* Start of a new TXBB: check for TxQ wrap-around first. */
+               if (thdr.to >= sq->eob)
+                       thdr.vto = sq->buf;
+               /* New TXBB: stash its first 32 bits in pv, don't write yet. */
+               pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
+               pv[(*pv_counter)++].val = *((uint32_t *)from);
+               from += sizeof(uint32_t);
+               thdr.to += sizeof(uint32_t);
+               remain_sz -= sizeof(uint32_t);
+               if (remain_sz <= 0)
+                       break;
+               /* Copy what fits in the remainder of this TXBB directly. */
+               txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+               copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+               rte_memcpy(thdr.to, from, copy_sz);
+               remain_sz -= copy_sz;
+       }
+       /* TODO: handle PID and IPID ? */
+       tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+                                             tinfo->tso_header_sz);
+       return 0;
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+                 volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+       volatile struct mlx4_wqe_data_seg *dseg;
+       volatile struct mlx4_wqe_lso_seg_ *tseg =
+               (volatile struct mlx4_wqe_lso_seg_ *)(ctrl + 1);
+       struct mlx4_sq *sq = &txq->msq;
+       struct tso_info tinfo;
+       struct pv *pv = (struct pv *)txq->bounce_buf;
+       int pv_counter = 0;
+       int ret;
+
+       ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+       if (ret)
+               goto error;
+       ret = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo,
+                                        tseg, pv, &pv_counter);
+       if (ret)
+               goto error;
+       /* Data segments follow the padded LSO segment; wrap if needed. */
+       dseg = (volatile struct mlx4_wqe_data_seg *)
+                               ((uintptr_t)tseg + tinfo.wqe_tso_seg_size);
+       if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+               dseg = (volatile struct mlx4_wqe_data_seg *)
+                                       ((uintptr_t)dseg - sq->size);
+       ret = mlx4_tx_burst_fill_tso_segs(buf, txq, &tinfo,
+                                         dseg, pv, &pv_counter);
+       if (ret)
+               goto error;
+       /* Write the first DWORD of each TXBB saved earlier. */
+       if (pv_counter) {
+               /* Need a barrier here before writing the first TXBB word. */
+               rte_io_wmb();
+               for (--pv_counter; pv_counter >= 0; pv_counter--)
+                       *pv[pv_counter].dst = pv[pv_counter].val;
+       }
+       ctrl->fence_size = tinfo.fence_size;
+       sq->remain_size -= tinfo.wqe_size;
+       /* Align next WQE address to the next TXBB. */
+       return (volatile struct mlx4_wqe_ctrl_seg *)
+               ((volatile uint8_t *)ctrl + tinfo.wqe_size);
+error:
+       txq->stats.odropped++;
+       rte_errno = -ret; /* ret is negative, rte_errno must be positive. */
+       return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -569,6 +927,7 @@ struct pv {
                        uint16_t flags16[2];
                } srcrb;
                uint32_t lkey;
+               bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
                /* Clean up old buffer. */
                if (likely(elt->buf != NULL)) {
@@ -587,7 +946,16 @@ struct pv {
                        } while (tmp != NULL);
                }
                RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-               if (buf->nb_segs == 1) {
+               if (tso) {
+                       /* Change opcode to TSO */
+                       owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+                       owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+                       ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+                       if (!ctrl_next) {
+                               elt->buf = NULL;
+                               break;
+                       }
+               } else if (buf->nb_segs == 1) {
                        /* Validate WQE space in the send queue. */
                        if (sq->remain_size < MLX4_TXBB_SIZE) {
                                elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@ struct mlx4_txq_stats {
        unsigned int idx; /**< Mapping index. */
        uint64_t opackets; /**< Total of successfully sent packets. */
        uint64_t obytes; /**< Total of successfully sent bytes. */
-       uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+       uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@
                             DEV_TX_OFFLOAD_UDP_CKSUM |
                             DEV_TX_OFFLOAD_TCP_CKSUM);
        }
-       if (priv->hw_csum_l2tun)
+       if (priv->tso)
+               offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+       if (priv->hw_csum_l2tun) {
                offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+               if (priv->tso)
+                       offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+                                    DEV_TX_OFFLOAD_GRE_TNL_TSO);
+       }
        return offloads;
 }
 
-- 
1.8.3.1

Reply via email to