This commit implements TCP segmentation offload (TSO) in TAP.
The librte_gso library is used to segment large TCP payloads (e.g. packets
of 64K bytes) into smaller, MTU-sized buffers.
By supporting the TSO offload capability in software, a TAP device can be
used as a failsafe sub-device and be paired with another PCI device which
supports the TSO capability in hardware.

For more details on the librte_gso implementation, please refer to the DPDK
documentation.
The number of newly generated TCP TSO segments is limited to 64.

Signed-off-by: Ophir Munk <ophi...@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 mk/rte.app.mk                 |   4 +-
 4 files changed, 139 insertions(+), 30 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index d77a64f..fe62ab3 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -17,6 +17,7 @@
 #include <rte_ip.h>
 #include <rte_string_fns.h>
 
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
@@ -408,7 +409,8 @@ tap_tx_offload_get_port_capa(void)
        return DEV_TX_OFFLOAD_MULTI_SEGS |
               DEV_TX_OFFLOAD_IPV4_CKSUM |
               DEV_TX_OFFLOAD_UDP_CKSUM |
-              DEV_TX_OFFLOAD_TCP_CKSUM;
+              DEV_TX_OFFLOAD_TCP_CKSUM |
+              DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static uint64_t
@@ -417,7 +419,8 @@ tap_tx_offload_get_queue_capa(void)
        return DEV_TX_OFFLOAD_MULTI_SEGS |
               DEV_TX_OFFLOAD_IPV4_CKSUM |
               DEV_TX_OFFLOAD_UDP_CKSUM |
-              DEV_TX_OFFLOAD_TCP_CKSUM;
+              DEV_TX_OFFLOAD_TCP_CKSUM |
+              DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static bool
@@ -500,23 +503,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
        }
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+                       struct rte_mbuf **pmbufs, uint16_t l234_hlen,
+                       uint16_t *num_packets, unsigned long *num_tx_bytes)
 {
-       struct tx_queue *txq = queue;
-       uint16_t num_tx = 0;
-       unsigned long num_tx_bytes = 0;
-       uint32_t max_size;
        int i;
 
-       if (unlikely(nb_pkts == 0))
-               return 0;
-
-       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-       for (i = 0; i < nb_pkts; i++) {
-               struct rte_mbuf *mbuf = bufs[num_tx];
+       for (i = 0; i < num_mbufs; i++) {
+               struct rte_mbuf *mbuf = pmbufs[i];
                struct iovec iovecs[mbuf->nb_segs + 2];
                struct tun_pi pi = { .flags = 0, .proto = 0x00 };
                struct rte_mbuf *seg = mbuf;
@@ -524,17 +519,13 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
                int proto;
                int n;
                int j;
-               int k; /* first index in iovecs for copying segments */
-               uint16_t l234_hlen; /* length of layers 2,3,4 headers */
+               int k; /* current index in iovecs for copying segments */
                uint16_t seg_len; /* length of first segment */
                uint16_t nb_segs;
                uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
                uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
                uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
 
-               /* stats.errs will be incremented */
-               if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-                       break;
                l4_cksum = NULL;
                /*
                 * TUN and TAP are created with IFF_NO_PI disabled.
@@ -567,9 +558,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
                        l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
                        if (seg_len < l234_hlen)
                                break;
-
-                       /* To change checksums, work on a
-                        * copy of l2, l3 l4 headers.
+                       /* To change checksums, work on a copy of l2, l3
+                        * headers + l4 pseudo header
                         */
                        rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
                                        l234_hlen);
@@ -609,13 +599,78 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
                n = writev(txq->fd, iovecs, j);
                if (n <= 0)
                        break;
+               (*num_packets)++;
+               (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+       }
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct tx_queue *txq = queue;
+       uint16_t num_tx = 0;
+       uint16_t num_packets = 0;
+       unsigned long num_tx_bytes = 0;
+       uint16_t tso_segsz = 0;
+       uint16_t hdrs_len;
+       uint32_t max_size;
+       int i;
+       uint64_t tso;
+       int ret;
+
+       if (unlikely(nb_pkts == 0))
+               return 0;
+
+       struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+       for (i = 0; i < nb_pkts; i++) {
+               struct rte_mbuf *mbuf_in = bufs[num_tx];
+               struct rte_mbuf **mbuf;
+               uint16_t num_mbufs;
+
+               tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+               if (tso) {
+                       struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+                       assert(gso_ctx != NULL);
+                       /* gso size is calculated without ETHER_CRC_LEN */
+                       hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
+                                       mbuf_in->l4_len;
+                       tso_segsz = mbuf_in->tso_segsz + hdrs_len;
+                       if (unlikely(tso_segsz == hdrs_len) ||
+                               tso_segsz > max_size) {
+                               txq->stats.errs++;
+                               break;
+                       }
+                       gso_ctx->gso_size = tso_segsz;
+                       ret = rte_gso_segment(mbuf_in, /* packet to segment */
+                               gso_ctx, /* gso control block */
+                               (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+                               RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+                       /* ret contains the number of new created mbufs */
+                       if (ret < 0)
+                               break;
 
+                       mbuf = gso_mbufs;
+                       num_mbufs = ret;
+               } else {
+                       /* stats.errs will be incremented */
+                       if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+                               break;
+
+                       mbuf = &mbuf_in;
+                       num_mbufs = 1;
+               }
+
+               tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
+                               &num_packets, &num_tx_bytes);
                num_tx++;
-               num_tx_bytes += mbuf->pkt_len;
-               rte_pktmbuf_free(mbuf);
+               rte_pktmbuf_free(mbuf_in);
        }
 
-       txq->stats.opackets += num_tx;
+       txq->stats.opackets += num_packets;
        txq->stats.errs += nb_pkts - num_tx;
        txq->stats.obytes += num_tx_bytes;
 
@@ -1064,32 +1119,77 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
        return 0;
 }
 
+#define TAP_GSO_MBUFS_NUM 64
+#define TAP_GSO_MBUF_SEG_SIZE 128
+
+static int
+tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
+{
+       uint32_t gso_types;
+       char pool_name[64];
+
+       /* Create private mbuf pool with 128 bytes size per mbuf
+        * use this pool for both direct and indirect mbufs
+        */
+
+       struct rte_mempool *mp;      /* Mempool for GSO packets */
+       /* initialize GSO context */
+       gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+               DEV_TX_OFFLOAD_GRE_TNL_TSO;
+       snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
+       mp = rte_mempool_lookup((const char *)pool_name);
+       if (!mp) {
+               mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
+                       0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
+                       SOCKET_ID_ANY);
+               if (!mp) {
+                       struct pmd_internals *pmd = dev->data->dev_private;
+                       RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for 
device %s\n",
+                               pmd->name, dev->device->name);
+                       return -1;
+               }
+       }
+
+       gso_ctx->direct_pool = mp;
+       gso_ctx->indirect_pool = mp;
+       gso_ctx->gso_types = gso_types;
+       gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
+       gso_ctx->flag = 0;
+
+       return 0;
+}
+
 static int
 tap_setup_queue(struct rte_eth_dev *dev,
                struct pmd_internals *internals,
                uint16_t qid,
                int is_rx)
 {
+       int ret;
        int *fd;
        int *other_fd;
        const char *dir;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rx_queue *rx = &internals->rxq[qid];
        struct tx_queue *tx = &internals->txq[qid];
+       struct rte_gso_ctx *gso_ctx;
 
        if (is_rx) {
                fd = &rx->fd;
                other_fd = &tx->fd;
                dir = "rx";
+               gso_ctx = NULL;
        } else {
                fd = &tx->fd;
                other_fd = &rx->fd;
                dir = "tx";
+               gso_ctx = &tx->gso_ctx;
        }
        if (*fd != -1) {
                /* fd for this queue already exists */
                RTE_LOG(DEBUG, PMD, "%s: fd %d for %s queue qid %d exists\n",
                        pmd->name, *fd, dir, qid);
+               gso_ctx = NULL;
        } else if (*other_fd != -1) {
                /* Only other_fd exists. dup it */
                *fd = dup(*other_fd);
@@ -1116,6 +1216,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
 
        tx->mtu = &dev->data->mtu;
        rx->rxmode = &dev->data->dev_conf.rxmode;
+       if (gso_ctx) {
+               ret = tap_gso_ctx_setup(gso_ctx, dev);
+               if (ret)
+                       return -1;
+       }
 
        return *fd;
 }
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 25b65bf..69f746f 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES TAP_MAX_QUEUES
@@ -22,6 +23,8 @@
 #define RTE_PMD_TAP_MAX_QUEUES 1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
        uint64_t opackets;              /* Number of output packets */
        uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@ struct tx_queue {
        uint16_t *mtu;                  /* Pointer to MTU from dev_data */
        uint16_t csum:1;                /* Enable checksum offloading */
        struct pkt_stats stats;         /* Stats for this TX queue */
+       struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 0e18d0f..cd09dc6 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,8 +66,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
 _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
 _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
 # librte_acl needs --whole-archive because of weak functions
@@ -85,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
-- 
2.7.4

Reply via email to