This commit implements TCP segmentation offload in TAP.
librte_gso library is used to segment large TCP payloads (e.g. packets
of 64K bytes size) into smaller MTU size buffers.
By supporting TSO offload capability in software a TAP device can be used
as a failsafe sub device and be paired with another PCI device which
supports TSO capability in HW.

For more details on librte_gso implementation please refer to dpdk
documentation.
The number of newly generated TCP TSO segments is limited to 64.

Reviewed-by: Raslan Darawsheh <rasl...@mellanox.com>
Signed-off-by: Ophir Munk <ophi...@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 174 +++++++++++++++++++++++++++++++++++-------
 drivers/net/tap/rte_eth_tap.h |   3 +
 mk/rte.app.mk                 |   4 +-
 4 files changed, 154 insertions(+), 29 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 8903646..d137f58 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -17,6 +17,7 @@
 #include <rte_ip.h>
 #include <rte_string_fns.h>
 
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
@@ -55,6 +56,12 @@
 #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
 #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
 
+#define TAP_GSO_MBUFS_PER_CORE 128
+#define TAP_GSO_MBUF_SEG_SIZE  128
+#define TAP_GSO_MBUF_CACHE_SIZE        4
+#define TAP_GSO_MBUFS_NUM \
+       (TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
+
 static struct rte_vdev_driver pmd_tap_drv;
 static struct rte_vdev_driver pmd_tun_drv;
 
@@ -412,7 +419,8 @@ tap_tx_offload_get_queue_capa(void)
        return DEV_TX_OFFLOAD_MULTI_SEGS |
               DEV_TX_OFFLOAD_IPV4_CKSUM |
               DEV_TX_OFFLOAD_UDP_CKSUM |
-              DEV_TX_OFFLOAD_TCP_CKSUM;
+              DEV_TX_OFFLOAD_TCP_CKSUM |
+              DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 /* Finalize l4 checksum calculation */
@@ -480,23 +488,16 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned 
int l2_len,
        }
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+                       struct rte_mbuf **pmbufs,
+                       uint16_t *num_packets, unsigned long *num_tx_bytes)
 {
-       struct tx_queue *txq = queue;
-       uint16_t num_tx = 0;
-       unsigned long num_tx_bytes = 0;
-       uint32_t max_size;
        int i;
+       uint16_t l234_hlen;
 
-       if (unlikely(nb_pkts == 0))
-               return 0;
-
-       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-       for (i = 0; i < nb_pkts; i++) {
-               struct rte_mbuf *mbuf = bufs[num_tx];
+       for (i = 0; i < num_mbufs; i++) {
+               struct rte_mbuf *mbuf = pmbufs[i];
                struct iovec iovecs[mbuf->nb_segs + 2];
                struct tun_pi pi = { .flags = 0, .proto = 0x00 };
                struct rte_mbuf *seg = mbuf;
@@ -504,8 +505,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
                int proto;
                int n;
                int j;
-               int k; /* first index in iovecs for copying segments */
-               uint16_t l234_hlen; /* length of layers 2,3,4 headers */
+               int k; /* current index in iovecs for copying segments */
                uint16_t seg_len; /* length of first segment */
                uint16_t nb_segs;
                uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
@@ -513,10 +513,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
                uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
                uint16_t is_cksum = 0; /* in case cksum should be offloaded */
 
-               /* stats.errs will be incremented */
-               if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-                       break;
-
                l4_cksum = NULL;
                if (txq->type == ETH_TUNTAP_TYPE_TUN) {
                        /*
@@ -558,8 +554,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
                        if (seg_len < l234_hlen)
                                break;
 
-                       /* To change checksums, work on a
-                        * copy of l2, l3 l4 headers.
+                       /* To change checksums, work on a * copy of l2, l3
+                        * headers + l4 pseudo header
                         */
                        rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
                                        l234_hlen);
@@ -603,13 +599,90 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                n = writev(txq->fd, iovecs, j);
                if (n <= 0)
                        break;
+               (*num_packets)++;
+               (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+       }
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct tx_queue *txq = queue;
+       uint16_t num_tx = 0;
+       uint16_t num_packets = 0;
+       unsigned long num_tx_bytes = 0;
+       uint32_t max_size;
+       int i;
+
+       if (unlikely(nb_pkts == 0))
+               return 0;
 
+       struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+       for (i = 0; i < nb_pkts; i++) {
+               struct rte_mbuf *mbuf_in = bufs[num_tx];
+               struct rte_mbuf **mbuf;
+               uint16_t num_mbufs = 0;
+               uint16_t tso_segsz = 0;
+               int ret;
+               uint16_t hdrs_len;
+               int j;
+               uint64_t tso;
+
+               tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+               if (tso) {
+                       struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+
+                       assert(gso_ctx != NULL);
+
+                       /* TCP segmentation implies TCP checksum offload */
+                       mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
+
+                       /* gso size is calculated without ETHER_CRC_LEN */
+                       hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
+                                       mbuf_in->l4_len;
+                       tso_segsz = mbuf_in->tso_segsz + hdrs_len;
+                       if (unlikely(tso_segsz == hdrs_len) ||
+                               tso_segsz > *txq->mtu) {
+                               txq->stats.errs++;
+                               break;
+                       }
+                       gso_ctx->gso_size = tso_segsz;
+                       ret = rte_gso_segment(mbuf_in, /* packet to segment */
+                               gso_ctx, /* gso control block */
+                               (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+                               RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+                       /* ret contains the number of new created mbufs */
+                       if (ret < 0)
+                               break;
+
+                       mbuf = gso_mbufs;
+                       num_mbufs = ret;
+               } else {
+                       /* stats.errs will be incremented */
+                       if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+                               break;
+
+                       /* ret 0 indicates no new mbufs were created */
+                       ret = 0;
+                       mbuf = &mbuf_in;
+                       num_mbufs = 1;
+               }
+
+               tap_write_mbufs(txq, num_mbufs, mbuf,
+                               &num_packets, &num_tx_bytes);
                num_tx++;
-               num_tx_bytes += mbuf->pkt_len;
-               rte_pktmbuf_free(mbuf);
+               /* free original mbuf */
+               rte_pktmbuf_free(mbuf_in);
+               /* free tso mbufs */
+               for (j = 0; j < ret; j++)
+                       rte_pktmbuf_free(mbuf[j]);
        }
 
-       txq->stats.opackets += num_tx;
+       txq->stats.opackets += num_packets;
        txq->stats.errs += nb_pkts - num_tx;
        txq->stats.obytes += num_tx_bytes;
 
@@ -1071,31 +1144,75 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr 
*mac_addr)
 }
 
 static int
+tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
+{
+       uint32_t gso_types;
+       char pool_name[64];
+
+       /*
+        * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
+        * size per mbuf use this pool for both direct and indirect mbufs
+        */
+
+       struct rte_mempool *mp;      /* Mempool for GSO packets */
+
+       /* initialize GSO context */
+       gso_types = DEV_TX_OFFLOAD_TCP_TSO;
+       snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
+       mp = rte_mempool_lookup((const char *)pool_name);
+       if (!mp) {
+               mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
+                       TAP_GSO_MBUF_CACHE_SIZE, 0,
+                       RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
+                       SOCKET_ID_ANY);
+               if (!mp) {
+                       struct pmd_internals *pmd = dev->data->dev_private;
+                       RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for 
device %s\n",
+                               pmd->name, dev->device->name);
+                       return -1;
+               }
+       }
+
+       gso_ctx->direct_pool = mp;
+       gso_ctx->indirect_pool = mp;
+       gso_ctx->gso_types = gso_types;
+       gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
+       gso_ctx->flag = 0;
+
+       return 0;
+}
+
+static int
 tap_setup_queue(struct rte_eth_dev *dev,
                struct pmd_internals *internals,
                uint16_t qid,
                int is_rx)
 {
+       int ret;
        int *fd;
        int *other_fd;
        const char *dir;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rx_queue *rx = &internals->rxq[qid];
        struct tx_queue *tx = &internals->txq[qid];
+       struct rte_gso_ctx *gso_ctx;
 
        if (is_rx) {
                fd = &rx->fd;
                other_fd = &tx->fd;
                dir = "rx";
+               gso_ctx = NULL;
        } else {
                fd = &tx->fd;
                other_fd = &rx->fd;
                dir = "tx";
+               gso_ctx = &tx->gso_ctx;
        }
        if (*fd != -1) {
                /* fd for this queue already exists */
                TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
                        pmd->name, *fd, dir, qid);
+               gso_ctx = NULL;
        } else if (*other_fd != -1) {
                /* Only other_fd exists. dup it */
                *fd = dup(*other_fd);
@@ -1120,6 +1237,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
 
        tx->mtu = &dev->data->mtu;
        rx->rxmode = &dev->data->dev_conf.rxmode;
+       if (gso_ctx) {
+               ret = tap_gso_ctx_setup(gso_ctx, dev);
+               if (ret)
+                       return -1;
+       }
 
        tx->type = pmd->type;
 
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 7b21d0d..44e2773 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 #include "tap_log.h"
 
 #ifdef IFF_MULTI_QUEUE
@@ -22,6 +23,7 @@
 #else
 #define RTE_PMD_TAP_MAX_QUEUES 1
 #endif
+#define MAX_GSO_MBUFS 64
 
 enum rte_tuntap_type {
        ETH_TUNTAP_TYPE_UNKNOWN,
@@ -59,6 +61,7 @@ struct tx_queue {
        uint16_t *mtu;                  /* Pointer to MTU from dev_data */
        uint16_t csum:1;                /* Enable checksum offloading */
        struct pkt_stats stats;         /* Stats for this TX queue */
+       struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 1e32c83..e2ee879 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
 _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
 _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
 # librte_acl needs --whole-archive because of weak functions
@@ -61,6 +59,8 @@ endif
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
-- 
2.7.4

Reply via email to