> On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophi...@mellanox.com> wrote:
> 
> This commit implements TCP segmentation offload (TSO) in the TAP PMD.
> The librte_gso library is used to segment large TCP payloads (e.g.
> packets of 64K bytes) into smaller, MTU-sized buffers.
> By supporting the TSO capability in software, a TAP device can be used
> as a fail-safe sub-device and be paired with another PCI device that
> supports TSO in hardware.
> 
> For more details on the librte_gso implementation, please refer to the
> DPDK documentation.
> The number of newly generated TCP TSO segments is limited to 64.
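
A quick sanity check on that 64-segment cap (my arithmetic, assuming a
typical 1460-byte Ethernet MSS, which the patch does not state):

    #include <stdio.h>

    int main(void)
    {
        unsigned int payload = 64 * 1024; /* 64K-byte payload, as above */
        unsigned int mss = 1460;          /* assumed typical Ethernet MSS */
        unsigned int segs = (payload + mss - 1) / mss; /* ceiling division */

        /* Prints 45, comfortably below the 64-segment limit. */
        printf("segments needed: %u\n", segs);
        return 0;
    }
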
> 
> Reviewed-by: Raslan Darawsheh <rasl...@mellanox.com>
> Signed-off-by: Ophir Munk <ophi...@mellanox.com>
> ---
> drivers/net/tap/Makefile      |   2 +-
> drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
> drivers/net/tap/rte_eth_tap.h |   3 +
> mk/rte.app.mk                 |   4 +-
> 4 files changed, 138 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index ccc5c5f..3243365 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -24,7 +24,7 @@ CFLAGS += -I.
> CFLAGS += $(WERROR_FLAGS)
> LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
> -LDLIBS += -lrte_bus_vdev
> +LDLIBS += -lrte_bus_vdev -lrte_gso
> 
> CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
> 
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index c19f053..62b931f 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -17,6 +17,7 @@
> #include <rte_ip.h>
> #include <rte_string_fns.h>
> 
> +#include <assert.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/socket.h>
> @@ -55,6 +56,9 @@
> #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
> #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
> 
> +#define TAP_GSO_MBUFS_NUM    128
> +#define TAP_GSO_MBUF_SEG_SIZE        128
> +
> static struct rte_vdev_driver pmd_tap_drv;
> static struct rte_vdev_driver pmd_tun_drv;
> 
> @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
>       return DEV_TX_OFFLOAD_MULTI_SEGS |
>              DEV_TX_OFFLOAD_IPV4_CKSUM |
>              DEV_TX_OFFLOAD_UDP_CKSUM |
> -            DEV_TX_OFFLOAD_TCP_CKSUM;
> +            DEV_TX_OFFLOAD_TCP_CKSUM |
> +            DEV_TX_OFFLOAD_TCP_TSO;
> }
> 
> /* Finalize l4 checksum calculation */
> @@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
>       }
> }
> 
> -/* Callback to handle sending packets from the tap interface
> - */
> -static uint16_t
> -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +static inline void
> +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> +                     struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> +                     uint16_t *num_packets, unsigned long *num_tx_bytes)
> {
> -     struct tx_queue *txq = queue;
> -     uint16_t num_tx = 0;
> -     unsigned long num_tx_bytes = 0;
> -     uint32_t max_size;
>       int i;
> 
> -     if (unlikely(nb_pkts == 0))
> -             return 0;
> -
> -     max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> -     for (i = 0; i < nb_pkts; i++) {
> -             struct rte_mbuf *mbuf = bufs[num_tx];
> +     for (i = 0; i < num_mbufs; i++) {
> +             struct rte_mbuf *mbuf = pmbufs[i];
>               struct iovec iovecs[mbuf->nb_segs + 2];
>               struct tun_pi pi = { .flags = 0, .proto = 0x00 };
>               struct rte_mbuf *seg = mbuf;
> @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>               int proto;
>               int n;
>               int j;
> -             int k; /* first index in iovecs for copying segments */
> -             uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> +             int k; /* current index in iovecs for copying segments */
>               uint16_t seg_len; /* length of first segment */
>               uint16_t nb_segs;
>               uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
> @@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>               uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
>               uint16_t is_cksum = 0; /* in case cksum should be offloaded */
> 
> -             /* stats.errs will be incremented */
> -             if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> -                     break;
> -
>               l4_cksum = NULL;
>               if (txq->type == ETH_TUNTAP_TYPE_TUN) {
>                       /*
> @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>                       l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
>                       if (seg_len < l234_hlen)
>                               break;
> -
> -                     /* To change checksums, work on a
> -                      * copy of l2, l3 l4 headers.
> +                     /* To change checksums, work on a copy of l2, l3
> +                      * headers + l4 pseudo header
>                        */
>                       rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
>                                       l234_hlen);
> @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>               n = writev(txq->fd, iovecs, j);
>               if (n <= 0)
>                       break;
> +             (*num_packets)++;
> +             (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> +     }
> +}
> +
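
A side note on the pattern in tap_write_mbufs() above: the tun_pi
header, the (possibly modified) header copy, and each mbuf segment are
gathered into one iovec array and handed to writev(), so the kernel
sees exactly one packet per call. A minimal, hypothetical sketch of
that gather-write, not the driver code itself:

    #include <sys/uio.h>

    /* Hypothetical helper: emit a two-piece packet (header + payload)
     * to a tap fd in a single system call. */
    static ssize_t
    gather_write(int fd, void *hdr, size_t hdr_len,
                 void *payload, size_t payload_len)
    {
        struct iovec iov[2] = {
            { .iov_base = hdr,     .iov_len = hdr_len },
            { .iov_base = payload, .iov_len = payload_len },
        };

        /* writev() returns the total bytes written, or -1 on error;
         * on a tap fd each successful call transmits one packet. */
        return writev(fd, iov, 2);
    }
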
> +/* Callback to handle sending packets from the tap interface
> + */
> +static uint16_t
> +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +{
> +     struct tx_queue *txq = queue;
> +     uint16_t num_tx = 0;
> +     uint16_t num_packets = 0;
> +     unsigned long num_tx_bytes = 0;
> +     uint16_t tso_segsz = 0;
> +     uint16_t hdrs_len;
> +     uint32_t max_size;
> +     int i;
> +     uint64_t tso;
> +     int ret;
> 
> +     if (unlikely(nb_pkts == 0))
> +             return 0;
> +
> +     struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> +     max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> +     for (i = 0; i < nb_pkts; i++) {
> +             struct rte_mbuf *mbuf_in = bufs[num_tx];
> +             struct rte_mbuf **mbuf;
> +             uint16_t num_mbufs;
> +
> +             tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> +             if (tso) {
> +                     struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;

Add a blank line here after the declaration.
> +                     assert(gso_ctx != NULL);
> +                     /* TCP segmentation implies TCP checksum offload */
> +                     mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
> +                     /* gso size is calculated without ETHER_CRC_LEN */
> +                     hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> +                                     mbuf_in->l4_len;
> +                     tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> +                     if (unlikely(tso_segsz == hdrs_len) ||
> +                             tso_segsz > *txq->mtu) {
> +                             txq->stats.errs++;
> +                             break;
> +                     }
> +                     gso_ctx->gso_size = tso_segsz;
> +                     ret = rte_gso_segment(mbuf_in, /* packet to segment */
> +                             gso_ctx, /* gso control block */
> +                             (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
> +                             RTE_DIM(gso_mbufs)); /* max tso mbufs */
> +
> +                     /* ret contains the number of newly created mbufs */
> +                     if (ret < 0)
> +                             break;
> +
> +                     mbuf = gso_mbufs;
> +                     num_mbufs = ret;
> +             } else {
> +                     /* stats.errs will be incremented */
> +                     if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> +                             break;
> +
> +                     mbuf = &mbuf_in;
> +                     num_mbufs = 1;
> +             }
> +
> +             tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> +                             &num_packets, &num_tx_bytes);
>               num_tx++;
> -             num_tx_bytes += mbuf->pkt_len;
> -             rte_pktmbuf_free(mbuf);
> +             rte_pktmbuf_free(mbuf_in);
>       }
> 
> -     txq->stats.opackets += num_tx;
> +     txq->stats.opackets += num_packets;
>       txq->stats.errs += nb_pkts - num_tx;
>       txq->stats.obytes += num_tx_bytes;
> 
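
For anyone reading along: as I understand the librte_gso API used
above, rte_gso_segment() fills the caller-supplied array with the new
segments and returns their count, or a negative errno-style value
(e.g. -ENOMEM, -EINVAL) on failure. A minimal sketch of that calling
convention, with illustrative names only:

    #include <rte_gso.h>
    #include <rte_mbuf.h>

    /* Illustrative wrapper around the call made in pmd_tx_burst(). */
    static int
    tso_segment_one(struct rte_gso_ctx *ctx, struct rte_mbuf *pkt,
                    struct rte_mbuf **segs, uint16_t nb_segs_max)
    {
        int nb_segs = rte_gso_segment(pkt, ctx, segs, nb_segs_max);

        if (nb_segs < 0)
            return nb_segs; /* out of mbufs or bad parameters */

        /* On success, segs[0..nb_segs-1] hold the newly created
         * packets; the caller must transmit and eventually free
         * them. */
        return nb_segs;
    }
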
> @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
> }
> 
> static int
> +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
> +{
> +     uint32_t gso_types;
> +     char pool_name[64];
> +
> +     /*
> +      * Create a private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> +      * of data room per mbuf; use this pool for both direct and
> +      * indirect mbufs.
> +      */
> +
> +     struct rte_mempool *mp;      /* Mempool for GSO packets */

Add a blank line after the declaration.
> +     /* initialize GSO context */
> +     gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> +     snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
> +     mp = rte_mempool_lookup((const char *)pool_name);
> +     if (!mp) {
> +             mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
> +                     0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
> +                     SOCKET_ID_ANY);
> +             if (!mp) {
> +                     struct pmd_internals *pmd = dev->data->dev_private;
> +                     RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
> +                             pmd->name, dev->device->name);
> +                     return -1;
> +             }
> +     }
> +
> +     gso_ctx->direct_pool = mp;
> +     gso_ctx->indirect_pool = mp;
> +     gso_ctx->gso_types = gso_types;
> +     gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> +     gso_ctx->flag = 0;
> +
> +     return 0;
> +}
> +
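
On the 128-byte data room used above: my reading of the GSO library is
that each output segment is a two-part mbuf, a direct mbuf carrying
only the copied packet headers plus an indirect mbuf referencing the
original payload, so a small per-mbuf data room suffices for the
header part. A hedged sketch of that sizing (constants mirror the
patch; the comments are my reasoning, not the author's):

    #include <rte_mbuf.h>

    #define GSO_MBUFS_NUM     128 /* TAP_GSO_MBUFS_NUM in the patch */
    #define GSO_MBUF_SEG_SIZE 128 /* fits l2+l3+l4 headers (~54 bytes
                                   * for plain TCP/IPv4) with margin */

    static struct rte_mempool *
    make_gso_pool(const char *name)
    {
        /* args: name, n, cache_size, priv_size, data_room, socket */
        return rte_pktmbuf_pool_create(name, GSO_MBUFS_NUM, 0, 0,
                RTE_PKTMBUF_HEADROOM + GSO_MBUF_SEG_SIZE,
                SOCKET_ID_ANY);
    }
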
> +static int
> tap_setup_queue(struct rte_eth_dev *dev,
>               struct pmd_internals *internals,
>               uint16_t qid,
>               int is_rx)
> {
> +     int ret;
>       int *fd;
>       int *other_fd;
>       const char *dir;
>       struct pmd_internals *pmd = dev->data->dev_private;
>       struct rx_queue *rx = &internals->rxq[qid];
>       struct tx_queue *tx = &internals->txq[qid];
> +     struct rte_gso_ctx *gso_ctx;
> 
>       if (is_rx) {
>               fd = &rx->fd;
>               other_fd = &tx->fd;
>               dir = "rx";
> +             gso_ctx = NULL;
>       } else {
>               fd = &tx->fd;
>               other_fd = &rx->fd;
>               dir = "tx";
> +             gso_ctx = &tx->gso_ctx;
>       }
>       if (*fd != -1) {
>               /* fd for this queue already exists */
>               TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
>                       pmd->name, *fd, dir, qid);
> +             gso_ctx = NULL;
>       } else if (*other_fd != -1) {
>               /* Only other_fd exists. dup it */
>               *fd = dup(*other_fd);
> @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
> 
>       tx->mtu = &dev->data->mtu;
>       rx->rxmode = &dev->data->dev_conf.rxmode;
> +     if (gso_ctx) {
> +             ret = tap_gso_ctx_setup(gso_ctx, dev);
> +             if (ret)
> +                     return -1;
> +     }
> 
>       tx->type = pmd->type;
> 
> diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
> index 7b21d0d..44e2773 100644
> --- a/drivers/net/tap/rte_eth_tap.h
> +++ b/drivers/net/tap/rte_eth_tap.h
> @@ -15,6 +15,7 @@
> 
> #include <rte_ethdev_driver.h>
> #include <rte_ether.h>
> +#include <rte_gso.h>
> #include "tap_log.h"
> 
> #ifdef IFF_MULTI_QUEUE
> @@ -22,6 +23,7 @@
> #else
> #define RTE_PMD_TAP_MAX_QUEUES        1
> #endif
> +#define MAX_GSO_MBUFS 64
> 
> enum rte_tuntap_type {
>       ETH_TUNTAP_TYPE_UNKNOWN,
> @@ -59,6 +61,7 @@ struct tx_queue {
>       uint16_t *mtu;                  /* Pointer to MTU from dev_data */
>       uint16_t csum:1;                /* Enable checksum offloading */
>       struct pkt_stats stats;         /* Stats for this TX queue */
> +     struct rte_gso_ctx gso_ctx;     /* GSO context */
> };
> 
> struct pmd_internals {
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 1e32c83..e2ee879 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
> _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
> _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
> _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
> _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
> # librte_acl needs --whole-archive because of weak functions
> @@ -61,6 +59,8 @@ endif
> _LDLIBS-y += --whole-archive
> 
> _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
> _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
> _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
> -- 
> 2.7.4
> 

Regards,
Keith
