From: Yi Yang <[email protected]>

This patch just shows how VXLAN TSO works, for developers' reference; it
isn't ready for merge. Comments are welcome.
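For context, the sketch below (not part of the patch) shows the mbuf Tx
offload fields a DPDK PMD expects for VXLAN TSO, following the rte_mbuf
documentation. It assumes an untagged IPv4/UDP/VXLAN outer header and an
IPv4/TCP inner packet without TCP options; the helper name and the
segment-size parameter are made up for illustration only.

    #include <rte_ether.h>
    #include <rte_ip.h>
    #include <rte_mbuf.h>
    #include <rte_tcp.h>
    #include <rte_udp.h>
    #include <rte_vxlan.h>

    /* Illustrative only: fill the Tx offload fields for an untagged
     * IPv4/UDP/VXLAN/IPv4/TCP packet so the PMD can segment it. */
    static void
    set_vxlan_tso_fields(struct rte_mbuf *m, uint16_t inner_payload_per_seg)
    {
        /* Outer headers; the outer IPv4 checksum is offloaded. */
        m->outer_l2_len = sizeof(struct rte_ether_hdr);
        m->outer_l3_len = sizeof(struct rte_ipv4_hdr);

        /* For tunnel TSO, l2_len spans outer UDP + VXLAN + inner
         * Ethernet, i.e. everything up to the inner L3 header. */
        m->l2_len = sizeof(struct rte_udp_hdr)
                    + sizeof(struct rte_vxlan_hdr)
                    + sizeof(struct rte_ether_hdr);
        m->l3_len = sizeof(struct rte_ipv4_hdr);  /* Inner IPv4. */
        m->l4_len = sizeof(struct rte_tcp_hdr);   /* Inner TCP, no options. */

        /* Inner TCP payload carried by each resulting segment. */
        m->tso_segsz = inner_payload_per_seg;

        /* PKT_TX_TCP_SEG implies the inner TCP checksum offload. */
        m->ol_flags |= PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IPV4
                       | PKT_TX_OUTER_IP_CKSUM | PKT_TX_IPV4
                       | PKT_TX_IP_CKSUM | PKT_TX_TCP_SEG;
    }

This is the same layout that dp_packet_hwol_set_vxlan_tcp_seg() below
builds up incrementally: it extends the inner l2_len by the UDP and
VXLAN header sizes and records the outer header lengths separately.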
Signed-off-by: Yi Yang <[email protected]>
---
 lib/dp-packet.h    |  33 +++++++++++
 lib/netdev-dpdk.c  | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 lib/netdev-linux.c |  20 +++++++
 lib/netdev.c       |  16 ++---
 4 files changed, 220 insertions(+), 16 deletions(-)

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 0430cca..1ed5eba 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -1032,6 +1032,39 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
+{
+    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
+    b->mbuf.l2_len += sizeof(struct udp_header) +
+                      sizeof(struct vxlanhdr);
+    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
+    b->mbuf.outer_l3_len = IP_HEADER_LEN;
+}
+
+/* Set l2_len for the packet 'b'. */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
+{
+    b->mbuf.l2_len = l2_len;
+}
+
+/* Set l3_len for the packet 'b'. */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
+{
+    b->mbuf.l3_len = l3_len;
+}
+
+/* Set l4_len for the packet 'b'. */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
+{
+    b->mbuf.l4_len = l4_len;
+}
+
+
 static inline bool
 dp_packet_ip_checksum_valid(const struct dp_packet *p)
 {
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 44ebf96..bd9696d 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -44,6 +44,7 @@
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
+#include <rte_ip.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -405,6 +406,7 @@ enum dpdk_hw_ol_features {
     NETDEV_RX_HW_SCATTER = 1 << 2,
     NETDEV_TX_TSO_OFFLOAD = 1 << 3,
     NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
+    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
 };
 
 /*
@@ -988,6 +990,12 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 
     if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
         conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
+        /* Enable VXLAN TSO support if available. */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
         }
@@ -1126,6 +1134,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
     if ((info.tx_offload_capa & tx_tso_offload_capa)
         == tx_tso_offload_capa) {
         dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+        /* Enable VXLAN TSO support if available. */
+        if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
+            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        }
         if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
             dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
         } else {
@@ -2131,6 +2143,57 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+/* Prepare the packet 'mbuf' for VXLAN HWOL. */
+static void
+netdev_dpdk_prep_vxlan_hwol_packet(struct netdev_dpdk *dev OVS_UNUSED,
+                                   struct rte_mbuf *mbuf)
+{
+    struct rte_ether_hdr *eth_hdr =
+        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    struct rte_ipv4_hdr *ip_hdr;
+    struct rte_udp_hdr *udp_hdr;
+
+    /* Clean up offload flags inherited from the Rx side. */
+    mbuf->ol_flags &= ~PKT_RX_RSS_HASH;
+
+    if ((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) {
+        return;
+    }
+
+    if (mbuf->ol_flags & PKT_TX_IPV4) {
+        ip_hdr = (struct rte_ipv4_hdr *) (eth_hdr + 1);
+        udp_hdr = (struct rte_udp_hdr *) (ip_hdr + 1);
+
+        /* Outer IP checksum. */
+        ip_hdr->hdr_checksum = 0;
+        mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+        mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+        /* Inner IP checksum.  l2_len covers outer UDP + VXLAN + inner
+         * Ethernet, so this lands on the inner IP header. */
+        mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        ip_hdr = (struct rte_ipv4_hdr *) ((uint8_t *) udp_hdr + mbuf->l2_len);
+        ip_hdr->hdr_checksum = 0;
+    }
+
+    if (mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_TCP_CKSUM)) {
+        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
+            /* XXX: hardcoded for this demo; should be derived from
+             * dev->mtu minus the VXLAN encapsulation overhead. */
+            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
+        } else {
+            /* For a non-TSO packet, l4_len isn't required. */
+            mbuf->tso_segsz = 0;
+        }
+    }
+}
+
 /* Prepare the packet for HWOL.
  * Return True if the packet is OK to continue. */
 static bool
@@ -2159,6 +2222,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
 
         if (mbuf->ol_flags & PKT_TX_IPV4) {
+            struct ip_header *ip_hdr = dp_packet_l3(pkt);
+
+            ip_hdr->ip_csum = 0;
             mbuf->ol_flags |= PKT_TX_IP_CKSUM;
         }
     }
@@ -2737,13 +2803,97 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
 
     mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
     mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
-    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
-                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
+    /* XXX: demo only; this no longer masks out EXT_ATTACHED_MBUF and
+     * IND_ATTACHED_MBUF. */
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
+
+    memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
+           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
+
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
+        mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest)
+                            - (char *) dp_packet_eth(pkt_dest);
+        mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest)
+                            - (char *) dp_packet_l3(pkt_dest);
+    }
+
+    return pkt_dest;
+}
+
+static struct dp_packet *
+dpdk_copy_dp_packet_to_chained_mbuf(struct rte_mempool *mp,
+                                    struct dp_packet *pkt_orig, int mbuf_len)
+{
+    struct rte_mbuf *mbuf_dest;
+    struct dp_packet *pkt_dest;
+    uint32_t pkt_len;
+
+    pkt_len = dp_packet_size(pkt_orig);
+    if (pkt_len <= (uint32_t) mbuf_len) {
+        /* Single rte_mbuf. */
+        mbuf_dest = rte_pktmbuf_alloc(mp);
+        if (OVS_UNLIKELY(mbuf_dest == NULL)) {
+            return NULL;
+        }
+
+        pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
+        memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
+        dp_packet_set_size(pkt_dest, pkt_len);
+    } else {
+        /* Chained multi-segment rte_mbuf. */
+        struct rte_mbuf *mbufs[48];
+        int ret;
+        int count;
+        int i;
+        uint32_t pkt_off = 0;
+        uint32_t seg_len = mbuf_len;
+        uint32_t left_len = pkt_len;
+
+        count = pkt_len / mbuf_len + ((pkt_len % mbuf_len) ? 1 : 0);
+        if (OVS_UNLIKELY(count > (int) ARRAY_SIZE(mbufs))) {
+            return NULL;
+        }
+        ret = rte_pktmbuf_alloc_bulk(mp, mbufs, count);
+        if (OVS_UNLIKELY(ret != 0)) {
+            return NULL;
+        }
+
+        mbuf_dest = mbufs[0];
+        for (i = 0; i < count; i++) {
+            pkt_dest = CONTAINER_OF(mbufs[i], struct dp_packet, mbuf);
+            memcpy(dp_packet_data(pkt_dest),
+                   (char *) dp_packet_data(pkt_orig) + pkt_off, seg_len);
+            mbufs[i]->nb_segs = 1;
+            mbufs[i]->next = NULL;
+            dp_packet_set_size(pkt_dest, seg_len);
+            pkt_off += seg_len;
+            left_len -= seg_len;
+            if (left_len < (uint32_t) mbuf_len) {
+                seg_len = left_len;
+            }
+            if (i > 0) {
+                mbufs[i - 1]->next = mbufs[i];
+                mbuf_dest->nb_segs += 1;
+                mbuf_dest->pkt_len += mbufs[i]->pkt_len;
+            }
+        }
+
+        /* The metadata below must land on the head segment. */
+        pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
+    }
+
+    mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
+    mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
 
     memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
            sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
 
-    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
         mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest)
                             - (char *) dp_packet_eth(pkt_dest);
         mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest)
@@ -2753,6 +2903,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
     return pkt_dest;
 }
 
+
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
@@ -2773,6 +2924,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     uint32_t tx_failure = 0;
     uint32_t mtu_drops = 0;
     uint32_t qos_drops = 0;
+    struct rte_mbuf *mbuf;
 
     if (dev->type != DPDK_DEV_VHOST) {
         /* Check if QoS has been configured for this netdev. */
@@ -2795,12 +2947,15 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
             continue;
         }
 
-        pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
+        pkts[txcnt] = dpdk_copy_dp_packet_to_chained_mbuf(
+            dev->dpdk_mp->mp, packet, dev->mtu + RTE_ETHER_HDR_LEN);
         if (OVS_UNLIKELY(!pkts[txcnt])) {
             dropped = cnt - i;
             break;
         }
 
+        mbuf = (struct rte_mbuf *) pkts[txcnt];
+        netdev_dpdk_prep_vxlan_hwol_packet(dev, mbuf);
+
         txcnt++;
     }
@@ -4949,6 +5104,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
+        /* Enable VXLAN TSO support if available. */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
         }
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6269c24..f6e80fc 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6500,6 +6500,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
     struct eth_header *eth_hdr;
     ovs_be16 eth_type;
     int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
 
     eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
     if (!eth_hdr) {
@@ -6519,6 +6521,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
         l2_len += VLAN_HEADER_LEN;
     }
 
+    dp_packet_hwol_set_l2_len(b, l2_len);
+
     if (eth_type == htons(ETH_TYPE_IP)) {
         struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
 
@@ -6526,6 +6530,7 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IP_HEADER_LEN;
         *l4proto = ip_hdr->ip_proto;
         dp_packet_hwol_set_tx_ipv4(b);
     } else if (eth_type == htons(ETH_TYPE_IPV6)) {
@@ -6536,10 +6541,25 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IPV6_HEADER_LEN;
         *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
         dp_packet_hwol_set_tx_ipv6(b);
     }
 
+    dp_packet_hwol_set_l3_len(b, l3_len);
+
+    if (*l4proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = dp_packet_at(b, l2_len + l3_len,
+                                                  sizeof(struct tcp_header));
+
+        if (!tcp_hdr) {
+            return -EINVAL;
+        }
+
+        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
+        dp_packet_hwol_set_l4_len(b, l4_len);
+    }
+
     return 0;
 }
diff --git a/lib/netdev.c b/lib/netdev.c
index 90962ee..dbc130b 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -960,18 +960,10 @@ netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
-            COVERAGE_INC(netdev_push_header_drops);
-            dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
-                         netdev_get_name(netdev));
-        } else {
-            netdev->netdev_class->push_header(netdev, packet, data);
-            pkt_metadata_init(&packet->md, data->out_port);
-            dp_packet_batch_refill(batch, packet, i);
-        }
+        netdev->netdev_class->push_header(netdev, packet, data);
+        /* XXX: demo only; this assumes every pushed tunnel is VXLAN. */
+        dp_packet_hwol_set_vxlan_tcp_seg(packet);
+        pkt_metadata_init(&packet->md, data->out_port);
+        dp_packet_batch_refill(batch, packet, i);
     }
 
     return 0;
-- 
1.8.3.1
