From: Yi Yang <[email protected]> Many NICs support VXLAN TSO, which can help improve across-compute-node VM-to-VM performance when the MTU is set to 1500.
This patch allows dpdkvhostuserclient interface and veth/tap interface to leverage NICs' offload capability to maximize across-compute-node TCP performance, with it applied, OVS DPDK can reach line speed for across-compute-node VM-to-VM TCP performance. Signed-off-by: Yi Yang <[email protected]> --- Changelog: v3 -> v4: - Split it from v3 as a separate patch. - Add IPv6 support. - Remove GRO and GSO code for simplicity. - Remove dependency on multi-segmented mbuf, VXLAN TSO needn't it if without GRO and GSO. --- lib/automake.mk | 2 + lib/dp-packet.c | 9 ++ lib/dp-packet.h | 238 +++++++++++++++++++++++++++++++++++++- lib/netdev-dpdk.c | 284 ++++++++++++++++++++++++++++++++++++++++++++-- lib/netdev-linux.c | 154 +++++++++++++++++++++++-- lib/netdev-provider.h | 1 + lib/netdev.c | 149 +++++++++++++++++++++++- lib/userspace-tso-segsz.c | 55 +++++++++ lib/userspace-tso-segsz.h | 23 ++++ vswitchd/bridge.c | 2 + 10 files changed, 892 insertions(+), 25 deletions(-) create mode 100644 lib/userspace-tso-segsz.c create mode 100644 lib/userspace-tso-segsz.h diff --git a/lib/automake.mk b/lib/automake.mk index 8eeb6c3..7e0b9fc 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -345,6 +345,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/unixctl.h \ lib/userspace-tso.c \ lib/userspace-tso.h \ + lib/userspace-tso-segsz.c \ + lib/userspace-tso-segsz.h \ lib/util.c \ lib/util.h \ lib/uuid.c \ diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 72f6d09..ee0ccee 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -502,7 +502,16 @@ dp_packet_resize_l2_5(struct dp_packet *b, int increment) void * dp_packet_resize_l2(struct dp_packet *b, int increment) { + int outer_l2_len = dp_packet_hwol_get_outer_l2_len(b); + dp_packet_resize_l2_5(b, increment); dp_packet_adjust_layer_offset(&b->l2_5_ofs, increment); + if (outer_l2_len) { + dp_packet_hwol_set_outer_l2_len(b, outer_l2_len + increment); + } else { + int l2_len = dp_packet_hwol_get_l2_len(b); + + dp_packet_hwol_set_l2_len(b, l2_len + 
increment); + } return dp_packet_data(b); } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 0430cca..f1c07e0 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -81,6 +81,14 @@ enum dp_packet_offload_mask { DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, PKT_TX_UDP_CKSUM, 0x400), /* Offload SCTP checksum. */ DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800), + /* VXLAN TCP Segmentation Offload. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000), + /* UDP Segmentation Offload. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000), + /* Outer L3 Type IPV4 For Tunnel Offload. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4, PKT_TX_OUTER_IPV4, 0x4000), + /* Outer L3 Type IPV6 For Tunnel Offload. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV6, PKT_TX_OUTER_IPV6, 0x8000), /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */ }; @@ -95,7 +103,8 @@ enum dp_packet_offload_mask { DP_PACKET_OL_TX_IPV6 | \ DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ - DP_PACKET_OL_TX_SCTP_CKSUM) + DP_PACKET_OL_TX_SCTP_CKSUM | \ + DP_PACKET_OL_TX_UDP_SEG) #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ @@ -954,6 +963,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b) return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG); } +/* Returns 'true' if packet 'b' is marked for UDP fragmentation offloading. */ +static inline bool +dp_packet_hwol_is_ufo(const struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG); +} + /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */ static inline bool dp_packet_hwol_is_ipv4(const struct dp_packet *b) @@ -992,6 +1008,13 @@ dp_packet_hwol_set_tx_ipv4(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4; } +/* Reset packet 'b' for IPv4 checksum offloading. 
*/ +static inline void +dp_packet_hwol_reset_tx_ipv4(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_IPV4; +} + /* Mark packet 'b' for IPv6 checksum offloading. */ static inline void dp_packet_hwol_set_tx_ipv6(struct dp_packet *b) @@ -999,6 +1022,27 @@ dp_packet_hwol_set_tx_ipv6(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6; } +/* Reset packet 'b' for IPv6 checksum offloading. */ +static inline void +dp_packet_hwol_reset_tx_ipv6(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_IPV6; +} + +/* Mark packet 'b' for Outer IPv4 checksum offloading. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; +} + +/* Mark packet 'b' for Outer IPv6 checksum offloading. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv6(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV6; +} + /* Mark packet 'b' for TCP checksum offloading. It implies that either * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */ static inline void @@ -1007,6 +1051,14 @@ dp_packet_hwol_set_csum_tcp(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_CKSUM; } +/* Reset TCP checksum offloading flag for packet 'b'. + */ +static inline void +dp_packet_hwol_reset_csum_tcp(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_TCP_CKSUM; +} + /* Mark packet 'b' for UDP checksum offloading. It implies that either * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */ static inline void @@ -1015,6 +1067,15 @@ dp_packet_hwol_set_csum_udp(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_CKSUM; } +/* Reset UDP checksum offloading flag for packet 'b'. 
+ */ +static inline void +dp_packet_hwol_reset_csum_udp(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_UDP_CKSUM; +} + + /* Mark packet 'b' for SCTP checksum offloading. It implies that either * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */ static inline void @@ -1032,6 +1093,181 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Mark packet 'b' for UDP segmentation offloading. It implies that + * either the packet 'b' is marked for IPv4 or IPv6 checksum offloading + * and also for UDP checksum offloading. */ +static inline void +dp_packet_hwol_set_udp_seg(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG; +} + +#ifdef DPDK_NETDEV +/* Set l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len) +{ + b->mbuf.l2_len = l2_len; +} + +/* Set l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len) +{ + b->mbuf.l3_len = l3_len; +} + +/* Set l4_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len) +{ + b->mbuf.l4_len = l4_len; +} + +/* Set outer_l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len) +{ + b->mbuf.outer_l2_len = outer_l2_len; +} + +/* Set outer_l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len) +{ + b->mbuf.outer_l3_len = outer_l3_len; +} + +/* Get l2_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l2_len(struct dp_packet *b) +{ + return b->mbuf.l2_len; +} + +/* Get l3_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l3_len(struct dp_packet *b) +{ + return b->mbuf.l3_len; +} + +/* Get l4_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l4_len(struct dp_packet *b) +{ + return 
b->mbuf.l4_len; +} + +/* Get outer_l2_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_outer_l2_len(struct dp_packet *b) +{ + return b->mbuf.outer_l2_len; +} + + +/* Get outer_l3_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_outer_l3_len(struct dp_packet *b) +{ + return b->mbuf.outer_l3_len; +} + +#else +/* Set l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED, + int l2_len OVS_UNUSED) +{ +} + +/* Set l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED, + int l3_len OVS_UNUSED) +{ +} + +/* Set l4_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED, + int l4_len OVS_UNUSED) +{ +} + +/* Set outer_l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b OVS_UNUSED, + int outer_l2_len OVS_UNUSED) +{ +} + +/* Set outer_l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b OVS_UNUSED, + int outer_l3_len OVS_UNUSED) +{ +} + +/* Get l2_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l2_len(struct dp_packet *b) +{ + return ((char *)dp_packet_l3(b) - (char *)dp_packet_eth(b)); +} + +/* Get l3_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l3_len(struct dp_packet *b) +{ + return ((char *)dp_packet_l4(b) - (char *)dp_packet_l3(b)); +} + +/* Get l4_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_l4_len(struct dp_packet *b OVS_UNUSED) +{ + return 0; +} + + +/* Get outer_l2_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_outer_l2_len(struct dp_packet *b) +{ + return ((char *)dp_packet_l3(b) - (char *)dp_packet_eth(b)); +} + +/* Get outer_l3_len for the packet 'b' */ +static inline int +dp_packet_hwol_get_outer_l3_len(struct dp_packet *b) +{ + return ((char *)dp_packet_l4(b) - (char *)dp_packet_l3(b)); +} + +#endif /* 
DPDK_NETDEV */ + +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */ +static inline void +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; + /* Set outer_l2_len and outer_l3_len */ + dp_packet_hwol_set_outer_l2_len(b, (char *) dp_packet_l3(b) + - (char *) dp_packet_eth(b)); + dp_packet_hwol_set_outer_l3_len(b, (char *) dp_packet_l4(b) + - (char *) dp_packet_l3(b)); +} + +/* Check if it is a VXLAN packet */ +static inline bool +dp_packet_hwol_is_vxlan_tcp_seg(struct dp_packet *b) +{ + return (*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN); +} + + static inline bool dp_packet_ip_checksum_valid(const struct dp_packet *p) { diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 75dffef..b2dd008 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -38,6 +38,7 @@ #include <rte_errno.h> #include <rte_ethdev.h> #include <rte_flow.h> +#include <rte_ip.h> #include <rte_malloc.h> #include <rte_mbuf.h> #include <rte_meter.h> @@ -72,6 +73,7 @@ #include "unaligned.h" #include "unixctl.h" #include "userspace-tso.h" +#include "userspace-tso-segsz.h" #include "util.h" #include "uuid.h" @@ -87,6 +89,7 @@ COVERAGE_DEFINE(vhost_notification); #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE #define OVS_VPORT_DPDK "ovs_dpdk" +#define DPDK_RTE_HDR_OFFSET 1 /* * need to reserve tons of extra space in the mbufs so we can align the @@ -96,6 +99,8 @@ COVERAGE_DEFINE(vhost_notification); */ #define ETHER_HDR_MAX_LEN (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \ + (2 * VLAN_HEADER_LEN)) +#define ETHER_VLAN_HDR_MAX_LEN (RTE_ETHER_HDR_LEN + \ + + (2 * VLAN_HEADER_LEN)) #define MTU_TO_FRAME_LEN(mtu) ((mtu) + RTE_ETHER_HDR_LEN + \ RTE_ETHER_CRC_LEN) #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN) @@ -404,6 +409,7 @@ enum dpdk_hw_ol_features { NETDEV_RX_HW_SCATTER = 1 << 2, NETDEV_TX_TSO_OFFLOAD = 1 << 3, NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4, + NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5, }; /* 
@@ -998,6 +1004,11 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS; + /* Enable VXLAN TSO support if available */ + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) { + conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO; + conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + } if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM; } @@ -1136,6 +1147,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) if ((info.tx_offload_capa & tx_tso_offload_capa) == tx_tso_offload_capa) { dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; + /* Enable VXLAN TSO support if available */ + if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) { + dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD; + } if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) { dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD; } else { @@ -2173,37 +2188,267 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq) rte_free(rx); } +static inline bool +is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev) +{ + bool ret = false; + struct netdev_dpdk *src_dev; + + if (src_port_id == UINT16_MAX) { + ret = true; + } else { + src_dev = netdev_dpdk_lookup_by_port_id(src_port_id); + if (src_dev && (netdev_dpdk_get_vid(src_dev) >= 0)) { + ret = true; + } + } + + if (ret) { + if (netdev_dpdk_get_vid(dev) < 0) { + ret = false; + } + } + + return ret; +} + +#define UDP_VXLAN_ETH_HDR_SIZE 30 + /* Prepare the packet for HWOL. * Return True if the packet is OK to continue. 
*/ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); + uint16_t l4_proto = 0; + uint8_t *l3_hdr_ptr = NULL; + struct rte_ether_hdr *eth_hdr = + rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); + struct rte_ipv4_hdr *ip_hdr; + struct rte_ipv6_hdr *ip6_hdr; + const uint16_t tso_segsz = get_userspace_tso_segsz(); + + /* Return directly if source and destitation of mbuf are local ports + * because mbuf has already set ol_flags and l*_len correctly. + */ + if (is_local_to_local(mbuf->port, dev)) { + if (mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) { + mbuf->tso_segsz = tso_segsz - mbuf->l3_len - mbuf->l4_len; + } + return true; + } + + if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) { + /* Handle VXLAN TSO */ + struct rte_udp_hdr *udp_hdr = NULL; + + /* Correct l2_len for VxLAN packet */ + mbuf->l2_len += sizeof(struct udp_header) + + sizeof(struct vxlanhdr); + + /* small packets whose size is less than or equal to MTU needn't + * VXLAN TSO. In addtion, if hardware can't support VXLAN TSO, it + * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared + * outer_l2_len and outer_l3_len must be zeroed. 
+ */ + if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) + && (mbuf->pkt_len <= tso_segsz + mbuf->outer_l2_len + + mbuf->outer_l3_len + mbuf->l2_len)) { + mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN; + if ((mbuf->ol_flags & PKT_TX_IPV4) && + (mbuf->outer_l3_len == IPV6_HEADER_LEN)) { + dp_packet_hwol_reset_tx_ipv4(pkt); + dp_packet_hwol_set_tx_ipv6(pkt); + } else if ((mbuf->ol_flags & PKT_TX_IPV6) && + (mbuf->outer_l3_len == IP_HEADER_LEN)) { + dp_packet_hwol_reset_tx_ipv6(pkt); + dp_packet_hwol_set_tx_ipv4(pkt); + } + mbuf->l2_len = mbuf->outer_l2_len; + mbuf->l3_len = mbuf->outer_l3_len; + mbuf->l4_len = sizeof(struct rte_udp_hdr); + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + return true; + } + + /* Handle outer packet */ + if (mbuf->outer_l3_len == IP_HEADER_LEN) { + ip_hdr = (struct rte_ipv4_hdr *)((char *) eth_hdr + + mbuf->outer_l2_len); + /* outer IP checksum offload */ + ip_hdr->hdr_checksum = 0; + mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM; + mbuf->ol_flags |= PKT_TX_OUTER_IPV4; + + udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET); + } else if (mbuf->outer_l3_len == IPV6_HEADER_LEN) { + ip6_hdr = (struct rte_ipv6_hdr *)((char *) eth_hdr + + mbuf->outer_l2_len); + /* no IP checksum for outer IPv6 */ + mbuf->ol_flags |= PKT_TX_OUTER_IPV6; - if (mbuf->ol_flags & PKT_TX_L4_MASK) { + udp_hdr = (struct rte_udp_hdr *)(ip6_hdr + DPDK_RTE_HDR_OFFSET); + + } + + /* Handle inner packet */ + if (udp_hdr != NULL) { + if (mbuf->ol_flags & PKT_TX_IPV4) { + ip_hdr = (struct rte_ipv4_hdr *) + ((uint8_t *)udp_hdr + mbuf->l2_len); + l4_proto = ip_hdr->next_proto_id; + l3_hdr_ptr = (uint8_t *)ip_hdr; + + /* inner IP checksum offload */ + ip_hdr->hdr_checksum = 0; + mbuf->ol_flags |= PKT_TX_IP_CKSUM; + } else if (mbuf->ol_flags & PKT_TX_IPV6) { + ip6_hdr = (struct rte_ipv6_hdr *) + ((uint8_t *)udp_hdr + mbuf->l2_len); + l4_proto = ip6_hdr->proto; + l3_hdr_ptr = (uint8_t *)ip6_hdr; + } + } + + /* In case of MTU > tso_segsz, PKT_TX_TCP_SEG or 
PKT_TX_UDP_SEG wasn't + * set by client/server, here is a place we can mark it. + */ + if ((mbuf->pkt_len > tso_segsz + mbuf->outer_l2_len + + mbuf->outer_l3_len + mbuf->l2_len) + && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { + if (l4_proto == IPPROTO_UDP) { + mbuf->ol_flags |= PKT_TX_UDP_SEG; + } else if (l4_proto == IPPROTO_TCP) { + mbuf->ol_flags |= PKT_TX_TCP_SEG; + } + } + } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) { + /* Handle VLAN TSO */ mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt); mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt); mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; + + if (mbuf->ol_flags & PKT_TX_IPV4) { + ip_hdr = (struct rte_ipv4_hdr *)((char *)eth_hdr + mbuf->l2_len); + l4_proto = ip_hdr->next_proto_id; + l3_hdr_ptr = (uint8_t *)ip_hdr; + + /* IP checksum offload */ + ip_hdr->hdr_checksum = 0; + mbuf->ol_flags |= PKT_TX_IP_CKSUM; + } else if (mbuf->ol_flags & PKT_TX_IPV6) { + ip6_hdr = (struct rte_ipv6_hdr *)((char *)eth_hdr + mbuf->l2_len); + l4_proto = ip6_hdr->proto; + l3_hdr_ptr = (uint8_t *)ip6_hdr; + } + + /* In some cases, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set, here is + * a place we can mark it. 
+ */ + if ((mbuf->pkt_len > (tso_segsz + mbuf->l2_len)) + && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { + if (l4_proto == IPPROTO_UDP) { + mbuf->ol_flags |= PKT_TX_UDP_SEG; + } else if (l4_proto == IPPROTO_TCP) { + mbuf->ol_flags |= PKT_TX_TCP_SEG; + } + } } - if (mbuf->ol_flags & PKT_TX_TCP_SEG) { - struct tcp_header *th = dp_packet_l4(pkt); + /* It is possible that l4_len isn't set for vhostuserclient */ + if ((l3_hdr_ptr != NULL) && (l4_proto == IPPROTO_TCP) + && (mbuf->l4_len < 20)) { + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *) + (l3_hdr_ptr + mbuf->l3_len); - if (!th) { - VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" + mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; + } + + if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) { + return true; + } + + if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { + if (l4_proto != IPPROTO_UDP) { + VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header" " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); return false; } + } else if (mbuf->ol_flags & PKT_TX_TCP_SEG || + mbuf->ol_flags & PKT_TX_TCP_CKSUM) { + if (l4_proto != IPPROTO_TCP) { + VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" + " pkt len: %"PRIu32" l4_proto = %d", + dev->up.name, mbuf->pkt_len, l4_proto); + return false; + } + + if (mbuf->pkt_len > tso_segsz + mbuf->outer_l2_len + mbuf->outer_l3_len + + mbuf->l2_len) { + dp_packet_hwol_set_tcp_seg(pkt); + } - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; mbuf->ol_flags |= PKT_TX_TCP_CKSUM; - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + if (mbuf->ol_flags & PKT_TX_TCP_SEG) { + mbuf->tso_segsz = tso_segsz - mbuf->l3_len - mbuf->l4_len; + } else { + mbuf->tso_segsz = 0; + } - if (mbuf->ol_flags & PKT_TX_IPV4) { - mbuf->ol_flags |= PKT_TX_IP_CKSUM; + if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + /* PKT_TX_TCP_CKSUM must be cleaned because + * tcp checksum only can be caculated by software if NIC + * can not support it. 
+ */ + mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM; } } + + if (l4_proto == IPPROTO_UDP) { + /* in case of pkt_len < dev->mtu, it still can be handled correctly */ + if (mbuf->pkt_len < dev->mtu + ETHER_VLAN_HDR_MAX_LEN) { + mbuf->ol_flags &= ~PKT_TX_UDP_SEG; + if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) { + /* Pretend it as a normal UDP and stop inner cksum offload */ + mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN; + mbuf->ol_flags &= ~PKT_TX_OUTER_IP_CKSUM; + if (mbuf->ol_flags & PKT_TX_OUTER_IPV4) { + mbuf->ol_flags &= ~PKT_TX_OUTER_IPV4; + if (mbuf->ol_flags & PKT_TX_IPV6) { + mbuf->ol_flags &= ~PKT_TX_IPV6; + } + if ((mbuf->ol_flags & PKT_TX_IPV4) == 0) { + mbuf->ol_flags |= PKT_TX_IPV4; + } + mbuf->ol_flags |= PKT_TX_IP_CKSUM; + } else if (mbuf->ol_flags & PKT_TX_OUTER_IPV6) { + mbuf->ol_flags &= ~PKT_TX_OUTER_IPV6; + if (mbuf->ol_flags & PKT_TX_IPV4) { + mbuf->ol_flags &= ~PKT_TX_IPV4; + mbuf->ol_flags &= ~PKT_TX_IP_CKSUM; + } + if ((mbuf->ol_flags & PKT_TX_IPV6) == 0) { + mbuf->ol_flags |= PKT_TX_IPV6; + } + /* For outer IPv6, outer udp checksum is incorrect */ + mbuf->ol_flags |= PKT_TX_UDP_CKSUM; + } + mbuf->l2_len = mbuf->outer_l2_len; + mbuf->l3_len = mbuf->outer_l3_len; + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + } + return true; + } + + /* Can't handle bigger UDP packet, so return false */ + VLOG_WARN_RL(&rl, "%s: too big UDP packet" + ", pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); + return false; + } + return true; } @@ -2781,17 +3026,26 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig) mbuf_dest->packet_type = pkt_orig->mbuf.packet_type; mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags & ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF)); + mbuf_dest->l2_len = pkt_orig->mbuf.l2_len; + mbuf_dest->l3_len = pkt_orig->mbuf.l3_len; + mbuf_dest->l4_len = pkt_orig->mbuf.l4_len; + mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len; + mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len; memcpy(&pkt_dest->l2_pad_size, 
&pkt_orig->l2_pad_size, sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size)); - if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) { + if ((mbuf_dest->outer_l2_len == 0) && + (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) { mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest) - (char *)dp_packet_eth(pkt_dest); mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest) - (char *) dp_packet_l3(pkt_dest); } + /* Mark it as non-DPDK port */ + mbuf_dest->port = UINT16_MAX; + return pkt_dest; } @@ -2850,6 +3104,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) if (dev->type == DPDK_DEV_VHOST) { __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt); } else { + if (userspace_tso_enabled()) { + txcnt = netdev_dpdk_prep_hwol_batch(dev, + (struct rte_mbuf **)pkts, + txcnt); + } tx_failure += netdev_dpdk_eth_tx_burst(dev, qid, (struct rte_mbuf **)pkts, txcnt); @@ -2872,7 +3131,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool concurrent_txq OVS_UNUSED) { - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { dpdk_do_tx_copy(netdev, qid, batch); dp_packet_delete_batch(batch, true); @@ -5033,6 +5291,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev) netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + /* Enable VXLAN TSO support if available */ + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) { + netdev->ol_flags |= NETDEV_TX_OFFLOAD_VXLAN_TSO; + } if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; } diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 6be23db..3965ae5 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -50,6 +50,7 @@ #include <unistd.h> #include "coverage.h" +#include "csum.h" #include "dp-packet.h" #include "dpif-netlink.h" #include "dpif-netdev.h" @@ -79,6 +80,7 @@ #include "unaligned.h" #include 
"openvswitch/vlog.h" #include "userspace-tso.h" +#include "userspace-tso-segsz.h" #include "util.h" VLOG_DEFINE_THIS_MODULE(netdev_linux); @@ -6508,6 +6510,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) struct eth_header *eth_hdr; ovs_be16 eth_type; int l2_len; + int l3_len = 0; + int l4_len = 0; eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); if (!eth_hdr) { @@ -6527,6 +6531,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) l2_len += VLAN_HEADER_LEN; } + dp_packet_hwol_set_l2_len(b, l2_len); + if (eth_type == htons(ETH_TYPE_IP)) { struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN); @@ -6534,6 +6540,7 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) return -EINVAL; } + l3_len = IP_HEADER_LEN; *l4proto = ip_hdr->ip_proto; dp_packet_hwol_set_tx_ipv4(b); } else if (eth_type == htons(ETH_TYPE_IPV6)) { @@ -6544,10 +6551,35 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) return -EINVAL; } + l3_len = IPV6_HEADER_LEN; *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; dp_packet_hwol_set_tx_ipv6(b); } + dp_packet_hwol_set_l3_len(b, l3_len); + + if (*l4proto == IPPROTO_TCP) { + struct tcp_header *tcp_hdr = dp_packet_at(b, l2_len + l3_len, + sizeof(struct tcp_header)); + + if (!tcp_hdr) { + return -EINVAL; + } + + l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4; + dp_packet_hwol_set_l4_len(b, l4_len); + } else if (*l4proto == IPPROTO_UDP) { + struct udp_header *udp_hdr = dp_packet_at(b, l2_len + l3_len, + sizeof(struct udp_header)); + + if (!udp_hdr) { + return -EINVAL; + } + + l4_len = sizeof(struct udp_header); + dp_packet_hwol_set_l4_len(b, l4_len); + } + return 0; } @@ -6561,10 +6593,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) return -EINVAL; } - if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) { - return 0; - } - if (netdev_linux_parse_l2(b, &l4proto)) { return -EINVAL; } @@ -6595,22 +6623,130 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) } static void 
-netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) +netdev_linux_set_ol_flags_and_cksum(struct dp_packet *b, int mtu) +{ + struct eth_header *eth_hdr; + struct ip_header *ip_hdr = NULL; + struct ovs_16aligned_ip6_hdr *nh6 = NULL; + uint16_t l4proto = 0; + ovs_be16 eth_type; + int l2_len; + int l3_len = 0; + int l4_len = 0; + + eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); + if (!eth_hdr) { + return; + } + + l2_len = ETH_HEADER_LEN; + eth_type = eth_hdr->eth_type; + if (eth_type_vlan(eth_type)) { + struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN); + + if (!vlan) { + return; + } + + eth_type = vlan->vlan_next_type; + l2_len += VLAN_HEADER_LEN; + } + + if (eth_type == htons(ETH_TYPE_IP)) { + ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN); + + if (!ip_hdr) { + return; + } + + dp_packet_set_l3(b, ip_hdr); + ip_hdr->ip_csum = 0; + ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr); + l4proto = ip_hdr->ip_proto; + dp_packet_hwol_set_tx_ipv4(b); + l3_len = IP_HEADER_LEN; + } else if (eth_type == htons(ETH_TYPE_IPV6)) { + nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN); + if (!nh6) { + return; + } + + dp_packet_set_l3(b, nh6); + l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; + dp_packet_hwol_set_tx_ipv6(b); + l3_len = IPV6_HEADER_LEN; + } + + if (l4proto == IPPROTO_TCP) { + /* Note: need set tcp pseudo checksum */ + struct tcp_header *tcp_hdr = dp_packet_at(b, l2_len + l3_len, + sizeof(struct tcp_header)); + + if (!tcp_hdr) { + return; + } + l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4; + dp_packet_hwol_set_l4_len(b, l4_len); + dp_packet_set_l4(b, tcp_hdr); + + if (l3_len == IP_HEADER_LEN) { + tcp_hdr->tcp_csum = csum_finish(packet_csum_pseudoheader(ip_hdr)); + } else { + tcp_hdr->tcp_csum = csum_finish(packet_csum_pseudoheader6(nh6)); + } + if (dp_packet_size(b) > mtu + l2_len) { + dp_packet_hwol_set_tcp_seg(b); + } + dp_packet_hwol_set_csum_tcp(b); + } else if (l4proto == IPPROTO_UDP) { + struct udp_header *udp_hdr = dp_packet_at(b, l2_len + l3_len, + 
sizeof(struct udp_header)); + + if (!udp_hdr) { + return; + } + l4_len = sizeof(struct udp_header); + dp_packet_hwol_set_l4_len(b, l4_len); + dp_packet_set_l4(b, udp_hdr); + if (dp_packet_size(b) > mtu + l2_len) { + dp_packet_hwol_set_udp_seg(b); + } + dp_packet_hwol_set_csum_udp(b); + } +} + +static void +netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu OVS_UNUSED) { - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); + struct virtio_net_hdr *vnet; + uint16_t tso_segsz = get_userspace_tso_segsz(); + uint16_t l4proto; + + netdev_linux_parse_l2(b, &l4proto); + + /* ol_flags weren't set correctly for received packets which are from + * physical port, so it has to been set again in order that + * vnet_hdr can be prepended correctly. Note: here tso_segsz but not + * mtu are used because tso_segsz may be less than mtu. + */ + if ((dp_packet_size(b) > tso_segsz + dp_packet_hwol_get_l2_len(b)) + && !dp_packet_hwol_l4_mask(b)) { + netdev_linux_set_ol_flags_and_cksum(b, tso_segsz); + } + + vnet = dp_packet_push_zeros(b, sizeof *vnet); if (dp_packet_hwol_is_tso(b)) { uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) - + TCP_HEADER_LEN; + + dp_packet_hwol_get_l4_len(b); vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; - vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); + vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz - hdr_len); if (dp_packet_hwol_is_ipv4(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; } else { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } - } else { vnet->flags = VIRTIO_NET_HDR_GSO_NONE; } diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 73dce2f..d616d79 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -43,6 +43,7 @@ enum netdev_ol_flags { NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2, NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3, NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4, + NETDEV_TX_OFFLOAD_VXLAN_TSO = 1 << 5, }; /* A network device (e.g. an Ethernet device). 
diff --git a/lib/netdev.c b/lib/netdev.c index 91e9195..8c881b0 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -33,6 +33,7 @@ #include "cmap.h" #include "coverage.h" +#include "csum.h" #include "dpif.h" #include "dp-packet.h" #include "openvswitch/dynamic-string.h" @@ -55,6 +56,7 @@ #include "svec.h" #include "openvswitch/vlog.h" #include "flow.h" +#include "userspace-tso.h" #include "util.h" #ifdef __linux__ #include "tc.h" @@ -785,6 +787,64 @@ netdev_get_pt_mode(const struct netdev *netdev) : NETDEV_PT_LEGACY_L2); } +static inline void +calculate_tcpudp_checksum(struct dp_packet *p) +{ + uint32_t pseudo_hdr_csum = 0; + bool is_ipv6 = false; + struct ovs_16aligned_ip6_hdr *ip6 = NULL; + size_t len_l2 = (char *) dp_packet_l3(p) - (char *) dp_packet_eth(p); + size_t len_l3 = (char *) dp_packet_l4(p) - (char *) dp_packet_l3(p); + size_t l4_len = (char *) dp_packet_tail(p) - (char *) dp_packet_l4(p); + uint16_t l4_proto = 0; + + /* It is possible l2_len and l3_len aren't set here, so set them if no */ + if (dp_packet_hwol_get_l2_len(p) != len_l2) { + dp_packet_hwol_set_l2_len(p, len_l2); + dp_packet_hwol_set_l3_len(p, len_l3); + } + + if (len_l3 == sizeof(struct ovs_16aligned_ip6_hdr)) { + ip6 = dp_packet_l3(p); + l4_proto = ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt; + is_ipv6 = true; + } else { + struct ip_header *ip = dp_packet_l3(p); + + l4_proto = ip->ip_proto; + ip->ip_csum = 0; + ip->ip_csum = csum(ip, sizeof *ip); + pseudo_hdr_csum = packet_csum_pseudoheader(ip); + } + + if (l4_proto == IPPROTO_TCP) { + struct tcp_header *tcp = dp_packet_l4(p); + + tcp->tcp_csum = 0; + if (is_ipv6) { + tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, l4_proto, + l4_len); + } else { + tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum, + tcp, l4_len)); + } + } else if (l4_proto == IPPROTO_UDP) { + struct udp_header *udp = dp_packet_l4(p); + + udp->udp_csum = 0; + if (is_ipv6) { + udp->udp_csum = packet_csum_upperlayer6(ip6, udp, l4_proto, + l4_len); + } else { + udp->udp_csum 
= csum_finish(csum_continue(pseudo_hdr_csum, + udp, l4_len)); + } + if (!udp->udp_csum) { + udp->udp_csum = htons(0xffff); + } + } +} + /* Check if a 'packet' is compatible with 'netdev_flags'. * If a packet is incompatible, return 'false' with the 'errormsg' * pointing to a reason. */ @@ -794,6 +854,14 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, { uint64_t l4_mask; + if (dp_packet_hwol_is_vxlan_tcp_seg(packet) + && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet)) + && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) { + /* Fall back to GSO in software. */ + VLOG_ERR_BUF(errormsg, "No VXLAN TSO support"); + return false; + } + if (dp_packet_hwol_is_tso(packet) && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { /* Fall back to GSO in software. */ @@ -803,6 +871,33 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, l4_mask = dp_packet_hwol_l4_mask(packet); if (l4_mask) { + /* Calculate checksum for VLAN TSO case when no hardware offload + * feature is available. Note: for VXLAN TSO case, checksum has + * been calculated before here, so it won't be done here again + * because checksum flags in packet->m.ol_flags have been cleaned. + */ + if (dp_packet_hwol_l4_is_tcp(packet) + && !dp_packet_hwol_is_vxlan_tcp_seg(packet) + && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { + dp_packet_hwol_reset_csum_tcp(packet); + /* Only calculate TCP checksum for non-TSO packet. + */ + if (!dp_packet_hwol_is_tso(packet)) { + calculate_tcpudp_checksum(packet); + } + return true; + } else if (dp_packet_hwol_l4_is_udp(packet) + && !dp_packet_hwol_is_vxlan_tcp_seg(packet) + && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { + dp_packet_hwol_reset_csum_udp(packet); + /* Only calculate UDP checksum for non-UFO packet. + */ + if (!dp_packet_hwol_is_ufo(packet)) { + calculate_tcpudp_checksum(packet); + } + return true; + } + if (dp_packet_hwol_l4_is_tcp(packet)) { if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { /* Fall back to TCP csum in software. 
                  */
@@ -960,15 +1055,61 @@ netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
+        if (OVS_UNLIKELY((dp_packet_hwol_is_tso(packet)
+                          || dp_packet_hwol_l4_mask(packet))
+                         && (data->tnl_type != OVS_VPORT_TYPE_VXLAN))) {
             COVERAGE_INC(netdev_push_header_drops);
             dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
+            VLOG_WARN_RL(&rl,
+                         "%s: non-VxLAN Tunneling packets with HW offload "
+                         "flags is not supported: packet dropped",
                          netdev_get_name(netdev));
         } else {
+            size_t len_l2 = (char *) dp_packet_l3(packet)
+                            - (char *) dp_packet_eth(packet);
+            size_t len_l3 = (char *) dp_packet_l4(packet)
+                            - (char *) dp_packet_l3(packet);
+            if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
+                /* VXLAN offload can't support UDP checksum offload
+                 * for the inner UDP packet, so the UDP checksum must be
+                 * set before pushing the header in order that the outer
+                 * checksum can be set correctly.
+                 */
+                if (dp_packet_hwol_l4_is_udp(packet)) {
+                    dp_packet_hwol_reset_csum_udp(packet);
+                    /* Only calculate the UDP checksum for a non-UFO packet.
+                     */
+                    if (!dp_packet_hwol_is_ufo(packet)) {
+                        calculate_tcpudp_checksum(packet);
+                    }
+                } else if (dp_packet_hwol_l4_is_tcp(packet)) {
+                    dp_packet_hwol_reset_csum_tcp(packet);
+                    /* Only calculate the TCP checksum for a non-TSO packet.
+                     */
+                    if (!dp_packet_hwol_is_tso(packet)) {
+                        calculate_tcpudp_checksum(packet);
+                    }
+                }
+            }
+            /* It is possible that l2_len and l3_len aren't set yet at
+             * this point, so set them if not.
+             */
+            if (dp_packet_hwol_get_l2_len(packet) != len_l2) {
+                dp_packet_hwol_set_l2_len(packet, len_l2);
+                dp_packet_hwol_set_l3_len(packet, len_l3);
+            }
+            netdev->netdev_class->push_header(netdev, packet, data);
+            if (userspace_tso_enabled()
+                && (data->tnl_type == OVS_VPORT_TYPE_VXLAN)) {
+                /* Just identify it as a VXLAN packet; here netdev is
+                 * vxlan_sys_*, so netdev->ol_flags can't indicate whether
+                 * the final physical output port supports VXLAN TSO.
+                 * netdev_send_prepare_packet will drop the packet if the
+                 * final physical output port can't support VXLAN TSO.
+                 */
+                dp_packet_hwol_set_vxlan_tcp_seg(packet);
+            }
             pkt_metadata_init(&packet->md, data->out_port);
             dp_packet_batch_refill(batch, packet, i);
         }
diff --git a/lib/userspace-tso-segsz.c b/lib/userspace-tso-segsz.c
new file mode 100644
index 0000000..2d31a5b
--- /dev/null
+++ b/lib/userspace-tso-segsz.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Inspur, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "smap.h"
+#include "ovs-thread.h"
+#include "openvswitch/vlog.h"
+#include "dpdk.h"
+#include "userspace-tso-segsz.h"
+#include "vswitch-idl.h"
+
+VLOG_DEFINE_THIS_MODULE(userspace_tso_segsz);
+
+#define DEFAULT_TSO_SEGSZ 1500
+#define MAX_TSO_SEGSZ 9000
+static uint16_t userspace_tso_segsz = DEFAULT_TSO_SEGSZ;
+
+void
+userspace_tso_segsz_init(const struct smap *ovs_other_config)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+
+    if (ovsthread_once_start(&once)) {
+        int tso_segsz;
+
+        tso_segsz = smap_get_int(ovs_other_config, "userspace-tso-segsz",
+                                 DEFAULT_TSO_SEGSZ);
+        if ((tso_segsz < 0) || (tso_segsz > MAX_TSO_SEGSZ)) {
+            tso_segsz = DEFAULT_TSO_SEGSZ;
+        }
+        userspace_tso_segsz = tso_segsz;
+        VLOG_INFO("Userspace TSO segsz set to %u", userspace_tso_segsz);
+        ovsthread_once_done(&once);
+    }
+}
+
+uint16_t
+get_userspace_tso_segsz(void)
+{
+    return userspace_tso_segsz;
+}
diff --git a/lib/userspace-tso-segsz.h b/lib/userspace-tso-segsz.h
new file mode 100644
index 0000000..c4e9e46
--- /dev/null
+++ b/lib/userspace-tso-segsz.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2020 Inspur, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef USERSPACE_TSO_SEGSZ_H
+#define USERSPACE_TSO_SEGSZ_H 1
+
+void userspace_tso_segsz_init(const struct smap *ovs_other_config);
+uint16_t get_userspace_tso_segsz(void);
+
+#endif /* userspace-tso-segsz.h */
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 5ed7e82..b131e73 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -66,6 +66,7 @@
 #include "timeval.h"
 #include "tnl-ports.h"
 #include "userspace-tso.h"
+#include "userspace-tso-segsz.h"
 #include "util.h"
 #include "unixctl.h"
 #include "lib/vswitch-idl.h"
@@ -3292,6 +3293,7 @@ bridge_run(void)
         netdev_set_flow_api_enabled(&cfg->other_config);
         dpdk_init(&cfg->other_config);
         userspace_tso_init(&cfg->other_config);
+        userspace_tso_segsz_init(&cfg->other_config);
     }
 
     /* Initialize the ofproto library.  This only needs to run once, but
-- 
1.8.3.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
