Let's consider a TCP packet in a VxLAN tunnel:
Ethernet / IP / UDP / VxLAN / Ethernet / IP / TCP / Data
The outer UDP checksum is computed over a pseudo header built from the
outer IP information (addresses, length, next protocol) and over the
whole outer UDP datagram:
UDP / VxLAN / Ethernet / IP / TCP / Data.
Similarly, the inner TCP checksum is computed over a pseudo header built
from the inner IP information and over the whole inner TCP segment
(TCP header and data).
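The inner TCP header contains this inner checksum, which is the
complement of the sum of the inner pseudo header, the TCP header and the
TCP data. So when computing the outer UDP checksum, the TCP header
(checksum included) and the TCP data always add up to the complement of
the inner pseudo header sum: the inner checksum cancels any contribution
of the TCP data.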
As a consequence, the outer UDP checksum depends only on the content of
the headers and can be computed without reading the data payload.
The same principle applies to inner UDP.
Thanks to this, we can re-enable the IPv4, UDP and TCP inner checksum
offloads when outer UDP checksum offload is not supported, and there is
no need to fall back to the non-tunnel API in netdev-dpdk.
TCP over IPv4 Geneve (with checksum on tunnel) on an mlx5 NIC:
Before: 4.37 Gbits/sec, 100% cpu ("full" csum + SW segmentation)
After: 7.80 Gbits/sec, 100% cpu (constant csum + SW segmentation)
Reported-at: https://issues.redhat.com/browse/FDP-1897
Signed-off-by: David Marchand <[email protected]>
---
lib/dp-packet-gso.c | 10 +++---
lib/dp-packet.c | 20 +++++------
lib/dp-packet.h | 12 +++++++
lib/netdev-dpdk.c | 84 ++++++++++++++++++---------------------------
lib/packets.c | 75 ++++++++++++++++++++++++++++++++++++++++
lib/packets.h | 1 +
6 files changed, 136 insertions(+), 66 deletions(-)
diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
index 362bc8f66d..fe7186ddf4 100644
--- a/lib/dp-packet-gso.c
+++ b/lib/dp-packet-gso.c
@@ -66,17 +66,15 @@ int
dp_packet_gso_nr_segs(struct dp_packet *p)
{
uint16_t segsz = dp_packet_get_tso_segsz(p);
- const char *data_tail;
- const char *data_pos;
+ uint32_t data_length;
if (dp_packet_tunnel(p)) {
- data_pos = dp_packet_get_inner_tcp_payload(p);
+ data_length = dp_packet_get_inner_tcp_payload_length(p);
} else {
- data_pos = dp_packet_get_tcp_payload(p);
+ data_length = dp_packet_get_tcp_payload_length(p);
}
- data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
- return DIV_ROUND_UP(data_tail - data_pos, segsz);
+ return DIV_ROUND_UP(data_length, segsz);
}
/* Perform software segmentation on packet 'p'.
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index b34bcf26f3..1d4d77b703 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -587,19 +587,14 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t
flags)
return;
}
- if (dp_packet_tunnel_geneve(p)
- || dp_packet_tunnel_vxlan(p)) {
-
+ if (dp_packet_tunnel_geneve(p) || dp_packet_tunnel_vxlan(p)) {
/* If the TX interface doesn't support UDP tunnel offload but does
- * support inner checksum offload and an outer UDP checksum is
- * required, then we can't offload inner checksum either. As that would
+ * support inner SCTP checksum offload and an outer UDP checksum is
+ * required, then we can't offload inner checksum either as that would
* invalidate the outer checksum. */
if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
&& dp_packet_l4_checksum_partial(p)) {
- flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
- NETDEV_TX_OFFLOAD_UDP_CKSUM |
- NETDEV_TX_OFFLOAD_SCTP_CKSUM |
- NETDEV_TX_OFFLOAD_IPV4_CKSUM);
+ flags &= ~NETDEV_TX_OFFLOAD_SCTP_CKSUM;
}
}
@@ -633,7 +628,12 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t
flags)
if (dp_packet_l4_checksum_partial(p)) {
ovs_assert(dp_packet_l4_proto_udp(p));
if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) {
- packet_udp_complete_csum(p, false);
+ if (dp_packet_inner_l4_proto_tcp(p)
+ || dp_packet_inner_l4_proto_udp(p)) {
+ packet_udp_tunnel_csum(p);
+ } else {
+ packet_udp_complete_csum(p, false);
+ }
}
}
}
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 285d0e43f6..a9e888f7ba 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -585,6 +585,18 @@ dp_packet_get_tcp_payload_length(const struct dp_packet
*pkt)
}
}
+static inline uint32_t
+dp_packet_get_inner_tcp_payload_length(const struct dp_packet *pkt)
+{
+ const char *tcp_payload = dp_packet_get_inner_tcp_payload(pkt);
+ if (tcp_payload) {
+ return ((char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+ - tcp_payload);
+ } else {
+ return 0;
+ }
+}
+
static inline const void *
dp_packet_get_udp_payload(const struct dp_packet *b)
{
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e3d3a21d2f..c424b71c66 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2652,59 +2652,43 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev,
struct rte_mbuf *mbuf)
&& (dp_packet_inner_ip_checksum_partial(pkt)
|| dp_packet_inner_l4_checksum_partial(pkt)
|| mbuf->tso_segsz)) {
- if (dp_packet_ip_checksum_partial(pkt)
- || dp_packet_l4_checksum_partial(pkt)) {
- mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
- (char *) dp_packet_eth(pkt);
- mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
- (char *) dp_packet_l3(pkt);
-
- if (dp_packet_tunnel_geneve(pkt)) {
- mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
- } else if (dp_packet_tunnel_vxlan(pkt)) {
- mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
- } else {
- ovs_assert(dp_packet_tunnel_gre(pkt));
- mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
- }
-
- if (dp_packet_ip_checksum_partial(pkt)) {
- mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
- }
+ mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
+ (char *) dp_packet_eth(pkt);
+ mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
+ (char *) dp_packet_l3(pkt);
+
+ if (dp_packet_tunnel_geneve(pkt)) {
+ mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GENEVE;
+ } else if (dp_packet_tunnel_vxlan(pkt)) {
+ mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_VXLAN;
+ } else {
+ ovs_assert(dp_packet_tunnel_gre(pkt));
+ mbuf->ol_flags |= RTE_MBUF_F_TX_TUNNEL_GRE;
+ }
- if (dp_packet_l4_checksum_partial(pkt)) {
- ovs_assert(dp_packet_l4_proto_udp(pkt));
- mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
- }
+ if (dp_packet_ip_checksum_partial(pkt)) {
+ mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_IP_CKSUM;
+ }
- ip = dp_packet_l3(pkt);
- mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
- ? RTE_MBUF_F_TX_OUTER_IPV4
- : RTE_MBUF_F_TX_OUTER_IPV6;
-
- /* Inner L2 length must account for the tunnel header length. */
- l2 = dp_packet_l4(pkt);
- l3 = dp_packet_inner_l3(pkt);
- l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
- l4 = dp_packet_inner_l4(pkt);
- l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
- is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
- is_udp = dp_packet_inner_l4_proto_udp(pkt);
- is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
- } else {
- mbuf->outer_l2_len = 0;
- mbuf->outer_l3_len = 0;
-
- /* Skip outer headers. */
- l2 = dp_packet_eth(pkt);
- l3 = dp_packet_inner_l3(pkt);
- l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
- l4 = dp_packet_inner_l4(pkt);
- l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
- is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
- is_udp = dp_packet_inner_l4_proto_udp(pkt);
- is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
+ if (dp_packet_l4_checksum_partial(pkt)) {
+ ovs_assert(dp_packet_l4_proto_udp(pkt));
+ mbuf->ol_flags |= RTE_MBUF_F_TX_OUTER_UDP_CKSUM;
}
+
+ ip = dp_packet_l3(pkt);
+ mbuf->ol_flags |= IP_VER(ip->ip_ihl_ver) == 4
+ ? RTE_MBUF_F_TX_OUTER_IPV4
+ : RTE_MBUF_F_TX_OUTER_IPV6;
+
+ /* Inner L2 length must account for the tunnel header length. */
+ l2 = dp_packet_l4(pkt);
+ l3 = dp_packet_inner_l3(pkt);
+ l3_csum = dp_packet_inner_ip_checksum_partial(pkt);
+ l4 = dp_packet_inner_l4(pkt);
+ l4_csum = dp_packet_inner_l4_checksum_partial(pkt);
+ is_tcp = dp_packet_inner_l4_proto_tcp(pkt);
+ is_udp = dp_packet_inner_l4_proto_udp(pkt);
+ is_sctp = dp_packet_inner_l4_proto_sctp(pkt);
} else {
mbuf->outer_l2_len = 0;
mbuf->outer_l3_len = 0;
diff --git a/lib/packets.c b/lib/packets.c
index e3e5bbc92c..8bb83e34a4 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -2079,6 +2079,81 @@ packet_udp_complete_csum(struct dp_packet *p, bool inner)
}
}
+/* This helper computes a "constant" UDP checksum without looking at the
+ * L4 payload.
+ *
+ * This is possible when L4 is either TCP or UDP: the L4 payload checksum
+ * is either computed in SW or in HW later, but its contribution to the
+ * outer checksum is cancelled by the L4 payload being part of the global
+ * packet sum. */
+void
+packet_udp_tunnel_csum(struct dp_packet *p)
+{
+ const ovs_be16 *inner_l4_csum_p;
+ struct ip_header *inner_ip;
+ const void *inner_l4_data;
+ struct udp_header *udp;
+ ovs_be16 inner_l4_csum;
+ uint32_t partial_csum;
+ struct ip_header *ip;
+ uint32_t inner_csum;
+ void *inner_l4;
+
+ inner_ip = dp_packet_inner_l3(p);
+ inner_l4 = dp_packet_inner_l4(p);
+ ip = dp_packet_l3(p);
+ udp = dp_packet_l4(p);
+
+ if (IP_VER(inner_ip->ip_ihl_ver) == 4) {
+ if (dp_packet_inner_ip_checksum_partial(p)) {
+ dp_packet_ip_set_header_csum(p, true);
+ }
+ inner_csum = packet_csum_pseudoheader(inner_ip);
+ } else {
+ struct ovs_16aligned_ip6_hdr *inner_ip6 = dp_packet_inner_l3(p);
+
+ inner_csum = packet_csum_pseudoheader6(inner_ip6);
+ }
+
+ if (dp_packet_inner_l4_proto_tcp(p)) {
+ inner_l4_csum_p = &(((struct tcp_header *) inner_l4)->tcp_csum);
+ inner_l4_data = dp_packet_get_inner_tcp_payload(p);
+ } else {
+ ovs_assert(dp_packet_inner_l4_proto_udp(p));
+ inner_l4_csum_p = &(((struct udp_header *) inner_l4)->udp_csum);
+ inner_l4_data = (char *) inner_l4 + sizeof (struct udp_header);
+ }
+
+ ovs_assert(inner_l4_data);
+ inner_csum = csum_continue(inner_csum, inner_l4,
+ (char *) inner_l4_csum_p - (char *) inner_l4);
+ inner_l4_csum = csum_finish(csum_continue(inner_csum, inner_l4_csum_p + 1,
+ (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1)));
+ if (dp_packet_inner_l4_proto_udp(p) && !inner_l4_csum) {
+ inner_l4_csum = htons(0xffff);
+ }
+
+ udp->udp_csum = 0;
+ if (IP_VER(ip->ip_ihl_ver) == 4) {
+ partial_csum = packet_csum_pseudoheader(ip);
+ } else {
+ struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p);
+
+ partial_csum = packet_csum_pseudoheader6(ip6);
+ }
+
+ partial_csum = csum_continue(partial_csum, udp,
+ (char *) inner_l4_csum_p - (char *) udp);
+ partial_csum = csum_add16(partial_csum, inner_l4_csum);
+ partial_csum = csum_continue(partial_csum, inner_l4_csum_p + 1,
+ (char *) inner_l4_data - (char *)(inner_l4_csum_p + 1));
+ udp->udp_csum = csum_finish(partial_csum);
+ if (!udp->udp_csum) {
+ udp->udp_csum = htons(0xffff);
+ }
+ dp_packet_l4_checksum_set_good(p);
+}
+
/* Set SCTP checksum field in packet 'p' with complete checksum.
* The packet must have the L3 and L4 offsets. */
void
diff --git a/lib/packets.h b/lib/packets.h
index 6eba07700a..8185be8c0c 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1689,6 +1689,7 @@ bool packet_rh_present(struct dp_packet *packet, uint8_t
*nexthdr,
void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
void packet_tcp_complete_csum(struct dp_packet *, bool is_inner);
void packet_udp_complete_csum(struct dp_packet *, bool is_inner);
+void packet_udp_tunnel_csum(struct dp_packet *);
void packet_sctp_complete_csum(struct dp_packet *, bool is_inner);
#define DNS_HEADER_LEN 12
--
2.51.0
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev