On 4/14/25 4:39 PM, Mike Pattrick wrote: > On Fri, Apr 11, 2025 at 7:36 PM Ilya Maximets <i.maxim...@ovn.org> wrote: >> >> On 4/9/25 8:16 PM, Mike Pattrick wrote: >>> On Thu, Mar 13, 2025 at 8:44 AM David Marchand >>> <david.march...@redhat.com> wrote: >>>> >>>> As the virtio-net offload API is used for netdev-linux ports, but >>>> provides no information about the potentially encapsulated protocol >>>> concerned by a checksum request, specific informations from this netdev- >>>> specific implementation is propagated into OVS code, and must be >>>> carefully evaluated when some tunnel gets decapsulated. >>>> >>>> This induces a cost in "normal" processing path, while the netdev-linux >>>> path is not performance critical. >>>> >>>> This patch removes such specific information, yet try harder to parse >>>> the packet on the Rx side and set offload flags accordingly for non >>>> encapsulated traffic and for encapsulated traffic, fixes the inner >>>> checksum. >>>> >>>> Signed-off-by: David Marchand <david.march...@redhat.com> >>>> --- >>>> Changes since v3: >>>> - dropped setting Tx flags, >>>> >>>> Changes since v1: >>>> - fixed patch reordering issue: offloads fields was being added instead >>>> of later in this series, >>>> >>>> --- >>>> lib/dp-packet.c | 3 - >>>> lib/dp-packet.h | 24 ------- >>>> lib/dpif-netdev-extract-avx512.c | 2 - >>>> lib/flow.c | 6 -- >>>> lib/netdev-linux.c | 113 ++++++++++++++++++++++++++++--- >>>> lib/netdev-native-tnl.c | 6 +- >>>> 6 files changed, 104 insertions(+), 50 deletions(-) >>>> >>>> diff --git a/lib/dp-packet.c b/lib/dp-packet.c >>>> index dad0d7be3a..6a9bfd63ba 100644 >>>> --- a/lib/dp-packet.c >>>> +++ b/lib/dp-packet.c >>>> @@ -39,9 +39,6 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, >>>> enum dp_packet_source so >>>> dp_packet_init_specific(b); >>>> /* By default assume the packet type to be Ethernet. */ >>>> b->packet_type = htonl(PT_ETH); >>>> - /* Reset csum start and offset. */ >>>> - b->csum_start = 0; >>>> - b->csum_offset = 0; >>>> } >>>> >>>> static void >>>> diff --git a/lib/dp-packet.h b/lib/dp-packet.h >>>> index 549802167d..084e89bd99 100644 >>>> --- a/lib/dp-packet.h >>>> +++ b/lib/dp-packet.h >>>> @@ -178,8 +178,6 @@ struct dp_packet { >>>> or UINT16_MAX. */ >>>> uint32_t cutlen; /* length in bytes to cut from the >>>> end. */ >>>> ovs_be32 packet_type; /* Packet type as defined in OpenFlow >>>> */ >>>> - uint16_t csum_start; /* Position to start checksumming >>>> from. */ >>>> - uint16_t csum_offset; /* Offset to place checksum. */ >>>> union { >>>> struct pkt_metadata md; >>>> uint64_t data[DP_PACKET_CONTEXT_SIZE / 8]; >>>> @@ -1539,34 +1537,12 @@ dp_packet_ol_set_l4_csum_bad(struct dp_packet *p) >>>> *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_BAD; >>>> } >>>> >>>> -/* Marks packet 'p' with good integrity if checksum offload locations >>>> - * were provided. In the case of encapsulated packets, these values may >>>> - * be deeper into the packet than OVS might expect. But the packet >>>> - * should still be considered to have good integrity. >>>> - * The 'csum_start' is the offset from the begin of the packet headers. >>>> - * The 'csum_offset' is the offset from start to place the checksum. >>>> - * The csum_start and csum_offset fields are set from the virtio_net_hdr >>>> - * struct that may be provided by a netdev on packet ingress. */ >>>> -static inline void >>>> -dp_packet_ol_l4_csum_check_partial(struct dp_packet *p) >>>> -{ >>>> - if (p->csum_start && p->csum_offset) { >>>> - dp_packet_ol_set_l4_csum_partial(p); >>>> - } >>>> -} >>>> - >>>> static inline void >>>> dp_packet_reset_packet(struct dp_packet *b, int off) >>>> { >>>> dp_packet_set_size(b, dp_packet_size(b) - off); >>>> dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off)); >>>> dp_packet_reset_offsets(b); >>>> - >>>> - if (b->csum_start >= off && b->csum_offset) { >>>> - /* Adjust values for decapsulation. */ >>>> - b->csum_start -= off; >>>> - dp_packet_ol_set_l4_csum_partial(b); >>>> - } >>>> } >>>> >>>> static inline uint32_t ALWAYS_INLINE >>>> diff --git a/lib/dpif-netdev-extract-avx512.c >>>> b/lib/dpif-netdev-extract-avx512.c >>>> index 57ca4c71b7..3ae850c2d5 100644 >>>> --- a/lib/dpif-netdev-extract-avx512.c >>>> +++ b/lib/dpif-netdev-extract-avx512.c >>>> @@ -776,7 +776,6 @@ mfex_ipv6_set_hwol(struct dp_packet *pkt) >>>> static void >>>> mfex_tcp_set_hwol(struct dp_packet *pkt) >>>> { >>>> - dp_packet_ol_l4_csum_check_partial(pkt); >>>> if (dp_packet_l4_checksum_good(pkt) >>>> || dp_packet_ol_l4_csum_partial(pkt)) { >>>> dp_packet_hwol_set_csum_tcp(pkt); >>>> @@ -786,7 +785,6 @@ mfex_tcp_set_hwol(struct dp_packet *pkt) >>>> static void >>>> mfex_udp_set_hwol(struct dp_packet *pkt) >>>> { >>>> - dp_packet_ol_l4_csum_check_partial(pkt); >>>> if (dp_packet_l4_checksum_good(pkt) >>>> || dp_packet_ol_l4_csum_partial(pkt)) { >>>> dp_packet_hwol_set_csum_udp(pkt); >>>> diff --git a/lib/flow.c b/lib/flow.c >>>> index c2e1e4ad6f..988ae6bfdc 100644 >>>> --- a/lib/flow.c >>>> +++ b/lib/flow.c >>>> @@ -1080,7 +1080,6 @@ miniflow_extract(struct dp_packet *packet, struct >>>> miniflow *dst) >>>> } else if (dl_type == htons(ETH_TYPE_IPV6)) { >>>> dp_packet_update_rss_hash_ipv6_tcp_udp(packet); >>>> } >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> dp_packet_hwol_set_csum_tcp(packet); >>>> @@ -1100,7 +1099,6 @@ miniflow_extract(struct dp_packet *packet, struct >>>> miniflow *dst) >>>> } else if (dl_type == htons(ETH_TYPE_IPV6)) { >>>> dp_packet_update_rss_hash_ipv6_tcp_udp(packet); >>>> } >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> if (tunneling) { >>>> @@ -1118,7 +1116,6 @@ miniflow_extract(struct dp_packet *packet, struct >>>> miniflow *dst) >>>> miniflow_push_be16(mf, tp_dst, sctp->sctp_dst); >>>> miniflow_push_be16(mf, ct_tp_src, ct_tp_src); >>>> miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst); >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> dp_packet_hwol_set_csum_sctp(packet); >>>> @@ -1309,20 +1306,17 @@ parse_tcp_flags(struct dp_packet *packet, >>>> if (nw_proto == IPPROTO_TCP && size >= TCP_HEADER_LEN) { >>>> const struct tcp_header *tcp = data; >>>> >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> dp_packet_hwol_set_csum_tcp(packet); >>>> } >>>> return TCP_FLAGS(tcp->tcp_ctl); >>>> } else if (nw_proto == IPPROTO_UDP && size >= UDP_HEADER_LEN) { >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> dp_packet_hwol_set_csum_udp(packet); >>>> } >>>> } else if (nw_proto == IPPROTO_SCTP && size >= SCTP_HEADER_LEN) { >>>> - dp_packet_ol_l4_csum_check_partial(packet); >>>> if (dp_packet_l4_checksum_good(packet) >>>> || dp_packet_ol_l4_csum_partial(packet)) { >>>> dp_packet_hwol_set_csum_sctp(packet); >>>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c >>>> index af7438f1ee..c1c0addab7 100644 >>>> --- a/lib/netdev-linux.c >>>> +++ b/lib/netdev-linux.c >>>> @@ -89,6 +89,7 @@ COVERAGE_DEFINE(netdev_get_hwaddr); >>>> COVERAGE_DEFINE(netdev_set_hwaddr); >>>> COVERAGE_DEFINE(netdev_get_ethtool); >>>> COVERAGE_DEFINE(netdev_set_ethtool); >>>> +COVERAGE_DEFINE(netdev_linux_unknown_l4_csum); >>>> >>>> >>>> #ifndef IFLA_IF_NETNSID >>>> @@ -7038,6 +7039,70 @@ af_packet_sock(void) >>>> return sock; >>>> } >>>> >>>> +static int >>>> +netdev_linux_parse_packet(struct dp_packet *b, uint16_t *l2_len, >>>> + uint16_t *l3_len, uint16_t *l4proto) >>>> +{ >>>> + struct eth_header *eth_hdr; >>>> + ovs_be16 eth_type; >>>> + >>>> + eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); >>>> + if (!eth_hdr) { >>>> + return -EINVAL; >>>> + } >>>> + >>>> + *l2_len = ETH_HEADER_LEN; >>>> + eth_type = eth_hdr->eth_type; >>>> + if (eth_type_vlan(eth_type)) { >>>> + struct vlan_header *vlan = dp_packet_at(b, *l2_len, >>>> VLAN_HEADER_LEN); >>>> + >>>> + if (!vlan) { >>>> + return -EINVAL; >>>> + } >>>> + >>>> + eth_type = vlan->vlan_next_type; >>>> + *l2_len += VLAN_HEADER_LEN; >>>> + } >>>> + >>>> + if (eth_type == htons(ETH_TYPE_IP)) { >>>> + struct ip_header *ip_hdr = dp_packet_at(b, *l2_len, >>>> IP_HEADER_LEN); >>>> + >>>> + if (!ip_hdr) { >>>> + return -EINVAL; >>>> + } >>>> + >>>> + *l3_len = IP_IHL(ip_hdr->ip_ihl_ver) * 4; >>>> + *l4proto = ip_hdr->ip_proto; >>>> + dp_packet_hwol_set_tx_ipv4(b); >>>> + } else if (eth_type == htons(ETH_TYPE_IPV6)) { >>>> + struct ovs_16aligned_ip6_hdr *nh6; >>>> + const void *data; >>>> + uint8_t nw_proto; >>>> + uint8_t nw_frag; >>>> + size_t size; >>>> + >>>> + nh6 = dp_packet_at(b, *l2_len, IPV6_HEADER_LEN); >>>> + if (!nh6) { >>>> + return -EINVAL; >>>> + } >>>> + >>>> + nw_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; >>>> + data = (const char *) nh6 + sizeof *nh6; >>>> + size = (const char *) dp_packet_tail(b) - (const char *) data; >>>> + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, >>>> + NULL, NULL)) { >>>> + return -EINVAL; >>>> + } >>>> + *l3_len = (const char *) data - (const char *) nh6; >>>> + *l4proto = nw_proto; >>>> + dp_packet_hwol_set_tx_ipv6(b); >>>> + } else { >>>> + *l3_len = *l4proto = 0; >>>> + } >>>> + >>>> + return 0; >>>> +} >>>> + >>>> /* Initializes packet 'b' with features enabled in the prepended >>>> * struct virtio_net_hdr. Returns 0 if successful, otherwise a >>>> * positive errno value. */ >>>> @@ -7055,16 +7120,44 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) >>>> } >>>> >>>> if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { >>> >>> Instead of implementing our own checksumming here, why not just use >>> ethtool to disable all checksum offloading? >> >> Wouldn't that disable most of other offloads as well, like TSO? >> If so, we'll loose a lot of performance. > > We can toggle checksum and tso flags independently. I feel like this > approach is slightly more honest than indicating in the ovs-vsctl list > interface that checksums are offloaded, and then manually calculating > them all anyways. And I think doing this in usermode would be far more > impactful than the kernel/hw doing it.
While we can toggle these flags independently, most drivers will turn off TSO as well once checksum offload is disabled, because checksum offlaoding is a prerequisite for segmentation offloading in many cases, e.g. in virtio spec. We can't controll that, it's up to a driver. Also, we can't actually controll offloads on veth or similar interfaces, because disabling has to be done on the other side of the veth in order to be effective. Also, we're not re-calculating anything in a vast majority of cases. For any normal TCP/UDP/SCTP traffic offloading will work just fine. We will only recalculate for some protocols we do not understand or if the kenrel sends us some weird values. This should be a rare case. Disabling checksum offload with ethtool will turn it off for all types of traffic as there're no per-protocol options. For the performance, I did test this on my setup and I see that packet rate goes down from 9.8 Gbps to 9.4 Gbps in my setup when I force the checksum recalculation for TSO traffic with iperf. It is noticeable, but I don't think we care too much, as pumping 10G through the OVS main thread is not a good idea in the first place. We mostly just need to ensure correctness. So, I think, the approach in this patch should be good enough. > >> >>> >>>> - /* The packet has offloaded checksum. However, there is no >>>> - * additional information like the protocol used, so it would >>>> - * require to parse the packet here. The checksum starting point >>>> - * and offset are going to be verified when the packet headers >>>> - * are parsed during miniflow extraction. */ >>>> - b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start; >>>> - b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset; >>>> - } else { >>>> - b->csum_start = 0; >>>> - b->csum_offset = 0; >>>> + uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset; >>>> + uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start; >>>> + uint16_t l4proto; >>>> + uint16_t l2_len; >>>> + uint16_t l3_len; >>>> + >>>> + if (netdev_linux_parse_packet(b, &l2_len, &l3_len, &l4proto)) { >>>> + return EINVAL; >>>> + } >> >> This seems to me like the most contraversial part of the set. It doesn't >> seem wise to have a separate parsing method for this. While we may partially >> rely on kernel to not provide us with some broken packet, I'm not sure we can >> fully trust those packets either. We should be using more robust parsing >> implementation, e.g. call the parse_tcp_flags() function. It will take care >> of multiple vlan headers, IP fragments and potentially broken L3 headers. >> After this call we'll have all the offsets set in the dp_packet itself and >> we'll also have L4 protocol set with the previous patches. So, we should be >> able to use that information below to verify the checksum start and offset. >> >> If we didn't recognize the protocol, then just calculating it below seems >> reasonable. >> >>>> + >>>> + if (csum_start && csum_offset && csum_start == l2_len + l3_len >>>> + && ((csum_offset == offsetof(struct tcp_header, tcp_csum) >>>> + && l4proto == IPPROTO_TCP) >>>> + || (csum_offset == offsetof(struct udp_header, udp_csum) >>>> + && l4proto == IPPROTO_UDP) >>>> + || (csum_offset == offsetof(struct sctp_header, sctp_csum) >>>> + && l4proto == IPPROTO_SCTP))) { >>>> + dp_packet_ol_set_l4_csum_partial(b); >>>> + } else { >>>> + ovs_be16 *csum_l4; >>>> + void *l4; >>>> + >>>> + COVERAGE_INC(netdev_linux_unknown_l4_csum); >>>> + >>>> + csum_l4 = dp_packet_at(b, csum_start + csum_offset, >>>> + sizeof *csum_l4); >>>> + if (!csum_l4) { >>>> + return EINVAL; >>>> + } >>>> + >>>> + l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - >>>> csum_start); >>>> + *csum_l4 = csum(l4, dp_packet_size(b) - csum_start); >>>> + >>>> + if (l4proto == IPPROTO_TCP || l4proto == IPPROTO_UDP >>>> + || l4proto == IPPROTO_SCTP) { >>>> + dp_packet_ol_set_l4_csum_good(b); >>>> + } >>>> + } >>>> } >>>> >>>> int ret = 0; >>>> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c >>>> index cbb875bd26..d49597f380 100644 >>>> --- a/lib/netdev-native-tnl.c >>>> +++ b/lib/netdev-native-tnl.c >>>> @@ -338,11 +338,7 @@ netdev_tnl_push_udp_header(const struct netdev >>>> *netdev OVS_UNUSED, >>>> } else { >>>> dp_packet_hwol_set_csum_udp(packet); >>>> } >>>> - } >>>> - >>>> - if (packet->csum_start && packet->csum_offset) { >>>> - dp_packet_ol_set_l4_csum_partial(packet); >>>> - } else if (!udp->udp_csum) { >>>> + } else { >>>> dp_packet_ol_set_l4_csum_good(packet); >>>> } >>>> >>>> -- >>>> 2.48.1 >>>> >>> >> > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev