On 4/14/25 4:39 PM, Mike Pattrick wrote:
> On Fri, Apr 11, 2025 at 7:36 PM Ilya Maximets <i.maxim...@ovn.org> wrote:
>>
>> On 4/9/25 8:16 PM, Mike Pattrick wrote:
>>> On Thu, Mar 13, 2025 at 8:44 AM David Marchand
>>> <david.march...@redhat.com> wrote:
>>>>
>>>> As the virtio-net offload API is used for netdev-linux ports, but
>>>> provides no information about the potentially encapsulated protocol
>>>> concerned by a checksum request, specific informations from this netdev-
>>>> specific implementation is propagated into OVS code, and must be
>>>> carefully evaluated when some tunnel gets decapsulated.
>>>>
>>>> This induces a cost in "normal" processing path, while the netdev-linux
>>>> path is not performance critical.
>>>>
>>>> This patch removes such specific information, yet try harder to parse
>>>> the packet on the Rx side and set offload flags accordingly for non
>>>> encapsulated traffic and for encapsulated traffic, fixes the inner
>>>> checksum.
>>>>
>>>> Signed-off-by: David Marchand <david.march...@redhat.com>
>>>> ---
>>>> Changes since v3:
>>>> - dropped setting Tx flags,
>>>>
>>>> Changes since v1:
>>>> - fixed patch reordering issue: offloads fields was being added instead
>>>>   of later in this series,
>>>>
>>>> ---
>>>>  lib/dp-packet.c                  |   3 -
>>>>  lib/dp-packet.h                  |  24 -------
>>>>  lib/dpif-netdev-extract-avx512.c |   2 -
>>>>  lib/flow.c                       |   6 --
>>>>  lib/netdev-linux.c               | 113 ++++++++++++++++++++++++++++---
>>>>  lib/netdev-native-tnl.c          |   6 +-
>>>>  6 files changed, 104 insertions(+), 50 deletions(-)
>>>>
>>>> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
>>>> index dad0d7be3a..6a9bfd63ba 100644
>>>> --- a/lib/dp-packet.c
>>>> +++ b/lib/dp-packet.c
>>>> @@ -39,9 +39,6 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, 
>>>> enum dp_packet_source so
>>>>      dp_packet_init_specific(b);
>>>>      /* By default assume the packet type to be Ethernet. */
>>>>      b->packet_type = htonl(PT_ETH);
>>>> -    /* Reset csum start and offset. */
>>>> -    b->csum_start = 0;
>>>> -    b->csum_offset = 0;
>>>>  }
>>>>
>>>>  static void
>>>> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
>>>> index 549802167d..084e89bd99 100644
>>>> --- a/lib/dp-packet.h
>>>> +++ b/lib/dp-packet.h
>>>> @@ -178,8 +178,6 @@ struct dp_packet {
>>>>                                        or UINT16_MAX. */
>>>>      uint32_t cutlen;               /* length in bytes to cut from the 
>>>> end. */
>>>>      ovs_be32 packet_type;          /* Packet type as defined in OpenFlow 
>>>> */
>>>> -    uint16_t csum_start;           /* Position to start checksumming 
>>>> from. */
>>>> -    uint16_t csum_offset;          /* Offset to place checksum. */
>>>>      union {
>>>>          struct pkt_metadata md;
>>>>          uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
>>>> @@ -1539,34 +1537,12 @@ dp_packet_ol_set_l4_csum_bad(struct dp_packet *p)
>>>>      *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_BAD;
>>>>  }
>>>>
>>>> -/* Marks packet 'p' with good integrity if checksum offload locations
>>>> - * were provided. In the case of encapsulated packets, these values may
>>>> - * be deeper into the packet than OVS might expect. But the packet
>>>> - * should still be considered to have good integrity.
>>>> - * The 'csum_start' is the offset from the begin of the packet headers.
>>>> - * The 'csum_offset' is the offset from start to place the checksum.
>>>> - * The csum_start and csum_offset fields are set from the virtio_net_hdr
>>>> - * struct that may be provided by a netdev on packet ingress. */
>>>> -static inline void
>>>> -dp_packet_ol_l4_csum_check_partial(struct dp_packet *p)
>>>> -{
>>>> -    if (p->csum_start && p->csum_offset) {
>>>> -        dp_packet_ol_set_l4_csum_partial(p);
>>>> -    }
>>>> -}
>>>> -
>>>>  static inline void
>>>>  dp_packet_reset_packet(struct dp_packet *b, int off)
>>>>  {
>>>>      dp_packet_set_size(b, dp_packet_size(b) - off);
>>>>      dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off));
>>>>      dp_packet_reset_offsets(b);
>>>> -
>>>> -    if (b->csum_start >= off && b->csum_offset) {
>>>> -        /* Adjust values for decapsulation. */
>>>> -        b->csum_start -= off;
>>>> -        dp_packet_ol_set_l4_csum_partial(b);
>>>> -    }
>>>>  }
>>>>
>>>>  static inline uint32_t ALWAYS_INLINE
>>>> diff --git a/lib/dpif-netdev-extract-avx512.c 
>>>> b/lib/dpif-netdev-extract-avx512.c
>>>> index 57ca4c71b7..3ae850c2d5 100644
>>>> --- a/lib/dpif-netdev-extract-avx512.c
>>>> +++ b/lib/dpif-netdev-extract-avx512.c
>>>> @@ -776,7 +776,6 @@ mfex_ipv6_set_hwol(struct dp_packet *pkt)
>>>>  static void
>>>>  mfex_tcp_set_hwol(struct dp_packet *pkt)
>>>>  {
>>>> -    dp_packet_ol_l4_csum_check_partial(pkt);
>>>>      if (dp_packet_l4_checksum_good(pkt)
>>>>          || dp_packet_ol_l4_csum_partial(pkt)) {
>>>>          dp_packet_hwol_set_csum_tcp(pkt);
>>>> @@ -786,7 +785,6 @@ mfex_tcp_set_hwol(struct dp_packet *pkt)
>>>>  static void
>>>>  mfex_udp_set_hwol(struct dp_packet *pkt)
>>>>  {
>>>> -    dp_packet_ol_l4_csum_check_partial(pkt);
>>>>      if (dp_packet_l4_checksum_good(pkt)
>>>>          || dp_packet_ol_l4_csum_partial(pkt)) {
>>>>          dp_packet_hwol_set_csum_udp(pkt);
>>>> diff --git a/lib/flow.c b/lib/flow.c
>>>> index c2e1e4ad6f..988ae6bfdc 100644
>>>> --- a/lib/flow.c
>>>> +++ b/lib/flow.c
>>>> @@ -1080,7 +1080,6 @@ miniflow_extract(struct dp_packet *packet, struct 
>>>> miniflow *dst)
>>>>                      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>>>                          dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>>>                      }
>>>> -                    dp_packet_ol_l4_csum_check_partial(packet);
>>>>                      if (dp_packet_l4_checksum_good(packet)
>>>>                          || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                          dp_packet_hwol_set_csum_tcp(packet);
>>>> @@ -1100,7 +1099,6 @@ miniflow_extract(struct dp_packet *packet, struct 
>>>> miniflow *dst)
>>>>                  } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>>>>                      dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
>>>>                  }
>>>> -                dp_packet_ol_l4_csum_check_partial(packet);
>>>>                  if (dp_packet_l4_checksum_good(packet)
>>>>                      || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                      if (tunneling) {
>>>> @@ -1118,7 +1116,6 @@ miniflow_extract(struct dp_packet *packet, struct 
>>>> miniflow *dst)
>>>>                  miniflow_push_be16(mf, tp_dst, sctp->sctp_dst);
>>>>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>>>>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
>>>> -                dp_packet_ol_l4_csum_check_partial(packet);
>>>>                  if (dp_packet_l4_checksum_good(packet)
>>>>                      || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                      dp_packet_hwol_set_csum_sctp(packet);
>>>> @@ -1309,20 +1306,17 @@ parse_tcp_flags(struct dp_packet *packet,
>>>>          if (nw_proto == IPPROTO_TCP && size >= TCP_HEADER_LEN) {
>>>>              const struct tcp_header *tcp = data;
>>>>
>>>> -            dp_packet_ol_l4_csum_check_partial(packet);
>>>>              if (dp_packet_l4_checksum_good(packet)
>>>>                  || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                  dp_packet_hwol_set_csum_tcp(packet);
>>>>              }
>>>>              return TCP_FLAGS(tcp->tcp_ctl);
>>>>          } else if (nw_proto == IPPROTO_UDP && size >= UDP_HEADER_LEN) {
>>>> -            dp_packet_ol_l4_csum_check_partial(packet);
>>>>              if (dp_packet_l4_checksum_good(packet)
>>>>                  || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                  dp_packet_hwol_set_csum_udp(packet);
>>>>              }
>>>>          } else if (nw_proto == IPPROTO_SCTP && size >= SCTP_HEADER_LEN) {
>>>> -            dp_packet_ol_l4_csum_check_partial(packet);
>>>>              if (dp_packet_l4_checksum_good(packet)
>>>>                  || dp_packet_ol_l4_csum_partial(packet)) {
>>>>                  dp_packet_hwol_set_csum_sctp(packet);
>>>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>>>> index af7438f1ee..c1c0addab7 100644
>>>> --- a/lib/netdev-linux.c
>>>> +++ b/lib/netdev-linux.c
>>>> @@ -89,6 +89,7 @@ COVERAGE_DEFINE(netdev_get_hwaddr);
>>>>  COVERAGE_DEFINE(netdev_set_hwaddr);
>>>>  COVERAGE_DEFINE(netdev_get_ethtool);
>>>>  COVERAGE_DEFINE(netdev_set_ethtool);
>>>> +COVERAGE_DEFINE(netdev_linux_unknown_l4_csum);
>>>>
>>>>
>>>>  #ifndef IFLA_IF_NETNSID
>>>> @@ -7038,6 +7039,70 @@ af_packet_sock(void)
>>>>      return sock;
>>>>  }
>>>>
>>>> +static int
>>>> +netdev_linux_parse_packet(struct dp_packet *b, uint16_t *l2_len,
>>>> +                          uint16_t *l3_len, uint16_t *l4proto)
>>>> +{
>>>> +    struct eth_header *eth_hdr;
>>>> +    ovs_be16 eth_type;
>>>> +
>>>> +    eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
>>>> +    if (!eth_hdr) {
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    *l2_len = ETH_HEADER_LEN;
>>>> +    eth_type = eth_hdr->eth_type;
>>>> +    if (eth_type_vlan(eth_type)) {
>>>> +        struct vlan_header *vlan = dp_packet_at(b, *l2_len, 
>>>> VLAN_HEADER_LEN);
>>>> +
>>>> +        if (!vlan) {
>>>> +            return -EINVAL;
>>>> +        }
>>>> +
>>>> +        eth_type = vlan->vlan_next_type;
>>>> +        *l2_len += VLAN_HEADER_LEN;
>>>> +    }
>>>> +
>>>> +    if (eth_type == htons(ETH_TYPE_IP)) {
>>>> +        struct ip_header *ip_hdr = dp_packet_at(b, *l2_len, 
>>>> IP_HEADER_LEN);
>>>> +
>>>> +        if (!ip_hdr) {
>>>> +            return -EINVAL;
>>>> +        }
>>>> +
>>>> +        *l3_len = IP_IHL(ip_hdr->ip_ihl_ver) * 4;
>>>> +        *l4proto = ip_hdr->ip_proto;
>>>> +        dp_packet_hwol_set_tx_ipv4(b);
>>>> +    } else if (eth_type == htons(ETH_TYPE_IPV6)) {
>>>> +        struct ovs_16aligned_ip6_hdr *nh6;
>>>> +        const void *data;
>>>> +        uint8_t nw_proto;
>>>> +        uint8_t nw_frag;
>>>> +        size_t size;
>>>> +
>>>> +        nh6 = dp_packet_at(b, *l2_len, IPV6_HEADER_LEN);
>>>> +        if (!nh6) {
>>>> +            return -EINVAL;
>>>> +        }
>>>> +
>>>> +        nw_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
>>>> +        data = (const char *) nh6 + sizeof *nh6;
>>>> +        size = (const char *) dp_packet_tail(b) - (const char *) data;
>>>> +        if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag,
>>>> +                                 NULL, NULL)) {
>>>> +            return -EINVAL;
>>>> +        }
>>>> +        *l3_len = (const char *) data - (const char *) nh6;
>>>> +        *l4proto = nw_proto;
>>>> +        dp_packet_hwol_set_tx_ipv6(b);
>>>> +    } else {
>>>> +        *l3_len = *l4proto = 0;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>  /* Initializes packet 'b' with features enabled in the prepended
>>>>   * struct virtio_net_hdr.  Returns 0 if successful, otherwise a
>>>>   * positive errno value. */
>>>> @@ -7055,16 +7120,44 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>>>>      }
>>>>
>>>>      if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
>>>
>>> Instead of implementing our own checksumming here, why not just use
>>> ethtool to disable all checksum offloading?
>>
>> Wouldn't that disable most of other offloads as well, like TSO?
>> If so, we'll loose a lot of performance.
> 
> We can toggle checksum and tso flags independently. I feel like this
> approach is slightly more honest than indicating in the ovs-vsctl list
> interface that checksums are offloaded, and then manually calculating
> them all anyways. And I think doing this in usermode would be far more
> impactful than the kernel/hw doing it.

While we can toggle these flags independently, most drivers will turn off TSO
as well once checksum offload is disabled, because checksum offlaoding is a
prerequisite for segmentation offloading in many cases, e.g. in virtio spec.
We can't controll that, it's up to a driver.  Also, we can't actually controll
offloads on veth or similar interfaces, because disabling has to be done on
the other side of the veth in order to be effective.

Also, we're not re-calculating anything in a vast majority of cases.  For any
normal TCP/UDP/SCTP traffic offloading will work just fine.  We will only
recalculate for some protocols we do not understand or if the kenrel sends us
some weird values.  This should be a rare case.  Disabling checksum offload
with ethtool will turn it off for all types of traffic as there're no
per-protocol options.

For the performance, I did test this on my setup and I see that packet rate
goes down from 9.8 Gbps to 9.4 Gbps in my setup when I force the checksum
recalculation for TSO traffic with iperf.  It is noticeable, but I don't think
we care too much, as pumping 10G through the OVS main thread is not a good
idea in the first place.  We mostly just need to ensure correctness.

So, I think, the approach in this patch should be good enough.

> 
>>
>>>
>>>> -        /* The packet has offloaded checksum. However, there is no
>>>> -         * additional information like the protocol used, so it would
>>>> -         * require to parse the packet here. The checksum starting point
>>>> -         * and offset are going to be verified when the packet headers
>>>> -         * are parsed during miniflow extraction. */
>>>> -        b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start;
>>>> -        b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
>>>> -    } else {
>>>> -        b->csum_start = 0;
>>>> -        b->csum_offset = 0;
>>>> +        uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
>>>> +        uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start;
>>>> +        uint16_t l4proto;
>>>> +        uint16_t l2_len;
>>>> +        uint16_t l3_len;
>>>> +
>>>> +        if (netdev_linux_parse_packet(b, &l2_len, &l3_len, &l4proto)) {
>>>> +            return EINVAL;
>>>> +        }
>>
>> This seems to me like the most contraversial part of the set.  It doesn't
>> seem wise to have a separate parsing method for this.  While we may partially
>> rely on kernel to not provide us with some broken packet, I'm not sure we can
>> fully trust those packets either.  We should be using more robust parsing
>> implementation, e.g. call the parse_tcp_flags() function.  It will take care
>> of multiple vlan headers, IP fragments and potentially broken L3 headers.
>> After this call we'll have all the offsets set in the dp_packet itself and
>> we'll also have L4 protocol set with the previous patches.  So, we should be
>> able to use that information below to verify the checksum start and offset.
>>
>> If we didn't recognize the protocol, then just calculating it below seems
>> reasonable.
>>
>>>> +
>>>> +        if (csum_start && csum_offset && csum_start == l2_len + l3_len
>>>> +            && ((csum_offset == offsetof(struct tcp_header, tcp_csum)
>>>> +                 && l4proto == IPPROTO_TCP)
>>>> +                || (csum_offset == offsetof(struct udp_header, udp_csum)
>>>> +                    && l4proto == IPPROTO_UDP)
>>>> +                || (csum_offset == offsetof(struct sctp_header, sctp_csum)
>>>> +                    && l4proto == IPPROTO_SCTP))) {
>>>> +            dp_packet_ol_set_l4_csum_partial(b);
>>>> +        } else {
>>>> +            ovs_be16 *csum_l4;
>>>> +            void *l4;
>>>> +
>>>> +            COVERAGE_INC(netdev_linux_unknown_l4_csum);
>>>> +
>>>> +            csum_l4 = dp_packet_at(b, csum_start + csum_offset,
>>>> +                                   sizeof *csum_l4);
>>>> +            if (!csum_l4) {
>>>> +                return EINVAL;
>>>> +            }
>>>> +
>>>> +            l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - 
>>>> csum_start);
>>>> +            *csum_l4 = csum(l4, dp_packet_size(b) - csum_start);
>>>> +
>>>> +            if (l4proto == IPPROTO_TCP || l4proto == IPPROTO_UDP
>>>> +                || l4proto == IPPROTO_SCTP) {
>>>> +                dp_packet_ol_set_l4_csum_good(b);
>>>> +            }
>>>> +        }
>>>>      }
>>>>
>>>>      int ret = 0;
>>>> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
>>>> index cbb875bd26..d49597f380 100644
>>>> --- a/lib/netdev-native-tnl.c
>>>> +++ b/lib/netdev-native-tnl.c
>>>> @@ -338,11 +338,7 @@ netdev_tnl_push_udp_header(const struct netdev 
>>>> *netdev OVS_UNUSED,
>>>>          } else {
>>>>              dp_packet_hwol_set_csum_udp(packet);
>>>>          }
>>>> -    }
>>>> -
>>>> -    if (packet->csum_start && packet->csum_offset) {
>>>> -        dp_packet_ol_set_l4_csum_partial(packet);
>>>> -    } else if (!udp->udp_csum) {
>>>> +    } else {
>>>>          dp_packet_ol_set_l4_csum_good(packet);
>>>>      }
>>>>
>>>> --
>>>> 2.48.1
>>>>
>>>
>>
> 

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to