On 3/27/23 12:50, Mike Pattrick wrote:
> From: Flavio Leitner <[email protected]>
>
> The netdev receiving packets is supposed to provide the flags
> indicating if the L4 checksum was verified and it is OK or BAD,
> otherwise the stack will check when appropriate by software.
>
> If the packet comes with good checksum, then postpone the
> checksum calculation to the egress device if needed.
>
> When encapsulate a packet with that flag, set the checksum
> of the inner L4 header since that is not yet supported.
>
> Calculate the L4 checksum when the packet is going to be sent
> over a device that doesn't support the feature.
>
> Linux tap devices allows enabling L3 and L4 offload, so this
> patch enables the feature. However, Linux socket interface
> remains disabled because the API doesn't allow enabling
> those two features without enabling TSO too.
>
> Signed-off-by: Flavio Leitner <[email protected]>
> Co-authored-by: Mike Pattrick <[email protected]>
> Signed-off-by: Mike Pattrick <[email protected]>
> ---
> Since v9:
> - Extended miniflow_extract changes into avx512 code
> - Formatting changes
> - Note that we cannot currently enable checksum offloading in
> CONFIGURE_VETH_OFFLOADS for check-system-userspace as
> netdev-linux.c currently only parses the vnet header if TSO
> is enabled.
> Since v10:
> - No change
>
> Signed-off-by: Mike Pattrick <[email protected]>
> ---
> lib/conntrack.c | 15 +-
> lib/dp-packet.c | 25 ++++
> lib/dp-packet.h | 78 +++++++++-
> lib/dpif-netdev-extract-avx512.c | 62 +++++++-
> lib/flow.c | 23 +++
> lib/netdev-dpdk.c | 174 +++++++++++++++-------
> lib/netdev-linux.c | 242 +++++++++++++++++++++----------
> lib/netdev-native-tnl.c | 32 +---
> lib/netdev.c | 46 ++----
> lib/odp-execute-avx512.c | 24 +--
> lib/packets.c | 175 +++++++++++++++++-----
> lib/packets.h | 3 +
> 12 files changed, 645 insertions(+), 254 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 54166c320..75f3d7fee 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2044,13 +2044,12 @@ conn_key_extract(struct conntrack *ct, struct
> dp_packet *pkt, ovs_be16 dl_type,
> }
>
> if (ok) {
> - bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
> - if (!hwol_bad_l4_csum) {
> - bool hwol_good_l4_csum = dp_packet_l4_checksum_good(pkt)
> - || dp_packet_hwol_tx_l4_checksum(pkt);
> + if (!dp_packet_l4_checksum_bad(pkt)) {
> /* Validate the checksum only when hwol is not supported. */
> if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
> - &ctx->icmp_related, l3, !hwol_good_l4_csum,
> + &ctx->icmp_related, l3,
> + !dp_packet_l4_checksum_good(pkt) &&
> + !dp_packet_hwol_tx_l4_checksum(pkt),
> NULL)) {
> ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
> return true;
> @@ -3379,8 +3378,10 @@ handle_ftp_ctl(struct conntrack *ct, const struct
> conn_lookup_ctx *ctx,
> adj_seqnum(&th->tcp_seq, ec->seq_skew);
> }
>
> - th->tcp_csum = 0;
> - if (!dp_packet_hwol_tx_l4_checksum(pkt)) {
> + if (dp_packet_hwol_tx_l4_checksum(pkt)) {
> + dp_packet_ol_reset_l4_csum_good(pkt);
> + } else {
> + th->tcp_csum = 0;
> if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
> th->tcp_csum = packet_csum_upperlayer6(nh6, th,
> ctx->key.nw_proto,
> dp_packet_l4_size(pkt));
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index 61c36de44..dfedd0e9b 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -38,6 +38,9 @@ dp_packet_init__(struct dp_packet *b, size_t allocated,
> enum dp_packet_source so
> dp_packet_init_specific(b);
> /* By default assume the packet type to be Ethernet. */
> b->packet_type = htonl(PT_ETH);
> + /* Reset csum start and offset. */
> + b->csum_start = 0;
> + b->csum_offset = 0;
> }
>
> static void
> @@ -544,4 +547,26 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t
> flags)
> dp_packet_ol_set_ip_csum_good(p);
> dp_packet_hwol_reset_tx_ip_csum(p);
> }
> +
> + if (dp_packet_l4_checksum_good(p) || !dp_packet_hwol_tx_l4_checksum(p)) {
> + dp_packet_hwol_reset_tx_l4_csum(p);
> + return;
> + }
> +
> + if (dp_packet_hwol_l4_is_tcp(p)
> + && !(flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
> + packet_tcp_complete_csum(p);
> + dp_packet_ol_set_l4_csum_good(p);
> + dp_packet_hwol_reset_tx_l4_csum(p);
> + } else if (dp_packet_hwol_l4_is_udp(p)
> + && !(flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
> + packet_udp_complete_csum(p);
> + dp_packet_ol_set_l4_csum_good(p);
> + dp_packet_hwol_reset_tx_l4_csum(p);
> + } else if (!(flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)
> + && dp_packet_hwol_l4_is_sctp(p)) {
> + packet_sctp_complete_csum(p);
> + dp_packet_ol_set_l4_csum_good(p);
> + dp_packet_hwol_reset_tx_l4_csum(p);
> + }
> }
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index af0a2b7f0..c37c85857 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -140,6 +140,8 @@ struct dp_packet {
> or UINT16_MAX. */
> uint32_t cutlen; /* length in bytes to cut from the end. */
> ovs_be32 packet_type; /* Packet type as defined in OpenFlow */
> + uint16_t csum_start; /* Position to start checksumming from. */
> + uint16_t csum_offset; /* Offset to place checksum. */
> union {
> struct pkt_metadata md;
> uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
> @@ -997,6 +999,13 @@ dp_packet_hwol_is_ipv4(const struct dp_packet *b)
> return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_IPV4);
> }
>
> +/* Returns 'true' if packet 'p' is marked as IPv6. */
> +static inline bool
> +dp_packet_hwol_tx_ipv6(const struct dp_packet *p)
> +{
> + return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IPV6);
> +}
> +
> /* Returns 'true' if packet 'b' is marked for TCP checksum offloading. */
> static inline bool
> dp_packet_hwol_l4_is_tcp(const struct dp_packet *b)
> @@ -1021,18 +1030,26 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
> DP_PACKET_OL_TX_SCTP_CKSUM;
> }
>
> -/* Mark packet 'b' for IPv4 checksum offloading. */
> static inline void
> -dp_packet_hwol_set_tx_ipv4(struct dp_packet *b)
> +dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)
> +{
> + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_L4_MASK;
> +}
> +
> +/* Mark packet 'p' as IPv4. */
> +static inline void
> +dp_packet_hwol_set_tx_ipv4(struct dp_packet *p)
> {
> - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4;
> + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IPV6;
> + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IPV4;
> }
>
> -/* Mark packet 'b' for IPv6 checksum offloading. */
> +/* Mark packet 'a' as IPv6. */
> static inline void
> -dp_packet_hwol_set_tx_ipv6(struct dp_packet *b)
> +dp_packet_hwol_set_tx_ipv6(struct dp_packet *a)
> {
> - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6;
> + *dp_packet_ol_flags_ptr(a) &= ~DP_PACKET_OL_TX_IPV4;
> + *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_IPV6;
> }
>
> /* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */
> @@ -1147,6 +1164,55 @@ dp_packet_l4_checksum_bad(const struct dp_packet *p)
> DP_PACKET_OL_RX_L4_CKSUM_BAD;
> }
>
> +/* Returns 'true' if the packet has good integrity though the
> + * checksum in the packet 'p' is not complete. */
> +static inline bool
> +dp_packet_ol_l4_csum_partial(const struct dp_packet *p)
> +{
> + return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) ==
> + DP_PACKET_OL_RX_L4_CKSUM_MASK;
> +}
> +
> +/* Marks packet 'p' with good integrity though the checksum in the
> + * packet is not complete. */
> +static inline void
> +dp_packet_ol_set_l4_csum_partial(struct dp_packet *p)
> +{
> + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_MASK;
> +}
> +
> +/* Marks packet 'p' with good L4 checksum. */
> +static inline void
> +dp_packet_ol_set_l4_csum_good(struct dp_packet *p)
> +{
> + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_BAD;
> + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_GOOD;
> +}
> +
> +/* Marks packet 'p' with good L4 checksum as modified. */
> +static inline void
> +dp_packet_ol_reset_l4_csum_good(struct dp_packet *p)
> +{
> + if (!dp_packet_ol_l4_csum_partial(p)) {
> + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_GOOD;
> + }
> +}
> +
> +/* Marks packet 'p' with good integrity if the 'start' and 'offset'
> + * matches with the 'csum_start' and 'csum_offset' in packet 'p'.
> + * The 'start' is the offset from the begin of the packet headers.
> + * The 'offset' is the offset from start to place the checksum.
> + * The csum_start and csum_offset fields are set from the virtio_net_hdr
> + * struct that may be provided by a netdev on packet ingress. */
> +static inline void
> +dp_packet_ol_vnet_csum_check(struct dp_packet *p, uint16_t start,
> + uint16_t offset)
Can we call this function dp_packet_ol_l4_csum_check_partial or
something like that?
> +{
> + if (p->csum_start == start && p->csum_offset == offset) {
> + dp_packet_ol_set_l4_csum_partial(p);
> + }
> +}
> +
> static inline uint32_t ALWAYS_INLINE
> dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs,
> uint32_t hash)
> diff --git a/lib/dpif-netdev-extract-avx512.c
> b/lib/dpif-netdev-extract-avx512.c
> index 66884eaf0..a9299d229 100644
> --- a/lib/dpif-netdev-extract-avx512.c
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -698,7 +698,6 @@ mfex_ipv6_set_l2_pad_size(struct dp_packet *pkt,
> return -1;
> }
> dp_packet_set_l2_pad_size(pkt, len_from_ipv6 - (p_len +
> IPV6_HEADER_LEN));
> - dp_packet_hwol_set_tx_ipv6(pkt);
> return 0;
> }
>
> @@ -729,10 +728,6 @@ mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct
> ip_header *nh,
> return -1;
> }
> dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - ip_tot_len);
> - dp_packet_hwol_set_tx_ipv4(pkt);
> - if (dp_packet_ip_checksum_good(pkt)) {
> - dp_packet_hwol_set_tx_ip_csum(pkt);
> - }
> return 0;
> }
>
> @@ -763,6 +758,45 @@ mfex_check_tcp_data_offset(const struct tcp_header *tcp)
> return ret;
> }
>
> +static void
> +mfex_ipv4_set_hwol(struct dp_packet *pkt)
> +{
> + dp_packet_hwol_set_tx_ipv4(pkt);
> + if (dp_packet_ip_checksum_good(pkt)) {
> + dp_packet_hwol_set_tx_ip_csum(pkt);
> + }
> +}
> +
> +static void
> +mfex_ipv6_set_hwol(struct dp_packet *pkt)
> +{
> + dp_packet_hwol_set_tx_ipv6(pkt);
> +}
> +
> +static void
> +mfex_tcp_set_hwol(struct dp_packet *pkt)
> +{
> + dp_packet_ol_vnet_csum_check(pkt, pkt->l4_ofs,
> + offsetof(struct tcp_header,
> + tcp_csum));
> + if (dp_packet_l4_checksum_good(pkt)
> + || dp_packet_ol_l4_csum_partial(pkt)) {
> + dp_packet_hwol_set_csum_tcp(pkt);
> + }
> +}
> +
> +static void
> +mfex_udp_set_hwol(struct dp_packet *pkt)
> +{
> + dp_packet_ol_vnet_csum_check(pkt, pkt->l4_ofs,
> + offsetof(struct udp_header,
> + udp_csum));
> + if (dp_packet_l4_checksum_good(pkt)
> + || dp_packet_ol_l4_csum_partial(pkt)) {
> + dp_packet_hwol_set_csum_udp(pkt);
> + }
> +}
> +
> /* Generic loop to process any mfex profile. This code is specialized into
> * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE
> * to ensure the compiler specializes each instance. The code is marked "hot"
> @@ -864,6 +898,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> const struct tcp_header *tcp = (void *)&pkt[38];
> mfex_handle_tcp_flags(tcp, &blocks[7]);
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> + mfex_ipv4_set_hwol(packet);
> + mfex_tcp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_VLAN_IPV4_UDP: {
> @@ -876,6 +912,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> continue;
> }
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> + mfex_ipv4_set_hwol(packet);
> + mfex_udp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_IPV4_TCP: {
> @@ -891,6 +929,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> continue;
> }
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> + mfex_ipv4_set_hwol(packet);
> + mfex_tcp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_IPV4_UDP: {
> @@ -902,6 +942,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> continue;
> }
> dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> + mfex_ipv4_set_hwol(packet);
> + mfex_udp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_IPV6_UDP: {
> @@ -920,6 +962,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> /* Process UDP header. */
> mfex_handle_ipv6_l4((void *)&pkt[54], &blocks[9]);
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> + mfex_ipv6_set_hwol(packet);
> + mfex_udp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_IPV6_TCP: {
> @@ -943,6 +987,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> }
> mfex_handle_tcp_flags(tcp, &blocks[9]);
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> + mfex_ipv6_set_hwol(packet);
> + mfex_tcp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_VLAN_IPV6_TCP: {
> @@ -969,6 +1015,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> }
> mfex_handle_tcp_flags(tcp, &blocks[10]);
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> + mfex_ipv6_set_hwol(packet);
> + mfex_tcp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_VLAN_IPV6_UDP: {
> @@ -990,6 +1038,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> /* Process UDP header. */
> mfex_handle_ipv6_l4((void *)&pkt[58], &blocks[10]);
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> + mfex_ipv6_set_hwol(packet);
> + mfex_udp_set_hwol(packet);
> } break;
>
> case PROFILE_ETH_IPV4_NVGRE: {
> @@ -1000,6 +1050,8 @@ mfex_avx512_process(struct dp_packet_batch *packets,
> continue;
> }
> dp_packet_update_rss_hash_ipv4(packet);
> + mfex_ipv4_set_hwol(packet);
> + mfex_udp_set_hwol(packet);
> } break;
>
> default:
> diff --git a/lib/flow.c b/lib/flow.c
> index 6c8bf7fc0..5aaf3b420 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -1027,6 +1027,13 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> }
> + dp_packet_ol_vnet_csum_check(packet, packet->l4_ofs,
> + offsetof(struct tcp_header,
> + tcp_csum));
> + if (dp_packet_l4_checksum_good(packet)
> + || dp_packet_ol_l4_csum_partial(packet)) {
> + dp_packet_hwol_set_csum_tcp(packet);
> + }
> }
> }
> } else if (OVS_LIKELY(nw_proto == IPPROTO_UDP)) {
> @@ -1042,6 +1049,13 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> }
> + dp_packet_ol_vnet_csum_check(packet, packet->l4_ofs,
> + offsetof(struct udp_header,
> + udp_csum));
> + if (dp_packet_l4_checksum_good(packet)
> + || dp_packet_ol_l4_csum_partial(packet)) {
> + dp_packet_hwol_set_csum_udp(packet);
> + }
> }
> } else if (OVS_LIKELY(nw_proto == IPPROTO_SCTP)) {
> if (OVS_LIKELY(size >= SCTP_HEADER_LEN)) {
> @@ -1051,6 +1065,13 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> miniflow_push_be16(mf, tp_dst, sctp->sctp_dst);
> miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
> miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
> + dp_packet_ol_vnet_csum_check(packet, packet->l4_ofs,
> + offsetof(struct sctp_header,
> + sctp_csum));
> + if (dp_packet_l4_checksum_good(packet)
> + || dp_packet_ol_l4_csum_partial(packet)) {
> + dp_packet_hwol_set_csum_sctp(packet);
> + }
> }
> } else if (OVS_LIKELY(nw_proto == IPPROTO_ICMP)) {
> if (OVS_LIKELY(size >= ICMP_HEADER_LEN)) {
> @@ -3170,6 +3191,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct
> flow *flow,
> tcp->tcp_csum = 0;
> tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
> tcp, l4_len));
> + dp_packet_ol_set_l4_csum_good(p);
> } else if (flow->nw_proto == IPPROTO_UDP) {
> struct udp_header *udp = dp_packet_l4(p);
>
> @@ -3179,6 +3201,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct
> flow *flow,
> if (!udp->udp_csum) {
> udp->udp_csum = htons(0xffff);
> }
> + dp_packet_ol_set_l4_csum_good(p);
> } else if (flow->nw_proto == IPPROTO_ICMP) {
> struct icmp_header *icmp = dp_packet_l4(p);
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 8c2c07898..31ac534f6 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -412,8 +412,10 @@ enum dpdk_hw_ol_features {
> NETDEV_RX_HW_CRC_STRIP = 1 << 1,
> NETDEV_RX_HW_SCATTER = 1 << 2,
> NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3,
> - NETDEV_TX_TSO_OFFLOAD = 1 << 4,
> - NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 5,
> + NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4,
> + NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
> + NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
> + NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> };
>
> /*
> @@ -1008,6 +1010,35 @@ dpdk_watchdog(void *dummy OVS_UNUSED)
> return NULL;
> }
>
> +static void
> +netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev,
> + enum dpdk_hw_ol_features hw_ol_features,
> + enum netdev_ol_flags flag)
Annotation that it requires dev->mutex would be nice.
> +{
> + struct netdev *netdev = &dev->up;
> +
> + if (dev->hw_ol_features & hw_ol_features) {
> + netdev->ol_flags |= flag;
> + } else {
> + netdev->ol_flags &= ~flag;
> + }
> +}
> +
> +static void
> +netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
> +{
> + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_IPV4_CKSUM_OFFLOAD,
> + NETDEV_TX_OFFLOAD_IPV4_CKSUM);
> + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TCP_CKSUM_OFFLOAD,
> + NETDEV_TX_OFFLOAD_TCP_CKSUM);
> + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_UDP_CKSUM_OFFLOAD,
> + NETDEV_TX_OFFLOAD_UDP_CKSUM);
> + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_SCTP_CKSUM_OFFLOAD,
> + NETDEV_TX_OFFLOAD_SCTP_CKSUM);
> + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
> + NETDEV_TX_OFFLOAD_TCP_TSO);
> +}
> +
> static int
> dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
> {
> @@ -1044,11 +1075,20 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int
> n_rxq, int n_txq)
> conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM;
> }
>
> + if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) {
> + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
> + }
> +
> + if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) {
> + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM;
> + }
> +
> + if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) {
> + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
> + }
> +
> if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
> - conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
> - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
> - conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM;
> - }
> + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
> }
>
> /* Limit configured rss hash functions to only those supported
> @@ -1154,7 +1194,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> struct rte_ether_addr eth_addr;
> int diag;
> int n_rxq, n_txq;
> - uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS;
> uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
> RTE_ETH_RX_OFFLOAD_TCP_CKSUM |
> RTE_ETH_RX_OFFLOAD_IPV4_CKSUM;
> @@ -1190,18 +1229,28 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
> }
>
> + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) {
> + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
> + } else {
> + dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD;
> + }
> +
> + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) {
> + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
> + } else {
> + dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD;
> + }
> +
> + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
> + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
> + } else {
> + dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
> + }
> +
> dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
> if (userspace_tso_enabled()) {
> - if ((info.tx_offload_capa & tx_tso_offload_capa)
> - == tx_tso_offload_capa) {
> + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
> dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) {
> - dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
> - } else {
> - VLOG_WARN("%s: Tx SCTP checksum offload is not supported, "
> - "SCTP packets sent to this device will be dropped",
> - netdev_get_name(&dev->up));
> - }
> } else {
> VLOG_WARN("%s: Tx TSO offload is not supported.",
> netdev_get_name(&dev->up));
> @@ -2213,6 +2262,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev,
> struct rte_mbuf *mbuf)
>
> mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> + mbuf->l4_len = 0;
> mbuf->outer_l2_len = 0;
> mbuf->outer_l3_len = 0;
>
> @@ -4149,6 +4199,7 @@ new_device(int vid)
> ovs_mutex_lock(&dev->mutex);
> if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
> + uint64_t features;
>
> /* Get NUMA information */
> newnode = rte_vhost_get_numa_node(vid);
> @@ -4173,6 +4224,36 @@ new_device(int vid)
> dev->vhost_reconfigured = true;
> }
>
> + if (rte_vhost_get_negotiated_features(vid, &features)) {
> + VLOG_INFO("Error checking guest features for "
> + "vHost Device '%s'", dev->vhost_id);
> + } else {
> + if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
> + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD;
> + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD;
> + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD;
> + }
> +
> + if (userspace_tso_enabled()) {
> + if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4)
> + && features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) {
> +
> + dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> + VLOG_DBG("%s: TSO enabled on vhost port",
> + netdev_get_name(&dev->up));
> + } else {
> + VLOG_WARN("%s: Tx TSO offload is not supported.",
> + netdev_get_name(&dev->up));
> + }
> + }
> + }
> +
> + /* There is no support in virtio net to offload IPv4 csum,
> + * but the vhost library handles IPv4 csum offloading fine. */
> + dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
> +
> + netdev_dpdk_update_netdev_flags(dev);
> +
> ovsrcu_index_set(&dev->vid, vid);
> exists = true;
>
> @@ -4236,6 +4317,14 @@ destroy_device(int vid)
> dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
> netdev_dpdk_txq_map_clear(dev);
>
> + /* Clear offload capabilities before next new_device. */
> + dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
> + dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD;
> + dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD;
> + dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
> + dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
> + netdev_dpdk_update_netdev_flags(dev);
> +
> netdev_change_seq_changed(&dev->up);
> ovs_mutex_unlock(&dev->mutex);
> exists = true;
> @@ -5246,22 +5335,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
> }
>
> err = dpdk_eth_dev_init(dev);
> -
> - if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - } else {
> - netdev->ol_flags &= ~NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - }
> -
> - if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> - }
> - }
> + netdev_dpdk_update_netdev_flags(dev);
>
> /* If both requested and actual hwaddr were previously
> * unset (initialized to 0), then first device init above
> @@ -5308,11 +5382,6 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
> memset(dev->sw_stats, 0, sizeof *dev->sw_stats);
> rte_spinlock_unlock(&dev->stats_lock);
>
> - if (userspace_tso_enabled()) {
> - dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> - VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up));
> - }
> -
> netdev_dpdk_remap_txqs(dev);
>
> if (netdev_dpdk_get_vid(dev) >= 0) {
> @@ -5333,6 +5402,8 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
> }
> }
>
> + netdev_dpdk_update_netdev_flags(dev);
> +
> return 0;
> }
>
> @@ -5354,8 +5425,6 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev
> *netdev)
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> int err;
> - uint64_t vhost_flags = 0;
> - uint64_t vhost_unsup_flags;
>
> ovs_mutex_lock(&dev->mutex);
>
> @@ -5365,6 +5434,9 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev
> *netdev)
> * 2. A path has been specified.
> */
> if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id)
> {
> + uint64_t virtio_unsup_features = 0;
> + uint64_t vhost_flags = 0;
> +
> /* Register client-mode device. */
> vhost_flags |= RTE_VHOST_USER_CLIENT;
>
> @@ -5411,22 +5483,22 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev
> *netdev)
> }
>
> if (userspace_tso_enabled()) {
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN
> - | 1ULL << VIRTIO_NET_F_HOST_UFO;
> + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN
> + | 1ULL << VIRTIO_NET_F_HOST_UFO;
> + VLOG_DBG("%s: TSO enabled on vhost port",
> + netdev_get_name(&dev->up));
> } else {
> - /* This disables checksum offloading and all the features
> - * that depends on it (TSO, UFO, ECN) according to virtio
> - * specification. */
> - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM;
> + /* Advertise checksum offloading to the guest, but explicitly
> + * disable TSO and friends.
> + * NOTE: we can't disable HOST_ECN which may have been wrongly
> + * negotiated by a running guest. */
> + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4
> + | 1ULL << VIRTIO_NET_F_HOST_TSO6
> + | 1ULL << VIRTIO_NET_F_HOST_UFO;
> }
>
> err = rte_vhost_driver_disable_features(dev->vhost_id,
> - vhost_unsup_flags);
> + virtio_unsup_features);
> if (err) {
> VLOG_ERR("rte_vhost_driver_disable_features failed for "
> "vhost user client port: %s\n", dev->up.name);
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 36620199e..8b4a327ae 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -938,14 +938,6 @@ netdev_linux_common_construct(struct netdev *netdev_)
> netnsid_unset(&netdev->netnsid);
> ovs_mutex_init(&netdev->mutex);
>
> - if (userspace_tso_enabled()) {
> - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> - }
> -
> return 0;
> }
>
> @@ -959,6 +951,16 @@ netdev_linux_construct(struct netdev *netdev_)
> return error;
> }
>
> + /* The socket interface doesn't offer the option to enable only
> + * csum offloading without TSO. */
> + if (userspace_tso_enabled()) {
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> + }
> +
> error = get_flags(&netdev->up, &netdev->ifi_flags);
> if (error == ENODEV) {
> if (netdev->up.netdev_class != &netdev_internal_class) {
> @@ -987,6 +989,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
> struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> static const char tap_dev[] = "/dev/net/tun";
> const char *name = netdev_->name;
> + unsigned long oflags;
> struct ifreq ifr;
>
> int error = netdev_linux_common_construct(netdev_);
> @@ -1004,10 +1007,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
>
> /* Create tap device. */
> get_flags(&netdev->up, &netdev->ifi_flags);
> - ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
> - if (userspace_tso_enabled()) {
> - ifr.ifr_flags |= IFF_VNET_HDR;
> - }
> + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
>
> ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
> if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
> @@ -1030,21 +1030,22 @@ netdev_linux_construct_tap(struct netdev *netdev_)
> goto error_close;
> }
>
> + oflags = TUN_F_CSUM;
> if (userspace_tso_enabled()) {
> - /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
> - * available, it will return EINVAL when a flag is unknown.
> - * Therefore, try enabling offload with no flags to check
> - * if TUNSETOFFLOAD support is available or not. */
> - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL)
> {
> - unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
> -
> - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
> - VLOG_WARN("%s: enabling tap offloading failed: %s", name,
> - ovs_strerror(errno));
> - error = errno;
> - goto error_close;
> - }
> - }
> + oflags |= (TUN_F_TSO4 | TUN_F_TSO6);
> + }
> +
> + if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) {
> + netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_IPV4_CKSUM
> + | NETDEV_TX_OFFLOAD_TCP_CKSUM
> + | NETDEV_TX_OFFLOAD_UDP_CKSUM);
> +
> + if (userspace_tso_enabled()) {
> + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> + }
> + } else {
> + VLOG_WARN("%s: Disabling hardware offloading: %s", name,
> + ovs_strerror(errno));
This wording is not ideal. 'Hardware offloading' is usually associated
with flow offloading and might be misinterpreted.
> }
>
> netdev->present = true;
> @@ -1344,18 +1345,22 @@ netdev_linux_batch_rxq_recv_sock(struct
> netdev_rxq_linux *rx, int mtu,
> pkt = buffers[i];
> }
>
> - if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
> - struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
> - struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> + if (virtio_net_hdr_size) {
> + int ret = netdev_linux_parse_vnet_hdr(pkt);
> + if (OVS_UNLIKELY(ret)) {
> + struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
> + struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>
> - /* Unexpected error situation: the virtio header is not present
> - * or corrupted. Drop the packet but continue in case next ones
> - * are correct. */
> - dp_packet_delete(pkt);
> - netdev->rx_dropped += 1;
> - VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net
> header",
> - netdev_get_name(netdev_));
> - continue;
> + /* Unexpected error situation: the virtio header is not
> + * present or corrupted or contains unsupported features.
> + * Drop the packet but continue in case next ones are
> + * correct. */
> + dp_packet_delete(pkt);
> + netdev->rx_dropped += 1;
> + VLOG_WARN_RL(&rl, "%s: Dropped packet: %s",
> + netdev_get_name(netdev_), ovs_strerror(ret));
We should probably still mention that it was an error in the virtio net
header. I doubt the error code alone provides sufficient information.
<snip>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 93b6b6ccc..ebb13d2d1 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -485,9 +485,11 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch
> *batch,
> size_t l4_size = dp_packet_l4_size(packet);
>
> if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
> - /* New UDP checksum. */
> struct udp_header *uh = dp_packet_l4(packet);
> - if (uh->udp_csum) {
> + if (dp_packet_hwol_l4_is_udp(packet)) {
> + dp_packet_ol_reset_l4_csum_good(packet);
> + } else if (uh->udp_csum) {
> + /* New UDP checksum. */
> uint16_t old_udp_checksum = ~uh->udp_csum;
> uint32_t udp_checksum = old_udp_checksum +
> delta_checksum;
> udp_checksum = csum_finish(udp_checksum);
> @@ -500,13 +502,17 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch
> *batch,
> }
> } else if (nh->ip_proto == IPPROTO_TCP &&
> l4_size >= TCP_HEADER_LEN) {
> - /* New TCP checksum. */
> - struct tcp_header *th = dp_packet_l4(packet);
> - uint16_t old_tcp_checksum = ~th->tcp_csum;
> - uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
> - tcp_checksum = csum_finish(tcp_checksum);
> -
> - th->tcp_csum = tcp_checksum;
> + if (dp_packet_hwol_l4_is_tcp(packet)) {
> + dp_packet_ol_reset_l4_csum_good(packet);
> + } else {
> + /* New TCP checksum. */
> + struct tcp_header *th = dp_packet_l4(packet);
> + uint16_t old_tcp_checksum = ~th->tcp_csum;
> + uint32_t tcp_checksum = old_tcp_checksum +
> delta_checksum;
> + tcp_checksum = csum_finish(tcp_checksum);
> +
> + th->tcp_csum = tcp_checksum;
> + }
This covers IPv4, but we also have IPv6. See action_avx512_set_ipv6().
Best regards, Ilya Maximets.
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev