Hello,

Quick (unfinished) comments, I will try to have a better look in the
next weeks (but I am busy with 23.11).


On Mon, Sep 25, 2023 at 3:38 AM Dexia Li <dexia...@jaguarmicro.com> wrote:
>
> For dpdk datapath, this patch provides vxlan and geneve tunnel tso.

I suppose you are talking about the userspace datapath, using DPDK support.
There is no dpdk datapath.


> Only support userspace vxlan or geneve tunnel, meanwhile support
> tunnel outter and inner csum offload. if netdev do not support offload
> features, there is a software fallback.

Can you provide performance numbers of the scenario you tested?
How does this patch impact performance with simple traffic?
Did you check L3 checksum, L4 checksum still work without tunnels?


>
> Signed-off-by: Dexia Li <dexia...@jaguarmicro.com>
> ---
>  lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
>  lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
>  lib/netdev-native-tnl.h |   5 +-
>  lib/netdev-provider.h   |   4 +
>  lib/netdev.c            |  34 ++++++---
>  tests/dpif-netdev.at    |   4 +-
>  7 files changed, 369 insertions(+), 49 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 70ddf8aa4..4b5883dba 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -86,22 +86,43 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
>      /* Offload IP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
> +    /* Offload packet is tunnel GENEVE. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
> +                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
> +    /* Offload packet is tunnel VXLAN. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
> +                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
> +    /* Offload tunnel packet, out is ipv4 */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
> +                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
> +    /* Offload TUNNEL out ipv4 checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
> +    /* Offload TUNNEL out udp checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
> +
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>
> -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
> -                                     DP_PACKET_OL_FLOW_MARK        | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_TX_TCP_SEG       | \
> -                                     DP_PACKET_OL_TX_IPV4          | \
> -                                     DP_PACKET_OL_TX_IPV6          | \
> -                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> -                                     DP_PACKET_OL_TX_IP_CKSUM)
> +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
> +                                     DP_PACKET_OL_FLOW_MARK          | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_TX_TCP_SEG         | \
> +                                     DP_PACKET_OL_TX_IPV4            | \
> +                                     DP_PACKET_OL_TX_IPV6            | \
> +                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM        | \
> +                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
> +                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
> +                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
> +                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
> +                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -535,6 +556,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b)
>  }
>
>  #ifdef DPDK_NETDEV
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len)
> +{
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len)
> +{
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len)
> +{
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +

Exposing DPDK specific fields looks incorrect to me.
This should be abstracted in some way so that OVS "generic" code is
unaware of what runs underneath.


>  static inline uint64_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)
>  {
> @@ -554,6 +594,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b)
>  }
>
>  #else
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len 
> OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len 
> OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len 
> OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +

What happens if you run this code without DPDK?
Note that those empty stubs would not be needed with a proper abstraction.


>  static inline uint32_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)
>  {
> @@ -615,9 +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
>       * (and thus 'v') will always be <= UINT16_MAX; this means that there is 
> no
>       * loss of accuracy in assigning 'v' to 'data_len'.
>       */
> -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> -    b->mbuf.pkt_len = v;             /* Total length of all segments linked 
> to
> -                                      * this segment. */
> +    /* Current seg length. */
> +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);

What is this change for?


> +    /* Total length of all segments linked to this segment. */
> +    b->mbuf.pkt_len = v;
>  }
>
>  static inline uint16_t
> @@ -1030,6 +1089,43 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
>              DP_PACKET_OL_TX_SCTP_CKSUM;
>  }
>
> +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_GENEVE);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4. */
> +static inline bool
> +dp_packet_hwol_is_outer_ipv4(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV4);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. */
> +static inline bool
> +dp_packet_hwol_is_outer_ipv4_cksum(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out udp csum offload. */
> +static inline bool
> +dp_packet_hwol_is_outer_UDP_cksum(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM);
> +}
> +
>  static inline void
>  dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)
>  {
> @@ -1105,6 +1201,43 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>
> +/* Mark packet 'b' for tunnel geneve offloading.  It implies that
> + * the packet 'b' is marked for tunnel geneve offloading. */
> +static inline void
> +dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE;
> +}
> +
> +/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
> + * the packet 'b' is marked for tunnel vxlan offloading. */
> +static inline void
> +dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
> +}
> +
> +/* Mark packet 'b' for out ipv4 packet. */
> +static inline void
> +dp_packet_hwol_set_outer_ipv4(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4;
> +}
> +
> +/* Mark packet 'b' for out ipv4 csum offloading. */
> +static inline void
> +dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
> +}
> +
> +/* Mark packet 'b' for out udp csum offloading. */
> +static inline void
> +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
> +}
> +
>  /* Returns 'true' if the IP header has good integrity and the
>   * checksum in it is complete. */
>  static inline bool
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 55700250d..fbe46fc4b 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features {
>      NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
>      NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
> +    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
> +    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
> +    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
>  };
>
>  enum dpdk_rx_steer_flags {
> @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk 
> *dev)
>                                     NETDEV_TX_OFFLOAD_SCTP_CKSUM);
>      netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
>                                     NETDEV_TX_OFFLOAD_TCP_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_VXLAN_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_GENEVE_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
>  }
>
>  static int
> @@ -1129,6 +1141,23 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int 
> n_rxq, int n_txq)
>          conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
>      }
>
> +    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
> +    }
> +
> +
>      /* Limit configured rss hash functions to only those supported
>       * by the eth device. */
>      conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
> @@ -1346,6 +1375,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
>      }
>
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    }
> +
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
> @@ -1354,6 +1395,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>              VLOG_WARN("%s: Tx TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
>      }
>
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> @@ -2445,29 +2500,56 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
> struct rte_mbuf *mbuf)
>          return true;
>      }
>
> -    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> -    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> -    mbuf->l4_len = 0;
> -    mbuf->outer_l2_len = 0;
> -    mbuf->outer_l3_len = 0;
> +    /* If packet is vxlan or geneve tunnel packet, calculate outer
> +     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
> +     * before. */
> +    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> +        if (mbuf->ol_flags &
> +            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                     (char *) dp_packet_eth(pkt);
> +            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                     (char *) dp_packet_l3(pkt);
> +        } else {
> +            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
> +                   (char *) dp_packet_eth(pkt);
> +            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
> +                   (char *) dp_packet_l3(pkt);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +        }
> +    }

This hunk above more or less reverts 5d11c47d3ebe ("userspace: Enable
IP checksum offloading by default.").
So I suspect it breaks IP checksum when OVS only requests it (and no L4).


>
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>
>          if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            VLOG_WARN_RL(&rl,
> +            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
> +                dev->up.name, mbuf->pkt_len);

Whitespace change damage? if so, please drop this part.


>              return false;
>          }
>
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
> +            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
> +            mbuf->l4_len - mbuf->outer_l3_len;
> +        } else {
> +            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        }
> +
>          mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>
>          if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
>              mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
>          }
>      }
> +
> +    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> +        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
> +        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;

This looks really strange, please explain why L4 offloads require ip
checksumming.


> +
>      return true;
>  }
>

I did not look at the rest of the implementation for now.



-- 
David Marchand

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to