On Fri, Jun 21, 2024 at 11:08 AM David Marchand
<[email protected]> wrote:
>
> Hi Mike,
>
> On Thu, Jun 13, 2024 at 4:54 AM Mike Pattrick <[email protected]> wrote:
> >
> > When sending packets that are flagged as requiring segmentation to an
> > interface that doesn't support this feature, send the packet to the TSO
> > software fallback instead of dropping it.
> >
> > Signed-off-by: Mike Pattrick <[email protected]>
>
> I started testing this patch (still not finished), I have some
> questions and comments.
>
> >
> > ---
> > v2:
> >  - Fixed udp tunnel length
> >  - Added test that UDP headers are correct
> >  - Split inner and outer ip_id into different counters
> >  - Set tunnel flags in reset_tcp_seg
> >
> > Signed-off-by: Mike Pattrick <[email protected]>
> > ---
> >  lib/dp-packet-gso.c     | 89 +++++++++++++++++++++++++++++++++--------
> >  lib/dp-packet.h         | 34 ++++++++++++++++
> >  lib/netdev-native-tnl.c |  8 ++++
> >  lib/netdev.c            | 37 +++++++----------
> >  tests/system-traffic.at | 87 ++++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 216 insertions(+), 39 deletions(-)
> >
> > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> > index 847685ad9..dc43ad662 100644
> > --- a/lib/dp-packet-gso.c
> > +++ b/lib/dp-packet-gso.c
> > @@ -47,6 +47,8 @@ dp_packet_gso_seg_new(const struct dp_packet *p, size_t 
> > hdr_len,
> >      seg->l2_5_ofs = p->l2_5_ofs;
> >      seg->l3_ofs = p->l3_ofs;
> >      seg->l4_ofs = p->l4_ofs;
> > +    seg->inner_l3_ofs = p->inner_l3_ofs;
> > +    seg->inner_l4_ofs = p->inner_l4_ofs;
> >
> >      /* The protocol headers remain the same, so preserve hash and mark. */
> >      *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
> > @@ -71,7 +73,12 @@ dp_packet_gso_nr_segs(struct dp_packet *p)
> >      const char *data_tail;
> >      const char *data_pos;
> >
> > -    data_pos = dp_packet_get_tcp_payload(p);
> > +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> > +        dp_packet_hwol_is_tunnel_geneve(p)) {
> > +        data_pos = dp_packet_get_inner_tcp_payload(p);
> > +    } else {
> > +        data_pos = dp_packet_get_tcp_payload(p);
> > +    }
> >      data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
> >
> >      return DIV_ROUND_UP(data_tail - data_pos, segsz);
> > @@ -89,14 +96,19 @@ dp_packet_gso(struct dp_packet *p, struct 
> > dp_packet_batch **batches)
> >      static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> >      struct dp_packet_batch *curr_batch = *batches;
> >      struct tcp_header *tcp_hdr;
> > +    struct udp_header *tnl_hdr;
> >      struct ip_header *ip_hdr;
> > +    uint16_t inner_ip_id = 0;
> > +    uint16_t outer_ip_id = 0;
> >      struct dp_packet *seg;
> > +    const char *data_pos;
> >      uint16_t tcp_offset;
> >      uint16_t tso_segsz;
> >      uint32_t tcp_seq;
> > -    uint16_t ip_id;
> > +    bool outer_ipv4;
> >      int hdr_len;
> >      int seg_len;
> > +    bool tnl;
> >
> >      tso_segsz = dp_packet_get_tso_segsz(p);
> >      if (!tso_segsz) {
> > @@ -105,20 +117,38 @@ dp_packet_gso(struct dp_packet *p, struct 
> > dp_packet_batch **batches)
> >          return false;
> >      }
> >
> > -    tcp_hdr = dp_packet_l4(p);
> > -    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > -    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> > -    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
> > -              + tcp_offset * 4;
> > -    ip_id = 0;
> > -    if (dp_packet_hwol_is_ipv4(p)) {
> > +    if (dp_packet_hwol_is_tunnel_vxlan(p) ||
> > +            dp_packet_hwol_is_tunnel_geneve(p)) {
> > +        data_pos =  dp_packet_get_inner_tcp_payload(p);
> > +        outer_ipv4 = dp_packet_hwol_is_outer_ipv4(p);
> > +        tcp_hdr = dp_packet_inner_l4(p);
> > +        ip_hdr = dp_packet_inner_l3(p);
> > +        tnl = true;
> > +
> > +        if (outer_ipv4) {
> > +            outer_ip_id = ntohs(((struct ip_header *) 
> > dp_packet_l3(p))->ip_id);
> > +        }
> > +        if (dp_packet_hwol_is_ipv4(p)) {
> > +            inner_ip_id = ntohs(ip_hdr->ip_id);
> > +        }
> > +    } else {
> > +        data_pos = dp_packet_get_tcp_payload(p);
> > +        outer_ipv4 = dp_packet_hwol_is_ipv4(p);
> > +        tcp_hdr = dp_packet_l4(p);
> >          ip_hdr = dp_packet_l3(p);
> > -        ip_id = ntohs(ip_hdr->ip_id);
> > +        tnl = false;
> > +
> > +        if (outer_ipv4) {
> > +            outer_ip_id = ntohs(ip_hdr->ip_id);
> > +        }
> >      }
> >
> > +    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > +    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> > +    hdr_len = ((char *) tcp_hdr - (char *) dp_packet_eth(p))
> > +              + tcp_offset * 4;
> >      const char *data_tail = (char *) dp_packet_tail(p)
> >                              - dp_packet_l2_pad_size(p);
> > -    const char *data_pos = dp_packet_get_tcp_payload(p);
> >      int n_segs = dp_packet_gso_nr_segs(p);
> >
> >      for (int i = 0; i < n_segs; i++) {
> > @@ -130,14 +160,35 @@ dp_packet_gso(struct dp_packet *p, struct 
> > dp_packet_batch **batches)
> >          seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
> >          data_pos += seg_len;
> >
> > +        if (tnl) {
> > +            /* Update tunnel inner L3 header. */
> > +            if (dp_packet_hwol_is_ipv4(seg)) {
> > +                ip_hdr = dp_packet_inner_l3(seg);
> > +                ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> > +                                           dp_packet_inner_l4_size(seg));
> > +                ip_hdr->ip_id = htons(inner_ip_id);
> > +                ip_hdr->ip_csum = 0;
> > +                inner_ip_id++;
> > +            } else {
> > +                struct ovs_16aligned_ip6_hdr *ip6_hdr;
> > +
> > +                ip6_hdr = dp_packet_inner_l3(seg);
> > +                ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
> > +                    = htons(dp_packet_inner_l3_size(seg) - sizeof 
> > *ip6_hdr);
> > +            }
> > +        }
> > +
> >          /* Update L3 header. */
> > -        if (dp_packet_hwol_is_ipv4(seg)) {
> > +        if (outer_ipv4) {
> > +            uint16_t ip_tot_len;
> >              ip_hdr = dp_packet_l3(seg);
> > -            ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> > -                                       dp_packet_l4_size(seg));
> > -            ip_hdr->ip_id = htons(ip_id);
> > +            tnl_hdr = dp_packet_l4(seg);
> > +            ip_tot_len = sizeof *ip_hdr + dp_packet_l4_size(seg);
> > +            ip_hdr->ip_tot_len = htons(ip_tot_len);
> > +            ip_hdr->ip_id = htons(outer_ip_id);
> >              ip_hdr->ip_csum = 0;
> > -            ip_id++;
> > +            tnl_hdr->udp_len = htons(ip_tot_len - IP_HEADER_LEN);
>
> IIUC, this part is common for non-tunneled and tunneled packets but it
> does not make sense to update tnl_hdr->udp_len for non-tunneled
> packets.
> Plus, udp_len is not adjusted in the ipv6 case so the announced length
> is that of the original TSO frame.
>
> udp_len update should probably move in the if (tnl) { previous block.
> Something like:
> @@ -176,18 +177,19 @@ dp_packet_gso(struct dp_packet *p, struct
> dp_packet_batch **batches)
>                  ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
>                      = htons(dp_packet_inner_l3_size(seg) - sizeof *ip6_hdr);
>              }
> +
> +            tnl_hdr = dp_packet_l4(seg);
> +            tnl_hdr->udp_len = htons(dp_packet_l4_size(seg));
>          }
>
>          /* Update L3 header. */
>
> > +            outer_ip_id++;
> >          } else {
> >              struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
> >
> > @@ -146,7 +197,11 @@ dp_packet_gso(struct dp_packet *p, struct 
> > dp_packet_batch **batches)
> >          }
> >
> >          /* Update L4 header. */
> > -        tcp_hdr = dp_packet_l4(seg);
> > +        if (tnl) {
> > +            tcp_hdr = dp_packet_inner_l4(seg);
> > +        } else {
> > +            tcp_hdr = dp_packet_l4(seg);
> > +        }
> >          put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> >          tcp_seq += seg_len;
> >          if (OVS_LIKELY(i < (n_segs - 1))) {
> > diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> > index a75b1c5cd..af1ff088f 100644
> > --- a/lib/dp-packet.h
> > +++ b/lib/dp-packet.h
> > @@ -529,6 +529,16 @@ dp_packet_inner_l3(const struct dp_packet *b)
> >             : NULL;
> >  }
> >
> > +static inline size_t
> > +dp_packet_inner_l3_size(const struct dp_packet *b)
> > +{
> > +    return OVS_LIKELY(b->inner_l3_ofs != UINT16_MAX)
> > +           ? (const char *) dp_packet_tail(b)
> > +           - (const char *) dp_packet_inner_l3(b)
> > +           - dp_packet_l2_pad_size(b)
> > +           : 0;
> > +}
> > +
> >  static inline void *
> >  dp_packet_inner_l4(const struct dp_packet *b)
> >  {
> > @@ -563,6 +573,22 @@ dp_packet_get_tcp_payload(const struct dp_packet *b)
> >      return NULL;
> >  }
> >
> > +static inline const void *
> > +dp_packet_get_inner_tcp_payload(const struct dp_packet *b)
> > +{
> > +    size_t l4_size = dp_packet_inner_l4_size(b);
> > +
> > +    if (OVS_LIKELY(l4_size >= TCP_HEADER_LEN)) {
> > +        struct tcp_header *tcp = dp_packet_inner_l4(b);
> > +        int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
> > +
> > +        if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= l4_size)) {
> > +            return (const char *) tcp + tcp_len;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> >  static inline uint32_t
> >  dp_packet_get_tcp_payload_length(const struct dp_packet *pkt)
> >  {
> > @@ -1320,6 +1346,14 @@ dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
> >          ol_flags |= DP_PACKET_OL_TX_IP_CKSUM;
> >      }
> >
> > +    if (ol_flags & (DP_PACKET_OL_TX_TUNNEL_VXLAN |
> > +                    DP_PACKET_OL_TX_TUNNEL_GENEVE)) {
> > +        if (ol_flags & DP_PACKET_OL_TX_OUTER_IPV4) {
> > +            ol_flags |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
> > +        }
> > +        ol_flags |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
> > +    }
> > +
> >      *dp_packet_ol_flags_ptr(p) = ol_flags;
> >  }
> >
> > diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> > index 0f9f07f44..22cc858e7 100644
> > --- a/lib/netdev-native-tnl.c
> > +++ b/lib/netdev-native-tnl.c
> > @@ -308,6 +308,14 @@ netdev_tnl_push_udp_header(const struct netdev *netdev 
> > OVS_UNUSED,
> >      if (l4_ofs != UINT16_MAX) {
> >          packet->inner_l4_ofs = l4_ofs + data->header_len;
> >      }
> > +
> > +    if (dp_packet_hwol_is_tso(packet)) {
> > +        uint16_t tso_segsz = dp_packet_get_tso_segsz(packet);
> > +        if (tso_segsz > data->header_len) {
> > +            tso_segsz -= data->header_len;
> > +            dp_packet_set_tso_segsz(packet, tso_segsz);
> > +        }
> > +    }
> >  }
> >
> >  static void *
>
> Is this hunk a fix valid even for hw-assisted tso?
> If so, I would make it a separate patch.

I just realized that I had forgotten to extract this into a separate
patch. This code is actually over a year old but now that I look at it
with fresh eyes I think it might be more appropriate to adjust this in
push_ip_header, as the same concern with pushed headers potentially
exceeding the intended MTU post segmentation also exists with non-UDP
tunnels.

I'll send in a new version of this patch, but it shouldn't affect any
performance tests in progress.

Cheers,
M

>
>
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index f2d921ed6..1d59bbe5d 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -69,8 +69,6 @@ COVERAGE_DEFINE(netdev_received);
> >  COVERAGE_DEFINE(netdev_sent);
> >  COVERAGE_DEFINE(netdev_add_router);
> >  COVERAGE_DEFINE(netdev_get_stats);
> > -COVERAGE_DEFINE(netdev_vxlan_tso_drops);
> > -COVERAGE_DEFINE(netdev_geneve_tso_drops);
> >  COVERAGE_DEFINE(netdev_push_header_drops);
> >  COVERAGE_DEFINE(netdev_soft_seg_good);
> >  COVERAGE_DEFINE(netdev_soft_seg_drops);
> > @@ -910,28 +908,23 @@ netdev_send(struct netdev *netdev, int qid, struct 
> > dp_packet_batch *batch,
> >      struct dp_packet *packet;
> >      int error;
> >
> > -    if (userspace_tso_enabled() &&
> > -        !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > -        DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > -            if (dp_packet_hwol_is_tso(packet)) {
> > -                if (dp_packet_hwol_is_tunnel_vxlan(packet)
> > -                    && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) {
> > -                        VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support",
> > -                                     netdev_get_name(netdev));
> > -                        COVERAGE_INC(netdev_vxlan_tso_drops);
> > -                        dp_packet_delete_batch(batch, true);
> > -                        return false;
> > +    if (userspace_tso_enabled()) {
> > +        if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +                if (dp_packet_hwol_is_tso(packet)) {
> > +                    return netdev_send_tso(netdev, qid, batch, 
> > concurrent_txq);
> >                  }
> > -
> > -                if (dp_packet_hwol_is_tunnel_geneve(packet)
> > -                    && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) {
> > -                        VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support",
> > -                                     netdev_get_name(netdev));
> > -                        COVERAGE_INC(netdev_geneve_tso_drops);
> > -                        dp_packet_delete_batch(batch, true);
> > -                        return false;
> > +            }
> > +        } else if (!(netdev_flags & (NETDEV_TX_VXLAN_TNL_TSO |
> > +                                     NETDEV_TX_GENEVE_TNL_TSO))) {
>
> Only checking those flags does not seem to work for me.
> But this seems due to a driver bug.
>
> Example on my system, with CX-5:
>
> # ovs-vsctl get interface dpdk0 status | tr ',' '\n'
> {bus_info="bus_name=pci
>  vendor_id=15b3
>  device_id=1019"
>  driver_name=mlx5_pci
>  if_descr="DPDK 23.11.0 mlx5_pci"
>  if_type="6"
>  link_speed="100Gbps"
>  max_hash_mac_addrs="0"
>  max_mac_addrs="128"
>  max_rx_pktlen="1518"
>  max_rx_queues="1024"
>  max_tx_queues="1024"
>  max_vfs="0"
>  max_vmdq_pools="0"
>  min_rx_bufsize="32"
>  n_rxq="1"
>  n_txq="5"
>  numa_id="1"
>  port_no="2"
>  rx-steering=rss
>  rx_csum_offload="true"
>  tx_geneve_tso_offload="true"
>  ^^^
>
>  tx_ip_csum_offload="true"
>  tx_out_ip_csum_offload="true"
>  tx_out_udp_csum_offload="false"
>  ^^^
>
>  tx_sctp_csum_offload="false"
>  tx_tcp_csum_offload="true"
>  tx_tcp_seg_offload="true"
>  tx_udp_csum_offload="true"
>  tx_vxlan_tso_offload="true"}
>  ^^^
>
> net/mlx5 is claiming (geneve|vxlan)_tso_offload while not supporting
> outer udp checksum.
>
> I added the hunk below for now:
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 3cdcf8f8d..077768832 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -1413,14 +1413,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>                        netdev_get_name(&dev->up));
>          }
>
> -        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
> +            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
>              dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
>          } else {
>              VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
>
> -        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM &&
> +            info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
>              dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
>          } else {
>              VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
>
>
> > +            DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +                if (!dp_packet_hwol_is_tso(packet)) {
> > +                    continue;
> > +                }
> > +                if (dp_packet_hwol_is_tunnel_vxlan(packet) ||
> > +                    dp_packet_hwol_is_tunnel_geneve(packet)) {
> > +                    return netdev_send_tso(netdev, qid, batch, 
> > concurrent_txq);
> >                  }
> > -                return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> >              }
> >          }
> >      }
>
>
> --
> David Marchand
>

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to