On Tue, Jul 4, 2023 at 9:00 PM Ilya Maximets <[email protected]> wrote:
>
> On 6/21/23 22:36, Mike Pattrick wrote:
> > From: Flavio Leitner <[email protected]>
> >
> > This provides a software segmentation implementation for the case
> > where the egress netdev doesn't support segmentation in hardware.
> >
> > The challenge here is to guarantee the packet ordering of the
> > original batch, which may be full of TSO packets. Each TSO packet
> > can go up to ~64kB, so with a segment size of 1440 that means
> > about 44 packets for each TSO packet. Each batch holds 32 packets,
> > so a full batch amounts to 1408 normal packets.
> >
> > The segmentation code first estimates the total number of packets
> > and then the total number of batches, allocates enough memory,
> > and finally does the work.
> >
> > Finally, the batches are sent to the netdev in order.
> >
> > Signed-off-by: Flavio Leitner <[email protected]>
> > Co-authored-by: Mike Pattrick <[email protected]>
> > Signed-off-by: Mike Pattrick <[email protected]>
> > ---
> > lib/automake.mk | 2 +
> > lib/dp-packet-gso.c | 172 ++++++++++++++++++++++++++++++++++++++++++++
> > lib/dp-packet-gso.h | 24 +++++++
> > lib/dp-packet.h | 11 +++
> > lib/netdev-dpdk.c | 46 ++++++++----
> > lib/netdev-linux.c | 58 ---------------
> > lib/netdev.c | 120 ++++++++++++++++++-------------
> > lib/packets.c | 4 +-
> > 8 files changed, 314 insertions(+), 123 deletions(-)
> > create mode 100644 lib/dp-packet-gso.c
> > create mode 100644 lib/dp-packet-gso.h
> >
> > diff --git a/lib/automake.mk b/lib/automake.mk
> > index e64ee76ce..49a92958d 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
> > lib/dpctl.h \
> > lib/dp-packet.h \
> > lib/dp-packet.c \
> > + lib/dp-packet-gso.c \
> > + lib/dp-packet-gso.h \
> > lib/dpdk.h \
> > lib/dpif-netdev-extract-study.c \
> > lib/dpif-netdev-lookup.h \
> > diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
> > new file mode 100644
> > index 000000000..bc72e2f90
> > --- /dev/null
> > +++ b/lib/dp-packet-gso.c
> > @@ -0,0 +1,172 @@
> > +/*
> > + * Copyright (c) 2021 Red Hat, Inc.
>
> Need to adjust the year. :)
>
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#include <config.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +
> > +#include "coverage.h"
> > +#include "dp-packet.h"
> > +#include "dp-packet-gso.h"
> > +#include "netdev-provider.h"
> > +#include "openvswitch/vlog.h"
> > +
> > +VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
> > +
> > +COVERAGE_DEFINE(soft_seg_good);
> > +
> > > +/* Returns a new packet that is a segment of packet 'p'.
> > + *
> > + * The new packet is initialized with 'hdr_len' bytes from the
> > + * start of packet 'p' and then appended with 'data_len' bytes
> > + * from the 'data' buffer.
> > + *
> > + * Note: The packet headers are not updated. */
> > +static struct dp_packet *
> > +dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
> > + const char *data, size_t data_len)
> > +{
> > > + struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
> > > +                                                      dp_packet_headroom(p));
> > +
> > + /* Append the original packet headers and then the payload. */
> > + dp_packet_put(seg, dp_packet_data(p), hdr_len);
> > + dp_packet_put(seg, data, data_len);
> > +
> > + /* The new segment should have the same offsets. */
> > + seg->l2_5_ofs = p->l2_5_ofs;
> > + seg->l3_ofs = p->l3_ofs;
> > + seg->l4_ofs = p->l4_ofs;
> > +
> > + /* The protocol headers remain the same, so preserve hash and mark. */
> > + *dp_packet_rss_ptr(seg) = dp_packet_get_rss_hash(p);
> > + *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p);
> > +
> > + /* The segment should inherit all the offloading flags from the
> > + * original packet, except for the TCP segmentation, external
> > + * buffer and indirect buffer flags. */
> > + *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p)
> > + & ~(DP_PACKET_OL_TX_TCP_SEG | DP_PACKET_OL_EXTERNAL
> > + | DP_PACKET_OL_INDIRECT);
>
> We should inherit DP_PACKET_OL_SUPPORTED_MASK & ~DP_PACKET_OL_TX_TCP_SEG.
> So,
>
> *dp_packet_ol_flags_ptr(seg) &= DP_PACKET_OL_SUPPORTED_MASK
> & ~DP_PACKET_OL_TX_TCP_SEG;
>
> Is that right?
>
> > +
> > + dp_packet_hwol_reset_tcp_seg(seg);
>
> Also, this function just resets the DP_PACKET_OL_TX_TCP_SEG.
> So, above should be just:
>
> *dp_packet_ol_flags_ptr(seg) &= DP_PACKET_OL_SUPPORTED_MASK;
>
> The same as in the clone function. Either way one of these operations
> is redundant.
>
> > +
> > + return seg;
> > +}
> > +
> > +/* Returns the calculated number of TCP segments in packet 'p'. */
> > +int
> > +dp_packet_gso_nr_segs(struct dp_packet *p)
> > +{
> > + uint16_t segsz = dp_packet_get_tso_segsz(p);
> > + const char *data_tail;
> > + const char *data_pos;
> > + int n_segs;
> > +
> > + data_pos = dp_packet_get_tcp_payload(p);
> > + data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
> > + data_pos = dp_packet_get_tcp_payload(p);
>
> Assigning twice?
>
> > + n_segs = DIV_ROUND_UP((data_tail - data_pos), segsz);
>
> Do we need this variable? Maybe just return here?
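> I.e. maybe just:
>
>     return DIV_ROUND_UP((data_tail - data_pos), segsz);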
>
> > +
> > + return n_segs;
> > +
>
> Extra line.
>
> > +}
> > +
> > +/* Perform software segmentation on packet 'p'.
> > + *
> > > + * The resulting segments are added to the array of preallocated
> > > + * batches in 'batches', starting at batch position 'batch_pos'. */
> > +void
> > +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch *batches,
> > + size_t *batch_pos)
> > +{
> > + struct tcp_header *tcp_hdr;
> > + struct ip_header *ip_hdr;
> > + struct dp_packet *seg;
> > + uint16_t tcp_offset;
> > + uint16_t tso_segsz;
> > + uint32_t tcp_seq;
> > + uint16_t ip_id;
> > + int hdr_len;
> > +
> > + tso_segsz = dp_packet_get_tso_segsz(p);
> > + if (!tso_segsz) {
> > + VLOG_WARN("GSO packet with len %d with no segment size.",
> > + dp_packet_size(p));
>
> This should likely be rate limited. Also this packet is going to
> be leaked. And maybe we should count that drop somehow? A coverage
> counter?
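>
> Something along these lines maybe (just a sketch; the counter name is
> only a suggestion, and freeing the packet here assumes the caller does
> not keep a reference to it):
>
>     COVERAGE_DEFINE(soft_seg_drops);
>     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
>
>     /* ...and in dp_packet_gso(): */
>     if (!tso_segsz) {
>         COVERAGE_INC(soft_seg_drops);
>         VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.",
>                      dp_packet_size(p));
>         dp_packet_delete(p);
>         return;
>     }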
>
> > + return;
> > + }
> > +
> > + tcp_hdr = dp_packet_l4(p);
> > + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
> > + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
> > + hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
> > + + tcp_offset * 4;
> > + ip_id = 0;
> > + if (dp_packet_hwol_is_ipv4(p)) {
> > + ip_hdr = dp_packet_l3(p);
> > + ip_id = ntohs(ip_hdr->ip_id);
> > + }
> > +
> > + const char *data_tail = (char *) dp_packet_tail(p)
> > + - dp_packet_l2_pad_size(p);
> > + const char *data_pos = dp_packet_get_tcp_payload(p);
> > + int n_segs = dp_packet_gso_nr_segs(p);
> > + int seg_len;
>
> An empty line would be nice.
>
> > + for (int i = 0; i < n_segs; i++) {
> > + seg_len = data_tail - data_pos;
> > + if (seg_len > tso_segsz) {
> > + seg_len = tso_segsz;
> > + }
> > +
> > + seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
> > + data_pos += seg_len;
> > +
> > + /* Update L3 header. */
> > + if (dp_packet_hwol_is_ipv4(seg)) {
> > + ip_hdr = dp_packet_l3(seg);
> > + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
> > + dp_packet_l4_size(seg));
> > + ip_hdr->ip_id = htons(ip_id);
> > + ip_hdr->ip_csum = 0;
> > + ip_id++;
>
> If we assume that IDs are sequential in the original stream,
> here we will produce packets with overlapping IDs. I suppose
> that's not a big problem?
If the source stream came from Linux/Windows, the IP ID will be random.
This sequential behaviour is similar to the GSO implementations in DPDK
and Linux.
I'll send in an updated version with the other concerns addressed.
Cheers,
Mike
>
> > + } else {
> > + struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
> > +
> > + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(sizeof *ip_hdr
> > + + dp_packet_l4_size(seg));
>
> Maybe break the line before or after '=' and start the next one with smaller
> indentation.
>
> > +
>
> Extra line.
>
> > + }
> > +
> > + /* Update L4 header. */
> > + tcp_hdr = dp_packet_l4(seg);
> > + put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
> > + tcp_seq += seg_len;
> > + if (OVS_LIKELY(i < (n_segs - 1))) {
> > + /* Reset flags PUSH and FIN unless it is the last segment. */
> > + uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
> > + & ~(TCP_PSH | TCP_FIN);
> > + tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
> > + }
> > +
> > + if (dp_packet_batch_is_full(&batches[ *batch_pos])) {
> > + *batch_pos += 1;
> > + }
> > +
> > + dp_packet_batch_add(&batches[ *batch_pos], seg);
>
> The extra space looks strange. We should remove it.
>
> An alternative approach to counting batches would be to just increment
> a batch pointer, i.e. have a **curr_batch pointer and do (*curr_batch)++
> whenever it gets full:
>
> if (dp_packet_batch_is_full(*curr_batch)) {
> (*curr_batch)++;
> }
> dp_packet_batch_add(*curr_batch, seg);
>
> What do you think?
>
> > + }
> > +
> > + COVERAGE_INC(soft_seg_good);
> > +}
> > diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
> > new file mode 100644
> > index 000000000..81cb52742
> > --- /dev/null
> > +++ b/lib/dp-packet-gso.h
> > @@ -0,0 +1,24 @@
> > +/*
> > + * Copyright (c) 2021 Red Hat, Inc.
>
> Year.
>
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef DP_PACKET_GSO_H
> > +#define DP_PACKET_GSO_H 1
> > +
> > +void dp_packet_gso(struct dp_packet *p, struct dp_packet_batch *batches,
> > + size_t *batch_pos);
> > +int dp_packet_gso_nr_segs(struct dp_packet *p);
>
> No need for struct names in prototypes.
>
> > +
> > +#endif /* dp-packet-gso.h */
> > diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> > index 6a924f3ff..dfa9a0d6b 100644
> > --- a/lib/dp-packet.h
> > +++ b/lib/dp-packet.h
> > @@ -86,6 +86,10 @@ enum dp_packet_offload_mask {
> > DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM,
> > 0x800),
> > /* Offload IP checksum. */
> > DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
> > + /* External Buffer attached. */
> > + DEF_OL_FLAG(DP_PACKET_OL_EXTERNAL, RTE_MBUF_F_EXTERNAL, 0x4000),
> > + /* Indirect Buffer attached. */
> > + DEF_OL_FLAG(DP_PACKET_OL_INDIRECT, RTE_MBUF_F_INDIRECT, 0x8000),
>
> These are DPDK-specific and shouldn't be here. DP_PACKET_OL_SUPPORTED_MASK
> can be used for clearing unknown bits.
>
> > /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
> > };
> >
> > @@ -1131,6 +1135,13 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
> > *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
> > }
> >
> > +/* Resets TCP Segmentation flag in packet 'p'. */
> > +static inline void
> > +dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
> > +{
> > + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG;
> > +}
> > +
> > /* Returns 'true' if the IP header has good integrity and the
> > * checksum in it is complete. */
> > static inline bool
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index 1c100c48e..82ce74c45 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -2288,6 +2288,7 @@ static bool
> > > netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> > {
> > struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
> > + struct tcp_header *th;
> >
> > if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK
> > | RTE_MBUF_F_TX_TCP_SEG))) {
> > @@ -2299,27 +2300,38 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk
> > *dev, struct rte_mbuf *mbuf)
> > mbuf->l4_len = 0;
> > mbuf->outer_l2_len = 0;
> > mbuf->outer_l3_len = 0;
> > + th = dp_packet_l4(pkt);
> >
> > if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
> > - struct tcp_header *th = dp_packet_l4(pkt);
> > - int hdr_len;
> > -
> > if (!th) {
> > VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> > + " pkt len: %"PRIu32"", dev->up.name,
> > mbuf->pkt_len);
> > + return false;
> > + }
> > +
> > + mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
>
> How can it happen that a packet marked for TSO is not marked for CKSUM?
I've seen "((ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) || (ol_flags &
RTE_MBUF_F_TX_TCP_SEG))" logic in DPDK source code, but the
documentation suggests that one implies the other, and looking deeper
that seems to be the case.
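The check I remember from DPDK looks roughly like this (paraphrased from
memory, not an exact quote):

    if ((ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) ||
        (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
        /* Prepare the TCP/pseudo-header checksum. */
    }

but since RTE_MBUF_F_TX_TCP_SEG implies the checksum request, setting
RTE_MBUF_F_TX_TCP_CKSUM explicitly here and keeping a single checksum
branch below seemed simpler.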
>
> > + }
> > +
> > + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) {
> > + if (!th) {
> > + VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header"
> > " pkt len: %"PRIu32"", dev->up.name,
> > mbuf->pkt_len);
> > return false;
> > }
> >
> > mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> > - mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> > - hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> > mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> > > - if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
> > - VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. "
> > - "hdr: %"PRIu32", gso: %"PRIu32", max len:
> > %"PRIu32"",
> > - dev->up.name, hdr_len, mbuf->tso_segsz,
> > - dev->max_packet_len);
> > - return false;
> > +
> > + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
> > + int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> > + if (OVS_UNLIKELY((hdr_len +
> > + mbuf->tso_segsz) > dev->max_packet_len)) {
> > + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr:
> > %"PRIu32", "
> > + "gso: %"PRIu32", max len: %"PRIu32"",
> > + dev->up.name, hdr_len, mbuf->tso_segsz,
> > + dev->max_packet_len);
> > + return false;
> > + }
> > }
> >
> > if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
> > @@ -2707,6 +2719,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp,
> > struct dp_packet *pkt_orig)
> > mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> > mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> > ~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT));
> > + mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz;
> >
> > memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
> > > sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
> > @@ -2765,11 +2778,20 @@ netdev_dpdk_common_send(struct netdev *netdev,
> > struct dp_packet_batch *batch,
> > struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
> > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> > size_t cnt, pkt_cnt = dp_packet_batch_size(batch);
> > + struct dp_packet *packet;
> > + bool need_copy = false;
> >
> > memset(stats, 0, sizeof *stats);
> >
> > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > + if (packet->source != DPBUF_DPDK) {
> > + need_copy = true;
> > + break;
> > + }
> > + }
> > +
> > /* Copy dp-packets to mbufs. */
> > - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
> > + if (OVS_UNLIKELY(need_copy)) {
> > cnt = dpdk_copy_batch_to_mbuf(netdev, batch);
> > stats->tx_failure_drops += pkt_cnt - cnt;
> > pkt_cnt = cnt;
> > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> > index 52ce4947d..7a92ee534 100644
> > --- a/lib/netdev-linux.c
> > +++ b/lib/netdev-linux.c
> > @@ -6849,55 +6849,6 @@ af_packet_sock(void)
> > return sock;
> > }
> >
> > -static int
> > -netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
> > -{
> > - struct eth_header *eth_hdr;
> > - ovs_be16 eth_type;
> > - int l2_len;
> > -
> > - eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
> > - if (!eth_hdr) {
> > - return -EINVAL;
> > - }
> > -
> > - l2_len = ETH_HEADER_LEN;
> > - eth_type = eth_hdr->eth_type;
> > - if (eth_type_vlan(eth_type)) {
> > > - struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
> > -
> > - if (!vlan) {
> > - return -EINVAL;
> > - }
> > -
> > - eth_type = vlan->vlan_next_type;
> > - l2_len += VLAN_HEADER_LEN;
> > - }
> > -
> > - if (eth_type == htons(ETH_TYPE_IP)) {
> > - struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
> > -
> > - if (!ip_hdr) {
> > - return -EINVAL;
> > - }
> > -
> > - *l4proto = ip_hdr->ip_proto;
> > - dp_packet_hwol_set_tx_ipv4(b);
> > - } else if (eth_type == htons(ETH_TYPE_IPV6)) {
> > - struct ovs_16aligned_ip6_hdr *nh6;
> > -
> > - nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
> > - if (!nh6) {
> > - return -EINVAL;
> > - }
> > -
> > - *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
> > - dp_packet_hwol_set_tx_ipv6(b);
> > - }
> > -
> > - return 0;
> > -}
> > -
> > /* Initializes packet 'b' with features enabled in the prepended
> > * struct virtio_net_hdr. Returns 0 if successful, otherwise a
> > * positive errno value. */
> > @@ -6915,15 +6866,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
> > }
> >
> > if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> > - uint16_t l4proto = 0;
> > -
> > - if (netdev_linux_parse_l2(b, &l4proto)) {
> > - return EINVAL;
> > - }
> > -
> > - if (l4proto == IPPROTO_UDP) {
> > - dp_packet_hwol_set_csum_udp(b);
> > - }
>
> Why is all of that no longer needed?
>
> > /* The packet has offloaded checksum. However, there is no
> > * additional information like the protocol used, so it would
> > * require to parse the packet here. The checksum starting point
> > diff --git a/lib/netdev.c b/lib/netdev.c
> > index 8df7f8737..8eda6873d 100644
> > --- a/lib/netdev.c
> > +++ b/lib/netdev.c
> > @@ -35,6 +35,7 @@
> > #include "coverage.h"
> > #include "dpif.h"
> > #include "dp-packet.h"
> > +#include "dp-packet-gso.h"
> > #include "openvswitch/dynamic-string.h"
> > #include "fatal-signal.h"
> > #include "hash.h"
> > @@ -56,6 +57,7 @@
> > #include "svec.h"
> > #include "openvswitch/vlog.h"
> > #include "flow.h"
> > +#include "userspace-tso.h"
> > #include "util.h"
> > #ifdef __linux__
> > #include "tc.h"
> > @@ -67,7 +69,6 @@ COVERAGE_DEFINE(netdev_received);
> > COVERAGE_DEFINE(netdev_sent);
> > COVERAGE_DEFINE(netdev_add_router);
> > COVERAGE_DEFINE(netdev_get_stats);
> > -COVERAGE_DEFINE(netdev_send_prepare_drops);
> > COVERAGE_DEFINE(netdev_push_header_drops);
> >
> > struct netdev_saved_flags {
> > @@ -792,60 +793,67 @@ netdev_get_pt_mode(const struct netdev *netdev)
> > : NETDEV_PT_LEGACY_L2);
> > }
> >
> > -/* Check if a 'packet' is compatible with 'netdev_flags'.
> > - * If a packet is incompatible, return 'false' with the 'errormsg'
> > - * pointing to a reason. */
> > -static bool
> > -netdev_send_prepare_packet(const uint64_t netdev_flags,
> > - struct dp_packet *packet, char **errormsg)
> > -{
> > - if (dp_packet_hwol_is_tso(packet)
> > - && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > - /* Fall back to GSO in software. */
> > - VLOG_ERR_BUF(errormsg, "No TSO support");
> > - return false;
> > - }
> > -
> > > - /* Packet with IP csum offloading enabled was received with verified csum.
> > > - * Leave the IP csum offloading enabled even with good checksum to the
> > > - * netdev to decide what would be the best to do.
> > > - * Provide a software fallback in case the device doesn't support IP csum
> > > - * offloading. Note: Encapsulated packet must have the inner IP header
> > > - * csum already calculated.
> > > - * Packet with L4 csum offloading enabled was received with verified csum.
> > > - * Leave the L4 csum offloading enabled even with good checksum for the
> > > - * netdev to decide what would be the best to do.
> > > - * Netdev that requires pseudo header csum needs to calculate that.
> > > - * Provide a software fallback in case the netdev doesn't support L4 csum
> > > - * offloading. Note: Encapsulated packet must have the inner L4 header
> > > - * csum already calculated. */
> > - dp_packet_ol_send_prepare(packet, netdev_flags);
> > -
> > - return true;
> > -}
> > -
> > -/* Check if each packet in 'batch' is compatible with 'netdev' features,
> > - * otherwise either fall back to software implementation or drop it. */
> > -static void
> > -netdev_send_prepare_batch(const struct netdev *netdev,
> > - struct dp_packet_batch *batch)
> > +static int
> > +netdev_send_tso(struct netdev *netdev, int qid,
> > + struct dp_packet_batch *batch, bool concurrent_txq)
>
> A comment for this function would be nice.
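> Maybe something like:
>
>     /* Performs software segmentation of any TSO-marked packets in 'batch'
>      * and sends the resulting batches of segments, in order, to 'netdev'. */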
>
> > {
> > + struct dp_packet_batch *batches;
> > struct dp_packet *packet;
> > - size_t i, size = dp_packet_batch_size(batch);
> > + int n_packets;
> > + int n_batches;
> > + int error;
> >
> > - DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> > - char *errormsg = NULL;
> > + /* Calculate the total number of packets in the batch after
> > + * the segmentation. */
> > + n_packets = 0;
> > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > + if (dp_packet_hwol_is_tso(packet)) {
> > + n_packets += dp_packet_gso_nr_segs(packet);
> > + } else {
> > + n_packets++;
> > + }
> > + }
> >
> > > - if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) {
> > - dp_packet_batch_refill(batch, packet, i);
> > + if (!n_packets) {
> > + return 0;
> > + }
> > +
> > + /* Allocate enough batches to store all the packets in order. */
> > + n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST);
> > + batches = xmalloc(n_batches * sizeof(struct dp_packet_batch));
>
> Use 'sizeof *batches' or 'sizeof batches[0]' instead.
>
> > + size_t batch_pos = 0;
> > + for (batch_pos = 0; batch_pos < n_batches; batch_pos++) {
> > + dp_packet_batch_init(&batches[batch_pos]);
> > + }
> > + /* Do the packet segmentation if TSO is flagged. */
> > + size_t size = dp_packet_batch_size(batch);
> > + size_t k;
> > + batch_pos = 0;
> > + DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
> > + if (dp_packet_hwol_is_tso(packet)) {
> > + dp_packet_gso(packet, batches, &batch_pos);
> > } else {
> > - dp_packet_delete(packet);
> > - COVERAGE_INC(netdev_send_prepare_drops);
> > - VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
> > - netdev_get_name(netdev), errormsg);
> > - free(errormsg);
> > + if (dp_packet_batch_is_full(&batches[batch_pos])) {
> > + batch_pos++;
> > + }
> > +
> > + dp_packet_batch_add(&batches[batch_pos], packet);
> > + }
> > + }
> > +
> > + for (batch_pos = 0; batch_pos < n_batches; batch_pos++) {
> > + DP_PACKET_BATCH_FOR_EACH (i, packet, (&batches[batch_pos])) {
>
> You shouldn't need to parenthesize the macro argument. We fixed that.
> Fix the macro instead if it's still a problem.
>
> > + dp_packet_ol_send_prepare(packet, netdev->ol_flags);
> > + }
> > +
> > > + error = netdev->netdev_class->send(netdev, qid, &batches[batch_pos],
> > + concurrent_txq);
> > + if (!error) {
> > + COVERAGE_INC(netdev_sent);
> > }
> > }
> > + free(batches);
> > + return 0;
>
> This function never returns an error, even if all the send() calls fail.
> It should return an error if even one send() fails.
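>
> E.g. remember the failure and return it at the end, something like
> (sketch only):
>
>     int rc = 0;
>
>     for (batch_pos = 0; batch_pos < n_batches; batch_pos++) {
>         ...
>         error = netdev->netdev_class->send(netdev, qid, &batches[batch_pos],
>                                            concurrent_txq);
>         if (!error) {
>             COVERAGE_INC(netdev_sent);
>         } else {
>             rc = error;
>         }
>     }
>     free(batches);
>     return rc;
>
> That way the remaining batches still get sent even if one send() fails,
> but the failure is reported to the caller.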
>
> > }
> >
> > /* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet),
> > @@ -877,11 +885,21 @@ int
> > netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
> > bool concurrent_txq)
> > {
> > + const uint64_t netdev_flags = netdev->ol_flags;
> > + struct dp_packet *packet;
> > int error;
> >
> > - netdev_send_prepare_batch(netdev, batch);
> > - if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
> > - return 0;
> > + if (userspace_tso_enabled() &&
> > + !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > + if (dp_packet_hwol_is_tso(packet)) {
> > + return netdev_send_tso(netdev, qid, batch, concurrent_txq);
> > + }
> > + }
> > + }
> > +
> > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > + dp_packet_ol_send_prepare(packet, netdev_flags);
> > }
> >
> > error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
> > diff --git a/lib/packets.c b/lib/packets.c
> > index 462b51f92..dab823ba2 100644
> > --- a/lib/packets.c
> > +++ b/lib/packets.c
> > @@ -427,7 +427,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype,
> > ovs_be32 lse,
> > }
> >
> > if (!l3_encap) {
> > - struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN);
> > + struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN);
> >
> > put_16aligned_be32(&header->mpls_lse, lse);
> > packet->l2_5_ofs = 0;
> > @@ -513,7 +513,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr
> > *nsh_hdr_src)
> > OVS_NOT_REACHED();
> > }
> >
> > - nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length);
> > + nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length);
> > memcpy(nsh, nsh_hdr_src, length);
> > nsh->next_proto = next_proto;
> > packet->packet_type = htonl(PT_NSH);
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev