On 2/25/26 12:03 PM, David Marchand via dev wrote:
> By default, DPDK based dp-packets points to data buffers that can't be
> expanded dynamically.
> Their layout is as follows:
> - a minimum 128 bytes headroom chosen at DPDK build time
>   (RTE_PKTMBUF_HEADROOM),
> - a maximum size chosen at mempool creation,
> 
> In some usecases though (like encapsulating with multiple tunnels),
> a 128 bytes headroom is too short.
> 
> Keep on using mono segment packets but dynamically allocate buffers
> in DPDK memory and make use of DPDK external buffers API
> (previously used for userspace TSO).
> 
> Signed-off-by: David Marchand <[email protected]>
> ---
> Changes since v3:
> - split buffer length calculation in a helper,
> - handled running test without qdisc (net/tap does not require
>   those qdiscs, but spews ERR level logs if absent),
> - added check on firewall,
> 
> Changes since v2:
> - moved check on uint16_t overflow in netdev_dpdk_extbuf_allocate(),
> 
> Changes since v1:
> - fixed new segment length (reset by extbuf attach helper),
> - added a system-dpdk unit test,
> 
> ---
>  lib/dp-packet.c      | 17 +++++++++++-
>  lib/netdev-dpdk.c    | 47 ++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.h    |  4 +++
>  tests/system-dpdk.at | 65 ++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 127 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index c04d608be6..4c45636039 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -255,8 +255,23 @@ dp_packet_resize(struct dp_packet *b, size_t 
> new_headroom, size_t new_tailroom)
>      new_allocated = new_headroom + dp_packet_size(b) + new_tailroom;
>  
>      switch (b->source) {
> -    case DPBUF_DPDK:
> +    case DPBUF_DPDK: {
> +#ifdef DPDK_NETDEV
> +        uint32_t buf_len;
> +
> +        buf_len = netdev_dpdk_extbuf_size(new_allocated);

Shouldn't we assign into new_allocated here?  The rte_pktmbuf_attach_extbuf()
will update the mbuf->buf_len to the result of this call.  However, there
is the dp_packet_set_allocated(b, new_allocated); call that will overwrite
that value with a potentially smaller 'new_allocated'.  I'm not sure if that
can cause any issues, since the value is smaller, but it doesn't feel right.

Or am I missing something here?

> +        ovs_assert(buf_len <= UINT16_MAX);
> +        new_base = netdev_dpdk_extbuf_allocate(buf_len);
> +        if (!new_base) {
> +            out_of_memory();
> +        }
> +        dp_packet_copy__(b, new_base, new_headroom, new_tailroom);
> +        netdev_dpdk_extbuf_replace(b, new_base, buf_len);
> +        break;
> +#else
>          OVS_NOT_REACHED();
> +#endif
> +    }
>  
>      case DPBUF_MALLOC:
>          if (new_headroom == dp_packet_headroom(b)) {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 923191da84..cfd641b493 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -3072,12 +3072,51 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk 
> *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> +uint32_t
> +netdev_dpdk_extbuf_size(uint32_t data_len)
> +{
> +    uint32_t buf_len = data_len;
> +
> +    buf_len += sizeof(struct rte_mbuf_ext_shared_info) + sizeof(uintptr_t);
> +    buf_len = RTE_ALIGN_CEIL(buf_len, sizeof(uintptr_t));
> +
> +    return buf_len;
> +}
> +
> +void *
> +netdev_dpdk_extbuf_allocate(uint32_t buf_len)
> +{
> +    return rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +}
> +
>  static void
>  netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
>  {
>      rte_free(opaque);
>  }
>  
> +void
> +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t data_len)
> +{
> +    struct rte_mbuf *pkt = (struct rte_mbuf *) b;
> +    struct rte_mbuf_ext_shared_info *shinfo;
> +    uint16_t buf_len = data_len;
> +
> +    shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> +                                                netdev_dpdk_extbuf_free,
> +                                                buf);
> +    ovs_assert(shinfo != NULL);
> +
> +    if (RTE_MBUF_HAS_EXTBUF(pkt)) {
> +        rte_pktmbuf_detach_extbuf(pkt);
> +    }
> +    rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
> +                              shinfo);
> +    /* OVS only supports mono segment.
> +     * Packet size did not change, restore the current segment length. */
> +    pkt->data_len = pkt->pkt_len;
> +}
> +
>  static struct rte_mbuf *
>  dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
>  {
> @@ -3086,16 +3125,14 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, 
> uint32_t data_len)
>      uint16_t buf_len;
>      void *buf;
>  
> -    total_len += sizeof *shinfo + sizeof(uintptr_t);
> -    total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> -
> +    total_len = netdev_dpdk_extbuf_size(total_len);
>      if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
>          VLOG_ERR("Can't copy packet: too big %u", total_len);
>          return NULL;
>      }
>  
>      buf_len = total_len;
> -    buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> +    buf = netdev_dpdk_extbuf_allocate(buf_len);
>      if (OVS_UNLIKELY(buf == NULL)) {
>          VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
>          return NULL;
> @@ -3106,7 +3143,7 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, 
> uint32_t data_len)
>                                                  netdev_dpdk_extbuf_free,
>                                                  buf);
>      if (OVS_UNLIKELY(shinfo == NULL)) {
> -        rte_free(buf);
> +        netdev_dpdk_extbuf_free(NULL, buf);
>          VLOG_ERR("Failed to initialize shared info for mbuf while "
>                   "attempting to attach an external buffer.");
>          return NULL;
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
> index e6779d478a..0029372ee3 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -32,6 +32,10 @@ struct netdev;
>  void netdev_dpdk_register(const struct smap *);
>  void free_dpdk_buf(struct dp_packet *);
>  
> +uint32_t netdev_dpdk_extbuf_size(uint32_t);
> +void *netdev_dpdk_extbuf_allocate(uint32_t);
> +void netdev_dpdk_extbuf_replace(struct dp_packet *, void *, uint32_t);

nit: Please, name the arguments with the non-descriptive types.

> +
>  bool netdev_dpdk_flow_api_supported(struct netdev *, bool check_only);
>  
>  int
> diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at
> index 17d3d25955..57d20acf3b 100644
> --- a/tests/system-dpdk.at
> +++ b/tests/system-dpdk.at
> @@ -976,3 +976,68 @@ AT_CHECK([ovs-appctl --format json --pretty 
> dpif/offload/show], [0], [dnl
>  OVS_DPDK_STOP_VSWITCHD
>  AT_CLEANUP
>  dnl 
> --------------------------------------------------------------------------
> +
> +dnl 
> --------------------------------------------------------------------------
> +dnl Test headroom expansion.
> +AT_SETUP([OVS-DPDK - headroom expansion])
> +AT_KEYWORDS([dpdk])
> +OVS_CHECK_FIREWALL()
> +OVS_DPDK_PRE_CHECK()
> +OVS_DPDK_START([--no-pci])

I think, this test deserves an explanation of the topology or a diagram.

The naming scheme also can be cleaned up, as it's hard to follow what is
what.  AFAIU, we have tap0, vxlan3 and vxlan1 in the test namespace and
we have corresponding br_underlay1, br_underlay0 and br0 in the root.

br_underlay1 and tap0 are in fc00:0::/64 network and are directly connected.
br_underlay0 and vxlan3 are terminating first vxlan tunnel within fc00:1::/64.
br0 and vxlan1 are terminating the second vxlan tunnel within fc00:2::/64.

This naming scheme doesn't make a lot of sense to me and is very confusing.

I'd suggest to rename:

  br_underlay1 --> br0    (fc00:0::100)
  br_underlay0 --> br1    (fc00:1::100, remote: fc00:0::1)
  br0          --> br2    (fc00:2::100, remote: fc00:1::1)
  tap1         --> p0     (fc00:0::1)
  vxlan3       --> vxlan1 (fc00:1::1,   remote: fc00:0::100)
  vxlan1       --> vxlan2 (fc00:2::1,   remote: fc00:1::100)

Also, it may be better to name the OVS interface and the tap port the
same name to avoid extra confusion.

The test also creates tap2 interface in the root namespace in the same
network as br0.  I don't think that interface plays any role in the setup.
There is no difference between pinging fc00:2::100 and fc00:2::200.

And I wonder if it's maybe worth doing 3 levels of encaps instead of 2,
just to be sure.  WDYT?

Best regards, Ilya Maximets.

> +
> +ADD_BR([br0])
> +ADD_BR([br-underlay0])
> +ADD_BR([br-underlay1])
> +
> +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
> +AT_CHECK([ovs-ofctl add-flow br-underlay0 "actions=normal"])
> +AT_CHECK([ovs-ofctl add-flow br-underlay1 "actions=normal"])
> +
> +ADD_NAMESPACES(at_ns0)
> +
> +AT_CHECK([ip link set dev br-underlay1 up])
> +AT_CHECK([ip link set dev br-underlay0 up])
> +
> +AT_CHECK([ip addr add dev br-underlay1 "fc00::100/64" nodad])
> +AT_CHECK([ovs-vsctl add-port br-underlay1 p0 -- \
> +          set interface p0 type=dpdk -- \
> +          set interface p0 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { 
> print $NF + 1 }') -- \
> +          set interface p0 options:dpdk-devargs=net_tap0,iface=tap1])
> +AT_CHECK([ip link set tap1 netns at_ns0])
> +NS_CHECK_EXEC([at_ns0], [ip link set tap1 up])
> +OVS_WAIT_UNTIL([ip -n at_ns0 link show dev tap1 | grep -qw LOWER_UP])
> +NS_CHECK_EXEC([at_ns0], [ip -6 addr add "fc00::1/64" nodad dev tap1])
> +
> +ADD_OVS_TUNNEL6([vxlan], [br-underlay0], [at_vxlan2], [fc00::1],
> +                ["fc00:1::100/64" nodad], [options:key=0])
> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan3], [at_ns0], [fc00::100],
> +                   ["fc00:1::1/64" nodad], [id 0 dstport 4789])
> +
> +ADD_OVS_TUNNEL6([vxlan], [br0], [at_vxlan0], [fc00:1::1],
> +                ["fc00:2::100/64" nodad], [options:key=1])
> +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00:1::100],
> +                   ["fc00:2::1/64" nodad], [id 1 dstport 4789])
> +
> +AT_CHECK([ovs-vsctl add-port br0 p1 -- \
> +          set interface p1 type=dpdk -- \
> +          set interface p1 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { 
> print $NF + 1 }') -- \
> +          set interface p1 options:dpdk-devargs=net_tap1,iface=tap2])
> +OVS_WAIT_UNTIL([ip link show dev tap2 | grep -qw LOWER_UP])
> +AT_CHECK([ip -6 addr add "fc00:2::200/64" nodad dev tap2], [], [stdout], 
> [stderr])
> +
> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100])
> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::100])
> +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:2::100])
> +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:2::200 | 
> FORMAT_PING], [0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +dnl Clean up
> +OVS_DPDK_STOP_VSWITCHD(["dnl
> +/Failed to send start req to secondary 95/d
> +/eth_dev_tap_create(): .*: failed to create multiq qdisc./d
> +/eth_dev_tap_create():  Disabling rte flow support: No such file or 
> directory/d
> +/qdisc_create_multiq(): Could not add multiq qdisc (2): No such file or 
> directory/d
> +/tap_nl_dump_ext_ack(): Specified qdisc kind is unknown/d"])
> +AT_CLEANUP
> +dnl 
> --------------------------------------------------------------------------

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to