By default, DPDK based dp-packets points to data buffers that can't be expanded dynamically. Their layout is as follows: - a minimum 128 bytes headroom chosen at DPDK build time (RTE_PKTMBUF_HEADROOM), - a maximum size chosen at mempool creation,
In some usecases though (like encapsulating with multiple tunnels), a 128 bytes headroom is too short. Keep on using mono segment packets but dynamically allocate buffers in DPDK memory and make use of DPDK external buffers API (previously used for userspace TSO). Signed-off-by: David Marchand <[email protected]> --- Changes since v1: - fixed new segment length (reset by extbuf attach helper), - added a system-dpdk unit test, --- lib/dp-packet.c | 17 +++++++++++- lib/netdev-dpdk.c | 50 +++++++++++++++++++++++++++--------- lib/netdev-dpdk.h | 3 +++ tests/system-dpdk.at | 61 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index b34bcf26f3..c1ff299a75 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -255,8 +255,23 @@ dp_packet_resize(struct dp_packet *b, size_t new_headroom, size_t new_tailroom) new_allocated = new_headroom + dp_packet_size(b) + new_tailroom; switch (b->source) { - case DPBUF_DPDK: + case DPBUF_DPDK: { +#ifdef DPDK_NETDEV + uint32_t buf_len; + + buf_len = new_allocated; + new_base = netdev_dpdk_extbuf_allocate(&buf_len); + if (!new_base) { + out_of_memory(); + } + ovs_assert(buf_len <= UINT16_MAX); + dp_packet_copy__(b, new_base, new_headroom, new_tailroom); + netdev_dpdk_extbuf_replace(b, new_base, buf_len); + break; +#else OVS_NOT_REACHED(); +#endif + } case DPBUF_MALLOC: if (new_headroom == dp_packet_headroom(b)) { diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 687e1196b5..b416c404da 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -3094,41 +3094,67 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } +void * +netdev_dpdk_extbuf_allocate(uint32_t *data_len) +{ + *data_len += sizeof(struct rte_mbuf_ext_shared_info) + sizeof(uintptr_t); + *data_len = RTE_ALIGN_CEIL(*data_len, sizeof(uintptr_t)); + return rte_malloc(NULL, *data_len, RTE_CACHE_LINE_SIZE); +} + static void netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque) { rte_free(opaque); } +void +netdev_dpdk_extbuf_replace(struct dp_packet *b, void *buf, uint32_t data_len) +{ + struct rte_mbuf *pkt = (struct rte_mbuf *) b; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = data_len; + + shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len, + netdev_dpdk_extbuf_free, + buf); + ovs_assert(shinfo != NULL); + + if (RTE_MBUF_HAS_EXTBUF(pkt)) { + rte_pktmbuf_detach_extbuf(pkt); + } + rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len, + shinfo); + /* OVS only supports mono segment. + * Packet size did not change, restore the current segment length. */ + pkt->data_len = pkt->pkt_len; +} + static struct rte_mbuf * dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len) { uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len; - struct rte_mbuf_ext_shared_info *shinfo = NULL; + struct rte_mbuf_ext_shared_info *shinfo; uint16_t buf_len; void *buf; - total_len += sizeof *shinfo + sizeof(uintptr_t); - total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t)); - + buf = netdev_dpdk_extbuf_allocate(&total_len); + if (OVS_UNLIKELY(buf == NULL)) { + VLOG_ERR("Failed to allocate memory using rte_malloc: %u", total_len); + return NULL; + } if (OVS_UNLIKELY(total_len > UINT16_MAX)) { + netdev_dpdk_extbuf_free(NULL, buf); VLOG_ERR("Can't copy packet: too big %u", total_len); return NULL; } buf_len = total_len; - buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE); - if (OVS_UNLIKELY(buf == NULL)) { - VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len); - return NULL; - } - - /* Initialize shinfo. */ shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len, netdev_dpdk_extbuf_free, buf); if (OVS_UNLIKELY(shinfo == NULL)) { - rte_free(buf); + netdev_dpdk_extbuf_free(NULL, buf); VLOG_ERR("Failed to initialize shared info for mbuf while " "attempting to attach an external buffer."); return NULL; diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 86df7a1e83..274900c062 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -32,6 +32,9 @@ struct netdev; void netdev_dpdk_register(const struct smap *); void free_dpdk_buf(struct dp_packet *); +void *netdev_dpdk_extbuf_allocate(uint32_t *); +void netdev_dpdk_extbuf_replace(struct dp_packet *, void *, uint32_t); + bool netdev_dpdk_flow_api_supported(struct netdev *); int diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 393b76c76c..5c27f6fb72 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -941,3 +941,64 @@ AT_CHECK([ovs-vsctl del-port br10 p1], [], [stdout], [stderr]) OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- + + + +dnl -------------------------------------------------------------------------- +dnl Test headroom expansion. +AT_SETUP([OVS-DPDK - headroom expansion]) +AT_KEYWORDS([dpdk]) +OVS_DPDK_PRE_CHECK() +OVS_DPDK_START([--no-pci]) + +ADD_BR([br0]) +ADD_BR([br-underlay0]) +ADD_BR([br-underlay1]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay1 "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +AT_CHECK([ip link set dev br-underlay1 up]) +AT_CHECK([ip link set dev br-underlay0 up]) + +AT_CHECK([ip addr add dev br-underlay1 "fc00::100/64" nodad]) +AT_CHECK([ovs-vsctl add-port br-underlay1 p0 -- \ + set interface p0 type=dpdk -- \ + set interface p0 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { print $NF + 1 }') -- \ + set interface p0 options:dpdk-devargs=net_tap0,iface=tap1]) +AT_CHECK([ip link set tap1 netns at_ns0]) +NS_CHECK_EXEC([at_ns0], [ip link set tap1 up]) +OVS_WAIT_UNTIL([ip -n at_ns0 link show dev tap1 | grep -qw LOWER_UP]) +NS_CHECK_EXEC([at_ns0], [ip -6 addr add "fc00::1/64" nodad dev tap1]) + +ADD_OVS_TUNNEL6([vxlan], [br-underlay0], [at_vxlan2], [fc00::1], + ["fc00:1::100/64" nodad], [options:key=0]) +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan3], [at_ns0], [fc00::100], + ["fc00:1::1/64" nodad], [id 0 dstport 4789]) + +ADD_OVS_TUNNEL6([vxlan], [br0], [at_vxlan0], [fc00:1::1], + ["fc00:2::100/64" nodad], [options:key=1]) +ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00:1::100], + ["fc00:2::1/64" nodad], [id 1 dstport 4789]) + +AT_CHECK([ovs-vsctl add-port br0 p1 -- \ + set interface p1 type=dpdk -- \ + set interface p1 options:n_rxq=$(lscpu | awk '/NUMA node\(s\)/ { print $NF + 1 }') -- \ + set interface p1 options:dpdk-devargs=net_tap1,iface=tap2]) +OVS_WAIT_UNTIL([ip link show dev tap2 | grep -qw LOWER_UP]) +AT_CHECK([ip -6 addr add "fc00:2::200/64" nodad dev tap2], [], [stdout], [stderr]) + +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:2::100]) +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:2::200 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Clean up +OVS_DPDK_STOP_VSWITCHD(["/Failed to send start req to secondary 95/d"]) +AT_CLEANUP +dnl -------------------------------------------------------------------------- -- 2.52.0 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
