On 15 Dec 2025, at 20:33, Ilya Maximets wrote:
> Kernels with AF_XDP support below v5.4 are not supported and are not
> used in any major distributions. Using kernels that old with AF_XDP
> that was rapidly changing at the time is also not a good idea in
> general. Let's drop support for them. This allows to have the
> XDP_USE_NEED_WAKEUP flag to be always on, so we can simplify the code
> and stop guessing how many packets kernel can send in one go.
>
> The next milestone may be to require libxdp 1.2+ and kernel 5.18+, so
> we can simplify acinclude.m4 and avoid checking for bpf_xdp_detach and
> bpf_xdp_query_id API. But it feels a little bit too early for that,
> as it excludes Ubuntu 22.04, even if it's probably not a great idea
> to run AF_XDP there either. The gains from moving the pole to 5.18
> are just a couple of checks in the code, so not doing that for now.
>
> Signed-off-by: Ilya Maximets <[email protected]>
Thanks for taking care of this cleanup! The patch looks fine to me. I did a
quick test on my local machine, and basic forwarding seems to work correctly.
Acked-by: Eelco Chaudron <[email protected]>
> ---
>
> Version 2:
> * Cleaned up leftover structure fields in netdev-linux-private.h.
>
> Documentation/intro/install/afxdp.rst | 8 +-
> NEWS | 5 +
> acinclude.m4 | 6 +-
> lib/netdev-afxdp.c | 127 ++++++--------------------
> lib/netdev-linux-private.h | 3 -
> vswitchd/vswitch.xml | 12 ---
> 6 files changed, 44 insertions(+), 117 deletions(-)
>
> diff --git a/Documentation/intro/install/afxdp.rst
> b/Documentation/intro/install/afxdp.rst
> index 7fa8088c6..63a10e328 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -89,6 +89,9 @@ vSwitch with AF_XDP will require the following:
>
> - ``libbpf`` and ``libxdp`` (if version of ``libbpf`` if higher than
> ``0.6``).
>
> +- Linux kernel v5.4+ or a nominally older distribution kernel that has
> + required features backported.
> +
> - Linux kernel XDP support, with the following options (required)
>
> * CONFIG_BPF=y
> @@ -145,7 +148,7 @@ Finally, build and install OVS::
>
> To kick start end-to-end autotesting::
>
> - uname -a # make sure having 5.0+ kernel
> + uname -a # make sure having 5.4+ kernel
> ethtool --version # make sure ethtool is installed
> make check-afxdp TESTSUITEFLAGS='1'
>
> @@ -180,9 +183,6 @@ more details):
> ``native`` or ``generic``. Defaults to ``best-effort``, i.e. best of
> supported modes, so in most cases you don't need to change it.
>
> - * ``use-need-wakeup``: default ``true`` if libbpf supports it,
> - otherwise ``false``.
> -
> For example, to use 1 PMD (on core 4) on 1 queue (queue 0) device,
> configure these options: ``pmd-cpu-mask``, ``pmd-rxq-affinity``, and
> ``n_rxq``::
> diff --git a/NEWS b/NEWS
> index f9a74df1a..68c1b3eea 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -1,5 +1,10 @@
> Post-v3.6.0
> --------------------
> + - AF_XDP:
> + * AF_XDP support now requires Linux kernel v5.4+ or a nominally older
> + distribution kernel with sufficient features backported.
> + * Setting or reporting "use-need-wakeup" is no longer supported. The
> + feature is always enabled.
> - TLS:
> * Added support for TLS Server Name Indication (SNI) with the new
> --ssl-server-name option. This allows specifying the server name
> diff --git a/acinclude.m4 b/acinclude.m4
> index 369e37eae..bbc872142 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -310,7 +310,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
> AC_CHECK_HEADER([bpf/libbpf.h], [], [failed_dep="bpf/libbpf.h"])
>
> if test "$failed_dep" = none; then
> - AC_CHECK_HEADER([linux/if_xdp.h], [], [failed_dep="linux/if_xdp.h"])
> + AC_CHECK_HEADER([linux/if_xdp.h], [
> + AC_CHECK_DECLS([XDP_USE_NEED_WAKEUP], [],
> + [failed_dep="XDP_USE_NEED_WAKEUP"],
> + [[#include <linux/if_xdp.h>]])
> + ], [failed_dep="linux/if_xdp.h"])
> fi
>
> if test "$failed_dep" = none; then
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index 54029722e..09f4c5107 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -79,12 +79,6 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
> 20);
> #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
> #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
>
> -#ifdef XDP_USE_NEED_WAKEUP
> -#define NEED_WAKEUP_DEFAULT true
> -#else
> -#define NEED_WAKEUP_DEFAULT false
> -#endif
> -
> /* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
> * still on processing in threads. Number of packets currently in OVS
> * processing is hard to estimate because it depends on number of ports.
> @@ -101,7 +95,6 @@ BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
>
> static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id,
> enum afxdp_mode mode,
> - bool use_need_wakeup,
> bool report_socket_failures);
> static void xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode);
> static void xsk_destroy(struct xsk_socket_info *xsk);
> @@ -176,19 +169,13 @@ struct netdev_afxdp_tx_lock {
> );
> };
>
> -#ifdef XDP_USE_NEED_WAKEUP
> static inline void
> xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
> struct netdev *netdev, int fd)
> {
> - struct netdev_linux *dev = netdev_linux_cast(netdev);
> struct pollfd pfd;
> int ret;
>
> - if (!dev->use_need_wakeup) {
> - return;
> - }
> -
> if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> pfd.fd = fd;
> pfd.events = POLLIN;
> @@ -208,22 +195,6 @@ xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
> return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
> }
>
> -#else /* !XDP_USE_NEED_WAKEUP */
> -static inline void
> -xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED,
> - struct netdev *netdev OVS_UNUSED,
> - int fd OVS_UNUSED)
> -{
> - /* Nothing. */
> -}
> -
> -static inline bool
> -xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
> -{
> - return true;
> -}
> -#endif /* XDP_USE_NEED_WAKEUP */
> -
> static void
> netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
> {
> @@ -341,7 +312,7 @@ xsk_configure_umem(void *buffer, uint64_t size)
> static struct xsk_socket_info *
> xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
> uint32_t queue_id, enum afxdp_mode mode,
> - bool use_need_wakeup, bool report_socket_failures)
> + bool report_socket_failures)
> {
> struct xsk_socket_config cfg;
> struct xsk_socket_info *xsk;
> @@ -355,15 +326,9 @@ xsk_configure_socket(struct xsk_umem_info *umem,
> uint32_t ifindex,
> cfg.rx_size = CONS_NUM_DESCS;
> cfg.tx_size = PROD_NUM_DESCS;
> cfg.libbpf_flags = 0;
> - cfg.bind_flags = xdp_modes[mode].bind_flags;
> + cfg.bind_flags = xdp_modes[mode].bind_flags | XDP_USE_NEED_WAKEUP;
> cfg.xdp_flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST;
>
> -#ifdef XDP_USE_NEED_WAKEUP
> - if (use_need_wakeup) {
> - cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
> - }
> -#endif
> -
> if (if_indextoname(ifindex, devname) == NULL) {
> VLOG_ERR("ifindex %d to devname failed (%s)",
> ifindex, ovs_strerror(errno));
> @@ -375,10 +340,8 @@ xsk_configure_socket(struct xsk_umem_info *umem,
> uint32_t ifindex,
> &xsk->rx, &xsk->tx, &cfg);
> if (ret) {
> VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
> - "xsk_socket__create failed (%s) mode: %s, "
> - "use-need-wakeup: %s, qid: %d",
> - ovs_strerror(errno), xdp_modes[mode].name,
> - use_need_wakeup ? "true" : "false", queue_id);
> + "xsk_socket__create failed (%s) mode: %s, qid: %d",
> + ovs_strerror(errno), xdp_modes[mode].name, queue_id);
> free(xsk);
> return NULL;
> }
> @@ -424,7 +387,7 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t
> ifindex,
>
> static struct xsk_socket_info *
> xsk_configure(int ifindex, int xdp_queue_id, enum afxdp_mode mode,
> - bool use_need_wakeup, bool report_socket_failures)
> + bool report_socket_failures)
> {
> struct xsk_socket_info *xsk;
> struct xsk_umem_info *umem;
> @@ -450,7 +413,7 @@ xsk_configure(int ifindex, int xdp_queue_id, enum
> afxdp_mode mode,
> VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
>
> xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, mode,
> - use_need_wakeup, report_socket_failures);
> + report_socket_failures);
> if (!xsk) {
> /* Clean up umem and xpacket pool. */
> if (xsk_umem__delete(umem->umem)) {
> @@ -470,11 +433,10 @@ xsk_configure_queue(struct netdev_linux *dev, int
> ifindex, int queue_id,
> {
> struct xsk_socket_info *xsk_info;
>
> - VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
> - netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name,
> - dev->use_need_wakeup ? "true" : "false");
> - xsk_info = xsk_configure(ifindex, queue_id, mode, dev->use_need_wakeup,
> - report_socket_failures);
> + VLOG_DBG("%s: configuring queue: %d, mode: %s.",
> + netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name);
> +
> + xsk_info = xsk_configure(ifindex, queue_id, mode,
> report_socket_failures);
> if (!xsk_info) {
> VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
> "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
> @@ -617,7 +579,6 @@ netdev_afxdp_set_config(struct netdev *netdev, const
> struct smap *args,
> struct netdev_linux *dev = netdev_linux_cast(netdev);
> const char *str_xdp_mode;
> enum afxdp_mode xdp_mode;
> - bool need_wakeup;
> int new_n_rxq;
>
> ovs_mutex_lock(&dev->mutex);
> @@ -644,20 +605,10 @@ netdev_afxdp_set_config(struct netdev *netdev, const
> struct smap *args,
> return EINVAL;
> }
>
> - need_wakeup = smap_get_bool(args, "use-need-wakeup",
> NEED_WAKEUP_DEFAULT);
> -#ifndef XDP_USE_NEED_WAKEUP
> - if (need_wakeup) {
> - VLOG_WARN("XDP need_wakeup is not supported in libbpf/libxdp.");
> - need_wakeup = false;
> - }
> -#endif
> -
> if (dev->requested_n_rxq != new_n_rxq
> - || dev->requested_xdp_mode != xdp_mode
> - || dev->requested_need_wakeup != need_wakeup) {
> + || dev->requested_xdp_mode != xdp_mode) {
> dev->requested_n_rxq = new_n_rxq;
> dev->requested_xdp_mode = xdp_mode;
> - dev->requested_need_wakeup = need_wakeup;
> netdev_request_reconfigure(netdev);
> }
> ovs_mutex_unlock(&dev->mutex);
> @@ -672,8 +623,6 @@ netdev_afxdp_get_config(const struct netdev *netdev,
> struct smap *args)
> ovs_mutex_lock(&dev->mutex);
> smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
> smap_add_format(args, "xdp-mode", "%s", xdp_modes[dev->xdp_mode].name);
> - smap_add_format(args, "use-need-wakeup", "%s",
> - dev->use_need_wakeup ? "true" : "false");
> ovs_mutex_unlock(&dev->mutex);
> return 0;
> }
> @@ -708,7 +657,6 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>
> if (netdev->n_rxq == dev->requested_n_rxq
> && dev->xdp_mode == dev->requested_xdp_mode
> - && dev->use_need_wakeup == dev->requested_need_wakeup
> && dev->xsks) {
> goto out;
> }
> @@ -725,7 +673,6 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
> if (setrlimit(RLIMIT_MEMLOCK, &r)) {
> VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s",
> ovs_strerror(errno));
> }
> - dev->use_need_wakeup = dev->requested_need_wakeup;
>
> err = xsk_configure_all(netdev);
> if (err) {
> @@ -922,39 +869,26 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct
> dp_packet_batch *batch,
> }
>
> static inline int
> -kick_tx(struct xsk_socket_info *xsk_info, enum afxdp_mode mode,
> - bool use_need_wakeup)
> +kick_tx(struct xsk_socket_info *xsk_info)
> {
> - int ret, retries;
> - static const int KERNEL_TX_BATCH_SIZE = 16;
> -
> - if (use_need_wakeup && !xsk_tx_need_wakeup(xsk_info)) {
> - return 0;
> - }
> -
> - /* In all modes except native-with-zerocopy packet transmission is
> - * synchronous, and the kernel xmits only TX_BATCH_SIZE(16) packets for a
> - * single sendmsg syscall.
> - * So, we have to kick the kernel (n_packets / 16) times to be sure that
> - * all packets are transmitted. */
> - retries = (mode != OVS_AF_XDP_MODE_NATIVE_ZC)
> - ? xsk_info->outstanding_tx / KERNEL_TX_BATCH_SIZE
> - : 0;
> -kick_retry:
> - /* This causes system call into kernel's xsk_sendmsg, and
> xsk_generic_xmit
> - * (generic and native modes) or xsk_zc_xmit (native-with-zerocopy mode).
> - */
> - ret = sendto(xsk_socket__fd(xsk_info->xsk), NULL, 0, MSG_DONTWAIT,
> - NULL, 0);
> - if (ret < 0) {
> - if (retries-- && errno == EAGAIN) {
> - goto kick_retry;
> - }
> - if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) {
> - return errno;
> + /* At most try as many times as the number of packets we have to send. */
> + uint32_t max_retry = xsk_info->outstanding_tx;
> +
> + while (max_retry-- && xsk_tx_need_wakeup(xsk_info)) {
> + int fd = xsk_socket__fd(xsk_info->xsk);
> + int ret;
> +
> + /* This causes system call into kernel's xsk_sendmsg, and
> + * xsk_generic_xmit (generic and native modes) or xsk_wakeup
> + * (native-with-zerocopy mode). */
> + ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
> + if (ret < 0 && errno != EAGAIN) {
> + /* It's possible that the device is busy, but we shouldn't
> + * alert users, as that can be too much noise. */
> + return errno == EBUSY ? 0 : errno;
> }
> }
> - /* No error, or EBUSY, or too many retries on EAGAIN. */
> + /* No error or too many retries on EAGAIN. */
> return 0;
> }
>
> @@ -1104,7 +1038,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, int
> qid,
> &orig);
> COVERAGE_INC(afxdp_tx_full);
> afxdp_complete_tx(xsk_info);
> - kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup);
> + kick_tx(xsk_info);
> error = ENOMEM;
> goto out;
> }
> @@ -1128,7 +1062,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, int
> qid,
> xsk_ring_prod__submit(&xsk_info->tx, dp_packet_batch_size(batch));
> xsk_info->outstanding_tx += dp_packet_batch_size(batch);
>
> - ret = kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup);
> + ret = kick_tx(xsk_info);
> if (OVS_UNLIKELY(ret)) {
> VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.",
> netdev_get_name(netdev), ovs_strerror(ret));
> @@ -1219,7 +1153,6 @@ netdev_afxdp_construct(struct netdev *netdev)
>
> dev->requested_n_rxq = NR_QUEUE;
> dev->requested_xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT;
> - dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;
>
> dev->xsks = NULL;
> dev->tx_locks = NULL;
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index 4b5b60675..cf4a021d3 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -119,9 +119,6 @@ struct netdev_linux {
> enum afxdp_mode requested_xdp_mode; /* Requested AF_XDP mode. */
> enum afxdp_mode xdp_mode_in_use; /* Effective AF_XDP mode. */
>
> - bool use_need_wakeup;
> - bool requested_need_wakeup;
> -
> struct netdev_afxdp_tx_lock *tx_locks; /* Array of locks for TX queues.
> */
> #endif
> };
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 3dcf74402..b93ad9344 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -3446,18 +3446,6 @@ ovs-vsctl add-port br0 p0 -- set Interface p0
> type=patch options:peer=p1 \
> </p>
> </column>
>
> - <column name="options" key="use-need-wakeup"
> - type='{"type": "boolean"}'>
> - <p>
> - Specifies whether to use need_wakeup feature in afxdp netdev.
> - If enabled, OVS explicitly wakes up the kernel RX, using poll()
> - syscall and wakes up TX, using sendto() syscall. For physical
> - devices, this feature improves the performance by avoiding
> - unnecessary sendto syscalls.
> - Defaults to true if supported by libbpf.
> - </p>
> - </column>
> -
> <column name="options" key="vhost-server-path"
> type='{"type": "string"}'>
> <p>
> --
> 2.52.0
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev