On 7/4/25 3:05 PM, Daniel Borkmann wrote:
> Extend 'inhibit=on' setting with the option to specify a pinned XSK map
> path along with a starting index (default 0) to push the created XSK
> sockets into. Example usage:
> 
>   # ./build/qemu-system-x86_64 [...] \
>    -netdev 
> af-xdp,ifname=enp2s0f0np0,id=net0,mode=native,queues=2,start-queue=14,inhibit=on,map-path=/sys/fs/bpf/xsks_map,map-start-index=14
>  \
>    -device virtio-net-pci,netdev=net0 [...]
> 
> This is useful for the case where an existing XDP program with XSK map
> is present on the AF_XDP supported phys device and the XSK map is not
> yet populated. For example, the former could have been pre-loaded onto
> the netdevice by a control plane, which later launches QEMU to populate
> it with XSK sockets.
> 
> Normally, the main idea behind 'inhibit=on' is that the QEMU instance
> doesn't need to have a lot of privileges to use the pre-loaded program
> and the pre-created sockets, but this mentioned use-case here is different
> where QEMU still needs privileges to create the sockets.
> 
> The 'map-start-index' parameter is optional and defaults to 0. It allows
> flexible placement of the XSK sockets, and is up to the user to specify
> when the XDP program with XSK map was already preloaded. In the simplest
> case the queue-to-map-slot mapping is just 1:1 based on ctx->rx_queue_index
> but the user might as well have a different scheme (or smaller map size,
> e.g. ctx->rx_queue_index % max_size) to push the inbound traffic to one
> of the XSK sockets.
> 
> Note that the bpf_xdp_query_id() is now only tested for 'inhibit=off'
> since only in the latter case the libxdp takes care of installing the
> XDP program which was installed based on the s->xdp_flags pointing to
> either driver or skb mode. For 'inhibit=on' we don't make any assumptions
> and neither go down the path of probing all possible options in which
> way the user installed the XDP program.
> 
> Signed-off-by: Daniel Borkmann <dan...@iogearbox.net>
> Cc: Ilya Maximets <i.maxim...@ovn.org>
> Cc: Jason Wang <jasow...@redhat.com>
> Cc: Anton Protopopov <as...@isovalent.com>
> ---
>  net/af-xdp.c    | 78 +++++++++++++++++++++++++++++++++++++++++++++----
>  qapi/net.json   | 29 +++++++++++-------
>  qemu-options.hx | 23 +++++++++++++--
>  3 files changed, 113 insertions(+), 17 deletions(-)

Thnaks, Daniel!  I have just a couple of small nits below that I missed in v3,
the rest looks good and is working fine.

> 
> diff --git a/net/af-xdp.c b/net/af-xdp.c
> index 29c5ad16cd..005117c336 100644
> --- a/net/af-xdp.c
> +++ b/net/af-xdp.c
> @@ -51,6 +51,10 @@ typedef struct AFXDPState {
>  
>      uint32_t             xdp_flags;
>      bool                 inhibit;
> +
> +    char                 *map_path;
> +    int                  map_fd;
> +    uint32_t             map_start_index;
>  } AFXDPState;
>  
>  #define AF_XDP_BATCH_SIZE 64
> @@ -260,6 +264,7 @@ static void af_xdp_send(void *opaque)
>  static void af_xdp_cleanup(NetClientState *nc)
>  {
>      AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
> +    int idx;
>  
>      qemu_purge_queued_packets(nc);
>  
> @@ -273,6 +278,18 @@ static void af_xdp_cleanup(NetClientState *nc)
>      s->umem = NULL;
>      qemu_vfree(s->buffer);
>      s->buffer = NULL;
> +
> +    if (s->map_fd >= 0) {
> +        idx = nc->queue_index + s->map_start_index;
> +        if (bpf_map_delete_elem(s->map_fd, &idx)) {
> +            fprintf(stderr, "af-xdp: unable to remove AF_XDP socket from "
> +                    "map %s\n", s->map_path);

nit :I'd suggest to keep the "map" on a previous line.  We have some space for 
it.

> +        }
> +        close(s->map_fd);
> +        s->map_fd = -1;
> +    }
> +    g_free(s->map_path);
> +    s->map_path = NULL;
>  }
>  
>  static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
> @@ -336,7 +353,6 @@ static int af_xdp_socket_create(AFXDPState *s,
>      };
>      int queue_id, error = 0;
>  
> -    s->inhibit = opts->has_inhibit && opts->inhibit;
>      if (s->inhibit) {
>          cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
>      }
> @@ -387,6 +403,35 @@ static int af_xdp_socket_create(AFXDPState *s,
>      return 0;
>  }
>  
> +static int af_xdp_update_xsk_map(AFXDPState *s, Error **errp)
> +{
> +    int xsk_fd, idx, error = 0;
> +
> +    if (!s->map_path) {
> +        return 0;
> +    }
> +
> +    s->map_fd = bpf_obj_get(s->map_path);
> +    if (s->map_fd < 0) {
> +        error = errno;
> +    } else {
> +        xsk_fd = xsk_socket__fd(s->xsk);
> +        idx = s->nc.queue_index + s->map_start_index;
> +        if (bpf_map_update_elem(s->map_fd, &idx, &xsk_fd, 0)) {
> +            error = errno;
> +        }
> +    }
> +
> +    if (error) {
> +        error_setg_errno(errp, error,
> +                         "failed to insert AF_XDP socket into map %s",
> +                         s->map_path);
> +        return -1;

nit: Maybe remove this line and return 'error' below?

> +    }
> +
> +    return 0;
> +}
> +
>  /* NetClientInfo methods. */
>  static NetClientInfo net_af_xdp_info = {
>      .type = NET_CLIENT_DRIVER_AF_XDP,
> @@ -441,6 +486,7 @@ int net_init_af_xdp(const Netdev *netdev,
>      int64_t i, queues;
>      Error *err = NULL;
>      AFXDPState *s;
> +    bool inhibit;
>  
>      ifindex = if_nametoindex(opts->ifname);
>      if (!ifindex) {
> @@ -456,8 +502,21 @@ int net_init_af_xdp(const Netdev *netdev,
>          return -1;
>      }
>  
> -    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
> -        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
> +    inhibit = opts->has_inhibit && opts->inhibit;
> +    if (inhibit && !opts->sock_fds && !opts->map_path) {
> +        error_setg(errp, "'inhibit=on' requires 'sock-fds' or 'map-path'");
> +        return -1;
> +    }
> +    if (!inhibit && (opts->sock_fds || opts->map_path)) {
> +        error_setg(errp, "'sock-fds' and 'map-path' require 'inhibit=on'");
> +        return -1;
> +    }
> +    if (opts->sock_fds && opts->map_path) {
> +        error_setg(errp, "'sock-fds' and 'map-path' are mutually exclusive");
> +        return -1;
> +    }
> +    if (!opts->map_path && opts->has_map_start_index) {
> +        error_setg(errp, "'map-start-index' requires 'map-path'");
>          return -1;
>      }
>  
> @@ -481,14 +540,23 @@ int net_init_af_xdp(const Netdev *netdev,
>  
>          pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
>          s->ifindex = ifindex;
> +        s->inhibit = inhibit;
> +
> +        s->map_path = g_strdup(opts->map_path);
> +        s->map_fd = -1;
> +        s->map_start_index = 0;
> +        if (opts->has_map_start_index && opts->map_start_index > 0) {

We should error out if the user specified a negative value.  I'd suggest
to add the check to the list of user-input validation above instead of
silently ignoring the incorrect value.  And then we could skip the value
check here.

> +            s->map_start_index = opts->map_start_index;
> +        }
>  
>          if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, &err) ||
> -            af_xdp_socket_create(s, opts, &err)) {
> +            af_xdp_socket_create(s, opts, &err) ||
> +            af_xdp_update_xsk_map(s, &err)) {
>              goto err;
>          }
>      }
>  
> -    if (nc0) {
> +    if (nc0 && !inhibit) {
>          s = DO_UPCAST(AFXDPState, nc, nc0);
>          if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || 
> !prog_id) {
>              error_setg_errno(errp, errno,
> diff --git a/qapi/net.json b/qapi/net.json
> index 97ea183981..3d80a9cacd 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -454,25 +454,34 @@
>  #     (default: 0).
>  #
>  # @inhibit: Don't load a default XDP program, use one already loaded
> -#     to the interface (default: false).  Requires @sock-fds.
> +#     to the interface (default: false).  Requires @sock-fds or @map-path.
>  #
>  # @sock-fds: A colon (:) separated list of file descriptors for
>  #     already open but not bound AF_XDP sockets in the queue order.
>  #     One fd per queue.  These descriptors should already be added
> -#     into XDP socket map for corresponding queues.  Requires
> -#     @inhibit.
> +#     into XDP socket map for corresponding queues.  @sock-fds and
> +#     @map-path are mutually exclusive.  Requires @inhibit.
> +#
> +# @map-path: The path to a pinned xsk map to push file descriptors
> +#     for bound AF_XDP sockets into.  @map-path and @sock-fds are
> +#     mutually exclusive.  Requires @inhibit.  (Since 10.1)
> +#
> +# @map-start-index: Use @map-path to insert xsk sockets starting from
> +#     this index number (default: 0).  Requires @map-path.  (Since 10.1)
>  #
>  # Since: 8.2
>  ##
>  { 'struct': 'NetdevAFXDPOptions',
>    'data': {
> -    'ifname':       'str',
> -    '*mode':        'AFXDPMode',
> -    '*force-copy':  'bool',
> -    '*queues':      'int',
> -    '*start-queue': 'int',
> -    '*inhibit':     'bool',
> -    '*sock-fds':    'str' },
> +    'ifname':           'str',
> +    '*mode':            'AFXDPMode',
> +    '*force-copy':      'bool',
> +    '*queues':          'int',
> +    '*start-queue':     'int',
> +    '*inhibit':         'bool',
> +    '*sock-fds':        'str',
> +    '*map-path':        'str',
> +    '*map-start-index': 'int' },
>    'if': 'CONFIG_AF_XDP' }
>  
>  ##
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 1f862b19a6..0fd4fd8d46 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2909,6 +2909,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>  #ifdef CONFIG_AF_XDP
>      "-netdev 
> af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
>      "         
> [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
> +    "         [,map-path=/path/to/socket/map][,map-start-index=i]\n"
>      "                attach to the existing network interface 'name' with 
> AF_XDP socket\n"
>      "                use 'mode=MODE' to specify an XDP program attach mode\n"
>      "                use 'force-copy=on|off' to force XDP copy mode even if 
> device supports zero-copy (default: off)\n"
> @@ -2916,6 +2917,8 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>      "                with inhibit=on,\n"
>      "                  use 'sock-fds' to provide file descriptors for 
> already open AF_XDP sockets\n"
>      "                  added to a socket map in XDP program.  One socket per 
> queue.\n"
> +    "                  use 'map-path' to provide the socket map location to 
> populate AF_XDP sockets with\n"

nit: it feels like we need some punctuation sign after the 'with', otherwise
it reads as "populate AF_XDP sockets with beginning from the", which makes no
sense.

> +    "                  beginning from the 'map-start-index' specified index 
> (default: 0) (Since 10.1)\n"
>      "                use 'queues=n' to specify how many queues of a 
> multiqueue interface should be used\n"
>      "                use 'start-queue=m' to specify the first queue that 
> should be used\n"
>  #endif
> @@ -3610,7 +3613,7 @@ SRST
>          # launch QEMU instance
>          |qemu_system| linux.img -nic vde,sock=/tmp/myswitch
>  
> -``-netdev 
> af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
> +``-netdev 
> af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z][,map-path=/path/to/socket/map][,map-start-index=i]``
>      Configure AF_XDP backend to connect to a network interface 'name'
>      using AF_XDP socket.  A specific program attach mode for a default
>      XDP program can be forced with 'mode', defaults to best-effort,
> @@ -3650,7 +3653,8 @@ SRST
>              -netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
>  
>      XDP program can also be loaded externally.  In this case 'inhibit' option
> -    should be set to 'on' and 'sock-fds' provided with file descriptors for
> +    should be set to 'on'.  Either 'sock-fds' or 'map-path' can be used with
> +    'inhibit' enabled.  'sock-fds' can be provided with file descriptors for
>      already open but not bound XDP sockets already added to a socket map for
>      corresponding queues.  One socket per queue.
>  
> @@ -3659,6 +3663,21 @@ SRST
>          |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
>              -netdev 
> af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
>  
> +    For the 'inhibit' option set to 'on' used together with 'map-path' it is
> +    expected that the XDP program with the socket map is already loaded on
> +    the networking device and the map pinned into BPF file system.  The path
> +    to the pinned map is then passed to QEMU which then creates the file
> +    descriptors and inserts them into the existing socket map.
> +
> +    .. parsed-literal::
> +
> +        |qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
> +            -netdev 
> af-xdp,id=n1,ifname=eth0,queues=2,inhibit=on,map-path=/sys/fs/bpf/xsks_map
> +
> +    Additionally, 'map-start-index' can be used to specify the start offset
> +    for insertion into the socket map.  The combination of 'map-path' and
> +    'sock-fds' together is not supported.
> +
>  ``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
>      Establish a vhost-user netdev, backed by a chardev id. The chardev
>      should be a unix domain socket backed one. The vhost-user uses a


Reply via email to