On 3/1/2019 8:09 AM, Xiaolong Ye wrote:
> Add a new PMD driver for AF_XDP which is a proposed faster version of
> AF_PACKET interface in Linux. More info about AF_XDP, please refer to [1]
> [2].
> 
> This is the vanilla version PMD which just uses a raw buffer registered as
> the umem.
> 
> [1] https://fosdem.org/2018/schedule/event/af_xdp/
> [2] https://lwn.net/Articles/745934/
> 
> Signed-off-by: Xiaolong Ye <xiaolong...@intel.com>
> ---
>  MAINTAINERS                                   |   6 +
>  config/common_base                            |   5 +
>  doc/guides/nics/af_xdp.rst                    |  43 +

Can you please add new .rst file to index file, doc/guides/nics/index.rst ?

>  doc/guides/rel_notes/release_18_11.rst        |   7 +

Please switch to latest release notes.

>  drivers/net/Makefile                          |   1 +
>  drivers/net/af_xdp/Makefile                   |  31 +
>  drivers/net/af_xdp/meson.build                |   7 +
>  drivers/net/af_xdp/rte_eth_af_xdp.c           | 903 ++++++++++++++++++
>  drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
>  mk/rte.app.mk                                 |   1 +

Can you please add .ini file too?

<...>

> @@ -416,6 +416,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
>  #
>  CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
>  
> +#
> +# Compile software PMD backed by AF_XDP sockets (Linux only)
> +#
> +CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
> +

Why it is not enabled in linux config (config/common_linuxapp)? Is it because of
the external library dependencies?
I guess there is a requirement to the specific Linux kernel version, can it be
possible to detect it in Makefile and enable/disable according this information?

<...>

> +Prerequisites
> +-------------
> +
> +This is a Linux-specific PMD, thus the following prerequisites apply:
> +
> +*  A Linux Kernel with XDP sockets configuration enabled;

Can you please give more details of what exact vanilla kernel version?

> +*  libbpf with latest af_xdp support installed

Is there a specific version of libbpf for this?
I can see in makefile, libelf is also linked, is it a dependency?

<...>

> @@ -0,0 +1,31 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2018 Intel Corporation
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +#
> +# library name
> +#
> +LIB = librte_pmd_af_xdp.a
> +
> +EXPORT_MAP := rte_pmd_af_xdp_version.map
> +
> +LIBABIVER := 1
> +
> +
> +CFLAGS += -O3
> +# below line should be removed

+1

> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf
> +
> +CFLAGS += $(WERROR_FLAGS)
> +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
> +LDLIBS += -lrte_bus_vdev

Dependent libraries should be linked here.

<...>

> +
> +#include <linux/if_ether.h>
> +#include <linux/if_xdp.h>
> +#include <linux/if_link.h>
> +#include <asm/barrier.h>

Getting an build error for this [1], can there be any include path param 
missing?

[1]
drivers/net/af_xdp/rte_eth_af_xdp.c:15:10: fatal error: asm/barrier.h: No such
file or directory

<...>

> +static void
> +eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
> +{
> +     struct pmd_internals *internals = dev->data->dev_private;
> +
> +     dev_info->if_index = internals->if_index;
> +     dev_info->max_mac_addrs = 1;
> +     dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
> +     dev_info->max_rx_queues = 1;
> +     dev_info->max_tx_queues = 1;

'ETH_AF_XDP_MAX_QUEUE_PAIRS' is '16' but you are forcing the max Rx/Tx queue
number to be '1', intentional?

> +     dev_info->min_rx_bufsize = 0;
> +
> +     dev_info->default_rxportconf.nb_queues = 1;
> +     dev_info->default_txportconf.nb_queues = 1;
> +     dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
> +     dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
> +}
> +
> +static int
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
> +{
> +     struct pmd_internals *internals = dev->data->dev_private;
> +     struct xdp_statistics xdp_stats;
> +     struct pkt_rx_queue *rxq;
> +     socklen_t optlen;
> +     int i;
> +
> +     optlen = sizeof(struct xdp_statistics);
> +     for (i = 0; i < dev->data->nb_rx_queues; i++) {
> +             rxq = &internals->rx_queues[i];
> +             stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
> +             stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
> +
> +             stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
> +             stats->q_errors[i] = internals->tx_queues[i].err_pkts;

There is a patch from David, which points the 'q_errors' is for Rx only:
https://patches.dpdk.org/cover/50783/

<...>

> +static void xdp_umem_destroy(struct xsk_umem_info *umem)
> +{
> +     if (umem->buffer)
> +             free(umem->buffer);
> +
> +     free(umem);

Should we set freed pointers to 'null'?

Should free 'umem->buf_ring' before freeing 'umem'?

<...>

> +static int
> +eth_rx_queue_setup(struct rte_eth_dev *dev,
> +                uint16_t rx_queue_id,
> +                uint16_t nb_rx_desc,
> +                unsigned int socket_id __rte_unused,
> +                const struct rte_eth_rxconf *rx_conf __rte_unused,
> +                struct rte_mempool *mb_pool)
> +{
> +     struct pmd_internals *internals = dev->data->dev_private;
> +     unsigned int buf_size, data_size;
> +     struct pkt_rx_queue *rxq;
> +     int ret = 0;
> +
> +     if (mb_pool == NULL) {
> +             RTE_LOG(ERR, PMD,
> +                     "Invalid mb_pool\n");
> +             ret = -EINVAL;
> +             goto err;
> +     }

if 'mb_pool' is 'null', it will crash in 'rte_eth_rx_queue_setup()' before
coming here, I think we can drop this check.

> +
> +     if (dev->data->nb_rx_queues <= rx_queue_id) {
> +             RTE_LOG(ERR, PMD,
> +                     "Invalid rx queue id: %d\n", rx_queue_id);
> +             ret = -EINVAL;
> +             goto err;
> +     }

This check already done in 'rte_eth_rx_queue_setup()' shouldn't need to be done
here.
<...>

> +static int
> +eth_tx_queue_setup(struct rte_eth_dev *dev,
> +                uint16_t tx_queue_id,
> +                uint16_t nb_tx_desc,
> +                unsigned int socket_id __rte_unused,
> +                const struct rte_eth_txconf *tx_conf __rte_unused)
> +{
> +     struct pmd_internals *internals = dev->data->dev_private;
> +     struct pkt_tx_queue *txq;
> +
> +     if (dev->data->nb_tx_queues <= tx_queue_id) {
> +             RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
> +             return -EINVAL;
> +     }

Can skip the check, same as above.

> +
> +     RTE_LOG(WARNING, PMD, "tx queue setup size=%d will be skipped\n",
> +             nb_tx_desc);

Why setup will be skipped?

> +     txq = &internals->tx_queues[tx_queue_id];
> +
> +     dev->data->tx_queues[tx_queue_id] = txq;
> +     return 0;
> +}
> +
> +static int
> +eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
> +{
> +     struct pmd_internals *internals = dev->data->dev_private;
> +     struct ifreq ifr = { .ifr_mtu = mtu };
> +     int ret;
> +     int s;
> +
> +     s = socket(PF_INET, SOCK_DGRAM, 0);
> +     if (s < 0)
> +             return -EINVAL;
> +
> +     snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);

Can you please prefer strlcpy?

> +     ret = ioctl(s, SIOCSIFMTU, &ifr);
> +     close(s);
> +
> +     if (ret < 0)
> +             return -EINVAL;
> +
> +     return 0;
> +}
> +
> +static void
> +eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
> +{
> +     struct ifreq ifr;
> +     int s;
> +
> +     s = socket(PF_INET, SOCK_DGRAM, 0);
> +     if (s < 0)
> +             return;
> +
> +     snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);

Can you please prefer strlcpy?

<...>

> +
> +static struct rte_vdev_driver pmd_af_xdp_drv;

Do we need this forward decleration?

> +
> +static void
> +parse_parameters(struct rte_kvargs *kvlist,
> +              char **if_name,
> +              int *queue_idx)
> +{
> +     struct rte_kvargs_pair *pair = NULL;
> +     unsigned int k_idx;
> +
> +     for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
> +             pair = &kvlist->pairs[k_idx];
> +             if (strstr(pair->key, ETH_AF_XDP_IFACE_ARG))

It is better to use 'rte_kvargs_process()' instead of accessing the 'kvargs'
internals.

> +                     *if_name = pair->value;
> +             else if (strstr(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG))
> +                     *queue_idx = atoi(pair->value);
> +     }
> +}
> +
> +static int
> +get_iface_info(const char *if_name,
> +            struct ether_addr *eth_addr,
> +            int *if_index)
> +{
> +     struct ifreq ifr;
> +     int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
> +
> +     if (sock < 0)
> +             return -1;
> +
> +     strcpy(ifr.ifr_name, if_name);

Please prefer strlcpy.

> +     if (ioctl(sock, SIOCGIFINDEX, &ifr))
> +             goto error;
> +
> +     if (ioctl(sock, SIOCGIFHWADDR, &ifr))
> +             goto error;
> +
> +     memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, 6);

Can use 'ether_addr_copy()'

> +
> +     close(sock);
> +     *if_index = if_nametoindex(if_name);
> +     return 0;
> +
> +error:
> +     close(sock);
> +     return -1;
> +}
> +
> +static int
> +init_internals(struct rte_vdev_device *dev,
> +            const char *if_name,
> +            int queue_idx)
> +{
> +     const char *name = rte_vdev_device_name(dev);
> +     struct rte_eth_dev *eth_dev = NULL;
> +     const unsigned int numa_node = dev->device.numa_node;
> +     struct pmd_internals *internals = NULL;
> +     int ret;
> +     int i;
> +
> +     internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
> +     if (!internals)
> +             return -ENOMEM;
> +
> +     internals->queue_idx = queue_idx;
> +     strcpy(internals->if_name, if_name);

prefer 'strlcpy' please

> +
> +     for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
> +             internals->tx_queues[i].pair = &internals->rx_queues[i];
> +             internals->rx_queues[i].pair = &internals->tx_queues[i];
> +     }
> +
> +     ret = get_iface_info(if_name, &internals->eth_addr,
> +                          &internals->if_index);
> +     if (ret)
> +             goto err;
> +
> +     eth_dev = rte_eth_vdev_allocate(dev, 0);
> +     if (!eth_dev)
> +             goto err;
> +
> +     eth_dev->data->dev_private = internals;
> +     eth_dev->data->dev_link = pmd_link;
> +     eth_dev->data->mac_addrs = &internals->eth_addr;
> +     eth_dev->dev_ops = &ops;
> +     eth_dev->rx_pkt_burst = eth_af_xdp_rx;
> +     eth_dev->tx_pkt_burst = eth_af_xdp_tx;
> +
> +     rte_eth_dev_probing_finish(eth_dev);

What do you think moving this call into 'rte_pmd_af_xdp_probe' function if
'init_internals' returns sucess instead of setting here?

> +     return 0;
> +
> +err:
> +     rte_free(internals);
> +     return -1;
> +}
> +
> +static int
> +rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
> +{
> +     struct rte_kvargs *kvlist;
> +     char *if_name = NULL;
> +     int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;

This 'queue_idx' is for interface queue to pass to xsk_* API, also we have same
variable name 'queue_idx' that we use for DPDK queue index, they get confused
easily, what do you think rename this one something like 'xsk_queue_idx'?

> +     struct rte_eth_dev *eth_dev;
> +     const char *name;
> +     int ret;
> +
> +     RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
> +             rte_vdev_device_name(dev));
> +
> +     name = rte_vdev_device_name(dev);
> +     if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
> +             strlen(rte_vdev_device_args(dev)) == 0) {
> +             eth_dev = rte_eth_dev_attach_secondary(name);
> +             if (!eth_dev) {
> +                     RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
> +                     return -EINVAL;
> +             }
> +             eth_dev->dev_ops = &ops;
> +             rte_eth_dev_probing_finish(eth_dev);
> +     }
> +
> +     kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
> +     if (!kvlist) {
> +             RTE_LOG(ERR, PMD,
> +                     "Invalid kvargs\n");

No need to break the line.

> +             return -EINVAL;
> +     }
> +
> +     if (dev->device.numa_node == SOCKET_ID_ANY)
> +             dev->device.numa_node = rte_socket_id();
> +
> +     parse_parameters(kvlist, &if_name,
> +                      &queue_idx);

Same, no need to break the line.

> +
> +     ret = init_internals(dev, if_name, queue_idx);
> +
> +     rte_kvargs_free(kvlist);
> +
> +     return ret;
> +}
> +
> +static int
> +rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
> +{
> +     struct rte_eth_dev *eth_dev = NULL;
> +     struct pmd_internals *internals;
> +
> +     RTE_LOG(INFO, PMD, "Removing AF_XDP ethdev on numa socket %u\n",
> +             rte_socket_id());
> +
> +     if (!dev)
> +             return -1;
> +
> +     /* find the ethdev entry */
> +     eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
> +     if (!eth_dev)
> +             return -1;
> +
> +     internals = eth_dev->data->dev_private;
> +
> +     rte_ring_free(internals->umem->buf_ring);
> +     rte_free(internals->umem->buffer);
> +     rte_free(internals->umem);
> +     rte_free(internals);
> +
> +     rte_eth_dev_release_port(eth_dev);

This frees the 'eth_dev->data->dev_private', (internals), it can be problem to
try to free same pointer twice.

> +
> +
> +     return 0;
> +}
> +
> +static struct rte_vdev_driver pmd_af_xdp_drv = {
> +     .probe = rte_pmd_af_xdp_probe,
> +     .remove = rte_pmd_af_xdp_remove,
> +};
> +
> +RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
> +RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);

No need to create the alias, it is for backward compatibility.

> +RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
> +                           "iface=<string> "
> +                           "queue=<int> ");
> diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map 
> b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> new file mode 100644
> index 000000000..ef3539840
> --- /dev/null
> +++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> @@ -0,0 +1,4 @@
> +DPDK_2.0 {

Use release version please.

> +
> +     local: *;
> +};
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index d0ab942d5..db3271c7b 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -143,6 +143,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  += 
> -lrte_mempool_dpaa2
>  endif
>  
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp -lelf -lbpf

For which API libelf is required?

Reply via email to