On 06/03, Bobby Eshleman wrote:
> From: Bobby Eshleman <[email protected]>
>
> Every devmem dmabuf binding today hands the page_pool PAGE_SIZE niovs.
> This caps a single RX descriptor at PAGE_SIZE, burning CPU on buffer
> churn for large flows.
>
> Add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that
> lets userspace request a larger niov size. The value must be a power of
> two >= PAGE_SIZE.
>
> Measurements
> ------------
> Setup: kperf in devmem RX/TX cuda mode, 4 flows, 64 MB messages, 60s,
> dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov
> size, mlx5.
>
> CPU Util:
>
> niov net sirq % net idle % app sys % app idle
> %
> ----- ---------------- ---------------- ----------------
> ----------------
> 4K 62.38 +/- 8.27 33.40 +/- 7.51 54.15 +/- 10.23 43.67 +/-
> 10.53
> 16K 58.91 +/- 5.35 35.23 +/- 5.88 41.05 +/- 8.87 56.42 +/-
> 9.24
> 32K 64.12 +/- 0.68 31.09 +/- 1.48 44.54 +/- 3.51 52.63 +/-
> 3.65
> 64K 54.69 +/- 5.54 39.67 +/- 5.81 35.47 +/- 3.11 61.97 +/-
> 3.27
>
> RX app sys % drops ~19% from 4K to 64K.
>
> Throughput:
>
> niov RX dev Gbps RX flow avg Gbps
> ----- ---------------- -----------------
> 4K 300.63 +/- 53.21 75.16 +/- 13.30
> 16K 321.35 +/- 28.20 80.34 +/- 7.05
> 32K 347.63 +/- 2.20 86.91 +/- 0.55
> 64K 332.11 +/- 14.26 83.03 +/- 3.56
>
> Throughput seems to increase, but the stdev is pretty wide so could just
> be noise.
>
> kperf support (not yet merged):
> https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec18869ac04439ebcd2
>
> Signed-off-by: Bobby Eshleman <[email protected]>
> ---
> Documentation/netlink/specs/netdev.yaml | 8 +++++
> include/uapi/linux/netdev.h | 1 +
> net/core/devmem.c | 52
> +++++++++++++++++++--------------
> net/core/devmem.h | 13 ++++++---
> net/core/netdev-genl-gen.c | 5 ++--
> net/core/netdev-genl.c | 18 ++++++++++--
> tools/include/uapi/linux/netdev.h | 1 +
> 7 files changed, 68 insertions(+), 30 deletions(-)
>
> diff --git a/Documentation/netlink/specs/netdev.yaml
> b/Documentation/netlink/specs/netdev.yaml
> index a1f4c5a561e9..063119907983 100644
> --- a/Documentation/netlink/specs/netdev.yaml
> +++ b/Documentation/netlink/specs/netdev.yaml
> @@ -591,6 +591,13 @@ attribute-sets:
> type: u32
> checks:
> min: 1
> + -
> + name: rx-buf-size
> + doc: |
> + Size in bytes of each RX buffer the NIC writes into from the bound
> + dmabuf. Must be a power of two and >= PAGE_SIZE; defaults to
> + PAGE_SIZE.
> + type: u32
>
> operations:
> list:
> @@ -805,6 +812,7 @@ operations:
> - ifindex
> - fd
> - queues
> + - rx-buf-size
> reply:
> attributes:
> - id
> diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
> index 7df1056a35fd..180a4ffffd60 100644
> --- a/include/uapi/linux/netdev.h
> +++ b/include/uapi/linux/netdev.h
> @@ -217,6 +217,7 @@ enum {
> NETDEV_A_DMABUF_QUEUES,
> NETDEV_A_DMABUF_FD,
> NETDEV_A_DMABUF_ID,
> + NETDEV_A_DMABUF_RX_BUF_SIZE,
>
> __NETDEV_A_DMABUF_MAX,
> NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)
> diff --git a/net/core/devmem.c b/net/core/devmem.c
> index 957d6b96216b..5a1c0d7984a8 100644
> --- a/net/core/devmem.c
> +++ b/net/core/devmem.c
> @@ -46,7 +46,7 @@ static dma_addr_t net_devmem_get_dma_addr(const struct
> net_iov *niov)
>
> owner = net_devmem_iov_to_chunk_owner(niov);
> return owner->base_dma_addr +
> - ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
> + ((dma_addr_t)net_iov_idx(niov) << owner->binding->niov_shift);
> }
>
> static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref)
> @@ -93,13 +93,14 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding
> *binding)
> ssize_t offset;
> ssize_t index;
>
> - dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
> + dma_addr = gen_pool_alloc_owner(binding->chunk_pool,
> + 1UL << binding->niov_shift,
> (void **)&owner);
> if (!dma_addr)
> return NULL;
>
> offset = dma_addr - owner->base_dma_addr;
> - index = offset / PAGE_SIZE;
> + index = offset >> binding->niov_shift;
> niov = &owner->area.niovs[index];
>
> niov->desc.pp_magic = 0;
> @@ -113,12 +114,13 @@ void net_devmem_free_dmabuf(struct net_iov *niov)
> {
> struct net_devmem_dmabuf_binding *binding =
> net_devmem_iov_binding(niov);
> unsigned long dma_addr = net_devmem_get_dma_addr(niov);
> + size_t niov_size = 1UL << binding->niov_shift;
>
> if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
> - PAGE_SIZE)))
> + niov_size)))
> return;
>
> - gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
> + gen_pool_free(binding->chunk_pool, dma_addr, niov_size);
> }
>
> void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
> @@ -163,6 +165,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device
> *dev, u32 rxq_idx,
> u32 xa_idx;
> int err;
>
> + if (binding->niov_shift != PAGE_SHIFT)
> + mp_params.rx_page_size = 1U << binding->niov_shift;
> +
> err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
> if (err)
> return err;
> @@ -184,14 +189,16 @@ struct net_devmem_dmabuf_binding *
> net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
> struct device *dma_dev,
> enum dma_data_direction direction,
> - unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
> + unsigned int dmabuf_fd, unsigned int niov_shift,
> + struct netdev_nl_sock *priv,
> struct netlink_ext_ack *extack)
> {
> struct net_devmem_dmabuf_binding *binding;
> + size_t niov_size = 1UL << niov_shift;
> static u32 id_alloc_next;
> + unsigned int sg_idx, i;
> struct scatterlist *sg;
> struct dma_buf *dmabuf;
> - unsigned int sg_idx, i;
> unsigned long virtual;
> int err;
>
> @@ -213,6 +220,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
>
> binding->dev = dev;
> binding->vdev = vdev;
> + binding->niov_shift = niov_shift;
> xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
>
> err = percpu_ref_init(&binding->ref,
> @@ -248,18 +256,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, void
> *vdev,
> goto err_unmap;
> }
> binding->tx_vec = kvmalloc_objs(struct net_iov *,
> - dmabuf->size / PAGE_SIZE);
> + dmabuf->size >> niov_shift);
> if (!binding->tx_vec) {
> err = -ENOMEM;
> goto err_unmap;
> }
> }
>
> - /* For simplicity we expect to make PAGE_SIZE allocations, but the
> - * binding can be much more flexible than that. We may be able to
> - * allocate MTU sized chunks here. Leave that for future work...
> - */
> - binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
> + binding->chunk_pool = gen_pool_create(niov_shift,
> dev_to_node(&dev->dev));
> if (!binding->chunk_pool) {
> err = -ENOMEM;
> @@ -273,9 +277,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, void
> *vdev,
> size_t len = sg_dma_len(sg);
> struct net_iov *niov;
>
> - if (!IS_ALIGNED(len, PAGE_SIZE)) {
> + if (!IS_ALIGNED(dma_addr, niov_size) ||
> + !IS_ALIGNED(len, niov_size)) {
> err = -EINVAL;
> - NL_SET_ERR_MSG(extack, "dma-buf SG length must be
> PAGE_SIZE aligned");
> + NL_SET_ERR_MSG(extack,
> + "dmabuf sg entry not aligned to niov
> size");
nit: should we NL_SET_ERR_MSG_FMT here and export chunk len and expected
alignment?
> goto err_free_chunks;
> }
>
> @@ -288,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
>
> owner->area.base_virtual = virtual;
> owner->base_dma_addr = dma_addr;
> - owner->area.num_niovs = len / PAGE_SIZE;
> + owner->area.num_niovs = len >> niov_shift;
> owner->binding = binding;
>
> err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
> @@ -313,7 +319,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
> page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
>
> net_devmem_get_dma_addr(niov));
> if (direction == DMA_TO_DEVICE)
> - binding->tx_vec[owner->area.base_virtual /
> PAGE_SIZE + i] = niov;
> + binding->tx_vec[(owner->area.base_virtual >>
> niov_shift) + i] = niov;
> }
>
> virtual += len;
> @@ -430,13 +436,15 @@ struct net_iov *
> net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
> size_t virt_addr, size_t *off, size_t *size)
> {
> + size_t niov_size = 1UL << binding->niov_shift;
> +
> if (virt_addr >= binding->dmabuf->size)
> return NULL;
>
> - *off = virt_addr % PAGE_SIZE;
> - *size = PAGE_SIZE - *off;
> + *off = virt_addr & (niov_size - 1);
> + *size = niov_size - *off;
>
> - return binding->tx_vec[virt_addr / PAGE_SIZE];
> + return binding->tx_vec[virt_addr >> binding->niov_shift];
> }
>
> /*** "Dmabuf devmem memory provider" ***/
> @@ -454,8 +462,8 @@ int mp_dmabuf_devmem_init(struct page_pool *pool)
> pool->dma_sync = false;
> pool->dma_sync_for_cpu = false;
>
> - if (pool->p.order != 0)
> - return -E2BIG;
> + if (pool->p.order != binding->niov_shift - PAGE_SHIFT)
> + return -EINVAL;
Any specific reason you change E2BIG to EINVAL?