From: Bobby Eshleman <[email protected]>

Every devmem dmabuf binding today hands the page_pool PAGE_SIZE niovs.
This caps a single RX descriptor at PAGE_SIZE, burning CPU on buffer
churn for large flows.

Add a bind-time netlink attribute, NETDEV_A_DMABUF_RX_BUF_SIZE, that
lets userspace request a larger niov size. The value must be a power of
two >= PAGE_SIZE.

Measurements
------------
Setup: kperf in devmem RX/TX cuda mode, 4 flows, 64 MB messages, 60s,
dctcp, num-rx-queues=4, dmabuf-rx/tx-size-mb=2048, 10 runs per niov
size, mlx5.

CPU Util:

   niov        net sirq %        net idle %         app sys %        app idle %
  -----  ----------------  ----------------  ----------------  ----------------
     4K   62.38 +/-  8.27   33.40 +/-  7.51   54.15 +/- 10.23   43.67 +/- 10.53
    16K   58.91 +/-  5.35   35.23 +/-  5.88   41.05 +/-  8.87   56.42 +/-  9.24
    32K   64.12 +/-  0.68   31.09 +/-  1.48   44.54 +/-  3.51   52.63 +/-  3.65
    64K   54.69 +/-  5.54   39.67 +/-  5.81   35.47 +/-  3.11   61.97 +/-  3.27

RX app sys % drops ~19 percentage points from 4K to 64K (54.15 -> 35.47,
roughly a 34% relative reduction).

Throughput:

   niov       RX dev Gbps   RX flow avg Gbps
  -----  ----------------  -----------------
     4K  300.63 +/- 53.21    75.16 +/- 13.30
    16K  321.35 +/- 28.20    80.34 +/-  7.05
    32K  347.63 +/-  2.20    86.91 +/-  0.55
    64K  332.11 +/- 14.26    83.03 +/-  3.56

Throughput seems to increase, but the stdev is wide enough that this could
just be noise.

kperf support (not yet merged):
https://github.com/facebookexperimental/kperf/commit/8837577f920876bce6986ec18869ac04439ebcd2

Signed-off-by: Bobby Eshleman <[email protected]>
---
 Documentation/netlink/specs/netdev.yaml |  8 +++++
 include/uapi/linux/netdev.h             |  1 +
 net/core/devmem.c                       | 52 +++++++++++++++++++--------------
 net/core/devmem.h                       | 13 ++++++---
 net/core/netdev-genl-gen.c              |  5 ++--
 net/core/netdev-genl.c                  | 18 ++++++++++--
 tools/include/uapi/linux/netdev.h       |  1 +
 7 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/Documentation/netlink/specs/netdev.yaml 
b/Documentation/netlink/specs/netdev.yaml
index a1f4c5a561e9..063119907983 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -591,6 +591,13 @@ attribute-sets:
         type: u32
         checks:
           min: 1
+      -
+        name: rx-buf-size
+        doc: |
+          Size in bytes of each RX buffer the NIC writes into from the bound
+          dmabuf. Must be a power of two and >= PAGE_SIZE; defaults to
+          PAGE_SIZE.
+        type: u32
 
 operations:
   list:
@@ -805,6 +812,7 @@ operations:
             - ifindex
             - fd
             - queues
+            - rx-buf-size
         reply:
           attributes:
             - id
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7df1056a35fd..180a4ffffd60 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -217,6 +217,7 @@ enum {
        NETDEV_A_DMABUF_QUEUES,
        NETDEV_A_DMABUF_FD,
        NETDEV_A_DMABUF_ID,
+       NETDEV_A_DMABUF_RX_BUF_SIZE,
 
        __NETDEV_A_DMABUF_MAX,
        NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 957d6b96216b..5a1c0d7984a8 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -46,7 +46,7 @@ static dma_addr_t net_devmem_get_dma_addr(const struct 
net_iov *niov)
 
        owner = net_devmem_iov_to_chunk_owner(niov);
        return owner->base_dma_addr +
-              ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
+              ((dma_addr_t)net_iov_idx(niov) << owner->binding->niov_shift);
 }
 
 static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref)
@@ -93,13 +93,14 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding 
*binding)
        ssize_t offset;
        ssize_t index;
 
-       dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
+       dma_addr = gen_pool_alloc_owner(binding->chunk_pool,
+                                       1UL << binding->niov_shift,
                                        (void **)&owner);
        if (!dma_addr)
                return NULL;
 
        offset = dma_addr - owner->base_dma_addr;
-       index = offset / PAGE_SIZE;
+       index = offset >> binding->niov_shift;
        niov = &owner->area.niovs[index];
 
        niov->desc.pp_magic = 0;
@@ -113,12 +114,13 @@ void net_devmem_free_dmabuf(struct net_iov *niov)
 {
        struct net_devmem_dmabuf_binding *binding = 
net_devmem_iov_binding(niov);
        unsigned long dma_addr = net_devmem_get_dma_addr(niov);
+       size_t niov_size = 1UL << binding->niov_shift;
 
        if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
-                                      PAGE_SIZE)))
+                                      niov_size)))
                return;
 
-       gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
+       gen_pool_free(binding->chunk_pool, dma_addr, niov_size);
 }
 
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -163,6 +165,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, 
u32 rxq_idx,
        u32 xa_idx;
        int err;
 
+       if (binding->niov_shift != PAGE_SHIFT)
+               mp_params.rx_page_size = 1U << binding->niov_shift;
+
        err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
        if (err)
                return err;
@@ -184,14 +189,16 @@ struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
-                      unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+                      unsigned int dmabuf_fd, unsigned int niov_shift,
+                      struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack)
 {
        struct net_devmem_dmabuf_binding *binding;
+       size_t niov_size = 1UL << niov_shift;
        static u32 id_alloc_next;
+       unsigned int sg_idx, i;
        struct scatterlist *sg;
        struct dma_buf *dmabuf;
-       unsigned int sg_idx, i;
        unsigned long virtual;
        int err;
 
@@ -213,6 +220,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
 
        binding->dev = dev;
        binding->vdev = vdev;
+       binding->niov_shift = niov_shift;
        xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
        err = percpu_ref_init(&binding->ref,
@@ -248,18 +256,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                        goto err_unmap;
                }
                binding->tx_vec = kvmalloc_objs(struct net_iov *,
-                                               dmabuf->size / PAGE_SIZE);
+                                               dmabuf->size >> niov_shift);
                if (!binding->tx_vec) {
                        err = -ENOMEM;
                        goto err_unmap;
                }
        }
 
-       /* For simplicity we expect to make PAGE_SIZE allocations, but the
-        * binding can be much more flexible than that. We may be able to
-        * allocate MTU sized chunks here. Leave that for future work...
-        */
-       binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+       binding->chunk_pool = gen_pool_create(niov_shift,
                                              dev_to_node(&dev->dev));
        if (!binding->chunk_pool) {
                err = -ENOMEM;
@@ -273,9 +277,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                size_t len = sg_dma_len(sg);
                struct net_iov *niov;
 
-               if (!IS_ALIGNED(len, PAGE_SIZE)) {
+               if (!IS_ALIGNED(dma_addr, niov_size) ||
+                   !IS_ALIGNED(len, niov_size)) {
                        err = -EINVAL;
-                       NL_SET_ERR_MSG(extack, "dma-buf SG length must be 
PAGE_SIZE aligned");
+                       NL_SET_ERR_MSG(extack,
+                                      "dmabuf sg entry not aligned to niov 
size");
                        goto err_free_chunks;
                }
 
@@ -288,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
 
                owner->area.base_virtual = virtual;
                owner->base_dma_addr = dma_addr;
-               owner->area.num_niovs = len / PAGE_SIZE;
+               owner->area.num_niovs = len >> niov_shift;
                owner->binding = binding;
 
                err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
@@ -313,7 +319,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                        page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
                                                      
net_devmem_get_dma_addr(niov));
                        if (direction == DMA_TO_DEVICE)
-                               binding->tx_vec[owner->area.base_virtual / 
PAGE_SIZE + i] = niov;
+                               binding->tx_vec[(owner->area.base_virtual >> 
niov_shift) + i] = niov;
                }
 
                virtual += len;
@@ -430,13 +436,15 @@ struct net_iov *
 net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
                       size_t virt_addr, size_t *off, size_t *size)
 {
+       size_t niov_size = 1UL << binding->niov_shift;
+
        if (virt_addr >= binding->dmabuf->size)
                return NULL;
 
-       *off = virt_addr % PAGE_SIZE;
-       *size = PAGE_SIZE - *off;
+       *off = virt_addr & (niov_size - 1);
+       *size = niov_size - *off;
 
-       return binding->tx_vec[virt_addr / PAGE_SIZE];
+       return binding->tx_vec[virt_addr >> binding->niov_shift];
 }
 
 /*** "Dmabuf devmem memory provider" ***/
@@ -454,8 +462,8 @@ int mp_dmabuf_devmem_init(struct page_pool *pool)
        pool->dma_sync = false;
        pool->dma_sync_for_cpu = false;
 
-       if (pool->p.order != 0)
-               return -E2BIG;
+       if (pool->p.order != binding->niov_shift - PAGE_SHIFT)
+               return -EINVAL;
 
        net_devmem_dmabuf_binding_get(binding);
        return 0;
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 3852a56036cb..4a293a7d1149 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -71,6 +71,8 @@ struct net_devmem_dmabuf_binding {
         */
        struct net_iov **tx_vec;
 
+       unsigned int niov_shift;
+
        struct work_struct unbind_w;
 };
 
@@ -93,7 +95,8 @@ struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
-                      unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+                      unsigned int dmabuf_fd, unsigned int niov_shift,
+                      struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack);
 struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
@@ -122,10 +125,11 @@ static inline u32 net_devmem_iov_binding_id(const struct 
net_iov *niov)
 
 static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 {
-       struct net_iov_area *owner = net_iov_owner(niov);
+       struct dmabuf_genpool_chunk_owner *co =
+               net_devmem_iov_to_chunk_owner(niov);
 
-       return owner->base_virtual +
-              ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
+       return net_iov_owner(niov)->base_virtual +
+              ((unsigned long)net_iov_idx(niov) << co->binding->niov_shift);
 }
 
 static inline bool
@@ -175,6 +179,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, void *vdev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd,
+                      unsigned int niov_shift,
                       struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack)
 {
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index c7e138bfe345..55e03b9cd227 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -106,10 +106,11 @@ static const struct nla_policy 
netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE
 };
 
 /* NETDEV_CMD_BIND_RX - do */
-static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 
1] = {
+static const struct nla_policy 
netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_RX_BUF_SIZE + 1] = {
        [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
        [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
        [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+       [NETDEV_A_DMABUF_RX_BUF_SIZE] = { .type = NLA_U32, },
 };
 
 /* NETDEV_CMD_NAPI_SET - do */
@@ -219,7 +220,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
                .cmd            = NETDEV_CMD_BIND_RX,
                .doit           = netdev_nl_bind_rx_doit,
                .policy         = netdev_bind_rx_nl_policy,
-               .maxattr        = NETDEV_A_DMABUF_FD,
+               .maxattr        = NETDEV_A_DMABUF_RX_BUF_SIZE,
                .flags          = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
        },
        {
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index b4d48f3672a5..9902a97698f5 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1012,6 +1012,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct 
genl_info *info)
 {
        struct net_devmem_dmabuf_binding *binding;
        u32 ifindex, dmabuf_fd, rxq_idx;
+       unsigned int niov_shift = PAGE_SHIFT;
        struct netdev_nl_sock *priv;
        struct net_device *netdev;
        unsigned long *rxq_bitmap;
@@ -1028,6 +1029,18 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct 
genl_info *info)
        ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
        dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
 
+       if (info->attrs[NETDEV_A_DMABUF_RX_BUF_SIZE]) {
+               u32 rx_buf_size = 
nla_get_u32(info->attrs[NETDEV_A_DMABUF_RX_BUF_SIZE]);
+
+               if (!rx_buf_size || !is_power_of_2(rx_buf_size) ||
+                   rx_buf_size < PAGE_SIZE) {
+                       NL_SET_ERR_MSG(info->extack,
+                                      "rx_buf_size must be a power of 2 >= 
PAGE_SIZE");
+                       return -EINVAL;
+               }
+               niov_shift = ilog2(rx_buf_size);
+       }
+
        priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
        if (IS_ERR(priv))
                return PTR_ERR(priv);
@@ -1078,7 +1091,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct 
genl_info *info)
        }
 
        binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE,
-                                        dmabuf_fd, priv, info->extack);
+                                        dmabuf_fd, niov_shift, priv,
+                                        info->extack);
        if (IS_ERR(binding)) {
                err = PTR_ERR(binding);
                goto err_rxq_bitmap;
@@ -1221,7 +1235,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct 
genl_info *info)
        binding = net_devmem_bind_dmabuf(bind_dev,
                                         bind_dev != netdev ? netdev : NULL,
                                         dma_dev, DMA_TO_DEVICE, dmabuf_fd,
-                                        priv, info->extack);
+                                        PAGE_SHIFT, priv, info->extack);
        if (IS_ERR(binding)) {
                err = PTR_ERR(binding);
                goto err_unlock_bind_dev;
diff --git a/tools/include/uapi/linux/netdev.h 
b/tools/include/uapi/linux/netdev.h
index 7df1056a35fd..180a4ffffd60 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -217,6 +217,7 @@ enum {
        NETDEV_A_DMABUF_QUEUES,
        NETDEV_A_DMABUF_FD,
        NETDEV_A_DMABUF_ID,
+       NETDEV_A_DMABUF_RX_BUF_SIZE,
 
        __NETDEV_A_DMABUF_MAX,
        NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1)

-- 
2.53.0-Meta


Reply via email to