From: Bobby Eshleman <[email protected]>

When a netkit virtual device leases queues from a physical NIC, devmem
TX bindings created on the netkit device should use the physical NIC
for DMA operations rather than the virtual device, which has no DMA
capability.

In bind_tx_doit, walk the device's leased rx queues to discover the
underlying physical device that supports netmem_tx. Use this device
for DMA device lookup and pass it as the real_tx_dev in the binding.
When real_tx_dev is set, it is also used for NUMA-local allocations.

Extend validate_xmit_unreadable_skb() to support the netkit case, where
the skb is validated twice: once on the netkit guest device and again on
the physical NIC after BPF redirect or ip forwarding. Both invocations
must pass for the skb to be transmitted.

Signed-off-by: Bobby Eshleman <[email protected]>
---
 net/core/dev.c         | 26 +++++++++++++++++++-------
 net/core/devmem.c      | 16 ++++++++++------
 net/core/devmem.h      |  6 ++++--
 net/core/netdev-genl.c | 38 +++++++++++++++++++++++++++++++++-----
 4 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ca4b26dfb1bd..105bd27be024 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3981,24 +3981,36 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
                                                    struct net_device *dev)
 {
+       struct net_devmem_dmabuf_binding *binding;
        struct skb_shared_info *shinfo;
+       struct net_device *real_tx_dev;
        struct net_iov *niov;
 
        if (likely(skb_frags_readable(skb)))
                goto out;
 
-       if (!dev->netmem_tx)
-               goto out_free;
-
        shinfo = skb_shinfo(skb);
+       if (shinfo->nr_frags == 0)
+               goto out;
 
-       if (shinfo->nr_frags > 0) {
-               niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
-               if (net_is_devmem_iov(niov) &&
-                   READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev)
+       niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+       if (!net_is_devmem_iov(niov))
+               goto out;
+
+       binding = net_devmem_iov_binding(niov);
+       real_tx_dev = READ_ONCE(binding->real_tx_dev);
+
+       if (real_tx_dev) {
+               if (!real_tx_dev->netmem_tx)
+                       goto out_free;
+               if (READ_ONCE(binding->dev) != dev && real_tx_dev != dev)
                        goto out_free;
+               goto out;
        }
 
+       if (READ_ONCE(binding->dev) != dev || !dev->netmem_tx)
+               goto out_free;
+
 out:
        return skb;
 
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 7ede81509968..a4148cba5b5f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -181,12 +181,13 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
                       struct netlink_ext_ack *extack)
 {
+       struct net_device *node_dev = real_tx_dev ?: dev;
        struct net_devmem_dmabuf_binding *binding;
        static u32 id_alloc_next;
        struct scatterlist *sg;
@@ -205,13 +206,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
                return ERR_CAST(dmabuf);
 
        binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
-                              dev_to_node(&dev->dev));
+                              dev_to_node(&node_dev->dev));
        if (!binding) {
                err = -ENOMEM;
                goto err_put_dmabuf;
        }
 
        binding->dev = dev;
+       binding->real_tx_dev = real_tx_dev;
        xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
        err = percpu_ref_init(&binding->ref,
@@ -254,7 +256,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
         * allocate MTU sized chunks here. Leave that for future work...
         */
        binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
-                                             dev_to_node(&dev->dev));
+                                             dev_to_node(&node_dev->dev));
        if (!binding->chunk_pool) {
                err = -ENOMEM;
                goto err_tx_vec;
@@ -268,7 +270,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
                struct net_iov *niov;
 
                owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
-                                    dev_to_node(&dev->dev));
+                                    dev_to_node(&node_dev->dev));
                if (!owner) {
                        err = -ENOMEM;
                        goto err_free_chunks;
@@ -280,7 +282,8 @@ net_devmem_bind_dmabuf(struct net_device *dev,
                owner->binding = binding;
 
                err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
-                                        dma_addr, len, dev_to_node(&dev->dev),
+                                        dma_addr, len,
+                                        dev_to_node(&node_dev->dev),
                                         owner);
                if (err) {
                        kfree(owner);
@@ -397,7 +400,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
         */
        dst_dev = dst_dev_rcu(dst);
        if (unlikely(!dst_dev) ||
-           unlikely(dst_dev != READ_ONCE(binding->dev))) {
+           unlikely(dst_dev != READ_ONCE(binding->dev) &&
+                    dst_dev != READ_ONCE(binding->real_tx_dev))) {
                err = -ENODEV;
                goto out_unlock;
        }
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 1c5c18581fcb..ffcf97a33633 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding {
        struct dma_buf_attachment *attachment;
        struct sg_table *sgt;
        struct net_device *dev;
+       /* Phys dev behind a virtual dev (e.g. netkit) with a queue lease. */
+       struct net_device *real_tx_dev;
        struct gen_pool *chunk_pool;
        /* Protect dev */
        struct mutex lock;
@@ -84,7 +86,7 @@ struct dmabuf_genpool_chunk_owner {
 
 void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
@@ -165,7 +167,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
 }
 
 static inline struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
                       struct device *dma_dev,
                       enum dma_data_direction direction,
                       unsigned int dmabuf_fd,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 7d073894ca74..2b34924dc30f 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1037,7 +1037,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
                goto err_rxq_bitmap;
        }
 
-       binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+       binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE,
                                         dmabuf_fd, priv, info->extack);
        if (IS_ERR(binding)) {
                err = PTR_ERR(binding);
@@ -1082,6 +1082,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 {
        struct net_devmem_dmabuf_binding *binding;
+       struct net_device *real_tx_dev = NULL;
+       struct netdev_rx_queue *lease_rxq;
        struct netdev_nl_sock *priv;
        struct net_device *netdev;
        struct device *dma_dev;
@@ -1089,6 +1091,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
        struct sk_buff *rsp;
        int err = 0;
        void *hdr;
+       int i;
 
        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
            GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
@@ -1124,16 +1127,41 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
                goto err_unlock_netdev;
        }
 
-       if (!netdev->netmem_tx) {
+       for (i = 0; i < netdev->real_num_rx_queues; i++) {
+               lease_rxq = READ_ONCE(__netif_get_rx_queue(netdev, i)->lease);
+
+               if (!lease_rxq)
+                       continue;
+
+               real_tx_dev = lease_rxq->dev;
+               break;
+       }
+
+       if (real_tx_dev) {
+               if (!netif_device_present(real_tx_dev)) {
+                       err = -ENODEV;
+                       goto err_unlock_netdev;
+               }
+
+               if (!real_tx_dev->netmem_tx) {
+                       err = -EOPNOTSUPP;
+                       NL_SET_ERR_MSG(info->extack,
+                                      "Driver for queue lease device does not support netmem TX");
+                       goto err_unlock_netdev;
+               }
+       }
+
+       if (!real_tx_dev && !netdev->netmem_tx) {
                err = -EOPNOTSUPP;
                NL_SET_ERR_MSG(info->extack,
                               "Driver does not support netmem TX");
                goto err_unlock_netdev;
        }
 
-       dma_dev = netdev_queue_get_dma_dev(netdev, 0);
-       binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
-                                        dmabuf_fd, priv, info->extack);
+       dma_dev = netdev_queue_get_dma_dev(real_tx_dev ?: netdev, 0);
+       binding = net_devmem_bind_dmabuf(netdev, real_tx_dev, dma_dev,
+                                        DMA_TO_DEVICE, dmabuf_fd, priv,
+                                        info->extack);
        if (IS_ERR(binding)) {
                err = PTR_ERR(binding);
                goto err_unlock_netdev;

-- 
2.52.0


Reply via email to