From: Bobby Eshleman <[email protected]>

When a netkit virtual device leases queues from a physical NIC, devmem TX bindings created on the netkit device should use the physical NIC for DMA operations rather than the virtual device, which has no DMA capability.
In bind_tx_doit, walk the device's leased rx queues to discover the underlying physical device that supports netmem_tx. Use this device for DMA device lookup and pass it as the real_tx_dev in the binding. When real_tx_dev is set, it is also used for NUMA-local allocations. Extend validate_xmit_unreadable_skb() to support the netkit case, where the skb is validated twice: once on the netkit guest device and again on the physical NIC after BPF redirect or ip forwarding. Both invocations must pass for the skb to be transmitted. Signed-off-by: Bobby Eshleman <[email protected]> --- net/core/dev.c | 26 +++++++++++++++++++------- net/core/devmem.c | 16 ++++++++++------ net/core/devmem.h | 6 ++++-- net/core/netdev-genl.c | 38 +++++++++++++++++++++++++++++++++----- 4 files changed, 66 insertions(+), 20 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ca4b26dfb1bd..105bd27be024 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,24 +3981,36 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { + struct net_devmem_dmabuf_binding *binding; struct skb_shared_info *shinfo; + struct net_device *real_tx_dev; struct net_iov *niov; if (likely(skb_frags_readable(skb))) goto out; - if (!dev->netmem_tx) - goto out_free; - shinfo = skb_shinfo(skb); + if (shinfo->nr_frags == 0) + goto out; - if (shinfo->nr_frags > 0) { - niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); - if (net_is_devmem_iov(niov) && - READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev) + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (!net_is_devmem_iov(niov)) + goto out; + + binding = net_devmem_iov_binding(niov); + real_tx_dev = READ_ONCE(binding->real_tx_dev); + + if (real_tx_dev) { + if (!real_tx_dev->netmem_tx) + goto out_free; + if (READ_ONCE(binding->dev) != dev && real_tx_dev != dev) goto out_free; + goto out; } + if (READ_ONCE(binding->dev) != 
dev || !dev->netmem_tx) + goto out_free; + out: return skb; diff --git a/net/core/devmem.c b/net/core/devmem.c index 7ede81509968..a4148cba5b5f 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -181,12 +181,13 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, +net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev, struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { + struct net_device *node_dev = real_tx_dev ?: dev; struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; struct scatterlist *sg; @@ -205,13 +206,14 @@ net_devmem_bind_dmabuf(struct net_device *dev, return ERR_CAST(dmabuf); binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, - dev_to_node(&dev->dev)); + dev_to_node(&node_dev->dev)); if (!binding) { err = -ENOMEM; goto err_put_dmabuf; } binding->dev = dev; + binding->real_tx_dev = real_tx_dev; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); err = percpu_ref_init(&binding->ref, @@ -254,7 +256,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, * allocate MTU sized chunks here. Leave that for future work... 
*/ binding->chunk_pool = gen_pool_create(PAGE_SHIFT, - dev_to_node(&dev->dev)); + dev_to_node(&node_dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; goto err_tx_vec; @@ -268,7 +270,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, struct net_iov *niov; owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, - dev_to_node(&dev->dev)); + dev_to_node(&node_dev->dev)); if (!owner) { err = -ENOMEM; goto err_free_chunks; @@ -280,7 +282,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, - dma_addr, len, dev_to_node(&dev->dev), + dma_addr, len, + dev_to_node(&node_dev->dev), owner); if (err) { kfree(owner); @@ -397,7 +400,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, */ dst_dev = dst_dev_rcu(dst); if (unlikely(!dst_dev) || - unlikely(dst_dev != READ_ONCE(binding->dev))) { + unlikely(dst_dev != READ_ONCE(binding->dev) && + dst_dev != READ_ONCE(binding->real_tx_dev))) { err = -ENODEV; goto out_unlock; } diff --git a/net/core/devmem.h b/net/core/devmem.h index 1c5c18581fcb..ffcf97a33633 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding { struct dma_buf_attachment *attachment; struct sg_table *sgt; struct net_device *dev; + /* Phys dev behind a virtual dev (e.g. netkit) with a queue lease. 
*/ + struct net_device *real_tx_dev; struct gen_pool *chunk_pool; /* Protect dev */ struct mutex lock; @@ -84,7 +86,7 @@ struct dmabuf_genpool_chunk_owner { void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, +net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev, struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, @@ -165,7 +167,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, +net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev, struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7d073894ca74..2b34924dc30f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -1037,7 +1037,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_rxq_bitmap; } - binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, + binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); @@ -1082,6 +1082,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; + struct net_device *real_tx_dev = NULL; + struct netdev_rx_queue *lease_rxq; struct netdev_nl_sock *priv; struct net_device *netdev; struct device *dma_dev; @@ -1089,6 +1091,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) struct sk_buff *rsp; int err = 0; void *hdr; + int i; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) @@ -1124,16 +1127,41 @@ int 
netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } - if (!netdev->netmem_tx) { + for (i = 0; i < netdev->real_num_rx_queues; i++) { + lease_rxq = READ_ONCE(__netif_get_rx_queue(netdev, i)->lease); + + if (!lease_rxq) + continue; + + real_tx_dev = lease_rxq->dev; + break; + } + + if (real_tx_dev) { + if (!netif_device_present(real_tx_dev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!real_tx_dev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver for queue lease device does not support netmem TX"); + goto err_unlock_netdev; + } + } + + if (!real_tx_dev && !netdev->netmem_tx) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(info->extack, "Driver does not support netmem TX"); goto err_unlock_netdev; } - dma_dev = netdev_queue_get_dma_dev(netdev, 0); - binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, - dmabuf_fd, priv, info->extack); + dma_dev = netdev_queue_get_dma_dev(real_tx_dev ?: netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, real_tx_dev, dma_dev, + DMA_TO_DEVICE, dmabuf_fd, priv, + info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; -- 2.52.0

