ndo_set_rx_mode is problematic because it cannot sleep. Some drivers work around this by deferring the rx_mode work to a work item. This requires extra per-driver code that could be avoided if the core provided a mechanism for it. This patch proposes such a mechanism.
Refactor set_rx_mode into two stages: a snapshot stage and the actual I/O. In this new model, when __dev_set_rx_mode is called, we take a snapshot of the current rx_config and then commit it to the hardware later via a work item. To accomplish this, reinterpret set_rx_mode as the ndo for customizing the snapshot and enabling/disabling rx_mode set, and add a new ndo, write_rx_mode, for the deferred I/O. Suggested-by: Jakub Kicinski <[email protected]> Signed-off-by: I Viswanath <[email protected]> --- include/linux/netdevice.h | 113 +++++++++++++++- net/core/dev.c | 270 +++++++++++++++++++++++++++++++++++++- 2 files changed, 375 insertions(+), 8 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5870a9e514a5..43f2904d5b39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1062,6 +1062,45 @@ struct netdev_net_notifier { struct notifier_block *nb; }; +struct netif_deferred_work_cleanup { + struct work_struct cleanup_work; + struct net_device *dev; +}; + +enum netif_rx_mode_cfg_flags { + NETIF_RX_MODE_CFG_ALLMULTI, + NETIF_RX_MODE_CFG_PROMISC, + NETIF_RX_MODE_CFG_VLAN +}; + +enum netif_rx_mode_ctrl_flags { + /* pending config state */ + NETIF_RX_MODE_CFG_READY, + + /* if set, rx_mode set work will be skipped */ + NETIF_RX_MODE_SET_SKIP, + + /* if set, uc/mc lists will not be part of rx_mode config */ + NETIF_RX_MODE_UC_SKIP, + NETIF_RX_MODE_MC_SKIP +}; + +struct netif_rx_mode_config { + char *uc_addrs; + char *mc_addrs; + int uc_count; + int mc_count; + int cfg_bits; +}; + +struct netif_rx_mode_work_ctx { + struct netif_rx_mode_config *pending; + struct netif_rx_mode_config *ready; + struct work_struct rx_mode_work; + struct net_device *dev; + int ctrl_bits; +}; + /* * This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are @@ -1114,9 +1153,14 @@ struct netdev_net_notifier { * changes to configuration when multicast or promiscuous is enabled. * * void (*ndo_set_rx_mode)(struct net_device *dev); - * This function is called device changes address list filtering. + * This function is called when device changes address list filtering. * If driver handles unicast address filtering, it should set - * IFF_UNICAST_FLT in its priv_flags. + * IFF_UNICAST_FLT in its priv_flags. This is used to configure + * the rx_mode snapshot that will be written to the hardware. + * + * void (*ndo_write_rx_mode)(struct net_device *dev); + * This function is scheduled after set_rx_mode and is responsible for + * writing the rx_mode snapshot to the hardware. * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address @@ -1437,6 +1481,7 @@ struct net_device_ops { void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); + void (*ndo_write_rx_mode)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); @@ -1939,7 +1984,7 @@ enum netdev_reg_state { * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address - * + * @rx_mode_ctx: Context required for rx_mode work * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, * indexed by RX queue number. Assigned by driver. * This must only be set if the ndo_rx_flow_steer @@ -1971,6 +2016,9 @@ enum netdev_reg_state { * @link_watch_list: XXX: need comments on this one * * @reg_state: Register/unregister state machine + * @needs_deferred_cleanup:Should dev_close schedule cleanup of + * deferred work? 
+ * @deferred_work_cleanup: Context required for cleanup of deferred work * @dismantle: Device is going to be freed * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister @@ -2350,6 +2398,7 @@ struct net_device { #endif unsigned char broadcast[MAX_ADDR_LEN]; + struct netif_rx_mode_work_ctx *rx_mode_ctx; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; #endif @@ -2387,6 +2436,10 @@ struct net_device { u8 reg_state; + bool needs_deferred_cleanup; + + struct netif_deferred_work_cleanup *deferred_work_cleanup; + bool dismantle; /** @moving_ns: device is changing netns, protected by @lock */ @@ -3373,6 +3426,60 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); +/* Helpers to be used in the set_rx_mode implementation */ +static inline void netif_rx_mode_set_cfg_bit(struct net_device *dev, int b, + bool val) +{ + if (val) + dev->rx_mode_ctx->pending->cfg_bits |= BIT(b); + else + dev->rx_mode_ctx->pending->cfg_bits &= ~BIT(b); +} + +static inline void netif_rx_mode_set_ctrl_bit(struct net_device *dev, int b, + bool val) +{ + if (val) + dev->rx_mode_ctx->ctrl_bits |= BIT(b); + else + dev->rx_mode_ctx->ctrl_bits &= ~BIT(b); +} + +/* Helper to be used in the write_rx_mode implementation */ +static inline int netif_rx_mode_get_cfg_bit(struct net_device *dev, int b) +{ + return !!(dev->rx_mode_ctx->ready->cfg_bits & BIT(b)); +} + +static inline int netif_rx_mode_get_ctrl_bit(struct net_device *dev, int b) +{ + return !!(dev->rx_mode_ctx->ctrl_bits & BIT(b)); +} + +static inline int netif_rx_mode_get_mc_count(struct net_device *dev) +{ + return dev->rx_mode_ctx->ready->mc_count; +} + +static inline int netif_rx_mode_get_uc_count(struct net_device *dev) +{ + return dev->rx_mode_ctx->ready->uc_count; +} + +void netif_rx_mode_schedule_work(struct net_device *dev); + +void netif_rx_mode_flush_work(struct 
net_device *dev); + +#define netif_rx_mode_for_each_uc_addr(dev, ha_addr, idx) \ + for (ha_addr = (dev)->rx_mode_ctx->ready->uc_addrs, idx = 0; \ + idx < netif_rx_mode_get_uc_count((dev)); \ + ha_addr += (dev)->addr_len, idx++) + +#define netif_rx_mode_for_each_mc_addr(dev, ha_addr, idx) \ + for (ha_addr = (dev)->rx_mode_ctx->ready->mc_addrs, idx = 0; \ + idx < netif_rx_mode_get_mc_count((dev)); \ + ha_addr += (dev)->addr_len, idx++) + int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); diff --git a/net/core/dev.c b/net/core/dev.c index 9094c0fb8c68..ee6367ef43ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1587,6 +1587,211 @@ void netif_state_change(struct net_device *dev) } } +/* This function attempts to copy the current state of the + * net device into pending (reallocating if necessary). If it fails, + * pending is guaranteed to be unmodified. + */ +static int netif_rx_mode_alloc_and_fill_pending(struct net_device *dev) +{ + struct netif_rx_mode_config *pending = dev->rx_mode_ctx->pending; + bool skip_uc = false, skip_mc = false; + int uc_count = 0, mc_count = 0; + struct netdev_hw_addr *ha; + char *tmp; + int i; + + skip_uc = netif_rx_mode_get_ctrl_bit(dev, NETIF_RX_MODE_UC_SKIP); + skip_mc = netif_rx_mode_get_ctrl_bit(dev, NETIF_RX_MODE_MC_SKIP); + + /* The allocations need to be atomic since this will be called under + * netif_addr_lock_bh() + */ + if (!skip_uc) { + uc_count = netdev_uc_count(dev); + tmp = krealloc(pending->uc_addrs, uc_count * dev->addr_len, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + pending->uc_addrs = tmp; + } + + if (!skip_mc) { + mc_count = netdev_mc_count(dev); + tmp = krealloc(pending->mc_addrs, mc_count * dev->addr_len, + GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + pending->mc_addrs = tmp; + } + + /* This function cannot fail after this point */ + + /* This is going to be the same for every single driver. 
Better to + * do it here than in the set_rx_mode impl + */ + netif_rx_mode_set_cfg_bit(dev, NETIF_RX_MODE_CFG_ALLMULTI, + !!(dev->flags & IFF_ALLMULTI)); + + netif_rx_mode_set_cfg_bit(dev, NETIF_RX_MODE_CFG_PROMISC, + !!(dev->flags & IFF_PROMISC)); + + i = 0; + if (!skip_uc) { + pending->uc_count = uc_count; + netdev_for_each_uc_addr(ha, dev) + memcpy(pending->uc_addrs + (i++) * dev->addr_len, + ha->addr, dev->addr_len); + } + + i = 0; + if (!skip_mc) { + pending->mc_count = mc_count; + netdev_for_each_mc_addr(ha, dev) + memcpy(pending->mc_addrs + (i++) * dev->addr_len, + ha->addr, dev->addr_len); + } + return 0; +} + +static void netif_rx_mode_prepare_pending(struct net_device *dev) +{ + lockdep_assert_held(&dev->addr_list_lock); + int rc; + + rc = netif_rx_mode_alloc_and_fill_pending(dev); + if (rc) + return; + + netif_rx_mode_set_ctrl_bit(dev, NETIF_RX_MODE_CFG_READY, true); +} + +static void netif_rx_mode_write_rx_mode(struct work_struct *param) +{ + rtnl_lock(); + + struct netif_rx_mode_work_ctx *rx_mode_ctx = container_of(param, + struct netif_rx_mode_work_ctx, rx_mode_work); + + struct net_device *dev = rx_mode_ctx->dev; + + if (!netif_running(dev)) { + rtnl_unlock(); + return; + } + + /* Paranoia. 
*/ + if (WARN_ON(!dev->netdev_ops->ndo_write_rx_mode)) { + rtnl_unlock(); + return; + } + + /* We could introduce a new lock for this but reusing the addr + * lock works well enough + */ + netif_addr_lock_bh(dev); + + /* There's no point continuing if the pending config is not ready */ + if (!netif_rx_mode_get_ctrl_bit(dev, NETIF_RX_MODE_CFG_READY)) { + netif_addr_unlock_bh(dev); + rtnl_unlock(); + return; + } + + swap(rx_mode_ctx->ready, rx_mode_ctx->pending); + netif_rx_mode_set_ctrl_bit(dev, NETIF_RX_MODE_CFG_READY, false); + + netif_addr_unlock_bh(dev); + + dev->netdev_ops->ndo_write_rx_mode(dev); + rtnl_unlock(); +} + +static int netif_alloc_rx_mode_ctx(struct net_device *dev) +{ + dev->rx_mode_ctx = kzalloc(sizeof(*dev->rx_mode_ctx), GFP_KERNEL); + if (!dev->rx_mode_ctx) + goto fail_all; + + dev->rx_mode_ctx->ready = kzalloc(sizeof(*dev->rx_mode_ctx->ready), + GFP_KERNEL); + if (!dev->rx_mode_ctx->ready) + goto fail_ready; + + dev->rx_mode_ctx->pending = kzalloc(sizeof(*dev->rx_mode_ctx->pending), + GFP_KERNEL); + if (!dev->rx_mode_ctx->pending) + goto fail_pending; + + dev->rx_mode_ctx->dev = dev; + INIT_WORK(&dev->rx_mode_ctx->rx_mode_work, netif_rx_mode_write_rx_mode); + return 0; + +fail_pending: + kfree(dev->rx_mode_ctx->ready); + +fail_ready: + kfree(dev->rx_mode_ctx); + +fail_all: + return -ENOMEM; +} + +static void netif_free_rx_mode_ctx(struct net_device *dev) +{ + if (!dev->rx_mode_ctx) + return; + + cancel_work_sync(&dev->rx_mode_ctx->rx_mode_work); + + kfree(dev->rx_mode_ctx->ready->uc_addrs); + kfree(dev->rx_mode_ctx->ready->mc_addrs); + kfree(dev->rx_mode_ctx->ready); + + kfree(dev->rx_mode_ctx->pending->uc_addrs); + kfree(dev->rx_mode_ctx->pending->mc_addrs); + kfree(dev->rx_mode_ctx->pending); + + kfree(dev->rx_mode_ctx); + dev->rx_mode_ctx = NULL; +} + +static void netif_deferred_work_cleanup_fn(struct work_struct *param) +{ + struct netif_deferred_work_cleanup *ctx; + struct net_device *dev; + + ctx = container_of(param, struct 
netif_deferred_work_cleanup, + cleanup_work); + dev = ctx->dev; + + if (dev->netdev_ops->ndo_write_rx_mode) + netif_free_rx_mode_ctx(dev); +} + +static int netif_alloc_deferred_work_cleanup(struct net_device *dev) +{ + dev->deferred_work_cleanup = kzalloc(sizeof(*dev->deferred_work_cleanup), + GFP_KERNEL); + if (!dev->deferred_work_cleanup) + return -ENOMEM; + + dev->deferred_work_cleanup->dev = dev; + INIT_WORK(&dev->deferred_work_cleanup->cleanup_work, + netif_deferred_work_cleanup_fn); + return 0; +} + +static void netif_free_deferred_work_cleanup(struct net_device *dev) +{ + if (!dev->deferred_work_cleanup) + return; + + cancel_work_sync(&dev->deferred_work_cleanup->cleanup_work); + + kfree(dev->deferred_work_cleanup); + dev->deferred_work_cleanup = NULL; +} + /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. @@ -1672,16 +1877,26 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret) return ret; + if (dev->needs_deferred_cleanup) { + if (!dev->deferred_work_cleanup) + ret = netif_alloc_deferred_work_cleanup(dev); + + cancel_work_sync(&dev->deferred_work_cleanup->cleanup_work); + } + set_bit(__LINK_STATE_START, &dev->state); netdev_ops_assert_locked(dev); - if (ops->ndo_validate_addr) + if (!ret && ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); + if (!ret && ops->ndo_write_rx_mode) + ret = netif_alloc_rx_mode_ctx(dev); + netpoll_poll_enable(dev); if (ret) @@ -1755,6 +1970,9 @@ static void __dev_close_many(struct list_head *head) if (ops->ndo_stop) ops->ndo_stop(dev); + if (dev->needs_deferred_cleanup) + schedule_work(&dev->deferred_work_cleanup->cleanup_work); + netif_set_up(dev, false); netpoll_poll_enable(dev); } @@ -9621,6 +9839,46 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } +/* netif_rx_mode_schedule_work - Sets up the rx_config snapshot and + * 
schedules the deferred I/O. + */ +void netif_rx_mode_schedule_work(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); + + /* This part is only for drivers that implement ndo_write_rx_mode */ + if (!ops->ndo_write_rx_mode) + return; + + /* If rx_mode set is to be skipped, we don't schedule the work */ + if (netif_rx_mode_get_ctrl_bit(dev, NETIF_RX_MODE_SET_SKIP)) + return; + + netif_rx_mode_prepare_pending(dev); + schedule_work(&dev->rx_mode_ctx->rx_mode_work); +} +EXPORT_SYMBOL(netif_rx_mode_schedule_work); + +/* Drivers that implement rx mode as work flush the work item when closing + * or suspending. This is the substitute for those calls. + */ +void netif_rx_mode_flush_work(struct net_device *dev) +{ + /* Calling this function with RTNL held will result in a deadlock. */ + if (WARN_ON(rtnl_is_locked())) + return; + + /* Doing nothing is enough to "flush" work on a closed interface */ + if (!netif_running(dev)) + return; + + flush_work(&dev->rx_mode_ctx->rx_mode_work); +} +EXPORT_SYMBOL(netif_rx_mode_flush_work); + /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast @@ -9629,8 +9887,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify) */ void __dev_set_rx_mode(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; - /* dev_open will call this function so the list will stay sane. 
*/ if (!(dev->flags&IFF_UP)) return; @@ -9651,8 +9907,7 @@ void __dev_set_rx_mode(struct net_device *dev) } } - if (ops->ndo_set_rx_mode) - ops->ndo_set_rx_mode(dev); + netif_rx_mode_schedule_work(dev); } void dev_set_rx_mode(struct net_device *dev) @@ -11323,6 +11578,9 @@ int register_netdevice(struct net_device *dev) } } + if (dev->netdev_ops->ndo_write_rx_mode) + dev->needs_deferred_cleanup = true; + if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || @@ -12066,6 +12324,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; + dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; @@ -12149,6 +12408,7 @@ void free_netdev(struct net_device *dev) kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); + netif_free_deferred_work_cleanup(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); -- 2.47.3
