The merge commit 043f84176868 ("Merge branch 'v6.1/standard/base'
into v6.1/standard/cn-sdkv5.15/octeon") incorrectly updated
net/core/dev.c to the version from the v6.6/standard/preempt-rt/base
branch. This change restores the file to its original state as found
in the v6.1/standard/base branch.Signed-off-by: Kevin Hao <[email protected]> --- Hi Bruce, Please merge this into the following branch: v6.1/standard/cn-sdkv5.15/octeon --- net/core/dev.c | 944 ++++++++++++++++++++++++--------------------------------- 1 file changed, 403 insertions(+), 541 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 5b53496b2666fa710bb75e1d5ef403e15f1dd974..69bb7ac73d047aa4428b4d4eaa67381850cf2b2d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -69,7 +69,7 @@ */ #include <linux/uaccess.h> -#include <linux/bitmap.h> +#include <linux/bitops.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> @@ -107,7 +107,6 @@ #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> -#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -133,7 +132,6 @@ #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> -#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -152,16 +150,18 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> -#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" + static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); @@ -396,9 +396,6 @@ static void list_netdevice(struct net_device *dev) netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); - /* We reserved the ifindex, this can't fail */ - WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); - dev_base_seq_inc(net); } @@ -408,12 +405,9 @@ static void list_netdevice(struct net_device *dev) static void unlist_netdevice(struct net_device *dev, bool lock) { struct netdev_name_node *name_node; - struct net *net = dev_net(dev); ASSERT_RTNL(); - xa_erase(&net->dev_by_index, dev->ifindex); - netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); @@ -789,7 +783,18 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) } EXPORT_SYMBOL(dev_get_by_name_rcu); -/* Deprecated for new users, call netdev_get_by_name() instead */ +/** + * dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; @@ -802,31 +807,6 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) } EXPORT_SYMBOL(dev_get_by_name); -/** - * netdev_get_by_name() - find a device by its name - * @net: the applicable net namespace - * @name: name to find - * @tracker: tracking object for the acquired reference - * @gfp: allocation flags for the tracker - * - * Find an interface by name. This can be called from any - * context and does its own locking. 
The returned handle has - * the usage count incremented and the caller must use netdev_put() to - * release it when it is no longer needed. %NULL is returned if no - * matching device is found. - */ -struct net_device *netdev_get_by_name(struct net *net, const char *name, - netdevice_tracker *tracker, gfp_t gfp) -{ - struct net_device *dev; - - dev = dev_get_by_name(net, name); - if (dev) - netdev_tracker_alloc(dev, tracker, gfp); - return dev; -} -EXPORT_SYMBOL(netdev_get_by_name); - /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace @@ -876,7 +856,18 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index_rcu); -/* Deprecated for new users, call netdev_get_by_index() instead */ + +/** + * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; @@ -889,30 +880,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index); -/** - * netdev_get_by_index() - find a device by its ifindex - * @net: the applicable net namespace - * @ifindex: index of device - * @tracker: tracking object for the acquired reference - * @gfp: allocation flags for the tracker - * - * Search for an interface by index. Returns NULL if the device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * netdev_put() to indicate they have finished with it. - */ -struct net_device *netdev_get_by_index(struct net *net, int ifindex, - netdevice_tracker *tracker, gfp_t gfp) -{ - struct net_device *dev; - - dev = dev_get_by_index(net, ifindex); - if (dev) - netdev_tracker_alloc(dev, tracker, gfp); - return dev; -} -EXPORT_SYMBOL(netdev_get_by_index); - /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct @@ -1134,7 +1101,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) return -EINVAL; /* Use one page as a bit array of possible slots */ - inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); if (!inuse) return -ENOMEM; @@ -1164,7 +1131,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) } i = find_first_zero_bit(inuse, max_netdevices); - bitmap_free(inuse); + free_page((unsigned long) inuse); } snprintf(buf, IFNAMSIZ, name, i); @@ -1265,6 +1232,22 @@ int dev_change_name(struct net_device *dev, const char *newname) net = dev_net(dev); + /* Some auto-enslaved devices e.g. failover slaves are + * special, as userspace might rename the device after + * the interface had been brought up and running since + * the point kernel initiated auto-enslavement. Allow + * live name change even when these slave devices are + * up and running. + * + * Typically, users of these auto-enslaving devices + * don't actually care about slave name change, as + * they are supposed to operate on master interface + * directly. 
+ */ + if (dev->flags & IFF_UP && + likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) + return -EBUSY; + down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { @@ -1281,8 +1264,7 @@ int dev_change_name(struct net_device *dev, const char *newname) } if (oldname[0] && !strchr(oldname, '%')) - netdev_info(dev, "renamed from %s%s\n", oldname, - dev->flags & IFF_UP ? " (while UP)" : ""); + netdev_info(dev, "renamed from %s\n", oldname); old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; @@ -1420,7 +1402,7 @@ void netdev_state_change(struct net_device *dev) call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1556,7 +1538,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1628,7 +1610,7 @@ void dev_close_many(struct list_head *head, bool unlink) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); @@ -1708,15 +1690,14 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) - N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) - N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) - N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) - N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) + N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) + N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) + N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) - N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1943,7 +1924,7 @@ EXPORT_SYMBOL(register_netdevice_notifier_net); * @nb: notifier * * Unregister a notifier previously registered by - * register_netdevice_notifier_net(). The notifier is unlinked from the + * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. 
* @@ -1964,14 +1945,6 @@ int unregister_netdevice_notifier_net(struct net *net, } EXPORT_SYMBOL(unregister_netdevice_notifier_net); -static void __move_netdevice_notifier_net(struct net *src_net, - struct net *dst_net, - struct notifier_block *nb) -{ - __unregister_netdevice_notifier_net(src_net, nb); - __register_netdevice_notifier_net(dst_net, nb, true); -} - int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -2008,8 +1981,10 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev, { struct netdev_net_notifier *nn; - list_for_each_entry(nn, &dev->net_notifier_list, list) - __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); + list_for_each_entry(nn, &dev->net_notifier_list, list) { + __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); + __register_netdevice_notifier_net(net, nn->nb, true); + } } /** @@ -2021,8 +1996,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev, * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers_info(unsigned long val, - struct netdev_notifier_info *info) +static int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; @@ -2168,10 +2143,13 @@ static DECLARE_WORK(netstamp_work, netstamp_clear); void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL - int wanted = atomic_read(&netstamp_wanted); + int wanted; - while (wanted > 0) { - if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) return; } atomic_inc(&netstamp_needed_deferred); @@ -2185,10 +2163,13 @@ EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL - int wanted = atomic_read(&netstamp_wanted); + int wanted; - while (wanted > 1) { - if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) return; } atomic_dec(&netstamp_needed_deferred); @@ -2202,7 +2183,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->tstamp_type = SKB_CLOCK_REALTIME; + skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } @@ -2461,7 +2442,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - map = xmap_dereference(dev_maps->attr_map[tci]); + if (dev_maps) + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -3097,8 +3079,6 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); - if (size < READ_ONCE(dev->gso_ipv4_max_size)) - netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); @@ -3178,7 +3158,7 @@ void __netif_schedule(struct Qdisc *q) EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { - enum skb_drop_reason reason; + enum skb_free_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) @@ -3211,7 +3191,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue) } EXPORT_SYMBOL(netif_tx_wake_queue); -void 
dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -3231,16 +3211,18 @@ void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } -EXPORT_SYMBOL(dev_kfree_skb_irq_reason); +EXPORT_SYMBOL(__dev_kfree_skb_irq); -void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_hardirq() || irqs_disabled()) - dev_kfree_skb_irq_reason(skb, reason); + __dev_kfree_skb_irq(skb, reason); + else if (unlikely(reason == SKB_REASON_DROPPED)) + kfree_skb(skb); else - kfree_skb_reason(skb, reason); + consume_skb(skb); } -EXPORT_SYMBOL(dev_kfree_skb_any_reason); +EXPORT_SYMBOL(__dev_kfree_skb_any); /** @@ -3312,7 +3294,7 @@ static u16 skb_tx_hash(const struct net_device *dev, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -void skb_warn_bad_offload(const struct sk_buff *skb) +static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; @@ -3422,7 +3404,8 @@ int skb_crc32c_csum_help(struct sk_buff *skb) skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; - skb_reset_csum_not_inet(skb); + skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; out: return ret; } @@ -3445,6 +3428,74 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return vlan_get_protocol_and_depth(skb, type, depth); } +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. 
+ */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -3544,7 +3595,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; - if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) + if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { @@ -3793,25 +3844,25 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; + u16 gso_segs = shinfo->gso_segs; /* mac layer + network layer */ - hdr_len = skb_transport_offset(skb); + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; - th = skb_header_pointer(skb, hdr_len, + th = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; - if (skb_header_pointer(skb, hdr_len, + if (skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } @@ -3987,6 +4038,50 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); + struct tcf_result cl_res; + + if (!miniq) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; + mini_qdisc_bstats_cpu_update(miniq, skb); + + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + *ret = NET_XMIT_DROP; + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + consume_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! 
*/ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + + return skb; +} + static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { @@ -4007,182 +4102,6 @@ void netdev_xmit_skip_txqueue(bool skip) EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ -#ifdef CONFIG_NET_XGRESS -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) -{ - int ret = TC_ACT_UNSPEC; -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); - struct tcf_result res; - - if (!miniq) - return ret; - - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - - mini_qdisc_bstats_cpu_update(miniq, skb); - ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); - /* Only tcf related quirks below. */ - switch (ret) { - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - break; - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(res.classid); - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return ret; -} - -static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); - -void tcx_inc(void) -{ - static_branch_inc(&tcx_needed_key); -} - -void tcx_dec(void) -{ - static_branch_dec(&tcx_needed_key); -} - -static __always_inline enum tcx_action_base -tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, - const bool needs_mac) -{ - const struct bpf_mprog_fp *fp; - const struct bpf_prog *prog; - int ret = TCX_NEXT; - - if (needs_mac) - __skb_push(skb, skb->mac_len); - bpf_mprog_foreach_prog(entry, fp, prog) { - bpf_compute_data_pointers(skb); - ret = bpf_prog_run(prog, skb); - if (ret != TCX_NEXT) - break; - } - if (needs_mac) - __skb_pull(skb, skb->mac_len); - return tcx_action_code(skb, ret); -} - -static __always_inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ - struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); - int sch_ret; - - if (!entry) - return skb; - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tcx_set_ingress(skb, true); - - if (static_branch_unlikely(&tcx_needed_key)) { - sch_ret = tcx_run(entry, skb, true); - if (sch_ret != TC_ACT_UNSPEC) - goto ingress_verdict; - } - sch_ret = tc_run(tcx_entry(entry), skb); -ingress_verdict: - switch (sch_ret) { - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by BPF, so we can safely - * push the L2 header back before redirecting to another - * netdev. - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_SHOT: - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - /* used by tc_run */ - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - fallthrough; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - } - - return skb; -} - -static __always_inline struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) -{ - struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); - int sch_ret; - - if (!entry) - return skb; - - /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was - * already set by the caller. 
- */ - if (static_branch_unlikely(&tcx_needed_key)) { - sch_ret = tcx_run(entry, skb, false); - if (sch_ret != TC_ACT_UNSPEC) - goto egress_verdict; - } - sch_ret = tc_run(tcx_entry(entry), skb); -egress_verdict: - switch (sch_ret) { - case TC_ACT_REDIRECT: - /* No need to push/pop skb's mac_header here on egress! */ - skb_do_redirect(skb); - *ret = NET_XMIT_SUCCESS; - return NULL; - case TC_ACT_SHOT: - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); - *ret = NET_XMIT_DROP; - return NULL; - /* used by tc_run */ - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - fallthrough; - case TC_ACT_CONSUMED: - *ret = NET_XMIT_SUCCESS; - return NULL; - } - - return skb; -} -#else -static __always_inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ - return skb; -} - -static __always_inline struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) -{ - return skb; -} -#endif /* CONFIG_NET_XGRESS */ - #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) @@ -4365,7 +4284,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); - tcx_set_ingress(skb, false); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_at_ingress = 0; +#endif #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -4552,12 +4473,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, } list_add_tail(&napi->poll_list, &sd->poll_list); - WRITE_ONCE(napi->list_owner, smp_processor_id()); - /* If not called from net_rx_action() - * we have to raise NET_RX_SOFTIRQ. - */ - if (!sd->in_net_rx_action) - raise_softirq_irqoff(NET_RX_SOFTIRQ); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS @@ -4771,17 +4687,21 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + /* - * After we queued a packet into sd->input_pkt_queue, - * we need to make sure this queue is serviced soon. - * - * - If this is another cpu queue, link it to our rps_ipi_list, - * and make sure we will process rps_ipi_list from net_rx_action(). - * - * - If this is our own queue, NAPI schedule our backlog. - * Note that this also raises NET_RX_SOFTIRQ. + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 */ -static void napi_schedule_rps(struct softnet_data *sd) +static int napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); @@ -4790,15 +4710,12 @@ static void napi_schedule_rps(struct softnet_data *sd) sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; - /* If not called from net_rx_action() or napi_threaded_poll() - * we have to raise NET_RX_SOFTIRQ. 
- */ - if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - return; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); + return 0; } #ifdef CONFIG_NET_FLOW_LIMIT @@ -5218,17 +5135,16 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(refcount_read(&skb->users)); - if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) - trace_consume_skb(skb, net_tx_action); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) + trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action, - get_kfree_skb_cb(skb)->reason); + SKB_DROP_REASON_NOT_SPECIFIED); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else - __napi_kfree_skb(skb, - get_kfree_skb_cb(skb)->reason); + __kfree_skb_defer(skb); } } @@ -5290,6 +5206,72 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); + struct tcf_result cl_res; + + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out here. + */ + if (!miniq) + return skb; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; + skb->tc_at_ingress = 1; + mini_qdisc_bstats_cpu_update(miniq, skb); + + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + *ret = NET_RX_SUCCESS; + return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; + return NULL; + default: + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return skb; +} + /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -6109,9 +6091,10 @@ EXPORT_SYMBOL(__napi_schedule); */ bool napi_schedule_prep(struct napi_struct *n) { - unsigned long new, val = READ_ONCE(n->state); + unsigned long val, new; do { + val = READ_ONCE(n->state); if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; @@ -6124,7 +6107,7 @@ bool napi_schedule_prep(struct napi_struct *n) */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; - } while (!try_cmpxchg(&n->state, &val, new)); + } while (cmpxchg(&n->state, val, new) != val); return !(val & NAPIF_STATE_SCHED); } @@ -6191,10 +6174,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); 
local_irq_restore(flags); } - WRITE_ONCE(n->list_owner, -1); - val = READ_ONCE(n->state); do { + val = READ_ONCE(n->state); + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | @@ -6207,7 +6190,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; - } while (!try_cmpxchg(&n->state, &val, new)); + } while (cmpxchg(&n->state, val, new) != val); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); @@ -6318,8 +6301,7 @@ void napi_busy_loop(unsigned int napi_id, if (!napi) goto out; - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable(); for (;;) { int work = 0; @@ -6361,8 +6343,7 @@ void napi_busy_loop(unsigned int napi_id, if (unlikely(need_resched())) { if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable(); rcu_read_unlock(); cond_resched(); if (loop_end(loop_end_arg, start_time)) @@ -6373,8 +6354,7 @@ void napi_busy_loop(unsigned int napi_id, } if (napi_poll) busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable(); out: rcu_read_unlock(); } @@ -6476,8 +6456,12 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } return err; } @@ -6506,7 +6490,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif - napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); @@ -6528,16 +6511,19 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - val = READ_ONCE(n->state); - do { - while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + for ( ; ; ) { + val = READ_ONCE(n->state); + if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); - val = READ_ONCE(n->state); + continue; } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - } while (!try_cmpxchg(&n->state, &val, new)); + + if (cmpxchg(&n->state, val, new) == val) + break; + } hrtimer_cancel(&n->timer); @@ -6554,15 +6540,16 @@ EXPORT_SYMBOL(napi_disable); */ void napi_enable(struct napi_struct *n) { - unsigned long new, val = READ_ONCE(n->state); + unsigned long val, new; do { + val = READ_ONCE(n->state); BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; - } while (!try_cmpxchg(&n->state, &val, new)); + } while (cmpxchg(&n->state, val, new) != val); } EXPORT_SYMBOL(napi_enable); @@ -6718,57 +6705,9 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void skb_defer_free_flush(struct softnet_data *sd) -{ - struct sk_buff *skb, *next; - - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; - - 
spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; - } -} - -#ifndef CONFIG_PREEMPT_RT - -/* Called from hardirq (IPI) context */ -static void trigger_rx_softirq(void *data) -{ - struct softnet_data *sd = data; - - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - smp_store_release(&sd->defer_ipi_scheduled, 0); -} - -#else - -static void trigger_rx_softirq(struct work_struct *defer_work) -{ - struct softnet_data *sd; - - sd = container_of(defer_work, struct softnet_data, defer_work); - smp_store_release(&sd->defer_ipi_scheduled, 0); - local_bh_disable(); - skb_defer_free_flush(sd); - local_bh_enable(); -} - -#endif - static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; - struct softnet_data *sd; void *have; while (!napi_thread_wait(napi)) { @@ -6778,21 +6717,11 @@ static int napi_threaded_poll(void *data) bool repoll = false; local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); - sd->in_napi_threaded_poll = false; - barrier(); - - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); local_bh_enable(); if (!repoll) @@ -6805,6 +6734,28 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6814,8 +6765,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) LIST_HEAD(list); LIST_HEAD(repoll); -start: - sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); @@ -6826,18 +6775,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) skb_defer_free_flush(sd); if (list_empty(&list)) { - if (list_empty(&repoll)) { - sd->in_net_rx_action = false; - barrier(); - /* We need to check if ____napi_schedule() - * had refilled poll_list while - * sd->in_net_rx_action was true. - */ - if (!list_empty(&sd->poll_list)) - goto start; - if (!sd_has_rps_ipi_waiting(sd)) - goto end; - } + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + goto end; break; } @@ -6862,8 +6801,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); - else - sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end:; @@ -8503,8 +8440,9 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) } } if (dev->flags != old_flags) { - netdev_info(dev, "%s promiscuous mode\n", - dev->flags & IFF_PROMISC ? "entered" : "left"); + pr_info("device %s %s promiscuous mode\n", + dev->name, + dev->flags & IFF_PROMISC ? 
"entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, @@ -8521,7 +8459,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) - __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -8572,13 +8510,11 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) } } if (dev->flags ^ old_flags) { - netdev_info(dev, "%s allmulticast mode\n", - dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, - dev->gflags ^ old_gflags, 0, NULL); + dev->gflags ^ old_gflags); } return 0; } @@ -8741,13 +8677,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, - unsigned int gchanges, u32 portid, - const struct nlmsghdr *nlh) + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -8789,7 +8724,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); - __dev_notify_flags(dev, old_flags, changes, 0, NULL); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -8969,11 +8904,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; - if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { - err = ops->ndo_set_mac_address(dev, sa); - if (err) - return err; - } + err = ops->ndo_set_mac_address(dev, sa); + if (err) + return err; dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -9411,16 +9344,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } - if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { - NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); - return -EINVAL; - } - if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { - NL_SET_ERR_MSG(extack, "Program bound to different device"); - return -EINVAL; - } - if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { - NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { @@ -9602,7 +9527,6 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; - struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9630,13 +9554,12 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, &extack, link); + err = dev_xdp_attach_link(dev, NULL, link); 
rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); - trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } @@ -9700,40 +9623,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } /** - * dev_index_reserve() - allocate an ifindex in a namespace - * @net: the applicable net namespace - * @ifindex: requested ifindex, pass %0 to get one allocated + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace * - * Allocate a ifindex for a new device. Caller must either use the ifindex - * to store the device (via list_netdevice()) or call dev_index_release() - * to give the index up. - * - * Return: a suitable unique value for a new device interface number or -errno. + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. */ -static int dev_index_reserve(struct net *net, u32 ifindex) +static int dev_new_index(struct net *net) { - int err; + int ifindex = net->ifindex; - if (ifindex > INT_MAX) { - DEBUG_NET_WARN_ON_ONCE(1); - return -EINVAL; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return net->ifindex = ifindex; } - - if (!ifindex) - err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, - xa_limit_31b, &net->ifindex, GFP_KERNEL); - else - err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); - if (err < 0) - return err; - - return ifindex; -} - -static void dev_index_release(struct net *net, int ifindex) -{ - /* Expect only unused indexes, unlist_netdevice() removes the used */ - WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ @@ -10255,10 +10161,11 @@ int register_netdevice(struct net_device *dev) if (ret) goto err_uninit; - ret = dev_index_reserve(net, dev->ifindex); - if (ret < 0) + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) goto err_free_pcpu; - dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). @@ -10305,14 +10212,14 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_ifindex_release; + goto err_free_pcpu; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); dev->reg_state = ret ? 
NETREG_UNREGISTERED : NETREG_REGISTERED; write_unlock(&dev_base_lock); if (ret) - goto err_uninit_notify; + goto err_free_pcpu; __netdev_update_features(dev); @@ -10354,15 +10261,11 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; -err_uninit_notify: - call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); -err_ifindex_release: - dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: @@ -10730,12 +10633,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, stats = per_cpu_ptr(netstats, cpu); do { - start = u64_stats_fetch_begin(&stats->syncp); + start = u64_stats_fetch_begin_irq(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); - } while (u64_stats_fetch_retry(&stats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; @@ -10788,24 +10691,6 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); -/** - * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default - * @dev: netdev to enable the IRQ coalescing on - * - * Sets a conservative default for SW IRQ coalescing. Users can use - * sysfs attributes to override the default values. - */ -void netdev_sw_irq_coalesce_default_on(struct net_device *dev) -{ - WARN_ON(dev->reg_state == NETREG_REGISTERED); - - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; - } -} -EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); - void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; @@ -10863,7 +10748,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); + ref_tracker_dir_init(&dev->refcnt_tracker, 128); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -10882,11 +10767,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; - dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; - dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; - dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; @@ -11054,8 +10936,14 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); -void unregister_netdevice_many_notify(struct list_head *head, - u32 portid, const struct nlmsghdr *nlh) +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. + */ +void unregister_netdevice_many(struct list_head *head) { struct net_device *dev, *tmp; LIST_HEAD(close_head); @@ -11104,9 +10992,8 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. 
*/ dev_shutdown(dev); - dev_tcx_uninstall(dev); + dev_xdp_uninstall(dev); - bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); @@ -11118,8 +11005,7 @@ void unregister_netdevice_many_notify(struct list_head *head, if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0, - portid, nlh); + GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains @@ -11130,13 +11016,11 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); - call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); - if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -11159,18 +11043,6 @@ void unregister_netdevice_many_notify(struct list_head *head, list_del(head); } - -/** - * unregister_netdevice_many - unregister many devices - * @head: list of devices - * - * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. - */ -void unregister_netdevice_many(struct list_head *head) -{ - unregister_netdevice_many_notify(head, 0, NULL); -} EXPORT_SYMBOL(unregister_netdevice_many); /** @@ -11251,19 +11123,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, goto out; /* Check that new_ifindex isn't used yet. */ - if (new_ifindex) { - err = dev_index_reserve(net, new_ifindex); - if (err < 0) - goto out; - } else { - /* If there is an ifindex conflict assign a new one */ - err = dev_index_reserve(net, dev->ifindex); - if (err == -EBUSY) - err = dev_index_reserve(net, 0); - if (err < 0) - goto out; - new_ifindex = err; - } + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; /* * And now a mini version of register_netdevice unregister_netdevice. @@ -11291,6 +11153,13 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); + /* If there is an ifindex conflict assign a new one */ + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11339,7 +11208,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. 
*/ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; @@ -11471,8 +11340,6 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; - xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); - RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11570,7 +11437,6 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); - xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } @@ -11706,11 +11572,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif -#ifndef CONFIG_PREEMPT_RT INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); -#else - INIT_WORK(&sd->defer_work, trigger_rx_softirq); -#endif spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); --- base-commit: 043f841768688ddc914578849f2b9cc11757a816 change-id: 20260210-v6-1-octeon-3e6680539e84 Best regards, -- Kevin Hao <[email protected]>
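For anyone who wants to double-check the restore locally, one quick sanity check is to diff the file against the v6.1/standard/base branch once this patch is applied. A minimal sketch, assuming a local clone of the linux-yocto tree with a remote (called "origin" here only for illustration) that carries both branches:

    # Fetch the reference branch and compare the restored file against it.
    git fetch origin v6.1/standard/base
    git diff FETCH_HEAD -- net/core/dev.c

If the restore is complete, the second command should produce no output.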
