[PATCH net-next 05/14] net: sched: act_ipt: remove dependency on rtnl lock
Use tcf spinlock to protect ipt action private data from concurrent modification during dump. Ipt init already takes tcf spinlock when modifying ipt state. Signed-off-by: Vlad Buslov --- net/sched/act_ipt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 0dc787a57798..e149f0e66cb6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -288,6 +288,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, * for foolproof you need to not assume this */ + spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; @@ -307,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: + spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; -- 2.7.5
[PATCH net-next 06/14] net: sched: act_pedit: remove dependency on rtnl lock
Rearrange pedit init code to only access pedit action data while holding tcf spinlock. Change keys allocation type to atomic to allow it to execute while holding tcf spinlock. Take tcf spinlock in dump function when accessing pedit action data. Signed-off-by: Vlad Buslov --- net/sched/act_pedit.c | 40 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 43ba999b2d23..3f62da72ab6a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -187,44 +187,38 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, tcf_idr_cleanup(tn, parm->index); goto out_free; } - p = to_pedit(*a); - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - tcf_idr_release(*a, bind); - ret = -ENOMEM; - goto out_free; - } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) goto out_free; if (!ovr) { - tcf_idr_release(*a, bind); ret = -EEXIST; - goto out_free; - } - p = to_pedit(*a); - if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - ret = -ENOMEM; - goto out_free; - } + goto out_release; } } else { return err; } + p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); - p->tcfp_flags = parm->flags; - p->tcf_action = parm->action; - if (keys) { + + if (ret == ACT_P_CREATED || + (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { + keys = kmalloc(ksize, GFP_ATOMIC); + if (!keys) { + spin_unlock_bh(&p->tcf_lock); + ret = -ENOMEM; + goto out_release; + } kfree(p->tcfp_keys); p->tcfp_keys = keys; p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; + kfree(p->tcfp_keys_ex); p->tcfp_keys_ex = keys_ex; @@ -232,6 +226,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; + +out_release: + tcf_idr_release(*a, bind); out_free: kfree(keys_ex); return ret; @@ -410,6 +407,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, if (unlikely(!opt)) return -ENOBUFS; + spin_lock_bh(&p->tcf_lock); memcpy(opt->keys, p->tcfp_keys, p->tcfp_nkeys * sizeof(struct tc_pedit_key)); opt->index = p->tcf_index; @@ -432,11 +430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; -- 2.7.5
[PATCH net-next 14/14] net: sched: act_police: remove dependency on rtnl lock
Use tcf spinlock to protect police action private data from concurrent modification during dump. (init already uses tcf spinlock when changing police action state) Pass tcf spinlock as estimator lock argument to gen_replace_estimator() during action init. Signed-off-by: Vlad Buslov --- net/sched/act_police.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 3eb5fe60c62c..4777da7caadf 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -138,7 +138,8 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, if (est) { err = gen_replace_estimator(&police->tcf_bstats, NULL, - &police->tcf_rate_est, NULL, + &police->tcf_rate_est, + &police->tcf_lock, &police->tcf_lock, NULL, est); if (err) @@ -274,14 +275,15 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, - .action = police->tcf_action, - .mtu = police->tcfp_mtu, - .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&police->tcf_lock); + opt.action = police->tcf_action; + opt.mtu = police->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); if (police->rate_present) psched_ratecfg_getrate(&opt.rate, &police->rate); if (police->peak_present) @@ -301,10 +303,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; + spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 12/14] net: sched: act_mirred: remove dependency on rtnl lock
Re-introduce mirred list spinlock, that was removed some time ago, in order to protect it from concurrent modifications, instead of relying on rtnl lock. Use tcf spinlock to protect mirred action private data from concurrent modification in init and dump. Rearrange access to mirred data in order to be performed only while holding the lock. Rearrange net dev access to always hold reference while working with it, instead of relying on rntl lock. Change get dev function to increment net device reference before returning it to caller, instead of assuming that caller is protected with rtnl lock. Provide rcu version of mirred dev and tunnel info access functions. (to be used by unlocked drivers) Signed-off-by: Vlad Buslov --- include/net/tc_act/tc_mirred.h | 5 +++ include/net/tc_act/tc_tunnel_key.h | 33 --- net/sched/act_mirred.c | 82 +- net/sched/cls_api.c| 1 + 4 files changed, 88 insertions(+), 33 deletions(-) diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index a2e9cbca5c9e..cb30be55e444 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -37,4 +37,9 @@ static inline struct net_device *tcf_mirred_dev(const struct tc_action *a) return rtnl_dereference(to_mirred(a)->tcfm_dev); } +static inline struct net_device *tcf_mirred_dev_rcu(const struct tc_action *a) +{ + return rcu_dereference(to_mirred(a)->tcfm_dev); +} + #endif /* __NET_TC_MIR_H */ diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 46b8c7f1c8d5..e6e475d788c6 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -30,26 +30,47 @@ struct tcf_tunnel_key { static inline bool is_tcf_tunnel_set(const struct tc_action *a) { + bool ret = false; #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + rcu_read_lock(); + params = rcu_dereference(t->params); if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) - return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; + ret = params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; + rcu_read_unlock(); #endif - return false; + return ret; } static inline bool is_tcf_tunnel_release(const struct tc_action *a) { + bool ret = false; #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + rcu_read_lock(); + params = rcu_dereference(t->params); if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) - return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; + ret = params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; + rcu_read_unlock(); +#endif + return ret; +} + +static inline +struct ip_tunnel_info *tcf_tunnel_info_rcu(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params = rcu_dereference(t->params); + + return ¶ms->tcft_enc_metadata->u.tun_info; +#else + return NULL; #endif - return false; } static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b26d060da08e..9f622114f5a5 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,6 +30,7 @@ #include static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -62,13 +63,23 @@ static bool tcf_mirred_can_reinsert(int action) return false; } +static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) +{ + return rcu_dereference_protected(m->tcfm_dev, +lockdep_is_held(&m->tcf_lock)); +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; + spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rtnl_dereference(m->tcfm_dev); + spin_unlock(&mirred_list_lock); + + /* last reference to action, no need to lock */ + dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); } @@ -128,22 +139,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } - if (parm->ifindex) { - dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) { - if (exists) - tcf_idr_rele
[PATCH net-next 10/14] net: sched: act_tunnel_key: remove dependency on rtnl lock
Use tcf lock to protect tunnel key action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl lock assertion that is no longer required. Signed-off-by: Vlad Buslov --- net/sched/act_tunnel_key.c | 26 +- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d42d9e112789..ba2ae9f75ef5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -204,7 +204,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; - struct tcf_tunnel_key_params *params_old; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; struct tc_tunnel_key *parm; @@ -346,24 +345,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, t = to_tunnel_key(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } - - params_old = rtnl_dereference(t->params); - - t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - rcu_assign_pointer(t->params, params_new); - - if (params_old) - kfree_rcu(params_old, rcu); + spin_lock(&t->tcf_lock); + t->tcf_action = parm->action; + rcu_swap_protected(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); + spin_unlock(&t->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -485,12 +482,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index= t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, - .action = t->tcf_action, }; struct tcf_t tm; - params = rtnl_dereference(t->params); - + spin_lock(&t->tcf_lock); + params = rcu_dereference_protected(t->params, + lockdep_is_held(&t->tcf_lock)); + opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -522,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; + spin_unlock(&t->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 09/14] net: sched: act_skbmod: remove dependency on rtnl lock
Move read of skbmod_p rcu pointer to be protected by tcf spinlock. Use tcf spinlock to protect private skbmod data from concurrent modification during dump. Signed-off-by: Vlad Buslov --- net/sched/act_skbmod.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index c437c6d51a71..e9c86ade3b40 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -156,7 +156,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, d = to_skbmod(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { tcf_idr_release(*a, bind); @@ -166,10 +165,10 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p->flags = lflags; d->tcf_action = parm->action; - p_old = rtnl_dereference(d->skbmod_p); - if (ovr) spin_lock_bh(&d->tcf_lock); + /* Protected by tcf_lock if overwriting existing action. */ + p_old = rcu_dereference_protected(d->skbmod_p, 1); if (lflags & SKBMOD_F_DMAC) ether_addr_copy(p->eth_dst, daddr); @@ -205,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, { struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tcf_skbmod_params *p; struct tc_skbmod opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; + p = rcu_dereference_protected(d->skbmod_p, + lockdep_is_held(&d->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -231,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 00/14] Remove rtnl lock dependency from all action implementations
Currently, all netlink protocol handlers for updating rules, actions and qdiscs are protected with single global rtnl lock which removes any possibility for parallelism. This patch set is a second step to remove rtnl lock dependency from TC rules update path. Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added. Handlers registered with this flag are called without RTNL taken. End goal is to have rule update handlers(RTM_NEWTFILTER, RTM_DELTFILTER, etc.) to be registered with UNLOCKED flag to allow parallel execution. However, there is no intention to completely remove or split rtnl lock itself. This patch set addresses specific problems in implementation of tc actions that prevent their control path from being executed concurrently. Additional changes are required to refactor classifiers API and individual classifiers for parallel execution. This patch set lays groundwork to eventually register rule update handlers as rtnl-unlocked. Action API is already prepared for parallel execution with previous patch set, which means that action ops that use action API for their implementation do not require additional modifications. (delete, search, etc.) Action API implements concurrency-safe reference counting and guarantees that cleanup/delete is called only once, after last reference to action is released. The goal of this change is to update specific actions APIs that access action private state directly, in order to be independent from external locking. General approach is to re-use existing tcf_lock spinlock (used by some action implementation to synchronize control path with data path) to protect action private state from concurrent modification. If action has rcu-protected pointer, tcf spinlock is used to protect its update code, instead of relying on rtnl lock. Some actions need to determine rtnl mutex status in order to release it. For example, ife action can load additional kernel modules(meta ops) and must make sure that no locks are held during module load. In such cases 'rtnl_held' argument is used to conditionally release rtnl mutex. Vlad Buslov (14): net: sched: act_bpf: remove dependency on rtnl lock net: sched: act_csum: remove dependency on rtnl lock net: sched: act_gact: remove dependency on rtnl lock net: sched: act_ife: remove dependency on rtnl lock net: sched: act_ipt: remove dependency on rtnl lock net: sched: act_pedit: remove dependency on rtnl lock net: sched: act_sample: remove dependency on rtnl lock net: sched: act_simple: remove dependency on rtnl lock net: sched: act_skbmod: remove dependency on rtnl lock net: sched: act_tunnel_key: remove dependency on rtnl lock net: sched: act_vlan: remove dependency on rtnl lock net: sched: act_mirred: remove dependency on rtnl lock net: core: add new/replace rate estimator lock parameter net: sched: act_police: remove dependency on rtnl lock include/net/gen_stats.h| 2 + include/net/tc_act/tc_mirred.h | 5 +++ include/net/tc_act/tc_tunnel_key.h | 33 --- net/core/gen_estimator.c | 58 --- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c| 2 +- net/sched/act_bpf.c| 10 +++-- net/sched/act_csum.c | 24 ++- net/sched/act_gact.c | 10 - net/sched/act_ife.c| 40 --- net/sched/act_ipt.c| 3 ++ net/sched/act_mirred.c | 82 +- net/sched/act_pedit.c | 40 +-- net/sched/act_police.c | 10 +++-- net/sched/act_sample.c | 12 +- net/sched/act_simple.c | 6 ++- net/sched/act_skbmod.c | 14 --- net/sched/act_tunnel_key.c | 26 ++-- net/sched/act_vlan.c | 27 +++-- net/sched/cls_api.c| 1 + net/sched/sch_api.c| 2 + net/sched/sch_cbq.c| 4 +- net/sched/sch_drr.c| 4 +- net/sched/sch_hfsc.c | 4 +- net/sched/sch_htb.c| 4 +- net/sched/sch_qfq.c| 4 +- 26 files changed, 285 insertions(+), 144 deletions(-) -- 2.7.5
[PATCH net-next 11/14] net: sched: act_vlan: remove dependency on rtnl lock
Use tcf spinlock to protect vlan action private data from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_vlan.c | 27 +++ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 15a0ee214c9c..5bde17fe3608 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; - struct tcf_vlan_params *p, *p_old; + struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -202,26 +202,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { tcf_idr_release(*a, bind); return -ENOMEM; } - v->tcf_action = parm->action; - - p_old = rtnl_dereference(v->vlan_p); - p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; - rcu_assign_pointer(v->vlan_p, p); + spin_lock(&v->tcf_lock); + v->tcf_action = parm->action; + rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + spin_unlock(&v->tcf_lock); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -243,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); - struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); + struct tcf_vlan_params *p; struct tc_vlan opt = { .index= v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, - .action = v->tcf_action, - .v_action = p->tcfv_action, }; struct tcf_t t; + spin_lock(&v->tcf_lock); + opt.action = v->tcf_action; + p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -268,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; + spin_unlock(&v->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 03/14] net: sched: act_gact: remove dependency on rtnl lock
Use tcf spinlock to protect gact action private state from concurrent modification during dump and init. Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_gact.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 661b72b9147d..bfccd34a3968 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(*a); - ASSERT_RTNL(); + spin_lock(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -126,6 +126,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif + spin_unlock(&gact->tcf_lock); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; @@ -178,10 +180,11 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, - .action = gact->tcf_action, }; struct tcf_t t; + spin_lock(&gact->tcf_lock); + opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; #ifdef CONFIG_GACT_PROB @@ -199,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; + spin_unlock(&gact->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 08/14] net: sched: act_simple: remove dependency on rtnl lock
Use tcf spinlock to protect private simple action data from concurrent modification during dump. (simple init already uses tcf spinlock when changing action state) Signed-off-by: Vlad Buslov --- net/sched/act_simple.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index aa51152e0066..18e4452574cd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -156,10 +156,11 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; @@ -167,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 04/14] net: sched: act_ife: remove dependency on rtnl lock
Use tcf spinlock and rcu to protect params pointer from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Ife action has meta-actions that are compiled as standalone modules. Rtnl mutex must be released while loading a kernel module. In order to support execution without rtnl mutex, propagate 'rtnl_held' argument to meta action loading functions. When requesting meta action module, conditionally release rtnl lock depending on 'rtnl_held' argument. Signed-off-by: Vlad Buslov --- net/sched/act_ife.c | 40 +--- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index df4060e32d43..5d200495e467 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -268,7 +268,8 @@ static const char *ife_meta_id2name(u32 metaid) * under ife->tcf_lock for existing action */ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, - void *val, int len, bool exists) + void *val, int len, bool exists, + bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; @@ -278,9 +279,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, #ifdef CONFIG_MODULES if (exists) spin_unlock_bh(&ife->tcf_lock); - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); ops = find_ife_oplist(metaid); @@ -421,7 +424,7 @@ static void tcf_ife_cleanup(struct tc_action *a) /* under ife->tcf_lock for existing action */ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, -bool exists) +bool exists, bool rtnl_held) { int len = 0; int rc = 0; @@ -433,7 +436,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = load_metaops_and_vet(ife, i, val, len, exists); + rc = load_metaops_and_vet(ife, i, val, len, exists, + rtnl_held); if (rc != 0) return rc; @@ -454,7 +458,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; - struct tcf_ife_params *p, *p_old; + struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -558,7 +562,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return err; } - err = populate_metalist(ife, tb2, exists); + err = populate_metalist(ife, tb2, exists, rtnl_held); if (err) goto metadata_parse_err; @@ -581,13 +585,13 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife->tcf_action = parm->action; + /* protected by tcf_lock when modifying existing action */ + rcu_swap_protected(ife->params, p, 1); + if (exists) spin_unlock_bh(&ife->tcf_lock); - - p_old = rtnl_dereference(ife->params); - rcu_assign_pointer(ife->params, p); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -600,16 +604,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); - struct tcf_ife_params *p = rtnl_dereference(ife->params); + struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - .action = ife->tcf_action, - .flags = p->flags, }; struct tcf_t t; + spin_lock_bh(&ife->tcf_lock); + opt.action = ife->tcf_action; + p = rcu_dereference_protected(ife->params, + lockdep_is_held(&ife->tcf_lock)); + opt.flags = p->flags; +
[PATCH net-next 01/14] net: sched: act_bpf: remove dependency on rtnl lock
Use tcf spinlock to protect bpf action private data from concurrent modification during dump and init. Remove rtnl lock assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_bpf.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6203eb075c9a..9e8a33f9fee3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -143,11 +143,12 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, - .action = prog->tcf_action, }; struct tcf_t tm; int ret; + spin_lock(&prog->tcf_lock); + opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, TCA_ACT_BPF_PAD)) goto nla_put_failure; + spin_unlock(&prog->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } @@ -264,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either -* with rtnl lock or during final cleanup in rcu callback +* with tcf lock or during final cleanup in rcu callback */ cfg->filter = rcu_dereference_protected(prog->filter, 1); @@ -336,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, goto out; prog = to_bpf(*act); - ASSERT_RTNL(); + spin_lock(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -349,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); + spin_unlock(&prog->tcf_lock); if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); -- 2.7.5
[PATCH net-next 13/14] net: core: add new/replace rate estimator lock parameter
Extend rate estimator 'new' and 'replace' APIs with additional spinlock parameter to be used by rtnl-unlocked actions to protect rate_est pointer from concurrent modification. Extract code that requires synchronization from gen_new_estimator into __replace_estimator function in order to be used by both locked and unlocked paths. Signed-off-by: Vlad Buslov --- include/net/gen_stats.h| 2 ++ net/core/gen_estimator.c | 58 +++--- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c| 2 +- net/sched/act_police.c | 2 +- net/sched/sch_api.c| 2 ++ net/sched/sch_cbq.c| 4 ++-- net/sched/sch_drr.c| 4 ++-- net/sched/sch_hfsc.c | 4 ++-- net/sched/sch_htb.c| 4 ++-- net/sched/sch_qfq.c| 4 ++-- 11 files changed, 61 insertions(+), 27 deletions(-) diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 0304ba2ae353..d1ef63d16abc 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -59,12 +59,14 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, + spinlock_t *rate_est_lock, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, + spinlock_t *rate_est_lock, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 98fd12721221..012e37011147 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -107,11 +107,43 @@ static void est_timer(struct timer_list *t) mod_timer(&est->timer, est->next_jiffies); } +static void __replace_estimator(struct net_rate_estimator __rcu **rate_est, + struct net_rate_estimator *new) +{ + struct net_rate_estimator *old = rcu_dereference_protected(*rate_est, + 1); + + if (old) { + del_timer_sync(&old->timer); + new->avbps = old->avbps; + new->avpps = old->avpps; + } + + new->next_jiffies = jiffies + ((HZ / 4) << new->intvl_log); + timer_setup(&new->timer, est_timer, 0); + mod_timer(&new->timer, new->next_jiffies); + + rcu_assign_pointer(*rate_est, new); + + if (old) + kfree_rcu(old, rcu); +} + +static void replace_estimator(struct net_rate_estimator __rcu **rate_est, + struct net_rate_estimator *new, + spinlock_t *rate_est_lock) +{ + spin_lock_bh(rate_est_lock); + __replace_estimator(rate_est, new); + spin_unlock_bh(rate_est_lock); +} + /** * gen_new_estimator - create a new rate estimator * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics + * @rate_est_lock: rate_est lock (might be NULL) * @stats_lock: statistics lock * @running: qdisc running seqcount * @opt: rate estimator configuration TLV @@ -128,12 +160,13 @@ static void est_timer(struct timer_list *t) int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, + spinlock_t *rate_est_lock, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); - struct net_rate_estimator *old, *est; + struct net_rate_estimator *est; struct gnet_stats_basic_packed b; int intvl_log; @@ -167,20 +200,15 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; - old = rcu_dereference_protected(*rate_est, 1); - if (old) { - del_timer_sync(&old->timer); - est->avbps = old->avbps; - est->avpps = old->avpps; - } - est->next_jiffies = jiffies + ((HZ/4) << intvl_log); - timer_setup(&est->timer, est_timer, 0); - mod_timer(&est->timer, est->next_jiffies); + if (rate_est_lock) + replace_estimator(rate_est, est, rate_est_lock); + else + /* If no spinlock argu
[PATCH net-next 07/14] net: sched: act_sample: remove dependency on rtnl lock
Use tcf spinlock to protect private sample action data from concurrent modification during dump and init. Signed-off-by: Vlad Buslov --- net/sched/act_sample.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 2608ccc83e5e..81071afe1b43 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -80,11 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, } s = to_sample(*a); + spin_lock(&s->tcf_lock); s->tcf_action = parm->action; s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { + spin_unlock(&s->tcf_lock); tcf_idr_release(*a, bind); return -ENOMEM; } @@ -94,6 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } + spin_unlock(&s->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -105,7 +108,8 @@ static void tcf_sample_cleanup(struct tc_action *a) struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rtnl_dereference(s->psample_group); + /* last reference to action, no need to lock */ + psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); @@ -174,12 +178,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, - .action = s->tcf_action, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt= atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock(&s->tcf_lock); + opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,9 +201,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; + spin_unlock(&s->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
[PATCH net-next 02/14] net: sched: act_csum: remove dependency on rtnl lock
Use tcf lock to protect csum action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov --- net/sched/act_csum.c | 24 +++- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 648a3a35b720..f01c59ba6d12 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -50,7 +50,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); - struct tcf_csum_params *params_old, *params_new; + struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -88,20 +88,22 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } p = to_tcf_csum(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); return -ENOMEM; } - params_old = rtnl_dereference(p->params); + params_new->update_flags = parm->update_flags; + spin_lock(&p->tcf_lock); p->tcf_action = parm->action; - params_new->update_flags = parm->update_flags; - rcu_assign_pointer(p->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); + spin_unlock(&p->tcf_lock); + + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -599,11 +601,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, - .action = p->tcf_action, }; struct tcf_t t; - params = rtnl_dereference(p->params); + spin_lock(&p->tcf_lock); + params = rcu_dereference_protected(p->params, + lockdep_is_held(&p->tcf_lock)); + opt.action = p->tcf_action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; + spin_unlock(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } -- 2.7.5
Re: [PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL
On 08/06/2018 06:45 AM, Alexei Starovoitov wrote: > On Sun, Aug 05, 2018 at 05:19:13PM -0700, Martin KaFai Lau wrote: >> This patch changes the tools/lib/bpf/btf.[ch] to LGPL which >> is inline with libbpf also. >> >> Signed-off-by: Martin KaFai Lau > > technically it's bpf tree material, but since it changes > "comment" only and we're very late into the release, > I think bpf-next would be fine too. > I'm ok whichever way. > Acked-by: Alexei Starovoitov I think bpf tree is more appropriate, thus applied there, thanks Martin!
Re: [PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL
On Sun, Aug 05, 2018 at 05:19:13PM -0700, Martin KaFai Lau wrote: > This patch changes the tools/lib/bpf/btf.[ch] to LGPL which > is inline with libbpf also. > > Signed-off-by: Martin KaFai Lau technically it's bpf tree material, but since it changes "comment" only and we're very late into the release, I think bpf-next would be fine too. I'm ok whichever way. Acked-by: Alexei Starovoitov
[net-next:master 457/471] ptp_qoriq.c:undefined reference to `__aeabi_uldivmod'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master head: 981467033a37d916649647fa3afe1fe99bba1817 commit: 91305f2812624c0cf7ccbb44133b66d3b24676e4 [457/471] ptp_qoriq: support automatic configuration for ptp timer config: arm-multi_v7_defconfig (attached as .config) compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 91305f2812624c0cf7ccbb44133b66d3b24676e4 # save the attached .config to linux build tree GCC_VERSION=7.2.0 make.cross ARCH=arm All errors (new ones prefixed by >>): drivers/ptp/ptp_qoriq.o: In function `qoriq_ptp_probe': >> ptp_qoriq.c:(.text+0xd0c): undefined reference to `__aeabi_uldivmod' --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
[net-next:master 1709/1739] drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2989:1: warning: the frame size of 1156 bytes is larger than 1024 bytes
tree: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master head: 981467033a37d916649647fa3afe1fe99bba1817 commit: 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 [1709/1739] bnxt_en: Add support for ethtool get dump. config: microblaze-allmodconfig (attached as .config) compiler: microblaze-linux-gcc (GCC) 8.1.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 # save the attached .config to linux build tree GCC_VERSION=8.1.0 make.cross ARCH=microblaze All warnings (new ones prefixed by >>): In function 'bnxt_fill_coredump_record', inlined from 'bnxt_get_coredump' at drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2980:3: drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length [-Wstringop-truncation] strncpy(record->system_name, utsname()->nodename, ^ strlen(utsname()->nodename)); drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c: In function 'bnxt_get_coredump': >> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2989:1: warning: the frame >> size of 1156 bytes is larger than 1024 bytes [-Wframe-larger-than=] } ^ vim +2989 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 2896 2897 static int bnxt_get_coredump(struct bnxt *bp, void *buf, u32 *dump_len) 2898 { 2899 u32 ver_get_resp_len = sizeof(struct hwrm_ver_get_output); 2900 struct coredump_segment_record *seg_record = NULL; 2901 u32 offset = 0, seg_hdr_len, seg_record_len; 2902 struct bnxt_coredump_segment_hdr seg_hdr; 2903 struct bnxt_coredump_record coredump_rec; 2904 struct bnxt_coredump coredump = {NULL}; 2905 time64_t start_time; 2906 u16 start_utc; 2907 int rc = 0, i; 2908 2909 start_time = ktime_get_real_seconds(); 2910 start_utc = sys_tz.tz_minuteswest * 60; 2911 seg_hdr_len = sizeof(seg_hdr); 2912 2913 /* First segment should be hwrm_ver_get response */ 2914 *dump_len = seg_hdr_len + ver_get_resp_len; 2915 if (buf) { 2916 bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, NULL, ver_get_resp_len, 2917 0, 0, 0); 2918 memcpy(buf + offset, &seg_hdr, seg_hdr_len); 2919 offset += seg_hdr_len; 2920 memcpy(buf + offset, &bp->ver_resp, ver_get_resp_len); 2921 offset += ver_get_resp_len; 2922 } 2923 2924 rc = bnxt_hwrm_dbg_coredump_list(bp, &coredump); 2925 if (rc) { 2926 netdev_err(bp->dev, "Failed to get coredump segment list\n"); 2927 goto err; 2928 } 2929 2930 *dump_len += seg_hdr_len * coredump.total_segs; 2931 2932 seg_record = (struct coredump_segment_record *)coredump.data; 2933 seg_record_len = sizeof(*seg_record); 2934 2935 for (i = 0; i < coredump.total_segs; i++) { 2936 u16 comp_id = le16_to_cpu(seg_record->component_id); 2937 u16 seg_id = le16_to_cpu(seg_record->segment_id); 2938 u32 duration = 0, seg_len = 0; 2939 unsigned long start, end; 2940 2941 start = jiffies; 2942 2943 rc = bnxt_hwrm_dbg_coredump_initiate(bp, comp_id, seg_id); 2944 if (rc) { 2945 netdev_err(bp->dev, 2946 "Failed to initiate coredump for seg = %d\n", 2947 seg_record->segment_id); 2948 goto next_seg; 2949 } 2950 2951 /* Write segment data into the buffer */ 2952 rc = bnxt_hwrm_dbg_coredump_retrieve(bp, comp_id, seg_id, 2953 &seg_len, buf, 2954 offset + seg_hdr_len); 2955 if (rc) 2956 netdev_err(bp->dev, 2957 "Failed to retrieve coredump for seg = %d\n", 2958 seg_record->segment_id); 2959 2960 next_seg: 2961 end = jiffies; 2962 duration = jiffies_to_msecs(end - start); 2963 bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, seg_record, seg_len, 2964 rc, duration, 0); 2965 2966 if (buf) { 2967
[PATCH] checkpatch: warn on unnecessary int declarations
On Sun, 2018-08-05 at 08:52 -0700, Linus Torvalds wrote: > "long unsigned int" isn't _technically_ wrong. But we normally > call that type "unsigned long". So add a checkpatch test for it. Signed-off-by: Joe Perches --- scripts/checkpatch.pl | 20 1 file changed, 20 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ce72cc4784e6..bc6dda34394e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3829,6 +3829,26 @@ sub process { "type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr); } +# check for unnecessary int declarations of short/long/long long + while ($sline =~ m{\b($TypeMisordered(\s*\*)*|$C90_int_types)\b}g) { + my $type = trim($1); + next if ($type !~ /\bint\b/); + next if ($type !~ /\b(?:short|long\s+long|long)\b/); + my $new_type = $type; + $new_type =~ s/\b\s*int\s*\b/ /; + $new_type =~ s/\b\s*(?:un)?signed\b\s*/ /; + $new_type =~ s/^const\s+//; + $new_type = "unsigned $new_type" if ($type =~ /\bunsigned\b/); + $new_type = "const $new_type" if ($type =~ /^const\b/); + $new_type =~ s/\s+/ /g; + $new_type = trim($new_type); + if (WARN("UNNECESSARY_INT", +"Prefer '$new_type' over '$type' as the int is unnecessary\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b\Q$type\E\b/$new_type/; + } + } + # check for static const char * arrays. if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY",
Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"
On 2018/8/6 8:33, David Miller wrote: > From: zhong jiang > Date: Sun, 5 Aug 2018 22:18:43 +0800 > >> @@ -1797,7 +1795,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, >> struct sk_buff *skb_trimmed = NULL; >> const unsigned char *src; >> struct igmphdr *ih; >> -int err; >> +int err = 0; >> >> err = ip_mc_check_igmp(skb, &skb_trimmed); > The initialization of err to '0' is unnecessary, it gets assigned on > the very next line. > > . > That's right. I will drop that change and repost. Thanks, zhong jiang
linux-next: build failure after merge of the net-next tree
Hi all, After merging the net-next tree, today's linux-next build (arm multi_v7_defconfig) failed like this: drivers/ptp/ptp_qoriq.o: In function `qoriq_ptp_probe': ptp_qoriq.c:(.text+0xd0c): undefined reference to `__aeabi_uldivmod' Caused by commit 91305f281262 ("ptp_qoriq: support automatic configuration for ptp timer") I just reverted that commit for today. -- Cheers, Stephen Rothwell pgpEk7KnNE93L.pgp Description: OpenPGP digital signature
[net-next:master 1709/1739] drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length
tree: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master head: 981467033a37d916649647fa3afe1fe99bba1817 commit: 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 [1709/1739] bnxt_en: Add support for ethtool get dump. config: ia64-allmodconfig (attached as .config) compiler: ia64-linux-gcc (GCC) 8.1.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 # save the attached .config to linux build tree GCC_VERSION=8.1.0 make.cross ARCH=ia64 All warnings (new ones prefixed by >>): In function 'bnxt_fill_coredump_record', inlined from 'bnxt_get_coredump' at drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2980:3: >> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' >> output truncated before terminating nul copying as many bytes from a string >> as its length [-Wstringop-truncation] strncpy(record->system_name, utsname()->nodename, ^ strlen(utsname()->nodename)); vim +/strncpy +2863 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 2846 2847 static void 2848 bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record *record, 2849time64_t start, s16 start_utc, u16 total_segs, 2850int status) 2851 { 2852 time64_t end = ktime_get_real_seconds(); 2853 u32 os_ver_major = 0, os_ver_minor = 0; 2854 struct tm tm; 2855 2856 time64_to_tm(start, 0, &tm); 2857 memset(record, 0, sizeof(*record)); 2858 strcpy(record->signature, "cOrE"); 2859 record->flags = 0; 2860 record->low_version = 0; 2861 record->high_version = 1; 2862 record->asic_state = 0; > 2863 strncpy(record->system_name, utsname()->nodename, 2864 strlen(utsname()->nodename)); 2865 record->year = cpu_to_le16(tm.tm_year); 2866 record->month = cpu_to_le16(tm.tm_mon); 2867 record->day = cpu_to_le16(tm.tm_mday); 2868 record->hour = cpu_to_le16(tm.tm_hour); 2869 record->minute = cpu_to_le16(tm.tm_min); 2870 record->second = cpu_to_le16(tm.tm_sec); 2871 record->utc_bias = cpu_to_le16(start_utc); 2872 strcpy(record->commandline, "ethtool -w"); 2873 record->total_segments = cpu_to_le32(total_segs); 2874 2875 sscanf(utsname()->release, "%u.%u", &os_ver_major, &os_ver_minor); 2876 record->os_ver_major = cpu_to_le32(os_ver_major); 2877 record->os_ver_minor = cpu_to_le32(os_ver_minor); 2878 2879 strcpy(record->os_name, utsname()->sysname); 2880 time64_to_tm(end, 0, &tm); 2881 record->end_year = cpu_to_le16(tm.tm_year + 1900); 2882 record->end_month = cpu_to_le16(tm.tm_mon + 1); 2883 record->end_day = cpu_to_le16(tm.tm_mday); 2884 record->end_hour = cpu_to_le16(tm.tm_hour); 2885 record->end_minute = cpu_to_le16(tm.tm_min); 2886 record->end_second = cpu_to_le16(tm.tm_sec); 2887 record->end_utc_bias = cpu_to_le16(sys_tz.tz_minuteswest * 60); 2888 record->asic_id1 = cpu_to_le32(bp->chip_num << 16 | 2889 bp->ver_resp.chip_rev << 8 | 2890 bp->ver_resp.chip_metal); 2891 record->asic_id2 = 0; 2892 record->coredump_status = cpu_to_le32(status); 2893 record->ioctl_low_version = 0; 2894 record->ioctl_high_version = 0; 2895 } 2896 2897 static int bnxt_get_coredump(struct bnxt *bp, void *buf, u32 *dump_len) 2898 { 2899 u32 ver_get_resp_len = sizeof(struct hwrm_ver_get_output); 2900 struct coredump_segment_record *seg_record = NULL; 2901 u32 offset = 0, seg_hdr_len, seg_record_len; 2902 struct bnxt_coredump_segment_hdr seg_hdr; 2903 struct bnxt_coredump_record coredump_rec; 2904 struct bnxt_coredump coredump = {NULL}; 2905 time64_t start_time; 2906 u16 start_utc; 2907 int rc = 0, i; 2908 2909 start_time = ktime_get_real_seconds(); 2910 start_utc = sys_tz.tz_minuteswest * 60; 2911 seg_hdr_len = sizeof(seg_hdr); 2912 2913 /* First segment should be hwrm_ver_get response */ 2914 *dump_len = seg_hdr_len + ver_get_resp_len; 2915 if (buf) { 2916 bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, NULL, ver_get_resp_len, 2917 0, 0, 0); 2918 memcpy(buf + offset, &seg_hdr, seg_hdr_len); 2919
Re: [PATCH net-next] net: report netlink extack only if set
On Sun, Aug 5, 2018 at 8:37 PM David Miller wrote: > > From: Willem de Bruijn > Date: Sun, 5 Aug 2018 15:48:01 -0400 > > > From: Willem de Bruijn > > > > Initialize extack in dev_set_mtu and report only if set. > > > > Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack") > > Reported-by: syzbot > > Signed-off-by: Willem de Bruijn > > Someone beat you to it :-) And that version is in net-next. Oops. I had totally missed that thread. Glad to see it's fixed already.
Re: [PATCH net-next] tc-testing: fix ip address in u32 test
From: Vlad Buslov Date: Sun, 5 Aug 2018 22:35:56 +0300 > Fix expected ip address to actually match configured ip address. > Fix test to expect single matched filter. > > Signed-off-by: Vlad Buslov > Acked-by: Jamal Hadi Salim Applied.
Re: [PATCH net-next] tc-testing: remove duplicate spaces in skbedit match patterns
From: Vlad Buslov Date: Sun, 5 Aug 2018 22:37:09 +0300 > Match patterns for some skbedit tests contain duplicate whitespace that is > not present in actual tc output. This causes tests to fail because they > can't match required action, even when it was successfully created. > > Signed-off-by: Vlad Buslov > Acked-by: Jamal Hadi Salim Applied.
Re: [PATCH net-next] tc-testing: flush gact actions on test teardown
From: Vlad Buslov Date: Sun, 5 Aug 2018 22:36:25 +0300 > Test 6fb4 creates one mirred and one pipe action, but only flushes mirred > on teardown. Leaking pipe action causes failures in other tests. > > Add additional teardown command to also flush gact actions. > > Signed-off-by: Vlad Buslov > Acked-by: Jamal Hadi Salim Applied.
Re: [PATCH net-next] tc-testing: remove duplicate spaces in connmark match patterns
From: Vlad Buslov Date: Sun, 5 Aug 2018 22:36:44 +0300 > Match patterns for some connmark tests contain duplicate whitespace that is > not present in actual tc output. This causes tests to fail because they > can't match required action, even when it was successfully created. > > Fixes: 1dad0f97 ("tc-testing: add connmark action tests") > Signed-off-by: Vlad Buslov > Acked-by: Jamal Hadi Salim Applied.
Re: [PATCH net-next 00/13] bnxt_en: Updates for net-next.
From: Michael Chan Date: Sun, 5 Aug 2018 16:51:45 -0400 > This series includes the usual firmware spec update. The driver has > added external phy loopback test and phy setup retry logic that is > needed during hotplug. In the SRIOV space, the driver has added a > new VF resource allocation mode that requires the VF driver to > reserve resources during IFUP. IF state changes are now propagated > to firmware so that firmware can release some resources during IFDOWN. > > ethtool method to get firmware core dump and hwmon temperature reading > have been added. DSCP to user priority support has been added to > the driver's DCBNL interface, and the CoS queue logic has been refined > to make sure that the special RDMA Congestion Notification hardware CoS > queue will not be used for networking traffic. Series applied, thanks Michael.
Re: pull-request: wireless-drivers-next 2018-08-05
From: Kalle Valo Date: Sun, 05 Aug 2018 20:07:16 +0300 > a pull request to net-next tree for 4.19. If the merge window doesn't > open today I'm planning to submit one more pull request later next week. > > Please let me know if you have any problems. Pulled, thanks Kalle.
Re: [PATCH net-next] net: report netlink extack only if set
From: Willem de Bruijn Date: Sun, 5 Aug 2018 15:48:01 -0400 > From: Willem de Bruijn > > Initialize extack in dev_set_mtu and report only if set. > > Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack") > Reported-by: syzbot > Signed-off-by: Willem de Bruijn Someone beat you to it :-) And that version is in net-next. Thanks!
Re: [PATCH net] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit
From: Xin Long Date: Sun, 5 Aug 2018 22:46:07 +0800 > According to RFC791, 68 bytes is the minimum size of IPv4 datagram every > device must be able to forward without further fragmentation while 576 > bytes is the minimum size of IPv4 datagram every device has to be able > to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right > value for the ipv4 min mtu check in ip6_tnl_xmit. > > While at it, change to use max() instead of if statement. > > Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit") > Reported-by: Sabrina Dubroca > Signed-off-by: Xin Long Applied and queued up for -stable, thanks Xin.
Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"
From: zhong jiang Date: Sun, 5 Aug 2018 22:18:43 +0800 > @@ -1797,7 +1795,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, > struct sk_buff *skb_trimmed = NULL; > const unsigned char *src; > struct igmphdr *ih; > - int err; > + int err = 0; > > err = ip_mc_check_igmp(skb, &skb_trimmed); The initialization of err to '0' is unnecessary, it gets assigned on the very next line.
Re: pull request: bluetooth-next 2018-08-05
From: Johan Hedberg Date: Sun, 5 Aug 2018 09:14:30 +0300 > Here's the main bluetooth-next pull request for the 4.19 kernel. > > - Added support for Bluetooth Advertising Extensions > - Added vendor driver support to hci_h5 HCI driver > - Added serdev support to hci_h5 driver > - Added support for Qualcomm wcn3990 controller > - Added support for RTL8723BS and RTL8723DS controllers > - btusb: Added new ID for Realtek 8723DE > - Several other smaller fixes & cleanups > > Please let me know if there are any issues pulling. Thanks. Pulled, thanks Johan.
Re: [PATCH net-next 0/3] mlxsw: Enable MC-aware mode for mlxsw ports
From: Ido Schimmel Date: Sun, 5 Aug 2018 09:03:05 +0300 > Petr says: > > Due to an issue in Spectrum chips, when unicast traffic shares the same > queue as BUM traffic, and there is a congestion, the BUM traffic is > admitted to the queue anyway, thus pushing out all UC traffic. In order > to give unicast traffic precedence over BUM traffic, configure > multicast-aware mode on all ports. > > Under multicast-aware regime, when assigning traffic class to a packet, > the switch doesn't merely take the value prescribed by the QTCT > register. For BUM traffic, it instead assigns that value plus 8. That > limits the number of available TCs, but since mlxsw currently only uses > the lower eight anyway, it is no real loss. > > The two TCs (UC and MC one) are then mapped to the same subgroup and > strictly prioritized so that UC traffic is preferred in case of > congestion. > > In patch #1, introduce a new register, QTCTM, which enables the > multicast-aware mode. > > In patch #2, fix a typo in related code. > > In patch #3, set up TCs and QTCTM to enable multicast-aware mode. Series applied, thanks.
Re: [PATCH net-next] virtio-net: mark expected switch fall-throughs
From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 21:42:05 -0500 > In preparation to enabling -Wimplicit-fallthrough, mark switch cases > where we are expecting to fall through. > > Addresses-Coverity-ID: 1402059 ("Missing break in switch") > Addresses-Coverity-ID: 1402060 ("Missing break in switch") > Addresses-Coverity-ID: 1402061 ("Missing break in switch") > Signed-off-by: Gustavo A. R. Silva Applied, thank you.
Re: [PATCH] mellanox: fix the dport endianness in call of __inet6_lookup_established()
From: Al Viro Date: Sat, 4 Aug 2018 21:41:27 +0100 > __inet6_lookup_established() expect th->dport passed in host-endian, > not net-endian. The reason is microoptimization in __inet6_lookup(), > but if you use the lower-level helpers, you have to play by their > rules... > > Signed-off-by: Al Viro Mellanox folks, please review.
Re: [PATCH net-next] net: sched: cls_flower: Fix an error code in fl_tmplt_create()
From: Dan Carpenter Date: Fri, 3 Aug 2018 22:27:55 +0300 > We forgot to set the error code on this path, so we return NULL instead > of an error pointer. In the current code kzalloc() won't fail for small > allocations so this doesn't really affect runtime. > > Fixes: b95ec7eb3b4d ("net: sched: cls_flower: implement chain templates") > Signed-off-by: Dan Carpenter Applied, thanks Dan.
Re: [PATCH][net-next] net: check extack._msg before print
From: Li RongQing Date: Fri, 3 Aug 2018 15:45:21 +0800 > dev_set_mtu_ext is able to fail with a valid mtu value, at that > condition, extack._msg is not set and random since it is in stack, > then kernel will crash when print it. > > Fixes: 7a4c53bee3324a ("net: report invalid mtu value via netlink extack") > Signed-off-by: Zhang Yu > Signed-off-by: Li RongQing Applied, thank you.
Re: [PATCH bpf-next 12/13] docs: net: Fix various minor typos
On Fri, Aug 03, 2018 at 10:41:12AM +0200, Daniel Borkmann wrote: > On 08/01/2018 07:09 AM, Tobin C. Harding wrote: > > There are a few minor typos and grammatical issues. We should however > > try to keep the current flavour of the document. > > > > Fix typos and grammar if glaringly required. > > > > Signed-off-by: Tobin C. Harding > > Overall looks good, just some minor nits: > > > Documentation/networking/filter.rst | 65 +++-- > > 1 file changed, 33 insertions(+), 32 deletions(-) > > > > diff --git a/Documentation/networking/filter.rst > > b/Documentation/networking/filter.rst > > index 99dfa74fc4f7..b989a6c882b8 100644 > > --- a/Documentation/networking/filter.rst > > +++ b/Documentation/networking/filter.rst > > @@ -32,10 +32,10 @@ removing the old one and placing your new one in its > > place, assuming your > > filter has passed the checks, otherwise if it fails the old filter will > > remain on that socket. > > > > -SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once > > -set, a filter cannot be removed or changed. This allows one process to > > +SO_LOCK_FILTER option allows locking of the filter attached to a socket. > > +Once set, a filter cannot be removed or changed. This allows one process to > > setup a socket, attach a filter, lock it then drop privileges and be > > -assured that the filter will be kept until the socket is closed. > > +assured that the filter will be kept until the socket is closed. > > ^-- looks like extra whitespace slipped in? Fixed in the RFC. Thanks for taking the time review. Tobin
Re: [Patch net] ipv6: fix double refcount of fib6_metrics
From: Cong Wang Date: Thu, 2 Aug 2018 23:20:38 -0700 > All the callers of ip6_rt_copy_init()/rt6_set_from() hold refcnt > of the "from" fib6_info, so there is no need to hold fib6_metrics > refcnt again, because fib6_metrics refcnt is only released when > fib6_info is gone, that is, they have the same life time, so the > whole fib6_metrics refcnt can be removed actually. > > This fixes a kmemleak warning reported by Sabrina. > > Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst > based routes") > Reported-by: Sabrina Dubroca > Cc: Sabrina Dubroca > Cc: David Ahern > Signed-off-by: Cong Wang Applied, thanks Cong.
Re: [PATCH bpf-next 05/13] docs: net: Fix indentation issues for code snippets
On Fri, Aug 03, 2018 at 10:44:23AM +0200, Daniel Borkmann wrote: > On 08/01/2018 07:09 AM, Tobin C. Harding wrote: > [...] > > -Starting bpf_dbg is trivial and just requires issuing: > > +Starting bpf_dbg is trivial and just requires issuing:: > > > > -# ./bpf_dbg > > + # ./bpf_dbg > > > > In case input and output do not equal stdin/stdout, bpf_dbg takes an > > alternative stdin source as a first argument, and an alternative stdout > > @@ -381,86 +384,87 @@ file "~/.bpf_dbg_init" and the command history is > > stored in the file > > Interaction in bpf_dbg happens through a shell that also has > > auto-completion > > support (follow-up example commands starting with '>' denote bpf_dbg > > shell). > > The usual workflow would be to ... > > - > > -> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 > > - Loads a BPF filter from standard output of bpf_asm, or transformed via > > - e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT > > - debugging (next section), this command creates a temporary socket and > > - loads the BPF code into the kernel. Thus, this will also be useful for > > - JIT developers. > > - > > -> load pcap foo.pcap > > - Loads standard tcpdump pcap file. > > - > > -> run [] > > -bpf passes:1 fails:9 > > - Runs through all packets from a pcap to account how many passes and fails > > - the filter will generate. A limit of packets to traverse can be given. > > - > > -> disassemble > > -l0:ldh [12] > > -l1:jeq #0x800, l2, l5 > > -l2:ldb [23] > > -l3:jeq #0x1, l4, l5 > > -l4:ret #0x > > -l5:ret #0 > > - Prints out BPF code disassembly. > > - > > -> dump > > -/* { op, jt, jf, k }, */ > > -{ 0x28, 0, 0, 0x000c }, > > -{ 0x15, 0, 3, 0x0800 }, > > -{ 0x30, 0, 0, 0x0017 }, > > -{ 0x15, 0, 1, 0x0001 }, > > -{ 0x06, 0, 0, 0x }, > > -{ 0x06, 0, 0, 00 }, > > - Prints out C-style BPF code dump. > > - > > -> breakpoint 0 > > -breakpoint at: l0: ldh [12] > > -> breakpoint 1 > > -breakpoint at: l1: jeq #0x800, l2, l5 > > - ... > > - Sets breakpoints at particular BPF instructions. Issuing a `run` command > > - will walk through the pcap file continuing from the current packet and > > - break when a breakpoint is being hit (another `run` will continue from > > - the currently active breakpoint executing next instructions): > > - > > - > run > > - -- register dump -- > > - pc: [0] <-- program counter > > - code: [40] jt[0] jf[0] k[12]<-- plain BPF code of current > > instruction > > - curr: l0:ldh [12] <-- disassembly of current > > instruction > > - A:[][0] <-- content of A (hex, decimal) > > - X:[][0] <-- content of X (hex, decimal) > > - M[0,15]: [][0] <-- folded content of M (hex, > > decimal) > > - -- packet dump -- <-- Current packet from pcap (hex) > > - len: 42 > > -0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 > > - 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 > > - 32: 00 00 00 00 00 00 0a 3b 01 01 > > - (breakpoint) > > - > > > - > > -> breakpoint > > -breakpoints: 0 1 > > - Prints currently set breakpoints. > > - > > -> step [-, +] > > - Performs single stepping through the BPF program from the current pc > > - offset. Thus, on each step invocation, above register dump is issued. > > - This can go forwards and backwards in time, a plain `step` will break > > - on the next BPF instruction, thus +1. (No `run` needs to be issued here.) > > - > > -> select > > - Selects a given packet from the pcap file to continue from. Thus, on > > - the next `run` or `step`, the BPF program is being evaluated against > > - the user pre-selected packet. Numbering starts just as in Wireshark > > - with index 1. > > - > > -> quit > > -# > > - Exits bpf_dbg. > > +:: > > + > > + > load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 > > +Loads a BPF filter from standard output of bpf_asm, or transformed via > > +e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT > > +debugging (next section), this command creates a temporary socket and > > +loads the BPF code into the kernel. Thus, this will also be useful for > > +JIT developers. > > Here for the bpf_dbg howto, it would be good to separate explanation from > the cmdline code snippets. This would more easily clarify the commands > themselves if we already go the rst route, so I'd prefer splitting this up. Sure thing. Will do as suggested. Thanks for the review. Tobin
Re: [PATCH net-next v3] ipv6: defrag: drop non-last frags smaller than min mtu
From: Florian Westphal Date: Fri, 3 Aug 2018 02:22:20 +0200 > don't bother with pathological cases, they only waste cycles. > IPv6 requires a minimum MTU of 1280 so we should never see fragments > smaller than this (except last frag). > > v3: don't use awkward "-offset + len" > v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68). > There were concerns that there could be even smaller frags > generated by intermediate nodes, e.g. on radio networks. > > Cc: Peter Oskolkov > Cc: Eric Dumazet > Signed-off-by: Florian Westphal Applied, thanks Florian.
Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"
On Sun, 5 Aug 2018 22:18:43 +0800 zhong jiang wrote: > The err is not used after initalization, So remove it and make > it void function. > > Signed-off-by: zhong jiang Makes sense to me. Acked-by: Stephen Hemminger
[PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL
This patch changes the tools/lib/bpf/btf.[ch] to LGPL which is inline with libbpf also. Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/btf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2d270c560df3..c36a3a76986a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: LGPL-2.1 /* Copyright (c) 2018 Facebook */ #include diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index e2a09a155f84..caac3a404dc5 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: LGPL-2.1 */ /* Copyright (c) 2018 Facebook */ #ifndef __BPF_BTF_H -- 2.17.1
Re: Linux kernel error stack
Michal Kubecek wrote: > On Mon, Aug 06, 2018 at 01:15:37AM +0200, Florian Westphal wrote: > > Michal Kubecek wrote: > > > Oops, exactly this issue was already discussed almost a year ago: > > > > > > http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz > > > > > > But something more urgent came and I forgot to get back to it. :-( > > > > I did not even remeber, thanks for the pointer. > > So I think best course of action is to update man page to clearly > > say this only works in postrouting and with udp, and is ONLY > > intended for working around old dhcp software. > > As GSO for UDP is on its way to mainline, one might get into trouble > even with UDP if the rule is not specific enough. Yes, we still need a fix to ignore GSO too.
Re: [PATCH net-next 0/3] ip: Use rb trees for IP frag queue.
From: Peter Oskolkov Date: Thu, 2 Aug 2018 22:45:57 + > This patchset > * changes IPv4 defrag behavior to match that of IPv6: overlapping >fragments now cause the whole IP datagram to be discarded (suggested >by David Miller): there are no legitimate use cases for overlapping >fragments; > * changes IPv4 defrag queue from a list to a rb tree (suggested >by Eric Dumazet): this change removes a potential attach vector. > > Upcoming patches will contain similar changes for IPv6 frag queue, > as well as a comprehensive IP defrag self-test (temporarily delayed). Looks good, series applied, thanks!
Re: [PATCH net-next] net/tls: Mark the end in scatterlist table
From: Vakul Garg Date: Thu, 2 Aug 2018 20:43:10 +0530 > Function zerocopy_from_iter() unmarks the 'end' in input sgtable while > adding new entries in it. The last entry in sgtable remained unmarked. > This results in KASAN error report on using apis like sg_nents(). Before > returning, the function needs to mark the 'end' in the last entry it > adds. > > Signed-off-by: Vakul Garg Yes, a properly formed scatterlist table must always have it's end marked. Applied, thanks.
Re: [v2, 1/3] arm64: dts: fsl: add clocks property for fman ptp timer node
From: Yangbo Lu Date: Wed, 1 Aug 2018 18:05:52 +0800 > This patch is to add clocks property for fman ptp timer node. > > Signed-off-by: Yangbo Lu > --- > Changes for v2: > - None. Applied.
Re: [PATCH] ipv6: icmp: Updating pmtu for link local route
From: Georg Kohmann Date: Thu, 2 Aug 2018 13:56:58 +0200 > When a ICMPV6_PKT_TOOBIG is received from a link local address the pmtu will > be updated on a route with an arbitrary interface index. Subsequent packets > sent back to the same link local address may therefore end up not > considering the updated pmtu. > > Current behavior breaks TAHI v6LC4.1.4 Reduce PMTU On-link. Referring to RFC > 1981: Section 3: "Note that Path MTU Discovery must be performed even in > cases where a node "thinks" a destination is attached to the same link as > itself. In a situation such as when a neighboring router acts as proxy [ND] > for some destination, the destination can to appear to be directly > connected but is in fact more than one hop away." > > Using the interface index from the incoming ICMPV6_PKT_TOOBIG when updating > the pmtu. > > Signed-off-by: Georg Kohmann Applied, thanks Georg.
Re: [v2, 3/3] ptp_qoriq: support automatic configuration for ptp timer
From: Yangbo Lu Date: Wed, 1 Aug 2018 18:05:54 +0800 > This patch is to support automatic configuration for ptp timer. > If required ptp dts properties are not provided, driver could > try to calculate a set of default configurations to initialize > the ptp timer. This makes the driver work for many boards which > don't have the required ptp dts properties in current kernel. > Also the users could set dts properties by themselves according > to their requirement. > > Signed-off-by: Yangbo Lu > --- > Changes for v2: > - Dropped module_param. Applied.
Re: [v2, 2/3] powerpc/mpc85xx: add clocks property for fman ptp timer node
From: Yangbo Lu Date: Wed, 1 Aug 2018 18:05:53 +0800 > This patch is to add clocks property for fman ptp timer node. > > Signed-off-by: Yangbo Lu > --- > Changes for v2: > - None. Applied.
Re: Linux kernel error stack
On Mon, Aug 06, 2018 at 01:15:37AM +0200, Florian Westphal wrote: > Michal Kubecek wrote: > > Oops, exactly this issue was already discussed almost a year ago: > > > > http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz > > > > But something more urgent came and I forgot to get back to it. :-( > > I did not even remeber, thanks for the pointer. > So I think best course of action is to update man page to clearly > say this only works in postrouting and with udp, and is ONLY > intended for working around old dhcp software. As GSO for UDP is on its way to mainline, one might get into trouble even with UDP if the rule is not specific enough. > From kernel, have checkentry emit a warning > when this is used for protocols other than udp, > and make this a no-op for everything else. > > Same for postrouting: warn from checkentry if the hook is something > else, and have target function not do anything for !postrouting. > > Does that make sense? It sounds reasonable to me. > I can work on a patch later this week if needed. I'll be on vacation until Sunday so that I certainly won't get to it earlier than next week. Michal Kubecek
Re: [PATCH 00/21] Netfilter updates for net-next
From: Pablo Neira Ayuso Date: Sun, 5 Aug 2018 23:21:20 +0200 > The following patchset contains Netfilter updates for your net-next tree: ... > You can pull these changes from: > > git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git Pulled, thanks Pablo.
Re: KCM - recvmsg() mangles packets?
Dominique Martinet wrote on Sun, Aug 05, 2018: > It's getting late but I'll try adding a pskb_pull in there tomorrow, it > would be better to make the bpf program start with an offset but I don't > think that'll be easy to change... I can confirm the following patch fixes the issue for me: -8<- diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 625acb27efcc..348ff5945591 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -222,6 +222,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, if (!stm->strp.full_len) { ssize_t len; + /* Can only parse if there is no offset */ + if (unlikely(stm->strp.offset)) { + if (!pskb_pull(skb, stm->strp.offset)) { + STRP_STATS_INCR(strp->stats.mem_fail); + strp_parser_err(strp, -ENOMEM, desc); + break; + } + stm->strp.offset = 0; + } + len = (*strp->cb.parse_msg)(strp, head); if (!len) { 8<-- Now, I was looking at other users of strparser (I see sockmap, kcm and tls) and it looks like sockmap does not handle offsets either but tls does by using skb_copy_bits -- they're copying the tls header to a buffer on the stack. kcm cannot do that because we do not know how much data the user expects to read, and I'm not comfortable doing pskb_pull in the kcm callback either, but the cost of this pull is probably non-negligible if some user can make do without it... On the other hand, I do not see how to make the bpf program handle an offset in the skb as that offset is strparser-specific. Maybe add a flag in the cb that specifies wether the callback allows non-zero offset? I'll let you see if you can reproduce this and will wait for advices on how to solve this properly so we can work on a proper fix. Thanks, -- Dominique
Re: Linux kernel error stack
Michal Kubecek wrote: > Oops, exactly this issue was already discussed almost a year ago: > > http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz > > But something more urgent came and I forgot to get back to it. :-( I did not even remeber, thanks for the pointer. So I think best course of action is to update man page to clearly say this only works in postrouting and with udp, and is ONLY intended for working around old dhcp software. >From kernel, have checkentry emit a warning when this is used for protocols other than udp, and make this a no-op for everything else. Same for postrouting: warn from checkentry if the hook is something else, and have target function not do anything for !postrouting. Does that make sense? I can work on a patch later this week if needed.
Re: Linux kernel error stack
On Sun, Aug 05, 2018 at 11:03:42PM +0200, Florian Westphal wrote: > Satish Patel wrote: > > Florian, > > > > It seems those rules coming from here > > https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum > > This is crazy, and, as you found, it doesn't even do what they seem to > think it does. > > I see no reason for these rules at all. Oops, exactly this issue was already discussed almost a year ago: http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz But something more urgent came and I forgot to get back to it. :-( Michal Kubecek
Re: Linux kernel error stack
Satish Patel wrote: > Florian, > > It seems those rules coming from here > https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum This is crazy, and, as you found, it doesn't even do what they seem to think it does. I see no reason for these rules at all.
Re: Linux kernel error stack
Satish Patel wrote: > After reading further related DHCP checksum issue, it seems we need > that rules when you running DHCP on same host machine where your guest > using host DHCP service, in that case virtual nic won't do checksum. > If your DHCP running on different host then your physical nic perform > checksum. Partially right, its only needed for older dhcp software that relies on packet-header checksum correctness (metadata contains info when checksum will be filled in by nic later, modern servers will use that and ignore the packet-header checksums in this case). If you remove the rule and your VMs still get leases via DHCP then the CHECKSUM workaround isn't needed.
[PATCH net-next 02/13] bnxt_en: Adjust timer based on ethtool stats-block-usecs settings.
The driver gathers statistics using 2 mechanisms. Some stats are DMA'ed directly from hardware and others are polled from the driver's timer. Currently, we only adjust the DMA frequency based on the ethtool stats-block-usecs setting. This patch adjusts the driver's timer frequency as well to make everything consistent. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 3d40e49..1f626af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -112,6 +112,11 @@ static int bnxt_set_coalesce(struct net_device *dev, BNXT_MAX_STATS_COAL_TICKS); stats_ticks = rounddown(stats_ticks, BNXT_MIN_STATS_COAL_TICKS); bp->stats_coal_ticks = stats_ticks; + if (bp->stats_coal_ticks) + bp->current_interval = + bp->stats_coal_ticks * HZ / 100; + else + bp->current_interval = BNXT_TIMER_INTERVAL; update_stats = true; } -- 2.5.1
[PATCH net-next 05/13] bnxt_en: Add new VF resource allocation strategy mode.
The new mode is "minimal-static" to be used when resources are more limited to support a large number of VFs, for example The PF driver will provision guaranteed minimum resources of 0. Each VF has no guranteed resources until it tries to reserve resources during device open. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 23 ++- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fd936c5..e0e3b4b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5162,7 +5162,7 @@ int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all) pf->vf_resv_strategy = le16_to_cpu(resp->vf_reservation_strategy); - if (pf->vf_resv_strategy > BNXT_VF_RESV_STRATEGY_MINIMAL) + if (pf->vf_resv_strategy > BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC) pf->vf_resv_strategy = BNXT_VF_RESV_STRATEGY_MAXIMAL; } hwrm_func_resc_qcaps_exit: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 47eec14..b44a758 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -862,6 +862,7 @@ struct bnxt_pf_info { u8 vf_resv_strategy; #define BNXT_VF_RESV_STRATEGY_MAXIMAL 0 #define BNXT_VF_RESV_STRATEGY_MINIMAL 1 +#define BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC 2 void*hwrm_cmd_req_addr[4]; dma_addr_t hwrm_cmd_req_dma_addr[4]; struct bnxt_vf_info *vf; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index f560845..b896a52 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -447,7 +447,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs) u16 vf_tx_rings, vf_rx_rings, vf_cp_rings; u16 vf_stat_ctx, vf_vnics, vf_ring_grps; struct bnxt_pf_info *pf = &bp->pf; - int i, rc = 0; + int i, rc = 0, min = 1; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_RESOURCE_CFG, -1, -1); @@ -464,14 +464,19 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs) req.min_rsscos_ctx = cpu_to_le16(BNXT_VF_MIN_RSS_CTX); req.max_rsscos_ctx = cpu_to_le16(BNXT_VF_MAX_RSS_CTX); - if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL) { - req.min_cmpl_rings = cpu_to_le16(1); - req.min_tx_rings = cpu_to_le16(1); - req.min_rx_rings = cpu_to_le16(1); - req.min_l2_ctxs = cpu_to_le16(BNXT_VF_MIN_L2_CTX); - req.min_vnics = cpu_to_le16(1); - req.min_stat_ctx = cpu_to_le16(1); - req.min_hw_ring_grps = cpu_to_le16(1); + if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC) { + min = 0; + req.min_rsscos_ctx = cpu_to_le16(min); + } + if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL || + pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC) { + req.min_cmpl_rings = cpu_to_le16(min); + req.min_tx_rings = cpu_to_le16(min); + req.min_rx_rings = cpu_to_le16(min); + req.min_l2_ctxs = cpu_to_le16(min); + req.min_vnics = cpu_to_le16(min); + req.min_stat_ctx = cpu_to_le16(min); + req.min_hw_ring_grps = cpu_to_le16(min); } else { vf_cp_rings /= num_vfs; vf_tx_rings /= num_vfs; -- 2.5.1
Re: Linux kernel error stack
After reading further related DHCP checksum issue, it seems we need that rules when you running DHCP on same host machine where your guest using host DHCP service, in that case virtual nic won't do checksum. If your DHCP running on different host then your physical nic perform checksum. On Sun, Aug 5, 2018 at 4:39 PM, Satish Patel wrote: > Florian, > > I have removed those port 80 CHECKSUM rules and everything looks good > i didn't see kernel WARN mesg. > > Thank you so much! You just nailed it :) > > On Sun, Aug 5, 2018 at 4:15 PM, Satish Patel wrote: >> Florian, >> >> It seems those rules coming from here >> https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum >> >> On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel wrote: >>> Yes this is openstack-ansible deployment tool which set them up. I am >>> wondering where are these rules saved? I believe openstack-ansible use >>> LXC container to deploy services so must be part of LXC startup >>> scripts. >>> >>> I have checked there is no firewalld and iptables service running on >>> system.. >>> >>> You think i should get rid of all CHEKSUM option in iptables rules? Am i >>> right? >>> >>> >>> On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal wrote: Satish Patel wrote: > > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM > > --checksum-fill > > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM > > --checksum-fill These rules make no sense to me, and are also source of your backtrace. Who set this up? If this is coming from openstack, I suggest asking openstack developers WTH this is supposed to do. > > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp > > --dport 68 -j CHECKSUM --checksum-fill This was needed to work around dhcpd issues w. checksum offloading but I guess that DCHCP will work fine without this rule too nowadays. So I suggest you simply get rid of these rules.
[PATCH net-next 04/13] bnxt_en: Add PHY retry logic.
During hotplug, the driver's open function can be called almost immediately after power on reset. The PHY may not be ready and the firmware may return failure when the driver tries to update PHY settings. Add retry logic fired from the driver's timer to retry the operation for 5 seconds. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 ++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d9fc905..fd936c5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6898,8 +6898,14 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) mutex_lock(&bp->link_lock); rc = bnxt_update_phy_setting(bp); mutex_unlock(&bp->link_lock); - if (rc) + if (rc) { netdev_warn(bp->dev, "failed to update phy settings\n"); + if (BNXT_SINGLE_PF(bp)) { + bp->link_info.phy_retry = true; + bp->link_info.phy_retry_expires = + jiffies + 5 * HZ; + } + } } if (irq_re_init) @@ -7583,6 +7589,16 @@ static void bnxt_timer(struct timer_list *t) set_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event); bnxt_queue_sp_work(bp); } + + if (bp->link_info.phy_retry) { + if (time_after(jiffies, bp->link_info.phy_retry_expires)) { + bp->link_info.phy_retry = 0; + netdev_warn(bp->dev, "failed to update phy settings after maximum retries.\n"); + } else { + set_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } + } bnxt_restart_timer: mod_timer(&bp->timer, jiffies + bp->current_interval); } @@ -7670,6 +7686,19 @@ static void bnxt_sp_task(struct work_struct *work) netdev_err(bp->dev, "SP task can't update link (rc: %x)\n", rc); } + if (test_and_clear_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event)) { + int rc; + + mutex_lock(&bp->link_lock); + rc = bnxt_update_phy_setting(bp); + mutex_unlock(&bp->link_lock); + if (rc) { + netdev_warn(bp->dev, "update phy settings retry failed\n"); + } else { + bp->link_info.phy_retry = false; + netdev_info(bp->dev, "update phy settings retry succeeded\n"); + } + } if (test_and_clear_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event)) { mutex_lock(&bp->link_lock); bnxt_get_port_module_status(bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 0d49fe0..47eec14 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -959,6 +959,9 @@ struct bnxt_link_info { u16 advertising;/* user adv setting */ boolforce_link_chng; + boolphy_retry; + unsigned long phy_retry_expires; + /* a copy of phy_qcfg output used to report link * info to VF */ @@ -1344,6 +1347,7 @@ struct bnxt { #define BNXT_GENEVE_DEL_PORT_SP_EVENT 13 #define BNXT_LINK_SPEED_CHNG_SP_EVENT 14 #define BNXT_FLOW_STATS_SP_EVENT 15 +#define BNXT_UPDATE_PHY_SP_EVENT 16 struct bnxt_hw_resc hw_resc; struct bnxt_pf_info pf; -- 2.5.1
[PATCH net-next 13/13] bnxt_en: Do not use the CNP CoS queue for networking traffic.
The CNP CoS queue is reserved for internal RDMA Congestion Notification Packets (CNP) and should not be used for a TC. Modify the CoS queue discovery code to skip over the CNP CoS queue and to reduce bp->max_tc accordingly. However, if RDMA is disabled in NVRAM, the the CNP CoS queue can be used for a TC. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 4 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dde904b..d7f51ab 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5281,7 +5281,8 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp) int rc = 0; struct hwrm_queue_qportcfg_input req = {0}; struct hwrm_queue_qportcfg_output *resp = bp->hwrm_cmd_resp_addr; - u8 i, *qptr; + u8 i, j, *qptr; + bool no_rdma; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_QPORTCFG, -1, -1); @@ -5299,19 +5300,24 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp) if (bp->max_tc > BNXT_MAX_QUEUE) bp->max_tc = BNXT_MAX_QUEUE; + no_rdma = !(bp->flags & BNXT_FLAG_ROCE_CAP); + qptr = &resp->queue_id0; + for (i = 0, j = 0; i < bp->max_tc; i++) { + bp->q_info[j].queue_id = *qptr++; + bp->q_info[j].queue_profile = *qptr++; + bp->tc_to_qidx[j] = j; + if (!BNXT_CNPQ(bp->q_info[j].queue_profile) || + (no_rdma && BNXT_PF(bp))) + j++; + } + bp->max_tc = max_t(u8, j, 1); + if (resp->queue_cfg_info & QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG) bp->max_tc = 1; if (bp->max_lltc > bp->max_tc) bp->max_lltc = bp->max_tc; - qptr = &resp->queue_id0; - for (i = 0; i < bp->max_tc; i++) { - bp->q_info[i].queue_id = *qptr++; - bp->q_info[i].queue_profile = *qptr++; - bp->tc_to_qidx[i] = i; - } - qportcfg_exit: mutex_unlock(&bp->hwrm_cmd_lock); return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h index c0e16c0..6eed231 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h @@ -43,6 +43,10 @@ struct bnxt_dscp2pri_entry { ((q_profile) == \ QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE) +#define BNXT_CNPQ(q_profile) \ + ((q_profile) == \ +QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY_ROCE_CNP) + #define HWRM_STRUCT_DATA_SUBTYPE_HOST_OPERATIONAL 0x0300 void bnxt_dcb_init(struct bnxt *bp); -- 2.5.1
[PATCH net-next 10/13] bnxt_en: Notify firmware about IF state changes.
Use latest firmware API to notify firmware about IF state changes. Firmware has the option to clean up resources during IF down and to require the driver to reserve resources again during IF up. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 53 +-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1659940..56bd097 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3638,7 +3638,9 @@ int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap, static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp) { + struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_func_drv_rgtr_input req = {0}; + int rc; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1); @@ -3676,7 +3678,15 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp) cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD); } - return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + else if (resp->flags & +cpu_to_le32(FUNC_DRV_RGTR_RESP_FLAGS_IF_CHANGE_SUPPORTED)) + bp->fw_cap |= BNXT_FW_CAP_IF_CHANGE; + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; } static int bnxt_hwrm_func_drv_unrgtr(struct bnxt *bp) @@ -6637,6 +6647,39 @@ static int bnxt_hwrm_shutdown_link(struct bnxt *bp) return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); } +static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) +{ + struct hwrm_func_drv_if_change_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_drv_if_change_input req = {0}; + bool resc_reinit = false; + int rc; + + if (!(bp->fw_cap & BNXT_FW_CAP_IF_CHANGE)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_IF_CHANGE, -1, -1); + if (up) + req.flags = cpu_to_le32(FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc && (resp->flags & + cpu_to_le32(FUNC_DRV_IF_CHANGE_RESP_FLAGS_RESC_CHANGE))) + resc_reinit = true; + mutex_unlock(&bp->hwrm_cmd_lock); + + if (up && resc_reinit && BNXT_NEW_RM(bp)) { + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + + rc = bnxt_hwrm_func_resc_qcaps(bp, true); + hw_resc->resv_cp_rings = 0; + hw_resc->resv_tx_rings = 0; + hw_resc->resv_rx_rings = 0; + hw_resc->resv_hw_ring_grps = 0; + hw_resc->resv_vnics = 0; + } + return rc; +} + static int bnxt_hwrm_port_led_qcaps(struct bnxt *bp) { struct hwrm_port_led_qcaps_output *resp = bp->hwrm_cmd_resp_addr; @@ -6991,8 +7034,13 @@ void bnxt_half_close_nic(struct bnxt *bp) static int bnxt_open(struct net_device *dev) { struct bnxt *bp = netdev_priv(dev); + int rc; - return __bnxt_open_nic(bp, true, true); + bnxt_hwrm_if_change(bp, true); + rc = __bnxt_open_nic(bp, true, true); + if (rc) + bnxt_hwrm_if_change(bp, false); + return rc; } static bool bnxt_drv_busy(struct bnxt *bp) @@ -7056,6 +7104,7 @@ static int bnxt_close(struct net_device *dev) bnxt_close_nic(bp, true, true); bnxt_hwrm_shutdown_link(bp); + bnxt_hwrm_if_change(bp, false); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index ded2aff..6c40b257 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1290,6 +1290,7 @@ struct bnxt { #define BNXT_FW_CAP_LLDP_AGENT 0x0002 #define BNXT_FW_CAP_DCBX_AGENT 0x0004 #define BNXT_FW_CAP_NEW_RM 0x0008 + #define BNXT_FW_CAP_IF_CHANGE 0x0010 #define BNXT_NEW_RM(bp)((bp)->fw_cap & BNXT_FW_CAP_NEW_RM) u32 hwrm_spec_code; -- 2.5.1
[PATCH net-next 06/13] bnxt_en: Update RSS setup and GRO-HW logic according to the latest spec.
Set the default hash mode flag in HWRM_VNIC_RSS_CFG to signal to the firmware that the driver is compliant with the latest spec. With that, the firmware can return expanded RSS profile IDs that the driver checks to setup the proper gso_type for GRO-HW packets. But instead of checking for the new profile IDs, we check the IP_TYPE flag in TPA_START which is more straight forward than checking a list of profile IDs. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e0e3b4b..1714850 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1115,7 +1115,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, tpa_info->hash_type = PKT_HASH_TYPE_L4; tpa_info->gso_type = SKB_GSO_TCPV4; /* RSS profiles 1 and 3 with extract code 0 for inner 4-tuple */ - if (hash_type == 3) + if (hash_type == 3 || TPA_START_IS_IPV6(tpa_start1)) tpa_info->gso_type = SKB_GSO_TCPV6; tpa_info->rss_hash = le32_to_cpu(tpa_start->rx_tpa_start_cmp_rss_hash); @@ -3981,6 +3981,7 @@ static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, u16 vnic_id, bool set_rss) bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_RSS_CFG, -1, -1); if (set_rss) { req.hash_type = cpu_to_le32(bp->rss_hash_cfg); + req.hash_mode_flags = VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_DEFAULT; if (vnic->flags & BNXT_VNIC_RSS_FLAG) { if (BNXT_CHIP_TYPE_NITRO_A0(bp)) max_rings = bp->rx_nr_rings - 1; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index b44a758..7ea022d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -326,6 +326,10 @@ struct rx_tpa_start_cmp_ext { ((le32_to_cpu((rx_tpa_start)->rx_tpa_start_cmp_cfa_code_v2) & \ RX_TPA_START_CMP_CFA_CODE) >> RX_TPA_START_CMPL_CFA_CODE_SHIFT) +#define TPA_START_IS_IPV6(rx_tpa_start)\ + (!!((rx_tpa_start)->rx_tpa_start_cmp_flags2 & \ + cpu_to_le32(RX_TPA_START_CMP_FLAGS2_IP_TYPE))) + struct rx_tpa_end_cmp { __le32 rx_tpa_end_cmp_len_flags_type; #define RX_TPA_END_CMP_TYPE (0x3f << 0) -- 2.5.1
[PATCH net-next 07/13] bnxt_en: Add support for ethtool get dump.
From: Vasundhara Volam Add support to collect live firmware coredump via ethtool. Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h | 66 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 333 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h | 37 +++ 3 files changed, 436 insertions(+) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h new file mode 100644 index 000..09c22f8 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h @@ -0,0 +1,66 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2018 Broadcom Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_COREDUMP_H +#define BNXT_COREDUMP_H + +struct bnxt_coredump_segment_hdr { + __u8 signature[4]; + __le32 component_id; + __le32 segment_id; + __le32 flags; + __u8 low_version; + __u8 high_version; + __le16 function_id; + __le32 offset; + __le32 length; + __le32 status; + __le32 duration; + __le32 data_offset; + __le32 instance; + __le32 rsvd[5]; +}; + +struct bnxt_coredump_record { + __u8 signature[4]; + __le32 flags; + __u8 low_version; + __u8 high_version; + __u8 asic_state; + __u8 rsvd0[5]; + char system_name[32]; + __le16 year; + __le16 month; + __le16 day; + __le16 hour; + __le16 minute; + __le16 second; + __le16 utc_bias; + __le16 rsvd1; + char commandline[256]; + __le32 total_segments; + __le32 os_ver_major; + __le32 os_ver_minor; + __le32 rsvd2; + char os_name[32]; + __le16 end_year; + __le16 end_month; + __le16 end_day; + __le16 end_hour; + __le16 end_minute; + __le16 end_second; + __le16 end_utc_bias; + __le32 asic_id1; + __le32 asic_id2; + __le32 coredump_status; + __u8 ioctl_low_version; + __u8 ioctl_high_version; + __le16 rsvd3[313]; +}; +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 9517633..3fc7c74 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -16,12 +16,15 @@ #include #include #include +#include +#include #include "bnxt_hsi.h" #include "bnxt.h" #include "bnxt_xdp.h" #include "bnxt_ethtool.h" #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ #include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ +#include "bnxt_coredump.h" #define FLASH_NVRAM_TIMEOUT((HWRM_CMD_TIMEOUT) * 100) #define FLASH_PACKAGE_TIMEOUT ((HWRM_CMD_TIMEOUT) * 200) #define INSTALL_PACKAGE_TIMEOUT((HWRM_CMD_TIMEOUT) * 200) @@ -2685,6 +2688,334 @@ static int bnxt_reset(struct net_device *dev, u32 *flags) return rc; } +static int bnxt_hwrm_dbg_dma_data(struct bnxt *bp, void *msg, int msg_len, + struct bnxt_hwrm_dbg_dma_info *info) +{ + struct hwrm_dbg_cmn_output *cmn_resp = bp->hwrm_cmd_resp_addr; + struct hwrm_dbg_cmn_input *cmn_req = msg; + __le16 *seq_ptr = msg + info->seq_off; + u16 seq = 0, len, segs_off; + void *resp = cmn_resp; + dma_addr_t dma_handle; + int rc, off = 0; + void *dma_buf; + + dma_buf = dma_alloc_coherent(&bp->pdev->dev, info->dma_len, &dma_handle, +GFP_KERNEL); + if (!dma_buf) + return -ENOMEM; + + segs_off = offsetof(struct hwrm_dbg_coredump_list_output, + total_segments); + cmn_req->host_dest_addr = cpu_to_le64(dma_handle); + cmn_req->host_buf_len = cpu_to_le32(info->dma_len); + mutex_lock(&bp->hwrm_cmd_lock); + while (1) { + *seq_ptr = cpu_to_le16(seq); + rc = _hwrm_send_message(bp, msg, msg_len, HWRM_CMD_TIMEOUT); + if (rc) + break; + + len = le16_to_cpu(*((__le16 *)(resp + info->data_len_off))); + if (!seq && + cmn_req->req_type == cpu_to_le16(HWRM_DBG_COREDUMP_LIST)) { + info->segs = le16_to_cpu(*((__le16 *)(resp + + segs_off))); + if (!info->segs) { + rc = -EIO; + break; + } + + info->dest_buf_size = info->segs * + sizeof(struct coredump_segme
[PATCH net-next 03/13] bnxt_en: Add external loopback test to ethtool selftest.
Add code to detect firmware support for external loopback and the extra test entry for external loopback. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 32 ++- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c612d74..d9fc905 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6337,6 +6337,10 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK; } + if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED) { + if (bp->test_info) + bp->test_info->flags |= BNXT_TEST_FL_EXT_LPBK; + } if (resp->supported_speeds_auto_mode) link_info->support_auto_speeds = le16_to_cpu(resp->supported_speeds_auto_mode); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3b5a55c..0d49fe0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -990,6 +990,8 @@ struct bnxt_led_info { struct bnxt_test_info { u8 offline_mask; + u8 flags; +#define BNXT_TEST_FL_EXT_LPBK 0x1 u16 timeout; char string[BNXT_MAX_TEST][ETH_GSTRING_LEN]; }; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 1f626af..9517633 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2397,7 +2397,7 @@ static int bnxt_disable_an_for_lpbk(struct bnxt *bp, return rc; } -static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable) +static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable, bool ext) { struct hwrm_port_phy_cfg_input req = {0}; @@ -2405,7 +2405,10 @@ static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable) if (enable) { bnxt_disable_an_for_lpbk(bp, &req); - req.lpbk = PORT_PHY_CFG_REQ_LPBK_LOCAL; + if (ext) + req.lpbk = PORT_PHY_CFG_REQ_LPBK_EXTERNAL; + else + req.lpbk = PORT_PHY_CFG_REQ_LPBK_LOCAL; } else { req.lpbk = PORT_PHY_CFG_REQ_LPBK_NONE; } @@ -2538,15 +2541,17 @@ static int bnxt_run_fw_tests(struct bnxt *bp, u8 test_mask, u8 *test_results) return rc; } -#define BNXT_DRV_TESTS 3 +#define BNXT_DRV_TESTS 4 #define BNXT_MACLPBK_TEST_IDX (bp->num_tests - BNXT_DRV_TESTS) #define BNXT_PHYLPBK_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 1) -#define BNXT_IRQ_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 2) +#define BNXT_EXTLPBK_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 2) +#define BNXT_IRQ_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 3) static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, u64 *buf) { struct bnxt *bp = netdev_priv(dev); + bool do_ext_lpbk = false; bool offline = false; u8 test_results = 0; u8 test_mask = 0; @@ -2560,6 +2565,10 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, return; } + if ((etest->flags & ETH_TEST_FL_EXTERNAL_LB) && + (bp->test_info->flags & BNXT_TEST_FL_EXT_LPBK)) + do_ext_lpbk = true; + if (etest->flags & ETH_TEST_FL_OFFLINE) { if (bp->pf.active_vfs) { etest->flags |= ETH_TEST_FL_FAILED; @@ -2600,13 +2609,22 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, buf[BNXT_MACLPBK_TEST_IDX] = 0; bnxt_hwrm_mac_loopback(bp, false); - bnxt_hwrm_phy_loopback(bp, true); + bnxt_hwrm_phy_loopback(bp, true, false); msleep(1000); if (bnxt_run_loopback(bp)) { buf[BNXT_PHYLPBK_TEST_IDX] = 1; etest->flags |= ETH_TEST_FL_FAILED; } - bnxt_hwrm_phy_loopback(bp, false); + if (do_ext_lpbk) { + etest->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; + bnxt_hwrm_phy_loopback(bp, true, true); + msleep(1000); + if (bnxt_run_loopback(bp)) { + buf[BNXT_EXTLPBK_TEST_IDX] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + }
[PATCH net-next 01/13] bnxt_en: Update firmware interface version to 1.9.2.25.
New interface has firmware core dump support, new extended port statistics, and IF state change notifications to the firmware. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.h |4 +- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c |8 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1227 +++-- 4 files changed, 924 insertions(+), 321 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 934aa11..3b5a55c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -12,11 +12,11 @@ #define BNXT_H #define DRV_MODULE_NAME"bnxt_en" -#define DRV_MODULE_VERSION "1.9.1" +#define DRV_MODULE_VERSION "1.9.2" #define DRV_VER_MAJ1 #define DRV_VER_MIN9 -#define DRV_VER_UPD1 +#define DRV_VER_UPD2 #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 7bd96ab..f3b9fbc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -29,7 +29,7 @@ static const struct bnxt_dl_nvm_param nvm_params[] = { static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, int msg_len, union devlink_param_value *val) { - struct hwrm_nvm_variable_input *req = msg; + struct hwrm_nvm_get_variable_input *req = msg; void *data_addr = NULL, *buf = NULL; struct bnxt_dl_nvm_param nvm_param; int bytesize, idx = 0, rc, i; @@ -60,18 +60,18 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, if (!data_addr) return -ENOMEM; - req->data_addr = cpu_to_le64(data_dma_addr); + req->dest_data_addr = cpu_to_le64(data_dma_addr); req->data_len = cpu_to_le16(nvm_param.num_bits); req->option_num = cpu_to_le16(nvm_param.offset); req->index_0 = cpu_to_le16(idx); if (idx) req->dimensions = cpu_to_le16(1); - if (req->req_type == HWRM_NVM_SET_VARIABLE) + if (req->req_type == cpu_to_le16(HWRM_NVM_SET_VARIABLE)) memcpy(data_addr, buf, bytesize); rc = hwrm_send_message(bp, msg, msg_len, HWRM_CMD_TIMEOUT); - if (!rc && req->req_type == HWRM_NVM_GET_VARIABLE) + if (!rc && req->req_type == cpu_to_le16(HWRM_NVM_GET_VARIABLE)) memcpy(buf, data_addr, bytesize); dma_free_coherent(&bp->pdev->dev, bytesize, data_addr, data_dma_addr); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 7270c8b..3d40e49 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -162,7 +162,7 @@ static const struct { BNXT_RX_STATS_ENTRY(rx_128b_255b_frames), BNXT_RX_STATS_ENTRY(rx_256b_511b_frames), BNXT_RX_STATS_ENTRY(rx_512b_1023b_frames), - BNXT_RX_STATS_ENTRY(rx_1024b_1518_frames), + BNXT_RX_STATS_ENTRY(rx_1024b_1518b_frames), BNXT_RX_STATS_ENTRY(rx_good_vlan_frames), BNXT_RX_STATS_ENTRY(rx_1519b_2047b_frames), BNXT_RX_STATS_ENTRY(rx_2048b_4095b_frames), @@ -205,9 +205,9 @@ static const struct { BNXT_TX_STATS_ENTRY(tx_128b_255b_frames), BNXT_TX_STATS_ENTRY(tx_256b_511b_frames), BNXT_TX_STATS_ENTRY(tx_512b_1023b_frames), - BNXT_TX_STATS_ENTRY(tx_1024b_1518_frames), + BNXT_TX_STATS_ENTRY(tx_1024b_1518b_frames), BNXT_TX_STATS_ENTRY(tx_good_vlan_frames), - BNXT_TX_STATS_ENTRY(tx_1519b_2047_frames), + BNXT_TX_STATS_ENTRY(tx_1519b_2047b_frames), BNXT_TX_STATS_ENTRY(tx_2048b_4095b_frames), BNXT_TX_STATS_ENTRY(tx_4096b_9216b_frames), BNXT_TX_STATS_ENTRY(tx_9217b_16383b_frames), diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index c75d7fa..971ace5d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -96,6 +96,7 @@ struct hwrm_short_input { struct cmd_nums { __le16 req_type; #define HWRM_VER_GET 0x0UL + #define HWRM_FUNC_DRV_IF_CHANGE 0xdUL #define HWRM_FUNC_BUF_UNRGTR 0xeUL #define HWRM_FUNC_VF_CFG 0xfUL #define HWRM_RESERVED10x10UL @@ -159,6 +160,7 @@ struct cmd_nums { #define HWRM_RING_FREE0x51UL #define HWRM_RING_CMPL_RING_QAGGINT_PARAMS0x52UL #define HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS 0x53UL + #define HWRM_RING_AGGINT_QCAPS0x54UL #define HWRM_RING_RESET
[PATCH net-next 12/13] bnxt_en: Add DCBNL DSCP application protocol support.
Expand the .ieee_setapp() and ieee_delapp() DCBNL methods to support DSCP. This allows DSCP values to user priority mappings instead of using VLAN priorities. Each DSCP mapping is added or deleted one entry at a time using the firmware API. The firmware call can only be made from a PF. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 83 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 6 ++ 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 006726c..fefa011 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1281,6 +1281,7 @@ struct bnxt { struct ieee_ets *ieee_ets; u8 dcbx_cap; u8 default_pri; + u8 max_dscp_value; #endif /* CONFIG_BNXT_DCB */ u32 msg_enable; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index 00dd26d..ddc98c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -385,6 +385,61 @@ static int bnxt_hwrm_set_dcbx_app(struct bnxt *bp, struct dcb_app *app, return rc; } +static int bnxt_hwrm_queue_dscp_qcaps(struct bnxt *bp) +{ + struct hwrm_queue_dscp_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_queue_dscp_qcaps_input req = {0}; + int rc; + + if (bp->hwrm_spec_code < 0x10800 || BNXT_VF(bp)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_DSCP_QCAPS, -1, -1); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + bp->max_dscp_value = (1 << resp->num_dscp_bits) - 1; + if (bp->max_dscp_value < 0x3f) + bp->max_dscp_value = 0; + } + + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_queue_dscp2pri_cfg(struct bnxt *bp, struct dcb_app *app, + bool add) +{ + struct hwrm_queue_dscp2pri_cfg_input req = {0}; + struct bnxt_dscp2pri_entry *dscp2pri; + dma_addr_t mapping; + int rc; + + if (bp->hwrm_spec_code < 0x10800) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_DSCP2PRI_CFG, -1, -1); + dscp2pri = dma_alloc_coherent(&bp->pdev->dev, sizeof(*dscp2pri), + &mapping, GFP_KERNEL); + if (!dscp2pri) + return -ENOMEM; + + req.src_data_addr = cpu_to_le64(mapping); + dscp2pri->dscp = app->protocol; + if (add) + dscp2pri->mask = 0x3f; + else + dscp2pri->mask = 0; + dscp2pri->pri = app->priority; + req.entry_cnt = cpu_to_le16(1); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + dma_free_coherent(&bp->pdev->dev, sizeof(*dscp2pri), dscp2pri, + mapping); + return rc; +} + static int bnxt_ets_validate(struct bnxt *bp, struct ieee_ets *ets, u8 *tc) { int total_ets_bw = 0; @@ -551,15 +606,30 @@ static int bnxt_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) return rc; } +static int bnxt_dcbnl_ieee_dscp_app_prep(struct bnxt *bp, struct dcb_app *app) +{ + if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP) { + if (!bp->max_dscp_value) + return -ENOTSUPP; + if (app->protocol > bp->max_dscp_value) + return -EINVAL; + } + return 0; +} + static int bnxt_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { struct bnxt *bp = netdev_priv(dev); - int rc = -EINVAL; + int rc; if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) return -EINVAL; + rc = bnxt_dcbnl_ieee_dscp_app_prep(bp, app); + if (rc) + return rc; + rc = dcb_ieee_setapp(dev, app); if (rc) return rc; @@ -570,6 +640,9 @@ static int bnxt_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) app->protocol == ROCE_V2_UDP_DPORT)) rc = bnxt_hwrm_set_dcbx_app(bp, app, true); + if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP) + rc = bnxt_hwrm_queue_dscp2pri_cfg(bp, app, true); + return rc; } @@ -582,6 +655,10 @@ static int bnxt_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) return -EINVAL; + rc = bnxt_dcbnl_ieee_dscp_app_prep(bp, app); +
[PATCH net-next 11/13] bnxt_en: Add hwmon sysfs support to read temperature
From: Vasundhara Volam Export temperature sensor reading via hwmon sysfs. Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/Kconfig | 8 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 62 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 3 files changed, 71 insertions(+) diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index b7aa8ad..c1d3ee9b 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -230,4 +230,12 @@ config BNXT_DCB If unsure, say N. +config BNXT_HWMON + bool "Broadcom NetXtreme-C/E HWMON support" + default y + depends on BNXT && HWMON && !(BNXT=y && HWMON=m) + ---help--- + Say Y if you want to expose the thermal sensor data on NetXtreme-C/E + devices, via the hwmon sysfs interface. + endif # NET_VENDOR_BROADCOM diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 56bd097..dde904b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include "bnxt_hsi.h" #include "bnxt.h" @@ -6789,6 +6791,62 @@ static void bnxt_get_wol_settings(struct bnxt *bp) } while (handle && handle != 0x); } +#ifdef CONFIG_BNXT_HWMON +static ssize_t bnxt_show_temp(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct hwrm_temp_monitor_query_input req = {0}; + struct hwrm_temp_monitor_query_output *resp; + struct bnxt *bp = dev_get_drvdata(dev); + u32 temp = 0; + + resp = bp->hwrm_cmd_resp_addr; + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_TEMP_MONITOR_QUERY, -1, -1); + mutex_lock(&bp->hwrm_cmd_lock); + if (!_hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT)) + temp = resp->temp * 1000; /* display millidegree */ + mutex_unlock(&bp->hwrm_cmd_lock); + + return sprintf(buf, "%u\n", temp); +} +static SENSOR_DEVICE_ATTR(temp1_input, 0444, bnxt_show_temp, NULL, 0); + +static struct attribute *bnxt_attrs[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(bnxt); + +static void bnxt_hwmon_close(struct bnxt *bp) +{ + if (bp->hwmon_dev) { + hwmon_device_unregister(bp->hwmon_dev); + bp->hwmon_dev = NULL; + } +} + +static void bnxt_hwmon_open(struct bnxt *bp) +{ + struct pci_dev *pdev = bp->pdev; + + bp->hwmon_dev = hwmon_device_register_with_groups(&pdev->dev, + DRV_MODULE_NAME, bp, + bnxt_groups); + if (IS_ERR(bp->hwmon_dev)) { + bp->hwmon_dev = NULL; + dev_warn(&pdev->dev, "Cannot register hwmon device\n"); + } +} +#else +static void bnxt_hwmon_close(struct bnxt *bp) +{ +} + +static void bnxt_hwmon_open(struct bnxt *bp) +{ +} +#endif + static bool bnxt_eee_config_ok(struct bnxt *bp) { struct ethtool_eee *eee = &bp->eee; @@ -7040,6 +7098,9 @@ static int bnxt_open(struct net_device *dev) rc = __bnxt_open_nic(bp, true, true); if (rc) bnxt_hwrm_if_change(bp, false); + + bnxt_hwmon_open(bp); + return rc; } @@ -7102,6 +7163,7 @@ static int bnxt_close(struct net_device *dev) { struct bnxt *bp = netdev_priv(dev); + bnxt_hwmon_close(bp); bnxt_close_nic(bp, true, true); bnxt_hwrm_shutdown_link(bp); bnxt_hwrm_if_change(bp, false); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 6c40b257..006726c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1411,6 +1411,7 @@ struct bnxt { struct bnxt_tc_info *tc_info; struct dentry *debugfs_pdev; struct dentry *debugfs_dim; + struct device *hwmon_dev; }; #define BNXT_RX_STATS_OFFSET(counter) \ -- 2.5.1
[PATCH net-next 00/13] bnxt_en: Updates for net-next.
This series includes the usual firmware spec update. The driver has added external phy loopback test and phy setup retry logic that is needed during hotplug. In the SRIOV space, the driver has added a new VF resource allocation mode that requires the VF driver to reserve resources during IFUP. IF state changes are now propagated to firmware so that firmware can release some resources during IFDOWN. ethtool method to get firmware core dump and hwmon temperature reading have been added. DSCP to user priority support has been added to the driver's DCBNL interface, and the CoS queue logic has been refined to make sure that the special RDMA Congestion Notification hardware CoS queue will not be used for networking traffic. Michael Chan (11): bnxt_en: Update firmware interface version to 1.9.2.25. bnxt_en: Adjust timer based on ethtool stats-block-usecs settings. bnxt_en: Add external loopback test to ethtool selftest. bnxt_en: Add PHY retry logic. bnxt_en: Add new VF resource allocation strategy mode. bnxt_en: Update RSS setup and GRO-HW logic according to the latest spec. bnxt_en: Add BNXT_NEW_RM() macro. bnxt_en: Move firmware related flags to a new fw_cap field in struct bnxt. bnxt_en: Notify firmware about IF state changes. bnxt_en: Add DCBNL DSCP application protocol support. bnxt_en: Do not use the CNP CoS queue for networking traffic. Vasundhara Volam (2): bnxt_en: Add support for ethtool get dump. bnxt_en: Add hwmon sysfs support to read temperature drivers/net/ethernet/broadcom/Kconfig |8 + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 216 +++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 30 +- drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h | 66 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 89 +- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 10 + drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c |8 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 378 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h | 37 + drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1227 +++- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c| 25 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c |4 +- 12 files changed, 1716 insertions(+), 382 deletions(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h -- 2.5.1
[PATCH net-next 08/13] bnxt_en: Add BNXT_NEW_RM() macro.
The BNXT_FLAG_NEW_RM flag is checked a lot in the code to determine if the new resource manager is in effect. Define a macro to perform this check. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 4 ++-- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1714850..5c9ee3c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4579,7 +4579,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp) } hw_resc->resv_tx_rings = le16_to_cpu(resp->alloc_tx_rings); - if (bp->flags & BNXT_FLAG_NEW_RM) { + if (BNXT_NEW_RM(bp)) { u16 cp, stats; hw_resc->resv_rx_rings = le16_to_cpu(resp->alloc_rx_rings); @@ -4625,7 +4625,7 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct hwrm_func_cfg_input *req, req->fid = cpu_to_le16(0x); enables |= tx_rings ? FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS : 0; req->num_tx_rings = cpu_to_le16(tx_rings); - if (bp->flags & BNXT_FLAG_NEW_RM) { + if (BNXT_NEW_RM(bp)) { enables |= rx_rings ? FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS : 0; enables |= cp_rings ? FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS | FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; @@ -4698,7 +4698,7 @@ bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings, struct hwrm_func_vf_cfg_input req = {0}; int rc; - if (!(bp->flags & BNXT_FLAG_NEW_RM)) { + if (!BNXT_NEW_RM(bp)) { bp->hw_resc.resv_tx_rings = tx_rings; return 0; } @@ -4758,7 +4758,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp) vnic = rx + 1; if (bp->flags & BNXT_FLAG_AGG_RINGS) rx <<= 1; - if ((bp->flags & BNXT_FLAG_NEW_RM) && + if (BNXT_NEW_RM(bp) && (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || hw_resc->resv_hw_ring_grps != grp || hw_resc->resv_vnics != vnic)) return true; @@ -4794,7 +4794,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) return rc; tx = hw_resc->resv_tx_rings; - if (bp->flags & BNXT_FLAG_NEW_RM) { + if (BNXT_NEW_RM(bp)) { rx = hw_resc->resv_rx_rings; cp = hw_resc->resv_cp_rings; grp = hw_resc->resv_hw_ring_grps; @@ -4838,7 +4838,7 @@ static int bnxt_hwrm_check_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings, u32 flags; int rc; - if (!(bp->flags & BNXT_FLAG_NEW_RM)) + if (!BNXT_NEW_RM(bp)) return 0; __bnxt_hwrm_reserve_vf_rings(bp, &req, tx_rings, rx_rings, ring_grps, @@ -4867,7 +4867,7 @@ static int bnxt_hwrm_check_pf_rings(struct bnxt *bp, int tx_rings, int rx_rings, __bnxt_hwrm_reserve_pf_rings(bp, &req, tx_rings, rx_rings, ring_grps, cp_rings, vnics); flags = FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST; - if (bp->flags & BNXT_FLAG_NEW_RM) + if (BNXT_NEW_RM(bp)) flags |= FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST | FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST | FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST | @@ -5921,7 +5921,7 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num) max_idx = min_t(int, bp->total_irqs, max_cp); avail_msix = max_idx - bp->cp_nr_rings; - if (!(bp->flags & BNXT_FLAG_NEW_RM) || avail_msix >= num) + if (!BNXT_NEW_RM(bp) || avail_msix >= num) return avail_msix; if (max_irq < total_req) { @@ -5934,7 +5934,7 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num) static int bnxt_get_num_msix(struct bnxt *bp) { - if (!(bp->flags & BNXT_FLAG_NEW_RM)) + if (!BNXT_NEW_RM(bp)) return bnxt_get_max_func_irqs(bp); return bnxt_cp_rings_in_use(bp); @@ -6057,8 +6057,7 @@ int bnxt_reserve_rings(struct bnxt *bp) netdev_err(bp->dev, "ring reservation failure rc: %d\n", rc); return rc; } - if ((bp->flags & BNXT_FLAG_NEW_RM) && - (bnxt_get_num_msix(bp) != bp->total_irqs)) { + if (BNXT_NEW_RM(bp) && (bnxt_get_num_msix(bp) != bp->total_irqs)) { bnxt_ulp_irq_stop(bp); bnxt_clear_int_mode(bp); rc = bnxt_init_int_mode(bp); @@ -7306,7 +7305,7 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp) static bool bnxt_can_reserve_rings(struct bnxt *bp) { #ifdef CONFIG_BNXT_SRIOV - if ((bp->flags & BNXT_FLAG_
[PATCH net-next 09/13] bnxt_en: Move firmware related flags to a new fw_cap field in struct bnxt.
The flags field is almost getting full. Move firmware capability flags to a new fw_cap field to better organize these firmware flags. Signed-off-by: Michael Chan --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 13 +++-- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 6 +++--- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 5c9ee3c..1659940 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3445,7 +3445,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, cp_ring_id = le16_to_cpu(req->cmpl_ring); intr_process = (cp_ring_id == INVALID_HW_RING_ID) ? 0 : 1; - if (bp->flags & BNXT_FLAG_SHORT_CMD) { + if (bp->fw_cap & BNXT_FW_CAP_SHORT_CMD) { void *short_cmd_req = bp->hwrm_short_cmd_req_addr; memcpy(short_cmd_req, req, msg_len); @@ -5089,9 +5089,9 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp) flags = le16_to_cpu(resp->flags); if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED | FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) { - bp->flags |= BNXT_FLAG_FW_LLDP_AGENT; + bp->fw_cap |= BNXT_FW_CAP_LLDP_AGENT; if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED) - bp->flags |= BNXT_FLAG_FW_DCBX_AGENT; + bp->fw_cap |= BNXT_FW_CAP_DCBX_AGENT; } if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)) bp->flags |= BNXT_FLAG_MULTI_HOST; @@ -5249,7 +5249,7 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp) if (bp->hwrm_spec_code >= 0x10803) { rc = bnxt_hwrm_func_resc_qcaps(bp, true); if (!rc) - bp->flags |= BNXT_FLAG_NEW_RM; + bp->fw_cap |= BNXT_FW_CAP_NEW_RM; } return 0; } @@ -5352,7 +5352,7 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) dev_caps_cfg = le32_to_cpu(resp->dev_caps_cfg); if ((dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED) && (dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED)) - bp->flags |= BNXT_FLAG_SHORT_CMD; + bp->fw_cap |= BNXT_FW_CAP_SHORT_CMD; hwrm_ver_get_exit: mutex_unlock(&bp->hwrm_cmd_lock); @@ -8760,7 +8760,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) goto init_err_pci_clean; - if (bp->flags & BNXT_FLAG_SHORT_CMD) { + if (bp->fw_cap & BNXT_FW_CAP_SHORT_CMD) { rc = bnxt_alloc_hwrm_short_cmd_req(bp); if (rc) goto init_err_pci_clean; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 37dc896..ded2aff 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1144,7 +1144,6 @@ struct bnxt { atomic_tintr_sem; u32 flags; - #define BNXT_FLAG_DCB_ENABLED 0x1 #define BNXT_FLAG_VF0x2 #define BNXT_FLAG_LRO 0x4 #ifdef CONFIG_INET @@ -1173,15 +1172,11 @@ struct bnxt { BNXT_FLAG_ROCEV2_CAP) #define BNXT_FLAG_NO_AGG_RINGS 0x2 #define BNXT_FLAG_RX_PAGE_MODE 0x4 - #define BNXT_FLAG_FW_LLDP_AGENT 0x8 #define BNXT_FLAG_MULTI_HOST0x10 - #define BNXT_FLAG_SHORT_CMD 0x20 #define BNXT_FLAG_DOUBLE_DB 0x40 - #define BNXT_FLAG_FW_DCBX_AGENT 0x80 #define BNXT_FLAG_CHIP_NITRO_A0 0x100 #define BNXT_FLAG_DIM 0x200 #define BNXT_FLAG_ROCE_MIRROR_CAP 0x400 - #define BNXT_FLAG_NEW_RM0x800 #define BNXT_FLAG_PORT_STATS_EXT0x1000 #define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \ @@ -1195,7 +1190,6 @@ struct bnxt { #define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp)) #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0) #define BNXT_RX_PAGE_MODE(bp) ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE) -#define BNXT_NEW_RM(bp)((bp)->flags & BNXT_FLAG_NEW_RM) /* Chip class phase 4 and later */ #define BNXT_CHIP_P4_PLUS(bp) \ @@ -1291,6 +1285,13 @@ struct bnxt { u32 msg_enable; + u32 fw_cap; + #define BNXT_FW_CAP_SHORT_CMD 0x0001 + #define BNXT_FW_CAP_LLDP_AGENT 0x0002 + #define BNXT_FW_CAP_DCBX_AGENT 0x0004 + #define BNXT_FW_CAP_NEW_RM 0x0008 + +#define BNXT_NEW_RM(bp)
Re: Linux kernel error stack
Florian, I have removed those port 80 CHECKSUM rules and everything looks good i didn't see kernel WARN mesg. Thank you so much! You just nailed it :) On Sun, Aug 5, 2018 at 4:15 PM, Satish Patel wrote: > Florian, > > It seems those rules coming from here > https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum > > On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel wrote: >> Yes this is openstack-ansible deployment tool which set them up. I am >> wondering where are these rules saved? I believe openstack-ansible use >> LXC container to deploy services so must be part of LXC startup >> scripts. >> >> I have checked there is no firewalld and iptables service running on system.. >> >> You think i should get rid of all CHEKSUM option in iptables rules? Am i >> right? >> >> >> On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal wrote: >>> Satish Patel wrote: > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM > --checksum-fill > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM > --checksum-fill >>> >>> These rules make no sense to me, and are also source of your backtrace. >>> Who set this up? >>> >>> If this is coming from openstack, I suggest asking openstack developers >>> WTH this is supposed to do. >>> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp > --dport 68 -j CHECKSUM --checksum-fill >>> >>> This was needed to work around dhcpd issues w. checksum offloading but I >>> guess that DCHCP will work fine without this rule too nowadays. >>> >>> So I suggest you simply get rid of these rules.
Re: Linux kernel error stack
Florian, It seems those rules coming from here https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel wrote: > Yes this is openstack-ansible deployment tool which set them up. I am > wondering where are these rules saved? I believe openstack-ansible use > LXC container to deploy services so must be part of LXC startup > scripts. > > I have checked there is no firewalld and iptables service running on system.. > > You think i should get rid of all CHEKSUM option in iptables rules? Am i > right? > > > On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal wrote: >> Satish Patel wrote: >>> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM >>> > --checksum-fill >>> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM >>> > --checksum-fill >> >> These rules make no sense to me, and are also source of your backtrace. >> Who set this up? >> >> If this is coming from openstack, I suggest asking openstack developers >> WTH this is supposed to do. >> >>> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp >>> > --dport 68 -j CHECKSUM --checksum-fill >> >> This was needed to work around dhcpd issues w. checksum offloading but I >> guess that DCHCP will work fine without this rule too nowadays. >> >> So I suggest you simply get rid of these rules.
Re: Linux kernel error stack
Yes this is openstack-ansible deployment tool which set them up. I am wondering where are these rules saved? I believe openstack-ansible use LXC container to deploy services so must be part of LXC startup scripts. I have checked there is no firewalld and iptables service running on system.. You think i should get rid of all CHEKSUM option in iptables rules? Am i right? On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal wrote: > Satish Patel wrote: >> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM >> > --checksum-fill >> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM >> > --checksum-fill > > These rules make no sense to me, and are also source of your backtrace. > Who set this up? > > If this is coming from openstack, I suggest asking openstack developers > WTH this is supposed to do. > >> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp >> > --dport 68 -j CHECKSUM --checksum-fill > > This was needed to work around dhcpd issues w. checksum offloading but I > guess that DCHCP will work fine without this rule too nowadays. > > So I suggest you simply get rid of these rules.
Re: Linux kernel error stack
Satish Patel wrote: > > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM > > --checksum-fill > > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM > > --checksum-fill These rules make no sense to me, and are also source of your backtrace. Who set this up? If this is coming from openstack, I suggest asking openstack developers WTH this is supposed to do. > > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp > > --dport 68 -j CHECKSUM --checksum-fill This was needed to work around dhcpd issues w. checksum offloading but I guess that DCHCP will work fine without this rule too nowadays. So I suggest you simply get rid of these rules.
Re: Linux kernel error stack
I am configured where are those rules stored why iptables -L -n -t nat not showing them? On Sun, Aug 5, 2018 at 3:56 PM, Satish Patel wrote: > Oh wait!! I think you are right there i didn't notice that iptables > has following setting. > > "CHECKSUM --checksum-fill" > > On Sun, Aug 5, 2018 at 3:54 PM, Satish Patel wrote: >> I have following kernel logging set. >> >> [root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk >> 3 4 1 3 >> >> >> iptables output >> >> [root@ostack-infra-02 tools]# iptables-save -c >> # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 >> *raw >> :PREROUTING ACCEPT [42284573:38782693391] >> :OUTPUT ACCEPT [219668:42675751] >> COMMIT >> # Completed on Sun Aug 5 15:52:56 2018 >> # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 >> *nat >> :PREROUTING ACCEPT [656413:38450886] >> :INPUT ACCEPT [5524:564390] >> :OUTPUT ACCEPT [3217:212903] >> :POSTROUTING ACCEPT [559058:34193539] >> [5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE >> COMMIT >> # Completed on Sun Aug 5 15:52:56 2018 >> # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 >> *mangle >> :PREROUTING ACCEPT [42209568:38687818558] >> :INPUT ACCEPT [170634:197796840] >> :FORWARD ACCEPT [42939243:38526139954] >> :OUTPUT ACCEPT [198956:41288636] >> :POSTROUTING ACCEPT [43138199:38567428590] >> [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM >> --checksum-fill >> [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM >> --checksum-fill >> [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp >> --dport 68 -j CHECKSUM --checksum-fill >> COMMIT >> # Completed on Sun Aug 5 15:52:56 2018 >> # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 >> *filter >> :INPUT ACCEPT [164954:197217701] >> :FORWARD ACCEPT [42801701:38377075499] >> :OUTPUT ACCEPT [198963:41289860] >> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT >> [4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT >> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT >> [755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT >> [80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT >> [56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT >> COMMIT >> # Completed on Sun Aug 5 15:52:56 2018 >> >> On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal wrote: >>> Satish Patel wrote: Thanks Florian, FYI, I don't have any CHECKSUM configure in my iptables, >>> >>> You have, according to WARN stacktrace you provided. >>> >>> iptables-save -c >>> ip6tables-save -c
Re: Linux kernel error stack
Oh wait!! I think you are right there i didn't notice that iptables has following setting. "CHECKSUM --checksum-fill" On Sun, Aug 5, 2018 at 3:54 PM, Satish Patel wrote: > I have following kernel logging set. > > [root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk > 3 4 1 3 > > > iptables output > > [root@ostack-infra-02 tools]# iptables-save -c > # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 > *raw > :PREROUTING ACCEPT [42284573:38782693391] > :OUTPUT ACCEPT [219668:42675751] > COMMIT > # Completed on Sun Aug 5 15:52:56 2018 > # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 > *nat > :PREROUTING ACCEPT [656413:38450886] > :INPUT ACCEPT [5524:564390] > :OUTPUT ACCEPT [3217:212903] > :POSTROUTING ACCEPT [559058:34193539] > [5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE > COMMIT > # Completed on Sun Aug 5 15:52:56 2018 > # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 > *mangle > :PREROUTING ACCEPT [42209568:38687818558] > :INPUT ACCEPT [170634:197796840] > :FORWARD ACCEPT [42939243:38526139954] > :OUTPUT ACCEPT [198956:41288636] > :POSTROUTING ACCEPT [43138199:38567428590] > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM > --checksum-fill > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM > --checksum-fill > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp > --dport 68 -j CHECKSUM --checksum-fill > COMMIT > # Completed on Sun Aug 5 15:52:56 2018 > # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 > *filter > :INPUT ACCEPT [164954:197217701] > :FORWARD ACCEPT [42801701:38377075499] > :OUTPUT ACCEPT [198963:41289860] > [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT > [4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT > [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT > [755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT > [80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT > [56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT > COMMIT > # Completed on Sun Aug 5 15:52:56 2018 > > On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal wrote: >> Satish Patel wrote: >>> Thanks Florian, >>> >>> FYI, I don't have any CHECKSUM configure in my iptables, >> >> You have, according to WARN stacktrace you provided. >> >> iptables-save -c >> ip6tables-save -c
Re: Linux kernel error stack
I have following kernel logging set. [root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk 3 4 1 3 iptables output [root@ostack-infra-02 tools]# iptables-save -c # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 *raw :PREROUTING ACCEPT [42284573:38782693391] :OUTPUT ACCEPT [219668:42675751] COMMIT # Completed on Sun Aug 5 15:52:56 2018 # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 *nat :PREROUTING ACCEPT [656413:38450886] :INPUT ACCEPT [5524:564390] :OUTPUT ACCEPT [3217:212903] :POSTROUTING ACCEPT [559058:34193539] [5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE COMMIT # Completed on Sun Aug 5 15:52:56 2018 # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 *mangle :PREROUTING ACCEPT [42209568:38687818558] :INPUT ACCEPT [170634:197796840] :FORWARD ACCEPT [42939243:38526139954] :OUTPUT ACCEPT [198956:41288636] :POSTROUTING ACCEPT [43138199:38567428590] [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM --checksum-fill [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM --checksum-fill [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp --dport 68 -j CHECKSUM --checksum-fill COMMIT # Completed on Sun Aug 5 15:52:56 2018 # Generated by iptables-save v1.4.21 on Sun Aug 5 15:52:56 2018 *filter :INPUT ACCEPT [164954:197217701] :FORWARD ACCEPT [42801701:38377075499] :OUTPUT ACCEPT [198963:41289860] [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT [4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT [755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT [80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT [56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT COMMIT # Completed on Sun Aug 5 15:52:56 2018 On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal wrote: > Satish Patel wrote: >> Thanks Florian, >> >> FYI, I don't have any CHECKSUM configure in my iptables, > > You have, according to WARN stacktrace you provided. > > iptables-save -c > ip6tables-save -c
[PATCH net-next] net: report netlink extack only if set
From: Willem de Bruijn Initialize extack in dev_set_mtu and report only if set. Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack") Reported-by: syzbot Signed-off-by: Willem de Bruijn --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 36e994519488e..1d0122a766019 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7580,11 +7580,11 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, int dev_set_mtu(struct net_device *dev, int new_mtu) { - struct netlink_ext_ack extack; + struct netlink_ext_ack extack = {}; int err; err = dev_set_mtu_ext(dev, new_mtu, &extack); - if (err) + if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -- 2.18.0.597.ga71716f1ad-goog
Re: Linux kernel error stack
Satish Patel wrote: > Thanks Florian, > > FYI, I don't have any CHECKSUM configure in my iptables, You have, according to WARN stacktrace you provided. iptables-save -c ip6tables-save -c
[PATCH net-next] tc-testing: remove duplicate spaces in skbedit match patterns
Match patterns for some skbedit tests contain duplicate whitespace that is not present in actual tc output. This causes tests to fail because they can't match required action, even when it was successfully created. Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim --- .../tc-testing/tc-tests/actions/skbedit.json | 26 +++--- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json index 37ecc2716fee..5aaf593b914a 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json @@ -17,7 +17,7 @@ "cmdUnderTest": "$TC actions add action skbedit mark 1", "expExitCode": "0", "verifyCmd": "$TC actions list action skbedit", -"matchPattern": "action order [0-9]*: skbedit mark 1", +"matchPattern": "action order [0-9]*: skbedit mark 1", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -65,7 +65,7 @@ "cmdUnderTest": "$TC actions add action skbedit prio 99", "expExitCode": "0", "verifyCmd": "$TC actions list action skbedit", -"matchPattern": "action order [0-9]*: skbedit priority :99", +"matchPattern": "action order [0-9]*: skbedit priority :99", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -113,7 +113,7 @@ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 909", "expExitCode": "0", "verifyCmd": "$TC actions list action skbedit", -"matchPattern": "action order [0-9]*: skbedit queue_mapping 909", +"matchPattern": "action order [0-9]*: skbedit queue_mapping 909", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -161,7 +161,7 @@ "cmdUnderTest": "$TC actions add action skbedit ptype host", "expExitCode": "0", "verifyCmd": "$TC actions list action skbedit", -"matchPattern": "action order [0-9]*: skbedit ptype host", +"matchPattern": "action order [0-9]*: skbedit ptype host", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -185,7 +185,7 @@ "cmdUnderTest": "$TC actions add action skbedit ptype otherhost", "expExitCode": "0", "verifyCmd": "$TC actions list action skbedit", -"matchPattern": "action order [0-9]*: skbedit ptype otherhost", +"matchPattern": "action order [0-9]*: skbedit ptype otherhost", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -233,7 +233,7 @@ "cmdUnderTest": "$TC actions add action skbedit ptype host pipe index 11", "expExitCode": "0", "verifyCmd": "$TC actions get action skbedit index 11", -"matchPattern": "action order [0-9]*: skbedit ptype host pipe.*index 11 ref", +"matchPattern": "action order [0-9]*: skbedit ptype host pipe.*index 11 ref", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -257,7 +257,7 @@ "cmdUnderTest": "$TC actions add action skbedit mark 56789 reclassify index 90", "expExitCode": "0", "verifyCmd": "$TC actions get action skbedit index 90", -"matchPattern": "action order [0-9]*: skbedit mark 56789 reclassify.*index 90 ref", +"matchPattern": "action order [0-9]*: skbedit mark 56789 reclassify.*index 90 ref", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -281,7 +281,7 @@ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 pass index 271", "expExitCode": "0", "verifyCmd": "$TC actions get action skbedit index 271", -"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 pass.*index 271 ref", +"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 pass.*index 271 ref", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -305,7 +305,7 @@ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 drop index 271", "expExitCode": "0", "verifyCmd": "$TC actions get action skbedit index 271", -"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 drop.*index 271 ref", +"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 drop.*index 271 ref", "matchCount": "1", "teardown": [ "$TC actions flush action skbedit" @@ -329,7 +329,7 @@ "cmdUnderTest": "$TC actions add action skbedit priority 8 jump 9 index 2", "expExitCode": "0", "verifyCmd": "$TC actions get action skbedit index 2", -"matchPatter
[PATCH net-next] tc-testing: remove duplicate spaces in connmark match patterns
Match patterns for some connmark tests contain duplicate whitespace that is not present in actual tc output. This causes tests to fail because they can't match required action, even when it was successfully created. Fixes: 1dad0f97 ("tc-testing: add connmark action tests") Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim --- .../tc-testing/tc-tests/actions/connmark.json | 24 +++--- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json index 70952bd98ff9..13147a1f5731 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json @@ -17,7 +17,7 @@ "cmdUnderTest": "$TC actions add action connmark", "expExitCode": "0", "verifyCmd": "$TC actions list action connmark", -"matchPattern": "action order [0-9]+: connmark zone 0 pipe", +"matchPattern": "action order [0-9]+: connmark zone 0 pipe", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -41,7 +41,7 @@ "cmdUnderTest": "$TC actions add action connmark pass index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action connmark index 1", -"matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 1 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 1 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -65,7 +65,7 @@ "cmdUnderTest": "$TC actions add action connmark drop index 100", "expExitCode": "0", "verifyCmd": "$TC actions get action connmark index 100", -"matchPattern": "action order [0-9]+: connmark zone 0 drop.*index 100 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 drop.*index 100 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -89,7 +89,7 @@ "cmdUnderTest": "$TC actions add action connmark pipe index 455", "expExitCode": "0", "verifyCmd": "$TC actions get action connmark index 455", -"matchPattern": "action order [0-9]+: connmark zone 0 pipe.*index 455 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 pipe.*index 455 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -113,7 +113,7 @@ "cmdUnderTest": "$TC actions add action connmark reclassify index 7", "expExitCode": "0", "verifyCmd": "$TC actions list action connmark", -"matchPattern": "action order [0-9]+: connmark zone 0 reclassify.*index 7 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 reclassify.*index 7 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -137,7 +137,7 @@ "cmdUnderTest": "$TC actions add action connmark continue index 17", "expExitCode": "0", "verifyCmd": "$TC actions list action connmark", -"matchPattern": "action order [0-9]+: connmark zone 0 continue.*index 17 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 continue.*index 17 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -161,7 +161,7 @@ "cmdUnderTest": "$TC actions add action connmark jump 10 index 17", "expExitCode": "0", "verifyCmd": "$TC actions list action connmark", -"matchPattern": "action order [0-9]+: connmark zone 0 jump 10.*index 17 ref", +"matchPattern": "action order [0-9]+: connmark zone 0 jump 10.*index 17 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -185,7 +185,7 @@ "cmdUnderTest": "$TC actions add action connmark zone 100 pipe index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action connmark index 1", -"matchPattern": "action order [0-9]+: connmark zone 100 pipe.*index 1 ref", +"matchPattern": "action order [0-9]+: connmark zone 100 pipe.*index 1 ref", "matchCount": "1", "teardown": [ "$TC actions flush action connmark" @@ -209,7 +209,7 @@ "cmdUnderTest": "$TC actions add action connmark zone 65536 reclassify index 21", "expExitCode": "255", "verifyCmd": "$TC actions get action connmark index 1", -"matchPattern": "action order [0-9]+: connmark zone 65536 reclassify.*index 21 ref", +"matchPattern": "action order [0-9]+: connmark zone 65536 reclassify.*index 21 ref", "matchCount": "0", "teardown": [ "$TC actions flush action connmark" @@ -233,7 +233,7 @@ "cmdUnderTest": "$TC a
[PATCH net-next] tc-testing: flush gact actions on test teardown
Test 6fb4 creates one mirred and one pipe action, but only flushes mirred on teardown. Leaking pipe action causes failures in other tests. Add additional teardown command to also flush gact actions. Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim --- tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json index 6e4edfae1799..db49fd0f8445 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json @@ -44,7 +44,8 @@ "matchPattern": "action order [0-9]*: mirred \\(Egress Redirect to device lo\\).*index 2 ref", "matchCount": "1", "teardown": [ -"$TC actions flush action mirred" +"$TC actions flush action mirred", +"$TC actions flush action gact" ] }, { -- 2.7.5
[PATCH net-next] tc-testing: fix ip address in u32 test
Fix expected ip address to actually match configured ip address. Fix test to expect single matched filter. Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim --- tools/testing/selftests/tc-testing/tc-tests/filters/tests.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index 5fa02d86b35f..99a5ffca1088 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -12,8 +12,8 @@ "cmdUnderTest": "$TC filter add dev $DEV1 parent : protocol ip prio 1 u32 match ip src 127.0.0.1/32 flowid 1:1 action ok", "expExitCode": "0", "verifyCmd": "$TC filter show dev $DEV1 parent :", -"matchPattern": "match 7f02/ at 12", -"matchCount": "0", +"matchPattern": "match 7f01/ at 12", +"matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" ] -- 2.7.5
[RFC] weirdness in cxgb3_main.c:init_tp_parity()
for (i = 0; i < 2048; i++) { ... req->l2t_idx = htonl(V_L2T_W_IDX(i)); ... in there is very odd; l2t_idx is a 16bit field, and #define V_L2T_W_IDX(x) ((x) << S_L2T_W_IDX) #define S_L2T_W_IDX0 IOW, we are taking htonl(something in range 0..2047) and shove it into 16bit field. Which would, on a little-endian host, be a fancy way of spelling req->l2t_idx = 0; What's the intended behaviour there? I'm not familiar with the hardware in question; this smells like a typoed req->l2t_idx = htons(...) but how does the current code manage to work (i.e. does anything even care about the value stored there)? It's not a big-endian-only driver, after all...
[endianness bug] cxgb4: mk_act_open_req() buggers ->{local,peer}_ip on big-endian hosts
Unlike fs.val.lport and fs.val.fport, cxgb4_process_flow_match() sets fs.val.{l,f}ip to net-endian values without conversion - they come straight from flow_dissector_key_ipv4_addrs ->dst and ->src resp. So the assignment in mk_act_open_req() ought to be a straigh copy. As far as I know, T4 PCIe cards do exist, so it's not as if that thing could only be found on little-endian systems... Signed-off-by: Al Viro --- diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 00fc5f1afb1d..7dddb9e748b8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -1038,10 +1038,8 @@ static void mk_act_open_req(struct filter_entry *f, struct sk_buff *skb, OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_filterid)); req->local_port = cpu_to_be16(f->fs.val.lport); req->peer_port = cpu_to_be16(f->fs.val.fport); - req->local_ip = f->fs.val.lip[0] | f->fs.val.lip[1] << 8 | - f->fs.val.lip[2] << 16 | f->fs.val.lip[3] << 24; - req->peer_ip = f->fs.val.fip[0] | f->fs.val.fip[1] << 8 | - f->fs.val.fip[2] << 16 | f->fs.val.fip[3] << 24; + memcpy(&req->local_ip, f->fs.val.lip, 4); + memcpy(&req->peer_ip, f->fs.val.fip, 4); req->opt0 = cpu_to_be64(NAGLE_V(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | DELACK_V(f->fs.hitcnts) |
Re: [endianness bug?] cxgb4_next_header .match_val/.match_mask should be net-endian
On Sun, Aug 05, 2018 at 04:28:11PM +0100, Al Viro wrote: > On little-endian host those do yield the right values - e.g. 0x1100 is > {0, 17, 0, 0}, etc. On big-endian, though, these will end up checking > in IPv4 case the octet at offset 10 (i.e. upper 16 bits of checksum) and for > IPv6 > - the octet at offset 5 (i.e. the lower 8 bits of payload length). > > Unless I'm misreading that code, it needs the following to do the right > thing both on l-e and b-e. Comments? ... and it looks like the same story with ->mask - it's compared to ->offmask, which is __be16. For little-endian hosts the values make sense (htons(0x0f00), with offoff 0 and shift 6, i.e. "take the first two octets, treat them as net-endian, clear everything except IHL bits and shift down by 6, which'd yield IHL*4"), for big-endian they don't - you'd get TOS * 4 instead...
[endianness bug?] cxgb4_next_header .match_val/.match_mask should be net-endian
AFAICS, cxgb4_next_header() expects to find match_val/match_mask in struct cxgb4_next_header stored as big-endian: /* Found a possible candidate. Find a key that * matches the corresponding offset, value, and * mask to jump to next header. */ for (j = 0; j < cls->knode.sel->nkeys; j++) { off = cls->knode.sel->keys[j].off; val = cls->knode.sel->keys[j].val; mask = cls->knode.sel->keys[j].mask; if (next[i].match_off == off && next[i].match_val == val && next[i].match_mask == mask) { found = true; break; } } Here ->keys[] is struct tc_u32_key and there mask and val are definitely __be32. match_val and match_mask are never changed after initialization and they are set to: * .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00 meant to check for IPV4.Protocol == TCP, i.e. octet at offset 9 being 6 * .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00 meant to check for IPV4.Protocol == UDP, i.e. octet at offset 9 being 0x11 * .match_off = 4, .match_val = 0x6, .match_mask = 0xFF IPV6.NextHeader == TCP, i.e. octet at offset 6 being 6 * .match_off = 4, .match_val = 0x11, .match_mask = 0xFF IPV6.NextHeader == UDP, i.e. octet at offset 6 being 0x11 On little-endian host those do yield the right values - e.g. 0x1100 is {0, 17, 0, 0}, etc. On big-endian, though, these will end up checking in IPv4 case the octet at offset 10 (i.e. upper 16 bits of checksum) and for IPv6 - the octet at offset 5 (i.e. the lower 8 bits of payload length). Unless I'm misreading that code, it needs the following to do the right thing both on l-e and b-e. Comments? Signed-off-by: Al Viro --- diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h index a4b99edcc339..ec226b1cebf4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h @@ -259,11 +259,11 @@ struct cxgb4_next_header { */ static const struct cxgb4_next_header cxgb4_ipv4_jumps[] = { { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, - .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00, - .jump = cxgb4_tcp_fields }, + .match_off = 8, .match_val = htonl(6 << 16), + .match_mask = htonl(0xff<<16), .jump = cxgb4_tcp_fields }, { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, - .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00, - .jump = cxgb4_udp_fields }, + .match_off = 8, .match_val = htonl(17 << 16), + .match_mask = htonl(0xff<<16), .jump = cxgb4_udp_fields }, { .jump = NULL } }; @@ -272,11 +272,11 @@ static const struct cxgb4_next_header cxgb4_ipv4_jumps[] = { */ static const struct cxgb4_next_header cxgb4_ipv6_jumps[] = { { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, - .match_off = 4, .match_val = 0x6, .match_mask = 0xFF, - .jump = cxgb4_tcp_fields }, + .match_off = 4, .match_val = htonl(6 << 8), + .match_mask = htonl(0xff << 8), .jump = cxgb4_tcp_fields }, { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, - .match_off = 4, .match_val = 0x11, .match_mask = 0xFF, - .jump = cxgb4_udp_fields }, + .match_off = 4, .match_val = htonl(17 << 8), + .match_mask = htonl(0xff << 8), .jump = cxgb4_udp_fields }, { .jump = NULL } };
Re: consequences of setting net_device_ops ndo_change_carrier()?
> > You should have a PHY device of some sort. Either a traditional > > copper PHY, or an SFP module. There should be a driver for this PHY. > > This could be one of those in drivers/net/phy. Or it could be > > firmware running, running on a little microcontroller inside your > > FPGA? > > in my case, it's properly in drivers/net/phy, so at least that part > is normal. back to investigating ... Hi Robert O.K, that makes thing simpler. PHYs are controlled via an MDIO bus. Do you have an MDIO bus driver? You said this was an FPGA design. MDIO might be a standard cell you can just drop in. If so, look around and see if there is an existing driver for it. Otherwise you might have to write one. They are quite simple, some examples are in driver/net/phy. Depending on the address range, the MDIO bus driver can also be embedded in the MAC driver. Andrew
[PATCH net] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit
According to RFC791, 68 bytes is the minimum size of IPv4 datagram every device must be able to forward without further fragmentation while 576 bytes is the minimum size of IPv4 datagram every device has to be able to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right value for the ipv4 min mtu check in ip6_tnl_xmit. While at it, change to use max() instead of if statement. Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit") Reported-by: Sabrina Dubroca Signed-off-by: Xin Long --- net/ipv6/ip6_tunnel.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 00e138a..1cc9650 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1133,12 +1133,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, max_headroom += 8; mtu -= 8; } - if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - } else if (mtu < 576) { - mtu = 576; - } + mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? + IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { -- 2.1.0
Re: KCM - recvmsg() mangles packets?
Dominique Martinet wrote on Sun, Aug 05, 2018: > (I'm not sure about offset, since we pass the full skb to parse message, > wouldn't it look at the start of the buffer everytime? Well, offset > seems to be 0 everytime the first time that check fails so I can > probably ignore that for now...) Oh, this might actually not have been such a bad remark; if I have the client write two "messages" in a single write() kcm seems to reliably fail the same way... Conversely, if I setsockopt(s, IPPROTO_TCP, TCP_NODELAY...) on the sender socket, *and* make it wait till the kcm socket has been created to start sending data, then it dramatically reduces the probability of this happening (I had to let the reproducer run in a loop for 5 minutes, wheras it used to happen within seconds). So I think the problem is packet aggregation, and strparser not handling that properly... The first packet still fails with TCP_NODELAY but there's probably aggregation on the recv side as well before the socket is attached to the multiplexor... I guess the low probability failure that is still happening could be similar. (I also noticed that I've mistakedly believed that the problem was the first packet contained the 2nd packet's data because of an off-by-one in the receiver, it really is the second packet, it only has the first packet's length) I've moved my check from kcm_rcv_strparser to just before parse_msg in __strp_recv and surely enough it fails everytime there's an offset. It's getting late but I'll try adding a pskb_pull in there tomorrow, it would be better to make the bpf program start with an offset but I don't think that'll be easy to change... I'm almost done spamming, thanks for being a good rubber duck! :p -- Dominique Martinet
Re: Linux kernel error stack
Thanks Florian, FYI, I don't have any CHECKSUM configure in my iptables, i have following rules, also do you think this kernel WARNNING is just warning and not impacting my performance, based on that i have to decided criticality of this issue. [root@ostack-infra-02 ~]# iptables -L -n Chain INPUT (policy ACCEPT) target prot opt source destination ACCEPT tcp -- 0.0.0.0/00.0.0.0/0tcp dpt:53 ACCEPT udp -- 0.0.0.0/00.0.0.0/0udp dpt:53 ACCEPT tcp -- 0.0.0.0/00.0.0.0/0tcp dpt:67 ACCEPT udp -- 0.0.0.0/00.0.0.0/0udp dpt:67 Chain FORWARD (policy ACCEPT) target prot opt source destination ACCEPT all -- 0.0.0.0/00.0.0.0/0 ACCEPT all -- 0.0.0.0/00.0.0.0/0 On Sun, Aug 5, 2018 at 5:48 AM, Florian Westphal wrote: > Satish Patel wrote: >> I am installing openstack and as you know i have lots of bridges and >> vlan interface on my Linux CentOS 7.5 >> >> I was getting following error stack on 3.10 kernel and found this is >> kernel bug which required kernel upgrade so now i have upgraded my >> kernel to 4.17.12 but i am still seeing same kernel stack error on my >> dmesg >> >> I have disable TSO, LRO, SG & GSO on my NIC but still getting error >> just wanted to understand what is this and why it popping up > > Get rid of CHECKSUM target in the iptables rules. > This thing was added 8 years ago to work around dhcp bugs, I don't > think its use is needed anymore. > Try removing it and see that all VMs can still retrieve IP address > via DHCP. > > I'm curious as to the rules, normally CHECKSUM target should be > limited to -p udp --dport bootp; its bad idea to feed it normal > packets, its expensive to do this in software rather than have device > do the checksumming. > > As for fix, I'm tempted to send patch to make checksum target > eval a no-op & add deprecation warning on init... > > Other "fix" is to > > diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c > index 9f4151ec3e06..23a17dda604d 100644 > --- a/net/netfilter/xt_CHECKSUM.c > +++ b/net/netfilter/xt_CHECKSUM.c > @@ -25,8 +25,12 @@ MODULE_ALIAS("ip6t_CHECKSUM"); > static unsigned int > checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) > { > - if (skb->ip_summed == CHECKSUM_PARTIAL) > - skb_checksum_help(skb); > + if (skb->ip_summed == CHECKSUM_PARTIAL) { > + if (skb_shinfo(skb)->gso_size) > + skb->ip_summed = CHECKSUM_NONE; > + else > + skb_checksum_help(skb); > + } > > return XT_CONTINUE; > } > > (unfortunately, the target isn't restricted to POSTROUTING, sigh).
Re: [PATCH net] l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()
On Fri, Aug 03, 2018 at 12:42:22PM -0700, David Miller wrote: > From: Guillaume Nault > Date: Fri, 3 Aug 2018 17:00:11 +0200 > > > If 'session' is not NULL and is not a PPP pseudo-wire, then we fail to > > drop the reference taken by l2tp_session_get(). > > > > Fixes: ecd012e45ab5 ("l2tp: filter out non-PPP sessions in > > pppol2tp_tunnel_ioctl()") > > Signed-off-by: Guillaume Nault > > --- > > Sorry for the stupid mistake. I guess I got blinded by the apparent > > simplicity of the bug when I wrote the original patch. > > Applied, thanks. > > I'm pretty sure I backported the commit this fixes, so I'm queueing > this up for -stable as well. > Well, I think it wasn't. I didn't receive any notification from the stable team about it and I don't see it in Greg's stable queue nor in any -stable tree. Also, we'd have to queue 90904ff5f958 ("l2tp: fix pseudo-wire type for sessions created by pppol2tp_connect()") first, which is necessary for properly identifying PPP sessions. To recapitulate, three patches are needed to fix the original bug: * 90904ff5f958 ("l2tp: fix pseudo-wire type for sessions created by pppol2tp_connect()"): allows later patches to check if a session is PPP. * ecd012e45ab5 ("l2tp: filter out non-PPP sessions in pppol2tp_tunnel_ioctl()"): refuses calling pppol2tp_session_ioctl() on non-PPP sessions. This fixes an invalid pointer dereference when the session is Ethernet. Unfortunately it fails to drop the reference it takes on the session. * f664e37dcc52 ("l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()"): fixes the memory leak introduced by the previous patch.
Re: consequences of setting net_device_ops ndo_change_carrier()?
On Sun, 5 Aug 2018, Andrew Lunn wrote: > On Sat, Aug 04, 2018 at 07:06:58AM -0400, Robert P. J. Day wrote: > > > > i'll try to keep this (relatively) short as there may be a > > simple answer to this, or it could just be a stupid question -- > > sort of related to previous question (thank you, florian). > > > > currently messing with networking device involving FPGA and some > > quad-port transceivers, and noticed that, when one unplugs or > > plugs a device into one of the ports, there is no change in the > > contents of the corresponding sysfs files > > /sys/class/net//carrier (or operstate, for that matter, > > which might be related to this as well). > > Hi Robert > > As other have pointed out, ndo_change_carrier is not what you want > here. i think i see that now ... based on the really adamant comment in netdevice.h: "Devices that determine carrier state from physical hardware properties (eg network cables) or protocol-dependent mechanisms (eg USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function." the impression i got was that implementing that routine for a physical device would actually cause problems, but it seems only that it would be a strange thing to do, but wouldn't necessarily cause problems if you chose not to take advantage of it. which brings me back to one of my original questions -- why *would* someone implement it? as some sort of debugging feature? in any event, i'm convinced that that's not where the problem lies. > You should have a PHY device of some sort. Either a traditional > copper PHY, or an SFP module. There should be a driver for this PHY. > This could be one of those in drivers/net/phy. Or it could be > firmware running, running on a little microcontroller inside your > FPGA? in my case, it's properly in drivers/net/phy, so at least that part is normal. back to investigating ... rday -- Robert P. J. Day Ottawa, Ontario, CANADA http://crashcourse.ca/dokuwiki Twitter: http://twitter.com/rpjday LinkedIn: http://ca.linkedin.com/in/rpjday
Re: Linux kernel error stack
Satish Patel wrote: > I am installing openstack and as you know i have lots of bridges and > vlan interface on my Linux CentOS 7.5 > > I was getting following error stack on 3.10 kernel and found this is > kernel bug which required kernel upgrade so now i have upgraded my > kernel to 4.17.12 but i am still seeing same kernel stack error on my > dmesg > > I have disable TSO, LRO, SG & GSO on my NIC but still getting error > just wanted to understand what is this and why it popping up Get rid of CHECKSUM target in the iptables rules. This thing was added 8 years ago to work around dhcp bugs, I don't think its use is needed anymore. Try removing it and see that all VMs can still retrieve IP address via DHCP. I'm curious as to the rules, normally CHECKSUM target should be limited to -p udp --dport bootp; its bad idea to feed it normal packets, its expensive to do this in software rather than have device do the checksumming. As for fix, I'm tempted to send patch to make checksum target eval a no-op & add deprecation warning on init... Other "fix" is to diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index 9f4151ec3e06..23a17dda604d 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -25,8 +25,12 @@ MODULE_ALIAS("ip6t_CHECKSUM"); static unsigned int checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) { - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb_checksum_help(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_shinfo(skb)->gso_size) + skb->ip_summed = CHECKSUM_NONE; + else + skb_checksum_help(skb); + } return XT_CONTINUE; } (unfortunately, the target isn't restricted to POSTROUTING, sigh).