date:20180805

[PATCH net-next 05/14] net: sched: act_ipt: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect ipt action private data from concurrent
modification during dump. Ipt init already takes tcf spinlock when
modifying ipt state.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_ipt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 0dc787a57798..e149f0e66cb6 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -288,6 +288,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct 
tc_action *a, int bind,
 * for foolproof you need to not assume this
 */
 
+   spin_lock_bh(&ipt->tcf_lock);
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
goto nla_put_failure;
@@ -307,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct 
tc_action *a, int bind,
if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
goto nla_put_failure;
 
+   spin_unlock_bh(&ipt->tcf_lock);
kfree(t);
return skb->len;
 
 nla_put_failure:
+   spin_unlock_bh(&ipt->tcf_lock);
nlmsg_trim(skb, b);
kfree(t);
return -1;
-- 
2.7.5

[PATCH net-next 06/14] net: sched: act_pedit: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Rearrange pedit init code to only access pedit action data while holding
tcf spinlock. Change keys allocation type to atomic to allow it to execute
while holding tcf spinlock. Take tcf spinlock in dump function when
accessing pedit action data.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_pedit.c | 40 
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 43ba999b2d23..3f62da72ab6a 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -187,44 +187,38 @@ static int tcf_pedit_init(struct net *net, struct nlattr 
*nla,
tcf_idr_cleanup(tn, parm->index);
goto out_free;
}
-   p = to_pedit(*a);
-   keys = kmalloc(ksize, GFP_KERNEL);
-   if (!keys) {
-   tcf_idr_release(*a, bind);
-   ret = -ENOMEM;
-   goto out_free;
-   }
ret = ACT_P_CREATED;
} else if (err > 0) {
if (bind)
goto out_free;
if (!ovr) {
-   tcf_idr_release(*a, bind);
ret = -EEXIST;
-   goto out_free;
-   }
-   p = to_pedit(*a);
-   if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
-   keys = kmalloc(ksize, GFP_KERNEL);
-   if (!keys) {
-   ret = -ENOMEM;
-   goto out_free;
-   }
+   goto out_release;
}
} else {
return err;
}
 
+   p = to_pedit(*a);
spin_lock_bh(&p->tcf_lock);
-   p->tcfp_flags = parm->flags;
-   p->tcf_action = parm->action;
-   if (keys) {
+
+   if (ret == ACT_P_CREATED ||
+   (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
+   keys = kmalloc(ksize, GFP_ATOMIC);
+   if (!keys) {
+   spin_unlock_bh(&p->tcf_lock);
+   ret = -ENOMEM;
+   goto out_release;
+   }
kfree(p->tcfp_keys);
p->tcfp_keys = keys;
p->tcfp_nkeys = parm->nkeys;
}
memcpy(p->tcfp_keys, parm->keys, ksize);
 
+   p->tcfp_flags = parm->flags;
+   p->tcf_action = parm->action;
+
kfree(p->tcfp_keys_ex);
p->tcfp_keys_ex = keys_ex;
 
@@ -232,6 +226,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr 
*nla,
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+
+out_release:
+   tcf_idr_release(*a, bind);
 out_free:
kfree(keys_ex);
return ret;
@@ -410,6 +407,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct 
tc_action *a,
if (unlikely(!opt))
return -ENOBUFS;
 
+   spin_lock_bh(&p->tcf_lock);
memcpy(opt->keys, p->tcfp_keys,
   p->tcfp_nkeys * sizeof(struct tc_pedit_key));
opt->index = p->tcf_index;
@@ -432,11 +430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct 
tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
+   spin_unlock_bh(&p->tcf_lock);
 
kfree(opt);
return skb->len;
 
 nla_put_failure:
+   spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
kfree(opt);
return -1;
-- 
2.7.5

[PATCH net-next 14/14] net: sched: act_police: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect police action private data from concurrent
modification during dump. (init already uses tcf spinlock when changing
police action state)

Pass tcf spinlock as estimator lock argument to gen_replace_estimator()
during action init.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_police.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 3eb5fe60c62c..4777da7caadf 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -138,7 +138,8 @@ static int tcf_act_police_init(struct net *net, struct 
nlattr *nla,
 
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
-   &police->tcf_rate_est, NULL,
+   &police->tcf_rate_est,
+   &police->tcf_lock,
&police->tcf_lock,
NULL, est);
if (err)
@@ -274,14 +275,15 @@ static int tcf_act_police_dump(struct sk_buff *skb, 
struct tc_action *a,
struct tcf_police *police = to_police(a);
struct tc_police opt = {
.index = police->tcf_index,
-   .action = police->tcf_action,
-   .mtu = police->tcfp_mtu,
-   .burst = PSCHED_NS2TICKS(police->tcfp_burst),
.refcnt = refcount_read(&police->tcf_refcnt) - ref,
.bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
};
struct tcf_t t;
 
+   spin_lock_bh(&police->tcf_lock);
+   opt.action = police->tcf_action;
+   opt.mtu = police->tcfp_mtu;
+   opt.burst = PSCHED_NS2TICKS(police->tcfp_burst);
if (police->rate_present)
psched_ratecfg_getrate(&opt.rate, &police->rate);
if (police->peak_present)
@@ -301,10 +303,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, 
struct tc_action *a,
t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
+   spin_unlock_bh(&police->tcf_lock);
 
return skb->len;
 
 nla_put_failure:
+   spin_unlock_bh(&police->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 12/14] net: sched: act_mirred: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Re-introduce mirred list spinlock, that was removed some time ago, in order
to protect it from concurrent modifications, instead of relying on rtnl
lock.

Use tcf spinlock to protect mirred action private data from concurrent
modification in init and dump. Rearrange access to mirred data in order to
be performed only while holding the lock.

Rearrange net dev access to always hold reference while working with it,
instead of relying on rntl lock. Change get dev function to increment net
device reference before returning it to caller, instead of assuming that
caller is protected with rtnl lock.

Provide rcu version of mirred dev and tunnel info access functions. (to be
used by unlocked drivers)

Signed-off-by: Vlad Buslov 
---
 include/net/tc_act/tc_mirred.h |  5 +++
 include/net/tc_act/tc_tunnel_key.h | 33 ---
 net/sched/act_mirred.c | 82 +-
 net/sched/cls_api.c|  1 +
 4 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index a2e9cbca5c9e..cb30be55e444 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -37,4 +37,9 @@ static inline struct net_device *tcf_mirred_dev(const struct 
tc_action *a)
return rtnl_dereference(to_mirred(a)->tcfm_dev);
 }
 
+static inline struct net_device *tcf_mirred_dev_rcu(const struct tc_action *a)
+{
+   return rcu_dereference(to_mirred(a)->tcfm_dev);
+}
+
 #endif /* __NET_TC_MIR_H */
diff --git a/include/net/tc_act/tc_tunnel_key.h 
b/include/net/tc_act/tc_tunnel_key.h
index 46b8c7f1c8d5..e6e475d788c6 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -30,26 +30,47 @@ struct tcf_tunnel_key {
 
 static inline bool is_tcf_tunnel_set(const struct tc_action *a)
 {
+   bool ret = false;
 #ifdef CONFIG_NET_CLS_ACT
struct tcf_tunnel_key *t = to_tunnel_key(a);
-   struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+   struct tcf_tunnel_key_params *params;
 
+   rcu_read_lock();
+   params = rcu_dereference(t->params);
if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
-   return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET;
+   ret = params->tcft_action == TCA_TUNNEL_KEY_ACT_SET;
+   rcu_read_unlock();
 #endif
-   return false;
+   return ret;
 }
 
 static inline bool is_tcf_tunnel_release(const struct tc_action *a)
 {
+   bool ret = false;
 #ifdef CONFIG_NET_CLS_ACT
struct tcf_tunnel_key *t = to_tunnel_key(a);
-   struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+   struct tcf_tunnel_key_params *params;
 
+   rcu_read_lock();
+   params = rcu_dereference(t->params);
if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
-   return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE;
+   ret = params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE;
+   rcu_read_unlock();
+#endif
+   return ret;
+}
+
+static inline
+struct ip_tunnel_info *tcf_tunnel_info_rcu(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+   struct tcf_tunnel_key *t = to_tunnel_key(a);
+   struct tcf_tunnel_key_params *params = rcu_dereference(t->params);
+
+   return ¶ms->tcft_enc_metadata->u.tun_info;
+#else
+   return NULL;
 #endif
-   return false;
 }
 
 static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index b26d060da08e..9f622114f5a5 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,6 +30,7 @@
 #include 
 
 static LIST_HEAD(mirred_list);
+static DEFINE_SPINLOCK(mirred_list_lock);
 
 static bool tcf_mirred_is_act_redirect(int action)
 {
@@ -62,13 +63,23 @@ static bool tcf_mirred_can_reinsert(int action)
return false;
 }
 
+static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m)
+{
+   return rcu_dereference_protected(m->tcfm_dev,
+lockdep_is_held(&m->tcf_lock));
+}
+
 static void tcf_mirred_release(struct tc_action *a)
 {
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
 
+   spin_lock(&mirred_list_lock);
list_del(&m->tcfm_list);
-   dev = rtnl_dereference(m->tcfm_dev);
+   spin_unlock(&mirred_list_lock);
+
+   /* last reference to action, no need to lock */
+   dev = rcu_dereference_protected(m->tcfm_dev, 1);
if (dev)
dev_put(dev);
 }
@@ -128,22 +139,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr 
*nla,
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
-   if (parm->ifindex) {
-   dev = __dev_get_by_index(net, parm->ifindex);
-   if (dev == NULL) {
-   if (exists)
-   tcf_idr_rele

[PATCH net-next 10/14] net: sched: act_tunnel_key: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf lock to protect tunnel key action struct private data from
concurrent modification in init and dump. Use rcu swap operation to
reassign params pointer under protection of tcf lock. (old params value is
not used by init, so there is no need of standalone rcu dereference step)

Remove rtnl lock assertion that is no longer required.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_tunnel_key.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index d42d9e112789..ba2ae9f75ef5 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -204,7 +204,6 @@ static int tunnel_key_init(struct net *net, struct nlattr 
*nla,
 {
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
-   struct tcf_tunnel_key_params *params_old;
struct tcf_tunnel_key_params *params_new;
struct metadata_dst *metadata = NULL;
struct tc_tunnel_key *parm;
@@ -346,24 +345,22 @@ static int tunnel_key_init(struct net *net, struct nlattr 
*nla,
 
t = to_tunnel_key(*a);
 
-   ASSERT_RTNL();
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
tcf_idr_release(*a, bind);
NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
return -ENOMEM;
}
-
-   params_old = rtnl_dereference(t->params);
-
-   t->tcf_action = parm->action;
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
 
-   rcu_assign_pointer(t->params, params_new);
-
-   if (params_old)
-   kfree_rcu(params_old, rcu);
+   spin_lock(&t->tcf_lock);
+   t->tcf_action = parm->action;
+   rcu_swap_protected(t->params, params_new,
+  lockdep_is_held(&t->tcf_lock));
+   spin_unlock(&t->tcf_lock);
+   if (params_new)
+   kfree_rcu(params_new, rcu);
 
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -485,12 +482,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct 
tc_action *a,
.index= t->tcf_index,
.refcnt   = refcount_read(&t->tcf_refcnt) - ref,
.bindcnt  = atomic_read(&t->tcf_bindcnt) - bind,
-   .action   = t->tcf_action,
};
struct tcf_t tm;
 
-   params = rtnl_dereference(t->params);
-
+   spin_lock(&t->tcf_lock);
+   params = rcu_dereference_protected(t->params,
+  lockdep_is_held(&t->tcf_lock));
+   opt.action   = t->tcf_action;
opt.t_action = params->tcft_action;
 
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
@@ -522,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct 
tc_action *a,
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
  &tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
+   spin_unlock(&t->tcf_lock);
 
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&t->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 09/14] net: sched: act_skbmod: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Move read of skbmod_p rcu pointer to be protected by tcf spinlock. Use tcf
spinlock to protect private skbmod data from concurrent modification during
dump.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_skbmod.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index c437c6d51a71..e9c86ade3b40 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -156,7 +156,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr 
*nla,
 
d = to_skbmod(*a);
 
-   ASSERT_RTNL();
p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
if (unlikely(!p)) {
tcf_idr_release(*a, bind);
@@ -166,10 +165,10 @@ static int tcf_skbmod_init(struct net *net, struct nlattr 
*nla,
p->flags = lflags;
d->tcf_action = parm->action;
 
-   p_old = rtnl_dereference(d->skbmod_p);
-
if (ovr)
spin_lock_bh(&d->tcf_lock);
+   /* Protected by tcf_lock if overwriting existing action. */
+   p_old = rcu_dereference_protected(d->skbmod_p, 1);
 
if (lflags & SKBMOD_F_DMAC)
ether_addr_copy(p->eth_dst, daddr);
@@ -205,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct 
tc_action *a,
 {
struct tcf_skbmod *d = to_skbmod(a);
unsigned char *b = skb_tail_pointer(skb);
-   struct tcf_skbmod_params  *p = rtnl_dereference(d->skbmod_p);
+   struct tcf_skbmod_params  *p;
struct tc_skbmod opt = {
.index   = d->tcf_index,
.refcnt  = refcount_read(&d->tcf_refcnt) - ref,
.bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
-   .action  = d->tcf_action,
};
struct tcf_t t;
 
+   spin_lock_bh(&d->tcf_lock);
+   opt.action = d->tcf_action;
+   p = rcu_dereference_protected(d->skbmod_p,
+ lockdep_is_held(&d->tcf_lock));
opt.flags  = p->flags;
if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -231,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct 
tc_action *a,
if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
goto nla_put_failure;
 
+   spin_unlock_bh(&d->tcf_lock);
return skb->len;
 nla_put_failure:
+   spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 00/14] Remove rtnl lock dependency from all action implementations

2018-08-05 Thread Vlad Buslov

Currently, all netlink protocol handlers for updating rules, actions and
qdiscs are protected with single global rtnl lock which removes any
possibility for parallelism. This patch set is a second step to remove
rtnl lock dependency from TC rules update path.

Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added.
Handlers registered with this flag are called without RTNL taken. End
goal is to have rule update handlers(RTM_NEWTFILTER, RTM_DELTFILTER,
etc.) to be registered with UNLOCKED flag to allow parallel execution.
However, there is no intention to completely remove or split rtnl lock
itself. This patch set addresses specific problems in implementation of
tc actions that prevent their control path from being executed
concurrently. Additional changes are required to refactor classifiers
API and individual classifiers for parallel execution. This patch set
lays groundwork to eventually register rule update handlers as
rtnl-unlocked.

Action API is already prepared for parallel execution with previous
patch set, which means that action ops that use action API for their
implementation do not require additional modifications. (delete, search,
etc.) Action API implements concurrency-safe reference counting and
guarantees that cleanup/delete is called only once, after last reference
to action is released.

The goal of this change is to update specific actions APIs that access
action private state directly, in order to be independent from external
locking. General approach is to re-use existing tcf_lock spinlock (used
by some action implementation to synchronize control path with data
path) to protect action private state from concurrent modification. If
action has rcu-protected pointer, tcf spinlock is used to protect its
update code, instead of relying on rtnl lock.

Some actions need to determine rtnl mutex status in order to release it.
For example, ife action can load additional kernel modules(meta ops) and
must make sure that no locks are held during module load. In such cases
'rtnl_held' argument is used to conditionally release rtnl mutex.

Vlad Buslov (14):
  net: sched: act_bpf: remove dependency on rtnl lock
  net: sched: act_csum: remove dependency on rtnl lock
  net: sched: act_gact: remove dependency on rtnl lock
  net: sched: act_ife: remove dependency on rtnl lock
  net: sched: act_ipt: remove dependency on rtnl lock
  net: sched: act_pedit: remove dependency on rtnl lock
  net: sched: act_sample: remove dependency on rtnl lock
  net: sched: act_simple: remove dependency on rtnl lock
  net: sched: act_skbmod: remove dependency on rtnl lock
  net: sched: act_tunnel_key: remove dependency on rtnl lock
  net: sched: act_vlan: remove dependency on rtnl lock
  net: sched: act_mirred: remove dependency on rtnl lock
  net: core: add new/replace rate estimator lock parameter
  net: sched: act_police: remove dependency on rtnl lock

 include/net/gen_stats.h|  2 +
 include/net/tc_act/tc_mirred.h |  5 +++
 include/net/tc_act/tc_tunnel_key.h | 33 ---
 net/core/gen_estimator.c   | 58 ---
 net/netfilter/xt_RATEEST.c |  2 +-
 net/sched/act_api.c|  2 +-
 net/sched/act_bpf.c| 10 +++--
 net/sched/act_csum.c   | 24 ++-
 net/sched/act_gact.c   | 10 -
 net/sched/act_ife.c| 40 ---
 net/sched/act_ipt.c|  3 ++
 net/sched/act_mirred.c | 82 +-
 net/sched/act_pedit.c  | 40 +--
 net/sched/act_police.c | 10 +++--
 net/sched/act_sample.c | 12 +-
 net/sched/act_simple.c |  6 ++-
 net/sched/act_skbmod.c | 14 ---
 net/sched/act_tunnel_key.c | 26 ++--
 net/sched/act_vlan.c   | 27 +++--
 net/sched/cls_api.c|  1 +
 net/sched/sch_api.c|  2 +
 net/sched/sch_cbq.c|  4 +-
 net/sched/sch_drr.c|  4 +-
 net/sched/sch_hfsc.c   |  4 +-
 net/sched/sch_htb.c|  4 +-
 net/sched/sch_qfq.c|  4 +-
 26 files changed, 285 insertions(+), 144 deletions(-)

-- 
2.7.5

[PATCH net-next 11/14] net: sched: act_vlan: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect vlan action private data from concurrent
modification during dump and init. Use rcu swap operation to reassign
params pointer under protection of tcf lock. (old params value is not used
by init, so there is no need of standalone rcu dereference step)

Remove rtnl assertion that is no longer necessary.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_vlan.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 15a0ee214c9c..5bde17fe3608 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -109,7 +109,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr 
*nla,
 {
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
-   struct tcf_vlan_params *p, *p_old;
+   struct tcf_vlan_params *p;
struct tc_vlan *parm;
struct tcf_vlan *v;
int action;
@@ -202,26 +202,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr 
*nla,
 
v = to_vlan(*a);
 
-   ASSERT_RTNL();
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
tcf_idr_release(*a, bind);
return -ENOMEM;
}
 
-   v->tcf_action = parm->action;
-
-   p_old = rtnl_dereference(v->vlan_p);
-
p->tcfv_action = action;
p->tcfv_push_vid = push_vid;
p->tcfv_push_prio = push_prio;
p->tcfv_push_proto = push_proto;
 
-   rcu_assign_pointer(v->vlan_p, p);
+   spin_lock(&v->tcf_lock);
+   v->tcf_action = parm->action;
+   rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+   spin_unlock(&v->tcf_lock);
 
-   if (p_old)
-   kfree_rcu(p_old, rcu);
+   if (p)
+   kfree_rcu(p, rcu);
 
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -243,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct 
tc_action *a,
 {
unsigned char *b = skb_tail_pointer(skb);
struct tcf_vlan *v = to_vlan(a);
-   struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p);
+   struct tcf_vlan_params *p;
struct tc_vlan opt = {
.index= v->tcf_index,
.refcnt   = refcount_read(&v->tcf_refcnt) - ref,
.bindcnt  = atomic_read(&v->tcf_bindcnt) - bind,
-   .action   = v->tcf_action,
-   .v_action = p->tcfv_action,
};
struct tcf_t t;
 
+   spin_lock(&v->tcf_lock);
+   opt.action = v->tcf_action;
+   p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+   opt.v_action = p->tcfv_action;
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
 
@@ -268,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct 
tc_action *a,
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
+   spin_unlock(&v->tcf_lock);
+
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&v->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 03/14] net: sched: act_gact: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect gact action private state from concurrent
modification during dump and init. Remove rtnl assertion that is no longer
necessary.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_gact.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 661b72b9147d..bfccd34a3968 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr 
*nla,
 
gact = to_gact(*a);
 
-   ASSERT_RTNL();
+   spin_lock(&gact->tcf_lock);
gact->tcf_action = parm->action;
 #ifdef CONFIG_GACT_PROB
if (p_parm) {
@@ -126,6 +126,8 @@ static int tcf_gact_init(struct net *net, struct nlattr 
*nla,
gact->tcfg_ptype   = p_parm->ptype;
}
 #endif
+   spin_unlock(&gact->tcf_lock);
+
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
@@ -178,10 +180,11 @@ static int tcf_gact_dump(struct sk_buff *skb, struct 
tc_action *a,
.index   = gact->tcf_index,
.refcnt  = refcount_read(&gact->tcf_refcnt) - ref,
.bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
-   .action  = gact->tcf_action,
};
struct tcf_t t;
 
+   spin_lock(&gact->tcf_lock);
+   opt.action = gact->tcf_action;
if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
 #ifdef CONFIG_GACT_PROB
@@ -199,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct 
tc_action *a,
tcf_tm_dump(&t, &gact->tcf_tm);
if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
goto nla_put_failure;
+   spin_unlock(&gact->tcf_lock);
+
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&gact->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 08/14] net: sched: act_simple: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect private simple action data from concurrent
modification during dump. (simple init already uses tcf spinlock when
changing action state)

Signed-off-by: Vlad Buslov 
---
 net/sched/act_simple.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index aa51152e0066..18e4452574cd 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -156,10 +156,11 @@ static int tcf_simp_dump(struct sk_buff *skb, struct 
tc_action *a,
.index   = d->tcf_index,
.refcnt  = refcount_read(&d->tcf_refcnt) - ref,
.bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
-   .action  = d->tcf_action,
};
struct tcf_t t;
 
+   spin_lock_bh(&d->tcf_lock);
+   opt.action = d->tcf_action;
if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
goto nla_put_failure;
@@ -167,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct 
tc_action *a,
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
goto nla_put_failure;
+   spin_unlock_bh(&d->tcf_lock);
+
return skb->len;
 
 nla_put_failure:
+   spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 04/14] net: sched: act_ife: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock and rcu to protect params pointer from concurrent
modification during dump and init. Use rcu swap operation to reassign
params pointer under protection of tcf lock. (old params value is not used
by init, so there is no need of standalone rcu dereference step)

Ife action has meta-actions that are compiled as standalone modules. Rtnl
mutex must be released while loading a kernel module. In order to support
execution without rtnl mutex, propagate 'rtnl_held' argument to meta action
loading functions. When requesting meta action module, conditionally
release rtnl lock depending on 'rtnl_held' argument.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_ife.c | 40 +---
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index df4060e32d43..5d200495e467 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -268,7 +268,8 @@ static const char *ife_meta_id2name(u32 metaid)
  * under ife->tcf_lock for existing action
 */
 static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
-   void *val, int len, bool exists)
+   void *val, int len, bool exists,
+   bool rtnl_held)
 {
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
@@ -278,9 +279,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, 
u32 metaid,
 #ifdef CONFIG_MODULES
if (exists)
spin_unlock_bh(&ife->tcf_lock);
-   rtnl_unlock();
+   if (rtnl_held)
+   rtnl_unlock();
request_module("ife-meta-%s", ife_meta_id2name(metaid));
-   rtnl_lock();
+   if (rtnl_held)
+   rtnl_lock();
if (exists)
spin_lock_bh(&ife->tcf_lock);
ops = find_ife_oplist(metaid);
@@ -421,7 +424,7 @@ static void tcf_ife_cleanup(struct tc_action *a)
 
 /* under ife->tcf_lock for existing action */
 static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
-bool exists)
+bool exists, bool rtnl_held)
 {
int len = 0;
int rc = 0;
@@ -433,7 +436,8 @@ static int populate_metalist(struct tcf_ife_info *ife, 
struct nlattr **tb,
val = nla_data(tb[i]);
len = nla_len(tb[i]);
 
-   rc = load_metaops_and_vet(ife, i, val, len, exists);
+   rc = load_metaops_and_vet(ife, i, val, len, exists,
+ rtnl_held);
if (rc != 0)
return rc;
 
@@ -454,7 +458,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
struct nlattr *tb2[IFE_META_MAX + 1];
-   struct tcf_ife_params *p, *p_old;
+   struct tcf_ife_params *p;
struct tcf_ife_info *ife;
u16 ife_type = ETH_P_IFE;
struct tc_ife *parm;
@@ -558,7 +562,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
return err;
}
 
-   err = populate_metalist(ife, tb2, exists);
+   err = populate_metalist(ife, tb2, exists, rtnl_held);
if (err)
goto metadata_parse_err;
 
@@ -581,13 +585,13 @@ static int tcf_ife_init(struct net *net, struct nlattr 
*nla,
}
 
ife->tcf_action = parm->action;
+   /* protected by tcf_lock when modifying existing action */
+   rcu_swap_protected(ife->params, p, 1);
+
if (exists)
spin_unlock_bh(&ife->tcf_lock);
-
-   p_old = rtnl_dereference(ife->params);
-   rcu_assign_pointer(ife->params, p);
-   if (p_old)
-   kfree_rcu(p_old, rcu);
+   if (p)
+   kfree_rcu(p, rcu);
 
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -600,16 +604,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct 
tc_action *a, int bind,
 {
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ife_info *ife = to_ife(a);
-   struct tcf_ife_params *p = rtnl_dereference(ife->params);
+   struct tcf_ife_params *p;
struct tc_ife opt = {
.index = ife->tcf_index,
.refcnt = refcount_read(&ife->tcf_refcnt) - ref,
.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
-   .action = ife->tcf_action,
-   .flags = p->flags,
};
struct tcf_t t;
 
+   spin_lock_bh(&ife->tcf_lock);
+   opt.action = ife->tcf_action;
+   p = rcu_dereference_protected(ife->params,
+ lockdep_is_held(&ife->tcf_lock));
+   opt.flags = p->flags;
+

[PATCH net-next 01/14] net: sched: act_bpf: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect bpf action private data from concurrent
modification during dump and init. Remove rtnl lock assertion that is no
longer necessary.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_bpf.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 6203eb075c9a..9e8a33f9fee3 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -143,11 +143,12 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct 
tc_action *act,
.index   = prog->tcf_index,
.refcnt  = refcount_read(&prog->tcf_refcnt) - ref,
.bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
-   .action  = prog->tcf_action,
};
struct tcf_t tm;
int ret;
 
+   spin_lock(&prog->tcf_lock);
+   opt.action = prog->tcf_action;
if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
 
@@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct 
tc_action *act,
  TCA_ACT_BPF_PAD))
goto nla_put_failure;
 
+   spin_unlock(&prog->tcf_lock);
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&prog->tcf_lock);
nlmsg_trim(skb, tp);
return -1;
 }
@@ -264,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf 
*prog,
 {
cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
/* updates to prog->filter are prevented, since it's called either
-* with rtnl lock or during final cleanup in rcu callback
+* with tcf lock or during final cleanup in rcu callback
 */
cfg->filter = rcu_dereference_protected(prog->filter, 1);
 
@@ -336,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
goto out;
 
prog = to_bpf(*act);
-   ASSERT_RTNL();
 
+   spin_lock(&prog->tcf_lock);
if (res != ACT_P_CREATED)
tcf_bpf_prog_fill_cfg(prog, &old);
 
@@ -349,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
+   spin_unlock(&prog->tcf_lock);
 
if (res == ACT_P_CREATED) {
tcf_idr_insert(tn, *act);
-- 
2.7.5

[PATCH net-next 13/14] net: core: add new/replace rate estimator lock parameter

2018-08-05 Thread Vlad Buslov

Extend rate estimator 'new' and 'replace' APIs with additional spinlock
parameter to be used by rtnl-unlocked actions to protect rate_est pointer
from concurrent modification.

Extract code that requires synchronization from gen_new_estimator into
__replace_estimator function in order to be used by both locked and
unlocked paths.

Signed-off-by: Vlad Buslov 
---
 include/net/gen_stats.h|  2 ++
 net/core/gen_estimator.c   | 58 +++---
 net/netfilter/xt_RATEEST.c |  2 +-
 net/sched/act_api.c|  2 +-
 net/sched/act_police.c |  2 +-
 net/sched/sch_api.c|  2 ++
 net/sched/sch_cbq.c|  4 ++--
 net/sched/sch_drr.c|  4 ++--
 net/sched/sch_hfsc.c   |  4 ++--
 net/sched/sch_htb.c|  4 ++--
 net/sched/sch_qfq.c|  4 ++--
 11 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 0304ba2ae353..d1ef63d16abc 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -59,12 +59,14 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
  struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *rate_est_lock,
  spinlock_t *stats_lock,
  seqcount_t *running, struct nlattr *opt);
 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
  struct net_rate_estimator __rcu **ptr,
+ spinlock_t *rate_est_lock,
  spinlock_t *stats_lock,
  seqcount_t *running, struct nlattr *opt);
 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 98fd12721221..012e37011147 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -107,11 +107,43 @@ static void est_timer(struct timer_list *t)
mod_timer(&est->timer, est->next_jiffies);
 }
 
+static void __replace_estimator(struct net_rate_estimator __rcu **rate_est,
+   struct net_rate_estimator *new)
+{
+   struct net_rate_estimator *old = rcu_dereference_protected(*rate_est,
+  1);
+
+   if (old) {
+   del_timer_sync(&old->timer);
+   new->avbps = old->avbps;
+   new->avpps = old->avpps;
+   }
+
+   new->next_jiffies = jiffies + ((HZ / 4) << new->intvl_log);
+   timer_setup(&new->timer, est_timer, 0);
+   mod_timer(&new->timer, new->next_jiffies);
+
+   rcu_assign_pointer(*rate_est, new);
+
+   if (old)
+   kfree_rcu(old, rcu);
+}
+
+static void replace_estimator(struct net_rate_estimator __rcu **rate_est,
+ struct net_rate_estimator *new,
+ spinlock_t *rate_est_lock)
+{
+   spin_lock_bh(rate_est_lock);
+   __replace_estimator(rate_est, new);
+   spin_unlock_bh(rate_est_lock);
+}
+
 /**
  * gen_new_estimator - create a new rate estimator
  * @bstats: basic statistics
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
+ * @rate_est_lock: rate_est lock (might be NULL)
  * @stats_lock: statistics lock
  * @running: qdisc running seqcount
  * @opt: rate estimator configuration TLV
@@ -128,12 +160,13 @@ static void est_timer(struct timer_list *t)
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
  struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *rate_est_lock,
  spinlock_t *stats_lock,
  seqcount_t *running,
  struct nlattr *opt)
 {
struct gnet_estimator *parm = nla_data(opt);
-   struct net_rate_estimator *old, *est;
+   struct net_rate_estimator *est;
struct gnet_stats_basic_packed b;
int intvl_log;
 
@@ -167,20 +200,15 @@ int gen_new_estimator(struct gnet_stats_basic_packed 
*bstats,
local_bh_enable();
est->last_bytes = b.bytes;
est->last_packets = b.packets;
-   old = rcu_dereference_protected(*rate_est, 1);
-   if (old) {
-   del_timer_sync(&old->timer);
-   est->avbps = old->avbps;
-   est->avpps = old->avpps;
-   }
 
-   est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
-   timer_setup(&est->timer, est_timer, 0);
-   mod_timer(&est->timer, est->next_jiffies);
+   if (rate_est_lock)
+   replace_estimator(rate_est, est, rate_est_lock);
+   else
+   /* If no spinlock argu

[PATCH net-next 07/14] net: sched: act_sample: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf spinlock to protect private sample action data from concurrent
modification during dump and init.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_sample.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 2608ccc83e5e..81071afe1b43 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -80,11 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr 
*nla,
}
s = to_sample(*a);
 
+   spin_lock(&s->tcf_lock);
s->tcf_action = parm->action;
s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
psample_group = psample_group_get(net, s->psample_group_num);
if (!psample_group) {
+   spin_unlock(&s->tcf_lock);
tcf_idr_release(*a, bind);
return -ENOMEM;
}
@@ -94,6 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr 
*nla,
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
+   spin_unlock(&s->tcf_lock);
 
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -105,7 +108,8 @@ static void tcf_sample_cleanup(struct tc_action *a)
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
 
-   psample_group = rtnl_dereference(s->psample_group);
+   /* last reference to action, no need to lock */
+   psample_group = rcu_dereference_protected(s->psample_group, 1);
RCU_INIT_POINTER(s->psample_group, NULL);
if (psample_group)
psample_group_put(psample_group);
@@ -174,12 +178,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct 
tc_action *a,
struct tcf_sample *s = to_sample(a);
struct tc_sample opt = {
.index  = s->tcf_index,
-   .action = s->tcf_action,
.refcnt = refcount_read(&s->tcf_refcnt) - ref,
.bindcnt= atomic_read(&s->tcf_bindcnt) - bind,
};
struct tcf_t t;
 
+   spin_lock(&s->tcf_lock);
+   opt.action = s->tcf_action;
if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
 
@@ -196,9 +201,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct 
tc_action *a,
 
if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
goto nla_put_failure;
+   spin_unlock(&s->tcf_lock);
+
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&s->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

[PATCH net-next 02/14] net: sched: act_csum: remove dependency on rtnl lock

2018-08-05 Thread Vlad Buslov

Use tcf lock to protect csum action struct private data from concurrent
modification in init and dump. Use rcu swap operation to reassign params
pointer under protection of tcf lock. (old params value is not used by
init, so there is no need of standalone rcu dereference step)

Remove rtnl assertion that is no longer necessary.

Signed-off-by: Vlad Buslov 
---
 net/sched/act_csum.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 648a3a35b720..f01c59ba6d12 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -50,7 +50,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 struct netlink_ext_ack *extack)
 {
struct tc_action_net *tn = net_generic(net, csum_net_id);
-   struct tcf_csum_params *params_old, *params_new;
+   struct tcf_csum_params *params_new;
struct nlattr *tb[TCA_CSUM_MAX + 1];
struct tc_csum *parm;
struct tcf_csum *p;
@@ -88,20 +88,22 @@ static int tcf_csum_init(struct net *net, struct nlattr 
*nla,
}
 
p = to_tcf_csum(*a);
-   ASSERT_RTNL();
 
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
tcf_idr_release(*a, bind);
return -ENOMEM;
}
-   params_old = rtnl_dereference(p->params);
+   params_new->update_flags = parm->update_flags;
 
+   spin_lock(&p->tcf_lock);
p->tcf_action = parm->action;
-   params_new->update_flags = parm->update_flags;
-   rcu_assign_pointer(p->params, params_new);
-   if (params_old)
-   kfree_rcu(params_old, rcu);
+   rcu_swap_protected(p->params, params_new,
+  lockdep_is_held(&p->tcf_lock));
+   spin_unlock(&p->tcf_lock);
+
+   if (params_new)
+   kfree_rcu(params_new, rcu);
 
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -599,11 +601,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct 
tc_action *a, int bind,
.index   = p->tcf_index,
.refcnt  = refcount_read(&p->tcf_refcnt) - ref,
.bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
-   .action  = p->tcf_action,
};
struct tcf_t t;
 
-   params = rtnl_dereference(p->params);
+   spin_lock(&p->tcf_lock);
+   params = rcu_dereference_protected(p->params,
+  lockdep_is_held(&p->tcf_lock));
+   opt.action = p->tcf_action;
opt.update_flags = params->update_flags;
 
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct 
tc_action *a, int bind,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
+   spin_unlock(&p->tcf_lock);
 
return skb->len;
 
 nla_put_failure:
+   spin_unlock(&p->tcf_lock);
nlmsg_trim(skb, b);
return -1;
 }
-- 
2.7.5

Re: [PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL

2018-08-05 Thread Daniel Borkmann

On 08/06/2018 06:45 AM, Alexei Starovoitov wrote:
> On Sun, Aug 05, 2018 at 05:19:13PM -0700, Martin KaFai Lau wrote:
>> This patch changes the tools/lib/bpf/btf.[ch] to LGPL which
>> is inline with libbpf also.
>>
>> Signed-off-by: Martin KaFai Lau 
> 
> technically it's bpf tree material, but since it changes
> "comment" only and we're very late into the release,
> I think bpf-next would be fine too.
> I'm ok whichever way.
> Acked-by: Alexei Starovoitov 

I think bpf tree is more appropriate, thus applied there, thanks Martin!

Re: [PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL

2018-08-05 Thread Alexei Starovoitov

On Sun, Aug 05, 2018 at 05:19:13PM -0700, Martin KaFai Lau wrote:
> This patch changes the tools/lib/bpf/btf.[ch] to LGPL which
> is inline with libbpf also.
> 
> Signed-off-by: Martin KaFai Lau 

technically it's bpf tree material, but since it changes
"comment" only and we're very late into the release,
I think bpf-next would be fine too.
I'm ok whichever way.
Acked-by: Alexei Starovoitov

[net-next:master 457/471] ptp_qoriq.c:undefined reference to `__aeabi_uldivmod'

2018-08-05 Thread kbuild test robot

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   981467033a37d916649647fa3afe1fe99bba1817
commit: 91305f2812624c0cf7ccbb44133b66d3b24676e4 [457/471] ptp_qoriq: support 
automatic configuration for ptp timer
config: arm-multi_v7_defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 91305f2812624c0cf7ccbb44133b66d3b24676e4
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=arm 

All errors (new ones prefixed by >>):

   drivers/ptp/ptp_qoriq.o: In function `qoriq_ptp_probe':
>> ptp_qoriq.c:(.text+0xd0c): undefined reference to `__aeabi_uldivmod'

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip

[net-next:master 1709/1739] drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2989:1: warning: the frame size of 1156 bytes is larger than 1024 bytes

2018-08-05 Thread kbuild test robot

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   981467033a37d916649647fa3afe1fe99bba1817
commit: 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 [1709/1739] bnxt_en: Add 
support for ethtool get dump.
config: microblaze-allmodconfig (attached as .config)
compiler: microblaze-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 6c5657d085ae8c13a8565b98e6a23fe68f0bede4
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=microblaze 

All warnings (new ones prefixed by >>):

   In function 'bnxt_fill_coredump_record',
   inlined from 'bnxt_get_coredump' at 
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2980:3:
   drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' 
output truncated before terminating nul copying as many bytes from a string as 
its length [-Wstringop-truncation]
 strncpy(record->system_name, utsname()->nodename,
 ^
  strlen(utsname()->nodename));
  
   drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c: In function 
'bnxt_get_coredump':
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2989:1: warning: the frame 
>> size of 1156 bytes is larger than 1024 bytes [-Wframe-larger-than=]
}
^

vim +2989 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c

  2896  
  2897  static int bnxt_get_coredump(struct bnxt *bp, void *buf, u32 *dump_len)
  2898  {
  2899  u32 ver_get_resp_len = sizeof(struct hwrm_ver_get_output);
  2900  struct coredump_segment_record *seg_record = NULL;
  2901  u32 offset = 0, seg_hdr_len, seg_record_len;
  2902  struct bnxt_coredump_segment_hdr seg_hdr;
  2903  struct bnxt_coredump_record coredump_rec;
  2904  struct bnxt_coredump coredump = {NULL};
  2905  time64_t start_time;
  2906  u16 start_utc;
  2907  int rc = 0, i;
  2908  
  2909  start_time = ktime_get_real_seconds();
  2910  start_utc = sys_tz.tz_minuteswest * 60;
  2911  seg_hdr_len = sizeof(seg_hdr);
  2912  
  2913  /* First segment should be hwrm_ver_get response */
  2914  *dump_len = seg_hdr_len + ver_get_resp_len;
  2915  if (buf) {
  2916  bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, NULL, 
ver_get_resp_len,
  2917 0, 0, 0);
  2918  memcpy(buf + offset, &seg_hdr, seg_hdr_len);
  2919  offset += seg_hdr_len;
  2920  memcpy(buf + offset, &bp->ver_resp, ver_get_resp_len);
  2921  offset += ver_get_resp_len;
  2922  }
  2923  
  2924  rc = bnxt_hwrm_dbg_coredump_list(bp, &coredump);
  2925  if (rc) {
  2926  netdev_err(bp->dev, "Failed to get coredump segment 
list\n");
  2927  goto err;
  2928  }
  2929  
  2930  *dump_len += seg_hdr_len * coredump.total_segs;
  2931  
  2932  seg_record = (struct coredump_segment_record *)coredump.data;
  2933  seg_record_len = sizeof(*seg_record);
  2934  
  2935  for (i = 0; i < coredump.total_segs; i++) {
  2936  u16 comp_id = le16_to_cpu(seg_record->component_id);
  2937  u16 seg_id = le16_to_cpu(seg_record->segment_id);
  2938  u32 duration = 0, seg_len = 0;
  2939  unsigned long start, end;
  2940  
  2941  start = jiffies;
  2942  
  2943  rc = bnxt_hwrm_dbg_coredump_initiate(bp, comp_id, 
seg_id);
  2944  if (rc) {
  2945  netdev_err(bp->dev,
  2946 "Failed to initiate coredump for seg 
= %d\n",
  2947 seg_record->segment_id);
  2948  goto next_seg;
  2949  }
  2950  
  2951  /* Write segment data into the buffer */
  2952  rc = bnxt_hwrm_dbg_coredump_retrieve(bp, comp_id, 
seg_id,
  2953   &seg_len, buf,
  2954   offset + 
seg_hdr_len);
  2955  if (rc)
  2956  netdev_err(bp->dev,
  2957 "Failed to retrieve coredump for seg 
= %d\n",
  2958 seg_record->segment_id);
  2959  
  2960  next_seg:
  2961  end = jiffies;
  2962  duration = jiffies_to_msecs(end - start);
  2963  bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, seg_record, 
seg_len,
  2964 rc, duration, 0);
  2965  
  2966  if (buf) {
  2967

[PATCH] checkpatch: warn on unnecessary int declarations

2018-08-05 Thread Joe Perches

On Sun, 2018-08-05 at 08:52 -0700, Linus Torvalds wrote:
> "long unsigned int" isn't _technically_ wrong. But we normally
> call that type "unsigned long".

So add a checkpatch test for it.

Signed-off-by: Joe Perches 
---
 scripts/checkpatch.pl | 20 
 1 file changed, 20 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index ce72cc4784e6..bc6dda34394e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3829,6 +3829,26 @@ sub process {
 "type '$tmp' should be specified in [[un]signed] 
[short|int|long|long long] order\n" . $herecurr);
}
 
+# check for unnecessary  int declarations of short/long/long long
+   while ($sline =~ 
m{\b($TypeMisordered(\s*\*)*|$C90_int_types)\b}g) {
+   my $type = trim($1);
+   next if ($type !~ /\bint\b/);
+   next if ($type !~ /\b(?:short|long\s+long|long)\b/);
+   my $new_type = $type;
+   $new_type =~ s/\b\s*int\s*\b/ /;
+   $new_type =~ s/\b\s*(?:un)?signed\b\s*/ /;
+   $new_type =~ s/^const\s+//;
+   $new_type = "unsigned $new_type" if ($type =~ 
/\bunsigned\b/);
+   $new_type = "const $new_type" if ($type =~ /^const\b/);
+   $new_type =~ s/\s+/ /g;
+   $new_type = trim($new_type);
+   if (WARN("UNNECESSARY_INT",
+"Prefer '$new_type' over '$type' as the int is 
unnecessary\n" . $herecurr) &&
+   $fix) {
+   $fixed[$fixlinenr] =~ 
s/\b\Q$type\E\b/$new_type/;
+   }
+   }
+
 # check for static const char * arrays.
if ($line =~ 
/\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) {
WARN("STATIC_CONST_CHAR_ARRAY",

Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"

2018-08-05 Thread zhong jiang

On 2018/8/6 8:33, David Miller wrote:
> From: zhong jiang 
> Date: Sun, 5 Aug 2018 22:18:43 +0800
>
>> @@ -1797,7 +1795,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
>>  struct sk_buff *skb_trimmed = NULL;
>>  const unsigned char *src;
>>  struct igmphdr *ih;
>> -int err;
>> +int err = 0;
>>  
>>  err = ip_mc_check_igmp(skb, &skb_trimmed);
> The initialization of err to '0' is unnecessary, it gets assigned on
> the very next line.
>
> .
>
 That's right.  I will drop that change and repost.

Thanks,
zhong jiang

linux-next: build failure after merge of the net-next tree

2018-08-05 Thread Stephen Rothwell

Hi all,

After merging the net-next tree, today's linux-next build (arm
multi_v7_defconfig) failed like this:

drivers/ptp/ptp_qoriq.o: In function `qoriq_ptp_probe':
ptp_qoriq.c:(.text+0xd0c): undefined reference to `__aeabi_uldivmod'

Caused by commit

  91305f281262 ("ptp_qoriq: support automatic configuration for ptp timer")

I just reverted that commit for today.

-- 
Cheers,
Stephen Rothwell


pgpEk7KnNE93L.pgp
Description: OpenPGP digital signature

[net-next:master 1709/1739] drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length

2018-08-05 Thread kbuild test robot

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   981467033a37d916649647fa3afe1fe99bba1817
commit: 6c5657d085ae8c13a8565b98e6a23fe68f0bede4 [1709/1739] bnxt_en: Add 
support for ethtool get dump.
config: ia64-allmodconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 6c5657d085ae8c13a8565b98e6a23fe68f0bede4
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=ia64 

All warnings (new ones prefixed by >>):

   In function 'bnxt_fill_coredump_record',
   inlined from 'bnxt_get_coredump' at 
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2980:3:
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c:2863:2: warning: 'strncpy' 
>> output truncated before terminating nul copying as many bytes from a string 
>> as its length [-Wstringop-truncation]
 strncpy(record->system_name, utsname()->nodename,
 ^
  strlen(utsname()->nodename));
  

vim +/strncpy +2863 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c

  2846  
  2847  static void
  2848  bnxt_fill_coredump_record(struct bnxt *bp, struct bnxt_coredump_record 
*record,
  2849time64_t start, s16 start_utc, u16 total_segs,
  2850int status)
  2851  {
  2852  time64_t end = ktime_get_real_seconds();
  2853  u32 os_ver_major = 0, os_ver_minor = 0;
  2854  struct tm tm;
  2855  
  2856  time64_to_tm(start, 0, &tm);
  2857  memset(record, 0, sizeof(*record));
  2858  strcpy(record->signature, "cOrE");
  2859  record->flags = 0;
  2860  record->low_version = 0;
  2861  record->high_version = 1;
  2862  record->asic_state = 0;
> 2863  strncpy(record->system_name, utsname()->nodename,
  2864  strlen(utsname()->nodename));
  2865  record->year = cpu_to_le16(tm.tm_year);
  2866  record->month = cpu_to_le16(tm.tm_mon);
  2867  record->day = cpu_to_le16(tm.tm_mday);
  2868  record->hour = cpu_to_le16(tm.tm_hour);
  2869  record->minute = cpu_to_le16(tm.tm_min);
  2870  record->second = cpu_to_le16(tm.tm_sec);
  2871  record->utc_bias = cpu_to_le16(start_utc);
  2872  strcpy(record->commandline, "ethtool -w");
  2873  record->total_segments = cpu_to_le32(total_segs);
  2874  
  2875  sscanf(utsname()->release, "%u.%u", &os_ver_major, 
&os_ver_minor);
  2876  record->os_ver_major = cpu_to_le32(os_ver_major);
  2877  record->os_ver_minor = cpu_to_le32(os_ver_minor);
  2878  
  2879  strcpy(record->os_name, utsname()->sysname);
  2880  time64_to_tm(end, 0, &tm);
  2881  record->end_year = cpu_to_le16(tm.tm_year + 1900);
  2882  record->end_month = cpu_to_le16(tm.tm_mon + 1);
  2883  record->end_day = cpu_to_le16(tm.tm_mday);
  2884  record->end_hour = cpu_to_le16(tm.tm_hour);
  2885  record->end_minute = cpu_to_le16(tm.tm_min);
  2886  record->end_second = cpu_to_le16(tm.tm_sec);
  2887  record->end_utc_bias = cpu_to_le16(sys_tz.tz_minuteswest * 60);
  2888  record->asic_id1 = cpu_to_le32(bp->chip_num << 16 |
  2889 bp->ver_resp.chip_rev << 8 |
  2890 bp->ver_resp.chip_metal);
  2891  record->asic_id2 = 0;
  2892  record->coredump_status = cpu_to_le32(status);
  2893  record->ioctl_low_version = 0;
  2894  record->ioctl_high_version = 0;
  2895  }
  2896  
  2897  static int bnxt_get_coredump(struct bnxt *bp, void *buf, u32 *dump_len)
  2898  {
  2899  u32 ver_get_resp_len = sizeof(struct hwrm_ver_get_output);
  2900  struct coredump_segment_record *seg_record = NULL;
  2901  u32 offset = 0, seg_hdr_len, seg_record_len;
  2902  struct bnxt_coredump_segment_hdr seg_hdr;
  2903  struct bnxt_coredump_record coredump_rec;
  2904  struct bnxt_coredump coredump = {NULL};
  2905  time64_t start_time;
  2906  u16 start_utc;
  2907  int rc = 0, i;
  2908  
  2909  start_time = ktime_get_real_seconds();
  2910  start_utc = sys_tz.tz_minuteswest * 60;
  2911  seg_hdr_len = sizeof(seg_hdr);
  2912  
  2913  /* First segment should be hwrm_ver_get response */
  2914  *dump_len = seg_hdr_len + ver_get_resp_len;
  2915  if (buf) {
  2916  bnxt_fill_coredump_seg_hdr(bp, &seg_hdr, NULL, 
ver_get_resp_len,
  2917 0, 0, 0);
  2918  memcpy(buf + offset, &seg_hdr, seg_hdr_len);
  2919

Re: [PATCH net-next] net: report netlink extack only if set

2018-08-05 Thread Willem de Bruijn

On Sun, Aug 5, 2018 at 8:37 PM David Miller  wrote:
>
> From: Willem de Bruijn 
> Date: Sun,  5 Aug 2018 15:48:01 -0400
>
> > From: Willem de Bruijn 
> >
> > Initialize extack in dev_set_mtu and report only if set.
> >
> > Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack")
> > Reported-by: syzbot 
> > Signed-off-by: Willem de Bruijn 
>
> Someone beat you to it :-)  And that version is in net-next.

Oops. I had totally missed that thread. Glad to see it's fixed already.

Re: [PATCH net-next] tc-testing: fix ip address in u32 test

2018-08-05 Thread David Miller

From: Vlad Buslov 
Date: Sun,  5 Aug 2018 22:35:56 +0300

> Fix expected ip address to actually match configured ip address.
> Fix test to expect single matched filter.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jamal Hadi Salim 

Applied.

Re: [PATCH net-next] tc-testing: remove duplicate spaces in skbedit match patterns

2018-08-05 Thread David Miller

From: Vlad Buslov 
Date: Sun,  5 Aug 2018 22:37:09 +0300

> Match patterns for some skbedit tests contain duplicate whitespace that is
> not present in actual tc output. This causes tests to fail because they
> can't match required action, even when it was successfully created.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jamal Hadi Salim 

Applied.

Re: [PATCH net-next] tc-testing: flush gact actions on test teardown

2018-08-05 Thread David Miller

From: Vlad Buslov 
Date: Sun,  5 Aug 2018 22:36:25 +0300

> Test 6fb4 creates one mirred and one pipe action, but only flushes mirred
> on teardown. Leaking pipe action causes failures in other tests.
> 
> Add additional teardown command to also flush gact actions.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jamal Hadi Salim 

Applied.

Re: [PATCH net-next] tc-testing: remove duplicate spaces in connmark match patterns

2018-08-05 Thread David Miller

From: Vlad Buslov 
Date: Sun,  5 Aug 2018 22:36:44 +0300

> Match patterns for some connmark tests contain duplicate whitespace that is
> not present in actual tc output. This causes tests to fail because they
> can't match required action, even when it was successfully created.
> 
> Fixes: 1dad0f97 ("tc-testing: add connmark action tests")
> Signed-off-by: Vlad Buslov 
> Acked-by: Jamal Hadi Salim 

Applied.

Re: [PATCH net-next 00/13] bnxt_en: Updates for net-next.

2018-08-05 Thread David Miller

From: Michael Chan 
Date: Sun,  5 Aug 2018 16:51:45 -0400

> This series includes the usual firmware spec update.  The driver has
> added external phy loopback test and phy setup retry logic that is
> needed during hotplug.  In the SRIOV space, the driver has added a
> new VF resource allocation mode that requires the VF driver to
> reserve resources during IFUP.  IF state changes are now propagated
> to firmware so that firmware can release some resources during IFDOWN.
> 
> ethtool method to get firmware core dump and hwmon temperature reading
> have been added.  DSCP to user priority support has been added to
> the driver's DCBNL interface, and the CoS queue logic has been refined
> to make sure that the special RDMA Congestion Notification hardware CoS
> queue will not be used for networking traffic.

Series applied, thanks Michael.

Re: pull-request: wireless-drivers-next 2018-08-05

2018-08-05 Thread David Miller

From: Kalle Valo 
Date: Sun, 05 Aug 2018 20:07:16 +0300

> a pull request to net-next tree for 4.19. If the merge window doesn't
> open today I'm planning to submit one more pull request later next week.
> 
> Please let me know if you have any problems.

Pulled, thanks Kalle.

Re: [PATCH net-next] net: report netlink extack only if set

2018-08-05 Thread David Miller

From: Willem de Bruijn 
Date: Sun,  5 Aug 2018 15:48:01 -0400

> From: Willem de Bruijn 
> 
> Initialize extack in dev_set_mtu and report only if set.
> 
> Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack")
> Reported-by: syzbot 
> Signed-off-by: Willem de Bruijn 

Someone beat you to it :-)  And that version is in net-next.

Thanks!

Re: [PATCH net] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit

2018-08-05 Thread David Miller

From: Xin Long 
Date: Sun,  5 Aug 2018 22:46:07 +0800

> According to RFC791, 68 bytes is the minimum size of IPv4 datagram every
> device must be able to forward without further fragmentation while 576
> bytes is the minimum size of IPv4 datagram every device has to be able
> to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right
> value for the ipv4 min mtu check in ip6_tnl_xmit.
> 
> While at it, change to use max() instead of if statement.
> 
> Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit")
> Reported-by: Sabrina Dubroca 
> Signed-off-by: Xin Long 

Applied and queued up for -stable, thanks Xin.

Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"

2018-08-05 Thread David Miller

From: zhong jiang 
Date: Sun, 5 Aug 2018 22:18:43 +0800

> @@ -1797,7 +1795,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
>   struct sk_buff *skb_trimmed = NULL;
>   const unsigned char *src;
>   struct igmphdr *ih;
> - int err;
> + int err = 0;
>  
>   err = ip_mc_check_igmp(skb, &skb_trimmed);

The initialization of err to '0' is unnecessary, it gets assigned on
the very next line.

Re: pull request: bluetooth-next 2018-08-05

2018-08-05 Thread David Miller

From: Johan Hedberg 
Date: Sun, 5 Aug 2018 09:14:30 +0300

> Here's the main bluetooth-next pull request for the 4.19 kernel.
> 
>  - Added support for Bluetooth Advertising Extensions
>  - Added vendor driver support to hci_h5 HCI driver
>  - Added serdev support to hci_h5 driver
>  - Added support for Qualcomm wcn3990 controller
>  - Added support for RTL8723BS and RTL8723DS controllers
>  - btusb: Added new ID for Realtek 8723DE
>  - Several other smaller fixes & cleanups
> 
> Please let me know if there are any issues pulling. Thanks.

Pulled, thanks Johan.

Re: [PATCH net-next 0/3] mlxsw: Enable MC-aware mode for mlxsw ports

2018-08-05 Thread David Miller

From: Ido Schimmel 
Date: Sun,  5 Aug 2018 09:03:05 +0300

> Petr says:
> 
> Due to an issue in Spectrum chips, when unicast traffic shares the same
> queue as BUM traffic, and there is a congestion, the BUM traffic is
> admitted to the queue anyway, thus pushing out all UC traffic. In order
> to give unicast traffic precedence over BUM traffic, configure
> multicast-aware mode on all ports.
> 
> Under multicast-aware regime, when assigning traffic class to a packet,
> the switch doesn't merely take the value prescribed by the QTCT
> register. For BUM traffic, it instead assigns that value plus 8. That
> limits the number of available TCs, but since mlxsw currently only uses
> the lower eight anyway, it is no real loss.
> 
> The two TCs (UC and MC one) are then mapped to the same subgroup and
> strictly prioritized so that UC traffic is preferred in case of
> congestion.
> 
> In patch #1, introduce a new register, QTCTM, which enables the
> multicast-aware mode.
> 
> In patch #2, fix a typo in related code.
> 
> In patch #3, set up TCs and QTCTM to enable multicast-aware mode.

Series applied, thanks.

Re: [PATCH net-next] virtio-net: mark expected switch fall-throughs

2018-08-05 Thread David Miller

From: "Gustavo A. R. Silva" 
Date: Sat, 4 Aug 2018 21:42:05 -0500

> In preparation to enabling -Wimplicit-fallthrough, mark switch cases
> where we are expecting to fall through.
> 
> Addresses-Coverity-ID: 1402059 ("Missing break in switch")
> Addresses-Coverity-ID: 1402060 ("Missing break in switch")
> Addresses-Coverity-ID: 1402061 ("Missing break in switch")
> Signed-off-by: Gustavo A. R. Silva 

Applied, thank you.

Re: [PATCH] mellanox: fix the dport endianness in call of __inet6_lookup_established()

2018-08-05 Thread David Miller

From: Al Viro 
Date: Sat, 4 Aug 2018 21:41:27 +0100

> __inet6_lookup_established() expect th->dport passed in host-endian,
> not net-endian.  The reason is microoptimization in __inet6_lookup(),
> but if you use the lower-level helpers, you have to play by their
> rules...
> 
> Signed-off-by: Al Viro 

Mellanox folks, please review.

Re: [PATCH net-next] net: sched: cls_flower: Fix an error code in fl_tmplt_create()

2018-08-05 Thread David Miller

From: Dan Carpenter 
Date: Fri, 3 Aug 2018 22:27:55 +0300

> We forgot to set the error code on this path, so we return NULL instead
> of an error pointer.  In the current code kzalloc() won't fail for small
> allocations so this doesn't really affect runtime.
> 
> Fixes: b95ec7eb3b4d ("net: sched: cls_flower: implement chain templates")
> Signed-off-by: Dan Carpenter 

Applied, thanks Dan.

Re: [PATCH][net-next] net: check extack._msg before print

2018-08-05 Thread David Miller

From: Li RongQing 
Date: Fri,  3 Aug 2018 15:45:21 +0800

> dev_set_mtu_ext is able to fail with a valid mtu value, at that
> condition, extack._msg is not set and random since it is in stack,
> then kernel will crash when print it.
> 
> Fixes: 7a4c53bee3324a ("net: report invalid mtu value via netlink extack")
> Signed-off-by: Zhang Yu 
> Signed-off-by: Li RongQing 

Applied, thank you.

Re: [PATCH bpf-next 12/13] docs: net: Fix various minor typos

2018-08-05 Thread Tobin C. Harding

On Fri, Aug 03, 2018 at 10:41:12AM +0200, Daniel Borkmann wrote:
> On 08/01/2018 07:09 AM, Tobin C. Harding wrote:
> > There are a few minor typos and grammatical issues.  We should however
> > try to keep the current flavour of the document.
> > 
> > Fix typos and grammar if glaringly required.
> > 
> > Signed-off-by: Tobin C. Harding 
> 
> Overall looks good, just some minor nits:
> 
> >  Documentation/networking/filter.rst | 65 +++--
> >  1 file changed, 33 insertions(+), 32 deletions(-)
> > 
> > diff --git a/Documentation/networking/filter.rst 
> > b/Documentation/networking/filter.rst
> > index 99dfa74fc4f7..b989a6c882b8 100644
> > --- a/Documentation/networking/filter.rst
> > +++ b/Documentation/networking/filter.rst
> > @@ -32,10 +32,10 @@ removing the old one and placing your new one in its 
> > place, assuming your
> >  filter has passed the checks, otherwise if it fails the old filter will
> >  remain on that socket.
> >  
> > -SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once
> > -set, a filter cannot be removed or changed. This allows one process to
> > +SO_LOCK_FILTER option allows locking of the filter attached to a socket.
> > +Once set, a filter cannot be removed or changed. This allows one process to
> >  setup a socket, attach a filter, lock it then drop privileges and be
> > -assured that the filter will be kept until the socket is closed.
> > +assured  that the filter will be kept until the socket is closed.
> 
>   ^-- looks like extra whitespace slipped in?

Fixed in the RFC.  Thanks for taking the time review.

Tobin

Re: [Patch net] ipv6: fix double refcount of fib6_metrics

2018-08-05 Thread David Miller

From: Cong Wang 
Date: Thu,  2 Aug 2018 23:20:38 -0700

> All the callers of ip6_rt_copy_init()/rt6_set_from() hold refcnt
> of the "from" fib6_info, so there is no need to hold fib6_metrics
> refcnt again, because fib6_metrics refcnt is only released when
> fib6_info is gone, that is, they have the same life time, so the
> whole fib6_metrics refcnt can be removed actually.
> 
> This fixes a kmemleak warning reported by Sabrina.
> 
> Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst 
> based routes")
> Reported-by: Sabrina Dubroca 
> Cc: Sabrina Dubroca 
> Cc: David Ahern 
> Signed-off-by: Cong Wang 

Applied, thanks Cong.

Re: [PATCH bpf-next 05/13] docs: net: Fix indentation issues for code snippets

2018-08-05 Thread Tobin C. Harding

On Fri, Aug 03, 2018 at 10:44:23AM +0200, Daniel Borkmann wrote:
> On 08/01/2018 07:09 AM, Tobin C. Harding wrote:
> [...]
> > -Starting bpf_dbg is trivial and just requires issuing:
> > +Starting bpf_dbg is trivial and just requires issuing::
> >  
> > -# ./bpf_dbg
> > +  # ./bpf_dbg
> >  
> >  In case input and output do not equal stdin/stdout, bpf_dbg takes an
> >  alternative stdin source as a first argument, and an alternative stdout
> > @@ -381,86 +384,87 @@ file "~/.bpf_dbg_init" and the command history is 
> > stored in the file
> >  Interaction in bpf_dbg happens through a shell that also has 
> > auto-completion
> >  support (follow-up example commands starting with '>' denote bpf_dbg 
> > shell).
> >  The usual workflow would be to ...
> > -
> > -> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0
> > -  Loads a BPF filter from standard output of bpf_asm, or transformed via
> > -  e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT
> > -  debugging (next section), this command creates a temporary socket and
> > -  loads the BPF code into the kernel. Thus, this will also be useful for
> > -  JIT developers.
> > -
> > -> load pcap foo.pcap
> > -  Loads standard tcpdump pcap file.
> > -
> > -> run []
> > -bpf passes:1 fails:9
> > -  Runs through all packets from a pcap to account how many passes and fails
> > -  the filter will generate. A limit of packets to traverse can be given.
> > -
> > -> disassemble
> > -l0:ldh [12]
> > -l1:jeq #0x800, l2, l5
> > -l2:ldb [23]
> > -l3:jeq #0x1, l4, l5
> > -l4:ret #0x
> > -l5:ret #0
> > -  Prints out BPF code disassembly.
> > -
> > -> dump
> > -/* { op, jt, jf, k }, */
> > -{ 0x28,  0,  0, 0x000c },
> > -{ 0x15,  0,  3, 0x0800 },
> > -{ 0x30,  0,  0, 0x0017 },
> > -{ 0x15,  0,  1, 0x0001 },
> > -{ 0x06,  0,  0, 0x },
> > -{ 0x06,  0,  0, 00 },
> > -  Prints out C-style BPF code dump.
> > -
> > -> breakpoint 0
> > -breakpoint at: l0: ldh [12]
> > -> breakpoint 1
> > -breakpoint at: l1: jeq #0x800, l2, l5
> > -  ...
> > -  Sets breakpoints at particular BPF instructions. Issuing a `run` command
> > -  will walk through the pcap file continuing from the current packet and
> > -  break when a breakpoint is being hit (another `run` will continue from
> > -  the currently active breakpoint executing next instructions):
> > -
> > -  > run
> > -  -- register dump --
> > -  pc:   [0]   <-- program counter
> > -  code: [40] jt[0] jf[0] k[12]<-- plain BPF code of current 
> > instruction
> > -  curr: l0:ldh [12]  <-- disassembly of current 
> > instruction
> > -  A:[][0] <-- content of A (hex, decimal)
> > -  X:[][0] <-- content of X (hex, decimal)
> > -  M[0,15]:  [][0] <-- folded content of M (hex, 
> > decimal)
> > -  -- packet dump --   <-- Current packet from pcap (hex)
> > -  len: 42
> > -0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01
> > -   16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26
> > -   32: 00 00 00 00 00 00 0a 3b 01 01
> > -  (breakpoint)
> > -  >
> > -
> > -> breakpoint
> > -breakpoints: 0 1
> > -  Prints currently set breakpoints.
> > -
> > -> step [-, +]
> > -  Performs single stepping through the BPF program from the current pc
> > -  offset. Thus, on each step invocation, above register dump is issued.
> > -  This can go forwards and backwards in time, a plain `step` will break
> > -  on the next BPF instruction, thus +1. (No `run` needs to be issued here.)
> > -
> > -> select 
> > -  Selects a given packet from the pcap file to continue from. Thus, on
> > -  the next `run` or `step`, the BPF program is being evaluated against
> > -  the user pre-selected packet. Numbering starts just as in Wireshark
> > -  with index 1.
> > -
> > -> quit
> > -#
> > -  Exits bpf_dbg.
> > +::
> > +
> > +  > load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0
> > +Loads a BPF filter from standard output of bpf_asm, or transformed via
> > +e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT
> > +debugging (next section), this command creates a temporary socket and
> > +loads the BPF code into the kernel. Thus, this will also be useful for
> > +JIT developers.
> 
> Here for the bpf_dbg howto, it would be good to separate explanation from
> the cmdline code snippets. This would more easily clarify the commands
> themselves if we already go the rst route, so I'd prefer splitting this up.

Sure thing.  Will do as suggested. Thanks for the review.


Tobin

Re: [PATCH net-next v3] ipv6: defrag: drop non-last frags smaller than min mtu

2018-08-05 Thread David Miller

From: Florian Westphal 
Date: Fri,  3 Aug 2018 02:22:20 +0200

> don't bother with pathological cases, they only waste cycles.
> IPv6 requires a minimum MTU of 1280 so we should never see fragments
> smaller than this (except last frag).
> 
> v3: don't use awkward "-offset + len"
> v2: drop IPv4 part, which added same check w. IPV4_MIN_MTU (68).
> There were concerns that there could be even smaller frags
> generated by intermediate nodes, e.g. on radio networks.
> 
> Cc: Peter Oskolkov 
> Cc: Eric Dumazet 
> Signed-off-by: Florian Westphal 

Applied, thanks Florian.

Re: [PATCH] net/bridge/br_multicast: remove redundant variable "err"

2018-08-05 Thread Stephen Hemminger

On Sun, 5 Aug 2018 22:18:43 +0800
zhong jiang  wrote:

> The err is not used after initalization, So remove it and make
> it void function.
> 
> Signed-off-by: zhong jiang 

Makes sense to me.

Acked-by: Stephen Hemminger

[PATCH bpf] bpf: btf: Change tools/lib/bpf/btf to LGPL

2018-08-05 Thread Martin KaFai Lau

This patch changes the tools/lib/bpf/btf.[ch] to LGPL which
is inline with libbpf also.

Signed-off-by: Martin KaFai Lau 
---
 tools/lib/bpf/btf.c | 2 +-
 tools/lib/bpf/btf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 2d270c560df3..c36a3a76986a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: LGPL-2.1
 /* Copyright (c) 2018 Facebook */
 
 #include 
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index e2a09a155f84..caac3a404dc5 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: LGPL-2.1 */
 /* Copyright (c) 2018 Facebook */
 
 #ifndef __BPF_BTF_H
-- 
2.17.1

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Michal Kubecek  wrote:
> On Mon, Aug 06, 2018 at 01:15:37AM +0200, Florian Westphal wrote:
> > Michal Kubecek  wrote:
> > > Oops, exactly this issue was already discussed almost a year ago:
> > > 
> > >   http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz
> > > 
> > > But something more urgent came and I forgot to get back to it. :-(
> > 
> > I did not even remeber, thanks for the pointer.
> > So I think best course of action is to update man page to clearly
> > say this only works in postrouting and with udp, and is ONLY
> > intended for working around old dhcp software.
> 
> As GSO for UDP is on its way to mainline, one might get into trouble
> even with UDP if the rule is not specific enough.

Yes, we still need a fix to ignore GSO too.

Re: [PATCH net-next 0/3] ip: Use rb trees for IP frag queue.

2018-08-05 Thread David Miller

From: Peter Oskolkov 
Date: Thu,  2 Aug 2018 22:45:57 +

> This patchset
>  * changes IPv4 defrag behavior to match that of IPv6: overlapping
>fragments now cause the whole IP datagram to be discarded (suggested
>by David Miller): there are no legitimate use cases for overlapping
>fragments;
>  * changes IPv4 defrag queue from a list to a rb tree (suggested
>by Eric Dumazet): this change removes a potential attach vector.
> 
> Upcoming patches will contain similar changes for IPv6 frag queue,
> as well as a comprehensive IP defrag self-test (temporarily delayed).

Looks good, series applied, thanks!

Re: [PATCH net-next] net/tls: Mark the end in scatterlist table

2018-08-05 Thread David Miller

From: Vakul Garg 
Date: Thu,  2 Aug 2018 20:43:10 +0530

> Function zerocopy_from_iter() unmarks the 'end' in input sgtable while
> adding new entries in it. The last entry in sgtable remained unmarked.
> This results in KASAN error report on using apis like sg_nents(). Before
> returning, the function needs to mark the 'end' in the last entry it
> adds.
> 
> Signed-off-by: Vakul Garg 

Yes, a properly formed scatterlist table must always have it's
end marked.

Applied, thanks.

Re: [v2, 1/3] arm64: dts: fsl: add clocks property for fman ptp timer node

2018-08-05 Thread David Miller

From: Yangbo Lu 
Date: Wed,  1 Aug 2018 18:05:52 +0800

> This patch is to add clocks property for fman ptp timer node.
> 
> Signed-off-by: Yangbo Lu 
> ---
> Changes for v2:
>   - None.

Applied.

Re: [PATCH] ipv6: icmp: Updating pmtu for link local route

2018-08-05 Thread David Miller

From: Georg Kohmann 
Date: Thu,  2 Aug 2018 13:56:58 +0200

> When a ICMPV6_PKT_TOOBIG is received from a link local address the pmtu will
> be updated on a route with an arbitrary interface index. Subsequent packets
> sent back to the same link local address may therefore end up not
> considering the updated pmtu.
> 
> Current behavior breaks TAHI v6LC4.1.4 Reduce PMTU On-link. Referring to RFC
> 1981: Section 3: "Note that Path MTU Discovery must be performed even in
> cases where a node "thinks" a destination is attached to the same link as
> itself. In a situation such as when a neighboring router acts as proxy [ND]
> for some destination, the destination can to appear to be directly
> connected but is in fact more than one hop away."
> 
> Using the interface index from the incoming ICMPV6_PKT_TOOBIG when updating
> the pmtu.
> 
> Signed-off-by: Georg Kohmann 

Applied, thanks Georg.

Re: [v2, 3/3] ptp_qoriq: support automatic configuration for ptp timer

2018-08-05 Thread David Miller

From: Yangbo Lu 
Date: Wed,  1 Aug 2018 18:05:54 +0800

> This patch is to support automatic configuration for ptp timer.
> If required ptp dts properties are not provided, driver could
> try to calculate a set of default configurations to initialize
> the ptp timer. This makes the driver work for many boards which
> don't have the required ptp dts properties in current kernel.
> Also the users could set dts properties by themselves according
> to their requirement.
> 
> Signed-off-by: Yangbo Lu 
> ---
> Changes for v2:
>   - Dropped module_param.

Applied.

Re: [v2, 2/3] powerpc/mpc85xx: add clocks property for fman ptp timer node

2018-08-05 Thread David Miller

From: Yangbo Lu 
Date: Wed,  1 Aug 2018 18:05:53 +0800

> This patch is to add clocks property for fman ptp timer node.
> 
> Signed-off-by: Yangbo Lu 
> ---
> Changes for v2:
>   - None.

Applied.

Re: Linux kernel error stack

2018-08-05 Thread Michal Kubecek

On Mon, Aug 06, 2018 at 01:15:37AM +0200, Florian Westphal wrote:
> Michal Kubecek  wrote:
> > Oops, exactly this issue was already discussed almost a year ago:
> > 
> >   http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz
> > 
> > But something more urgent came and I forgot to get back to it. :-(
> 
> I did not even remeber, thanks for the pointer.
> So I think best course of action is to update man page to clearly
> say this only works in postrouting and with udp, and is ONLY
> intended for working around old dhcp software.

As GSO for UDP is on its way to mainline, one might get into trouble
even with UDP if the rule is not specific enough.
 
> From kernel, have checkentry emit a warning
> when this is used for protocols other than udp,
> and make this a no-op for everything else.
> 
> Same for postrouting: warn from checkentry if the hook is something
> else, and have target function not do anything for !postrouting.
> 
> Does that make sense?

It sounds reasonable to me.

> I can work on a patch later this week if needed.

I'll be on vacation until Sunday so that I certainly won't get to it
earlier than next week.

Michal Kubecek

Re: [PATCH 00/21] Netfilter updates for net-next

2018-08-05 Thread David Miller

From: Pablo Neira Ayuso 
Date: Sun,  5 Aug 2018 23:21:20 +0200

> The following patchset contains Netfilter updates for your net-next tree:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Pulled, thanks Pablo.

Re: KCM - recvmsg() mangles packets?

2018-08-05 Thread Dominique Martinet

Dominique Martinet wrote on Sun, Aug 05, 2018:
> It's getting late but I'll try adding a pskb_pull in there tomorrow, it
> would be better to make the bpf program start with an offset but I don't
> think that'll be easy to change...

I can confirm the following patch fixes the issue for me:
-8<-
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 625acb27efcc..348ff5945591 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -222,6 +222,16 @@ static int __strp_recv(read_descriptor_t *desc, struct 
sk_buff *orig_skb,
if (!stm->strp.full_len) {
ssize_t len;
 
+   /* Can only parse if there is no offset */
+   if (unlikely(stm->strp.offset)) {
+   if (!pskb_pull(skb, stm->strp.offset)) {
+   STRP_STATS_INCR(strp->stats.mem_fail);
+   strp_parser_err(strp, -ENOMEM, desc);
+   break;
+   }
+   stm->strp.offset = 0;
+   }
+
len = (*strp->cb.parse_msg)(strp, head);
 
if (!len) {
8<--

Now, I was looking at other users of strparser (I see sockmap, kcm and
tls) and it looks like sockmap does not handle offsets either but tls
does by using skb_copy_bits -- they're copying the tls header to a
buffer on the stack.

kcm cannot do that because we do not know how much data the user expects
to read, and I'm not comfortable doing pskb_pull in the kcm callback
either, but the cost of this pull is probably non-negligible if some
user can make do without it...

On the other hand, I do not see how to make the bpf program handle an
offset in the skb as that offset is strparser-specific.

Maybe add a flag in the cb that specifies wether the callback allows
non-zero offset?


I'll let you see if you can reproduce this and will wait for advices on
how to solve this properly so we can work on a proper fix.


Thanks,
-- 
Dominique

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Michal Kubecek  wrote:
> Oops, exactly this issue was already discussed almost a year ago:
> 
>   http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz
> 
> But something more urgent came and I forgot to get back to it. :-(

I did not even remeber, thanks for the pointer.
So I think best course of action is to update man page to clearly
say this only works in postrouting and with udp, and is ONLY
intended for working around old dhcp software.

>From kernel, have checkentry emit a warning
when this is used for protocols other than udp,
and make this a no-op for everything else.

Same for postrouting: warn from checkentry if the hook is something
else, and have target function not do anything for !postrouting.

Does that make sense?

I can work on a patch later this week if needed.

Re: Linux kernel error stack

2018-08-05 Thread Michal Kubecek

On Sun, Aug 05, 2018 at 11:03:42PM +0200, Florian Westphal wrote:
> Satish Patel  wrote:
> > Florian,
> > 
> > It seems those rules coming from here
> > https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum
> 
> This is crazy, and, as you found, it doesn't even do what they seem to
> think it does.
> 
> I see no reason for these rules at all.

Oops, exactly this issue was already discussed almost a year ago:

  http://lkml.kernel.org/r/20170824104824.2c318a0...@unicorn.suse.cz

But something more urgent came and I forgot to get back to it. :-(

Michal Kubecek

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Satish Patel  wrote:
> Florian,
> 
> It seems those rules coming from here
> https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum

This is crazy, and, as you found, it doesn't even do what they seem to
think it does.

I see no reason for these rules at all.

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Satish Patel  wrote:
> After reading further related DHCP checksum issue, it seems we need
> that rules when you running DHCP on same host machine where your guest
> using host DHCP service, in that case virtual nic won't do checksum.
> If your DHCP running on different host then your physical nic perform
> checksum.

Partially right, its only needed for older dhcp software that relies
on packet-header checksum correctness (metadata contains info when
checksum will be filled in by nic later, modern servers will
use that and ignore the packet-header checksums in this case).

If you remove the rule and your VMs still get leases via DHCP then
the CHECKSUM workaround isn't needed.

[PATCH net-next 02/13] bnxt_en: Adjust timer based on ethtool stats-block-usecs settings.

2018-08-05 Thread Michael Chan

The driver gathers statistics using 2 mechanisms.  Some stats are DMA'ed
directly from hardware and others are polled from the driver's timer.
Currently, we only adjust the DMA frequency based on the ethtool
stats-block-usecs setting.  This patch adjusts the driver's timer
frequency as well to make everything consistent.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 3d40e49..1f626af 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -112,6 +112,11 @@ static int bnxt_set_coalesce(struct net_device *dev,
  BNXT_MAX_STATS_COAL_TICKS);
stats_ticks = rounddown(stats_ticks, BNXT_MIN_STATS_COAL_TICKS);
bp->stats_coal_ticks = stats_ticks;
+   if (bp->stats_coal_ticks)
+   bp->current_interval =
+   bp->stats_coal_ticks * HZ / 100;
+   else
+   bp->current_interval = BNXT_TIMER_INTERVAL;
update_stats = true;
}
 
-- 
2.5.1

[PATCH net-next 05/13] bnxt_en: Add new VF resource allocation strategy mode.

2018-08-05 Thread Michael Chan

The new mode is "minimal-static" to be used when resources are more
limited to support a large number of VFs, for example  The PF driver
will provision guaranteed minimum resources of 0.  Each VF has no
guranteed resources until it tries to reserve resources during device
open.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c   |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h   |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 23 ++-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index fd936c5..e0e3b4b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5162,7 +5162,7 @@ int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all)
 
pf->vf_resv_strategy =
le16_to_cpu(resp->vf_reservation_strategy);
-   if (pf->vf_resv_strategy > BNXT_VF_RESV_STRATEGY_MINIMAL)
+   if (pf->vf_resv_strategy > BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC)
pf->vf_resv_strategy = BNXT_VF_RESV_STRATEGY_MAXIMAL;
}
 hwrm_func_resc_qcaps_exit:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 47eec14..b44a758 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -862,6 +862,7 @@ struct bnxt_pf_info {
u8  vf_resv_strategy;
 #define BNXT_VF_RESV_STRATEGY_MAXIMAL  0
 #define BNXT_VF_RESV_STRATEGY_MINIMAL  1
+#define BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC   2
void*hwrm_cmd_req_addr[4];
dma_addr_t  hwrm_cmd_req_dma_addr[4];
struct bnxt_vf_info *vf;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index f560845..b896a52 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -447,7 +447,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int 
num_vfs)
u16 vf_tx_rings, vf_rx_rings, vf_cp_rings;
u16 vf_stat_ctx, vf_vnics, vf_ring_grps;
struct bnxt_pf_info *pf = &bp->pf;
-   int i, rc = 0;
+   int i, rc = 0, min = 1;
 
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_RESOURCE_CFG, -1, -1);
 
@@ -464,14 +464,19 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, 
int num_vfs)
 
req.min_rsscos_ctx = cpu_to_le16(BNXT_VF_MIN_RSS_CTX);
req.max_rsscos_ctx = cpu_to_le16(BNXT_VF_MAX_RSS_CTX);
-   if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL) {
-   req.min_cmpl_rings = cpu_to_le16(1);
-   req.min_tx_rings = cpu_to_le16(1);
-   req.min_rx_rings = cpu_to_le16(1);
-   req.min_l2_ctxs = cpu_to_le16(BNXT_VF_MIN_L2_CTX);
-   req.min_vnics = cpu_to_le16(1);
-   req.min_stat_ctx = cpu_to_le16(1);
-   req.min_hw_ring_grps = cpu_to_le16(1);
+   if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC) {
+   min = 0;
+   req.min_rsscos_ctx = cpu_to_le16(min);
+   }
+   if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL ||
+   pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL_STATIC) {
+   req.min_cmpl_rings = cpu_to_le16(min);
+   req.min_tx_rings = cpu_to_le16(min);
+   req.min_rx_rings = cpu_to_le16(min);
+   req.min_l2_ctxs = cpu_to_le16(min);
+   req.min_vnics = cpu_to_le16(min);
+   req.min_stat_ctx = cpu_to_le16(min);
+   req.min_hw_ring_grps = cpu_to_le16(min);
} else {
vf_cp_rings /= num_vfs;
vf_tx_rings /= num_vfs;
-- 
2.5.1

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

After reading further related DHCP checksum issue, it seems we need
that rules when you running DHCP on same host machine where your guest
using host DHCP service, in that case virtual nic won't do checksum.
If your DHCP running on different host then your physical nic perform
checksum.

On Sun, Aug 5, 2018 at 4:39 PM, Satish Patel  wrote:
> Florian,
>
> I have removed those port 80 CHECKSUM rules and everything looks good
> i didn't see kernel WARN mesg.
>
> Thank you so much! You just nailed it :)
>
> On Sun, Aug 5, 2018 at 4:15 PM, Satish Patel  wrote:
>> Florian,
>>
>> It seems those rules coming from here
>> https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum
>>
>> On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel  wrote:
>>> Yes this is openstack-ansible deployment tool which set them up. I am
>>> wondering where are these rules saved? I believe openstack-ansible use
>>> LXC container to deploy services so must be part of LXC startup
>>> scripts.
>>>
>>> I have checked there is no firewalld and iptables service running on 
>>> system..
>>>
>>> You think i should get rid of all CHEKSUM option in iptables rules? Am i 
>>> right?
>>>
>>>
>>> On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal  wrote:
 Satish Patel  wrote:
> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
> > --checksum-fill
> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
> > --checksum-fill

 These rules make no sense to me, and are also source of your backtrace.
 Who set this up?

 If this is coming from openstack, I suggest asking openstack developers
 WTH this is supposed to do.

> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
> > --dport 68 -j CHECKSUM --checksum-fill

 This was needed to work around dhcpd issues w. checksum offloading but I
 guess that DCHCP will work fine without this rule too nowadays.

 So I suggest you simply get rid of these rules.

[PATCH net-next 04/13] bnxt_en: Add PHY retry logic.

2018-08-05 Thread Michael Chan

During hotplug, the driver's open function can be called almost
immediately after power on reset.  The PHY may not be ready and the
firmware may return failure when the driver tries to update PHY
settings.  Add retry logic fired from the driver's timer to retry
the operation for 5 seconds.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  4 
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index d9fc905..fd936c5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6898,8 +6898,14 @@ static int __bnxt_open_nic(struct bnxt *bp, bool 
irq_re_init, bool link_re_init)
mutex_lock(&bp->link_lock);
rc = bnxt_update_phy_setting(bp);
mutex_unlock(&bp->link_lock);
-   if (rc)
+   if (rc) {
netdev_warn(bp->dev, "failed to update phy settings\n");
+   if (BNXT_SINGLE_PF(bp)) {
+   bp->link_info.phy_retry = true;
+   bp->link_info.phy_retry_expires =
+   jiffies + 5 * HZ;
+   }
+   }
}
 
if (irq_re_init)
@@ -7583,6 +7589,16 @@ static void bnxt_timer(struct timer_list *t)
set_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event);
bnxt_queue_sp_work(bp);
}
+
+   if (bp->link_info.phy_retry) {
+   if (time_after(jiffies, bp->link_info.phy_retry_expires)) {
+   bp->link_info.phy_retry = 0;
+   netdev_warn(bp->dev, "failed to update phy settings 
after maximum retries.\n");
+   } else {
+   set_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event);
+   bnxt_queue_sp_work(bp);
+   }
+   }
 bnxt_restart_timer:
mod_timer(&bp->timer, jiffies + bp->current_interval);
 }
@@ -7670,6 +7686,19 @@ static void bnxt_sp_task(struct work_struct *work)
netdev_err(bp->dev, "SP task can't update link (rc: 
%x)\n",
   rc);
}
+   if (test_and_clear_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event)) {
+   int rc;
+
+   mutex_lock(&bp->link_lock);
+   rc = bnxt_update_phy_setting(bp);
+   mutex_unlock(&bp->link_lock);
+   if (rc) {
+   netdev_warn(bp->dev, "update phy settings retry 
failed\n");
+   } else {
+   bp->link_info.phy_retry = false;
+   netdev_info(bp->dev, "update phy settings retry 
succeeded\n");
+   }
+   }
if (test_and_clear_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event)) {
mutex_lock(&bp->link_lock);
bnxt_get_port_module_status(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0d49fe0..47eec14 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -959,6 +959,9 @@ struct bnxt_link_info {
u16 advertising;/* user adv setting */
boolforce_link_chng;
 
+   boolphy_retry;
+   unsigned long   phy_retry_expires;
+
/* a copy of phy_qcfg output used to report link
 * info to VF
 */
@@ -1344,6 +1347,7 @@ struct bnxt {
 #define BNXT_GENEVE_DEL_PORT_SP_EVENT  13
 #define BNXT_LINK_SPEED_CHNG_SP_EVENT  14
 #define BNXT_FLOW_STATS_SP_EVENT   15
+#define BNXT_UPDATE_PHY_SP_EVENT   16
 
struct bnxt_hw_resc hw_resc;
struct bnxt_pf_info pf;
-- 
2.5.1

[PATCH net-next 13/13] bnxt_en: Do not use the CNP CoS queue for networking traffic.

2018-08-05 Thread Michael Chan

The CNP CoS queue is reserved for internal RDMA Congestion Notification
Packets (CNP) and should not be used for a TC.  Modify the CoS queue
discovery code to skip over the CNP CoS queue and to reduce
bp->max_tc accordingly.  However, if RDMA is disabled in NVRAM, the
the CNP CoS queue can be used for a TC.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h |  4 
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index dde904b..d7f51ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5281,7 +5281,8 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp)
int rc = 0;
struct hwrm_queue_qportcfg_input req = {0};
struct hwrm_queue_qportcfg_output *resp = bp->hwrm_cmd_resp_addr;
-   u8 i, *qptr;
+   u8 i, j, *qptr;
+   bool no_rdma;
 
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_QPORTCFG, -1, -1);
 
@@ -5299,19 +5300,24 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp)
if (bp->max_tc > BNXT_MAX_QUEUE)
bp->max_tc = BNXT_MAX_QUEUE;
 
+   no_rdma = !(bp->flags & BNXT_FLAG_ROCE_CAP);
+   qptr = &resp->queue_id0;
+   for (i = 0, j = 0; i < bp->max_tc; i++) {
+   bp->q_info[j].queue_id = *qptr++;
+   bp->q_info[j].queue_profile = *qptr++;
+   bp->tc_to_qidx[j] = j;
+   if (!BNXT_CNPQ(bp->q_info[j].queue_profile) ||
+   (no_rdma && BNXT_PF(bp)))
+   j++;
+   }
+   bp->max_tc = max_t(u8, j, 1);
+
if (resp->queue_cfg_info & QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG)
bp->max_tc = 1;
 
if (bp->max_lltc > bp->max_tc)
bp->max_lltc = bp->max_tc;
 
-   qptr = &resp->queue_id0;
-   for (i = 0; i < bp->max_tc; i++) {
-   bp->q_info[i].queue_id = *qptr++;
-   bp->q_info[i].queue_profile = *qptr++;
-   bp->tc_to_qidx[i] = i;
-   }
-
 qportcfg_exit:
mutex_unlock(&bp->hwrm_cmd_lock);
return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h
index c0e16c0..6eed231 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h
@@ -43,6 +43,10 @@ struct bnxt_dscp2pri_entry {
((q_profile) == \
 QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE)
 
+#define BNXT_CNPQ(q_profile)   \
+   ((q_profile) == \
+QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY_ROCE_CNP)
+
 #define HWRM_STRUCT_DATA_SUBTYPE_HOST_OPERATIONAL  0x0300
 
 void bnxt_dcb_init(struct bnxt *bp);
-- 
2.5.1

[PATCH net-next 10/13] bnxt_en: Notify firmware about IF state changes.

2018-08-05 Thread Michael Chan

Use latest firmware API to notify firmware about IF state changes.
Firmware has the option to clean up resources during IF down and
to require the driver to reserve resources again during IF up.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 53 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1659940..56bd097 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3638,7 +3638,9 @@ int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, 
unsigned long *bmap,
 
 static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
 {
+   struct hwrm_func_drv_rgtr_output *resp = bp->hwrm_cmd_resp_addr;
struct hwrm_func_drv_rgtr_input req = {0};
+   int rc;
 
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
 
@@ -3676,7 +3678,15 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD);
}
 
-   return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (rc)
+   rc = -EIO;
+   else if (resp->flags &
+cpu_to_le32(FUNC_DRV_RGTR_RESP_FLAGS_IF_CHANGE_SUPPORTED))
+   bp->fw_cap |= BNXT_FW_CAP_IF_CHANGE;
+   mutex_unlock(&bp->hwrm_cmd_lock);
+   return rc;
 }
 
 static int bnxt_hwrm_func_drv_unrgtr(struct bnxt *bp)
@@ -6637,6 +6647,39 @@ static int bnxt_hwrm_shutdown_link(struct bnxt *bp)
return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 }
 
+static int bnxt_hwrm_if_change(struct bnxt *bp, bool up)
+{
+   struct hwrm_func_drv_if_change_output *resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_func_drv_if_change_input req = {0};
+   bool resc_reinit = false;
+   int rc;
+
+   if (!(bp->fw_cap & BNXT_FW_CAP_IF_CHANGE))
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_IF_CHANGE, -1, -1);
+   if (up)
+   req.flags = cpu_to_le32(FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (!rc && (resp->flags &
+   cpu_to_le32(FUNC_DRV_IF_CHANGE_RESP_FLAGS_RESC_CHANGE)))
+   resc_reinit = true;
+   mutex_unlock(&bp->hwrm_cmd_lock);
+
+   if (up && resc_reinit && BNXT_NEW_RM(bp)) {
+   struct bnxt_hw_resc *hw_resc = &bp->hw_resc;
+
+   rc = bnxt_hwrm_func_resc_qcaps(bp, true);
+   hw_resc->resv_cp_rings = 0;
+   hw_resc->resv_tx_rings = 0;
+   hw_resc->resv_rx_rings = 0;
+   hw_resc->resv_hw_ring_grps = 0;
+   hw_resc->resv_vnics = 0;
+   }
+   return rc;
+}
+
 static int bnxt_hwrm_port_led_qcaps(struct bnxt *bp)
 {
struct hwrm_port_led_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
@@ -6991,8 +7034,13 @@ void bnxt_half_close_nic(struct bnxt *bp)
 static int bnxt_open(struct net_device *dev)
 {
struct bnxt *bp = netdev_priv(dev);
+   int rc;
 
-   return __bnxt_open_nic(bp, true, true);
+   bnxt_hwrm_if_change(bp, true);
+   rc = __bnxt_open_nic(bp, true, true);
+   if (rc)
+   bnxt_hwrm_if_change(bp, false);
+   return rc;
 }
 
 static bool bnxt_drv_busy(struct bnxt *bp)
@@ -7056,6 +7104,7 @@ static int bnxt_close(struct net_device *dev)
 
bnxt_close_nic(bp, true, true);
bnxt_hwrm_shutdown_link(bp);
+   bnxt_hwrm_if_change(bp, false);
return 0;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index ded2aff..6c40b257 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1290,6 +1290,7 @@ struct bnxt {
#define BNXT_FW_CAP_LLDP_AGENT  0x0002
#define BNXT_FW_CAP_DCBX_AGENT  0x0004
#define BNXT_FW_CAP_NEW_RM  0x0008
+   #define BNXT_FW_CAP_IF_CHANGE   0x0010
 
 #define BNXT_NEW_RM(bp)((bp)->fw_cap & BNXT_FW_CAP_NEW_RM)
u32 hwrm_spec_code;
-- 
2.5.1

[PATCH net-next 06/13] bnxt_en: Update RSS setup and GRO-HW logic according to the latest spec.

2018-08-05 Thread Michael Chan

Set the default hash mode flag in HWRM_VNIC_RSS_CFG to signal to the
firmware that the driver is compliant with the latest spec.  With
that, the firmware can return expanded RSS profile IDs that the driver
checks to setup the proper gso_type for GRO-HW packets.  But instead
of checking for the new profile IDs, we check the IP_TYPE flag
in TPA_START which is more straight forward than checking a list of
profile IDs.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index e0e3b4b..1714850 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1115,7 +1115,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct 
bnxt_rx_ring_info *rxr,
tpa_info->hash_type = PKT_HASH_TYPE_L4;
tpa_info->gso_type = SKB_GSO_TCPV4;
/* RSS profiles 1 and 3 with extract code 0 for inner 4-tuple */
-   if (hash_type == 3)
+   if (hash_type == 3 || TPA_START_IS_IPV6(tpa_start1))
tpa_info->gso_type = SKB_GSO_TCPV6;
tpa_info->rss_hash =
le32_to_cpu(tpa_start->rx_tpa_start_cmp_rss_hash);
@@ -3981,6 +3981,7 @@ static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, u16 
vnic_id, bool set_rss)
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_RSS_CFG, -1, -1);
if (set_rss) {
req.hash_type = cpu_to_le32(bp->rss_hash_cfg);
+   req.hash_mode_flags = VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_DEFAULT;
if (vnic->flags & BNXT_VNIC_RSS_FLAG) {
if (BNXT_CHIP_TYPE_NITRO_A0(bp))
max_rings = bp->rx_nr_rings - 1;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index b44a758..7ea022d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -326,6 +326,10 @@ struct rx_tpa_start_cmp_ext {
((le32_to_cpu((rx_tpa_start)->rx_tpa_start_cmp_cfa_code_v2) &   \
 RX_TPA_START_CMP_CFA_CODE) >> RX_TPA_START_CMPL_CFA_CODE_SHIFT)
 
+#define TPA_START_IS_IPV6(rx_tpa_start)\
+   (!!((rx_tpa_start)->rx_tpa_start_cmp_flags2 &   \
+   cpu_to_le32(RX_TPA_START_CMP_FLAGS2_IP_TYPE)))
+
 struct rx_tpa_end_cmp {
__le32 rx_tpa_end_cmp_len_flags_type;
#define RX_TPA_END_CMP_TYPE (0x3f << 0)
-- 
2.5.1

[PATCH net-next 07/13] bnxt_en: Add support for ethtool get dump.

2018-08-05 Thread Michael Chan

From: Vasundhara Volam 

Add support to collect live firmware coredump via ethtool.

Signed-off-by: Vasundhara Volam 
Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h |  66 
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  | 333 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h  |  37 +++
 3 files changed, 436 insertions(+)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h
new file mode 100644
index 000..09c22f8
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h
@@ -0,0 +1,66 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2018 Broadcom Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef BNXT_COREDUMP_H
+#define BNXT_COREDUMP_H
+
+struct bnxt_coredump_segment_hdr {
+   __u8 signature[4];
+   __le32 component_id;
+   __le32 segment_id;
+   __le32 flags;
+   __u8 low_version;
+   __u8 high_version;
+   __le16 function_id;
+   __le32 offset;
+   __le32 length;
+   __le32 status;
+   __le32 duration;
+   __le32 data_offset;
+   __le32 instance;
+   __le32 rsvd[5];
+};
+
+struct bnxt_coredump_record {
+   __u8 signature[4];
+   __le32 flags;
+   __u8 low_version;
+   __u8 high_version;
+   __u8 asic_state;
+   __u8 rsvd0[5];
+   char system_name[32];
+   __le16 year;
+   __le16 month;
+   __le16 day;
+   __le16 hour;
+   __le16 minute;
+   __le16 second;
+   __le16 utc_bias;
+   __le16 rsvd1;
+   char commandline[256];
+   __le32 total_segments;
+   __le32 os_ver_major;
+   __le32 os_ver_minor;
+   __le32 rsvd2;
+   char os_name[32];
+   __le16 end_year;
+   __le16 end_month;
+   __le16 end_day;
+   __le16 end_hour;
+   __le16 end_minute;
+   __le16 end_second;
+   __le16 end_utc_bias;
+   __le32 asic_id1;
+   __le32 asic_id2;
+   __le32 coredump_status;
+   __u8 ioctl_low_version;
+   __u8 ioctl_high_version;
+   __le16 rsvd3[313];
+};
+#endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 9517633..3fc7c74 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -16,12 +16,15 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_xdp.h"
 #include "bnxt_ethtool.h"
 #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */
 #include "bnxt_fw_hdr.h"   /* Firmware hdr constant and structure defs */
+#include "bnxt_coredump.h"
 #define FLASH_NVRAM_TIMEOUT((HWRM_CMD_TIMEOUT) * 100)
 #define FLASH_PACKAGE_TIMEOUT  ((HWRM_CMD_TIMEOUT) * 200)
 #define INSTALL_PACKAGE_TIMEOUT((HWRM_CMD_TIMEOUT) * 200)
@@ -2685,6 +2688,334 @@ static int bnxt_reset(struct net_device *dev, u32 
*flags)
return rc;
 }
 
+static int bnxt_hwrm_dbg_dma_data(struct bnxt *bp, void *msg, int msg_len,
+ struct bnxt_hwrm_dbg_dma_info *info)
+{
+   struct hwrm_dbg_cmn_output *cmn_resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_dbg_cmn_input *cmn_req = msg;
+   __le16 *seq_ptr = msg + info->seq_off;
+   u16 seq = 0, len, segs_off;
+   void *resp = cmn_resp;
+   dma_addr_t dma_handle;
+   int rc, off = 0;
+   void *dma_buf;
+
+   dma_buf = dma_alloc_coherent(&bp->pdev->dev, info->dma_len, &dma_handle,
+GFP_KERNEL);
+   if (!dma_buf)
+   return -ENOMEM;
+
+   segs_off = offsetof(struct hwrm_dbg_coredump_list_output,
+   total_segments);
+   cmn_req->host_dest_addr = cpu_to_le64(dma_handle);
+   cmn_req->host_buf_len = cpu_to_le32(info->dma_len);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   while (1) {
+   *seq_ptr = cpu_to_le16(seq);
+   rc = _hwrm_send_message(bp, msg, msg_len, HWRM_CMD_TIMEOUT);
+   if (rc)
+   break;
+
+   len = le16_to_cpu(*((__le16 *)(resp + info->data_len_off)));
+   if (!seq &&
+   cmn_req->req_type == cpu_to_le16(HWRM_DBG_COREDUMP_LIST)) {
+   info->segs = le16_to_cpu(*((__le16 *)(resp +
+ segs_off)));
+   if (!info->segs) {
+   rc = -EIO;
+   break;
+   }
+
+   info->dest_buf_size = info->segs *
+   sizeof(struct coredump_segme

[PATCH net-next 03/13] bnxt_en: Add external loopback test to ethtool selftest.

2018-08-05 Thread Michael Chan

Add code to detect firmware support for external loopback and the extra
test entry for external loopback.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |  4 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 32 ++-
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c612d74..d9fc905 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6337,6 +6337,10 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp)
bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) &
 PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK;
}
+   if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED) {
+   if (bp->test_info)
+   bp->test_info->flags |= BNXT_TEST_FL_EXT_LPBK;
+   }
if (resp->supported_speeds_auto_mode)
link_info->support_auto_speeds =
le16_to_cpu(resp->supported_speeds_auto_mode);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3b5a55c..0d49fe0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -990,6 +990,8 @@ struct bnxt_led_info {
 
 struct bnxt_test_info {
u8 offline_mask;
+   u8 flags;
+#define BNXT_TEST_FL_EXT_LPBK  0x1
u16 timeout;
char string[BNXT_MAX_TEST][ETH_GSTRING_LEN];
 };
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1f626af..9517633 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2397,7 +2397,7 @@ static int bnxt_disable_an_for_lpbk(struct bnxt *bp,
return rc;
 }
 
-static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable)
+static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable, bool ext)
 {
struct hwrm_port_phy_cfg_input req = {0};
 
@@ -2405,7 +2405,10 @@ static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool 
enable)
 
if (enable) {
bnxt_disable_an_for_lpbk(bp, &req);
-   req.lpbk = PORT_PHY_CFG_REQ_LPBK_LOCAL;
+   if (ext)
+   req.lpbk = PORT_PHY_CFG_REQ_LPBK_EXTERNAL;
+   else
+   req.lpbk = PORT_PHY_CFG_REQ_LPBK_LOCAL;
} else {
req.lpbk = PORT_PHY_CFG_REQ_LPBK_NONE;
}
@@ -2538,15 +2541,17 @@ static int bnxt_run_fw_tests(struct bnxt *bp, u8 
test_mask, u8 *test_results)
return rc;
 }
 
-#define BNXT_DRV_TESTS 3
+#define BNXT_DRV_TESTS 4
 #define BNXT_MACLPBK_TEST_IDX  (bp->num_tests - BNXT_DRV_TESTS)
 #define BNXT_PHYLPBK_TEST_IDX  (BNXT_MACLPBK_TEST_IDX + 1)
-#define BNXT_IRQ_TEST_IDX  (BNXT_MACLPBK_TEST_IDX + 2)
+#define BNXT_EXTLPBK_TEST_IDX  (BNXT_MACLPBK_TEST_IDX + 2)
+#define BNXT_IRQ_TEST_IDX  (BNXT_MACLPBK_TEST_IDX + 3)
 
 static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest,
   u64 *buf)
 {
struct bnxt *bp = netdev_priv(dev);
+   bool do_ext_lpbk = false;
bool offline = false;
u8 test_results = 0;
u8 test_mask = 0;
@@ -2560,6 +2565,10 @@ static void bnxt_self_test(struct net_device *dev, 
struct ethtool_test *etest,
return;
}
 
+   if ((etest->flags & ETH_TEST_FL_EXTERNAL_LB) &&
+   (bp->test_info->flags & BNXT_TEST_FL_EXT_LPBK))
+   do_ext_lpbk = true;
+
if (etest->flags & ETH_TEST_FL_OFFLINE) {
if (bp->pf.active_vfs) {
etest->flags |= ETH_TEST_FL_FAILED;
@@ -2600,13 +2609,22 @@ static void bnxt_self_test(struct net_device *dev, 
struct ethtool_test *etest,
buf[BNXT_MACLPBK_TEST_IDX] = 0;
 
bnxt_hwrm_mac_loopback(bp, false);
-   bnxt_hwrm_phy_loopback(bp, true);
+   bnxt_hwrm_phy_loopback(bp, true, false);
msleep(1000);
if (bnxt_run_loopback(bp)) {
buf[BNXT_PHYLPBK_TEST_IDX] = 1;
etest->flags |= ETH_TEST_FL_FAILED;
}
-   bnxt_hwrm_phy_loopback(bp, false);
+   if (do_ext_lpbk) {
+   etest->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE;
+   bnxt_hwrm_phy_loopback(bp, true, true);
+   msleep(1000);
+   if (bnxt_run_loopback(bp)) {
+   buf[BNXT_EXTLPBK_TEST_IDX] = 1;
+   etest->flags |= ETH_TEST_FL_FAILED;
+   }
+   }

[PATCH net-next 01/13] bnxt_en: Update firmware interface version to 1.9.2.25.

2018-08-05 Thread Michael Chan

New interface has firmware core dump support, new extended port
statistics, and IF state change notifications to the firmware.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |4 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c |8 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |6 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 1227 +++--
 4 files changed, 924 insertions(+), 321 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 934aa11..3b5a55c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -12,11 +12,11 @@
 #define BNXT_H
 
 #define DRV_MODULE_NAME"bnxt_en"
-#define DRV_MODULE_VERSION "1.9.1"
+#define DRV_MODULE_VERSION "1.9.2"
 
 #define DRV_VER_MAJ1
 #define DRV_VER_MIN9
-#define DRV_VER_UPD1
+#define DRV_VER_UPD2
 
 #include 
 #include 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 7bd96ab..f3b9fbc 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -29,7 +29,7 @@ static const struct bnxt_dl_nvm_param nvm_params[] = {
 static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg,
 int msg_len, union devlink_param_value *val)
 {
-   struct hwrm_nvm_variable_input *req = msg;
+   struct hwrm_nvm_get_variable_input *req = msg;
void *data_addr = NULL, *buf = NULL;
struct bnxt_dl_nvm_param nvm_param;
int bytesize, idx = 0, rc, i;
@@ -60,18 +60,18 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, 
void *msg,
if (!data_addr)
return -ENOMEM;
 
-   req->data_addr = cpu_to_le64(data_dma_addr);
+   req->dest_data_addr = cpu_to_le64(data_dma_addr);
req->data_len = cpu_to_le16(nvm_param.num_bits);
req->option_num = cpu_to_le16(nvm_param.offset);
req->index_0 = cpu_to_le16(idx);
if (idx)
req->dimensions = cpu_to_le16(1);
 
-   if (req->req_type == HWRM_NVM_SET_VARIABLE)
+   if (req->req_type == cpu_to_le16(HWRM_NVM_SET_VARIABLE))
memcpy(data_addr, buf, bytesize);
 
rc = hwrm_send_message(bp, msg, msg_len, HWRM_CMD_TIMEOUT);
-   if (!rc && req->req_type == HWRM_NVM_GET_VARIABLE)
+   if (!rc && req->req_type == cpu_to_le16(HWRM_NVM_GET_VARIABLE))
memcpy(buf, data_addr, bytesize);
 
dma_free_coherent(&bp->pdev->dev, bytesize, data_addr, data_dma_addr);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 7270c8b..3d40e49 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -162,7 +162,7 @@ static const struct {
BNXT_RX_STATS_ENTRY(rx_128b_255b_frames),
BNXT_RX_STATS_ENTRY(rx_256b_511b_frames),
BNXT_RX_STATS_ENTRY(rx_512b_1023b_frames),
-   BNXT_RX_STATS_ENTRY(rx_1024b_1518_frames),
+   BNXT_RX_STATS_ENTRY(rx_1024b_1518b_frames),
BNXT_RX_STATS_ENTRY(rx_good_vlan_frames),
BNXT_RX_STATS_ENTRY(rx_1519b_2047b_frames),
BNXT_RX_STATS_ENTRY(rx_2048b_4095b_frames),
@@ -205,9 +205,9 @@ static const struct {
BNXT_TX_STATS_ENTRY(tx_128b_255b_frames),
BNXT_TX_STATS_ENTRY(tx_256b_511b_frames),
BNXT_TX_STATS_ENTRY(tx_512b_1023b_frames),
-   BNXT_TX_STATS_ENTRY(tx_1024b_1518_frames),
+   BNXT_TX_STATS_ENTRY(tx_1024b_1518b_frames),
BNXT_TX_STATS_ENTRY(tx_good_vlan_frames),
-   BNXT_TX_STATS_ENTRY(tx_1519b_2047_frames),
+   BNXT_TX_STATS_ENTRY(tx_1519b_2047b_frames),
BNXT_TX_STATS_ENTRY(tx_2048b_4095b_frames),
BNXT_TX_STATS_ENTRY(tx_4096b_9216b_frames),
BNXT_TX_STATS_ENTRY(tx_9217b_16383b_frames),
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
index c75d7fa..971ace5d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
@@ -96,6 +96,7 @@ struct hwrm_short_input {
 struct cmd_nums {
__le16  req_type;
#define HWRM_VER_GET  0x0UL
+   #define HWRM_FUNC_DRV_IF_CHANGE   0xdUL
#define HWRM_FUNC_BUF_UNRGTR  0xeUL
#define HWRM_FUNC_VF_CFG  0xfUL
#define HWRM_RESERVED10x10UL
@@ -159,6 +160,7 @@ struct cmd_nums {
#define HWRM_RING_FREE0x51UL
#define HWRM_RING_CMPL_RING_QAGGINT_PARAMS0x52UL
#define HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS 0x53UL
+   #define HWRM_RING_AGGINT_QCAPS0x54UL
#define HWRM_RING_RESET

[PATCH net-next 12/13] bnxt_en: Add DCBNL DSCP application protocol support.

2018-08-05 Thread Michael Chan

Expand the .ieee_setapp() and ieee_delapp() DCBNL methods to support
DSCP.  This allows DSCP values to user priority mappings instead
of using VLAN priorities.  Each DSCP mapping is added or deleted one
entry at a time using the firmware API.  The firmware call can only be
made from a PF.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 83 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h |  6 ++
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 006726c..fefa011 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1281,6 +1281,7 @@ struct bnxt {
struct ieee_ets *ieee_ets;
u8  dcbx_cap;
u8  default_pri;
+   u8  max_dscp_value;
 #endif /* CONFIG_BNXT_DCB */
 
u32 msg_enable;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index 00dd26d..ddc98c3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -385,6 +385,61 @@ static int bnxt_hwrm_set_dcbx_app(struct bnxt *bp, struct 
dcb_app *app,
return rc;
 }
 
+static int bnxt_hwrm_queue_dscp_qcaps(struct bnxt *bp)
+{
+   struct hwrm_queue_dscp_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
+   struct hwrm_queue_dscp_qcaps_input req = {0};
+   int rc;
+
+   if (bp->hwrm_spec_code < 0x10800 || BNXT_VF(bp))
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_DSCP_QCAPS, -1, -1);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (!rc) {
+   bp->max_dscp_value = (1 << resp->num_dscp_bits) - 1;
+   if (bp->max_dscp_value < 0x3f)
+   bp->max_dscp_value = 0;
+   }
+
+   mutex_unlock(&bp->hwrm_cmd_lock);
+   return rc;
+}
+
+static int bnxt_hwrm_queue_dscp2pri_cfg(struct bnxt *bp, struct dcb_app *app,
+   bool add)
+{
+   struct hwrm_queue_dscp2pri_cfg_input req = {0};
+   struct bnxt_dscp2pri_entry *dscp2pri;
+   dma_addr_t mapping;
+   int rc;
+
+   if (bp->hwrm_spec_code < 0x10800)
+   return 0;
+
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_DSCP2PRI_CFG, -1, -1);
+   dscp2pri = dma_alloc_coherent(&bp->pdev->dev, sizeof(*dscp2pri),
+ &mapping, GFP_KERNEL);
+   if (!dscp2pri)
+   return -ENOMEM;
+
+   req.src_data_addr = cpu_to_le64(mapping);
+   dscp2pri->dscp = app->protocol;
+   if (add)
+   dscp2pri->mask = 0x3f;
+   else
+   dscp2pri->mask = 0;
+   dscp2pri->pri = app->priority;
+   req.entry_cnt = cpu_to_le16(1);
+   rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+   if (rc)
+   rc = -EIO;
+   dma_free_coherent(&bp->pdev->dev, sizeof(*dscp2pri), dscp2pri,
+ mapping);
+   return rc;
+}
+
 static int bnxt_ets_validate(struct bnxt *bp, struct ieee_ets *ets, u8 *tc)
 {
int total_ets_bw = 0;
@@ -551,15 +606,30 @@ static int bnxt_dcbnl_ieee_setpfc(struct net_device *dev, 
struct ieee_pfc *pfc)
return rc;
 }
 
+static int bnxt_dcbnl_ieee_dscp_app_prep(struct bnxt *bp, struct dcb_app *app)
+{
+   if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP) {
+   if (!bp->max_dscp_value)
+   return -ENOTSUPP;
+   if (app->protocol > bp->max_dscp_value)
+   return -EINVAL;
+   }
+   return 0;
+}
+
 static int bnxt_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app)
 {
struct bnxt *bp = netdev_priv(dev);
-   int rc = -EINVAL;
+   int rc;
 
if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) ||
!(bp->dcbx_cap & DCB_CAP_DCBX_HOST))
return -EINVAL;
 
+   rc = bnxt_dcbnl_ieee_dscp_app_prep(bp, app);
+   if (rc)
+   return rc;
+
rc = dcb_ieee_setapp(dev, app);
if (rc)
return rc;
@@ -570,6 +640,9 @@ static int bnxt_dcbnl_ieee_setapp(struct net_device *dev, 
struct dcb_app *app)
 app->protocol == ROCE_V2_UDP_DPORT))
rc = bnxt_hwrm_set_dcbx_app(bp, app, true);
 
+   if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP)
+   rc = bnxt_hwrm_queue_dscp2pri_cfg(bp, app, true);
+
return rc;
 }
 
@@ -582,6 +655,10 @@ static int bnxt_dcbnl_ieee_delapp(struct net_device *dev, 
struct dcb_app *app)
!(bp->dcbx_cap & DCB_CAP_DCBX_HOST))
return -EINVAL;
 
+   rc = bnxt_dcbnl_ieee_dscp_app_prep(bp, app);
+

[PATCH net-next 11/13] bnxt_en: Add hwmon sysfs support to read temperature

2018-08-05 Thread Michael Chan

From: Vasundhara Volam 

Export temperature sensor reading via hwmon sysfs.

Signed-off-by: Vasundhara Volam 
Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/Kconfig |  8 
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 62 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 3 files changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/Kconfig 
b/drivers/net/ethernet/broadcom/Kconfig
index b7aa8ad..c1d3ee9b 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -230,4 +230,12 @@ config BNXT_DCB
 
  If unsure, say N.
 
+config BNXT_HWMON
+   bool "Broadcom NetXtreme-C/E HWMON support"
+   default y
+   depends on BNXT && HWMON && !(BNXT=y && HWMON=m)
+   ---help---
+ Say Y if you want to expose the thermal sensor data on NetXtreme-C/E
+ devices, via the hwmon sysfs interface.
+
 endif # NET_VENDOR_BROADCOM
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 56bd097..dde904b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -51,6 +51,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
@@ -6789,6 +6791,62 @@ static void bnxt_get_wol_settings(struct bnxt *bp)
} while (handle && handle != 0x);
 }
 
+#ifdef CONFIG_BNXT_HWMON
+static ssize_t bnxt_show_temp(struct device *dev,
+ struct device_attribute *devattr, char *buf)
+{
+   struct hwrm_temp_monitor_query_input req = {0};
+   struct hwrm_temp_monitor_query_output *resp;
+   struct bnxt *bp = dev_get_drvdata(dev);
+   u32 temp = 0;
+
+   resp = bp->hwrm_cmd_resp_addr;
+   bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_TEMP_MONITOR_QUERY, -1, -1);
+   mutex_lock(&bp->hwrm_cmd_lock);
+   if (!_hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT))
+   temp = resp->temp * 1000; /* display millidegree */
+   mutex_unlock(&bp->hwrm_cmd_lock);
+
+   return sprintf(buf, "%u\n", temp);
+}
+static SENSOR_DEVICE_ATTR(temp1_input, 0444, bnxt_show_temp, NULL, 0);
+
+static struct attribute *bnxt_attrs[] = {
+   &sensor_dev_attr_temp1_input.dev_attr.attr,
+   NULL
+};
+ATTRIBUTE_GROUPS(bnxt);
+
+static void bnxt_hwmon_close(struct bnxt *bp)
+{
+   if (bp->hwmon_dev) {
+   hwmon_device_unregister(bp->hwmon_dev);
+   bp->hwmon_dev = NULL;
+   }
+}
+
+static void bnxt_hwmon_open(struct bnxt *bp)
+{
+   struct pci_dev *pdev = bp->pdev;
+
+   bp->hwmon_dev = hwmon_device_register_with_groups(&pdev->dev,
+ DRV_MODULE_NAME, bp,
+ bnxt_groups);
+   if (IS_ERR(bp->hwmon_dev)) {
+   bp->hwmon_dev = NULL;
+   dev_warn(&pdev->dev, "Cannot register hwmon device\n");
+   }
+}
+#else
+static void bnxt_hwmon_close(struct bnxt *bp)
+{
+}
+
+static void bnxt_hwmon_open(struct bnxt *bp)
+{
+}
+#endif
+
 static bool bnxt_eee_config_ok(struct bnxt *bp)
 {
struct ethtool_eee *eee = &bp->eee;
@@ -7040,6 +7098,9 @@ static int bnxt_open(struct net_device *dev)
rc = __bnxt_open_nic(bp, true, true);
if (rc)
bnxt_hwrm_if_change(bp, false);
+
+   bnxt_hwmon_open(bp);
+
return rc;
 }
 
@@ -7102,6 +7163,7 @@ static int bnxt_close(struct net_device *dev)
 {
struct bnxt *bp = netdev_priv(dev);
 
+   bnxt_hwmon_close(bp);
bnxt_close_nic(bp, true, true);
bnxt_hwrm_shutdown_link(bp);
bnxt_hwrm_if_change(bp, false);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 6c40b257..006726c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1411,6 +1411,7 @@ struct bnxt {
struct bnxt_tc_info *tc_info;
struct dentry   *debugfs_pdev;
struct dentry   *debugfs_dim;
+   struct device   *hwmon_dev;
 };
 
 #define BNXT_RX_STATS_OFFSET(counter)  \
-- 
2.5.1

[PATCH net-next 00/13] bnxt_en: Updates for net-next.

2018-08-05 Thread Michael Chan

This series includes the usual firmware spec update.  The driver has
added external phy loopback test and phy setup retry logic that is
needed during hotplug.  In the SRIOV space, the driver has added a
new VF resource allocation mode that requires the VF driver to
reserve resources during IFUP.  IF state changes are now propagated
to firmware so that firmware can release some resources during IFDOWN.

ethtool method to get firmware core dump and hwmon temperature reading
have been added.  DSCP to user priority support has been added to
the driver's DCBNL interface, and the CoS queue logic has been refined
to make sure that the special RDMA Congestion Notification hardware CoS
queue will not be used for networking traffic.

Michael Chan (11):
  bnxt_en: Update firmware interface version to 1.9.2.25.
  bnxt_en: Adjust timer based on ethtool stats-block-usecs settings.
  bnxt_en: Add external loopback test to ethtool selftest.
  bnxt_en: Add PHY retry logic.
  bnxt_en: Add new VF resource allocation strategy mode.
  bnxt_en: Update RSS setup and GRO-HW logic according to the latest
spec.
  bnxt_en: Add BNXT_NEW_RM() macro.
  bnxt_en: Move firmware related flags to a new fw_cap field in struct
bnxt.
  bnxt_en: Notify firmware about IF state changes.
  bnxt_en: Add DCBNL DSCP application protocol support.
  bnxt_en: Do not use the CNP CoS queue for networking traffic.

Vasundhara Volam (2):
  bnxt_en: Add support for ethtool get dump.
  bnxt_en: Add hwmon sysfs support to read temperature

 drivers/net/ethernet/broadcom/Kconfig  |8 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c  |  216 +++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h  |   30 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h |   66 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c  |   89 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h  |   10 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c  |8 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |  378 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h  |   37 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h  | 1227 +++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c|   25 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c  |4 +-
 12 files changed, 1716 insertions(+), 382 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.h

-- 
2.5.1

[PATCH net-next 08/13] bnxt_en: Add BNXT_NEW_RM() macro.

2018-08-05 Thread Michael Chan

The BNXT_FLAG_NEW_RM flag is checked a lot in the code to determine if
the new resource manager is in effect.  Define a macro to perform
this check.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c   |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c |  4 ++--
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1714850..5c9ee3c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4579,7 +4579,7 @@ static int bnxt_hwrm_get_rings(struct bnxt *bp)
}
 
hw_resc->resv_tx_rings = le16_to_cpu(resp->alloc_tx_rings);
-   if (bp->flags & BNXT_FLAG_NEW_RM) {
+   if (BNXT_NEW_RM(bp)) {
u16 cp, stats;
 
hw_resc->resv_rx_rings = le16_to_cpu(resp->alloc_rx_rings);
@@ -4625,7 +4625,7 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct 
hwrm_func_cfg_input *req,
req->fid = cpu_to_le16(0x);
enables |= tx_rings ? FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS : 0;
req->num_tx_rings = cpu_to_le16(tx_rings);
-   if (bp->flags & BNXT_FLAG_NEW_RM) {
+   if (BNXT_NEW_RM(bp)) {
enables |= rx_rings ? FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS : 0;
enables |= cp_rings ? FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS |
  FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0;
@@ -4698,7 +4698,7 @@ bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, int tx_rings, 
int rx_rings,
struct hwrm_func_vf_cfg_input req = {0};
int rc;
 
-   if (!(bp->flags & BNXT_FLAG_NEW_RM)) {
+   if (!BNXT_NEW_RM(bp)) {
bp->hw_resc.resv_tx_rings = tx_rings;
return 0;
}
@@ -4758,7 +4758,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp)
vnic = rx + 1;
if (bp->flags & BNXT_FLAG_AGG_RINGS)
rx <<= 1;
-   if ((bp->flags & BNXT_FLAG_NEW_RM) &&
+   if (BNXT_NEW_RM(bp) &&
(hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp ||
 hw_resc->resv_hw_ring_grps != grp || hw_resc->resv_vnics != vnic))
return true;
@@ -4794,7 +4794,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp)
return rc;
 
tx = hw_resc->resv_tx_rings;
-   if (bp->flags & BNXT_FLAG_NEW_RM) {
+   if (BNXT_NEW_RM(bp)) {
rx = hw_resc->resv_rx_rings;
cp = hw_resc->resv_cp_rings;
grp = hw_resc->resv_hw_ring_grps;
@@ -4838,7 +4838,7 @@ static int bnxt_hwrm_check_vf_rings(struct bnxt *bp, int 
tx_rings, int rx_rings,
u32 flags;
int rc;
 
-   if (!(bp->flags & BNXT_FLAG_NEW_RM))
+   if (!BNXT_NEW_RM(bp))
return 0;
 
__bnxt_hwrm_reserve_vf_rings(bp, &req, tx_rings, rx_rings, ring_grps,
@@ -4867,7 +4867,7 @@ static int bnxt_hwrm_check_pf_rings(struct bnxt *bp, int 
tx_rings, int rx_rings,
__bnxt_hwrm_reserve_pf_rings(bp, &req, tx_rings, rx_rings, ring_grps,
 cp_rings, vnics);
flags = FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST;
-   if (bp->flags & BNXT_FLAG_NEW_RM)
+   if (BNXT_NEW_RM(bp))
flags |= FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST |
 FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST |
 FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST |
@@ -5921,7 +5921,7 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num)
 
max_idx = min_t(int, bp->total_irqs, max_cp);
avail_msix = max_idx - bp->cp_nr_rings;
-   if (!(bp->flags & BNXT_FLAG_NEW_RM) || avail_msix >= num)
+   if (!BNXT_NEW_RM(bp) || avail_msix >= num)
return avail_msix;
 
if (max_irq < total_req) {
@@ -5934,7 +5934,7 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num)
 
 static int bnxt_get_num_msix(struct bnxt *bp)
 {
-   if (!(bp->flags & BNXT_FLAG_NEW_RM))
+   if (!BNXT_NEW_RM(bp))
return bnxt_get_max_func_irqs(bp);
 
return bnxt_cp_rings_in_use(bp);
@@ -6057,8 +6057,7 @@ int bnxt_reserve_rings(struct bnxt *bp)
netdev_err(bp->dev, "ring reservation failure rc: %d\n", rc);
return rc;
}
-   if ((bp->flags & BNXT_FLAG_NEW_RM) &&
-   (bnxt_get_num_msix(bp) != bp->total_irqs)) {
+   if (BNXT_NEW_RM(bp) && (bnxt_get_num_msix(bp) != bp->total_irqs)) {
bnxt_ulp_irq_stop(bp);
bnxt_clear_int_mode(bp);
rc = bnxt_init_int_mode(bp);
@@ -7306,7 +7305,7 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp)
 static bool bnxt_can_reserve_rings(struct bnxt *bp)
 {
 #ifdef CONFIG_BNXT_SRIOV
-   if ((bp->flags & BNXT_FLAG_

[PATCH net-next 09/13] bnxt_en: Move firmware related flags to a new fw_cap field in struct bnxt.

2018-08-05 Thread Michael Chan

The flags field is almost getting full.  Move firmware capability flags
to a new fw_cap field to better organize these firmware flags.

Signed-off-by: Michael Chan 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 13 +++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c |  6 +++---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5c9ee3c..1659940 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3445,7 +3445,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void 
*msg, u32 msg_len,
cp_ring_id = le16_to_cpu(req->cmpl_ring);
intr_process = (cp_ring_id == INVALID_HW_RING_ID) ? 0 : 1;
 
-   if (bp->flags & BNXT_FLAG_SHORT_CMD) {
+   if (bp->fw_cap & BNXT_FW_CAP_SHORT_CMD) {
void *short_cmd_req = bp->hwrm_short_cmd_req_addr;
 
memcpy(short_cmd_req, req, msg_len);
@@ -5089,9 +5089,9 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp)
flags = le16_to_cpu(resp->flags);
if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED |
 FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) {
-   bp->flags |= BNXT_FLAG_FW_LLDP_AGENT;
+   bp->fw_cap |= BNXT_FW_CAP_LLDP_AGENT;
if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED)
-   bp->flags |= BNXT_FLAG_FW_DCBX_AGENT;
+   bp->fw_cap |= BNXT_FW_CAP_DCBX_AGENT;
}
if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST))
bp->flags |= BNXT_FLAG_MULTI_HOST;
@@ -5249,7 +5249,7 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
if (bp->hwrm_spec_code >= 0x10803) {
rc = bnxt_hwrm_func_resc_qcaps(bp, true);
if (!rc)
-   bp->flags |= BNXT_FLAG_NEW_RM;
+   bp->fw_cap |= BNXT_FW_CAP_NEW_RM;
}
return 0;
 }
@@ -5352,7 +5352,7 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp)
dev_caps_cfg = le32_to_cpu(resp->dev_caps_cfg);
if ((dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED) &&
(dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED))
-   bp->flags |= BNXT_FLAG_SHORT_CMD;
+   bp->fw_cap |= BNXT_FW_CAP_SHORT_CMD;
 
 hwrm_ver_get_exit:
mutex_unlock(&bp->hwrm_cmd_lock);
@@ -8760,7 +8760,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
if (rc)
goto init_err_pci_clean;
 
-   if (bp->flags & BNXT_FLAG_SHORT_CMD) {
+   if (bp->fw_cap & BNXT_FW_CAP_SHORT_CMD) {
rc = bnxt_alloc_hwrm_short_cmd_req(bp);
if (rc)
goto init_err_pci_clean;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 37dc896..ded2aff 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1144,7 +1144,6 @@ struct bnxt {
atomic_tintr_sem;
 
u32 flags;
-   #define BNXT_FLAG_DCB_ENABLED   0x1
#define BNXT_FLAG_VF0x2
#define BNXT_FLAG_LRO   0x4
 #ifdef CONFIG_INET
@@ -1173,15 +1172,11 @@ struct bnxt {
 BNXT_FLAG_ROCEV2_CAP)
#define BNXT_FLAG_NO_AGG_RINGS  0x2
#define BNXT_FLAG_RX_PAGE_MODE  0x4
-   #define BNXT_FLAG_FW_LLDP_AGENT 0x8
#define BNXT_FLAG_MULTI_HOST0x10
-   #define BNXT_FLAG_SHORT_CMD 0x20
#define BNXT_FLAG_DOUBLE_DB 0x40
-   #define BNXT_FLAG_FW_DCBX_AGENT 0x80
#define BNXT_FLAG_CHIP_NITRO_A0 0x100
#define BNXT_FLAG_DIM   0x200
#define BNXT_FLAG_ROCE_MIRROR_CAP   0x400
-   #define BNXT_FLAG_NEW_RM0x800
#define BNXT_FLAG_PORT_STATS_EXT0x1000
 
#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \
@@ -1195,7 +1190,6 @@ struct bnxt {
 #define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp))
 #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
 #define BNXT_RX_PAGE_MODE(bp)  ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE)
-#define BNXT_NEW_RM(bp)((bp)->flags & BNXT_FLAG_NEW_RM)
 
 /* Chip class phase 4 and later */
 #define BNXT_CHIP_P4_PLUS(bp)  \
@@ -1291,6 +1285,13 @@ struct bnxt {
 
u32 msg_enable;
 
+   u32 fw_cap;
+   #define BNXT_FW_CAP_SHORT_CMD   0x0001
+   #define BNXT_FW_CAP_LLDP_AGENT  0x0002
+   #define BNXT_FW_CAP_DCBX_AGENT  0x0004
+   #define BNXT_FW_CAP_NEW_RM  0x0008
+
+#define BNXT_NEW_RM(bp)

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

Florian,

I have removed those port 80 CHECKSUM rules and everything looks good
i didn't see kernel WARN mesg.

Thank you so much! You just nailed it :)

On Sun, Aug 5, 2018 at 4:15 PM, Satish Patel  wrote:
> Florian,
>
> It seems those rules coming from here
> https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum
>
> On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel  wrote:
>> Yes this is openstack-ansible deployment tool which set them up. I am
>> wondering where are these rules saved? I believe openstack-ansible use
>> LXC container to deploy services so must be part of LXC startup
>> scripts.
>>
>> I have checked there is no firewalld and iptables service running on system..
>>
>> You think i should get rid of all CHEKSUM option in iptables rules? Am i 
>> right?
>>
>>
>> On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal  wrote:
>>> Satish Patel  wrote:
 > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
 > --checksum-fill
 > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
 > --checksum-fill
>>>
>>> These rules make no sense to me, and are also source of your backtrace.
>>> Who set this up?
>>>
>>> If this is coming from openstack, I suggest asking openstack developers
>>> WTH this is supposed to do.
>>>
 > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
 > --dport 68 -j CHECKSUM --checksum-fill
>>>
>>> This was needed to work around dhcpd issues w. checksum offloading but I
>>> guess that DCHCP will work fine without this rule too nowadays.
>>>
>>> So I suggest you simply get rid of these rules.

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

Florian,

It seems those rules coming from here
https://github.com/openstack/openstack-ansible-os_neutron/blob/master/files/post-up-metadata-checksum

On Sun, Aug 5, 2018 at 4:09 PM, Satish Patel  wrote:
> Yes this is openstack-ansible deployment tool which set them up. I am
> wondering where are these rules saved? I believe openstack-ansible use
> LXC container to deploy services so must be part of LXC startup
> scripts.
>
> I have checked there is no firewalld and iptables service running on system..
>
> You think i should get rid of all CHEKSUM option in iptables rules? Am i 
> right?
>
>
> On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal  wrote:
>> Satish Patel  wrote:
>>> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
>>> > --checksum-fill
>>> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
>>> > --checksum-fill
>>
>> These rules make no sense to me, and are also source of your backtrace.
>> Who set this up?
>>
>> If this is coming from openstack, I suggest asking openstack developers
>> WTH this is supposed to do.
>>
>>> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
>>> > --dport 68 -j CHECKSUM --checksum-fill
>>
>> This was needed to work around dhcpd issues w. checksum offloading but I
>> guess that DCHCP will work fine without this rule too nowadays.
>>
>> So I suggest you simply get rid of these rules.

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

Yes this is openstack-ansible deployment tool which set them up. I am
wondering where are these rules saved? I believe openstack-ansible use
LXC container to deploy services so must be part of LXC startup
scripts.

I have checked there is no firewalld and iptables service running on system..

You think i should get rid of all CHEKSUM option in iptables rules? Am i right?

On Sun, Aug 5, 2018 at 4:02 PM, Florian Westphal  wrote:
> Satish Patel  wrote:
>> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
>> > --checksum-fill
>> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
>> > --checksum-fill
>
> These rules make no sense to me, and are also source of your backtrace.
> Who set this up?
>
> If this is coming from openstack, I suggest asking openstack developers
> WTH this is supposed to do.
>
>> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
>> > --dport 68 -j CHECKSUM --checksum-fill
>
> This was needed to work around dhcpd issues w. checksum offloading but I
> guess that DCHCP will work fine without this rule too nowadays.
>
> So I suggest you simply get rid of these rules.

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Satish Patel  wrote:
> > [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
> > --checksum-fill
> > [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
> > --checksum-fill

These rules make no sense to me, and are also source of your backtrace.
Who set this up?

If this is coming from openstack, I suggest asking openstack developers
WTH this is supposed to do.

> > [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
> > --dport 68 -j CHECKSUM --checksum-fill

This was needed to work around dhcpd issues w. checksum offloading but I
guess that DCHCP will work fine without this rule too nowadays.

So I suggest you simply get rid of these rules.

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

I am configured where are those rules stored why iptables -L -n -t nat
not showing them?

On Sun, Aug 5, 2018 at 3:56 PM, Satish Patel  wrote:
> Oh wait!! I think you are right there i didn't notice that iptables
> has following setting.
>
> "CHECKSUM --checksum-fill"
>
> On Sun, Aug 5, 2018 at 3:54 PM, Satish Patel  wrote:
>> I have following kernel logging set.
>>
>> [root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk
>> 3 4 1 3
>>
>>
>> iptables output
>>
>> [root@ostack-infra-02 tools]# iptables-save -c
>> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
>> *raw
>> :PREROUTING ACCEPT [42284573:38782693391]
>> :OUTPUT ACCEPT [219668:42675751]
>> COMMIT
>> # Completed on Sun Aug  5 15:52:56 2018
>> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
>> *nat
>> :PREROUTING ACCEPT [656413:38450886]
>> :INPUT ACCEPT [5524:564390]
>> :OUTPUT ACCEPT [3217:212903]
>> :POSTROUTING ACCEPT [559058:34193539]
>> [5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE
>> COMMIT
>> # Completed on Sun Aug  5 15:52:56 2018
>> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
>> *mangle
>> :PREROUTING ACCEPT [42209568:38687818558]
>> :INPUT ACCEPT [170634:197796840]
>> :FORWARD ACCEPT [42939243:38526139954]
>> :OUTPUT ACCEPT [198956:41288636]
>> :POSTROUTING ACCEPT [43138199:38567428590]
>> [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
>> --checksum-fill
>> [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
>> --checksum-fill
>> [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
>> --dport 68 -j CHECKSUM --checksum-fill
>> COMMIT
>> # Completed on Sun Aug  5 15:52:56 2018
>> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
>> *filter
>> :INPUT ACCEPT [164954:197217701]
>> :FORWARD ACCEPT [42801701:38377075499]
>> :OUTPUT ACCEPT [198963:41289860]
>> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT
>> [4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT
>> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT
>> [755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT
>> [80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT
>> [56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT
>> COMMIT
>> # Completed on Sun Aug  5 15:52:56 2018
>>
>> On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal  wrote:
>>> Satish Patel  wrote:
 Thanks Florian,

 FYI, I don't have any CHECKSUM configure in my iptables,
>>>
>>> You have, according to WARN stacktrace you provided.
>>>
>>> iptables-save -c
>>> ip6tables-save -c

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

Oh wait!! I think you are right there i didn't notice that iptables
has following setting.

"CHECKSUM --checksum-fill"

On Sun, Aug 5, 2018 at 3:54 PM, Satish Patel  wrote:
> I have following kernel logging set.
>
> [root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk
> 3 4 1 3
>
>
> iptables output
>
> [root@ostack-infra-02 tools]# iptables-save -c
> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
> *raw
> :PREROUTING ACCEPT [42284573:38782693391]
> :OUTPUT ACCEPT [219668:42675751]
> COMMIT
> # Completed on Sun Aug  5 15:52:56 2018
> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
> *nat
> :PREROUTING ACCEPT [656413:38450886]
> :INPUT ACCEPT [5524:564390]
> :OUTPUT ACCEPT [3217:212903]
> :POSTROUTING ACCEPT [559058:34193539]
> [5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE
> COMMIT
> # Completed on Sun Aug  5 15:52:56 2018
> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
> *mangle
> :PREROUTING ACCEPT [42209568:38687818558]
> :INPUT ACCEPT [170634:197796840]
> :FORWARD ACCEPT [42939243:38526139954]
> :OUTPUT ACCEPT [198956:41288636]
> :POSTROUTING ACCEPT [43138199:38567428590]
> [84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
> --checksum-fill
> [68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
> --checksum-fill
> [755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
> --dport 68 -j CHECKSUM --checksum-fill
> COMMIT
> # Completed on Sun Aug  5 15:52:56 2018
> # Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
> *filter
> :INPUT ACCEPT [164954:197217701]
> :FORWARD ACCEPT [42801701:38377075499]
> :OUTPUT ACCEPT [198963:41289860]
> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT
> [4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT
> [0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT
> [755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT
> [80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT
> [56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT
> COMMIT
> # Completed on Sun Aug  5 15:52:56 2018
>
> On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal  wrote:
>> Satish Patel  wrote:
>>> Thanks Florian,
>>>
>>> FYI, I don't have any CHECKSUM configure in my iptables,
>>
>> You have, according to WARN stacktrace you provided.
>>
>> iptables-save -c
>> ip6tables-save -c

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

I have following kernel logging set.

[root@ostack-infra-02 tools]# cat /proc/sys/kernel/printk
3 4 1 3


iptables output

[root@ostack-infra-02 tools]# iptables-save -c
# Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
*raw
:PREROUTING ACCEPT [42284573:38782693391]
:OUTPUT ACCEPT [219668:42675751]
COMMIT
# Completed on Sun Aug  5 15:52:56 2018
# Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
*nat
:PREROUTING ACCEPT [656413:38450886]
:INPUT ACCEPT [5524:564390]
:OUTPUT ACCEPT [3217:212903]
:POSTROUTING ACCEPT [559058:34193539]
[5197:311820] -A POSTROUTING -s 10.0.3.0/24 ! -d 10.0.3.0/24 -j MASQUERADE
COMMIT
# Completed on Sun Aug  5 15:52:56 2018
# Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
*mangle
:PREROUTING ACCEPT [42209568:38687818558]
:INPUT ACCEPT [170634:197796840]
:FORWARD ACCEPT [42939243:38526139954]
:OUTPUT ACCEPT [198956:41288636]
:POSTROUTING ACCEPT [43138199:38567428590]
[84166:59495417] -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM
--checksum-fill
[68739:5153476] -A POSTROUTING -p tcp -m tcp --sport 8000 -j CHECKSUM
--checksum-fill
[755:275452] -A POSTROUTING -s 10.0.3.0/24 -o lxcbr0 -p udp -m udp
--dport 68 -j CHECKSUM --checksum-fill
COMMIT
# Completed on Sun Aug  5 15:52:56 2018
# Generated by iptables-save v1.4.21 on Sun Aug  5 15:52:56 2018
*filter
:INPUT ACCEPT [164954:197217701]
:FORWARD ACCEPT [42801701:38377075499]
:OUTPUT ACCEPT [198963:41289860]
[0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 53 -j ACCEPT
[4932:328918] -A INPUT -i lxcbr0 -p udp -m udp --dport 53 -j ACCEPT
[0:0] -A INPUT -i lxcbr0 -p tcp -m tcp --dport 67 -j ACCEPT
[755:250585] -A INPUT -i lxcbr0 -p udp -m udp --dport 67 -j ACCEPT
[80745:145594351] -A FORWARD -o lxcbr0 -j ACCEPT
[56801:3471569] -A FORWARD -i lxcbr0 -j ACCEPT
COMMIT
# Completed on Sun Aug  5 15:52:56 2018

On Sun, Aug 5, 2018 at 3:42 PM, Florian Westphal  wrote:
> Satish Patel  wrote:
>> Thanks Florian,
>>
>> FYI, I don't have any CHECKSUM configure in my iptables,
>
> You have, according to WARN stacktrace you provided.
>
> iptables-save -c
> ip6tables-save -c

[PATCH net-next] net: report netlink extack only if set

2018-08-05 Thread Willem de Bruijn

From: Willem de Bruijn 

Initialize extack in dev_set_mtu and report only if set.

Fixes: 7a4c53bee332 ("net: report invalid mtu value via netlink extack")
Reported-by: syzbot 
Signed-off-by: Willem de Bruijn 
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 36e994519488e..1d0122a766019 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7580,11 +7580,11 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 
 int dev_set_mtu(struct net_device *dev, int new_mtu)
 {
-   struct netlink_ext_ack extack;
+   struct netlink_ext_ack extack = {};
int err;
 
err = dev_set_mtu_ext(dev, new_mtu, &extack);
-   if (err)
+   if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
 }
-- 
2.18.0.597.ga71716f1ad-goog

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Satish Patel  wrote:
> Thanks Florian,
> 
> FYI, I don't have any CHECKSUM configure in my iptables,

You have, according to WARN stacktrace you provided.

iptables-save -c
ip6tables-save -c

[PATCH net-next] tc-testing: remove duplicate spaces in skbedit match patterns

2018-08-05 Thread Vlad Buslov

Match patterns for some skbedit tests contain duplicate whitespace that is
not present in actual tc output. This causes tests to fail because they
can't match required action, even when it was successfully created.

Signed-off-by: Vlad Buslov 
Acked-by: Jamal Hadi Salim 
---
 .../tc-testing/tc-tests/actions/skbedit.json   | 26 +++---
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json 
b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
index 37ecc2716fee..5aaf593b914a 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
@@ -17,7 +17,7 @@
 "cmdUnderTest": "$TC actions add action skbedit mark 1",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action skbedit",
-"matchPattern": "action order [0-9]*:  skbedit mark 1",
+"matchPattern": "action order [0-9]*: skbedit  mark 1",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -65,7 +65,7 @@
 "cmdUnderTest": "$TC actions add action skbedit prio 99",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action skbedit",
-"matchPattern": "action order [0-9]*:  skbedit priority :99",
+"matchPattern": "action order [0-9]*: skbedit  priority :99",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -113,7 +113,7 @@
 "cmdUnderTest": "$TC actions add action skbedit queue_mapping 909",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action skbedit",
-"matchPattern": "action order [0-9]*:  skbedit queue_mapping 909",
+"matchPattern": "action order [0-9]*: skbedit queue_mapping 909",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -161,7 +161,7 @@
 "cmdUnderTest": "$TC actions add action skbedit ptype host",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action skbedit",
-"matchPattern": "action order [0-9]*:  skbedit ptype host",
+"matchPattern": "action order [0-9]*: skbedit  ptype host",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -185,7 +185,7 @@
 "cmdUnderTest": "$TC actions add action skbedit ptype otherhost",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action skbedit",
-"matchPattern": "action order [0-9]*:  skbedit ptype otherhost",
+"matchPattern": "action order [0-9]*: skbedit  ptype otherhost",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -233,7 +233,7 @@
 "cmdUnderTest": "$TC actions add action skbedit ptype host pipe index 
11",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action skbedit index 11",
-"matchPattern": "action order [0-9]*:  skbedit ptype host pipe.*index 
11 ref",
+"matchPattern": "action order [0-9]*: skbedit  ptype host pipe.*index 
11 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -257,7 +257,7 @@
 "cmdUnderTest": "$TC actions add action skbedit mark 56789 reclassify 
index 90",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action skbedit index 90",
-"matchPattern": "action order [0-9]*:  skbedit mark 56789 
reclassify.*index 90 ref",
+"matchPattern": "action order [0-9]*: skbedit  mark 56789 
reclassify.*index 90 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -281,7 +281,7 @@
 "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 pass 
index 271",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action skbedit index 271",
-"matchPattern": "action order [0-9]*:  skbedit queue_mapping 3 
pass.*index 271 ref",
+"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 
pass.*index 271 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -305,7 +305,7 @@
 "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 drop 
index 271",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action skbedit index 271",
-"matchPattern": "action order [0-9]*:  skbedit queue_mapping 3 
drop.*index 271 ref",
+"matchPattern": "action order [0-9]*: skbedit queue_mapping 3 
drop.*index 271 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action skbedit"
@@ -329,7 +329,7 @@
 "cmdUnderTest": "$TC actions add action skbedit priority 8 jump 9 
index 2",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action skbedit index 2",
-"matchPatter

[PATCH net-next] tc-testing: remove duplicate spaces in connmark match patterns

2018-08-05 Thread Vlad Buslov

Match patterns for some connmark tests contain duplicate whitespace that is
not present in actual tc output. This causes tests to fail because they
can't match required action, even when it was successfully created.

Fixes: 1dad0f97 ("tc-testing: add connmark action tests")
Signed-off-by: Vlad Buslov 
Acked-by: Jamal Hadi Salim 
---
 .../tc-testing/tc-tests/actions/connmark.json  | 24 +++---
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json 
b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json
index 70952bd98ff9..13147a1f5731 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json
@@ -17,7 +17,7 @@
 "cmdUnderTest": "$TC actions add action connmark",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action connmark",
-"matchPattern": "action order [0-9]+:  connmark zone 0 pipe",
+"matchPattern": "action order [0-9]+: connmark zone 0 pipe",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -41,7 +41,7 @@
 "cmdUnderTest": "$TC actions add action connmark pass index 1",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action connmark index 1",
-"matchPattern": "action order [0-9]+:  connmark zone 0 pass.*index 1 
ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 1 
ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -65,7 +65,7 @@
 "cmdUnderTest": "$TC actions add action connmark drop index 100",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action connmark index 100",
-"matchPattern": "action order [0-9]+:  connmark zone 0 drop.*index 100 
ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 drop.*index 100 
ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -89,7 +89,7 @@
 "cmdUnderTest": "$TC actions add action connmark pipe index 455",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action connmark index 455",
-"matchPattern": "action order [0-9]+:  connmark zone 0 pipe.*index 455 
ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 pipe.*index 455 
ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -113,7 +113,7 @@
 "cmdUnderTest": "$TC actions add action connmark reclassify index 7",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action connmark",
-"matchPattern": "action order [0-9]+:  connmark zone 0 
reclassify.*index 7 ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 
reclassify.*index 7 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -137,7 +137,7 @@
 "cmdUnderTest": "$TC actions add action connmark continue index 17",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action connmark",
-"matchPattern": "action order [0-9]+:  connmark zone 0 continue.*index 
17 ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 continue.*index 
17 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -161,7 +161,7 @@
 "cmdUnderTest": "$TC actions add action connmark jump 10 index 17",
 "expExitCode": "0",
 "verifyCmd": "$TC actions list action connmark",
-"matchPattern": "action order [0-9]+:  connmark zone 0 jump 10.*index 
17 ref",
+"matchPattern": "action order [0-9]+: connmark zone 0 jump 10.*index 
17 ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -185,7 +185,7 @@
 "cmdUnderTest": "$TC actions add action connmark zone 100 pipe index 
1",
 "expExitCode": "0",
 "verifyCmd": "$TC actions get action connmark index 1",
-"matchPattern": "action order [0-9]+:  connmark zone 100 pipe.*index 1 
ref",
+"matchPattern": "action order [0-9]+: connmark zone 100 pipe.*index 1 
ref",
 "matchCount": "1",
 "teardown": [
 "$TC actions flush action connmark"
@@ -209,7 +209,7 @@
 "cmdUnderTest": "$TC actions add action connmark zone 65536 reclassify 
index 21",
 "expExitCode": "255",
 "verifyCmd": "$TC actions get action connmark index 1",
-"matchPattern": "action order [0-9]+:  connmark zone 65536 
reclassify.*index 21 ref",
+"matchPattern": "action order [0-9]+: connmark zone 65536 
reclassify.*index 21 ref",
 "matchCount": "0",
 "teardown": [
 "$TC actions flush action connmark"
@@ -233,7 +233,7 @@
 "cmdUnderTest": "$TC a

[PATCH net-next] tc-testing: flush gact actions on test teardown

2018-08-05 Thread Vlad Buslov

Test 6fb4 creates one mirred and one pipe action, but only flushes mirred
on teardown. Leaking pipe action causes failures in other tests.

Add additional teardown command to also flush gact actions.

Signed-off-by: Vlad Buslov 
Acked-by: Jamal Hadi Salim 
---
 tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json 
b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
index 6e4edfae1799..db49fd0f8445 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
@@ -44,7 +44,8 @@
 "matchPattern": "action order [0-9]*: mirred \\(Egress Redirect to 
device lo\\).*index 2 ref",
 "matchCount": "1",
 "teardown": [
-"$TC actions flush action mirred"
+"$TC actions flush action mirred",
+"$TC actions flush action gact"
 ]
 },
 {
-- 
2.7.5

[PATCH net-next] tc-testing: fix ip address in u32 test

2018-08-05 Thread Vlad Buslov

Fix expected ip address to actually match configured ip address.
Fix test to expect single matched filter.

Signed-off-by: Vlad Buslov 
Acked-by: Jamal Hadi Salim 
---
 tools/testing/selftests/tc-testing/tc-tests/filters/tests.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json 
b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
index 5fa02d86b35f..99a5ffca1088 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
@@ -12,8 +12,8 @@
 "cmdUnderTest": "$TC filter add dev $DEV1 parent : protocol ip 
prio 1 u32 match ip src 127.0.0.1/32 flowid 1:1 action ok",
 "expExitCode": "0",
 "verifyCmd": "$TC filter show dev $DEV1 parent :",
-"matchPattern": "match 7f02/ at 12",
-"matchCount": "0",
+"matchPattern": "match 7f01/ at 12",
+"matchCount": "1",
 "teardown": [
 "$TC qdisc del dev $DEV1 ingress"
 ]
-- 
2.7.5

[RFC] weirdness in cxgb3_main.c:init_tp_parity()

2018-08-05 Thread Al Viro

for (i = 0; i < 2048; i++) {
...
req->l2t_idx = htonl(V_L2T_W_IDX(i));
...

in there is very odd; l2t_idx is a 16bit field, and
#define V_L2T_W_IDX(x) ((x) << S_L2T_W_IDX)
#define S_L2T_W_IDX0

IOW, we are taking htonl(something in range 0..2047) and
shove it into 16bit field.  Which would, on a little-endian
host, be a fancy way of spelling 
req->l2t_idx = 0;

What's the intended behaviour there?  I'm not familiar with
the hardware in question; this smells like a typoed
req->l2t_idx = htons(...)
but how does the current code manage to work (i.e. does
anything even care about the value stored there)?  It's not
a big-endian-only driver, after all...

[endianness bug] cxgb4: mk_act_open_req() buggers ->{local,peer}_ip on big-endian hosts

2018-08-05 Thread Al Viro

Unlike fs.val.lport and fs.val.fport, cxgb4_process_flow_match()
sets fs.val.{l,f}ip to net-endian values without conversion - they come
straight from flow_dissector_key_ipv4_addrs ->dst and ->src resp.  So
the assignment in mk_act_open_req() ought to be a straigh copy.

As far as I know, T4 PCIe cards do exist, so it's not as if that
thing could only be found on little-endian systems...

Signed-off-by: Al Viro 
---
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 00fc5f1afb1d..7dddb9e748b8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -1038,10 +1038,8 @@ static void mk_act_open_req(struct filter_entry *f, 
struct sk_buff *skb,
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_filterid));
req->local_port = cpu_to_be16(f->fs.val.lport);
req->peer_port = cpu_to_be16(f->fs.val.fport);
-   req->local_ip = f->fs.val.lip[0] | f->fs.val.lip[1] << 8 |
-   f->fs.val.lip[2] << 16 | f->fs.val.lip[3] << 24;
-   req->peer_ip = f->fs.val.fip[0] | f->fs.val.fip[1] << 8 |
-   f->fs.val.fip[2] << 16 | f->fs.val.fip[3] << 24;
+   memcpy(&req->local_ip, f->fs.val.lip, 4);
+   memcpy(&req->peer_ip, f->fs.val.fip, 4);
req->opt0 = cpu_to_be64(NAGLE_V(f->fs.newvlan == VLAN_REMOVE ||
f->fs.newvlan == VLAN_REWRITE) |
DELACK_V(f->fs.hitcnts) |

Re: [endianness bug?] cxgb4_next_header .match_val/.match_mask should be net-endian

2018-08-05 Thread Al Viro

On Sun, Aug 05, 2018 at 04:28:11PM +0100, Al Viro wrote:
> On little-endian host those do yield the right values - e.g. 0x1100 is
> {0, 17, 0, 0}, etc.  On big-endian, though, these will end up checking
> in IPv4 case the octet at offset 10 (i.e. upper 16 bits of checksum) and for 
> IPv6
> - the octet at offset 5 (i.e.  the lower 8 bits of payload length).
> 
> Unless I'm misreading that code, it needs the following to do the right
> thing both on l-e and b-e.  Comments?

... and it looks like the same story with ->mask - it's compared to ->offmask,
which is __be16.  For little-endian hosts the values make sense (htons(0x0f00),
with offoff 0 and shift 6, i.e. "take the first two octets, treat them as
net-endian, clear everything except IHL bits and shift down by 6, which'd
yield IHL*4"), for big-endian they don't - you'd get TOS * 4 instead...

[endianness bug?] cxgb4_next_header .match_val/.match_mask should be net-endian

2018-08-05 Thread Al Viro

AFAICS, cxgb4_next_header() expects to find match_val/match_mask in 
struct cxgb4_next_header stored as big-endian:

/* Found a possible candidate.  Find a key that
 * matches the corresponding offset, value, and
 * mask to jump to next header.
 */
for (j = 0; j < cls->knode.sel->nkeys; j++) {
off = cls->knode.sel->keys[j].off;
val = cls->knode.sel->keys[j].val;
mask = cls->knode.sel->keys[j].mask;

if (next[i].match_off == off &&
next[i].match_val == val &&
next[i].match_mask == mask) {
found = true;
break;
}
}

Here ->keys[] is struct tc_u32_key and there mask and val are definitely
__be32.  match_val and match_mask are never changed after initialization
and they are set to:

* .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00
meant to check for IPV4.Protocol == TCP, i.e. octet at offset 9 being 6
* .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00
meant to check for IPV4.Protocol == UDP, i.e. octet at offset 9 being 
0x11
* .match_off = 4, .match_val = 0x6, .match_mask = 0xFF
IPV6.NextHeader == TCP, i.e. octet at offset 6 being 6
* .match_off = 4, .match_val = 0x11, .match_mask = 0xFF
IPV6.NextHeader == UDP, i.e. octet at offset 6 being 0x11

On little-endian host those do yield the right values - e.g. 0x1100 is
{0, 17, 0, 0}, etc.  On big-endian, though, these will end up checking
in IPv4 case the octet at offset 10 (i.e. upper 16 bits of checksum) and for 
IPv6
- the octet at offset 5 (i.e.  the lower 8 bits of payload length).

Unless I'm misreading that code, it needs the following to do the right
thing both on l-e and b-e.  Comments?

Signed-off-by: Al Viro 
---
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
index a4b99edcc339..ec226b1cebf4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
@@ -259,11 +259,11 @@ struct cxgb4_next_header {
  */
 static const struct cxgb4_next_header cxgb4_ipv4_jumps[] = {
{ .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF,
- .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00,
- .jump = cxgb4_tcp_fields },
+ .match_off = 8, .match_val = htonl(6 << 16),
+ .match_mask = htonl(0xff<<16), .jump = cxgb4_tcp_fields },
{ .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF,
- .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00,
- .jump = cxgb4_udp_fields },
+ .match_off = 8, .match_val = htonl(17 << 16),
+ .match_mask = htonl(0xff<<16), .jump = cxgb4_udp_fields },
{ .jump = NULL }
 };
 
@@ -272,11 +272,11 @@ static const struct cxgb4_next_header cxgb4_ipv4_jumps[] 
= {
  */
 static const struct cxgb4_next_header cxgb4_ipv6_jumps[] = {
{ .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0,
- .match_off = 4, .match_val = 0x6, .match_mask = 0xFF,
- .jump = cxgb4_tcp_fields },
+ .match_off = 4, .match_val = htonl(6 << 8),
+ .match_mask = htonl(0xff << 8), .jump = cxgb4_tcp_fields },
{ .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0,
- .match_off = 4, .match_val = 0x11, .match_mask = 0xFF,
- .jump = cxgb4_udp_fields },
+ .match_off = 4, .match_val = htonl(17 << 8),
+ .match_mask = htonl(0xff << 8), .jump = cxgb4_udp_fields },
{ .jump = NULL }
 };

Re: consequences of setting net_device_ops ndo_change_carrier()?

2018-08-05 Thread Andrew Lunn

> > You should have a PHY device of some sort. Either a traditional
> > copper PHY, or an SFP module. There should be a driver for this PHY.
> > This could be one of those in drivers/net/phy. Or it could be
> > firmware running, running on a little microcontroller inside your
> > FPGA?
> 
>   in my case, it's properly in drivers/net/phy, so at least that part
> is normal. back to investigating ...

Hi Robert

O.K, that makes thing simpler.

PHYs are controlled via an MDIO bus. Do you have an MDIO bus driver?
You said this was an FPGA design. MDIO might be a standard cell you
can just drop in. If so, look around and see if there is an existing
driver for it. Otherwise you might have to write one. They are quite
simple, some examples are in driver/net/phy. Depending on the address
range, the MDIO bus driver can also be embedded in the MAC driver.

   Andrew

[PATCH net] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit

2018-08-05 Thread Xin Long

According to RFC791, 68 bytes is the minimum size of IPv4 datagram every
device must be able to forward without further fragmentation while 576
bytes is the minimum size of IPv4 datagram every device has to be able
to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right
value for the ipv4 min mtu check in ip6_tnl_xmit.

While at it, change to use max() instead of if statement.

Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit")
Reported-by: Sabrina Dubroca 
Signed-off-by: Xin Long 
---
 net/ipv6/ip6_tunnel.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 00e138a..1cc9650 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1133,12 +1133,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
max_headroom += 8;
mtu -= 8;
}
-   if (skb->protocol == htons(ETH_P_IPV6)) {
-   if (mtu < IPV6_MIN_MTU)
-   mtu = IPV6_MIN_MTU;
-   } else if (mtu < 576) {
-   mtu = 576;
-   }
+   mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
+  IPV6_MIN_MTU : IPV4_MIN_MTU);
 
skb_dst_update_pmtu(skb, mtu);
if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
-- 
2.1.0

Re: KCM - recvmsg() mangles packets?

2018-08-05 Thread Dominique Martinet

Dominique Martinet wrote on Sun, Aug 05, 2018:
> (I'm not sure about offset, since we pass the full skb to parse message,
> wouldn't it look at the start of the buffer everytime? Well, offset
> seems to be 0 everytime the first time that check fails so I can
> probably ignore that for now...)

Oh, this might actually not have been such a bad remark; if I have the
client write two "messages" in a single write() kcm seems to reliably
fail the same way...

Conversely, if I setsockopt(s, IPPROTO_TCP, TCP_NODELAY...) on the
sender socket, *and* make it wait till the kcm socket has been created
to start sending data, then it dramatically reduces the probability of
this happening (I had to let the reproducer run in a loop for 5 minutes,
wheras it used to happen within seconds).

So I think the problem is packet aggregation, and strparser not handling
that properly... The first packet still fails with TCP_NODELAY but
there's probably aggregation on the recv side as well before the socket
is attached to the multiplexor... I guess the low probability failure
that is still happening could be similar.
(I also noticed that I've mistakedly believed that the problem was the
first packet contained the 2nd packet's data because of an off-by-one in
the receiver, it really is the second packet, it only has the first
packet's length)



I've moved my check from kcm_rcv_strparser to just before parse_msg in
__strp_recv and surely enough it fails everytime there's an offset.


It's getting late but I'll try adding a pskb_pull in there tomorrow, it
would be better to make the bpf program start with an offset but I don't
think that'll be easy to change...



I'm almost done spamming, thanks for being a good rubber duck! :p
-- 
Dominique Martinet

Re: Linux kernel error stack

2018-08-05 Thread Satish Patel

Thanks Florian,

FYI, I don't have any CHECKSUM configure in my iptables, i have
following rules, also do you think this kernel WARNNING is just
warning and not impacting my performance, based on that i have to
decided criticality of this issue.

[root@ostack-infra-02 ~]# iptables -L -n
Chain INPUT (policy ACCEPT)
target prot opt source   destination
ACCEPT tcp  --  0.0.0.0/00.0.0.0/0tcp dpt:53
ACCEPT udp  --  0.0.0.0/00.0.0.0/0udp dpt:53
ACCEPT tcp  --  0.0.0.0/00.0.0.0/0tcp dpt:67
ACCEPT udp  --  0.0.0.0/00.0.0.0/0udp dpt:67

Chain FORWARD (policy ACCEPT)
target prot opt source   destination
ACCEPT all  --  0.0.0.0/00.0.0.0/0
ACCEPT all  --  0.0.0.0/00.0.0.0/0

On Sun, Aug 5, 2018 at 5:48 AM, Florian Westphal  wrote:
> Satish Patel  wrote:
>> I am installing openstack and as you know i have lots of bridges and
>> vlan interface on my Linux CentOS 7.5
>>
>> I was getting following error stack on 3.10 kernel and found this is
>> kernel bug which required kernel upgrade so now i have upgraded my
>> kernel to 4.17.12 but i am still seeing same kernel stack error on my
>> dmesg
>>
>> I have disable TSO, LRO, SG & GSO on my NIC but still getting error
>> just wanted to understand what is this and why it popping up
>
> Get rid of CHECKSUM target in the iptables rules.
> This thing was added 8 years ago to work around dhcp bugs, I don't
> think its use is needed anymore.
> Try removing it and see that all VMs can still retrieve IP address
> via DHCP.
>
> I'm curious as to the rules, normally CHECKSUM target should be
> limited to -p udp --dport bootp; its bad idea to feed it normal
> packets, its expensive to do this in software rather than have device
> do the checksumming.
>
> As for fix, I'm tempted to send patch to make checksum target
> eval a no-op & add deprecation warning on init...
>
> Other "fix" is to
>
> diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
> index 9f4151ec3e06..23a17dda604d 100644
> --- a/net/netfilter/xt_CHECKSUM.c
> +++ b/net/netfilter/xt_CHECKSUM.c
> @@ -25,8 +25,12 @@ MODULE_ALIAS("ip6t_CHECKSUM");
>  static unsigned int
>  checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
>  {
> -   if (skb->ip_summed == CHECKSUM_PARTIAL)
> -   skb_checksum_help(skb);
> +   if (skb->ip_summed == CHECKSUM_PARTIAL) {
> +   if (skb_shinfo(skb)->gso_size)
> +   skb->ip_summed = CHECKSUM_NONE;
> +   else
> +   skb_checksum_help(skb);
> +   }
>
> return XT_CONTINUE;
>  }
>
> (unfortunately, the target isn't restricted to POSTROUTING, sigh).

Re: [PATCH net] l2tp: fix missing refcount drop in pppol2tp_tunnel_ioctl()

2018-08-05 Thread Guillaume Nault

On Fri, Aug 03, 2018 at 12:42:22PM -0700, David Miller wrote:
> From: Guillaume Nault 
> Date: Fri, 3 Aug 2018 17:00:11 +0200
> 
> > If 'session' is not NULL and is not a PPP pseudo-wire, then we fail to
> > drop the reference taken by l2tp_session_get().
> > 
> > Fixes: ecd012e45ab5 ("l2tp: filter out non-PPP sessions in 
> > pppol2tp_tunnel_ioctl()")
> > Signed-off-by: Guillaume Nault 
> > ---
> > Sorry for the stupid mistake. I guess I got blinded by the apparent
> > simplicity of the bug when I wrote the original patch.
> 
> Applied, thanks.
> 
> I'm pretty sure I backported the commit this fixes, so I'm queueing
> this up for -stable as well.
> 
Well, I think it wasn't. I didn't receive any notification from the
stable team about it and I don't see it in Greg's stable queue nor
in any -stable tree.

Also, we'd have to queue 90904ff5f958 ("l2tp: fix pseudo-wire type for
sessions created by pppol2tp_connect()") first, which is necessary for
properly identifying PPP sessions.

To recapitulate, three patches are needed to fix the original bug:

  * 90904ff5f958 ("l2tp: fix pseudo-wire type for sessions created by
pppol2tp_connect()"): allows later patches to check if a session is
PPP.

  * ecd012e45ab5 ("l2tp: filter out non-PPP sessions in
pppol2tp_tunnel_ioctl()"): refuses calling pppol2tp_session_ioctl()
on non-PPP sessions. This fixes an invalid pointer dereference when
the session is Ethernet. Unfortunately it fails to drop the
reference it takes on the session.

  * f664e37dcc52 ("l2tp: fix missing refcount drop in
pppol2tp_tunnel_ioctl()"): fixes the memory leak introduced by the
previous patch.

Re: consequences of setting net_device_ops ndo_change_carrier()?

2018-08-05 Thread Robert P. J. Day

On Sun, 5 Aug 2018, Andrew Lunn wrote:

> On Sat, Aug 04, 2018 at 07:06:58AM -0400, Robert P. J. Day wrote:
> >
> >   i'll try to keep this (relatively) short as there may be a
> > simple answer to this, or it could just be a stupid question --
> > sort of related to previous question (thank you, florian).
> >
> >   currently messing with networking device involving FPGA and some
> > quad-port transceivers, and noticed that, when one unplugs or
> > plugs a device into one of the ports, there is no change in the
> > contents of the corresponding sysfs files
> > /sys/class/net//carrier (or operstate, for that matter,
> > which might be related to this as well).
>
> Hi Robert
>
> As other have pointed out, ndo_change_carrier is not what you want
> here.

  i think i see that now ... based on the really adamant comment in
netdevice.h:

"Devices that determine carrier state from physical hardware
properties (eg network cables) or protocol-dependent mechanisms (eg
USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this
function."

the impression i got was that implementing that routine for a physical
device would actually cause problems, but it seems only that it would
be a strange thing to do, but wouldn't necessarily cause problems if
you chose not to take advantage of it. which brings me back to one of
my original questions -- why *would* someone implement it? as some
sort of debugging feature? in any event, i'm convinced that that's not
where the problem lies.

> You should have a PHY device of some sort. Either a traditional
> copper PHY, or an SFP module. There should be a driver for this PHY.
> This could be one of those in drivers/net/phy. Or it could be
> firmware running, running on a little microcontroller inside your
> FPGA?

  in my case, it's properly in drivers/net/phy, so at least that part
is normal. back to investigating ...

rday

--

Robert P. J. Day Ottawa, Ontario, CANADA
  http://crashcourse.ca/dokuwiki

Twitter:   http://twitter.com/rpjday
LinkedIn:   http://ca.linkedin.com/in/rpjday

Re: Linux kernel error stack

2018-08-05 Thread Florian Westphal

Satish Patel  wrote:
> I am installing openstack and as you know i have lots of bridges and
> vlan interface on my Linux CentOS 7.5
> 
> I was getting following error stack on 3.10 kernel and found this is
> kernel bug which required kernel upgrade so now i have upgraded my
> kernel to 4.17.12 but i am still seeing same kernel stack error on my
> dmesg
> 
> I have disable TSO, LRO, SG & GSO on my NIC but still getting error
> just wanted to understand what is this and why it popping up

Get rid of CHECKSUM target in the iptables rules.
This thing was added 8 years ago to work around dhcp bugs, I don't
think its use is needed anymore.
Try removing it and see that all VMs can still retrieve IP address
via DHCP.

I'm curious as to the rules, normally CHECKSUM target should be
limited to -p udp --dport bootp; its bad idea to feed it normal
packets, its expensive to do this in software rather than have device
do the checksumming.

As for fix, I'm tempted to send patch to make checksum target
eval a no-op & add deprecation warning on init...

Other "fix" is to

diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index 9f4151ec3e06..23a17dda604d 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -25,8 +25,12 @@ MODULE_ALIAS("ip6t_CHECKSUM");
 static unsigned int
 checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
-   if (skb->ip_summed == CHECKSUM_PARTIAL)
-   skb_checksum_help(skb);
+   if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   if (skb_shinfo(skb)->gso_size)
+   skb->ip_summed = CHECKSUM_NONE;
+   else
+   skb_checksum_help(skb);
+   }

return XT_CONTINUE;
 }

(unfortunately, the target isn't restricted to POSTROUTING, sigh).

98 matches

Mail list logo