Re: [patch net-next 2/2] net/sched: fix filter flushing

2017-05-20 Thread Jiri Pirko
Sun, May 21, 2017 at 02:16:45AM CEST, xiyou.wangc...@gmail.com wrote:
>On Sat, May 20, 2017 at 6:01 AM, Jiri Pirko  wrote:
>> +static void tcf_chain_destroy(struct tcf_chain *chain)
>> +{
>> +   list_del(>list);
>> +   tcf_chain_flush(chain);
>> kfree(chain);
>>  }
>>
>> @@ -510,7 +517,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct 
>> nlmsghdr *n,
>>
>> if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
>> tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
>> -   tcf_chain_destroy(chain);
>> +   tcf_chain_flush(chain);
>
>
>I wonder if we should return EBUSY and do nothing in case of busy?
>The chain is no longer visual to new actions after your list_del(), but
>the old one could still use and see it.

No. User request to flush the chain, that is what happens in the past
and that is what should happen now.
If there is still a reference, the chain_put will keep the empty chain.
If there is no longer a reference, chain_put will destroy the chain. All
good.



[PATCH iproute2] devlink: Add option to set and show eswitch encapsulation support

2017-05-20 Thread Roi Dayan
This is an e-switch global knob to enable HW support for applying
encapsulation/decapsulation to VF traffic as part of SRIOV e-switch offloading.

The actual encap/decap is carried out (along with the matching and other
actions) per offloaded e-switch rules, e.g as done when offloading the TC tunnel
key action.

Possible values are enable/disable.

Signed-off-by: Roi Dayan 
Reviewed-by: Jiri Pirko 
---
 devlink/devlink.c  | 48 +++-
 man/man8/devlink-dev.8 | 13 +
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index e22ee0a..f9bc16c 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -176,6 +176,7 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_ESWITCH_INLINE_MODE BIT(12)
 #define DL_OPT_DPIPE_TABLE_NAMEBIT(13)
 #define DL_OPT_DPIPE_TABLE_COUNTERSBIT(14)
+#define DL_OPT_ESWITCH_ENCAP_MODE  BIT(15)
 
 struct dl_opts {
uint32_t present; /* flags of present items */
@@ -195,6 +196,7 @@ struct dl_opts {
enum devlink_eswitch_inline_mode eswitch_inline_mode;
const char *dpipe_table_name;
bool dpipe_counters_enable;
+   bool eswitch_encap_mode;
 };
 
 struct dl {
@@ -299,6 +301,7 @@ static const enum mnl_attr_data_type 
devlink_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_OCC_MAX] = MNL_TYPE_U32,
[DEVLINK_ATTR_ESWITCH_MODE] = MNL_TYPE_U16,
[DEVLINK_ATTR_ESWITCH_INLINE_MODE] = MNL_TYPE_U8,
+   [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = MNL_TYPE_U8,
[DEVLINK_ATTR_DPIPE_TABLES] = MNL_TYPE_NESTED,
[DEVLINK_ATTR_DPIPE_TABLE] = MNL_TYPE_NESTED,
[DEVLINK_ATTR_DPIPE_TABLE_NAME] = MNL_TYPE_STRING,
@@ -754,6 +757,19 @@ static int dpipe_counters_enable_get(const char *typestr,
return 0;
 }
 
+static int eswitch_encap_mode_get(const char *typestr, bool *p_mode)
+{
+   if (strcmp(typestr, "enable") == 0) {
+   *p_mode = true;
+   } else if (strcmp(typestr, "disable") == 0) {
+   *p_mode = false;
+   } else {
+   pr_err("Unknown eswitch encap mode \"%s\"\n", typestr);
+   return -EINVAL;
+   }
+   return 0;
+}
+
 static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 uint32_t o_optional)
 {
@@ -908,7 +924,19 @@ static int dl_argv_parse(struct dl *dl, uint32_t 
o_required,
if (err)
return err;
o_found |= DL_OPT_DPIPE_TABLE_COUNTERS;
+   } else if (dl_argv_match(dl, "encap") &&
+  (o_all & DL_OPT_ESWITCH_ENCAP_MODE)) {
+   const char *typestr;
 
+   dl_arg_inc(dl);
+   err = dl_argv_str(dl, );
+   if (err)
+   return err;
+   err = eswitch_encap_mode_get(typestr,
+>eswitch_encap_mode);
+   if (err)
+   return err;
+   o_found |= DL_OPT_ESWITCH_ENCAP_MODE;
} else {
pr_err("Unknown option \"%s\"\n", dl_argv(dl));
return -EINVAL;
@@ -986,6 +1014,13 @@ static int dl_argv_parse(struct dl *dl, uint32_t 
o_required,
pr_err("Dpipe table counter state expected\n");
return -EINVAL;
}
+
+   if ((o_required & DL_OPT_ESWITCH_ENCAP_MODE) &&
+   !(o_found & DL_OPT_ESWITCH_ENCAP_MODE)) {
+   pr_err("E-Switch encapsulation option expected.\n");
+   return -EINVAL;
+   }
+
return 0;
 }
 
@@ -1041,6 +1076,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl 
*dl)
if (opts->present & DL_OPT_DPIPE_TABLE_COUNTERS)
mnl_attr_put_u8(nlh, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
opts->dpipe_counters_enable);
+   if (opts->present & DL_OPT_ESWITCH_ENCAP_MODE)
+   mnl_attr_put_u8(nlh, DEVLINK_ATTR_ESWITCH_ENCAP_MODE,
+   opts->eswitch_encap_mode);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -1097,6 +1135,7 @@ static void cmd_dev_help(void)
pr_err("Usage: devlink dev show [ DEV ]\n");
pr_err("   devlink dev eswitch set DEV [ mode { legacy | switchdev 
} ]\n");
pr_err("   [ inline-mode { none | link | 
network | transport } ]\n");
+   pr_err("   [ encap { disable | enable } 
]\n");
pr_err("   devlink dev eswitch show DEV\n");
 }
 
@@ -1421,6 +1460,12 @@ static void pr_out_eswitch(struct dl *dl, struct nlattr 
**tb)
   eswitch_inline_mode_name(mnl_attr_get_u8(
   

Re: [PATCH net-next] geneve: always fill CSUM6_RX configuration

2017-05-20 Thread Pravin Shelar
On Sat, May 20, 2017 at 6:35 AM, Eric Garver  wrote:
> On Fri, May 19, 2017 at 06:57:46PM -0700, Pravin Shelar wrote:
>> On Thu, May 18, 2017 at 12:59 PM, Eric Garver  wrote:
>> > CSMU6_RX is relevant for collect_metadata as well. As such leave it
>> > outside of the dev's IPv4/IPv6 checks.
>> >
>> Can you explain it bit? is this flag used with ipv4 tunnels?
>
> It's used with collect_metadata as both ipv4 and ipv6 sockets will be
> created.
>
> openvswitch recently gained support for creating tunnels with rtnetlink.
> It sets COLLECT_METADATA and CSUM6_RX. After create, it does a get to
> verify the device got created with all the requested configuration. The
> verify was failing due to CSUM6_RX not being returned.
>
> Since ip_tunnel_info_af() defaults to returning AF_INET, we fall into
> the IPv4 case and CSUM6_RX is never returned. Other relevant areas that
> call ip_tunnel_info_af() do so using the info from the skb, not the
> geneve_dev.
>
ok.
I think ip_tunnel_info_af() check is not right. it does not work in
case of collect_metadata.
Better fix would be check for genene->sock4 and genene->sock6 and
build the netlink skb accordingly.


Re: BPF relocations

2017-05-20 Thread Maciej W. Rozycki
On Fri, 12 May 2017, David Miller wrote:

> Internally, we have to emit some kind of relocation as GAS makes it's
> first pass over the instructions.

 Not really, you can defer that until the relaxation pass, e.g. to avoid 
figuring out what to do about forward local symbol references, especially 
in complex expressions that may eventually resolve to assembly constants.

 This is how MIPS16 assembly works for example, due to its dual regular vs 
extended instruction encoding, so that no artificial relocations had to be 
invented in GAS for the regular instructions, which never get relocations 
recorded against in an object file.  Any instruction that ends up needing 
a relocation, if supported (not all encodings do), gets converted to the 
extended form in the relaxation pass and only then a suitable fixup is 
attached to it.

 Of course that may not matter for your specific case, but I think it's 
worth noting so as to avoid people getting misled about how GAS works.

 FWIW,

  Maciej


Deleting a dynamic mac entry..

2017-05-20 Thread Manohar Kumar
Hello,

In 3.19 the following bridge fdb command to delete a dynamically
learned entry fails..

root@net-3:~# bridge fdb show | grep 02:42:0a:ff:00:06
02:42:0a:ff:00:06 dev vxlan0 master br0
root@net-3:~# bridge fdb del 02:42:0a:ff:00:06 dev vxlan0 master
RTNETLINK answers: No such file or directory

It works in 4.4.

Can someone please point to the patch that made this change ?

In kernels without this patch is there an alternative to delete
(actually I want to do it programmatically) dynamic mac entries ?


[PATCH net-next v6] net: ipv6: fix code style error and warning of ndisc.c

2017-05-20 Thread yuan linyu
From: yuan linyu 

CC: Joe Perches 
Signed-off-by: yuan linyu 
---
 net/ipv6/ndisc.c | 324 +--
 1 file changed, 173 insertions(+), 151 deletions(-)

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d310dc4..5cf25bc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -12,8 +12,7 @@
  *  2 of the License, or (at your option) any later version.
  */
 
-/*
- * Changes:
+/* Changes:
  *
  * Alexey I. Froloff   :   RFC6106 (DNSSL) support
  * Pierre Ynard:   export userland ND options
@@ -99,7 +98,6 @@ static const struct neigh_ops ndisc_hh_ops = {
.connected_output = neigh_resolve_output,
 };
 
-
 static const struct neigh_ops ndisc_direct_ops = {
.family =   AF_INET6,
.output =   neigh_direct_output,
@@ -147,13 +145,13 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int 
type, void *data,
u8 *opt = skb_put(skb, space);
 
opt[0] = type;
-   opt[1] = space>>3;
+   opt[1] = space >> 3;
 
memset(opt + 2, 0, pad);
opt   += pad;
space -= pad;
 
-   memcpy(opt+2, data, data_len);
+   memcpy(opt + 2, data, data_len);
data_len += 2;
opt += data_len;
space -= data_len;
@@ -182,6 +180,7 @@ static struct nd_opt_hdr *ndisc_next_option(struct 
nd_opt_hdr *cur,
struct nd_opt_hdr *end)
 {
int type;
+
if (!cur || !end || cur >= end)
return NULL;
type = cur->nd_opt_type;
@@ -222,6 +221,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
int l;
+
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
@@ -234,20 +234,28 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
case ND_OPT_NONCE:
-   case ND_OPT_REDIRECT_HDR:
-   if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+   case ND_OPT_REDIRECT_HDR: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
+   if (*hdr) {
ND_PRINTK(2, warn,
  "%s: duplicated ND6 option found: 
type=%d\n",
  __func__, nd_opt->nd_opt_type);
} else {
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   *hdr = nd_opt;
}
break;
-   case ND_OPT_PREFIX_INFO:
+   }
+   case ND_OPT_PREFIX_INFO: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
ndopts->nd_opts_pi_end = nd_opt;
-   if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   if (!*hdr)
+   *hdr = nd_opt;
break;
+   }
 #ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
ndopts->nd_opts_ri_end = nd_opt;
@@ -261,8 +269,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
} else {
-   /*
-* Unknown options must be silently ignored,
+   /* Unknown options must be silently ignored,
 * to accommodate future extension to the
 * protocol.
 */
@@ -280,7 +287,8 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
return ndopts;
 }
 
-int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device 
*dev, int dir)
+int ndisc_mc_map(const struct in6_addr *addr, char *buf,
+struct net_device *dev, int dir)
 {
switch (dev->type) {
case ARPHRD_ETHER:
@@ -327,9 +335,8 @@ static int ndisc_constructor(struct neighbour *neigh)
bool is_multicast = ipv6_addr_is_multicast(addr);
 
in6_dev = in6_dev_get(dev);
-   if (!in6_dev) {
+   if (!in6_dev)
return -EINVAL;
-   }
 
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
@@ -344,12 

Re: [RFC net-next PATCH 3/5] net: introduce XDP driver features interface

2017-05-20 Thread Daniel Borkmann

On 05/20/2017 09:53 AM, Jesper Dangaard Brouer wrote:

On Fri, 19 May 2017 19:13:29 +0200
Daniel Borkmann  wrote:


On 05/18/2017 05:41 PM, Jesper Dangaard Brouer wrote:

There is a fundamental difference between normal eBPF programs
and (XDP) eBPF programs getting attached in a driver. For normal
eBPF programs it is easy to add a new bpf feature, like a bpf
helper, because is it strongly tied to the feature being
available in the current core kernel code.  When drivers invoke a
bpf_prog, then it is not sufficient to simply relying on whether
a bpf_helper exists or not.  When a driver haven't implemented a
given feature yet, then it is possible to expose uninitialized
parts of xdp_buff.  The driver pass in a pointer to xdp_buff,
usually "allocated" on the stack, which must not be exposed.


When xdp_buff is being extended, then we should at least zero
initialize all in-tree users that don't support or populate this
field, thus that it's not uninitialized memory. Better would be
to have a way to reject the prog in the first place until it's
implemented (but further comments on feature bits below).


Going down a path where we need to zero out the xdp_buff looks a lot
like the sk_buff zeroing, which is the top perf cost associated with
SKBs see[1].  XDP is is about not repeating the same issue we had with
the SKB...


But if we agree on implementing a certain feature that requires to
add a new member, then basic requirement should be that it needs to
be implementable by all XDP enabled drivers, so that users eventually
don't need to worry which driver they run to access a XDP feature.
In that case, zeroing that member until it gets properly implemented
is just temporary (and could be an incentive perhaps).


[1] 
https://prototype-kernel.readthedocs.io/en/latest/blogposts/xdp25_eval_generic_xdp_tx.html#analyzing-build-skb-and-memset


Only two user visible NETIF_F_XDP_* net_device feature flags are
exposed via ethtool (-k) seen as "xdp" and "xdp-partial".
The "xdp-partial" is detected when there is not feature equality
between kernel and driver, and a netdev_warn is given.


I think having something like a NETIF_F_XDP_BIT for ethtool to
indicate support as "xdp" is quite useful. Avoids having to grep
the kernel tree for ndo_xdp callback. ;) A "xdp-partial" would
still be unclear/confusing to the user whether his program loads
or doesn't which is the only thing a user (or some loading infra)
cares about eventually, so one still needs to go trying to load
the XDP code to see whether that fails for the native case.


Good that we agree on usefulness of the NETIF_F_XDP_BIT.  The
"xdp-partial" or "xdp-challenged" is an early indication to the user
that they should complain to the vendor.  I tried to keep it simple
towards the user. Do you think every feature bit should be exposed to
userspace?


That would potentially require us to go down that path and expose
feature bits for everything, even something as silly as new flags
for a specific helper that requires some sort of additional support.
We probably rather want to keep such thing in the kernel for now
and potentially reject loads instead.


The idea is that XDP_DRV_* feature bits define a contract between
the driver and the kernel, giving a reliable way to know that XDP
features a driver promised to implement. Thus, knowing what bpf
side features are safe to allow.

There are 3 levels of features: "required", "devel" and "optional".

The motivation is pushing driver vendors forward to support all
the new XDP features.  Once a given feature bit is moved into
the "required" features, the kernel will reject loading XDP
program if feature isn't implemented by driver.  Features under
developement, require help from the bpf infrastrucure to detect
when a given helper or direct-access is used, using a bpf_prog
bit to mark a need for the feature, and pulling in this bit in
the xdp_features_check().  When all drivers have implemented
a "devel" feature, it can be moved to the "required" feature and


The problem is that once you add bits markers to bpf_prog like we
used to do in the past, then as you do in patch 4/5 with the
xdp_rxhash_needed bit, they will need to be turned /on/ unconditionally
when a prog has tail calls.


Yes, with tail calls, we have to enable all features.  But that is a
good thing, as it forces vendors to quickly implement all features.
And it is no different from moving a feature into the "required" bits,
once all drivers implement it.  It is only a limitation for tail calls,
and something we can fix later (for handling this for tail calls).


But the issue I pointed out with this approach is that XDP programs
using tail calls will suddenly break and get rejected from one
version to another.

That's basically breaking the contract we have today where it must
be guaranteed that older programs keep working on newer kernels,
exactly the same with uapi. Breaking user programs is a no-go,
not a good thing at all (it will hurt 

[PATCH net-next v5] net: ipv6: fix code style error and warning of ndisc.c

2017-05-20 Thread yuan linyu
From: yuan linyu 

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d310dc4..5cf25bc 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -12,8 +12,7 @@
  *  2 of the License, or (at your option) any later version.
  */
 
-/*
- * Changes:
+/* Changes:
  *
  * Alexey I. Froloff   :   RFC6106 (DNSSL) support
  * Pierre Ynard:   export userland ND options
@@ -99,7 +98,6 @@ static const struct neigh_ops ndisc_hh_ops = {
.connected_output = neigh_resolve_output,
 };
 
-
 static const struct neigh_ops ndisc_direct_ops = {
.family =   AF_INET6,
.output =   neigh_direct_output,
@@ -147,13 +145,13 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int 
type, void *data,
u8 *opt = skb_put(skb, space);
 
opt[0] = type;
-   opt[1] = space>>3;
+   opt[1] = space >> 3;
 
memset(opt + 2, 0, pad);
opt   += pad;
space -= pad;
 
-   memcpy(opt+2, data, data_len);
+   memcpy(opt + 2, data, data_len);
data_len += 2;
opt += data_len;
space -= data_len;
@@ -182,6 +180,7 @@ static struct nd_opt_hdr *ndisc_next_option(struct 
nd_opt_hdr *cur,
struct nd_opt_hdr *end)
 {
int type;
+
if (!cur || !end || cur >= end)
return NULL;
type = cur->nd_opt_type;
@@ -222,6 +221,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
int l;
+
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
@@ -234,20 +234,28 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
case ND_OPT_NONCE:
-   case ND_OPT_REDIRECT_HDR:
-   if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+   case ND_OPT_REDIRECT_HDR: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
+   if (*hdr) {
ND_PRINTK(2, warn,
  "%s: duplicated ND6 option found: 
type=%d\n",
  __func__, nd_opt->nd_opt_type);
} else {
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   *hdr = nd_opt;
}
break;
-   case ND_OPT_PREFIX_INFO:
+   }
+   case ND_OPT_PREFIX_INFO: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
ndopts->nd_opts_pi_end = nd_opt;
-   if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   if (!*hdr)
+   *hdr = nd_opt;
break;
+   }
 #ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
ndopts->nd_opts_ri_end = nd_opt;
@@ -261,8 +269,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
} else {
-   /*
-* Unknown options must be silently ignored,
+   /* Unknown options must be silently ignored,
 * to accommodate future extension to the
 * protocol.
 */
@@ -280,7 +287,8 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
return ndopts;
 }
 
-int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device 
*dev, int dir)
+int ndisc_mc_map(const struct in6_addr *addr, char *buf,
+struct net_device *dev, int dir)
 {
switch (dev->type) {
case ARPHRD_ETHER:
@@ -327,9 +335,8 @@ static int ndisc_constructor(struct neighbour *neigh)
bool is_multicast = ipv6_addr_is_multicast(addr);
 
in6_dev = in6_dev_get(dev);
-   if (!in6_dev) {
+   if (!in6_dev)
return -EINVAL;
-   }
 
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
@@ -344,12 +351,12 @@ static int ndisc_constructor(struct neighbour *neigh)
if (is_multicast) {
neigh->nud_state = NUD_NOARP;
ndisc_mc_map(addr, neigh->ha, dev, 1);
-   

Re: [patch net-next 2/2] net/sched: fix filter flushing

2017-05-20 Thread Cong Wang
On Sat, May 20, 2017 at 6:01 AM, Jiri Pirko  wrote:
> +static void tcf_chain_destroy(struct tcf_chain *chain)
> +{
> +   list_del(>list);
> +   tcf_chain_flush(chain);
> kfree(chain);
>  }
>
> @@ -510,7 +517,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct 
> nlmsghdr *n,
>
> if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
> tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
> -   tcf_chain_destroy(chain);
> +   tcf_chain_flush(chain);


I wonder if we should return EBUSY and do nothing in case of busy?
The chain is no longer visual to new actions after your list_del(), but
the old one could still use and see it.


[PATCH] include stdint.h explicitly for UINT16_MAX)

2017-05-20 Thread Khem Raj
Fixes
| tc_core.c:190:29: error: 'UINT16_MAX' undeclared (first use in this 
function); did you mean '__INT16_MAX__'?
|if ((sz >> s->size_log) > UINT16_MAX) {
|  ^~

Signed-off-by: Khem Raj 
---
 tc/tc_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tc/tc_core.c b/tc/tc_core.c
index 7bbe0d7..821b741 100644
--- a/tc/tc_core.c
+++ b/tc/tc_core.c
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.13.0



Re: [PATCH 0/4] Configuring traffic classes via new hardware offload mechanism in tc/mqprio

2017-05-20 Thread Or Gerlitz
On Sat, May 20, 2017 at 3:58 AM, Amritha Nambiar
 wrote:
> The following series introduces a new harware offload mode in tc/mqprio

Wait, we have already a HW QoS model introduced by John F and Co
couple of years ago,  right?

Please elaborate in few sentence if you are extending it/replacing it,
why we want to do that and what are the implications on existing
applications UAPI wise.

Below you just say in the new mode Qos is configured with knobs XYZ --
this is way not enough

> where the TCs, the queue configurations and bandwidth rate limits
> are offloaded to the hardware.


> The i40e driver enables the new mqprio hardware offload mechanism factoring 
> the TCs, queue configuration and bandwidth rates by creating HW channel VSIs.
>
> In this mode, the priority to traffic class mapping and the user specified 
> queue ranges are used to configure the traffic class when the 'hw' option is 
> set to 2. This is achieved by creating HW channels(VSI). A new channel is 
> created for each of the traffic class configuration offloaded via mqprio 
> framework except for the first TC (TC0) which is for the main VSI. TC0 for 
> the main VSI is also reconfigured as per user provided queue parameters. 
> Finally, bandwidth rate limits are set on these traffic classes through the 
> mqprio offload framework by sending these rates in addition to the number of 
> TCs and the queue configurations.


Re: arch: arm: bpf: Converting cBPF to eBPF for arm 32 bit

2017-05-20 Thread Shubham Bansal
Hi Daniel and Kees,

Before I send the patch, I have tested the JIT compiler on ARMv7 but
not on ARMv5 or ARMv6. So can you tell me which arch versions I should
test it for?
Also for my testing, CONFIG_FRAME_POINTER and CONFIG_CPU_BIG_ENDIAN
are both disabled. But I need to test JIT with these flags as well.
Whenever I put these flags in .config file, the arm kernel is not
getting compiler with these flags. Can you tell me why? If you need
more information regarding this, please let me know.

With current config for ARMv7, benchmarks are :

[root@vexpress modules]# insmod test_bpf.ko
[   25.797766] test_bpf: #0 TAX jited:1 180 170 169 PASS
[   25.811395] test_bpf: #1 TXA jited:1 93 89 111 PASS
[   25.815073] test_bpf: #2 ADD_SUB_MUL_K jited:1 94 PASS
[   25.816779] test_bpf: #3 DIV_MOD_KX jited:1 983 PASS
[   25.827310] test_bpf: #4 AND_OR_LSH_K jited:1 94 93 PASS
[   25.829843] test_bpf: #5 LD_IMM_0 jited:1 83 PASS
[   25.831260] test_bpf: #6 LD_IND jited:1 338 266 305 PASS
[   25.840970] test_bpf: #7 LD_ABS jited:1 343 304 289 PASS
[   25.851005] test_bpf: #8 LD_ABS_LL jited:1 362 300 PASS
[   25.858119] test_bpf: #9 LD_IND_LL jited:1 244 241 245 PASS
[   25.865994] test_bpf: #10 LD_ABS_NET jited:1 318 316 PASS
[   25.872829] test_bpf: #11 LD_IND_NET jited:1 243 196 196 PASS
[   25.879717] test_bpf: #12 LD_PKTTYPE jited:1 129 140 PASS
[   25.883034] test_bpf: #13 LD_MARK jited:1 113 88 PASS
[   25.885545] test_bpf: #14 LD_RXHASH jited:1 81 79 PASS
[   25.887506] test_bpf: #15 LD_QUEUE jited:1 88 85 PASS
[   25.889593] test_bpf: #16 LD_PROTOCOL jited:1 322 353 PASS
[   25.896894] test_bpf: #17 LD_VLAN_TAG jited:1 92 92 PASS
[   25.899173] test_bpf: #18 LD_VLAN_TAG_PRESENT jited:1 85 88 PASS
[   25.901310] test_bpf: #19 LD_IFINDEX jited:1 94 130 PASS
[   25.904110] test_bpf: #20 LD_HATYPE jited:1 98 91 PASS
[   25.906393] test_bpf: #21 LD_CPU
[   25.906651] bpf_jit: *** NOT YET: opcode 85 ***
[   25.906795] jited:0 705 691 PASS
[   25.921007] test_bpf: #22 LD_NLATTR jited:0 577 668 PASS
[   25.933870] test_bpf: #23 LD_NLATTR_NEST jited:0 2253 3006 PASS
[   25.987020] test_bpf: #24 LD_PAYLOAD_OFF jited:0 3840 4922 PASS
[   26.075119] test_bpf: #25 LD_ANC_XOR jited:1 107 94 PASS
[   26.077583] test_bpf: #26 SPILL_FILL jited:1 159 148 173 PASS
[   26.083259] test_bpf: #27 JEQ jited:1 274 183 181 PASS
[   26.090383] test_bpf: #28 JGT jited:1 255 194 165 PASS
[   26.097163] test_bpf: #29 JGE jited:1 187 190 246 PASS
[   26.103932] test_bpf: #30 JSET jited:1 178 184 192 PASS
[   26.110229] test_bpf: #31 tcpdump port 22 jited:1 266 698 717 PASS
[   26.127698] test_bpf: #32 tcpdump complex jited:1 267 729 1129 PASS
[   26.149727] test_bpf: #33 RET_A jited:1 94 88 PASS
[   26.152114] test_bpf: #34 INT: ADD trivial jited:1 87 PASS
[   26.153900] test_bpf: #35 INT: MUL_X jited:1 95 PASS
[   26.155384] test_bpf: #36 INT: MUL_X2 jited:1 82 PASS
[   26.156606] test_bpf: #37 INT: MUL32_X jited:1 91 PASS
[   26.157846] test_bpf: #38 INT: ADD 64-bit jited:1 1055 PASS
[   26.169280] test_bpf: #39 INT: ADD 32-bit jited:1 701 PASS
[   26.177039] test_bpf: #40 INT: SUB jited:1 931 PASS
[   26.187108] test_bpf: #41 INT: XOR jited:1 355 PASS
[   26.191364] test_bpf: #42 INT: MUL jited:1 389 PASS
[   26.196286] test_bpf: #43 MOV REG64 jited:1 267 PASS
[   26.199759] test_bpf: #44 MOV REG32 jited:1 176 PASS
[   26.202060] test_bpf: #45 LD IMM64 jited:1 194 PASS
[   26.204607] test_bpf: #46 INT: ALU MIX jited:0 1174 PASS
[   26.216896] test_bpf: #47 INT: shifts by register jited:1 211 PASS
[   26.219956] test_bpf: #48 INT: DIV + ABS jited:1 559 517 PASS
[   26.231347] test_bpf: #49 INT: DIV by zero jited:1 395 277 PASS
[   26.238862] test_bpf: #50 check: missing ret PASS
[   26.239288] test_bpf: #51 check: div_k_0 PASS
[   26.239492] test_bpf: #52 check: unknown insn PASS
[   26.239640] test_bpf: #53 check: out of range spill/fill PASS
[   26.239803] test_bpf: #54 JUMPS + HOLES jited:1 295 PASS
[   26.243343] test_bpf: #55 check: RET X PASS
[   26.244065] test_bpf: #56 check: LDX + RET X PASS
[   26.244224] test_bpf: #57 M[]: alt STX + LDX jited:1 433 PASS
[   26.249126] test_bpf: #58 M[]: full STX + full LDX jited:1 427 PASS
[   26.254123] test_bpf: #59 check: SKF_AD_MAX PASS
[   26.254509] test_bpf: #60 LD [SKF_AD_OFF-1] jited:1 298 PASS
[   26.257882] test_bpf: #61 load 64-bit immediate jited:1 128 PASS
[   26.259813] test_bpf: #62 nmap reduced jited:1 655 PASS
[   26.267216] test_bpf: #63 ALU_MOV_X: dst = 2 jited:1 89 PASS
[   26.268766] test_bpf: #64 ALU_MOV_X: dst = 4294967295 jited:1 72 PASS
[   26.270126] test_bpf: #65 ALU64_MOV_X: dst = 2 jited:1 94 PASS
[   26.271768] test_bpf: #66 ALU64_MOV_X: dst = 4294967295 jited:1 145 PASS
[   26.274152] test_bpf: #67 ALU_MOV_K: dst = 2 jited:1 93 PASS
[   26.275673] test_bpf: #68 ALU_MOV_K: dst = 4294967295 jited:1 103 PASS
[   26.277371] test_bpf: #69 ALU_MOV_K: 0x =
0x jited:1 99 PASS
[   26.278966] test_bpf: #70 ALU64_MOV_K: dst = 2 jited:1 110 

Re: [RFC net-next PATCH 4/5] net: new XDP feature for reading HW rxhash from drivers

2017-05-20 Thread Tom Herbert
On Thu, May 18, 2017 at 8:41 AM, Jesper Dangaard Brouer
 wrote:
> Introducing a new XDP feature and associated bpf helper bpf_xdp_rxhash.
>
> The rxhash and type allow filtering on packets without touching
> packet memory.  The performance difference on my system with a
> 100 Gbit/s mlx5 NIC is 12Mpps to 19Mpps.
>
> TODO: desc RXHASH and associated type, and how XDP choose to map
> and export these to bpf_prog's.
>
> TODO: desc how this interacts with XDP driver features system.
> ---
>  include/linux/filter.h  |   31 -
>  include/linux/netdev_features.h |4 ++
>  include/uapi/linux/bpf.h|   56 +-
>  kernel/bpf/verifier.c   |3 ++
>  net/core/dev.c  |   16 -
>  net/core/filter.c   |   73 
> +++
>  samples/bpf/bpf_helpers.h   |2 +
>  tools/include/uapi/linux/bpf.h  |   10 +
>  8 files changed, 190 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 9a7786db14fa..33a254ccd47d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -413,7 +413,8 @@ struct bpf_prog {
> locked:1,   /* Program image locked? */
> gpl_compatible:1, /* Is filter GPL 
> compatible? */
> cb_access:1,/* Is control block accessed? 
> */
> -   dst_needed:1;   /* Do we need dst entry? */
> +   dst_needed:1,   /* Do we need dst entry? */
> +   xdp_rxhash_needed:1;/* Req XDP RXHASH support 
> */
> kmemcheck_bitfield_end(meta);
> enum bpf_prog_type  type;   /* Type of BPF program */
> u32 len;/* Number of filter blocks */
> @@ -444,12 +445,40 @@ struct bpf_skb_data_end {
> void *data_end;
>  };
>
> +/* Kernel internal xdp_buff->flags */
> +#define XDP_CTX_F_RXHASH_TYPE_MASK XDP_HASH_TYPE_MASK
> +#define XDP_CTX_F_RXHASH_TYPE_BITS XDP_HASH_TYPE_BITS
> +#define XDP_CTX_F_RXHASH_SW(1ULL <<  XDP_CTX_F_RXHASH_TYPE_BITS)
> +#define XDP_CTX_F_RXHASH_HW(1ULL << 
> (XDP_CTX_F_RXHASH_TYPE_BITS+1))
> +
>  struct xdp_buff {
> void *data;
> void *data_end;
> void *data_hard_start;
> +   u64 flags;
> +   u32 rxhash;
>  };
>
> +/* helper functions for driver setting rxhash */
> +static inline void
> +xdp_record_hash(struct xdp_buff *xdp, u32 hash, u32 type)
> +{
> +   xdp->flags |= XDP_CTX_F_RXHASH_HW;
> +   xdp->flags |= type & XDP_CTX_F_RXHASH_TYPE_MASK;
> +   xdp->rxhash = hash;
> +}
> +
> +static inline void
> +xdp_set_skb_hash(struct xdp_buff *xdp, struct sk_buff *skb)
> +{
> +   if (likely(xdp->flags & (XDP_CTX_F_RXHASH_HW|XDP_CTX_F_RXHASH_SW))) {
> +   bool is_sw = !!(xdp->flags | XDP_CTX_F_RXHASH_SW);
> +   bool is_l4 = !!(xdp->flags & XDP_HASH_TYPE_L4_MASK);
> +
> +   __skb_set_hash(skb, xdp->rxhash, is_sw, is_l4);
> +   }
> +}
> +
>  /* compute the linear packet data range [data, data_end) which
>   * will be accessed by cls_bpf, act_bpf and lwt programs
>   */
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index ff81ee231410..4b50e8c606c5 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -219,11 +219,13 @@ enum {
>  /* XDP driver flags */
>  enum {
> XDP_DRV_F_ENABLED_BIT,
> +   XDP_DRV_F_RXHASH_BIT,
>  };
>
>  #define __XDP_DRV_F_BIT(bit)   ((netdev_features_t)1 << (bit))
>  #define __XDP_DRV_F(name)  __XDP_DRV_F_BIT(XDP_DRV_F_##name##_BIT)
>  #define XDP_DRV_F_ENABLED  __XDP_DRV_F(ENABLED)
> +#define XDP_DRV_F_RXHASH   __XDP_DRV_F(RXHASH)
>
>  /* XDP driver MUST support these features, else kernel MUST reject
>   * bpf_prog to guarantee safe access to data structures
> @@ -233,7 +235,7 @@ enum {
>  /* Some XDP features are under development. Based on bpf_prog loading
>   * detect if kernel feature can be activated.
>   */
> -#define XDP_DRV_FEATURES_DEVEL 0
> +#define XDP_DRV_FEATURES_DEVEL XDP_DRV_F_RXHASH
>
>  /* Some XDP features are optional, like action return code, as they
>   * are handled safely runtime.
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 945a1f5f63c5..1d9d3a46217d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -482,6 +482,9 @@ union bpf_attr {
>   * Get the owner uid of the socket stored inside sk_buff.
>   * @skb: pointer to skb
>   * Return: uid of the socket owner on success or overflowuid if failed.
> + *
> + * u64 bpf_xdp_rxhash(xdp_md, new_hash, type, flags)
> + * TODO: MISSING DESC
>   */
>  #define __BPF_FUNC_MAPPER(FN)  \
> FN(unspec), \
> @@ -531,7 +534,8 @@ union 

Re: [PATCH net-next v4] net: ipv6: fix code style error and warning of ndisc.c

2017-05-20 Thread Joe Perches
On Sat, 2017-05-20 at 21:57 +0800, yuan linyu wrote:
> From: yuan linyu 
[]
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
[]
> @@ -240,13 +240,15 @@ struct ndisc_options *ndisc_parse_options(const struct 
> net_device *dev,
> "%s: duplicated ND6 option found: 
> type=%d\n",
> __func__, nd_opt->nd_opt_type);
>   } else {
> - ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
> nd_opt;
> + ndopts->nd_opt_array[nd_opt->nd_opt_type] =
> + nd_opt;
>   }
>   break;
>   case ND_OPT_PREFIX_INFO:
>   ndopts->nd_opts_pi_end = nd_opt;
>   if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
> - ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
> nd_opt;
> + ndopts->nd_opt_array[nd_opt->nd_opt_type] =
> + nd_opt;
>   break;
>  #ifdef CONFIG_IPV6_ROUTE_INFO
>   case ND_OPT_ROUTE_INFO:

If you are going to do these 80 column line length changes,
(and they are not really useful or necessary here), perhaps
it'd be better to use a temporary to reduce the line length.

Something like:
---
 net/ipv6/ndisc.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d310dc41209a..a8521bc86795 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -234,20 +234,28 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
case ND_OPT_NONCE:
-   case ND_OPT_REDIRECT_HDR:
-   if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+   case ND_OPT_REDIRECT_HDR: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
+   if (*hdr) {
ND_PRINTK(2, warn,
  "%s: duplicated ND6 option found: 
type=%d\n",
  __func__, nd_opt->nd_opt_type);
} else {
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   *hdr = nd_opt;
}
break;
-   case ND_OPT_PREFIX_INFO:
+   }
+   case ND_OPT_PREFIX_INFO: {
+   struct nd_opt_hdr **hdr;
+
+   hdr = >nd_opt_array[nd_opt->nd_opt_type];
ndopts->nd_opts_pi_end = nd_opt;
-   if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   if (!*hdr)
+   *hdr = nd_opt;
break;
+   }
 #ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
ndopts->nd_opts_ri_end = nd_opt;


[PATCH 2/2] vhost/scsi: Delete error messages for failed memory allocations in five functions

2017-05-20 Thread SF Markus Elfring
From: Markus Elfring 
Date: Sat, 20 May 2017 15:50:30 +0200

Omit seven extra messages for memory allocation failures in these functions.

This issue was detected by using the Coccinelle software.

Link: 
http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf
Signed-off-by: Markus Elfring 
---
 drivers/vhost/scsi.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 650533916c19..49d07950e2e5 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -417,5 +417,4 @@ vhost_scsi_allocate_evt(struct vhost_scsi *vs,
if (!evt) {
-   vq_err(vq, "Failed to allocate vhost_scsi_evt\n");
vs->vs_events_missed = true;
return NULL;
}
@@ -1722,21 +1721,15 @@ static int vhost_scsi_nexus_cb(struct se_portal_group 
*se_tpg,
-   if (!tv_cmd->tvc_sgl) {
-   pr_err("Unable to allocate tv_cmd->tvc_sgl\n");
+   if (!tv_cmd->tvc_sgl)
goto out;
-   }
 
tv_cmd->tvc_upages = kzalloc(sizeof(struct page *) *
VHOST_SCSI_PREALLOC_UPAGES, GFP_KERNEL);
-   if (!tv_cmd->tvc_upages) {
-   pr_err("Unable to allocate tv_cmd->tvc_upages\n");
+   if (!tv_cmd->tvc_upages)
goto out;
-   }
 
tv_cmd->tvc_prot_sgl = kzalloc(sizeof(struct scatterlist) *
VHOST_SCSI_PREALLOC_PROT_SGLS, GFP_KERNEL);
-   if (!tv_cmd->tvc_prot_sgl) {
-   pr_err("Unable to allocate tv_cmd->tvc_prot_sgl\n");
+   if (!tv_cmd->tvc_prot_sgl)
goto out;
-   }
}
return 0;
 out:
@@ -1760,6 +1753,5 @@ static int vhost_scsi_make_nexus(struct vhost_scsi_tpg 
*tpg,
if (!tv_nexus) {
mutex_unlock(>tv_tpg_mutex);
-   pr_err("Unable to allocate struct vhost_scsi_nexus\n");
return -ENOMEM;
}
/*
@@ -1961,7 +1953,6 @@ vhost_scsi_make_tpg(struct se_wwn *wwn,
-   if (!tpg) {
-   pr_err("Unable to allocate struct vhost_scsi_tpg");
+   if (!tpg)
return ERR_PTR(-ENOMEM);
-   }
+
mutex_init(>tv_tpg_mutex);
INIT_LIST_HEAD(>tv_tpg_list);
tpg->tport = tport;
@@ -2015,7 +2006,6 @@ vhost_scsi_make_tport(struct target_fabric_configfs *tf,
-   if (!tport) {
-   pr_err("Unable to allocate struct vhost_scsi_tport");
+   if (!tport)
return ERR_PTR(-ENOMEM);
-   }
+
tport->tport_wwpn = wwpn;
/*
 * Determine the emulated Protocol Identifier and Target Port Name
-- 
2.13.0



[PATCH 0/2] vhost/scsi: Adjustments for five function implementations

2017-05-20 Thread SF Markus Elfring
From: Markus Elfring 
Date: Sat, 20 May 2017 16:25:04 +0200

A few update suggestions were taken into account
from static source code analysis.

Markus Elfring (2):
  Improve a size determination in four functions
  Delete error messages for failed memory allocations in five functions

 drivers/vhost/scsi.c | 33 +++--
 1 file changed, 11 insertions(+), 22 deletions(-)

-- 
2.13.0



[PATCH 1/2] vhost/scsi: Improve a size determination in four functions

2017-05-20 Thread SF Markus Elfring
From: Markus Elfring 
Date: Sat, 20 May 2017 13:48:44 +0200

Replace the specification of four data structures by pointer dereferences
as the parameter for the operator "sizeof" to make the corresponding size
determination a bit safer according to the Linux coding style convention.

Signed-off-by: Markus Elfring 
---
 drivers/vhost/scsi.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index fd6c8b66f06f..650533916c19 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -597,8 +597,7 @@ vhost_scsi_get_tag(struct vhost_virtqueue *vq, struct 
vhost_scsi_tpg *tpg,
sg = cmd->tvc_sgl;
prot_sg = cmd->tvc_prot_sgl;
pages = cmd->tvc_upages;
-   memset(cmd, 0, sizeof(struct vhost_scsi_cmd));
-
+   memset(cmd, 0, sizeof(*cmd));
cmd->tvc_sgl = sg;
cmd->tvc_prot_sgl = prot_sg;
cmd->tvc_upages = pages;
@@ -1757,5 +1756,5 @@ static int vhost_scsi_make_nexus(struct vhost_scsi_tpg 
*tpg,
return -EEXIST;
}
 
-   tv_nexus = kzalloc(sizeof(struct vhost_scsi_nexus), GFP_KERNEL);
+   tv_nexus = kzalloc(sizeof(*tv_nexus), GFP_KERNEL);
if (!tv_nexus) {
@@ -1958,5 +1957,5 @@ vhost_scsi_make_tpg(struct se_wwn *wwn,
if (kstrtou16(name + 5, 10, ) || tpgt >= VHOST_SCSI_MAX_TARGET)
return ERR_PTR(-EINVAL);
 
-   tpg = kzalloc(sizeof(struct vhost_scsi_tpg), GFP_KERNEL);
+   tpg = kzalloc(sizeof(*tpg), GFP_KERNEL);
if (!tpg) {
@@ -2012,5 +2011,5 @@ vhost_scsi_make_tport(struct target_fabric_configfs *tf,
/* if (vhost_scsi_parse_wwn(name, , 1) < 0)
return ERR_PTR(-EINVAL); */
 
-   tport = kzalloc(sizeof(struct vhost_scsi_tport), GFP_KERNEL);
+   tport = kzalloc(sizeof(*tport), GFP_KERNEL);
if (!tport) {
-- 
2.13.0



[PATCH net-next v4] net: ipv6: fix code style error and warning of ndisc.c

2017-05-20 Thread yuan linyu
From: yuan linyu 

Signed-off-by: yuan linyu 
---
 net/ipv6/ndisc.c | 310 +--
 1 file changed, 163 insertions(+), 147 deletions(-)

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d310dc4..df31f29 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -12,8 +12,7 @@
  *  2 of the License, or (at your option) any later version.
  */
 
-/*
- * Changes:
+/* Changes:
  *
  * Alexey I. Froloff   :   RFC6106 (DNSSL) support
  * Pierre Ynard:   export userland ND options
@@ -99,7 +98,6 @@ static const struct neigh_ops ndisc_hh_ops = {
.connected_output = neigh_resolve_output,
 };
 
-
 static const struct neigh_ops ndisc_direct_ops = {
.family =   AF_INET6,
.output =   neigh_direct_output,
@@ -147,13 +145,13 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int 
type, void *data,
u8 *opt = skb_put(skb, space);
 
opt[0] = type;
-   opt[1] = space>>3;
+   opt[1] = space >> 3;
 
memset(opt + 2, 0, pad);
opt   += pad;
space -= pad;
 
-   memcpy(opt+2, data, data_len);
+   memcpy(opt + 2, data, data_len);
data_len += 2;
opt += data_len;
space -= data_len;
@@ -182,6 +180,7 @@ static struct nd_opt_hdr *ndisc_next_option(struct 
nd_opt_hdr *cur,
struct nd_opt_hdr *end)
 {
int type;
+
if (!cur || !end || cur >= end)
return NULL;
type = cur->nd_opt_type;
@@ -222,6 +221,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
int l;
+
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
@@ -240,13 +240,15 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
  "%s: duplicated ND6 option found: 
type=%d\n",
  __func__, nd_opt->nd_opt_type);
} else {
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   ndopts->nd_opt_array[nd_opt->nd_opt_type] =
+   nd_opt;
}
break;
case ND_OPT_PREFIX_INFO:
ndopts->nd_opts_pi_end = nd_opt;
if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   ndopts->nd_opt_array[nd_opt->nd_opt_type] =
+   nd_opt;
break;
 #ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
@@ -261,8 +263,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
} else {
-   /*
-* Unknown options must be silently ignored,
+   /* Unknown options must be silently ignored,
 * to accommodate future extension to the
 * protocol.
 */
@@ -280,7 +281,8 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
return ndopts;
 }
 
-int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device 
*dev, int dir)
+int ndisc_mc_map(const struct in6_addr *addr, char *buf,
+struct net_device *dev, int dir)
 {
switch (dev->type) {
case ARPHRD_ETHER:
@@ -327,9 +329,8 @@ static int ndisc_constructor(struct neighbour *neigh)
bool is_multicast = ipv6_addr_is_multicast(addr);
 
in6_dev = in6_dev_get(dev);
-   if (!in6_dev) {
+   if (!in6_dev)
return -EINVAL;
-   }
 
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
@@ -344,12 +345,12 @@ static int ndisc_constructor(struct neighbour *neigh)
if (is_multicast) {
neigh->nud_state = NUD_NOARP;
ndisc_mc_map(addr, neigh->ha, dev, 1);
-   } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+   } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-   if (dev->flags_LOOPBACK)
+   if (dev->flags & IFF_LOOPBACK)
neigh->type = RTN_LOCAL;
-   

[PATCH net-next v3] net: ipv6: fix code style error and warning of ndisc.c

2017-05-20 Thread yuan linyu
From: yuan linyu 

Signed-off-by: yuan linyu 
---
 net/ipv6/ndisc.c | 310 +--
 1 file changed, 163 insertions(+), 147 deletions(-)

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d310dc4..7528c4c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -12,8 +12,7 @@
  *  2 of the License, or (at your option) any later version.
  */
 
-/*
- * Changes:
+/* Changes:
  *
  * Alexey I. Froloff   :   RFC6106 (DNSSL) support
  * Pierre Ynard:   export userland ND options
@@ -99,7 +98,6 @@ static const struct neigh_ops ndisc_hh_ops = {
.connected_output = neigh_resolve_output,
 };
 
-
 static const struct neigh_ops ndisc_direct_ops = {
.family =   AF_INET6,
.output =   neigh_direct_output,
@@ -147,13 +145,13 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int 
type, void *data,
u8 *opt = skb_put(skb, space);
 
opt[0] = type;
-   opt[1] = space>>3;
+   opt[1] = space >> 3;
 
memset(opt + 2, 0, pad);
opt   += pad;
space -= pad;
 
-   memcpy(opt+2, data, data_len);
+   memcpy(opt + 2, data, data_len);
data_len += 2;
opt += data_len;
space -= data_len;
@@ -182,6 +180,7 @@ static struct nd_opt_hdr *ndisc_next_option(struct 
nd_opt_hdr *cur,
struct nd_opt_hdr *end)
 {
int type;
+
if (!cur || !end || cur >= end)
return NULL;
type = cur->nd_opt_type;
@@ -222,6 +221,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
int l;
+
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
@@ -240,13 +240,15 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
  "%s: duplicated ND6 option found: 
type=%d\n",
  __func__, nd_opt->nd_opt_type);
} else {
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   ndopts->nd_opt_array[nd_opt->nd_opt_type] =
+   nd_opt;
}
break;
case ND_OPT_PREFIX_INFO:
ndopts->nd_opts_pi_end = nd_opt;
if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
-   ndopts->nd_opt_array[nd_opt->nd_opt_type] = 
nd_opt;
+   ndopts->nd_opt_array[nd_opt->nd_opt_type] =
+   nd_opt;
break;
 #ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
@@ -261,8 +263,7 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
} else {
-   /*
-* Unknown options must be silently ignored,
+   /* Unknown options must be silently ignored,
 * to accommodate future extension to the
 * protocol.
 */
@@ -280,7 +281,8 @@ struct ndisc_options *ndisc_parse_options(const struct 
net_device *dev,
return ndopts;
 }
 
-int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device 
*dev, int dir)
+int ndisc_mc_map(const struct in6_addr *addr, char *buf,
+struct net_device *dev, int dir)
 {
switch (dev->type) {
case ARPHRD_ETHER:
@@ -327,9 +329,8 @@ static int ndisc_constructor(struct neighbour *neigh)
bool is_multicast = ipv6_addr_is_multicast(addr);
 
in6_dev = in6_dev_get(dev);
-   if (!in6_dev) {
+   if (!in6_dev)
return -EINVAL;
-   }
 
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
@@ -344,12 +345,12 @@ static int ndisc_constructor(struct neighbour *neigh)
if (is_multicast) {
neigh->nud_state = NUD_NOARP;
ndisc_mc_map(addr, neigh->ha, dev, 1);
-   } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+   } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-   if (dev->flags_LOOPBACK)
+   if (dev->flags & IFF_LOOPBACK)
neigh->type = RTN_LOCAL;
-   

Re: [PATCH net-next] geneve: always fill CSUM6_RX configuration

2017-05-20 Thread Eric Garver
On Fri, May 19, 2017 at 06:57:46PM -0700, Pravin Shelar wrote:
> On Thu, May 18, 2017 at 12:59 PM, Eric Garver  wrote:
> > CSMU6_RX is relevant for collect_metadata as well. As such leave it
> > outside of the dev's IPv4/IPv6 checks.
> >
> Can you explain it bit? is this flag used with ipv4 tunnels?

It's used with collect_metadata as both ipv4 and ipv6 sockets will be
created.

openvswitch recently gained support for creating tunnels with rtnetlink.
It sets COLLECT_METADATA and CSUM6_RX. After create, it does a get to
verify the device got created with all the requested configuration. The
verify was failing due to CSUM6_RX not being returned.

Since ip_tunnel_info_af() defaults to returning AF_INET, we fall into
the IPv4 case and CSUM6_RX is never returned. Other relevant areas that
call ip_tunnel_info_af() do so using the info from the skb, not the
geneve_dev.

I hope that made it clear. Thanks for taking a look.

Eric.

> > Fixes: 9b4437a5b870 ("geneve: Unify LWT and netdev handling.")
> > Signed-off-by: Eric Garver 
> > ---
> >  drivers/net/geneve.c | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> > index dec5d563ab19..f557d1dc3f9b 100644
> > --- a/drivers/net/geneve.c
> > +++ b/drivers/net/geneve.c
> > @@ -1311,13 +1311,13 @@ static int geneve_fill_info(struct sk_buff *skb, 
> > const struct net_device *dev)
> > if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
> >!(info->key.tun_flags & TUNNEL_CSUM)))
> > goto nla_put_failure;
> > -
> > -   if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
> > -  !geneve->use_udp6_rx_checksums))
> > -   goto nla_put_failure;
> >  #endif
> > }
> >
> > +   if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
> > +  !geneve->use_udp6_rx_checksums))
> > +   goto nla_put_failure;
> > +
> > if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
> > nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
> > nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
> > --
> > 2.12.0
> >


[patch net-next 1/2] net/sched: properly assign RCU pointer in tcf_chain_tp_insert/remove

2017-05-20 Thread Jiri Pirko
From: Jiri Pirko 

*p_filter_chain is rcu-dereferenced on reader path. So here in writer,
property assign the pointer.

Fixes: 2190d1d0944f ("net: sched: introduce helpers to work with filter chains")
Signed-off-by: Jiri Pirko 
---
 net/sched/cls_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4020b8d..85088ed 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -351,7 +351,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
 {
if (chain->p_filter_chain &&
*chain_info->pprev == chain->filter_chain)
-   *chain->p_filter_chain = tp;
+   rcu_assign_pointer(*chain->p_filter_chain, tp);
RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
rcu_assign_pointer(*chain_info->pprev, tp);
 }
@@ -363,7 +363,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
struct tcf_proto *next = rtnl_dereference(chain_info->next);
 
if (chain->p_filter_chain && tp == chain->filter_chain)
-   *chain->p_filter_chain = next;
+   RCU_INIT_POINTER(*chain->p_filter_chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
 }
 
-- 
2.9.3



[patch net-next 2/2] net/sched: fix filter flushing

2017-05-20 Thread Jiri Pirko
From: Jiri Pirko 

When user instructs to remove all filters from chain, we cannot destroy
the chain as other actions may hold a reference. Also the put in errout
would try to destroy it again. So instead, just walk the chain and remove
all existing filters.

Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters")
Signed-off-by: Jiri Pirko 
---
 net/sched/cls_api.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 85088ed..01a8b8b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -201,15 +201,22 @@ static struct tcf_chain *tcf_chain_create(struct 
tcf_block *block,
return chain;
 }
 
-static void tcf_chain_destroy(struct tcf_chain *chain)
+static void tcf_chain_flush(struct tcf_chain *chain)
 {
struct tcf_proto *tp;
 
-   list_del(>list);
+   if (*chain->p_filter_chain)
+   RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
RCU_INIT_POINTER(chain->filter_chain, tp->next);
tcf_proto_destroy(tp);
}
+}
+
+static void tcf_chain_destroy(struct tcf_chain *chain)
+{
+   list_del(>list);
+   tcf_chain_flush(chain);
kfree(chain);
 }
 
@@ -510,7 +517,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct 
nlmsghdr *n,
 
if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
-   tcf_chain_destroy(chain);
+   tcf_chain_flush(chain);
err = 0;
goto errout;
}
-- 
2.9.3



[PATCH net] netfilter: do not hold dev in ipt_CLUSTERIP

2017-05-20 Thread Xin Long
It's a terrible thing to hold dev in iptables target. When the dev is
being removed, unregister_netdevice has to wait for the dev to become
free. dmesg will keep logging the err:

  kernel:unregister_netdevice: waiting for veth0_in to become free. \
  Usage count = 1

until iptables rules with this target are removed manually.

The worse thing is when deleting a netns, a virtual nic will be deleted
instead of reset to init_net in default_device_ops exit/exit_batch. As
it is earlier than to flush the iptables rules in iptable_filter_net_ops
exit, unregister_netdevice will block to wait for the nic to become free.

As unregister_netdevice is actually waiting for iptables rules flushing
while iptables rules have to be flushed after unregister_netdevice. This
'dead lock' will cause unregister_netdevice to block there forever. As
the netns is not available to operate at that moment, iptables rules can
not even be flushed manually either.

The reproducer can be:

  # ip netns add test
  # ip link add veth0_in type veth peer name veth0_out
  # ip link set veth0_in netns test
  # ip netns exec test ip link set lo up
  # ip netns exec test ip link set veth0_in up
  # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \
CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \
--local-node 1 --hashmode sourceip-sourceport
  # ip netns del test

This issue can be triggered by all virtual nics with ipt_CLUSTERIP.

This patch is to fix it by not holding dev in ipt_CLUSTERIP, but only
save dev->ifindex instead of dev. When removing the mc from the dev,
it will get dev by c->ifindex through dev_get_by_index.

Note that it doesn't save dev->name but dev->ifindex, as a dev->name
can be changed, it will confuse ipt_CLUSTERIP.

Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 038f293..d1adb2f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -47,7 +47,7 @@ struct clusterip_config {
 
__be32 clusterip;   /* the IP address */
u_int8_t clustermac[ETH_ALEN];  /* the MAC address */
-   struct net_device *dev; /* device */
+   int ifindex;/* device ifindex */
u_int16_t num_total_nodes;  /* total number of nodes */
unsigned long local_nodes;  /* node number array */
 
@@ -98,19 +98,23 @@ clusterip_config_put(struct clusterip_config *c)
  * entry(rule) is removed, remove the config from lists, but don't free it
  * yet, since proc-files could still be holding references */
 static inline void
-clusterip_config_entry_put(struct clusterip_config *c)
+clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
 {
-   struct net *net = dev_net(c->dev);
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
local_bh_disable();
if (refcount_dec_and_lock(>entries, >lock)) {
+   struct net_device *dev;
+
list_del_rcu(>list);
spin_unlock(>lock);
local_bh_enable();
 
-   dev_mc_del(c->dev, c->clustermac);
-   dev_put(c->dev);
+   dev = dev_get_by_index(net, c->ifindex);
+   if (dev) {
+   dev_mc_del(dev, c->clustermac);
+   dev_put(dev);
+   }
 
/* In case anyone still accesses the file, the open/close
 * functions are also incrementing the refcount on their own,
@@ -182,7 +186,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info 
*i, __be32 ip,
if (!c)
return ERR_PTR(-ENOMEM);
 
-   c->dev = dev;
+   c->ifindex = dev->ifindex;
c->clusterip = ip;
memcpy(>clustermac, >clustermac, ETH_ALEN);
c->num_total_nodes = i->num_total_nodes;
@@ -427,12 +431,14 @@ static int clusterip_tg_check(const struct xt_tgchk_param 
*par)
}
 
config = clusterip_config_init(cipinfo,
-   e->ip.dst.s_addr, dev);
+  e->ip.dst.s_addr, dev);
if (IS_ERR(config)) {
dev_put(dev);
return PTR_ERR(config);
}
-   dev_mc_add(config->dev, config->clustermac);
+
+   dev_mc_add(dev, config->clustermac);
+   dev_put(dev);
}
}
cipinfo->config = config;
@@ -458,7 +464,7 @@ static void clusterip_tg_destroy(const struct 
xt_tgdtor_param *par)
 
/* if no more entries 

Re: [RFC net-next PATCH 3/5] net: introduce XDP driver features interface

2017-05-20 Thread Jesper Dangaard Brouer
On Fri, 19 May 2017 19:13:29 +0200
Daniel Borkmann  wrote:

> On 05/18/2017 05:41 PM, Jesper Dangaard Brouer wrote:
> > There is a fundamental difference between normal eBPF programs
> > and (XDP) eBPF programs getting attached in a driver. For normal
> > eBPF programs it is easy to add a new bpf feature, like a bpf
> > helper, because is it strongly tied to the feature being
> > available in the current core kernel code.  When drivers invoke a
> > bpf_prog, then it is not sufficient to simply relying on whether
> > a bpf_helper exists or not.  When a driver haven't implemented a
> > given feature yet, then it is possible to expose uninitialized
> > parts of xdp_buff.  The driver pass in a pointer to xdp_buff,
> > usually "allocated" on the stack, which must not be exposed.  
> 
> When xdp_buff is being extended, then we should at least zero
> initialize all in-tree users that don't support or populate this
> field, thus that it's not uninitialized memory. Better would be
> to have a way to reject the prog in the first place until it's
> implemented (but further comments on feature bits below).

Going down a path where we need to zero out the xdp_buff looks a lot
like the sk_buff zeroing, which is the top perf cost associated with
SKBs see[1].  XDP is is about not repeating the same issue we had with
the SKB...

[1] 
https://prototype-kernel.readthedocs.io/en/latest/blogposts/xdp25_eval_generic_xdp_tx.html#analyzing-build-skb-and-memset


> > Only two user visible NETIF_F_XDP_* net_device feature flags are
> > exposed via ethtool (-k) seen as "xdp" and "xdp-partial".
> > The "xdp-partial" is detected when there is not feature equality
> > between kernel and driver, and a netdev_warn is given.  
> 
> I think having something like a NETIF_F_XDP_BIT for ethtool to
> indicate support as "xdp" is quite useful. Avoids having to grep
> the kernel tree for ndo_xdp callback. ;) A "xdp-partial" would
> still be unclear/confusing to the user whether his program loads
> or doesn't which is the only thing a user (or some loading infra)
> cares about eventually, so one still needs to go trying to load
> the XDP code to see whether that fails for the native case.

Good that we agree on usefulness of the NETIF_F_XDP_BIT.  The
"xdp-partial" or "xdp-challenged" is an early indication to the user
that they should complain to the vendor.  I tried to keep it simple
towards the user. Do you think every feature bit should be exposed to
userspace?


> > The idea is that XDP_DRV_* feature bits define a contract between
> > the driver and the kernel, giving a reliable way to know that XDP
> > features a driver promised to implement. Thus, knowing what bpf
> > side features are safe to allow.
> >
> > There are 3 levels of features: "required", "devel" and "optional".
> >
> > The motivation is pushing driver vendors forward to support all
> > the new XDP features.  Once a given feature bit is moved into
> > the "required" features, the kernel will reject loading XDP
> > program if feature isn't implemented by driver.  Features under
> > developement, require help from the bpf infrastrucure to detect
> > when a given helper or direct-access is used, using a bpf_prog
> > bit to mark a need for the feature, and pulling in this bit in
> > the xdp_features_check().  When all drivers have implemented
> > a "devel" feature, it can be moved to the "required" feature and  
> 
> The problem is that once you add bits markers to bpf_prog like we
> used to do in the past, then as you do in patch 4/5 with the
> xdp_rxhash_needed bit, they will need to be turned /on/ unconditionally
> when a prog has tail calls.

Yes, with tail calls, we have to enable all features.  But that is a
good thing, as it forces vendors to quickly implement all features.
And it is no different from moving a feature into the "required" bits,
once all drivers implement it.  It is only a limitation for tail calls,
and something we can fix later (for handling this for tail calls).

BPF have some nice features of evaluating the input program
"load-time", which is what I'm taking advantage of as an optimization
here (let use this nice bpf property).   It is only tail calls that
cannot evaluate this "load-time".  Thus, if you care about tail calls,
supporting intermediate features, we could later fix that by adding a
runtime feature check in the case of tail calls.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH net-next v2] bridge: fix hello and hold timers starting/stopping

2017-05-20 Thread Hangbin Liu
On Sat, May 20, 2017 at 09:06:16AM +0200, Ivan Vecera wrote:
> 2017-05-20 7:57 GMT+02:00 Hangbin Liu :
> > On Fri, May 19, 2017 at 07:30:43PM +0200, Ivan Vecera wrote:
> >> Current bridge code incorrectly handles starting/stopping of hello and
> >> hold timers during STP enable/disable.
> >>
> >> 1. Timers are stopped in br_stp_start() during NO_STP->USER_STP
> >>transition. The timers are already stopped in NO_STP state so
> >>this is confusing no-op.
> >
> > Hi Ivan,
> >
> > Shouldn't we start hello timer in br_stp_start when NO_STP -> BR_KERNEL_STP 
> > ?
> 
> As Nikolay mentioned, this is fixed by
> https://patchwork.ozlabs.org/patch/764685/

Ah, sorry. My mistake. I only saw xin's patch and your v2 patch. So I mixed
them up and thought this is xin's V2 patch. That's why I wonder we didn't
start hello timer in br_stp_start...

Now I see your v1 patch with:

The patch is a follow-up for "bridge: start hello_timer when enabling
KERNEL_STP in br_stp_start" patch from Xin Long."

Sorry for mixed them up.
> 
> >>
> >> 2. During USER_STP->NO_STP transition the timers are started. This
> >>does not make sense and is confusion because the timer should not be
> >>active in NO_STP state.
> >
> > Yes, but what about BR_KERNEL_STP -> NO_STP in function br_stp_stop() ?
> 
> The timer is lazily stopped by itself in its handler... or not rearmed
> respectively.

Yes, with xin's patch this timer will stoped by itself.

Thanks
Hangbin


Re: [PATCH] net: sched: fix a use-after-free error on chain on the error exit path

2017-05-20 Thread Jiri Pirko
Fri, May 19, 2017 at 07:17:59PM CEST, xiyou.wangc...@gmail.com wrote:
>On Thu, May 18, 2017 at 7:07 AM, Colin King  wrote:
>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>> index 4020b8d932a1..82ebdc3fcb2e 100644
>> --- a/net/sched/cls_api.c
>> +++ b/net/sched/cls_api.c
>> @@ -511,6 +511,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct 
>> nlmsghdr *n,
>> if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
>> tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
>> tcf_chain_destroy(chain);
>
>
>Jiri, how does this work...? An action could hold a refcnt to a filter
>chain, but here you destroy a whole chain without respecting
>the refcnt???

Correct. I missed this. Will fix, thanks.


>
>
>> +   chain = NULL;
>> err = 0;
>> goto errout;
>
>Colin, not your fault, I think we may miss something more serious
>when reviewing Jiri's patchset. ;)
>
>Thanks.


Re: [PATCH net-next v2] bridge: fix hello and hold timers starting/stopping

2017-05-20 Thread Ivan Vecera
2017-05-20 7:57 GMT+02:00 Hangbin Liu :
> On Fri, May 19, 2017 at 07:30:43PM +0200, Ivan Vecera wrote:
>> Current bridge code incorrectly handles starting/stopping of hello and
>> hold timers during STP enable/disable.
>>
>> 1. Timers are stopped in br_stp_start() during NO_STP->USER_STP
>>transition. The timers are already stopped in NO_STP state so
>>this is confusing no-op.
>
> Hi Ivan,
>
> Shouldn't we start hello timer in br_stp_start when NO_STP -> BR_KERNEL_STP ?

As Nikolay mentioned, this is fixed by
https://patchwork.ozlabs.org/patch/764685/

>>
>> 2. During USER_STP->NO_STP transition the timers are started. This
>>does not make sense and is confusion because the timer should not be
>>active in NO_STP state.
>
> Yes, but what about BR_KERNEL_STP -> NO_STP in function br_stp_stop() ?

The timer is lazily stopped by itself in its handler... or not rearmed
respectively.

>> Cc: da...@davemloft.net
>> Cc: sas...@cumulusnetworks.com
>> Cc: step...@networkplumber.org
>> Cc: bri...@lists.linux-foundation.org
>> Cc: lucien@gmail.com
>> Cc: niko...@cumulusnetworks.com
>> Signed-off-by: Ivan Vecera 
>> ---
>>  net/bridge/br_stp_if.c | 11 ---
>>  1 file changed, 11 deletions(-)
>>
>> diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
>> index 08341d2aa9c9..a05027027513 100644
>> --- a/net/bridge/br_stp_if.c
>> +++ b/net/bridge/br_stp_if.c
>> @@ -150,7 +150,6 @@ static int br_stp_call_user(struct net_bridge *br, char 
>> *arg)
>>
>>  static void br_stp_start(struct net_bridge *br)
>>  {
>> - struct net_bridge_port *p;
>>   int err = -ENOENT;
>>
>>   if (net_eq(dev_net(br->dev), _net))
>> @@ -169,11 +168,6 @@ static void br_stp_start(struct net_bridge *br)
>>   if (!err) {
>>   br->stp_enabled = BR_USER_STP;
>>   br_debug(br, "userspace STP started\n");
>> -
>> - /* Stop hello and hold timers */
>> - del_timer(>hello_timer);
>> - list_for_each_entry(p, >port_list, list)
>> - del_timer(>hold_timer);
>
> I'm not sure if user space daemon will send bpdu or not? In comment
> 76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and
> hold timers"). Nikolay said we should not handle it with BR_USER_STP.
>
>>   } else {
>>   br->stp_enabled = BR_KERNEL_STP;
>>   br_debug(br, "using kernel STP\n");
>> @@ -187,7 +181,6 @@ static void br_stp_start(struct net_bridge *br)
>>
>>  static void br_stp_stop(struct net_bridge *br)
>>  {
>> - struct net_bridge_port *p;
>>   int err;
>>
>>   if (br->stp_enabled == BR_USER_STP) {
>> @@ -196,10 +189,6 @@ static void br_stp_stop(struct net_bridge *br)
>>   br_err(br, "failed to stop userspace STP (%d)\n", err);
>>
>>   /* To start timers on any ports left in blocking */
>> - mod_timer(>hello_timer, jiffies + br->hello_time);
>> - list_for_each_entry(p, >port_list, list)
>> - mod_timer(>hold_timer,
>> -   round_jiffies(jiffies + BR_HOLD_TIME));
>
> If we do not del hello_timer. after it expired in br_hello_timer_expired(),
> Our state is br->dev->flags & IFF_UP and br->stp_enabled == NO_STP, it will
> call mod_timer(>hello_timer, round_jiffies(jiffies + br->hello_time))
> and we will keep sending bpdu message even after stp stoped.
>
>>   spin_lock_bh(>lock);
>>   br_port_state_selection(br);
>>   spin_unlock_bh(>lock);
>> --
>
> So how about just like
>
> diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
> index d8ad73b..0198f62 100644
> --- a/net/bridge/br_stp_if.c
> +++ b/net/bridge/br_stp_if.c
> @@ -183,6 +183,7 @@ static void br_stp_start(struct net_bridge *br)
> } else {
> br->stp_enabled = BR_KERNEL_STP;
> br_debug(br, "using kernel STP\n");
> +   mod_timer(>hello_timer, jiffies + br->hello_time);
>
> /* To start timers on any ports left in blocking */
> br_port_state_selection(br);
> @@ -202,7 +203,6 @@ static void br_stp_stop(struct net_bridge *br)
> br_err(br, "failed to stop userspace STP (%d)\n", 
> err);
>
> /* To start timers on any ports left in blocking */
> -   mod_timer(>hello_timer, jiffies + br->hello_time);
> list_for_each_entry(p, >port_list, list)
> mod_timer(>hold_timer,
>   round_jiffies(jiffies + BR_HOLD_TIME));
> @@ -211,6 +211,7 @@ static void br_stp_stop(struct net_bridge *br)
> spin_unlock_bh(>lock);
> }
>
> +   del_timer_sync(>hello_timer);
> br->stp_enabled = BR_NO_STP;
>  }
>
> Thanks
> Hangbin


Re: [PATCH net-next v2] bridge: fix hello and hold timers starting/stopping

2017-05-20 Thread Nikolay Aleksandrov

On 5/20/17 8:57 AM, Hangbin Liu wrote:

On Fri, May 19, 2017 at 07:30:43PM +0200, Ivan Vecera wrote:

Current bridge code incorrectly handles starting/stopping of hello and
hold timers during STP enable/disable.

1. Timers are stopped in br_stp_start() during NO_STP->USER_STP
transition. The timers are already stopped in NO_STP state so
this is confusing no-op.


Hi Ivan,

Shouldn't we start hello timer in br_stp_start when NO_STP -> BR_KERNEL_STP ?


Please see Xin Long's recent -net patch that fixes exactly this issue.
It will answer your questions below, too.

https://patchwork.ozlabs.org/patch/764685/



2. During USER_STP->NO_STP transition the timers are started. This
does not make sense and is confusion because the timer should not be
active in NO_STP state.


Yes, but what about BR_KERNEL_STP -> NO_STP in function br_stp_stop() ?


Cc: da...@davemloft.net
Cc: sas...@cumulusnetworks.com
Cc: step...@networkplumber.org
Cc: bri...@lists.linux-foundation.org
Cc: lucien@gmail.com
Cc: niko...@cumulusnetworks.com
Signed-off-by: Ivan Vecera 
---
  net/bridge/br_stp_if.c | 11 ---
  1 file changed, 11 deletions(-)

diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 08341d2aa9c9..a05027027513 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -150,7 +150,6 @@ static int br_stp_call_user(struct net_bridge *br, char 
*arg)
  
  static void br_stp_start(struct net_bridge *br)

  {
-   struct net_bridge_port *p;
int err = -ENOENT;
  
  	if (net_eq(dev_net(br->dev), _net))

@@ -169,11 +168,6 @@ static void br_stp_start(struct net_bridge *br)
if (!err) {
br->stp_enabled = BR_USER_STP;
br_debug(br, "userspace STP started\n");
-
-   /* Stop hello and hold timers */
-   del_timer(>hello_timer);
-   list_for_each_entry(p, >port_list, list)
-   del_timer(>hold_timer);


I'm not sure if user space daemon will send bpdu or not? In comment
76b91c32dd86 ("bridge: stp: when using userspace stp stop kernel hello and
hold timers"). Nikolay said we should not handle it with BR_USER_STP >

} else {
br->stp_enabled = BR_KERNEL_STP;
br_debug(br, "using kernel STP\n");
@@ -187,7 +181,6 @@ static void br_stp_start(struct net_bridge *br)
  
  static void br_stp_stop(struct net_bridge *br)

  {
-   struct net_bridge_port *p;
int err;
  
  	if (br->stp_enabled == BR_USER_STP) {

@@ -196,10 +189,6 @@ static void br_stp_stop(struct net_bridge *br)
br_err(br, "failed to stop userspace STP (%d)\n", err);
  
  		/* To start timers on any ports left in blocking */

-   mod_timer(>hello_timer, jiffies + br->hello_time);
-   list_for_each_entry(p, >port_list, list)
-   mod_timer(>hold_timer,
- round_jiffies(jiffies + BR_HOLD_TIME));


If we do not del hello_timer. after it expired in br_hello_timer_expired(),
Our state is br->dev->flags & IFF_UP and br->stp_enabled == NO_STP, it will
call mod_timer(>hello_timer, round_jiffies(jiffies + br->hello_time))
and we will keep sending bpdu message even after stp stoped.


Again see Xin Long's recent -net patch.




spin_lock_bh(>lock);
br_port_state_selection(br);
spin_unlock_bh(>lock);
--


So how about just like

diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index d8ad73b..0198f62 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -183,6 +183,7 @@ static void br_stp_start(struct net_bridge *br)
 } else {
 br->stp_enabled = BR_KERNEL_STP;
 br_debug(br, "using kernel STP\n");
+   mod_timer(>hello_timer, jiffies + br->hello_time);

 /* To start timers on any ports left in blocking */
 br_port_state_selection(br);
@@ -202,7 +203,6 @@ static void br_stp_stop(struct net_bridge *br)
 br_err(br, "failed to stop userspace STP (%d)\n", err);

 /* To start timers on any ports left in blocking */
-   mod_timer(>hello_timer, jiffies + br->hello_time);
 list_for_each_entry(p, >port_list, list)
 mod_timer(>hold_timer,
   round_jiffies(jiffies + BR_HOLD_TIME));
@@ -211,6 +211,7 @@ static void br_stp_stop(struct net_bridge *br)
 spin_unlock_bh(>lock);
 }

+   del_timer_sync(>hello_timer);
 br->stp_enabled = BR_NO_STP;
  }

Thanks
Hangbin





Re: [PATCH net-next 6/9] xfrm: make xfrm_dev_register static

2017-05-20 Thread Steffen Klassert
On Fri, May 19, 2017 at 09:55:53AM -0700, Stephen Hemminger wrote:
> This function is only used in this file and should not be global.
> 
> Signed-off-by: Stephen Hemminger 
> ---
>  net/xfrm/xfrm_device.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
> index 8ec8a3fcf8d4..50ec73399b48 100644
> --- a/net/xfrm/xfrm_device.c
> +++ b/net/xfrm/xfrm_device.c
> @@ -138,7 +138,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct 
> xfrm_state *x)
>  }
>  EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);
>  
> -int xfrm_dev_register(struct net_device *dev)
> +static int xfrm_dev_register(struct net_device *dev)

I've applied a patch with this exact fix already to the
ipsec-next tree yesterday.