[PATCH net-next] mlxsw: spectrum_router: Simplify VRF enslavement

2017-04-30 Thread idosch
From: Ido Schimmel 

When a netdev is enslaved to a VRF master, its router interface (RIF)
needs to be destroyed (if exists) and a new one created using the
corresponding virtual router (VR).

>From the driver's perspective, the above is equivalent to an inetaddr
event sent for this netdev. Therefore, when a port netdev (or its
uppers) are enslaved to a VRF master, call the same function that
would've been called had a NETDEV_UP was sent for this netdev in the
inetaddr notification chain.

This patch also fixes a bug when a LAG netdev with an existing RIF is
enslaved to a VRF. Before this patch, each LAG port would drop the
reference on the RIF, but would re-join the same one (in the wrong VR)
soon after. With this patch, the corresponding RIF is first destroyed
and a new one is created using the correct VR.

Fixes: 7179eb5acd59 ("mlxsw: spectrum_router: Add support for VRFs")
Signed-off-by: Ido Schimmel 
Reviewed-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |  77 +++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  10 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 107 +
 3 files changed, 63 insertions(+), 131 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 20c1b6c..88357ce 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4106,7 +4106,6 @@ static int mlxsw_sp_netdevice_port_upper_event(struct 
net_device *dev,
if (!is_vlan_dev(upper_dev) &&
!netif_is_lag_master(upper_dev) &&
!netif_is_bridge_master(upper_dev) &&
-   !netif_is_l3_master(upper_dev) &&
!netif_is_ovs_master(upper_dev))
return -EINVAL;
if (!info->linking)
@@ -4151,11 +4150,6 @@ static int mlxsw_sp_netdevice_port_upper_event(struct 
net_device *dev,
else
mlxsw_sp_port_lag_leave(mlxsw_sp_port,
upper_dev);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_port_vrf_join(mlxsw_sp_port);
-   else
-   mlxsw_sp_port_vrf_leave(mlxsw_sp_port);
} else if (netif_is_ovs_master(upper_dev)) {
if (info->linking)
err = mlxsw_sp_port_ovs_join(mlxsw_sp_port);
@@ -4275,7 +4269,7 @@ static int mlxsw_sp_netdevice_bridge_event(struct 
net_device *br_dev,
switch (event) {
case NETDEV_PRECHANGEUPPER:
upper_dev = info->upper_dev;
-   if (!is_vlan_dev(upper_dev) && !netif_is_l3_master(upper_dev))
+   if (!is_vlan_dev(upper_dev))
return -EINVAL;
if (is_vlan_dev(upper_dev) &&
br_dev != mlxsw_sp->master_bridge.dev)
@@ -4290,12 +4284,6 @@ static int mlxsw_sp_netdevice_bridge_event(struct 
net_device *br_dev,
else
mlxsw_sp_master_bridge_vlan_unlink(mlxsw_sp,
   upper_dev);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_bridge_vrf_join(mlxsw_sp,
-  br_dev);
-   else
-   mlxsw_sp_bridge_vrf_leave(mlxsw_sp, br_dev);
} else {
err = -EINVAL;
WARN_ON(1);
@@ -4529,8 +4517,7 @@ static int mlxsw_sp_netdevice_vport_event(struct 
net_device *dev,
switch (event) {
case NETDEV_PRECHANGEUPPER:
upper_dev = info->upper_dev;
-   if (!netif_is_bridge_master(upper_dev) &&
-   !netif_is_l3_master(upper_dev))
+   if (!netif_is_bridge_master(upper_dev))
return -EINVAL;
if (!info->linking)
break;
@@ -4550,11 +4537,6 @@ static int mlxsw_sp_netdevice_vport_event(struct 
net_device *dev,
 upper_dev);
else
mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport);
-   } else if (netif_is_l3_master(upper_dev)) {
-   if (info->linking)
-   err = mlxsw_sp_vport_vrf_join(mlxsw_sp_vport);
-   else
-   mlxsw_sp_vport_vrf_leave(mlxsw_sp_vport);
} else {
err = -EINVAL;
  

[PATCH net v3 2/2] bridge: netlink: register netdevice before executing changelink

2017-04-10 Thread idosch
From: Ido Schimmel 

Peter reported a kernel oops when executing the following command:

$ ip link add name test type bridge vlan_default_pvid 1

[13634.939408] BUG: unable to handle kernel NULL pointer dereference at
0190
[13634.939436] IP: __vlan_add+0x73/0x5f0
[...]
[13634.939783] Call Trace:
[13634.939791]  ? pcpu_next_unpop+0x3b/0x50
[13634.939801]  ? pcpu_alloc+0x3d2/0x680
[13634.939810]  ? br_vlan_add+0x135/0x1b0
[13634.939820]  ? __br_vlan_set_default_pvid.part.28+0x204/0x2b0
[13634.939834]  ? br_changelink+0x120/0x4e0
[13634.939844]  ? br_dev_newlink+0x50/0x70
[13634.939854]  ? rtnl_newlink+0x5f5/0x8a0
[13634.939864]  ? rtnl_newlink+0x176/0x8a0
[13634.939874]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.939886]  ? rtnetlink_rcv_msg+0xe1/0x220
[13634.939896]  ? lookup_fast+0x52/0x370
[13634.939905]  ? rtnl_newlink+0x8a0/0x8a0
[13634.939915]  ? netlink_rcv_skb+0xa1/0xc0
[13634.939925]  ? rtnetlink_rcv+0x24/0x30
[13634.939934]  ? netlink_unicast+0x177/0x220
[13634.939944]  ? netlink_sendmsg+0x2fe/0x3b0
[13634.939954]  ? _copy_from_user+0x39/0x40
[13634.939964]  ? sock_sendmsg+0x30/0x40
[13634.940159]  ? ___sys_sendmsg+0x29d/0x2b0
[13634.940326]  ? __alloc_pages_nodemask+0xdf/0x230
[13634.940478]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.940592]  ? mem_cgroup_try_charge+0x76/0x1a0
[13634.940701]  ? __handle_mm_fault+0xdb9/0x10b0
[13634.940809]  ? __sys_sendmsg+0x51/0x90
[13634.940917]  ? entry_SYSCALL_64_fastpath+0x1e/0xad

The problem is that the bridge's VLAN group is created after setting the
default PVID, when registering the netdevice and executing its
ndo_init().

Fix this by changing the order of both operations, so that
br_changelink() is only processed after the netdevice is registered,
when the VLAN group is already initialized.

Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during 
br_dev_newlink()")
Signed-off-by: Nikolay Aleksandrov 
Signed-off-by: Ido Schimmel 
Reported-by: Peter V. Saveliev 
Tested-by: Peter V. Saveliev 
---
v2->v3:
* Drop goto
v1->v2:
* Error-out on changelink() failure

 net/bridge/br_netlink.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a8f6acd..225ef7d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1165,11 +1165,14 @@ static int br_dev_newlink(struct net *src_net, struct 
net_device *dev,
spin_unlock_bh(>lock);
}
 
-   err = br_changelink(dev, tb, data);
+   err = register_netdevice(dev);
if (err)
return err;
 
-   return register_netdevice(dev);
+   err = br_changelink(dev, tb, data);
+   if (err)
+   unregister_netdevice(dev);
+   return err;
 }
 
 static size_t br_get_size(const struct net_device *brdev)
-- 
2.9.3



[PATCH net v3 0/2] bridge: Fix kernel oops during bridge creation

2017-04-10 Thread idosch
From: Ido Schimmel 

First patch adds a missing ndo_uninit() in the bridge driver, which is a
prerequisite for the second patch that actually fixes the oops.

Please consider both patches for 4.4.y, 4.9.y and 4.10.y

Ido Schimmel (2):
  bridge: implement missing ndo_uninit()
  bridge: netlink: register netdevice before executing changelink

 net/bridge/br_device.c| 20 +++-
 net/bridge/br_if.c|  1 -
 net/bridge/br_multicast.c |  7 +--
 net/bridge/br_netlink.c   |  7 +--
 net/bridge/br_private.h   |  5 +
 5 files changed, 26 insertions(+), 14 deletions(-)

-- 
2.9.3



[PATCH net v3 1/2] bridge: implement missing ndo_uninit()

2017-04-10 Thread idosch
From: Ido Schimmel 

While the bridge driver implements an ndo_init(), it was missing a
symmetric ndo_uninit(), causing the different de-initialization
operations to be scattered around its dellink() and destructor().

Implement a symmetric ndo_uninit() and remove the overlapping operations
from its dellink() and destructor().

This is a prerequisite for the next patch, as it allows us to have a
proper cleanup upon changelink() failure during the bridge's newlink().

Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during 
br_dev_newlink()")
Signed-off-by: Nikolay Aleksandrov 
Signed-off-by: Ido Schimmel 
---
v2->v3:
* Use free_netdev() for destructor()
v1->v2:
* New patch

 net/bridge/br_device.c| 20 +++-
 net/bridge/br_if.c|  1 -
 net/bridge/br_multicast.c |  7 +--
 net/bridge/br_private.h   |  5 +
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ea71513..90f49a1 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -119,6 +119,15 @@ static int br_dev_init(struct net_device *dev)
return err;
 }
 
+static void br_dev_uninit(struct net_device *dev)
+{
+   struct net_bridge *br = netdev_priv(dev);
+
+   br_multicast_uninit_stats(br);
+   br_vlan_flush(br);
+   free_percpu(br->stats);
+}
+
 static int br_dev_open(struct net_device *dev)
 {
struct net_bridge *br = netdev_priv(dev);
@@ -332,6 +341,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_open= br_dev_open,
.ndo_stop= br_dev_stop,
.ndo_init= br_dev_init,
+   .ndo_uninit  = br_dev_uninit,
.ndo_start_xmit  = br_dev_xmit,
.ndo_get_stats64 = br_get_stats64,
.ndo_set_mac_address = br_set_mac_address,
@@ -356,14 +366,6 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_features_check  = passthru_features_check,
 };
 
-static void br_dev_free(struct net_device *dev)
-{
-   struct net_bridge *br = netdev_priv(dev);
-
-   free_percpu(br->stats);
-   free_netdev(dev);
-}
-
 static struct device_type br_type = {
.name   = "bridge",
 };
@@ -376,7 +378,7 @@ void br_dev_setup(struct net_device *dev)
ether_setup(dev);
 
dev->netdev_ops = _netdev_ops;
-   dev->destructor = br_dev_free;
+   dev->destructor = free_netdev;
dev->ethtool_ops = _ethtool_ops;
SET_NETDEV_DEVTYPE(dev, _type);
dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8ac1770..56a2a72 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,7 +311,6 @@ void br_dev_delete(struct net_device *dev, struct list_head 
*head)
 
br_fdb_delete_by_port(br, NULL, 0, 1);
 
-   br_vlan_flush(br);
br_multicast_dev_del(br);
cancel_delayed_work_sync(>gc_work);
 
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b760f26..faa7261 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2031,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
 
 out:
spin_unlock_bh(>multicast_lock);
-
-   free_percpu(br->mcast_stats);
 }
 
 int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2531,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
return 0;
 }
 
+void br_multicast_uninit_stats(struct net_bridge *br)
+{
+   free_percpu(br->mcast_stats);
+}
+
 static void mcast_stats_add_dir(u64 *dst, u64 *src)
 {
dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6136818..0d17728 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -620,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct 
net_bridge_port *port,
 void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
 int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
 void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
@@ -760,6 +761,10 @@ static inline int br_multicast_init_stats(struct 
net_bridge *br)
return 0;
 }
 
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
 static inline int br_multicast_igmp_type(const struct sk_buff *skb)
 {
return 0;
-- 
2.9.3



[PATCH net v2 2/2] bridge: netlink: register netdevice before executing changelink

2017-04-08 Thread idosch
From: Ido Schimmel 

Peter reported a kernel oops when executing the following command:

$ ip link add name test type bridge vlan_default_pvid 1

[13634.939408] BUG: unable to handle kernel NULL pointer dereference at
0190
[13634.939436] IP: __vlan_add+0x73/0x5f0
[...]
[13634.939783] Call Trace:
[13634.939791]  ? pcpu_next_unpop+0x3b/0x50
[13634.939801]  ? pcpu_alloc+0x3d2/0x680
[13634.939810]  ? br_vlan_add+0x135/0x1b0
[13634.939820]  ? __br_vlan_set_default_pvid.part.28+0x204/0x2b0
[13634.939834]  ? br_changelink+0x120/0x4e0
[13634.939844]  ? br_dev_newlink+0x50/0x70
[13634.939854]  ? rtnl_newlink+0x5f5/0x8a0
[13634.939864]  ? rtnl_newlink+0x176/0x8a0
[13634.939874]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.939886]  ? rtnetlink_rcv_msg+0xe1/0x220
[13634.939896]  ? lookup_fast+0x52/0x370
[13634.939905]  ? rtnl_newlink+0x8a0/0x8a0
[13634.939915]  ? netlink_rcv_skb+0xa1/0xc0
[13634.939925]  ? rtnetlink_rcv+0x24/0x30
[13634.939934]  ? netlink_unicast+0x177/0x220
[13634.939944]  ? netlink_sendmsg+0x2fe/0x3b0
[13634.939954]  ? _copy_from_user+0x39/0x40
[13634.939964]  ? sock_sendmsg+0x30/0x40
[13634.940159]  ? ___sys_sendmsg+0x29d/0x2b0
[13634.940326]  ? __alloc_pages_nodemask+0xdf/0x230
[13634.940478]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.940592]  ? mem_cgroup_try_charge+0x76/0x1a0
[13634.940701]  ? __handle_mm_fault+0xdb9/0x10b0
[13634.940809]  ? __sys_sendmsg+0x51/0x90
[13634.940917]  ? entry_SYSCALL_64_fastpath+0x1e/0xad

The problem is that the bridge's VLAN group is created after setting the
default PVID, when registering the netdevice and executing its
ndo_init().

Fix this by changing the order of both operations, so that
br_changelink() is only processed after the netdevice is registered,
when the VLAN group is already initialized.

Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during 
br_dev_newlink()")
Signed-off-by: Nikolay Aleksandrov 
Signed-off-by: Ido Schimmel 
Reported-by: Peter V. Saveliev 
---
 net/bridge/br_netlink.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a8f6acd..c38104e 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1165,11 +1165,19 @@ static int br_dev_newlink(struct net *src_net, struct 
net_device *dev,
spin_unlock_bh(>lock);
}
 
-   err = br_changelink(dev, tb, data);
+   err = register_netdevice(dev);
if (err)
return err;
 
-   return register_netdevice(dev);
+   err = br_changelink(dev, tb, data);
+   if (err)
+   goto unregister;
+
+   return 0;
+
+unregister:
+   unregister_netdevice(dev);
+   return err;
 }
 
 static size_t br_get_size(const struct net_device *brdev)
-- 
2.9.3



[PATCH net v2 0/2] bridge: Fix kernel oops during bridge creation

2017-04-08 Thread idosch
From: Ido Schimmel 

First patch adds a missing ndo_uninit() in the bridge driver, which is a
prerequisite for the second patch that actually fixes the oops.

First version was rejected for being "half baked", but given the amount
of changes in this version and that three stable kernels need to be
patched, a better approach might be to simply revert the patch in the
Fixes line and have this patchset go through net-next.

Ido Schimmel (2):
  bridge: implement missing ndo_uninit()
  bridge: netlink: register netdevice before executing changelink

 net/bridge/br_device.c| 13 ++---
 net/bridge/br_if.c|  1 -
 net/bridge/br_multicast.c |  7 +--
 net/bridge/br_netlink.c   | 12 ++--
 net/bridge/br_private.h   |  5 +
 5 files changed, 30 insertions(+), 8 deletions(-)

-- 
2.9.3



[PATCH net v2 1/2] bridge: implement missing ndo_uninit()

2017-04-08 Thread idosch
From: Ido Schimmel 

While the bridge driver implements an ndo_init(), it was missing a
symmetric ndo_uninit(), causing the different de-initialization
operations to be scattered around its dellink() and destructor().

Implement a symmetric ndo_uninit() and remove the overlapping operations
from its dellink() and destructor().

This is a prerequisite for the next patch, as it allows us to have a
proper cleanup upon changelink() failure during the bridge's newlink().

Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during 
br_dev_newlink()")
Signed-off-by: Nikolay Aleksandrov 
Signed-off-by: Ido Schimmel 
---
 net/bridge/br_device.c| 13 ++---
 net/bridge/br_if.c|  1 -
 net/bridge/br_multicast.c |  7 +--
 net/bridge/br_private.h   |  5 +
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ea71513..607210f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -119,6 +119,15 @@ static int br_dev_init(struct net_device *dev)
return err;
 }
 
+static void br_dev_uninit(struct net_device *dev)
+{
+   struct net_bridge *br = netdev_priv(dev);
+
+   br_multicast_uninit_stats(br);
+   br_vlan_flush(br);
+   free_percpu(br->stats);
+}
+
 static int br_dev_open(struct net_device *dev)
 {
struct net_bridge *br = netdev_priv(dev);
@@ -332,6 +341,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_open= br_dev_open,
.ndo_stop= br_dev_stop,
.ndo_init= br_dev_init,
+   .ndo_uninit  = br_dev_uninit,
.ndo_start_xmit  = br_dev_xmit,
.ndo_get_stats64 = br_get_stats64,
.ndo_set_mac_address = br_set_mac_address,
@@ -358,9 +368,6 @@ static const struct net_device_ops br_netdev_ops = {
 
 static void br_dev_free(struct net_device *dev)
 {
-   struct net_bridge *br = netdev_priv(dev);
-
-   free_percpu(br->stats);
free_netdev(dev);
 }
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8ac1770..56a2a72 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,7 +311,6 @@ void br_dev_delete(struct net_device *dev, struct list_head 
*head)
 
br_fdb_delete_by_port(br, NULL, 0, 1);
 
-   br_vlan_flush(br);
br_multicast_dev_del(br);
cancel_delayed_work_sync(>gc_work);
 
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b760f26..faa7261 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2031,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
 
 out:
spin_unlock_bh(>multicast_lock);
-
-   free_percpu(br->mcast_stats);
 }
 
 int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2531,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
return 0;
 }
 
+void br_multicast_uninit_stats(struct net_bridge *br)
+{
+   free_percpu(br->mcast_stats);
+}
+
 static void mcast_stats_add_dir(u64 *dst, u64 *src)
 {
dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6136818..0d17728 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -620,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct 
net_bridge_port *port,
 void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
 int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
 void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
@@ -760,6 +761,10 @@ static inline int br_multicast_init_stats(struct 
net_bridge *br)
return 0;
 }
 
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
 static inline int br_multicast_igmp_type(const struct sk_buff *skb)
 {
return 0;
-- 
2.9.3



[PATCH net] bridge: netlink: register netdevice before executing changelink

2017-04-07 Thread idosch
From: Ido Schimmel 

Peter reported a kernel oops when executing the following command:

$ ip link add name test type bridge vlan_default_pvid 1

[13634.939408] BUG: unable to handle kernel NULL pointer dereference at
0190
[13634.939436] IP: __vlan_add+0x73/0x5f0
[...]
[13634.939783] Call Trace:
[13634.939791]  ? pcpu_next_unpop+0x3b/0x50
[13634.939801]  ? pcpu_alloc+0x3d2/0x680
[13634.939810]  ? br_vlan_add+0x135/0x1b0
[13634.939820]  ? __br_vlan_set_default_pvid.part.28+0x204/0x2b0
[13634.939834]  ? br_changelink+0x120/0x4e0
[13634.939844]  ? br_dev_newlink+0x50/0x70
[13634.939854]  ? rtnl_newlink+0x5f5/0x8a0
[13634.939864]  ? rtnl_newlink+0x176/0x8a0
[13634.939874]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.939886]  ? rtnetlink_rcv_msg+0xe1/0x220
[13634.939896]  ? lookup_fast+0x52/0x370
[13634.939905]  ? rtnl_newlink+0x8a0/0x8a0
[13634.939915]  ? netlink_rcv_skb+0xa1/0xc0
[13634.939925]  ? rtnetlink_rcv+0x24/0x30
[13634.939934]  ? netlink_unicast+0x177/0x220
[13634.939944]  ? netlink_sendmsg+0x2fe/0x3b0
[13634.939954]  ? _copy_from_user+0x39/0x40
[13634.939964]  ? sock_sendmsg+0x30/0x40
[13634.940159]  ? ___sys_sendmsg+0x29d/0x2b0
[13634.940326]  ? __alloc_pages_nodemask+0xdf/0x230
[13634.940478]  ? mem_cgroup_commit_charge+0x7c/0x4e0
[13634.940592]  ? mem_cgroup_try_charge+0x76/0x1a0
[13634.940701]  ? __handle_mm_fault+0xdb9/0x10b0
[13634.940809]  ? __sys_sendmsg+0x51/0x90
[13634.940917]  ? entry_SYSCALL_64_fastpath+0x1e/0xad

The problem is that the bridge's VLAN group is created after setting the
default PVID, when registering the netdevice and executing its
ndo_init().

Fix this by changing the order of both operations, so that
br_changelink() is only processed after the netdevice is registered,
when the VLAN group is already initialized.

The changelink() call is done on a best-effort basis since unregistering
the netdevice upon failure won't perform a proper cleanup due to a
missing ndo_uninit(), which I'll try to add for net-next.

Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during 
br_dev_newlink()")
Signed-off-by: Nikolay Aleksandrov 
Signed-off-by: Ido Schimmel 
Reported-by: Peter V. Saveliev 
---
Please consider this for 4.4.y, 4.9.y and 4.10.y as well.
---
 net/bridge/br_netlink.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a8f6acd..cf06c1a 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1165,11 +1165,15 @@ static int br_dev_newlink(struct net *src_net, struct 
net_device *dev,
spin_unlock_bh(>lock);
}
 
-   err = br_changelink(dev, tb, data);
+   err = register_netdevice(dev);
if (err)
return err;
 
-   return register_netdevice(dev);
+   err = br_changelink(dev, tb, data);
+   if (err)
+   netdev_err(dev, "Failed to configure bridge device\n");
+
+   return 0;
 }
 
 static size_t br_get_size(const struct net_device *brdev)
-- 
2.9.3



[PATCH net-next v2] ipv4: fib: Replay events when registering FIB notifier

2016-10-31 Thread idosch
From: Ido Schimmel 

When registering a FIB notifier block we should notify the caller of
already existing FIB entries and rules, as it would otherwise have an
incomplete view of the FIB tables. This is especially important for
switchdev drivers that support FIB offloads. Failing to notify them of
existing entries may lead to packet loss.

Upon registration, walk the leafs of all the routing tables and for each
leaf send notification of existing FIB aliases. Similarly, when
unregistering the notifier synthesize a deletion event, thereby
relieving potential callers from the need to perform cleanup.

The above is consistent with the netdevice notification chain, where
"registration and up events are replayed to the new notifier" upon
registration.

Signed-off-by: Ido Schimmel 
Reviewed-by: Jiri Pirko 
---
v1->v2:
* Add #ifdef fixing compilation error reported by kbuild robot
---
 net/ipv4/fib_trie.c | 152 +++-
 1 file changed, 150 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31cef36..455d94b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -86,15 +86,107 @@
 
 static BLOCKING_NOTIFIER_HEAD(fib_chain);
 
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+enum fib_event_type event_type,
+struct fib_notifier_info *info)
+{
+   info->net = net;
+   return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+   struct fib_notifier_info info;
+
+   if (net->ipv4.fib_has_custom_rules)
+   call_fib_notifier(nb, net, event_type, );
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+  enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+  enum fib_event_type event_type, u32 dst,
+  int dst_len, struct fib_info *fi,
+  u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+   struct fib_entry_notifier_info info = {
+   .dst = dst,
+   .dst_len = dst_len,
+   .fi = fi,
+   .tos = tos,
+   .type = type,
+   .tb_id = tb_id,
+   .nlflags = nlflags,
+   };
+   return call_fib_notifier(nb, net, event_type, );
+}
+
+/**
+ * register_fib_notifier - register a fib notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when FIB entries or rules are
+ * added or removed. A negative errno code is returned on failure.
+ *
+ * When registered, all FIB addition events are replayed to the new
+ * notifier to allow the caller to have a complete view of the FIB
+ * tables.
+ */
+
 int register_fib_notifier(struct notifier_block *nb)
 {
-   return blocking_notifier_chain_register(_chain, nb);
+   struct net *net;
+   int err;
+
+   rtnl_lock();
+   err = blocking_notifier_chain_register(_chain, nb);
+   if (err)
+   goto unlock;
+   for_each_net(net) {
+   fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+   fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+   }
+
+unlock:
+   rtnl_unlock();
+   return err;
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
+/**
+ * unregister_fib_notifier - unregister a fib notifier block
+ * @nb: notifier
+ *
+ * unregister a notifier previously registered by
+ * register_fib_notifier(). A negative errno code is returned on
+ * failure.
+ *
+ * After unregistering, FIB deletion events are synthesized to the
+ * removed notifier for all present FIB entries. This removes the
+ * need for special case cleanup code.
+ */
+
 int unregister_fib_notifier(struct notifier_block *nb)
 {
-   return blocking_notifier_chain_unregister(_chain, nb);
+   struct net *net;
+   int err;
+
+   rtnl_lock();
+   err = blocking_notifier_chain_unregister(_chain, nb);
+   if (err)
+   goto unlock;
+   for_each_net(net) {
+   fib_notify(net, nb, FIB_EVENT_ENTRY_DEL);
+   fib_rules_notify(net, nb, FIB_EVENT_RULE_DEL);
+   }
+
+unlock:
+   rtnl_unlock();
+   return err;
 }
 EXPORT_SYMBOL(unregister_fib_notifier);
 
@@ -1834,6 +1926,62 @@ int fib_table_flush(struct net *net, struct fib_table 
*tb)
return found;
 }
 
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+   struct fib_table *tb, struct notifier_block *nb,
+   enum fib_event_type event_type)
+{
+   struct fib_alias *fa;
+
+   hlist_for_each_entry(fa, 

[PATCH net-next] ipv4: fib: Replay events when registering FIB notifier

2016-10-31 Thread idosch
From: Ido Schimmel 

When registering a FIB notifier block we should notify the caller of
already existing FIB entries and rules, as it would otherwise have an
incomplete view of the FIB tables. This is especially important for
switchdev drivers that support FIB offloads. Failing to notify them of
existing entries may lead to packet loss.

Upon registration, walk the leafs of all the routing tables and for each
leaf send notification of existing FIB aliases. Similarly, when
unregistering the notifier synthesize a deletion event, thereby
relieving potential callers from the need to perform cleanup.

The above is consistent with the netdevice notification chain, where
"registration and up events are replayed to the new notifier" upon
registration.

Signed-off-by: Ido Schimmel 
Reviewed-by: Jiri Pirko 
---
 net/ipv4/fib_trie.c | 147 +++-
 1 file changed, 145 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31cef36..608f9d4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -86,15 +86,102 @@
 
 static BLOCKING_NOTIFIER_HEAD(fib_chain);
 
+static void fib_notify(struct net *net, struct notifier_block *nb,
+  enum fib_event_type event_type);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+enum fib_event_type event_type,
+struct fib_notifier_info *info)
+{
+   info->net = net;
+   return nb->notifier_call(nb, event_type, info);
+}
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+  enum fib_event_type event_type, u32 dst,
+  int dst_len, struct fib_info *fi,
+  u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+   struct fib_entry_notifier_info info = {
+   .dst = dst,
+   .dst_len = dst_len,
+   .fi = fi,
+   .tos = tos,
+   .type = type,
+   .tb_id = tb_id,
+   .nlflags = nlflags,
+   };
+   return call_fib_notifier(nb, net, event_type, );
+}
+
+/**
+ * register_fib_notifier - register a fib notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when FIB entries or rules are
+ * added or removed. A negative errno code is returned on failure.
+ *
+ * When registered, all FIB addition events are replayed to the new
+ * notifier to allow the caller to have a complete view of the FIB
+ * tables.
+ */
+
 int register_fib_notifier(struct notifier_block *nb)
 {
-   return blocking_notifier_chain_register(_chain, nb);
+   struct net *net;
+   int err;
+
+   rtnl_lock();
+   err = blocking_notifier_chain_register(_chain, nb);
+   if (err)
+   goto unlock;
+   for_each_net(net) {
+   struct fib_notifier_info info;
+
+   if (net->ipv4.fib_has_custom_rules)
+   call_fib_notifier(nb, net, FIB_EVENT_RULE_ADD, );
+   fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+   }
+
+unlock:
+   rtnl_unlock();
+   return err;
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
+/**
+ * unregister_fib_notifier - unregister a fib notifier block
+ * @nb: notifier
+ *
+ * unregister a notifier previously registered by
+ * register_fib_notifier(). A negative errno code is returned on
+ * failure.
+ *
+ * After unregistering, FIB deletion events are synthesized to the
+ * removed notifier for all present FIB entries. This removes the
+ * need for special case cleanup code.
+ */
+
 int unregister_fib_notifier(struct notifier_block *nb)
 {
-   return blocking_notifier_chain_unregister(_chain, nb);
+   struct net *net;
+   int err;
+
+   rtnl_lock();
+   err = blocking_notifier_chain_unregister(_chain, nb);
+   if (err)
+   goto unlock;
+   for_each_net(net) {
+   struct fib_notifier_info info;
+
+   fib_notify(net, nb, FIB_EVENT_ENTRY_DEL);
+   if (net->ipv4.fib_has_custom_rules)
+   call_fib_notifier(nb, net, FIB_EVENT_RULE_DEL, );
+   }
+
+unlock:
+   rtnl_unlock();
+   return err;
 }
 EXPORT_SYMBOL(unregister_fib_notifier);
 
@@ -1834,6 +1921,62 @@ int fib_table_flush(struct net *net, struct fib_table 
*tb)
return found;
 }
 
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+   struct fib_table *tb, struct notifier_block *nb,
+   enum fib_event_type event_type)
+{
+   struct fib_alias *fa;
+
+   hlist_for_each_entry(fa, >leaf, fa_list) {
+   struct fib_info *fi = fa->fa_info;
+
+   if (!fi)
+   continue;
+
+   /* local and main table can share 

[PATCH net-next] switchdev: Remove redundant variable

2016-10-26 Thread idosch
From: Ido Schimmel 

Instead of storing return value in 'err' and returning, just return
directly.

Signed-off-by: Ido Schimmel 
Reviewed-by: Jiri Pirko 
---
 net/switchdev/switchdev.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 02beb35..6f145b5 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -624,13 +624,10 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 struct switchdev_notifier_info *info)
 {
-   int err;
-
ASSERT_RTNL();
 
info->dev = dev;
-   err = raw_notifier_call_chain(_notif_chain, val, info);
-   return err;
+   return raw_notifier_call_chain(_notif_chain, val, info);
 }
 EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
 
-- 
2.7.4



[PATCH net-next] net: core: Traverse the adjacency list from first entry

2016-10-26 Thread idosch
From: Ido Schimmel 

netdev_next_lower_dev() returns NULL when we finished traversing the
adjacency list ('iter' points to the list's head). Therefore, we must
start traversing the list from the first entry and not its head.

Fixes: 1a3f060c1a47 ("net: Introduce new api for walking upper and lower 
devices")
Signed-off-by: Ido Schimmel 
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index f55fb45..d9c937f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5419,7 +5419,7 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
struct list_head *iter;
int ret;
 
-   for (iter = >adj_list.lower,
+   for (iter = dev->adj_list.lower.next,
 ldev = netdev_next_lower_dev(dev, );
 ldev;
 ldev = netdev_next_lower_dev(dev, )) {
-- 
2.7.4



[PATCH net] net: core: Correctly iterate over lower adjacency list

2016-10-19 Thread idosch
From: Ido Schimmel 

Tamir reported the following trace when processing ARP requests received
via a vlan device on top of a VLAN-aware bridge:

 NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [swapper/1:0]
[...]
 CPU: 1 PID: 0 Comm: swapper/1 Tainted: GW   4.8.0-rc7 #1
 Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 
5.6.5 06/07/2016
 task: 88017edfea40 task.stack: 88017ee1
 RIP: 0010:[]  [] 
netdev_all_lower_get_next_rcu+0x33/0x60
[...]
 Call Trace:
  
  [] mlxsw_sp_port_lower_dev_hold+0x5a/0xa0 [mlxsw_spectrum]
  [] mlxsw_sp_router_netevent_event+0x80/0x150 
[mlxsw_spectrum]
  [] notifier_call_chain+0x4a/0x70
  [] atomic_notifier_call_chain+0x1a/0x20
  [] call_netevent_notifiers+0x1b/0x20
  [] neigh_update+0x306/0x740
  [] neigh_event_ns+0x4e/0xb0
  [] arp_process+0x66f/0x700
  [] ? common_interrupt+0x8c/0x8c
  [] arp_rcv+0x139/0x1d0
  [] ? vlan_do_receive+0xda/0x320
  [] __netif_receive_skb_core+0x524/0xab0
  [] ? dev_queue_xmit+0x10/0x20
  [] ? br_forward_finish+0x3d/0xc0 [bridge]
  [] ? br_handle_vlan+0xf6/0x1b0 [bridge]
  [] __netif_receive_skb+0x18/0x60
  [] netif_receive_skb_internal+0x40/0xb0
  [] netif_receive_skb+0x1c/0x70
  [] br_pass_frame_up+0xc6/0x160 [bridge]
  [] ? deliver_clone+0x37/0x50 [bridge]
  [] ? br_flood+0xcc/0x160 [bridge]
  [] br_handle_frame_finish+0x224/0x4f0 [bridge]
  [] br_handle_frame+0x174/0x300 [bridge]
  [] __netif_receive_skb_core+0x329/0xab0
  [] ? find_next_bit+0x15/0x20
  [] ? cpumask_next_and+0x32/0x50
  [] ? load_balance+0x178/0x9b0
  [] __netif_receive_skb+0x18/0x60
  [] netif_receive_skb_internal+0x40/0xb0
  [] netif_receive_skb+0x1c/0x70
  [] mlxsw_sp_rx_listener_func+0x61/0xb0 [mlxsw_spectrum]
  [] mlxsw_core_skb_receive+0x187/0x200 [mlxsw_core]
  [] mlxsw_pci_cq_tasklet+0x63a/0x9b0 [mlxsw_pci]
  [] tasklet_action+0xf6/0x110
  [] __do_softirq+0xf6/0x280
  [] irq_exit+0xdf/0xf0
  [] do_IRQ+0x54/0xd0
  [] common_interrupt+0x8c/0x8c

The problem is that netdev_all_lower_get_next_rcu() never advances the
iterator, thereby causing the loop over the lower adjacency list to run
forever.

Fix this by advancing the iterator and avoid the infinite loop.

Fixes: 7ce856aaaf13 ("mlxsw: spectrum: Add couple of lower device helper 
functions")
Signed-off-by: Ido Schimmel 
Reported-by: Tamir Winetroub 
Reviewed-by: Jiri Pirko 
---
Note that DavidA's series removes this code in net-next, but it wasn't
picked up for net, so I'm sending this patch.
Please consider queueing this for 4.8.y
---
 include/linux/netdevice.h |  2 +-
 net/core/dev.c| 10 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 136ae6bb..465e128 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3877,7 +3877,7 @@ struct net_device *netdev_all_lower_get_next_rcu(struct 
net_device *dev,
 ldev = netdev_all_lower_get_next(dev, &(iter)))
 
 #define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \
-   for (iter = (dev)->all_adj_list.lower.next, \
+   for (iter = &(dev)->all_adj_list.lower, \
 ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \
 ldev; \
 ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
diff --git a/net/core/dev.c b/net/core/dev.c
index f1fe26f..b09ac57 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5511,10 +5511,14 @@ struct net_device *netdev_all_lower_get_next_rcu(struct 
net_device *dev,
 {
struct netdev_adjacent *lower;
 
-   lower = list_first_or_null_rcu(>all_adj_list.lower,
-  struct netdev_adjacent, list);
+   lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+   if (>list == >all_adj_list.lower)
+   return NULL;
+
+   *iter = >list;
 
-   return lower ? lower->dev : NULL;
+   return lower->dev;
 }
 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
 
-- 
2.7.4